diff --git a/doc/classes/RenderingDevice.xml b/doc/classes/RenderingDevice.xml index 3ab4aa107d..427c57926c 100644 --- a/doc/classes/RenderingDevice.xml +++ b/doc/classes/RenderingDevice.xml @@ -2490,6 +2490,9 @@ Features support for buffer device address extension. + + Support for 32-bit image atomic operations. + Maximum number of uniform sets that can be bound at a given time. diff --git a/drivers/apple/foundation_helpers.h b/drivers/apple/foundation_helpers.h new file mode 100644 index 0000000000..db87fba96c --- /dev/null +++ b/drivers/apple/foundation_helpers.h @@ -0,0 +1,56 @@ +/**************************************************************************/ +/* foundation_helpers.h */ +/**************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/**************************************************************************/ + +#pragma once + +#import + +class String; +template +class CharStringT; + +using CharString = CharStringT; + +namespace conv { + +/** + * Converts a Godot String to an NSString without allocating an intermediate UTF-8 buffer. + * */ +NSString *to_nsstring(const String &p_str); +/** + * Converts a Godot CharString to an NSString without allocating an intermediate UTF-8 buffer. + * */ +NSString *to_nsstring(const CharString &p_str); +/** + * Converts an NSString to a Godot String without allocating intermediate buffers. + * */ +String to_string(NSString *p_str); + +} //namespace conv diff --git a/drivers/apple/foundation_helpers.mm b/drivers/apple/foundation_helpers.mm new file mode 100644 index 0000000000..0453011b1d --- /dev/null +++ b/drivers/apple/foundation_helpers.mm @@ -0,0 +1,85 @@ +/**************************************************************************/ +/* foundation_helpers.mm */ +/**************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/**************************************************************************/ +/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). 
*/ +/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/**************************************************************************/ + +#import "foundation_helpers.h" + +#import "core/string/ustring.h" + +#import + +namespace conv { + +NSString *to_nsstring(const String &p_str) { + return [[NSString alloc] initWithBytes:(const void *)p_str.ptr() + length:p_str.length() * sizeof(char32_t) + encoding:NSUTF32LittleEndianStringEncoding]; +} + +NSString *to_nsstring(const CharString &p_str) { + return [[NSString alloc] initWithBytes:(const void *)p_str.ptr() + length:p_str.length() + encoding:NSUTF8StringEncoding]; +} + +String to_string(NSString *p_str) { + CFStringRef str = (__bridge CFStringRef)p_str; + CFStringEncoding fastest = CFStringGetFastestEncoding(str); + // Sometimes, CFString will return a pointer to its encoded data, + // so we can create the string without allocating intermediate buffers. + const char *p = CFStringGetCStringPtr(str, fastest); + if (p) { + switch (fastest) { + case kCFStringEncodingASCII: + return String::ascii(Span(p, CFStringGetLength(str))); + case kCFStringEncodingUTF8: + return String::utf8(p); + case kCFStringEncodingUTF32LE: + return String::utf32(Span((char32_t *)p, CFStringGetLength(str))); + default: + break; + } + } + + CFRange range = CFRangeMake(0, CFStringGetLength(str)); + CFIndex byte_len = 0; + // Try to losslessly convert the string directly into a String's buffer to avoid intermediate allocations.
+ CFIndex n = CFStringGetBytes(str, range, kCFStringEncodingUTF32LE, 0, NO, nil, 0, &byte_len); + if (n == range.length) { + String res; + res.resize_uninitialized((byte_len / sizeof(char32_t)) + 1); + res[byte_len / sizeof(char32_t)] = 0; + n = CFStringGetBytes(str, range, kCFStringEncodingUTF32LE, 0, NO, (UInt8 *)res.ptrw(), res.length() * sizeof(char32_t), nil); + return res; + } + + return String::utf8(p_str.UTF8String); +} + +} //namespace conv diff --git a/drivers/d3d12/rendering_device_driver_d3d12.cpp b/drivers/d3d12/rendering_device_driver_d3d12.cpp index a91eee3050..f454291a1b 100644 --- a/drivers/d3d12/rendering_device_driver_d3d12.cpp +++ b/drivers/d3d12/rendering_device_driver_d3d12.cpp @@ -5586,6 +5586,8 @@ bool RenderingDeviceDriverD3D12::has_feature(Features p_feature) { return true; case SUPPORTS_BUFFER_DEVICE_ADDRESS: return true; + case SUPPORTS_IMAGE_ATOMIC_32_BIT: + return true; default: return false; } diff --git a/drivers/metal/SCsub b/drivers/metal/SCsub index a4c1c65b82..f55933a2c3 100644 --- a/drivers/metal/SCsub +++ b/drivers/metal/SCsub @@ -12,7 +12,6 @@ thirdparty_obj = [] thirdparty_dir = "#thirdparty/spirv-cross/" thirdparty_sources = [ "spirv_cfg.cpp", - "spirv_cross_util.cpp", "spirv_cross.cpp", "spirv_parser.cpp", "spirv_msl.cpp", diff --git a/drivers/metal/metal_device_properties.h b/drivers/metal/metal_device_properties.h index 720efd64e1..24a1a4cdf9 100644 --- a/drivers/metal/metal_device_properties.h +++ b/drivers/metal/metal_device_properties.h @@ -94,6 +94,8 @@ struct API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MetalFeatures { bool metal_fx_spatial = false; /**< If true, Metal FX spatial functions are supported. */ bool metal_fx_temporal = false; /**< If true, Metal FX temporal functions are supported. */ bool supports_gpu_address = false; /**< If true, referencing a GPU address in a shader is supported. */ + bool supports_image_atomic_32_bit = false; /**< If true, 32-bit atomic operations on images are supported. */ + bool supports_image_atomic_64_bit = false; /**< If true, 64-bit atomic operations on images are supported.
*/ }; struct MetalLimits { diff --git a/drivers/metal/metal_device_properties.mm b/drivers/metal/metal_device_properties.mm index 43946ede6e..4b06e24ad3 100644 --- a/drivers/metal/metal_device_properties.mm +++ b/drivers/metal/metal_device_properties.mm @@ -121,6 +121,12 @@ void MetalDeviceProperties::init_features(id p_device) { features.simdPermute = [p_device supportsFamily:MTLGPUFamilyApple6]; features.simdReduction = [p_device supportsFamily:MTLGPUFamilyApple7]; features.argument_buffers_tier = p_device.argumentBuffersSupport; + features.supports_image_atomic_32_bit = [p_device supportsFamily:MTLGPUFamilyApple6]; + features.supports_image_atomic_64_bit = [p_device supportsFamily:MTLGPUFamilyApple8]; + if (OS::get_singleton()->get_environment("GODOT_MTL_DISABLE_IMAGE_ATOMICS") == "1") { + features.supports_image_atomic_32_bit = false; + features.supports_image_atomic_64_bit = false; + } if (@available(macOS 13.0, iOS 16.0, tvOS 16.0, *)) { features.needs_arg_encoders = !([p_device supportsFamily:MTLGPUFamilyMetal3] && features.argument_buffers_tier == MTLArgumentBuffersTier2); diff --git a/drivers/metal/metal_objects.h b/drivers/metal/metal_objects.h index b89d4ba2e0..4826ded95d 100644 --- a/drivers/metal/metal_objects.h +++ b/drivers/metal/metal_objects.h @@ -309,9 +309,23 @@ public: class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDCommandBuffer { private: +#pragma mark - Common State + + // From RenderingDevice + static constexpr uint32_t MAX_PUSH_CONSTANT_SIZE = 128; + RenderingDeviceDriverMetal *device_driver = nullptr; id queue = nil; id commandBuffer = nil; + bool state_begin = false; + + _FORCE_INLINE_ id command_buffer() { + DEV_ASSERT(state_begin); + if (commandBuffer == nil) { + commandBuffer = queue.commandBuffer; + } + return commandBuffer; + } void _end_compute_dispatch(); void _end_blit(); @@ -326,6 +340,11 @@ private: void _end_render_pass(); void _render_clear_render_area(); +#pragma mark - Compute + + void _compute_set_dirty_state(); + void _compute_bind_uniform_sets(); + public: MDCommandBufferStateType type = MDCommandBufferStateType::None; @@ -349,18 +368,18 @@ public: LocalVector vertex_offsets; ResourceUsageMap resource_usage; // clang-format off - enum DirtyFlag: uint8_t { - DIRTY_NONE = 0b0000'0000, - DIRTY_PIPELINE = 0b0000'0001, //! pipeline state - DIRTY_UNIFORMS = 0b0000'0010, //! uniform sets - DIRTY_DEPTH = 0b0000'0100, //! depth / stencil state - DIRTY_VERTEX = 0b0000'1000, //! vertex buffers - DIRTY_VIEWPORT = 0b0001'0000, //! viewport rectangles - DIRTY_SCISSOR = 0b0010'0000, //! scissor rectangles - DIRTY_BLEND = 0b0100'0000, //! blend state - DIRTY_RASTER = 0b1000'0000, //! encoder state like cull mode - - DIRTY_ALL = 0xff, + enum DirtyFlag: uint16_t { + DIRTY_NONE = 0, + DIRTY_PIPELINE = 1 << 0, //! pipeline state + DIRTY_UNIFORMS = 1 << 1, //! uniform sets + DIRTY_PUSH = 1 << 2, //! push constants + DIRTY_DEPTH = 1 << 3, //! depth / stencil state + DIRTY_VERTEX = 1 << 4, //! vertex buffers + DIRTY_VIEWPORT = 1 << 5, //! viewport rectangles + DIRTY_SCISSOR = 1 << 6, //! scissor rectangles + DIRTY_BLEND = 1 << 7, //! blend state + DIRTY_RASTER = 1 << 8, //! encoder state like cull mode + DIRTY_ALL = (1 << 9) - 1, }; // clang-format on BitField dirty = DIRTY_NONE; @@ -368,6 +387,9 @@ public: LocalVector uniform_sets; // Bit mask of the uniform sets that are dirty, to prevent redundant binding. 
uint64_t uniform_set_mask = 0; + uint8_t push_constant_data[MAX_PUSH_CONSTANT_SIZE]; + uint32_t push_constant_data_len = 0; + uint32_t push_constant_bindings[2] = { 0 }; _FORCE_INLINE_ void reset(); void end_encoding(); @@ -422,6 +444,13 @@ public: dirty.set_flag(DirtyFlag::DIRTY_UNIFORMS); } + _FORCE_INLINE_ void mark_push_constants_dirty() { + if (push_constant_data_len == 0) { + return; + } + dirty.set_flag(DirtyFlag::DIRTY_PUSH); + } + _FORCE_INLINE_ void mark_blend_dirty() { if (!blend_constants.has_value()) { return; @@ -464,16 +493,46 @@ public: MDComputePipeline *pipeline = nullptr; id encoder = nil; ResourceUsageMap resource_usage; - _FORCE_INLINE_ void reset() { - pipeline = nil; - encoder = nil; - // Keep the keys, as they are likely to be used again. - for (KeyValue>> &kv : resource_usage) { - kv.value.clear(); + // clang-format off + enum DirtyFlag: uint16_t { + DIRTY_NONE = 0, + DIRTY_PIPELINE = 1 << 0, //! pipeline state + DIRTY_UNIFORMS = 1 << 1, //! uniform sets + DIRTY_PUSH = 1 << 2, //! push constants + DIRTY_ALL = (1 << 3) - 1, + }; + // clang-format on + BitField dirty = DIRTY_NONE; + + LocalVector uniform_sets; + // Bit mask of the uniform sets that are dirty, to prevent redundant binding. + uint64_t uniform_set_mask = 0; + uint8_t push_constant_data[MAX_PUSH_CONSTANT_SIZE]; + uint32_t push_constant_data_len = 0; + uint32_t push_constant_bindings[1] = { 0 }; + + _FORCE_INLINE_ void reset(); + void end_encoding(); + + _FORCE_INLINE_ void mark_uniforms_dirty(void) { + if (uniform_sets.is_empty()) { + return; } + for (uint32_t i = 0; i < uniform_sets.size(); i++) { + if (uniform_sets[i] != nullptr) { + uniform_set_mask |= 1 << i; + } + } + dirty.set_flag(DirtyFlag::DIRTY_UNIFORMS); + } + + _FORCE_INLINE_ void mark_push_constants_dirty() { + if (push_constant_data_len == 0) { + return; + } + dirty.set_flag(DirtyFlag::DIRTY_PUSH); } - void end_encoding(); } compute; // State specific to a blit pass. 
@@ -496,6 +555,7 @@ public: void encodeRenderCommandEncoderWithDescriptor(MTLRenderPassDescriptor *p_desc, NSString *p_label); void bind_pipeline(RDD::PipelineID p_pipeline); + void encode_push_constant_data(RDD::ShaderID p_shader, VectorView p_data); #pragma mark - Render Commands @@ -661,8 +721,6 @@ public: Vector sets; bool uses_argument_buffers = true; - virtual void encode_push_constant_data(VectorView p_data, MDCommandBuffer *p_cb) = 0; - MDShader(CharString p_name, Vector p_sets, bool p_uses_argument_buffers) : name(p_name), sets(p_sets), uses_argument_buffers(p_uses_argument_buffers) {} virtual ~MDShader() = default; @@ -671,15 +729,13 @@ public: class API_AVAILABLE(macos(11.0), ios(14.0), tvos(14.0)) MDComputeShader final : public MDShader { public: struct { - uint32_t binding = -1; + int32_t binding = -1; uint32_t size = 0; } push_constants; MTLSize local = {}; MDLibrary *kernel; - void encode_push_constant_data(VectorView p_data, MDCommandBuffer *p_cb) final; - MDComputeShader(CharString p_name, Vector p_sets, bool p_uses_argument_buffers, MDLibrary *p_kernel); }; @@ -700,8 +756,6 @@ public: MDLibrary *vert; MDLibrary *frag; - void encode_push_constant_data(VectorView p_data, MDCommandBuffer *p_cb) final; - MDRenderShader(CharString p_name, Vector p_sets, bool p_needs_view_mask_buffer, diff --git a/drivers/metal/metal_objects.mm b/drivers/metal/metal_objects.mm index f8056e217b..96ade55c1b 100644 --- a/drivers/metal/metal_objects.mm +++ b/drivers/metal/metal_objects.mm @@ -62,8 +62,8 @@ #undef MAX void MDCommandBuffer::begin() { - DEV_ASSERT(commandBuffer == nil); - commandBuffer = queue.commandBuffer; + DEV_ASSERT(commandBuffer == nil && !state_begin); + state_begin = true; } void MDCommandBuffer::end() { @@ -83,6 +83,7 @@ void MDCommandBuffer::commit() { end(); [commandBuffer commit]; commandBuffer = nil; + state_begin = false; } void MDCommandBuffer::bind_pipeline(RDD::PipelineID p_pipeline) { @@ -136,7 +137,7 @@ void MDCommandBuffer::bind_pipeline(RDD::PipelineID p_pipeline) { render.desc.colorAttachments[0].resolveTexture = res_tex; } #endif - render.encoder = [commandBuffer renderCommandEncoderWithDescriptor:render.desc]; + render.encoder = [command_buffer() renderCommandEncoderWithDescriptor:render.desc]; } if (render.pipeline != rp) { @@ -160,9 +161,44 @@ void MDCommandBuffer::bind_pipeline(RDD::PipelineID p_pipeline) { DEV_ASSERT(type == MDCommandBufferStateType::None); type = MDCommandBufferStateType::Compute; - compute.pipeline = (MDComputePipeline *)p; - compute.encoder = commandBuffer.computeCommandEncoder; - [compute.encoder setComputePipelineState:compute.pipeline->state]; + if (compute.pipeline != p) { + compute.dirty.set_flag(ComputeState::DIRTY_PIPELINE); + compute.mark_uniforms_dirty(); + compute.pipeline = (MDComputePipeline *)p; + } + } +} + +void MDCommandBuffer::encode_push_constant_data(RDD::ShaderID p_shader, VectorView p_data) { + switch (type) { + case MDCommandBufferStateType::Render: { + MDRenderShader *shader = (MDRenderShader *)(p_shader.id); + if (shader->push_constants.vert.binding == -1 && shader->push_constants.frag.binding == -1) { + return; + } + render.push_constant_bindings[0] = shader->push_constants.vert.binding; + render.push_constant_bindings[1] = shader->push_constants.frag.binding; + void const *ptr = p_data.ptr(); + render.push_constant_data_len = p_data.size() * sizeof(uint32_t); + DEV_ASSERT(render.push_constant_data_len <= sizeof(RenderState::push_constant_data)); + memcpy(render.push_constant_data, ptr, 
render.push_constant_data_len); + render.mark_push_constants_dirty(); + } break; + case MDCommandBufferStateType::Compute: { + MDComputeShader *shader = (MDComputeShader *)(p_shader.id); + if (shader->push_constants.binding == -1) { + return; + } + compute.push_constant_bindings[0] = shader->push_constants.binding; + void const *ptr = p_data.ptr(); + compute.push_constant_data_len = p_data.size() * sizeof(uint32_t); + DEV_ASSERT(compute.push_constant_data_len <= sizeof(ComputeState::push_constant_data)); + memcpy(compute.push_constant_data, ptr, compute.push_constant_data_len); + compute.mark_push_constants_dirty(); + } break; + case MDCommandBufferStateType::Blit: + case MDCommandBufferStateType::None: + return; } } @@ -181,7 +217,7 @@ id MDCommandBuffer::blit_command_encoder() { } type = MDCommandBufferStateType::Blit; - blit.encoder = commandBuffer.blitCommandEncoder; + blit.encoder = command_buffer().blitCommandEncoder; return blit.encoder; } @@ -200,7 +236,7 @@ void MDCommandBuffer::encodeRenderCommandEncoderWithDescriptor(MTLRenderPassDesc break; } - id enc = [commandBuffer renderCommandEncoderWithDescriptor:p_desc]; + id enc = [command_buffer() renderCommandEncoderWithDescriptor:p_desc]; if (p_label != nil) { [enc pushDebugGroup:p_label]; [enc popDebugGroup]; @@ -344,6 +380,19 @@ void MDCommandBuffer::render_clear_attachments(VectorView void MDCommandBuffer::_render_set_dirty_state() { _render_bind_uniform_sets(); + if (render.dirty.has_flag(RenderState::DIRTY_PUSH)) { + if (render.push_constant_bindings[0] != (uint32_t)-1) { + [render.encoder setVertexBytes:render.push_constant_data + length:render.push_constant_data_len + atIndex:render.push_constant_bindings[0]]; + } + if (render.push_constant_bindings[1] != (uint32_t)-1) { + [render.encoder setFragmentBytes:render.push_constant_data + length:render.push_constant_data_len + atIndex:render.push_constant_bindings[1]]; + } + } + MDSubpass const &subpass = render.get_subpass(); if (subpass.view_count > 1) { uint32_t view_range[2] = { 0, subpass.view_count }; @@ -552,7 +601,7 @@ uint32_t MDCommandBuffer::_populate_vertices(simd::float4 *p_vertices, uint32_t } void MDCommandBuffer::render_begin_pass(RDD::RenderPassID p_render_pass, RDD::FramebufferID p_frameBuffer, RDD::CommandBufferType p_cmd_buffer_type, const Rect2i &p_rect, VectorView p_clear_values) { - DEV_ASSERT(commandBuffer != nil); + DEV_ASSERT(command_buffer() != nil); end(); MDRenderPass *pass = (MDRenderPass *)(p_render_pass.id); @@ -639,7 +688,7 @@ void MDCommandBuffer::_render_clear_render_area() { } void MDCommandBuffer::render_next_subpass() { - DEV_ASSERT(commandBuffer != nil); + DEV_ASSERT(command_buffer() != nil); if (render.current_subpass == UINT32_MAX) { render.current_subpass = 0; @@ -726,7 +775,7 @@ void MDCommandBuffer::render_next_subpass() { // the defaultRasterSampleCount from the pipeline's sample count. 
render.desc = desc; } else { - render.encoder = [commandBuffer renderCommandEncoderWithDescriptor:desc]; + render.encoder = [command_buffer() renderCommandEncoderWithDescriptor:desc]; if (!render.is_rendering_entire_area) { _render_clear_render_area(); @@ -895,6 +944,7 @@ void MDCommandBuffer::RenderState::reset() { dirty = DIRTY_NONE; uniform_sets.clear(); uniform_set_mask = 0; + push_constant_data_len = 0; clear_values.clear(); viewports.clear(); scissors.clear(); @@ -960,29 +1010,108 @@ void MDCommandBuffer::ComputeState::end_encoding() { #pragma mark - Compute +void MDCommandBuffer::_compute_set_dirty_state() { + if (compute.dirty.has_flag(ComputeState::DIRTY_PIPELINE)) { + compute.encoder = [command_buffer() computeCommandEncoderWithDispatchType:MTLDispatchTypeConcurrent]; + [compute.encoder setComputePipelineState:compute.pipeline->state]; + } + + _compute_bind_uniform_sets(); + + if (compute.dirty.has_flag(ComputeState::DIRTY_PUSH)) { + if (compute.push_constant_bindings[0] != (uint32_t)-1) { + [compute.encoder setBytes:compute.push_constant_data + length:compute.push_constant_data_len + atIndex:compute.push_constant_bindings[0]]; + } + } + + compute.dirty.clear(); +} + +void MDCommandBuffer::_compute_bind_uniform_sets() { + DEV_ASSERT(type == MDCommandBufferStateType::Compute); + if (!compute.dirty.has_flag(ComputeState::DIRTY_UNIFORMS)) { + return; + } + + compute.dirty.clear_flag(ComputeState::DIRTY_UNIFORMS); + uint64_t set_uniforms = compute.uniform_set_mask; + compute.uniform_set_mask = 0; + + MDComputeShader *shader = compute.pipeline->shader; + + while (set_uniforms != 0) { + // Find the index of the next set bit. + uint32_t index = (uint32_t)__builtin_ctzll(set_uniforms); + // Clear the set bit. + set_uniforms &= (set_uniforms - 1); + MDUniformSet *set = compute.uniform_sets[index]; + if (set == nullptr || index >= (uint32_t)shader->sets.size()) { + continue; + } + set->bind_uniforms(shader, compute, index); + } +} + +void MDCommandBuffer::ComputeState::reset() { + pipeline = nil; + encoder = nil; + dirty = DIRTY_NONE; + uniform_sets.clear(); + uniform_set_mask = 0; + push_constant_data_len = 0; + // Keep the keys, as they are likely to be used again. + for (KeyValue>> &kv : resource_usage) { + kv.value.clear(); + } +} + void MDCommandBuffer::compute_bind_uniform_set(RDD::UniformSetID p_uniform_set, RDD::ShaderID p_shader, uint32_t p_set_index) { DEV_ASSERT(type == MDCommandBufferStateType::Compute); - MDShader *shader = (MDShader *)(p_shader.id); MDUniformSet *set = (MDUniformSet *)(p_uniform_set.id); - set->bind_uniforms(shader, compute, p_set_index); + if (compute.uniform_sets.size() <= p_set_index) { + uint32_t s = compute.uniform_sets.size(); + compute.uniform_sets.resize(p_set_index + 1); + // Set intermediate values to null.
+ std::fill(&compute.uniform_sets[s], &compute.uniform_sets[p_set_index] + 1, nullptr); + } + + if (compute.uniform_sets[p_set_index] != set) { + compute.dirty.set_flag(ComputeState::DIRTY_UNIFORMS); + compute.uniform_set_mask |= 1ULL << p_set_index; + compute.uniform_sets[p_set_index] = set; + } } void MDCommandBuffer::compute_bind_uniform_sets(VectorView p_uniform_sets, RDD::ShaderID p_shader, uint32_t p_first_set_index, uint32_t p_set_count) { DEV_ASSERT(type == MDCommandBufferStateType::Compute); - MDShader *shader = (MDShader *)(p_shader.id); - - // TODO(sgc): Bind multiple buffers using [encoder setBuffers:offsets:withRange:] - for (size_t i = 0u; i < p_set_count; ++i) { + for (size_t i = 0; i < p_set_count; ++i) { MDUniformSet *set = (MDUniformSet *)(p_uniform_sets[i].id); - set->bind_uniforms(shader, compute, p_first_set_index + i); + + uint32_t index = p_first_set_index + i; + if (compute.uniform_sets.size() <= index) { + uint32_t s = compute.uniform_sets.size(); + compute.uniform_sets.resize(index + 1); + // Set intermediate values to null. + std::fill(&compute.uniform_sets[s], &compute.uniform_sets[index] + 1, nullptr); + } + + if (compute.uniform_sets[index] != set) { + compute.dirty.set_flag(ComputeState::DIRTY_UNIFORMS); + compute.uniform_set_mask |= 1ULL << index; + compute.uniform_sets[index] = set; + } } } void MDCommandBuffer::compute_dispatch(uint32_t p_x_groups, uint32_t p_y_groups, uint32_t p_z_groups) { DEV_ASSERT(type == MDCommandBufferStateType::Compute); + _compute_set_dirty_state(); + MTLRegion region = MTLRegionMake3D(0, 0, 0, p_x_groups, p_y_groups, p_z_groups); id enc = compute.encoder; @@ -992,6 +1121,8 @@ void MDCommandBuffer::compute_dispatch(uint32_t p_x_groups, uint32_t p_y_groups, void MDCommandBuffer::compute_dispatch_indirect(RDD::BufferID p_indirect_buffer, uint64_t p_offset) { DEV_ASSERT(type == MDCommandBufferStateType::Compute); + _compute_set_dirty_state(); + id indirectBuffer = rid::get(p_indirect_buffer); id enc = compute.encoder; @@ -1021,20 +1152,6 @@ MDComputeShader::MDComputeShader(CharString p_name, MDShader(p_name, p_sets, p_uses_argument_buffers), kernel(p_kernel) { } -void MDComputeShader::encode_push_constant_data(VectorView p_data, MDCommandBuffer *p_cb) { - DEV_ASSERT(p_cb->type == MDCommandBufferStateType::Compute); - if (push_constants.binding == (uint32_t)-1) { - return; - } - - id enc = p_cb->compute.encoder; - - void const *ptr = p_data.ptr(); - size_t length = p_data.size() * sizeof(uint32_t); - - [enc setBytes:ptr length:length atIndex:push_constants.binding]; -} - MDRenderShader::MDRenderShader(CharString p_name, Vector p_sets, bool p_needs_view_mask_buffer, @@ -1046,22 +1163,6 @@ MDRenderShader::MDRenderShader(CharString p_name, frag(p_frag) { } -void MDRenderShader::encode_push_constant_data(VectorView p_data, MDCommandBuffer *p_cb) { - DEV_ASSERT(p_cb->type == MDCommandBufferStateType::Render); - id __unsafe_unretained enc = p_cb->render.encoder; - - void const *ptr = p_data.ptr(); - size_t length = p_data.size() * sizeof(uint32_t); - - if (push_constants.vert.binding > -1) { - [enc setVertexBytes:ptr length:length atIndex:push_constants.vert.binding]; - } - - if (push_constants.frag.binding > -1) { - [enc setFragmentBytes:ptr length:length atIndex:push_constants.frag.binding]; - } -} - void MDUniformSet::bind_uniforms_argument_buffers(MDShader *p_shader, MDCommandBuffer::RenderState &p_state, uint32_t p_set_index) { DEV_ASSERT(p_shader->uses_argument_buffers); DEV_ASSERT(p_state.encoder != nil); diff --git 
a/drivers/metal/rendering_device_driver_metal.mm b/drivers/metal/rendering_device_driver_metal.mm index ed450b8bf8..e099e28a39 100644 --- a/drivers/metal/rendering_device_driver_metal.mm +++ b/drivers/metal/rendering_device_driver_metal.mm @@ -58,6 +58,7 @@ #include "core/io/marshalls.h" #include "core/string/ustring.h" #include "core/templates/hash_map.h" +#include "drivers/apple/foundation_helpers.h" #import #import @@ -317,12 +318,6 @@ RDD::TextureID RenderingDeviceDriverMetal::texture_create(const TextureFormat &p desc.usage |= MTLTextureUsageShaderWrite; } - if (@available(macOS 14.0, iOS 17.0, tvOS 17.0, *)) { - if (format_caps & kMTLFmtCapsAtomic) { - desc.usage |= MTLTextureUsageShaderAtomic; - } - } - bool can_be_attachment = flags::any(format_caps, (kMTLFmtCapsColorAtt | kMTLFmtCapsDSAtt)); if (flags::any(p_format.usage_bits, TEXTURE_USAGE_COLOR_ATTACHMENT_BIT | TEXTURE_USAGE_DEPTH_STENCIL_ATTACHMENT_BIT) && @@ -334,6 +329,18 @@ RDD::TextureID RenderingDeviceDriverMetal::texture_create(const TextureFormat &p desc.usage |= MTLTextureUsageShaderRead; } + if (p_format.usage_bits & TEXTURE_USAGE_STORAGE_ATOMIC_BIT) { + if (@available(macOS 14.0, iOS 17.0, tvOS 17.0, *)) { + if (format_caps & kMTLFmtCapsAtomic) { + desc.usage |= MTLTextureUsageShaderAtomic; + } else { + ERR_FAIL_V_MSG(RDD::TextureID(), "Atomic operations on this texture format are not supported."); + } + } else { + ERR_FAIL_V_MSG(RDD::TextureID(), "Atomic texture operations not supported on this OS version."); + } + } + if (p_format.usage_bits & TEXTURE_USAGE_VRS_ATTACHMENT_BIT) { ERR_FAIL_V_MSG(RDD::TextureID(), "unsupported: TEXTURE_USAGE_VRS_ATTACHMENT_BIT"); } @@ -363,7 +370,29 @@ RDD::TextureID RenderingDeviceDriverMetal::texture_create(const TextureFormat &p // Check if it is a linear format for atomic operations and therefore needs a buffer, // as generally Metal does not support atomic operations on textures. - bool needs_buffer = is_linear || (p_format.array_layers == 1 && p_format.mipmaps == 1 && p_format.texture_type == TEXTURE_TYPE_2D && flags::any(p_format.usage_bits, TEXTURE_USAGE_STORAGE_BIT) && (p_format.format == DATA_FORMAT_R32_UINT || p_format.format == DATA_FORMAT_R32_SINT || p_format.format == DATA_FORMAT_R32G32_UINT || p_format.format == DATA_FORMAT_R32G32_SINT)); + bool needs_buffer = is_linear; + + // Check for atomic requirements. + if (flags::any(p_format.usage_bits, TEXTURE_USAGE_STORAGE_BIT) && p_format.array_layers == 1 && p_format.mipmaps == 1 && p_format.texture_type == TEXTURE_TYPE_2D) { + switch (p_format.format) { + case RenderingDeviceCommons::DATA_FORMAT_R32_SINT: + case RenderingDeviceCommons::DATA_FORMAT_R32_UINT: { + if (!device_properties->features.supports_image_atomic_32_bit) { + // We can emulate 32-bit atomic operations on textures. + needs_buffer = true; + } + } break; + case RenderingDeviceCommons::DATA_FORMAT_R32G32_SINT: + case RenderingDeviceCommons::DATA_FORMAT_R32G32_UINT: { + if (!device_properties->features.supports_image_atomic_64_bit) { + // No emulation for 64-bit atomics. 
+ ERR_FAIL_V_MSG(TextureID(), "64-bit atomic operations are not supported."); + } + } break; + default: + break; + } + } id obj = nil; if (needs_buffer) { @@ -900,9 +929,15 @@ Error RenderingDeviceDriverMetal::command_queue_execute_and_present(CommandQueue MDCommandBuffer *cmd_buffer = (MDCommandBuffer *)(p_cmd_buffers[size - 1].id); Fence *fence = (Fence *)(p_cmd_fence.id); if (fence != nullptr) { - [cmd_buffer->get_command_buffer() addCompletedHandler:^(id buffer) { + id cb = cmd_buffer->get_command_buffer(); + if (cb == nil) { + // If there is nothing to do, signal the fence immediately. dispatch_semaphore_signal(fence->semaphore); - }]; + } else { + [cb addCompletedHandler:^(id buffer) { + dispatch_semaphore_signal(fence->semaphore); + }]; + } } for (uint32_t i = 0; i < p_swap_chains.size(); i++) { @@ -1730,8 +1765,7 @@ void RenderingDeviceDriverMetal::pipeline_free(PipelineID p_pipeline_id) { void RenderingDeviceDriverMetal::command_bind_push_constants(CommandBufferID p_cmd_buffer, ShaderID p_shader, uint32_t p_dst_first_index, VectorView p_data) { MDCommandBuffer *cb = (MDCommandBuffer *)(p_cmd_buffer.id); - MDShader *shader = (MDShader *)(p_shader.id); - shader->encode_push_constant_data(p_data, cb); + cb->encode_push_constant_data(p_shader, p_data); } // ----- CACHE ----- @@ -2417,6 +2451,7 @@ RDD::PipelineID RenderingDeviceDriverMetal::compute_pipeline_create(ShaderID p_s MTLComputePipelineDescriptor *desc = [MTLComputePipelineDescriptor new]; desc.computeFunction = function; + desc.label = conv::to_nsstring(shader->name); if (archive) { desc.binaryArchives = @[ archive ]; } @@ -2735,6 +2770,8 @@ bool RenderingDeviceDriverMetal::has_feature(Features p_feature) { return device_properties->features.metal_fx_spatial; case SUPPORTS_METALFX_TEMPORAL: return device_properties->features.metal_fx_temporal; + case SUPPORTS_IMAGE_ATOMIC_32_BIT: + return device_properties->features.supports_image_atomic_32_bit; default: return false; } diff --git a/drivers/metal/rendering_shader_container_metal.mm b/drivers/metal/rendering_shader_container_metal.mm index c2e4518a06..d9c81dba6f 100644 --- a/drivers/metal/rendering_shader_container_metal.mm +++ b/drivers/metal/rendering_shader_container_metal.mm @@ -199,6 +199,8 @@ bool RenderingShaderContainerMetal::_set_code_from_spirv(const Vectorfeatures.mslVersionMajor, device_profile->features.mslVersionMinor); msl_options.set_msl_version(device_profile->features.mslVersionMajor, device_profile->features.mslVersionMinor); mtl_reflection_data.msl_version = msl_options.msl_version; msl_options.platform = device_profile->platform == MetalDeviceProfile::Platform::macOS ? CompilerMSL::Options::macOS : CompilerMSL::Options::iOS; @@ -209,7 +211,7 @@ bool RenderingShaderContainerMetal::_set_code_from_spirv(const Vectorget_environment(U"GODOT_DISABLE_ARGUMENT_BUFFERS"); v == U"1") { + if (String v = OS::get_singleton()->get_environment("GODOT_MTL_DISABLE_ARGUMENT_BUFFERS"); v == "1") { disable_argument_buffers = true; } @@ -236,6 +238,10 @@ bool RenderingShaderContainerMetal::_set_code_from_spirv(const Vector= CompilerMSL::Options::make_msl_version(3, 2)) { + // All 3.2+ versions support device coherence, so we can disable texture fences. + msl_options.readwrite_texture_fences = false; + } CompilerGLSL::Options options{}; options.vertex.flip_vert_y = true; @@ -417,6 +423,10 @@ bool RenderingShaderContainerMetal::_set_code_from_spirv(const Vectorhas_feature(RD::SUPPORTS_IMAGE_ATOMIC_32_BIT) ? 
RD::UNIFORM_TYPE_IMAGE : RD::UNIFORM_TYPE_STORAGE_BUFFER; RD::TextureFormat tf; tf.format = RD::DATA_FORMAT_R16G16B16A16_SFLOAT; @@ -440,29 +441,29 @@ void Fog::VolumetricFog::init(const Vector3i &fog_size, RID p_sky_shader) { fog_map = RD::get_singleton()->texture_create(tf, RD::TextureView()); RD::get_singleton()->set_resource_name(fog_map, "Fog map"); -#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED) - Vector dm; - dm.resize_initialized(fog_size.x * fog_size.y * fog_size.z * 4); + if (atomic_type == RD::UNIFORM_TYPE_STORAGE_BUFFER) { + Vector dm; + dm.resize_initialized(fog_size.x * fog_size.y * fog_size.z * 4); - density_map = RD::get_singleton()->storage_buffer_create(dm.size(), dm); - RD::get_singleton()->set_resource_name(density_map, "Fog density map"); - light_map = RD::get_singleton()->storage_buffer_create(dm.size(), dm); - RD::get_singleton()->set_resource_name(light_map, "Fog light map"); - emissive_map = RD::get_singleton()->storage_buffer_create(dm.size(), dm); - RD::get_singleton()->set_resource_name(emissive_map, "Fog emissive map"); -#else - tf.format = RD::DATA_FORMAT_R32_UINT; - tf.usage_bits = RD::TEXTURE_USAGE_STORAGE_BIT | RD::TEXTURE_USAGE_CAN_COPY_TO_BIT; - density_map = RD::get_singleton()->texture_create(tf, RD::TextureView()); - RD::get_singleton()->set_resource_name(density_map, "Fog density map"); - RD::get_singleton()->texture_clear(density_map, Color(0, 0, 0, 0), 0, 1, 0, 1); - light_map = RD::get_singleton()->texture_create(tf, RD::TextureView()); - RD::get_singleton()->set_resource_name(light_map, "Fog light map"); - RD::get_singleton()->texture_clear(light_map, Color(0, 0, 0, 0), 0, 1, 0, 1); - emissive_map = RD::get_singleton()->texture_create(tf, RD::TextureView()); - RD::get_singleton()->set_resource_name(emissive_map, "Fog emissive map"); - RD::get_singleton()->texture_clear(emissive_map, Color(0, 0, 0, 0), 0, 1, 0, 1); -#endif + density_map = RD::get_singleton()->storage_buffer_create(dm.size(), dm); + RD::get_singleton()->set_resource_name(density_map, "Fog density map"); + light_map = RD::get_singleton()->storage_buffer_create(dm.size(), dm); + RD::get_singleton()->set_resource_name(light_map, "Fog light map"); + emissive_map = RD::get_singleton()->storage_buffer_create(dm.size(), dm); + RD::get_singleton()->set_resource_name(emissive_map, "Fog emissive map"); + } else { + tf.format = RD::DATA_FORMAT_R32_UINT; + tf.usage_bits = RD::TEXTURE_USAGE_STORAGE_BIT | RD::TEXTURE_USAGE_CAN_COPY_TO_BIT | RD::TEXTURE_USAGE_STORAGE_ATOMIC_BIT; + density_map = RD::get_singleton()->texture_create(tf, RD::TextureView()); + RD::get_singleton()->set_resource_name(density_map, "Fog density map"); + RD::get_singleton()->texture_clear(density_map, Color(0, 0, 0, 0), 0, 1, 0, 1); + light_map = RD::get_singleton()->texture_create(tf, RD::TextureView()); + RD::get_singleton()->set_resource_name(light_map, "Fog light map"); + RD::get_singleton()->texture_clear(light_map, Color(0, 0, 0, 0), 0, 1, 0, 1); + emissive_map = RD::get_singleton()->texture_create(tf, RD::TextureView()); + RD::get_singleton()->set_resource_name(emissive_map, "Fog emissive map"); + RD::get_singleton()->texture_clear(emissive_map, Color(0, 0, 0, 0), 0, 1, 0, 1); + } Vector uniforms; { @@ -579,11 +580,7 @@ void Fog::volumetric_fog_update(const VolumetricFogSettings &p_settings, const P { RD::Uniform u; -#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED) - u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER; -#else - u.uniform_type = RD::UNIFORM_TYPE_IMAGE; -#endif + u.uniform_type 
= fog->atomic_type; u.binding = 1; u.append_id(fog->emissive_map); uniforms.push_back(u); @@ -599,11 +596,7 @@ void Fog::volumetric_fog_update(const VolumetricFogSettings &p_settings, const P { RD::Uniform u; -#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED) - u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER; -#else - u.uniform_type = RD::UNIFORM_TYPE_IMAGE; -#endif + u.uniform_type = fog->atomic_type; u.binding = 3; u.append_id(fog->density_map); uniforms.push_back(u); @@ -611,11 +604,7 @@ void Fog::volumetric_fog_update(const VolumetricFogSettings &p_settings, const P { RD::Uniform u; -#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED) - u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER; -#else - u.uniform_type = RD::UNIFORM_TYPE_IMAGE; -#endif + u.uniform_type = fog->atomic_type; u.binding = 4; u.append_id(fog->light_map); uniforms.push_back(u); @@ -918,22 +907,14 @@ void Fog::volumetric_fog_update(const VolumetricFogSettings &p_settings, const P } { RD::Uniform u; -#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED) - u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER; -#else - u.uniform_type = RD::UNIFORM_TYPE_IMAGE; -#endif + u.uniform_type = fog->atomic_type; u.binding = 16; u.append_id(fog->density_map); uniforms.push_back(u); } { RD::Uniform u; -#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED) - u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER; -#else - u.uniform_type = RD::UNIFORM_TYPE_IMAGE; -#endif + u.uniform_type = fog->atomic_type; u.binding = 17; u.append_id(fog->light_map); uniforms.push_back(u); @@ -941,11 +922,7 @@ void Fog::volumetric_fog_update(const VolumetricFogSettings &p_settings, const P { RD::Uniform u; -#if defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED) - u.uniform_type = RD::UNIFORM_TYPE_STORAGE_BUFFER; -#else - u.uniform_type = RD::UNIFORM_TYPE_IMAGE; -#endif + u.uniform_type = fog->atomic_type; u.binding = 18; u.append_id(fog->emissive_map); uniforms.push_back(u); diff --git a/servers/rendering/renderer_rd/environment/fog.h b/servers/rendering/renderer_rd/environment/fog.h index 6e91b4bcf4..f593b3310f 100644 --- a/servers/rendering/renderer_rd/environment/fog.h +++ b/servers/rendering/renderer_rd/environment/fog.h @@ -316,6 +316,9 @@ public: int last_shadow_filter = -1; + // If the device doesn't support image atomics, use storage buffers instead. + RD::UniformType atomic_type = RD::UNIFORM_TYPE_IMAGE; + virtual void configure(RenderSceneBuffersRD *p_render_buffers) override {} virtual void free_data() override {} diff --git a/servers/rendering/renderer_rd/shader_rd.cpp b/servers/rendering/renderer_rd/shader_rd.cpp index 49ebbcdaf7..ed8ecdc4b1 100644 --- a/servers/rendering/renderer_rd/shader_rd.cpp +++ b/servers/rendering/renderer_rd/shader_rd.cpp @@ -234,11 +234,13 @@ void ShaderRD::_build_variant_code(StringBuilder &builder, uint32_t p_variant, c builder.append(String("#define ") + String(E.key) + "_CODE_USED\n"); } #if (defined(MACOS_ENABLED) || defined(APPLE_EMBEDDED_ENABLED)) - if (RD::get_singleton()->get_device_capabilities().device_family == RDD::DEVICE_VULKAN) { + RenderingDevice *rd = RD::get_singleton(); + if (rd->get_device_capabilities().device_family == RDD::DEVICE_VULKAN) { builder.append("#define MOLTENVK_USED\n"); } - // Image atomics are supported on Metal 3.1 but no support in MoltenVK or SPIRV-Cross yet. 
- builder.append("#define NO_IMAGE_ATOMICS\n"); + if (!rd->has_feature(RD::SUPPORTS_IMAGE_ATOMIC_32_BIT)) { + builder.append("#define NO_IMAGE_ATOMICS\n"); + } #endif builder.append(String("#define RENDER_DRIVER_") + OS::get_singleton()->get_current_rendering_driver_name().to_upper() + "\n"); diff --git a/servers/rendering/renderer_rd/shaders/environment/volumetric_fog.glsl b/servers/rendering/renderer_rd/shaders/environment/volumetric_fog.glsl index 929f1e34df..4ca666019a 100644 --- a/servers/rendering/renderer_rd/shaders/environment/volumetric_fog.glsl +++ b/servers/rendering/renderer_rd/shaders/environment/volumetric_fog.glsl @@ -2,6 +2,8 @@ #version 450 +#pragma use_vulkan_memory_model + #VERSION_DEFINES layout(local_size_x = 4, local_size_y = 4, local_size_z = 4) in; diff --git a/servers/rendering/renderer_rd/shaders/environment/volumetric_fog_process.glsl b/servers/rendering/renderer_rd/shaders/environment/volumetric_fog_process.glsl index 832058553e..17ee5ced28 100644 --- a/servers/rendering/renderer_rd/shaders/environment/volumetric_fog_process.glsl +++ b/servers/rendering/renderer_rd/shaders/environment/volumetric_fog_process.glsl @@ -2,6 +2,8 @@ #version 450 +#pragma use_vulkan_memory_model + #VERSION_DEFINES #ifdef MODE_DENSITY diff --git a/servers/rendering/rendering_device.cpp b/servers/rendering/rendering_device.cpp index ddf6f91d25..0030c8674d 100644 --- a/servers/rendering/rendering_device.cpp +++ b/servers/rendering/rendering_device.cpp @@ -7979,6 +7979,7 @@ void RenderingDevice::_bind_methods() { BIND_ENUM_CONSTANT(SUPPORTS_METALFX_SPATIAL); BIND_ENUM_CONSTANT(SUPPORTS_METALFX_TEMPORAL); BIND_ENUM_CONSTANT(SUPPORTS_BUFFER_DEVICE_ADDRESS); + BIND_ENUM_CONSTANT(SUPPORTS_IMAGE_ATOMIC_32_BIT); BIND_ENUM_CONSTANT(LIMIT_MAX_BOUND_UNIFORM_SETS); BIND_ENUM_CONSTANT(LIMIT_MAX_FRAMEBUFFER_COLOR_ATTACHMENTS); diff --git a/servers/rendering/rendering_device_commons.h b/servers/rendering/rendering_device_commons.h index 2196622828..960233e3ab 100644 --- a/servers/rendering/rendering_device_commons.h +++ b/servers/rendering/rendering_device_commons.h @@ -952,6 +952,7 @@ public: // If not supported, a fragment shader with only side effects (i.e., writes to buffers, but doesn't output to attachments), may be optimized down to no-op by the GPU driver. SUPPORTS_FRAGMENT_SHADER_WITH_ONLY_SIDE_EFFECTS, SUPPORTS_BUFFER_DEVICE_ADDRESS, + SUPPORTS_IMAGE_ATOMIC_32_BIT, }; enum SubgroupOperations { diff --git a/thirdparty/README.md b/thirdparty/README.md index a15f7304cf..8996fc8aa2 100644 --- a/thirdparty/README.md +++ b/thirdparty/README.md @@ -978,7 +978,7 @@ Its version and license is described in this file under `hidapi`. ## spirv-cross - Upstream: https://github.com/KhronosGroup/SPIRV-Cross -- Version: git (6173e24b31f09a0c3217103a130e74c4ddec14a6, 2024) +- Version: git (d7440cbc6c50332600fdf21c45e6a5df0b07e54c, 2025) - License: Apache 2.0 Files extracted from upstream source: diff --git a/thirdparty/spirv-cross/spirv.hpp b/thirdparty/spirv-cross/spirv.hpp index 5047b9b302..f7a7bf835e 100644 --- a/thirdparty/spirv-cross/spirv.hpp +++ b/thirdparty/spirv-cross/spirv.hpp @@ -1,26 +1,10 @@ -// Copyright (c) 2014-2024 The Khronos Group Inc. +// Copyright: 2014-2024 The Khronos Group Inc. 
+// License: MIT // -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and/or associated documentation files (the "Materials"), -// to deal in the Materials without restriction, including without limitation -// the rights to use, copy, modify, merge, publish, distribute, sublicense, -// and/or sell copies of the Materials, and to permit persons to whom the -// Materials are furnished to do so, subject to the following conditions: -// -// The above copyright notice and this permission notice shall be included in -// all copies or substantial portions of the Materials. -// -// MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS KHRONOS -// STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS SPECIFICATIONS AND -// HEADER INFORMATION ARE LOCATED AT https://www.khronos.org/registry/ -// -// THE MATERIALS ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS -// OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -// THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -// FROM,OUT OF OR IN CONNECTION WITH THE MATERIALS OR THE USE OR OTHER DEALINGS -// IN THE MATERIALS. +// MODIFICATIONS TO THIS FILE MAY MEAN IT NO LONGER ACCURATELY REFLECTS +// KHRONOS STANDARDS. THE UNMODIFIED, NORMATIVE VERSIONS OF KHRONOS +// SPECIFICATIONS AND HEADER INFORMATION ARE LOCATED AT +// https://www.khronos.org/registry/ // This header is automatically generated by the same tool that creates // the Binary Section of the SPIR-V specification. @@ -69,6 +53,12 @@ enum SourceLanguage { SourceLanguageHLSL = 5, SourceLanguageCPP_for_OpenCL = 6, SourceLanguageSYCL = 7, + SourceLanguageHERO_C = 8, + SourceLanguageNZSL = 9, + SourceLanguageWGSL = 10, + SourceLanguageSlang = 11, + SourceLanguageZig = 12, + SourceLanguageRust = 13, SourceLanguageMax = 0x7fffffff, }; @@ -156,6 +146,9 @@ enum ExecutionMode { ExecutionModeSubgroupsPerWorkgroupId = 37, ExecutionModeLocalSizeId = 38, ExecutionModeLocalSizeHintId = 39, + ExecutionModeNonCoherentColorAttachmentReadEXT = 4169, + ExecutionModeNonCoherentDepthAttachmentReadEXT = 4170, + ExecutionModeNonCoherentStencilAttachmentReadEXT = 4171, ExecutionModeSubgroupUniformControlFlowKHR = 4421, ExecutionModePostDepthCoverage = 4446, ExecutionModeDenormPreserve = 4459, @@ -163,19 +156,32 @@ enum ExecutionMode { ExecutionModeSignedZeroInfNanPreserve = 4461, ExecutionModeRoundingModeRTE = 4462, ExecutionModeRoundingModeRTZ = 4463, + ExecutionModeNonCoherentTileAttachmentReadQCOM = 4489, + ExecutionModeTileShadingRateQCOM = 4490, ExecutionModeEarlyAndLateFragmentTestsAMD = 5017, ExecutionModeStencilRefReplacingEXT = 5027, + ExecutionModeCoalescingAMDX = 5069, + ExecutionModeIsApiEntryAMDX = 5070, + ExecutionModeMaxNodeRecursionAMDX = 5071, + ExecutionModeStaticNumWorkgroupsAMDX = 5072, + ExecutionModeShaderIndexAMDX = 5073, + ExecutionModeMaxNumWorkgroupsAMDX = 5077, ExecutionModeStencilRefUnchangedFrontAMD = 5079, ExecutionModeStencilRefGreaterFrontAMD = 5080, ExecutionModeStencilRefLessFrontAMD = 5081, ExecutionModeStencilRefUnchangedBackAMD = 5082, ExecutionModeStencilRefGreaterBackAMD = 5083, ExecutionModeStencilRefLessBackAMD = 5084, + ExecutionModeQuadDerivativesKHR = 5088, + ExecutionModeRequireFullQuadsKHR = 5089, + ExecutionModeSharesInputWithAMDX = 5102, ExecutionModeOutputLinesEXT = 5269, 
ExecutionModeOutputLinesNV = 5269, ExecutionModeOutputPrimitivesEXT = 5270, ExecutionModeOutputPrimitivesNV = 5270, + ExecutionModeDerivativeGroupQuadsKHR = 5289, ExecutionModeDerivativeGroupQuadsNV = 5289, + ExecutionModeDerivativeGroupLinearKHR = 5290, ExecutionModeDerivativeGroupLinearNV = 5290, ExecutionModeOutputTrianglesEXT = 5298, ExecutionModeOutputTrianglesNV = 5298, @@ -195,7 +201,14 @@ enum ExecutionMode { ExecutionModeNoGlobalOffsetINTEL = 5895, ExecutionModeNumSIMDWorkitemsINTEL = 5896, ExecutionModeSchedulerTargetFmaxMhzINTEL = 5903, + ExecutionModeMaximallyReconvergesKHR = 6023, + ExecutionModeFPFastMathDefault = 6028, + ExecutionModeStreamingInterfaceINTEL = 6154, + ExecutionModeRegisterMapInterfaceINTEL = 6160, ExecutionModeNamedBarrierCountINTEL = 6417, + ExecutionModeMaximumRegistersINTEL = 6461, + ExecutionModeMaximumRegistersIdINTEL = 6462, + ExecutionModeNamedMaximumRegistersINTEL = 6463, ExecutionModeMax = 0x7fffffff, }; @@ -213,6 +226,9 @@ enum StorageClass { StorageClassAtomicCounter = 10, StorageClassImage = 11, StorageClassStorageBuffer = 12, + StorageClassTileImageEXT = 4172, + StorageClassTileAttachmentQCOM = 4491, + StorageClassNodePayloadAMDX = 5068, StorageClassCallableDataKHR = 5328, StorageClassCallableDataNV = 5328, StorageClassIncomingCallableDataKHR = 5329, @@ -227,6 +243,7 @@ enum StorageClass { StorageClassShaderRecordBufferNV = 5343, StorageClassPhysicalStorageBuffer = 5349, StorageClassPhysicalStorageBufferEXT = 5349, + StorageClassHitObjectAttributeNV = 5385, StorageClassTaskPayloadWorkgroupEXT = 5402, StorageClassCodeSectionINTEL = 5605, StorageClassDeviceOnlyINTEL = 5936, @@ -242,6 +259,7 @@ enum Dim { DimRect = 4, DimBuffer = 5, DimSubpassData = 6, + DimTileImageDataEXT = 4173, DimMax = 0x7fffffff, }; @@ -348,6 +366,15 @@ enum ImageChannelDataType { ImageChannelDataTypeFloat = 14, ImageChannelDataTypeUnormInt24 = 15, ImageChannelDataTypeUnormInt101010_2 = 16, + ImageChannelDataTypeUnormInt10X6EXT = 17, + ImageChannelDataTypeUnsignedIntRaw10EXT = 19, + ImageChannelDataTypeUnsignedIntRaw12EXT = 20, + ImageChannelDataTypeUnormInt2_101010EXT = 21, + ImageChannelDataTypeUnsignedInt10X6EXT = 22, + ImageChannelDataTypeUnsignedInt12X4EXT = 23, + ImageChannelDataTypeUnsignedInt14X2EXT = 24, + ImageChannelDataTypeUnormInt12X4EXT = 25, + ImageChannelDataTypeUnormInt14X2EXT = 26, ImageChannelDataTypeMax = 0x7fffffff, }; @@ -405,8 +432,11 @@ enum FPFastMathModeShift { FPFastMathModeNSZShift = 2, FPFastMathModeAllowRecipShift = 3, FPFastMathModeFastShift = 4, + FPFastMathModeAllowContractShift = 16, FPFastMathModeAllowContractFastINTELShift = 16, + FPFastMathModeAllowReassocShift = 17, FPFastMathModeAllowReassocINTELShift = 17, + FPFastMathModeAllowTransformShift = 18, FPFastMathModeMax = 0x7fffffff, }; @@ -417,8 +447,11 @@ enum FPFastMathModeMask { FPFastMathModeNSZMask = 0x00000004, FPFastMathModeAllowRecipMask = 0x00000008, FPFastMathModeFastMask = 0x00000010, + FPFastMathModeAllowContractMask = 0x00010000, FPFastMathModeAllowContractFastINTELMask = 0x00010000, + FPFastMathModeAllowReassocMask = 0x00020000, FPFastMathModeAllowReassocINTELMask = 0x00020000, + FPFastMathModeAllowTransformMask = 0x00040000, }; enum FPRoundingMode { @@ -452,6 +485,7 @@ enum FunctionParameterAttribute { FunctionParameterAttributeNoCapture = 5, FunctionParameterAttributeNoWrite = 6, FunctionParameterAttributeNoReadWrite = 7, + FunctionParameterAttributeRuntimeAlignedINTEL = 5940, FunctionParameterAttributeMax = 0x7fffffff, }; @@ -503,12 +537,21 @@ enum Decoration { 
DecorationMaxByteOffset = 45, DecorationAlignmentId = 46, DecorationMaxByteOffsetId = 47, + DecorationSaturatedToLargestFloat8NormalConversionEXT = 4216, DecorationNoSignedWrap = 4469, DecorationNoUnsignedWrap = 4470, DecorationWeightTextureQCOM = 4487, DecorationBlockMatchTextureQCOM = 4488, DecorationBlockMatchSamplerQCOM = 4499, DecorationExplicitInterpAMD = 4999, + DecorationNodeSharesPayloadLimitsWithAMDX = 5019, + DecorationNodeMaxPayloadsAMDX = 5020, + DecorationTrackFinishWritingAMDX = 5078, + DecorationPayloadNodeNameAMDX = 5091, + DecorationPayloadNodeBaseIndexAMDX = 5098, + DecorationPayloadNodeSparseArrayAMDX = 5099, + DecorationPayloadNodeArraySizeAMDX = 5100, + DecorationPayloadDispatchIndirectAMDX = 5105, DecorationOverrideCoverageNV = 5248, DecorationPassthroughNV = 5250, DecorationViewportRelativeNV = 5252, @@ -525,6 +568,7 @@ enum Decoration { DecorationRestrictPointerEXT = 5355, DecorationAliasedPointer = 5356, DecorationAliasedPointerEXT = 5356, + DecorationHitObjectShaderRecordBufferNV = 5386, DecorationBindlessSamplerNV = 5398, DecorationBindlessImageNV = 5399, DecorationBoundSamplerNV = 5400, @@ -557,20 +601,45 @@ enum Decoration { DecorationMergeINTEL = 5834, DecorationBankBitsINTEL = 5835, DecorationForcePow2DepthINTEL = 5836, + DecorationStridesizeINTEL = 5883, + DecorationWordsizeINTEL = 5884, + DecorationTrueDualPortINTEL = 5885, DecorationBurstCoalesceINTEL = 5899, DecorationCacheSizeINTEL = 5900, DecorationDontStaticallyCoalesceINTEL = 5901, DecorationPrefetchINTEL = 5902, DecorationStallEnableINTEL = 5905, DecorationFuseLoopsInFunctionINTEL = 5907, + DecorationMathOpDSPModeINTEL = 5909, DecorationAliasScopeINTEL = 5914, DecorationNoAliasINTEL = 5915, + DecorationInitiationIntervalINTEL = 5917, + DecorationMaxConcurrencyINTEL = 5918, + DecorationPipelineEnableINTEL = 5919, DecorationBufferLocationINTEL = 5921, DecorationIOPipeStorageINTEL = 5944, DecorationFunctionFloatingPointModeINTEL = 6080, DecorationSingleElementVectorINTEL = 6085, DecorationVectorComputeCallableFunctionINTEL = 6087, DecorationMediaBlockIOINTEL = 6140, + DecorationStallFreeINTEL = 6151, + DecorationFPMaxErrorDecorationINTEL = 6170, + DecorationLatencyControlLabelINTEL = 6172, + DecorationLatencyControlConstraintINTEL = 6173, + DecorationConduitKernelArgumentINTEL = 6175, + DecorationRegisterMapKernelArgumentINTEL = 6176, + DecorationMMHostInterfaceAddressWidthINTEL = 6177, + DecorationMMHostInterfaceDataWidthINTEL = 6178, + DecorationMMHostInterfaceLatencyINTEL = 6179, + DecorationMMHostInterfaceReadWriteModeINTEL = 6180, + DecorationMMHostInterfaceMaxBurstINTEL = 6181, + DecorationMMHostInterfaceWaitRequestINTEL = 6182, + DecorationStableKernelArgumentINTEL = 6183, + DecorationHostAccessINTEL = 6188, + DecorationInitModeINTEL = 6190, + DecorationImplementInRegisterMapINTEL = 6191, + DecorationCacheControlLoadINTEL = 6442, + DecorationCacheControlStoreINTEL = 6443, DecorationMax = 0x7fffffff, }; @@ -616,6 +685,11 @@ enum BuiltIn { BuiltInSubgroupLocalInvocationId = 41, BuiltInVertexIndex = 42, BuiltInInstanceIndex = 43, + BuiltInCoreIDARM = 4160, + BuiltInCoreCountARM = 4161, + BuiltInCoreMaxIDARM = 4162, + BuiltInWarpIDARM = 4163, + BuiltInWarpMaxIDARM = 4164, BuiltInSubgroupEqMask = 4416, BuiltInSubgroupEqMaskKHR = 4416, BuiltInSubgroupGeMask = 4417, @@ -633,6 +707,9 @@ enum BuiltIn { BuiltInDeviceIndex = 4438, BuiltInViewIndex = 4440, BuiltInShadingRateKHR = 4444, + BuiltInTileOffsetQCOM = 4492, + BuiltInTileDimensionQCOM = 4493, + BuiltInTileApronSizeQCOM = 4494, 
BuiltInBaryCoordNoPerspAMD = 4992, BuiltInBaryCoordNoPerspCentroidAMD = 4993, BuiltInBaryCoordNoPerspSampleAMD = 4994, @@ -641,6 +718,8 @@ enum BuiltIn { BuiltInBaryCoordSmoothSampleAMD = 4997, BuiltInBaryCoordPullModelAMD = 4998, BuiltInFragStencilRefEXT = 5014, + BuiltInRemainingRecursionLevelsAMDX = 5021, + BuiltInShaderIndexAMDX = 5073, BuiltInViewportMaskNV = 5253, BuiltInSecondaryPositionNV = 5257, BuiltInSecondaryViewportMaskNV = 5258, @@ -693,13 +772,25 @@ enum BuiltIn { BuiltInHitKindKHR = 5333, BuiltInHitKindNV = 5333, BuiltInCurrentRayTimeNV = 5334, + BuiltInHitTriangleVertexPositionsKHR = 5335, + BuiltInHitMicroTriangleVertexPositionsNV = 5337, + BuiltInHitMicroTriangleVertexBarycentricsNV = 5344, BuiltInIncomingRayFlagsKHR = 5351, BuiltInIncomingRayFlagsNV = 5351, BuiltInRayGeometryIndexKHR = 5352, + BuiltInHitIsSphereNV = 5359, + BuiltInHitIsLSSNV = 5360, + BuiltInHitSpherePositionNV = 5361, BuiltInWarpsPerSMNV = 5374, BuiltInSMCountNV = 5375, BuiltInWarpIDNV = 5376, BuiltInSMIDNV = 5377, + BuiltInHitLSSPositionsNV = 5396, + BuiltInHitKindFrontFacingMicroTriangleNV = 5405, + BuiltInHitKindBackFacingMicroTriangleNV = 5406, + BuiltInHitSphereRadiusNV = 5420, + BuiltInHitLSSRadiiNV = 5421, + BuiltInClusterIDNV = 5436, BuiltInCullMaskKHR = 6021, BuiltInMax = 0x7fffffff, }; @@ -734,6 +825,8 @@ enum LoopControlShift { LoopControlMaxInterleavingINTELShift = 21, LoopControlSpeculatedIterationsINTELShift = 22, LoopControlNoFusionINTELShift = 23, + LoopControlLoopCountINTELShift = 24, + LoopControlMaxReinvocationDelayINTELShift = 25, LoopControlMax = 0x7fffffff, }; @@ -756,6 +849,8 @@ enum LoopControlMask { LoopControlMaxInterleavingINTELMask = 0x00200000, LoopControlSpeculatedIterationsINTELMask = 0x00400000, LoopControlNoFusionINTELMask = 0x00800000, + LoopControlLoopCountINTELMask = 0x01000000, + LoopControlMaxReinvocationDelayINTELMask = 0x02000000, }; enum FunctionControlShift { @@ -763,6 +858,7 @@ enum FunctionControlShift { FunctionControlDontInlineShift = 1, FunctionControlPureShift = 2, FunctionControlConstShift = 3, + FunctionControlOptNoneEXTShift = 16, FunctionControlOptNoneINTELShift = 16, FunctionControlMax = 0x7fffffff, }; @@ -773,6 +869,7 @@ enum FunctionControlMask { FunctionControlDontInlineMask = 0x00000002, FunctionControlPureMask = 0x00000004, FunctionControlConstMask = 0x00000008, + FunctionControlOptNoneEXTMask = 0x00010000, FunctionControlOptNoneINTELMask = 0x00010000, }; @@ -959,6 +1056,17 @@ enum Capability { CapabilityShaderLayer = 69, CapabilityShaderViewportIndex = 70, CapabilityUniformDecoration = 71, + CapabilityCoreBuiltinsARM = 4165, + CapabilityTileImageColorReadAccessEXT = 4166, + CapabilityTileImageDepthReadAccessEXT = 4167, + CapabilityTileImageStencilReadAccessEXT = 4168, + CapabilityTensorsARM = 4174, + CapabilityStorageTensorArrayDynamicIndexingARM = 4175, + CapabilityStorageTensorArrayNonUniformIndexingARM = 4176, + CapabilityGraphARM = 4191, + CapabilityCooperativeMatrixLayoutsARM = 4201, + CapabilityFloat8EXT = 4212, + CapabilityFloat8CooperativeMatrixEXT = 4213, CapabilityFragmentShadingRateKHR = 4422, CapabilitySubgroupBallotKHR = 4423, CapabilityDrawParameters = 4427, @@ -988,11 +1096,13 @@ enum Capability { CapabilityRoundingModeRTZ = 4468, CapabilityRayQueryProvisionalKHR = 4471, CapabilityRayQueryKHR = 4472, + CapabilityUntypedPointersKHR = 4473, CapabilityRayTraversalPrimitiveCullingKHR = 4478, CapabilityRayTracingKHR = 4479, CapabilityTextureSampleWeightedQCOM = 4484, CapabilityTextureBoxFilterQCOM = 4485, 
CapabilityTextureBlockMatchQCOM = 4486, + CapabilityTileShadingQCOM = 4495, CapabilityTextureBlockMatch2QCOM = 4498, CapabilityFloat16ImageAMD = 5008, CapabilityImageGatherBiasLodAMD = 5009, @@ -1001,6 +1111,13 @@ enum Capability { CapabilityImageReadWriteLodAMD = 5015, CapabilityInt64ImageEXT = 5016, CapabilityShaderClockKHR = 5055, + CapabilityShaderEnqueueAMDX = 5067, + CapabilityQuadControlKHR = 5087, + CapabilityInt4TypeINTEL = 5112, + CapabilityInt4CooperativeMatrixINTEL = 5114, + CapabilityBFloat16TypeKHR = 5116, + CapabilityBFloat16DotProductKHR = 5117, + CapabilityBFloat16CooperativeMatrixKHR = 5118, CapabilitySampleMaskOverrideCoverageNV = 5249, CapabilityGeometryShaderPassthroughNV = 5251, CapabilityShaderViewportIndexLayerEXT = 5254, @@ -1014,6 +1131,7 @@ enum Capability { CapabilityMeshShadingEXT = 5283, CapabilityFragmentBarycentricKHR = 5284, CapabilityFragmentBarycentricNV = 5284, + CapabilityComputeDerivativeGroupQuadsKHR = 5288, CapabilityComputeDerivativeGroupQuadsNV = 5288, CapabilityFragmentDensityEXT = 5291, CapabilityShadingRateNV = 5291, @@ -1042,6 +1160,7 @@ enum Capability { CapabilityUniformTexelBufferArrayNonUniformIndexingEXT = 5311, CapabilityStorageTexelBufferArrayNonUniformIndexing = 5312, CapabilityStorageTexelBufferArrayNonUniformIndexingEXT = 5312, + CapabilityRayTracingPositionFetchKHR = 5336, CapabilityRayTracingNV = 5340, CapabilityRayTracingMotionBlurNV = 5341, CapabilityVulkanMemoryModel = 5345, @@ -1050,6 +1169,7 @@ enum Capability { CapabilityVulkanMemoryModelDeviceScopeKHR = 5346, CapabilityPhysicalStorageBufferAddresses = 5347, CapabilityPhysicalStorageBufferAddressesEXT = 5347, + CapabilityComputeDerivativeGroupLinearKHR = 5350, CapabilityComputeDerivativeGroupLinearNV = 5350, CapabilityRayTracingProvisionalKHR = 5353, CapabilityCooperativeMatrixNV = 5357, @@ -1059,7 +1179,25 @@ enum Capability { CapabilityFragmentShaderPixelInterlockEXT = 5378, CapabilityDemoteToHelperInvocation = 5379, CapabilityDemoteToHelperInvocationEXT = 5379, + CapabilityDisplacementMicromapNV = 5380, + CapabilityRayTracingOpacityMicromapEXT = 5381, + CapabilityShaderInvocationReorderNV = 5383, CapabilityBindlessTextureNV = 5390, + CapabilityRayQueryPositionFetchKHR = 5391, + CapabilityCooperativeVectorNV = 5394, + CapabilityAtomicFloat16VectorNV = 5404, + CapabilityRayTracingDisplacementMicromapNV = 5409, + CapabilityRawAccessChainsNV = 5414, + CapabilityRayTracingSpheresGeometryNV = 5418, + CapabilityRayTracingLinearSweptSpheresGeometryNV = 5419, + CapabilityCooperativeMatrixReductionsNV = 5430, + CapabilityCooperativeMatrixConversionsNV = 5431, + CapabilityCooperativeMatrixPerElementOperationsNV = 5432, + CapabilityCooperativeMatrixTensorAddressingNV = 5433, + CapabilityCooperativeMatrixBlockLoadsNV = 5434, + CapabilityCooperativeVectorTrainingNV = 5435, + CapabilityRayTracingClusterAccelerationStructureNV = 5437, + CapabilityTensorAddressingNV = 5439, CapabilitySubgroupShuffleINTEL = 5568, CapabilitySubgroupBufferBlockIOINTEL = 5569, CapabilitySubgroupImageBlockIOINTEL = 5570, @@ -1092,10 +1230,13 @@ enum Capability { CapabilityFPGAMemoryAccessesINTEL = 5898, CapabilityFPGAClusterAttributesINTEL = 5904, CapabilityLoopFuseINTEL = 5906, + CapabilityFPGADSPControlINTEL = 5908, CapabilityMemoryAccessAliasingINTEL = 5910, + CapabilityFPGAInvocationPipeliningAttributesINTEL = 5916, CapabilityFPGABufferLocationINTEL = 5920, CapabilityArbitraryPrecisionFixedPointINTEL = 5922, CapabilityUSMStorageClassesINTEL = 5935, + CapabilityRuntimeAlignedAttributeINTEL = 5939, 
CapabilityIOPipesINTEL = 5943, CapabilityBlockingPipesINTEL = 5945, CapabilityFPGARegINTEL = 5948, @@ -1108,16 +1249,41 @@ enum Capability { CapabilityDotProduct = 6019, CapabilityDotProductKHR = 6019, CapabilityRayCullMaskKHR = 6020, + CapabilityCooperativeMatrixKHR = 6022, + CapabilityReplicatedCompositesEXT = 6024, CapabilityBitInstructions = 6025, CapabilityGroupNonUniformRotateKHR = 6026, + CapabilityFloatControls2 = 6029, CapabilityAtomicFloat32AddEXT = 6033, CapabilityAtomicFloat64AddEXT = 6034, - CapabilityLongConstantCompositeINTEL = 6089, + CapabilityLongCompositesINTEL = 6089, + CapabilityOptNoneEXT = 6094, CapabilityOptNoneINTEL = 6094, CapabilityAtomicFloat16AddEXT = 6095, CapabilityDebugInfoModuleINTEL = 6114, + CapabilityBFloat16ConversionINTEL = 6115, CapabilitySplitBarrierINTEL = 6141, + CapabilityArithmeticFenceEXT = 6144, + CapabilityFPGAClusterAttributesV2INTEL = 6150, + CapabilityFPGAKernelAttributesv2INTEL = 6161, + CapabilityTaskSequenceINTEL = 6162, + CapabilityFPMaxErrorINTEL = 6169, + CapabilityFPGALatencyControlINTEL = 6171, + CapabilityFPGAArgumentInterfacesINTEL = 6174, + CapabilityGlobalVariableHostAccessINTEL = 6187, + CapabilityGlobalVariableFPGADecorationsINTEL = 6189, + CapabilitySubgroupBufferPrefetchINTEL = 6220, + CapabilitySubgroup2DBlockIOINTEL = 6228, + CapabilitySubgroup2DBlockTransformINTEL = 6229, + CapabilitySubgroup2DBlockTransposeINTEL = 6230, + CapabilitySubgroupMatrixMultiplyAccumulateINTEL = 6236, + CapabilityTernaryBitwiseFunctionINTEL = 6241, CapabilityGroupUniformArithmeticKHR = 6400, + CapabilityTensorFloat32RoundingINTEL = 6425, + CapabilityMaskedGatherScatterINTEL = 6427, + CapabilityCacheControlsINTEL = 6441, + CapabilityRegisterLimitsINTEL = 6460, + CapabilityBindlessImagesINTEL = 6528, CapabilityMax = 0x7fffffff, }; @@ -1130,8 +1296,10 @@ enum RayFlagsShift { RayFlagsCullFrontFacingTrianglesKHRShift = 5, RayFlagsCullOpaqueKHRShift = 6, RayFlagsCullNoOpaqueKHRShift = 7, + RayFlagsSkipBuiltinPrimitivesNVShift = 8, RayFlagsSkipTrianglesKHRShift = 8, RayFlagsSkipAABBsKHRShift = 9, + RayFlagsForceOpacityMicromap2StateEXTShift = 10, RayFlagsMax = 0x7fffffff, }; @@ -1145,8 +1313,10 @@ enum RayFlagsMask { RayFlagsCullFrontFacingTrianglesKHRMask = 0x00000020, RayFlagsCullOpaqueKHRMask = 0x00000040, RayFlagsCullNoOpaqueKHRMask = 0x00000080, + RayFlagsSkipBuiltinPrimitivesNVMask = 0x00000100, RayFlagsSkipTrianglesKHRMask = 0x00000100, RayFlagsSkipAABBsKHRMask = 0x00000200, + RayFlagsForceOpacityMicromap2StateEXTMask = 0x00000400, }; enum RayQueryIntersection { @@ -1222,6 +1392,210 @@ enum PackedVectorFormat { PackedVectorFormatMax = 0x7fffffff, }; +enum CooperativeMatrixOperandsShift { + CooperativeMatrixOperandsMatrixASignedComponentsKHRShift = 0, + CooperativeMatrixOperandsMatrixBSignedComponentsKHRShift = 1, + CooperativeMatrixOperandsMatrixCSignedComponentsKHRShift = 2, + CooperativeMatrixOperandsMatrixResultSignedComponentsKHRShift = 3, + CooperativeMatrixOperandsSaturatingAccumulationKHRShift = 4, + CooperativeMatrixOperandsMax = 0x7fffffff, +}; + +enum CooperativeMatrixOperandsMask { + CooperativeMatrixOperandsMaskNone = 0, + CooperativeMatrixOperandsMatrixASignedComponentsKHRMask = 0x00000001, + CooperativeMatrixOperandsMatrixBSignedComponentsKHRMask = 0x00000002, + CooperativeMatrixOperandsMatrixCSignedComponentsKHRMask = 0x00000004, + CooperativeMatrixOperandsMatrixResultSignedComponentsKHRMask = 0x00000008, + CooperativeMatrixOperandsSaturatingAccumulationKHRMask = 0x00000010, +}; + +enum CooperativeMatrixLayout { + 
CooperativeMatrixLayoutRowMajorKHR = 0, + CooperativeMatrixLayoutColumnMajorKHR = 1, + CooperativeMatrixLayoutRowBlockedInterleavedARM = 4202, + CooperativeMatrixLayoutColumnBlockedInterleavedARM = 4203, + CooperativeMatrixLayoutMax = 0x7fffffff, +}; + +enum CooperativeMatrixUse { + CooperativeMatrixUseMatrixAKHR = 0, + CooperativeMatrixUseMatrixBKHR = 1, + CooperativeMatrixUseMatrixAccumulatorKHR = 2, + CooperativeMatrixUseMax = 0x7fffffff, +}; + +enum CooperativeMatrixReduceShift { + CooperativeMatrixReduceRowShift = 0, + CooperativeMatrixReduceColumnShift = 1, + CooperativeMatrixReduce2x2Shift = 2, + CooperativeMatrixReduceMax = 0x7fffffff, +}; + +enum CooperativeMatrixReduceMask { + CooperativeMatrixReduceMaskNone = 0, + CooperativeMatrixReduceRowMask = 0x00000001, + CooperativeMatrixReduceColumnMask = 0x00000002, + CooperativeMatrixReduce2x2Mask = 0x00000004, +}; + +enum TensorClampMode { + TensorClampModeUndefined = 0, + TensorClampModeConstant = 1, + TensorClampModeClampToEdge = 2, + TensorClampModeRepeat = 3, + TensorClampModeRepeatMirrored = 4, + TensorClampModeMax = 0x7fffffff, +}; + +enum TensorAddressingOperandsShift { + TensorAddressingOperandsTensorViewShift = 0, + TensorAddressingOperandsDecodeFuncShift = 1, + TensorAddressingOperandsMax = 0x7fffffff, +}; + +enum TensorAddressingOperandsMask { + TensorAddressingOperandsMaskNone = 0, + TensorAddressingOperandsTensorViewMask = 0x00000001, + TensorAddressingOperandsDecodeFuncMask = 0x00000002, +}; + +enum TensorOperandsShift { + TensorOperandsNontemporalARMShift = 0, + TensorOperandsOutOfBoundsValueARMShift = 1, + TensorOperandsMakeElementAvailableARMShift = 2, + TensorOperandsMakeElementVisibleARMShift = 3, + TensorOperandsNonPrivateElementARMShift = 4, + TensorOperandsMax = 0x7fffffff, +}; + +enum TensorOperandsMask { + TensorOperandsMaskNone = 0, + TensorOperandsNontemporalARMMask = 0x00000001, + TensorOperandsOutOfBoundsValueARMMask = 0x00000002, + TensorOperandsMakeElementAvailableARMMask = 0x00000004, + TensorOperandsMakeElementVisibleARMMask = 0x00000008, + TensorOperandsNonPrivateElementARMMask = 0x00000010, +}; + +enum InitializationModeQualifier { + InitializationModeQualifierInitOnDeviceReprogramINTEL = 0, + InitializationModeQualifierInitOnDeviceResetINTEL = 1, + InitializationModeQualifierMax = 0x7fffffff, +}; + +enum HostAccessQualifier { + HostAccessQualifierNoneINTEL = 0, + HostAccessQualifierReadINTEL = 1, + HostAccessQualifierWriteINTEL = 2, + HostAccessQualifierReadWriteINTEL = 3, + HostAccessQualifierMax = 0x7fffffff, +}; + +enum LoadCacheControl { + LoadCacheControlUncachedINTEL = 0, + LoadCacheControlCachedINTEL = 1, + LoadCacheControlStreamingINTEL = 2, + LoadCacheControlInvalidateAfterReadINTEL = 3, + LoadCacheControlConstCachedINTEL = 4, + LoadCacheControlMax = 0x7fffffff, +}; + +enum StoreCacheControl { + StoreCacheControlUncachedINTEL = 0, + StoreCacheControlWriteThroughINTEL = 1, + StoreCacheControlWriteBackINTEL = 2, + StoreCacheControlStreamingINTEL = 3, + StoreCacheControlMax = 0x7fffffff, +}; + +enum NamedMaximumNumberOfRegisters { + NamedMaximumNumberOfRegistersAutoINTEL = 0, + NamedMaximumNumberOfRegistersMax = 0x7fffffff, +}; + +enum MatrixMultiplyAccumulateOperandsShift { + MatrixMultiplyAccumulateOperandsMatrixASignedComponentsINTELShift = 0, + MatrixMultiplyAccumulateOperandsMatrixBSignedComponentsINTELShift = 1, + MatrixMultiplyAccumulateOperandsMatrixCBFloat16INTELShift = 2, + MatrixMultiplyAccumulateOperandsMatrixResultBFloat16INTELShift = 3, + 
MatrixMultiplyAccumulateOperandsMatrixAPackedInt8INTELShift = 4, + MatrixMultiplyAccumulateOperandsMatrixBPackedInt8INTELShift = 5, + MatrixMultiplyAccumulateOperandsMatrixAPackedInt4INTELShift = 6, + MatrixMultiplyAccumulateOperandsMatrixBPackedInt4INTELShift = 7, + MatrixMultiplyAccumulateOperandsMatrixATF32INTELShift = 8, + MatrixMultiplyAccumulateOperandsMatrixBTF32INTELShift = 9, + MatrixMultiplyAccumulateOperandsMatrixAPackedFloat16INTELShift = 10, + MatrixMultiplyAccumulateOperandsMatrixBPackedFloat16INTELShift = 11, + MatrixMultiplyAccumulateOperandsMatrixAPackedBFloat16INTELShift = 12, + MatrixMultiplyAccumulateOperandsMatrixBPackedBFloat16INTELShift = 13, + MatrixMultiplyAccumulateOperandsMax = 0x7fffffff, +}; + +enum MatrixMultiplyAccumulateOperandsMask { + MatrixMultiplyAccumulateOperandsMaskNone = 0, + MatrixMultiplyAccumulateOperandsMatrixASignedComponentsINTELMask = 0x00000001, + MatrixMultiplyAccumulateOperandsMatrixBSignedComponentsINTELMask = 0x00000002, + MatrixMultiplyAccumulateOperandsMatrixCBFloat16INTELMask = 0x00000004, + MatrixMultiplyAccumulateOperandsMatrixResultBFloat16INTELMask = 0x00000008, + MatrixMultiplyAccumulateOperandsMatrixAPackedInt8INTELMask = 0x00000010, + MatrixMultiplyAccumulateOperandsMatrixBPackedInt8INTELMask = 0x00000020, + MatrixMultiplyAccumulateOperandsMatrixAPackedInt4INTELMask = 0x00000040, + MatrixMultiplyAccumulateOperandsMatrixBPackedInt4INTELMask = 0x00000080, + MatrixMultiplyAccumulateOperandsMatrixATF32INTELMask = 0x00000100, + MatrixMultiplyAccumulateOperandsMatrixBTF32INTELMask = 0x00000200, + MatrixMultiplyAccumulateOperandsMatrixAPackedFloat16INTELMask = 0x00000400, + MatrixMultiplyAccumulateOperandsMatrixBPackedFloat16INTELMask = 0x00000800, + MatrixMultiplyAccumulateOperandsMatrixAPackedBFloat16INTELMask = 0x00001000, + MatrixMultiplyAccumulateOperandsMatrixBPackedBFloat16INTELMask = 0x00002000, +}; + +enum RawAccessChainOperandsShift { + RawAccessChainOperandsRobustnessPerComponentNVShift = 0, + RawAccessChainOperandsRobustnessPerElementNVShift = 1, + RawAccessChainOperandsMax = 0x7fffffff, +}; + +enum RawAccessChainOperandsMask { + RawAccessChainOperandsMaskNone = 0, + RawAccessChainOperandsRobustnessPerComponentNVMask = 0x00000001, + RawAccessChainOperandsRobustnessPerElementNVMask = 0x00000002, +}; + +enum FPEncoding { + FPEncodingBFloat16KHR = 0, + FPEncodingFloat8E4M3EXT = 4214, + FPEncodingFloat8E5M2EXT = 4215, + FPEncodingMax = 0x7fffffff, +}; + +enum CooperativeVectorMatrixLayout { + CooperativeVectorMatrixLayoutRowMajorNV = 0, + CooperativeVectorMatrixLayoutColumnMajorNV = 1, + CooperativeVectorMatrixLayoutInferencingOptimalNV = 2, + CooperativeVectorMatrixLayoutTrainingOptimalNV = 3, + CooperativeVectorMatrixLayoutMax = 0x7fffffff, +}; + +enum ComponentType { + ComponentTypeFloat16NV = 0, + ComponentTypeFloat32NV = 1, + ComponentTypeFloat64NV = 2, + ComponentTypeSignedInt8NV = 3, + ComponentTypeSignedInt16NV = 4, + ComponentTypeSignedInt32NV = 5, + ComponentTypeSignedInt64NV = 6, + ComponentTypeUnsignedInt8NV = 7, + ComponentTypeUnsignedInt16NV = 8, + ComponentTypeUnsignedInt32NV = 9, + ComponentTypeUnsignedInt64NV = 10, + ComponentTypeSignedInt8PackedNV = 1000491000, + ComponentTypeUnsignedInt8PackedNV = 1000491001, + ComponentTypeFloatE4M3NV = 1000491002, + ComponentTypeFloatE5M2NV = 1000491003, + ComponentTypeMax = 0x7fffffff, +}; + enum Op { OpNop = 0, OpUndef = 1, @@ -1567,14 +1941,37 @@ enum Op { OpPtrEqual = 401, OpPtrNotEqual = 402, OpPtrDiff = 403, + OpColorAttachmentReadEXT = 4160, + 
OpDepthAttachmentReadEXT = 4161, + OpStencilAttachmentReadEXT = 4162, + OpTypeTensorARM = 4163, + OpTensorReadARM = 4164, + OpTensorWriteARM = 4165, + OpTensorQuerySizeARM = 4166, + OpGraphConstantARM = 4181, + OpGraphEntryPointARM = 4182, + OpGraphARM = 4183, + OpGraphInputARM = 4184, + OpGraphSetOutputARM = 4185, + OpGraphEndARM = 4186, + OpTypeGraphARM = 4190, OpTerminateInvocation = 4416, + OpTypeUntypedPointerKHR = 4417, + OpUntypedVariableKHR = 4418, + OpUntypedAccessChainKHR = 4419, + OpUntypedInBoundsAccessChainKHR = 4420, OpSubgroupBallotKHR = 4421, OpSubgroupFirstInvocationKHR = 4422, + OpUntypedPtrAccessChainKHR = 4423, + OpUntypedInBoundsPtrAccessChainKHR = 4424, + OpUntypedArrayLengthKHR = 4425, + OpUntypedPrefetchKHR = 4426, OpSubgroupAllKHR = 4428, OpSubgroupAnyKHR = 4429, OpSubgroupAllEqualKHR = 4430, OpGroupNonUniformRotateKHR = 4431, OpSubgroupReadInvocationKHR = 4432, + OpExtInstWithForwardRefsKHR = 4433, OpTraceRayKHR = 4445, OpExecuteCallableKHR = 4446, OpConvertUToAccelerationStructureKHR = 4447, @@ -1592,6 +1989,14 @@ enum Op { OpUDotAccSatKHR = 4454, OpSUDotAccSat = 4455, OpSUDotAccSatKHR = 4455, + OpTypeCooperativeMatrixKHR = 4456, + OpCooperativeMatrixLoadKHR = 4457, + OpCooperativeMatrixStoreKHR = 4458, + OpCooperativeMatrixMulAddKHR = 4459, + OpCooperativeMatrixLengthKHR = 4460, + OpConstantCompositeReplicateEXT = 4461, + OpSpecConstantCompositeReplicateEXT = 4462, + OpCompositeConstructReplicateEXT = 4463, OpTypeRayQueryKHR = 4472, OpRayQueryInitializeKHR = 4473, OpRayQueryTerminateKHR = 4474, @@ -1618,11 +2023,64 @@ enum Op { OpFragmentMaskFetchAMD = 5011, OpFragmentFetchAMD = 5012, OpReadClockKHR = 5056, + OpAllocateNodePayloadsAMDX = 5074, + OpEnqueueNodePayloadsAMDX = 5075, + OpTypeNodePayloadArrayAMDX = 5076, + OpFinishWritingNodePayloadAMDX = 5078, + OpNodePayloadArrayLengthAMDX = 5090, + OpIsNodePayloadValidAMDX = 5101, + OpConstantStringAMDX = 5103, + OpSpecConstantStringAMDX = 5104, + OpGroupNonUniformQuadAllKHR = 5110, + OpGroupNonUniformQuadAnyKHR = 5111, + OpHitObjectRecordHitMotionNV = 5249, + OpHitObjectRecordHitWithIndexMotionNV = 5250, + OpHitObjectRecordMissMotionNV = 5251, + OpHitObjectGetWorldToObjectNV = 5252, + OpHitObjectGetObjectToWorldNV = 5253, + OpHitObjectGetObjectRayDirectionNV = 5254, + OpHitObjectGetObjectRayOriginNV = 5255, + OpHitObjectTraceRayMotionNV = 5256, + OpHitObjectGetShaderRecordBufferHandleNV = 5257, + OpHitObjectGetShaderBindingTableRecordIndexNV = 5258, + OpHitObjectRecordEmptyNV = 5259, + OpHitObjectTraceRayNV = 5260, + OpHitObjectRecordHitNV = 5261, + OpHitObjectRecordHitWithIndexNV = 5262, + OpHitObjectRecordMissNV = 5263, + OpHitObjectExecuteShaderNV = 5264, + OpHitObjectGetCurrentTimeNV = 5265, + OpHitObjectGetAttributesNV = 5266, + OpHitObjectGetHitKindNV = 5267, + OpHitObjectGetPrimitiveIndexNV = 5268, + OpHitObjectGetGeometryIndexNV = 5269, + OpHitObjectGetInstanceIdNV = 5270, + OpHitObjectGetInstanceCustomIndexNV = 5271, + OpHitObjectGetWorldRayDirectionNV = 5272, + OpHitObjectGetWorldRayOriginNV = 5273, + OpHitObjectGetRayTMaxNV = 5274, + OpHitObjectGetRayTMinNV = 5275, + OpHitObjectIsEmptyNV = 5276, + OpHitObjectIsHitNV = 5277, + OpHitObjectIsMissNV = 5278, + OpReorderThreadWithHitObjectNV = 5279, + OpReorderThreadWithHintNV = 5280, + OpTypeHitObjectNV = 5281, OpImageSampleFootprintNV = 5283, + OpTypeCooperativeVectorNV = 5288, + OpCooperativeVectorMatrixMulNV = 5289, + OpCooperativeVectorOuterProductAccumulateNV = 5290, + OpCooperativeVectorReduceSumAccumulateNV = 5291, + OpCooperativeVectorMatrixMulAddNV 
= 5292, + OpCooperativeMatrixConvertNV = 5293, OpEmitMeshTasksEXT = 5294, OpSetMeshOutputsEXT = 5295, OpGroupNonUniformPartitionNV = 5296, OpWritePackedPrimitiveIndices4x8NV = 5299, + OpFetchMicroTriangleVertexPositionNV = 5300, + OpFetchMicroTriangleVertexBarycentricNV = 5301, + OpCooperativeVectorLoadNV = 5302, + OpCooperativeVectorStoreNV = 5303, OpReportIntersectionKHR = 5334, OpReportIntersectionNV = 5334, OpIgnoreIntersectionNV = 5335, @@ -1630,9 +2088,12 @@ enum Op { OpTraceNV = 5337, OpTraceMotionNV = 5338, OpTraceRayMotionNV = 5339, + OpRayQueryGetIntersectionTriangleVertexPositionsKHR = 5340, OpTypeAccelerationStructureKHR = 5341, OpTypeAccelerationStructureNV = 5341, OpExecuteCallableNV = 5344, + OpRayQueryGetClusterIdNV = 5345, + OpHitObjectGetClusterIdNV = 5346, OpTypeCooperativeMatrixNV = 5358, OpCooperativeMatrixLoadNV = 5359, OpCooperativeMatrixStoreNV = 5360, @@ -1640,9 +2101,26 @@ enum Op { OpCooperativeMatrixLengthNV = 5362, OpBeginInvocationInterlockEXT = 5364, OpEndInvocationInterlockEXT = 5365, + OpCooperativeMatrixReduceNV = 5366, + OpCooperativeMatrixLoadTensorNV = 5367, + OpCooperativeMatrixStoreTensorNV = 5368, + OpCooperativeMatrixPerElementOpNV = 5369, + OpTypeTensorLayoutNV = 5370, + OpTypeTensorViewNV = 5371, + OpCreateTensorLayoutNV = 5372, + OpTensorLayoutSetDimensionNV = 5373, + OpTensorLayoutSetStrideNV = 5374, + OpTensorLayoutSliceNV = 5375, + OpTensorLayoutSetClampValueNV = 5376, + OpCreateTensorViewNV = 5377, + OpTensorViewSetDimensionNV = 5378, + OpTensorViewSetStrideNV = 5379, OpDemoteToHelperInvocation = 5380, OpDemoteToHelperInvocationEXT = 5380, OpIsHelperInvocationEXT = 5381, + OpTensorViewSetClipNV = 5382, + OpTensorLayoutSetBlockSizeNV = 5384, + OpCooperativeMatrixTransposeNV = 5390, OpConvertUToImageNV = 5391, OpConvertUToSamplerNV = 5392, OpConvertImageToUNV = 5393, @@ -1650,6 +2128,20 @@ enum Op { OpConvertUToSampledImageNV = 5395, OpConvertSampledImageToUNV = 5396, OpSamplerImageAddressingModeNV = 5397, + OpRawAccessChainNV = 5398, + OpRayQueryGetIntersectionSpherePositionNV = 5427, + OpRayQueryGetIntersectionSphereRadiusNV = 5428, + OpRayQueryGetIntersectionLSSPositionsNV = 5429, + OpRayQueryGetIntersectionLSSRadiiNV = 5430, + OpRayQueryGetIntersectionLSSHitValueNV = 5431, + OpHitObjectGetSpherePositionNV = 5432, + OpHitObjectGetSphereRadiusNV = 5433, + OpHitObjectGetLSSPositionsNV = 5434, + OpHitObjectGetLSSRadiiNV = 5435, + OpHitObjectIsSphereHitNV = 5436, + OpHitObjectIsLSSHitNV = 5437, + OpRayQueryIsSphereHitNV = 5438, + OpRayQueryIsLSSHitNV = 5439, OpSubgroupShuffleINTEL = 5571, OpSubgroupShuffleDownINTEL = 5572, OpSubgroupShuffleUpINTEL = 5573, @@ -1891,8 +2383,25 @@ enum Op { OpTypeStructContinuedINTEL = 6090, OpConstantCompositeContinuedINTEL = 6091, OpSpecConstantCompositeContinuedINTEL = 6092, + OpCompositeConstructContinuedINTEL = 6096, + OpConvertFToBF16INTEL = 6116, + OpConvertBF16ToFINTEL = 6117, OpControlBarrierArriveINTEL = 6142, OpControlBarrierWaitINTEL = 6143, + OpArithmeticFenceEXT = 6145, + OpTaskSequenceCreateINTEL = 6163, + OpTaskSequenceAsyncINTEL = 6164, + OpTaskSequenceGetINTEL = 6165, + OpTaskSequenceReleaseINTEL = 6166, + OpTypeTaskSequenceINTEL = 6199, + OpSubgroupBlockPrefetchINTEL = 6221, + OpSubgroup2DBlockLoadINTEL = 6231, + OpSubgroup2DBlockLoadTransformINTEL = 6232, + OpSubgroup2DBlockLoadTransposeINTEL = 6233, + OpSubgroup2DBlockPrefetchINTEL = 6234, + OpSubgroup2DBlockStoreINTEL = 6235, + OpSubgroupMatrixMultiplyAccumulateINTEL = 6237, + OpBitwiseFunctionINTEL = 6242, OpGroupIMulKHR = 6401, 
OpGroupFMulKHR = 6402, OpGroupBitwiseAndKHR = 6403, @@ -1901,6 +2410,12 @@ enum Op { OpGroupLogicalAndKHR = 6406, OpGroupLogicalOrKHR = 6407, OpGroupLogicalXorKHR = 6408, + OpRoundFToTF32INTEL = 6426, + OpMaskedGatherINTEL = 6428, + OpMaskedScatterINTEL = 6429, + OpConvertHandleToImageINTEL = 6529, + OpConvertHandleToSamplerINTEL = 6530, + OpConvertHandleToSampledImageINTEL = 6531, OpMax = 0x7fffffff, }; @@ -2256,14 +2771,37 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) { case OpPtrEqual: *hasResult = true; *hasResultType = true; break; case OpPtrNotEqual: *hasResult = true; *hasResultType = true; break; case OpPtrDiff: *hasResult = true; *hasResultType = true; break; + case OpColorAttachmentReadEXT: *hasResult = true; *hasResultType = true; break; + case OpDepthAttachmentReadEXT: *hasResult = true; *hasResultType = true; break; + case OpStencilAttachmentReadEXT: *hasResult = true; *hasResultType = true; break; + case OpTypeTensorARM: *hasResult = true; *hasResultType = false; break; + case OpTensorReadARM: *hasResult = true; *hasResultType = true; break; + case OpTensorWriteARM: *hasResult = false; *hasResultType = false; break; + case OpTensorQuerySizeARM: *hasResult = true; *hasResultType = true; break; + case OpGraphConstantARM: *hasResult = true; *hasResultType = true; break; + case OpGraphEntryPointARM: *hasResult = false; *hasResultType = false; break; + case OpGraphARM: *hasResult = true; *hasResultType = true; break; + case OpGraphInputARM: *hasResult = true; *hasResultType = true; break; + case OpGraphSetOutputARM: *hasResult = false; *hasResultType = false; break; + case OpGraphEndARM: *hasResult = false; *hasResultType = false; break; + case OpTypeGraphARM: *hasResult = true; *hasResultType = false; break; case OpTerminateInvocation: *hasResult = false; *hasResultType = false; break; + case OpTypeUntypedPointerKHR: *hasResult = true; *hasResultType = false; break; + case OpUntypedVariableKHR: *hasResult = true; *hasResultType = true; break; + case OpUntypedAccessChainKHR: *hasResult = true; *hasResultType = true; break; + case OpUntypedInBoundsAccessChainKHR: *hasResult = true; *hasResultType = true; break; case OpSubgroupBallotKHR: *hasResult = true; *hasResultType = true; break; case OpSubgroupFirstInvocationKHR: *hasResult = true; *hasResultType = true; break; + case OpUntypedPtrAccessChainKHR: *hasResult = true; *hasResultType = true; break; + case OpUntypedInBoundsPtrAccessChainKHR: *hasResult = true; *hasResultType = true; break; + case OpUntypedArrayLengthKHR: *hasResult = true; *hasResultType = true; break; + case OpUntypedPrefetchKHR: *hasResult = false; *hasResultType = false; break; case OpSubgroupAllKHR: *hasResult = true; *hasResultType = true; break; case OpSubgroupAnyKHR: *hasResult = true; *hasResultType = true; break; case OpSubgroupAllEqualKHR: *hasResult = true; *hasResultType = true; break; case OpGroupNonUniformRotateKHR: *hasResult = true; *hasResultType = true; break; case OpSubgroupReadInvocationKHR: *hasResult = true; *hasResultType = true; break; + case OpExtInstWithForwardRefsKHR: *hasResult = true; *hasResultType = true; break; case OpTraceRayKHR: *hasResult = false; *hasResultType = false; break; case OpExecuteCallableKHR: *hasResult = false; *hasResultType = false; break; case OpConvertUToAccelerationStructureKHR: *hasResult = true; *hasResultType = true; break; @@ -2275,6 +2813,14 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) { case OpSDotAccSat: *hasResult = true; 
*hasResultType = true; break; case OpUDotAccSat: *hasResult = true; *hasResultType = true; break; case OpSUDotAccSat: *hasResult = true; *hasResultType = true; break; + case OpTypeCooperativeMatrixKHR: *hasResult = true; *hasResultType = false; break; + case OpCooperativeMatrixLoadKHR: *hasResult = true; *hasResultType = true; break; + case OpCooperativeMatrixStoreKHR: *hasResult = false; *hasResultType = false; break; + case OpCooperativeMatrixMulAddKHR: *hasResult = true; *hasResultType = true; break; + case OpCooperativeMatrixLengthKHR: *hasResult = true; *hasResultType = true; break; + case OpConstantCompositeReplicateEXT: *hasResult = true; *hasResultType = true; break; + case OpSpecConstantCompositeReplicateEXT: *hasResult = true; *hasResultType = true; break; + case OpCompositeConstructReplicateEXT: *hasResult = true; *hasResultType = true; break; case OpTypeRayQueryKHR: *hasResult = true; *hasResultType = false; break; case OpRayQueryInitializeKHR: *hasResult = false; *hasResultType = false; break; case OpRayQueryTerminateKHR: *hasResult = false; *hasResultType = false; break; @@ -2301,19 +2847,75 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) { case OpFragmentMaskFetchAMD: *hasResult = true; *hasResultType = true; break; case OpFragmentFetchAMD: *hasResult = true; *hasResultType = true; break; case OpReadClockKHR: *hasResult = true; *hasResultType = true; break; + case OpAllocateNodePayloadsAMDX: *hasResult = true; *hasResultType = true; break; + case OpEnqueueNodePayloadsAMDX: *hasResult = false; *hasResultType = false; break; + case OpTypeNodePayloadArrayAMDX: *hasResult = true; *hasResultType = false; break; + case OpFinishWritingNodePayloadAMDX: *hasResult = true; *hasResultType = true; break; + case OpNodePayloadArrayLengthAMDX: *hasResult = true; *hasResultType = true; break; + case OpIsNodePayloadValidAMDX: *hasResult = true; *hasResultType = true; break; + case OpConstantStringAMDX: *hasResult = true; *hasResultType = false; break; + case OpSpecConstantStringAMDX: *hasResult = true; *hasResultType = false; break; + case OpGroupNonUniformQuadAllKHR: *hasResult = true; *hasResultType = true; break; + case OpGroupNonUniformQuadAnyKHR: *hasResult = true; *hasResultType = true; break; + case OpHitObjectRecordHitMotionNV: *hasResult = false; *hasResultType = false; break; + case OpHitObjectRecordHitWithIndexMotionNV: *hasResult = false; *hasResultType = false; break; + case OpHitObjectRecordMissMotionNV: *hasResult = false; *hasResultType = false; break; + case OpHitObjectGetWorldToObjectNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetObjectToWorldNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetObjectRayDirectionNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetObjectRayOriginNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectTraceRayMotionNV: *hasResult = false; *hasResultType = false; break; + case OpHitObjectGetShaderRecordBufferHandleNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetShaderBindingTableRecordIndexNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectRecordEmptyNV: *hasResult = false; *hasResultType = false; break; + case OpHitObjectTraceRayNV: *hasResult = false; *hasResultType = false; break; + case OpHitObjectRecordHitNV: *hasResult = false; *hasResultType = false; break; + case OpHitObjectRecordHitWithIndexNV: *hasResult = false; *hasResultType = false; break; + case 
OpHitObjectRecordMissNV: *hasResult = false; *hasResultType = false; break; + case OpHitObjectExecuteShaderNV: *hasResult = false; *hasResultType = false; break; + case OpHitObjectGetCurrentTimeNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetAttributesNV: *hasResult = false; *hasResultType = false; break; + case OpHitObjectGetHitKindNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetPrimitiveIndexNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetGeometryIndexNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetInstanceIdNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetInstanceCustomIndexNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetWorldRayDirectionNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetWorldRayOriginNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetRayTMaxNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetRayTMinNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectIsEmptyNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectIsHitNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectIsMissNV: *hasResult = true; *hasResultType = true; break; + case OpReorderThreadWithHitObjectNV: *hasResult = false; *hasResultType = false; break; + case OpReorderThreadWithHintNV: *hasResult = false; *hasResultType = false; break; + case OpTypeHitObjectNV: *hasResult = true; *hasResultType = false; break; case OpImageSampleFootprintNV: *hasResult = true; *hasResultType = true; break; + case OpTypeCooperativeVectorNV: *hasResult = true; *hasResultType = false; break; + case OpCooperativeVectorMatrixMulNV: *hasResult = true; *hasResultType = true; break; + case OpCooperativeVectorOuterProductAccumulateNV: *hasResult = false; *hasResultType = false; break; + case OpCooperativeVectorReduceSumAccumulateNV: *hasResult = false; *hasResultType = false; break; + case OpCooperativeVectorMatrixMulAddNV: *hasResult = true; *hasResultType = true; break; + case OpCooperativeMatrixConvertNV: *hasResult = true; *hasResultType = true; break; case OpEmitMeshTasksEXT: *hasResult = false; *hasResultType = false; break; case OpSetMeshOutputsEXT: *hasResult = false; *hasResultType = false; break; case OpGroupNonUniformPartitionNV: *hasResult = true; *hasResultType = true; break; case OpWritePackedPrimitiveIndices4x8NV: *hasResult = false; *hasResultType = false; break; - case OpReportIntersectionNV: *hasResult = true; *hasResultType = true; break; + case OpFetchMicroTriangleVertexPositionNV: *hasResult = true; *hasResultType = true; break; + case OpFetchMicroTriangleVertexBarycentricNV: *hasResult = true; *hasResultType = true; break; + case OpCooperativeVectorLoadNV: *hasResult = true; *hasResultType = true; break; + case OpCooperativeVectorStoreNV: *hasResult = false; *hasResultType = false; break; + case OpReportIntersectionKHR: *hasResult = true; *hasResultType = true; break; case OpIgnoreIntersectionNV: *hasResult = false; *hasResultType = false; break; case OpTerminateRayNV: *hasResult = false; *hasResultType = false; break; case OpTraceNV: *hasResult = false; *hasResultType = false; break; case OpTraceMotionNV: *hasResult = false; *hasResultType = false; break; case OpTraceRayMotionNV: *hasResult = false; *hasResultType = false; break; - case OpTypeAccelerationStructureNV: *hasResult = true; *hasResultType = false; break; + case 
OpRayQueryGetIntersectionTriangleVertexPositionsKHR: *hasResult = true; *hasResultType = true; break; + case OpTypeAccelerationStructureKHR: *hasResult = true; *hasResultType = false; break; case OpExecuteCallableNV: *hasResult = false; *hasResultType = false; break; + case OpRayQueryGetClusterIdNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetClusterIdNV: *hasResult = true; *hasResultType = true; break; case OpTypeCooperativeMatrixNV: *hasResult = true; *hasResultType = false; break; case OpCooperativeMatrixLoadNV: *hasResult = true; *hasResultType = true; break; case OpCooperativeMatrixStoreNV: *hasResult = false; *hasResultType = false; break; @@ -2321,8 +2923,25 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) { case OpCooperativeMatrixLengthNV: *hasResult = true; *hasResultType = true; break; case OpBeginInvocationInterlockEXT: *hasResult = false; *hasResultType = false; break; case OpEndInvocationInterlockEXT: *hasResult = false; *hasResultType = false; break; + case OpCooperativeMatrixReduceNV: *hasResult = true; *hasResultType = true; break; + case OpCooperativeMatrixLoadTensorNV: *hasResult = true; *hasResultType = true; break; + case OpCooperativeMatrixStoreTensorNV: *hasResult = false; *hasResultType = false; break; + case OpCooperativeMatrixPerElementOpNV: *hasResult = true; *hasResultType = true; break; + case OpTypeTensorLayoutNV: *hasResult = true; *hasResultType = false; break; + case OpTypeTensorViewNV: *hasResult = true; *hasResultType = false; break; + case OpCreateTensorLayoutNV: *hasResult = true; *hasResultType = true; break; + case OpTensorLayoutSetDimensionNV: *hasResult = true; *hasResultType = true; break; + case OpTensorLayoutSetStrideNV: *hasResult = true; *hasResultType = true; break; + case OpTensorLayoutSliceNV: *hasResult = true; *hasResultType = true; break; + case OpTensorLayoutSetClampValueNV: *hasResult = true; *hasResultType = true; break; + case OpCreateTensorViewNV: *hasResult = true; *hasResultType = true; break; + case OpTensorViewSetDimensionNV: *hasResult = true; *hasResultType = true; break; + case OpTensorViewSetStrideNV: *hasResult = true; *hasResultType = true; break; case OpDemoteToHelperInvocation: *hasResult = false; *hasResultType = false; break; case OpIsHelperInvocationEXT: *hasResult = true; *hasResultType = true; break; + case OpTensorViewSetClipNV: *hasResult = true; *hasResultType = true; break; + case OpTensorLayoutSetBlockSizeNV: *hasResult = true; *hasResultType = true; break; + case OpCooperativeMatrixTransposeNV: *hasResult = true; *hasResultType = true; break; case OpConvertUToImageNV: *hasResult = true; *hasResultType = true; break; case OpConvertUToSamplerNV: *hasResult = true; *hasResultType = true; break; case OpConvertImageToUNV: *hasResult = true; *hasResultType = true; break; @@ -2330,6 +2949,20 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) { case OpConvertUToSampledImageNV: *hasResult = true; *hasResultType = true; break; case OpConvertSampledImageToUNV: *hasResult = true; *hasResultType = true; break; case OpSamplerImageAddressingModeNV: *hasResult = false; *hasResultType = false; break; + case OpRawAccessChainNV: *hasResult = true; *hasResultType = true; break; + case OpRayQueryGetIntersectionSpherePositionNV: *hasResult = true; *hasResultType = true; break; + case OpRayQueryGetIntersectionSphereRadiusNV: *hasResult = true; *hasResultType = true; break; + case OpRayQueryGetIntersectionLSSPositionsNV: *hasResult = true; 
*hasResultType = true; break; + case OpRayQueryGetIntersectionLSSRadiiNV: *hasResult = true; *hasResultType = true; break; + case OpRayQueryGetIntersectionLSSHitValueNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetSpherePositionNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetSphereRadiusNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetLSSPositionsNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectGetLSSRadiiNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectIsSphereHitNV: *hasResult = true; *hasResultType = true; break; + case OpHitObjectIsLSSHitNV: *hasResult = true; *hasResultType = true; break; + case OpRayQueryIsSphereHitNV: *hasResult = true; *hasResultType = true; break; + case OpRayQueryIsLSSHitNV: *hasResult = true; *hasResultType = true; break; case OpSubgroupShuffleINTEL: *hasResult = true; *hasResultType = true; break; case OpSubgroupShuffleDownINTEL: *hasResult = true; *hasResultType = true; break; case OpSubgroupShuffleUpINTEL: *hasResult = true; *hasResultType = true; break; @@ -2356,7 +2989,7 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) { case OpUMul32x16INTEL: *hasResult = true; *hasResultType = true; break; case OpConstantFunctionPointerINTEL: *hasResult = true; *hasResultType = true; break; case OpFunctionPointerCallINTEL: *hasResult = true; *hasResultType = true; break; - case OpAsmTargetINTEL: *hasResult = true; *hasResultType = true; break; + case OpAsmTargetINTEL: *hasResult = true; *hasResultType = false; break; case OpAsmINTEL: *hasResult = true; *hasResultType = true; break; case OpAsmCallINTEL: *hasResult = true; *hasResultType = true; break; case OpAtomicFMinEXT: *hasResult = true; *hasResultType = true; break; @@ -2569,8 +3202,25 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) { case OpTypeStructContinuedINTEL: *hasResult = false; *hasResultType = false; break; case OpConstantCompositeContinuedINTEL: *hasResult = false; *hasResultType = false; break; case OpSpecConstantCompositeContinuedINTEL: *hasResult = false; *hasResultType = false; break; + case OpCompositeConstructContinuedINTEL: *hasResult = true; *hasResultType = true; break; + case OpConvertFToBF16INTEL: *hasResult = true; *hasResultType = true; break; + case OpConvertBF16ToFINTEL: *hasResult = true; *hasResultType = true; break; case OpControlBarrierArriveINTEL: *hasResult = false; *hasResultType = false; break; case OpControlBarrierWaitINTEL: *hasResult = false; *hasResultType = false; break; + case OpArithmeticFenceEXT: *hasResult = true; *hasResultType = true; break; + case OpTaskSequenceCreateINTEL: *hasResult = true; *hasResultType = true; break; + case OpTaskSequenceAsyncINTEL: *hasResult = false; *hasResultType = false; break; + case OpTaskSequenceGetINTEL: *hasResult = true; *hasResultType = true; break; + case OpTaskSequenceReleaseINTEL: *hasResult = false; *hasResultType = false; break; + case OpTypeTaskSequenceINTEL: *hasResult = true; *hasResultType = false; break; + case OpSubgroupBlockPrefetchINTEL: *hasResult = false; *hasResultType = false; break; + case OpSubgroup2DBlockLoadINTEL: *hasResult = false; *hasResultType = false; break; + case OpSubgroup2DBlockLoadTransformINTEL: *hasResult = false; *hasResultType = false; break; + case OpSubgroup2DBlockLoadTransposeINTEL: *hasResult = false; *hasResultType = false; break; + case OpSubgroup2DBlockPrefetchINTEL: *hasResult = false; *hasResultType = false; 
break; + case OpSubgroup2DBlockStoreINTEL: *hasResult = false; *hasResultType = false; break; + case OpSubgroupMatrixMultiplyAccumulateINTEL: *hasResult = true; *hasResultType = true; break; + case OpBitwiseFunctionINTEL: *hasResult = true; *hasResultType = true; break; case OpGroupIMulKHR: *hasResult = true; *hasResultType = true; break; case OpGroupFMulKHR: *hasResult = true; *hasResultType = true; break; case OpGroupBitwiseAndKHR: *hasResult = true; *hasResultType = true; break; @@ -2579,22 +3229,2057 @@ inline void HasResultAndType(Op opcode, bool *hasResult, bool *hasResultType) { case OpGroupLogicalAndKHR: *hasResult = true; *hasResultType = true; break; case OpGroupLogicalOrKHR: *hasResult = true; *hasResultType = true; break; case OpGroupLogicalXorKHR: *hasResult = true; *hasResultType = true; break; + case OpRoundFToTF32INTEL: *hasResult = true; *hasResultType = true; break; + case OpMaskedGatherINTEL: *hasResult = true; *hasResultType = true; break; + case OpMaskedScatterINTEL: *hasResult = false; *hasResultType = false; break; + case OpConvertHandleToImageINTEL: *hasResult = true; *hasResultType = true; break; + case OpConvertHandleToSamplerINTEL: *hasResult = true; *hasResultType = true; break; + case OpConvertHandleToSampledImageINTEL: *hasResult = true; *hasResultType = true; break; } } +inline const char* SourceLanguageToString(SourceLanguage value) { + switch (value) { + case SourceLanguageUnknown: return "Unknown"; + case SourceLanguageESSL: return "ESSL"; + case SourceLanguageGLSL: return "GLSL"; + case SourceLanguageOpenCL_C: return "OpenCL_C"; + case SourceLanguageOpenCL_CPP: return "OpenCL_CPP"; + case SourceLanguageHLSL: return "HLSL"; + case SourceLanguageCPP_for_OpenCL: return "CPP_for_OpenCL"; + case SourceLanguageSYCL: return "SYCL"; + case SourceLanguageHERO_C: return "HERO_C"; + case SourceLanguageNZSL: return "NZSL"; + case SourceLanguageWGSL: return "WGSL"; + case SourceLanguageSlang: return "Slang"; + case SourceLanguageZig: return "Zig"; + case SourceLanguageRust: return "Rust"; + default: return "Unknown"; + } +} + +inline const char* ExecutionModelToString(ExecutionModel value) { + switch (value) { + case ExecutionModelVertex: return "Vertex"; + case ExecutionModelTessellationControl: return "TessellationControl"; + case ExecutionModelTessellationEvaluation: return "TessellationEvaluation"; + case ExecutionModelGeometry: return "Geometry"; + case ExecutionModelFragment: return "Fragment"; + case ExecutionModelGLCompute: return "GLCompute"; + case ExecutionModelKernel: return "Kernel"; + case ExecutionModelTaskNV: return "TaskNV"; + case ExecutionModelMeshNV: return "MeshNV"; + case ExecutionModelRayGenerationKHR: return "RayGenerationKHR"; + case ExecutionModelIntersectionKHR: return "IntersectionKHR"; + case ExecutionModelAnyHitKHR: return "AnyHitKHR"; + case ExecutionModelClosestHitKHR: return "ClosestHitKHR"; + case ExecutionModelMissKHR: return "MissKHR"; + case ExecutionModelCallableKHR: return "CallableKHR"; + case ExecutionModelTaskEXT: return "TaskEXT"; + case ExecutionModelMeshEXT: return "MeshEXT"; + default: return "Unknown"; + } +} + +inline const char* AddressingModelToString(AddressingModel value) { + switch (value) { + case AddressingModelLogical: return "Logical"; + case AddressingModelPhysical32: return "Physical32"; + case AddressingModelPhysical64: return "Physical64"; + case AddressingModelPhysicalStorageBuffer64: return "PhysicalStorageBuffer64"; + default: return "Unknown"; + } +} + +inline const char* MemoryModelToString(MemoryModel 
value) { + switch (value) { + case MemoryModelSimple: return "Simple"; + case MemoryModelGLSL450: return "GLSL450"; + case MemoryModelOpenCL: return "OpenCL"; + case MemoryModelVulkan: return "Vulkan"; + default: return "Unknown"; + } +} + +inline const char* ExecutionModeToString(ExecutionMode value) { + switch (value) { + case ExecutionModeInvocations: return "Invocations"; + case ExecutionModeSpacingEqual: return "SpacingEqual"; + case ExecutionModeSpacingFractionalEven: return "SpacingFractionalEven"; + case ExecutionModeSpacingFractionalOdd: return "SpacingFractionalOdd"; + case ExecutionModeVertexOrderCw: return "VertexOrderCw"; + case ExecutionModeVertexOrderCcw: return "VertexOrderCcw"; + case ExecutionModePixelCenterInteger: return "PixelCenterInteger"; + case ExecutionModeOriginUpperLeft: return "OriginUpperLeft"; + case ExecutionModeOriginLowerLeft: return "OriginLowerLeft"; + case ExecutionModeEarlyFragmentTests: return "EarlyFragmentTests"; + case ExecutionModePointMode: return "PointMode"; + case ExecutionModeXfb: return "Xfb"; + case ExecutionModeDepthReplacing: return "DepthReplacing"; + case ExecutionModeDepthGreater: return "DepthGreater"; + case ExecutionModeDepthLess: return "DepthLess"; + case ExecutionModeDepthUnchanged: return "DepthUnchanged"; + case ExecutionModeLocalSize: return "LocalSize"; + case ExecutionModeLocalSizeHint: return "LocalSizeHint"; + case ExecutionModeInputPoints: return "InputPoints"; + case ExecutionModeInputLines: return "InputLines"; + case ExecutionModeInputLinesAdjacency: return "InputLinesAdjacency"; + case ExecutionModeTriangles: return "Triangles"; + case ExecutionModeInputTrianglesAdjacency: return "InputTrianglesAdjacency"; + case ExecutionModeQuads: return "Quads"; + case ExecutionModeIsolines: return "Isolines"; + case ExecutionModeOutputVertices: return "OutputVertices"; + case ExecutionModeOutputPoints: return "OutputPoints"; + case ExecutionModeOutputLineStrip: return "OutputLineStrip"; + case ExecutionModeOutputTriangleStrip: return "OutputTriangleStrip"; + case ExecutionModeVecTypeHint: return "VecTypeHint"; + case ExecutionModeContractionOff: return "ContractionOff"; + case ExecutionModeInitializer: return "Initializer"; + case ExecutionModeFinalizer: return "Finalizer"; + case ExecutionModeSubgroupSize: return "SubgroupSize"; + case ExecutionModeSubgroupsPerWorkgroup: return "SubgroupsPerWorkgroup"; + case ExecutionModeSubgroupsPerWorkgroupId: return "SubgroupsPerWorkgroupId"; + case ExecutionModeLocalSizeId: return "LocalSizeId"; + case ExecutionModeLocalSizeHintId: return "LocalSizeHintId"; + case ExecutionModeNonCoherentColorAttachmentReadEXT: return "NonCoherentColorAttachmentReadEXT"; + case ExecutionModeNonCoherentDepthAttachmentReadEXT: return "NonCoherentDepthAttachmentReadEXT"; + case ExecutionModeNonCoherentStencilAttachmentReadEXT: return "NonCoherentStencilAttachmentReadEXT"; + case ExecutionModeSubgroupUniformControlFlowKHR: return "SubgroupUniformControlFlowKHR"; + case ExecutionModePostDepthCoverage: return "PostDepthCoverage"; + case ExecutionModeDenormPreserve: return "DenormPreserve"; + case ExecutionModeDenormFlushToZero: return "DenormFlushToZero"; + case ExecutionModeSignedZeroInfNanPreserve: return "SignedZeroInfNanPreserve"; + case ExecutionModeRoundingModeRTE: return "RoundingModeRTE"; + case ExecutionModeRoundingModeRTZ: return "RoundingModeRTZ"; + case ExecutionModeNonCoherentTileAttachmentReadQCOM: return "NonCoherentTileAttachmentReadQCOM"; + case ExecutionModeTileShadingRateQCOM: return 
"TileShadingRateQCOM"; + case ExecutionModeEarlyAndLateFragmentTestsAMD: return "EarlyAndLateFragmentTestsAMD"; + case ExecutionModeStencilRefReplacingEXT: return "StencilRefReplacingEXT"; + case ExecutionModeCoalescingAMDX: return "CoalescingAMDX"; + case ExecutionModeIsApiEntryAMDX: return "IsApiEntryAMDX"; + case ExecutionModeMaxNodeRecursionAMDX: return "MaxNodeRecursionAMDX"; + case ExecutionModeStaticNumWorkgroupsAMDX: return "StaticNumWorkgroupsAMDX"; + case ExecutionModeShaderIndexAMDX: return "ShaderIndexAMDX"; + case ExecutionModeMaxNumWorkgroupsAMDX: return "MaxNumWorkgroupsAMDX"; + case ExecutionModeStencilRefUnchangedFrontAMD: return "StencilRefUnchangedFrontAMD"; + case ExecutionModeStencilRefGreaterFrontAMD: return "StencilRefGreaterFrontAMD"; + case ExecutionModeStencilRefLessFrontAMD: return "StencilRefLessFrontAMD"; + case ExecutionModeStencilRefUnchangedBackAMD: return "StencilRefUnchangedBackAMD"; + case ExecutionModeStencilRefGreaterBackAMD: return "StencilRefGreaterBackAMD"; + case ExecutionModeStencilRefLessBackAMD: return "StencilRefLessBackAMD"; + case ExecutionModeQuadDerivativesKHR: return "QuadDerivativesKHR"; + case ExecutionModeRequireFullQuadsKHR: return "RequireFullQuadsKHR"; + case ExecutionModeSharesInputWithAMDX: return "SharesInputWithAMDX"; + case ExecutionModeOutputLinesEXT: return "OutputLinesEXT"; + case ExecutionModeOutputPrimitivesEXT: return "OutputPrimitivesEXT"; + case ExecutionModeDerivativeGroupQuadsKHR: return "DerivativeGroupQuadsKHR"; + case ExecutionModeDerivativeGroupLinearKHR: return "DerivativeGroupLinearKHR"; + case ExecutionModeOutputTrianglesEXT: return "OutputTrianglesEXT"; + case ExecutionModePixelInterlockOrderedEXT: return "PixelInterlockOrderedEXT"; + case ExecutionModePixelInterlockUnorderedEXT: return "PixelInterlockUnorderedEXT"; + case ExecutionModeSampleInterlockOrderedEXT: return "SampleInterlockOrderedEXT"; + case ExecutionModeSampleInterlockUnorderedEXT: return "SampleInterlockUnorderedEXT"; + case ExecutionModeShadingRateInterlockOrderedEXT: return "ShadingRateInterlockOrderedEXT"; + case ExecutionModeShadingRateInterlockUnorderedEXT: return "ShadingRateInterlockUnorderedEXT"; + case ExecutionModeSharedLocalMemorySizeINTEL: return "SharedLocalMemorySizeINTEL"; + case ExecutionModeRoundingModeRTPINTEL: return "RoundingModeRTPINTEL"; + case ExecutionModeRoundingModeRTNINTEL: return "RoundingModeRTNINTEL"; + case ExecutionModeFloatingPointModeALTINTEL: return "FloatingPointModeALTINTEL"; + case ExecutionModeFloatingPointModeIEEEINTEL: return "FloatingPointModeIEEEINTEL"; + case ExecutionModeMaxWorkgroupSizeINTEL: return "MaxWorkgroupSizeINTEL"; + case ExecutionModeMaxWorkDimINTEL: return "MaxWorkDimINTEL"; + case ExecutionModeNoGlobalOffsetINTEL: return "NoGlobalOffsetINTEL"; + case ExecutionModeNumSIMDWorkitemsINTEL: return "NumSIMDWorkitemsINTEL"; + case ExecutionModeSchedulerTargetFmaxMhzINTEL: return "SchedulerTargetFmaxMhzINTEL"; + case ExecutionModeMaximallyReconvergesKHR: return "MaximallyReconvergesKHR"; + case ExecutionModeFPFastMathDefault: return "FPFastMathDefault"; + case ExecutionModeStreamingInterfaceINTEL: return "StreamingInterfaceINTEL"; + case ExecutionModeRegisterMapInterfaceINTEL: return "RegisterMapInterfaceINTEL"; + case ExecutionModeNamedBarrierCountINTEL: return "NamedBarrierCountINTEL"; + case ExecutionModeMaximumRegistersINTEL: return "MaximumRegistersINTEL"; + case ExecutionModeMaximumRegistersIdINTEL: return "MaximumRegistersIdINTEL"; + case ExecutionModeNamedMaximumRegistersINTEL: return 
"NamedMaximumRegistersINTEL"; + default: return "Unknown"; + } +} + +inline const char* StorageClassToString(StorageClass value) { + switch (value) { + case StorageClassUniformConstant: return "UniformConstant"; + case StorageClassInput: return "Input"; + case StorageClassUniform: return "Uniform"; + case StorageClassOutput: return "Output"; + case StorageClassWorkgroup: return "Workgroup"; + case StorageClassCrossWorkgroup: return "CrossWorkgroup"; + case StorageClassPrivate: return "Private"; + case StorageClassFunction: return "Function"; + case StorageClassGeneric: return "Generic"; + case StorageClassPushConstant: return "PushConstant"; + case StorageClassAtomicCounter: return "AtomicCounter"; + case StorageClassImage: return "Image"; + case StorageClassStorageBuffer: return "StorageBuffer"; + case StorageClassTileImageEXT: return "TileImageEXT"; + case StorageClassTileAttachmentQCOM: return "TileAttachmentQCOM"; + case StorageClassNodePayloadAMDX: return "NodePayloadAMDX"; + case StorageClassCallableDataKHR: return "CallableDataKHR"; + case StorageClassIncomingCallableDataKHR: return "IncomingCallableDataKHR"; + case StorageClassRayPayloadKHR: return "RayPayloadKHR"; + case StorageClassHitAttributeKHR: return "HitAttributeKHR"; + case StorageClassIncomingRayPayloadKHR: return "IncomingRayPayloadKHR"; + case StorageClassShaderRecordBufferKHR: return "ShaderRecordBufferKHR"; + case StorageClassPhysicalStorageBuffer: return "PhysicalStorageBuffer"; + case StorageClassHitObjectAttributeNV: return "HitObjectAttributeNV"; + case StorageClassTaskPayloadWorkgroupEXT: return "TaskPayloadWorkgroupEXT"; + case StorageClassCodeSectionINTEL: return "CodeSectionINTEL"; + case StorageClassDeviceOnlyINTEL: return "DeviceOnlyINTEL"; + case StorageClassHostOnlyINTEL: return "HostOnlyINTEL"; + default: return "Unknown"; + } +} + +inline const char* DimToString(Dim value) { + switch (value) { + case Dim1D: return "1D"; + case Dim2D: return "2D"; + case Dim3D: return "3D"; + case DimCube: return "Cube"; + case DimRect: return "Rect"; + case DimBuffer: return "Buffer"; + case DimSubpassData: return "SubpassData"; + case DimTileImageDataEXT: return "TileImageDataEXT"; + default: return "Unknown"; + } +} + +inline const char* SamplerAddressingModeToString(SamplerAddressingMode value) { + switch (value) { + case SamplerAddressingModeNone: return "None"; + case SamplerAddressingModeClampToEdge: return "ClampToEdge"; + case SamplerAddressingModeClamp: return "Clamp"; + case SamplerAddressingModeRepeat: return "Repeat"; + case SamplerAddressingModeRepeatMirrored: return "RepeatMirrored"; + default: return "Unknown"; + } +} + +inline const char* SamplerFilterModeToString(SamplerFilterMode value) { + switch (value) { + case SamplerFilterModeNearest: return "Nearest"; + case SamplerFilterModeLinear: return "Linear"; + default: return "Unknown"; + } +} + +inline const char* ImageFormatToString(ImageFormat value) { + switch (value) { + case ImageFormatUnknown: return "Unknown"; + case ImageFormatRgba32f: return "Rgba32f"; + case ImageFormatRgba16f: return "Rgba16f"; + case ImageFormatR32f: return "R32f"; + case ImageFormatRgba8: return "Rgba8"; + case ImageFormatRgba8Snorm: return "Rgba8Snorm"; + case ImageFormatRg32f: return "Rg32f"; + case ImageFormatRg16f: return "Rg16f"; + case ImageFormatR11fG11fB10f: return "R11fG11fB10f"; + case ImageFormatR16f: return "R16f"; + case ImageFormatRgba16: return "Rgba16"; + case ImageFormatRgb10A2: return "Rgb10A2"; + case ImageFormatRg16: return "Rg16"; + case ImageFormatRg8: 
return "Rg8"; + case ImageFormatR16: return "R16"; + case ImageFormatR8: return "R8"; + case ImageFormatRgba16Snorm: return "Rgba16Snorm"; + case ImageFormatRg16Snorm: return "Rg16Snorm"; + case ImageFormatRg8Snorm: return "Rg8Snorm"; + case ImageFormatR16Snorm: return "R16Snorm"; + case ImageFormatR8Snorm: return "R8Snorm"; + case ImageFormatRgba32i: return "Rgba32i"; + case ImageFormatRgba16i: return "Rgba16i"; + case ImageFormatRgba8i: return "Rgba8i"; + case ImageFormatR32i: return "R32i"; + case ImageFormatRg32i: return "Rg32i"; + case ImageFormatRg16i: return "Rg16i"; + case ImageFormatRg8i: return "Rg8i"; + case ImageFormatR16i: return "R16i"; + case ImageFormatR8i: return "R8i"; + case ImageFormatRgba32ui: return "Rgba32ui"; + case ImageFormatRgba16ui: return "Rgba16ui"; + case ImageFormatRgba8ui: return "Rgba8ui"; + case ImageFormatR32ui: return "R32ui"; + case ImageFormatRgb10a2ui: return "Rgb10a2ui"; + case ImageFormatRg32ui: return "Rg32ui"; + case ImageFormatRg16ui: return "Rg16ui"; + case ImageFormatRg8ui: return "Rg8ui"; + case ImageFormatR16ui: return "R16ui"; + case ImageFormatR8ui: return "R8ui"; + case ImageFormatR64ui: return "R64ui"; + case ImageFormatR64i: return "R64i"; + default: return "Unknown"; + } +} + +inline const char* ImageChannelOrderToString(ImageChannelOrder value) { + switch (value) { + case ImageChannelOrderR: return "R"; + case ImageChannelOrderA: return "A"; + case ImageChannelOrderRG: return "RG"; + case ImageChannelOrderRA: return "RA"; + case ImageChannelOrderRGB: return "RGB"; + case ImageChannelOrderRGBA: return "RGBA"; + case ImageChannelOrderBGRA: return "BGRA"; + case ImageChannelOrderARGB: return "ARGB"; + case ImageChannelOrderIntensity: return "Intensity"; + case ImageChannelOrderLuminance: return "Luminance"; + case ImageChannelOrderRx: return "Rx"; + case ImageChannelOrderRGx: return "RGx"; + case ImageChannelOrderRGBx: return "RGBx"; + case ImageChannelOrderDepth: return "Depth"; + case ImageChannelOrderDepthStencil: return "DepthStencil"; + case ImageChannelOrdersRGB: return "sRGB"; + case ImageChannelOrdersRGBx: return "sRGBx"; + case ImageChannelOrdersRGBA: return "sRGBA"; + case ImageChannelOrdersBGRA: return "sBGRA"; + case ImageChannelOrderABGR: return "ABGR"; + default: return "Unknown"; + } +} + +inline const char* ImageChannelDataTypeToString(ImageChannelDataType value) { + switch (value) { + case ImageChannelDataTypeSnormInt8: return "SnormInt8"; + case ImageChannelDataTypeSnormInt16: return "SnormInt16"; + case ImageChannelDataTypeUnormInt8: return "UnormInt8"; + case ImageChannelDataTypeUnormInt16: return "UnormInt16"; + case ImageChannelDataTypeUnormShort565: return "UnormShort565"; + case ImageChannelDataTypeUnormShort555: return "UnormShort555"; + case ImageChannelDataTypeUnormInt101010: return "UnormInt101010"; + case ImageChannelDataTypeSignedInt8: return "SignedInt8"; + case ImageChannelDataTypeSignedInt16: return "SignedInt16"; + case ImageChannelDataTypeSignedInt32: return "SignedInt32"; + case ImageChannelDataTypeUnsignedInt8: return "UnsignedInt8"; + case ImageChannelDataTypeUnsignedInt16: return "UnsignedInt16"; + case ImageChannelDataTypeUnsignedInt32: return "UnsignedInt32"; + case ImageChannelDataTypeHalfFloat: return "HalfFloat"; + case ImageChannelDataTypeFloat: return "Float"; + case ImageChannelDataTypeUnormInt24: return "UnormInt24"; + case ImageChannelDataTypeUnormInt101010_2: return "UnormInt101010_2"; + case ImageChannelDataTypeUnormInt10X6EXT: return "UnormInt10X6EXT"; + case 
ImageChannelDataTypeUnsignedIntRaw10EXT: return "UnsignedIntRaw10EXT"; + case ImageChannelDataTypeUnsignedIntRaw12EXT: return "UnsignedIntRaw12EXT"; + case ImageChannelDataTypeUnormInt2_101010EXT: return "UnormInt2_101010EXT"; + case ImageChannelDataTypeUnsignedInt10X6EXT: return "UnsignedInt10X6EXT"; + case ImageChannelDataTypeUnsignedInt12X4EXT: return "UnsignedInt12X4EXT"; + case ImageChannelDataTypeUnsignedInt14X2EXT: return "UnsignedInt14X2EXT"; + case ImageChannelDataTypeUnormInt12X4EXT: return "UnormInt12X4EXT"; + case ImageChannelDataTypeUnormInt14X2EXT: return "UnormInt14X2EXT"; + default: return "Unknown"; + } +} + +inline const char* FPRoundingModeToString(FPRoundingMode value) { + switch (value) { + case FPRoundingModeRTE: return "RTE"; + case FPRoundingModeRTZ: return "RTZ"; + case FPRoundingModeRTP: return "RTP"; + case FPRoundingModeRTN: return "RTN"; + default: return "Unknown"; + } +} + +inline const char* LinkageTypeToString(LinkageType value) { + switch (value) { + case LinkageTypeExport: return "Export"; + case LinkageTypeImport: return "Import"; + case LinkageTypeLinkOnceODR: return "LinkOnceODR"; + default: return "Unknown"; + } +} + +inline const char* AccessQualifierToString(AccessQualifier value) { + switch (value) { + case AccessQualifierReadOnly: return "ReadOnly"; + case AccessQualifierWriteOnly: return "WriteOnly"; + case AccessQualifierReadWrite: return "ReadWrite"; + default: return "Unknown"; + } +} + +inline const char* FunctionParameterAttributeToString(FunctionParameterAttribute value) { + switch (value) { + case FunctionParameterAttributeZext: return "Zext"; + case FunctionParameterAttributeSext: return "Sext"; + case FunctionParameterAttributeByVal: return "ByVal"; + case FunctionParameterAttributeSret: return "Sret"; + case FunctionParameterAttributeNoAlias: return "NoAlias"; + case FunctionParameterAttributeNoCapture: return "NoCapture"; + case FunctionParameterAttributeNoWrite: return "NoWrite"; + case FunctionParameterAttributeNoReadWrite: return "NoReadWrite"; + case FunctionParameterAttributeRuntimeAlignedINTEL: return "RuntimeAlignedINTEL"; + default: return "Unknown"; + } +} + +inline const char* DecorationToString(Decoration value) { + switch (value) { + case DecorationRelaxedPrecision: return "RelaxedPrecision"; + case DecorationSpecId: return "SpecId"; + case DecorationBlock: return "Block"; + case DecorationBufferBlock: return "BufferBlock"; + case DecorationRowMajor: return "RowMajor"; + case DecorationColMajor: return "ColMajor"; + case DecorationArrayStride: return "ArrayStride"; + case DecorationMatrixStride: return "MatrixStride"; + case DecorationGLSLShared: return "GLSLShared"; + case DecorationGLSLPacked: return "GLSLPacked"; + case DecorationCPacked: return "CPacked"; + case DecorationBuiltIn: return "BuiltIn"; + case DecorationNoPerspective: return "NoPerspective"; + case DecorationFlat: return "Flat"; + case DecorationPatch: return "Patch"; + case DecorationCentroid: return "Centroid"; + case DecorationSample: return "Sample"; + case DecorationInvariant: return "Invariant"; + case DecorationRestrict: return "Restrict"; + case DecorationAliased: return "Aliased"; + case DecorationVolatile: return "Volatile"; + case DecorationConstant: return "Constant"; + case DecorationCoherent: return "Coherent"; + case DecorationNonWritable: return "NonWritable"; + case DecorationNonReadable: return "NonReadable"; + case DecorationUniform: return "Uniform"; + case DecorationUniformId: return "UniformId"; + case DecorationSaturatedConversion: 
return "SaturatedConversion"; + case DecorationStream: return "Stream"; + case DecorationLocation: return "Location"; + case DecorationComponent: return "Component"; + case DecorationIndex: return "Index"; + case DecorationBinding: return "Binding"; + case DecorationDescriptorSet: return "DescriptorSet"; + case DecorationOffset: return "Offset"; + case DecorationXfbBuffer: return "XfbBuffer"; + case DecorationXfbStride: return "XfbStride"; + case DecorationFuncParamAttr: return "FuncParamAttr"; + case DecorationFPRoundingMode: return "FPRoundingMode"; + case DecorationFPFastMathMode: return "FPFastMathMode"; + case DecorationLinkageAttributes: return "LinkageAttributes"; + case DecorationNoContraction: return "NoContraction"; + case DecorationInputAttachmentIndex: return "InputAttachmentIndex"; + case DecorationAlignment: return "Alignment"; + case DecorationMaxByteOffset: return "MaxByteOffset"; + case DecorationAlignmentId: return "AlignmentId"; + case DecorationMaxByteOffsetId: return "MaxByteOffsetId"; + case DecorationSaturatedToLargestFloat8NormalConversionEXT: return "SaturatedToLargestFloat8NormalConversionEXT"; + case DecorationNoSignedWrap: return "NoSignedWrap"; + case DecorationNoUnsignedWrap: return "NoUnsignedWrap"; + case DecorationWeightTextureQCOM: return "WeightTextureQCOM"; + case DecorationBlockMatchTextureQCOM: return "BlockMatchTextureQCOM"; + case DecorationBlockMatchSamplerQCOM: return "BlockMatchSamplerQCOM"; + case DecorationExplicitInterpAMD: return "ExplicitInterpAMD"; + case DecorationNodeSharesPayloadLimitsWithAMDX: return "NodeSharesPayloadLimitsWithAMDX"; + case DecorationNodeMaxPayloadsAMDX: return "NodeMaxPayloadsAMDX"; + case DecorationTrackFinishWritingAMDX: return "TrackFinishWritingAMDX"; + case DecorationPayloadNodeNameAMDX: return "PayloadNodeNameAMDX"; + case DecorationPayloadNodeBaseIndexAMDX: return "PayloadNodeBaseIndexAMDX"; + case DecorationPayloadNodeSparseArrayAMDX: return "PayloadNodeSparseArrayAMDX"; + case DecorationPayloadNodeArraySizeAMDX: return "PayloadNodeArraySizeAMDX"; + case DecorationPayloadDispatchIndirectAMDX: return "PayloadDispatchIndirectAMDX"; + case DecorationOverrideCoverageNV: return "OverrideCoverageNV"; + case DecorationPassthroughNV: return "PassthroughNV"; + case DecorationViewportRelativeNV: return "ViewportRelativeNV"; + case DecorationSecondaryViewportRelativeNV: return "SecondaryViewportRelativeNV"; + case DecorationPerPrimitiveEXT: return "PerPrimitiveEXT"; + case DecorationPerViewNV: return "PerViewNV"; + case DecorationPerTaskNV: return "PerTaskNV"; + case DecorationPerVertexKHR: return "PerVertexKHR"; + case DecorationNonUniform: return "NonUniform"; + case DecorationRestrictPointer: return "RestrictPointer"; + case DecorationAliasedPointer: return "AliasedPointer"; + case DecorationHitObjectShaderRecordBufferNV: return "HitObjectShaderRecordBufferNV"; + case DecorationBindlessSamplerNV: return "BindlessSamplerNV"; + case DecorationBindlessImageNV: return "BindlessImageNV"; + case DecorationBoundSamplerNV: return "BoundSamplerNV"; + case DecorationBoundImageNV: return "BoundImageNV"; + case DecorationSIMTCallINTEL: return "SIMTCallINTEL"; + case DecorationReferencedIndirectlyINTEL: return "ReferencedIndirectlyINTEL"; + case DecorationClobberINTEL: return "ClobberINTEL"; + case DecorationSideEffectsINTEL: return "SideEffectsINTEL"; + case DecorationVectorComputeVariableINTEL: return "VectorComputeVariableINTEL"; + case DecorationFuncParamIOKindINTEL: return "FuncParamIOKindINTEL"; + case 
DecorationVectorComputeFunctionINTEL: return "VectorComputeFunctionINTEL"; + case DecorationStackCallINTEL: return "StackCallINTEL"; + case DecorationGlobalVariableOffsetINTEL: return "GlobalVariableOffsetINTEL"; + case DecorationCounterBuffer: return "CounterBuffer"; + case DecorationHlslSemanticGOOGLE: return "HlslSemanticGOOGLE"; + case DecorationUserTypeGOOGLE: return "UserTypeGOOGLE"; + case DecorationFunctionRoundingModeINTEL: return "FunctionRoundingModeINTEL"; + case DecorationFunctionDenormModeINTEL: return "FunctionDenormModeINTEL"; + case DecorationRegisterINTEL: return "RegisterINTEL"; + case DecorationMemoryINTEL: return "MemoryINTEL"; + case DecorationNumbanksINTEL: return "NumbanksINTEL"; + case DecorationBankwidthINTEL: return "BankwidthINTEL"; + case DecorationMaxPrivateCopiesINTEL: return "MaxPrivateCopiesINTEL"; + case DecorationSinglepumpINTEL: return "SinglepumpINTEL"; + case DecorationDoublepumpINTEL: return "DoublepumpINTEL"; + case DecorationMaxReplicatesINTEL: return "MaxReplicatesINTEL"; + case DecorationSimpleDualPortINTEL: return "SimpleDualPortINTEL"; + case DecorationMergeINTEL: return "MergeINTEL"; + case DecorationBankBitsINTEL: return "BankBitsINTEL"; + case DecorationForcePow2DepthINTEL: return "ForcePow2DepthINTEL"; + case DecorationStridesizeINTEL: return "StridesizeINTEL"; + case DecorationWordsizeINTEL: return "WordsizeINTEL"; + case DecorationTrueDualPortINTEL: return "TrueDualPortINTEL"; + case DecorationBurstCoalesceINTEL: return "BurstCoalesceINTEL"; + case DecorationCacheSizeINTEL: return "CacheSizeINTEL"; + case DecorationDontStaticallyCoalesceINTEL: return "DontStaticallyCoalesceINTEL"; + case DecorationPrefetchINTEL: return "PrefetchINTEL"; + case DecorationStallEnableINTEL: return "StallEnableINTEL"; + case DecorationFuseLoopsInFunctionINTEL: return "FuseLoopsInFunctionINTEL"; + case DecorationMathOpDSPModeINTEL: return "MathOpDSPModeINTEL"; + case DecorationAliasScopeINTEL: return "AliasScopeINTEL"; + case DecorationNoAliasINTEL: return "NoAliasINTEL"; + case DecorationInitiationIntervalINTEL: return "InitiationIntervalINTEL"; + case DecorationMaxConcurrencyINTEL: return "MaxConcurrencyINTEL"; + case DecorationPipelineEnableINTEL: return "PipelineEnableINTEL"; + case DecorationBufferLocationINTEL: return "BufferLocationINTEL"; + case DecorationIOPipeStorageINTEL: return "IOPipeStorageINTEL"; + case DecorationFunctionFloatingPointModeINTEL: return "FunctionFloatingPointModeINTEL"; + case DecorationSingleElementVectorINTEL: return "SingleElementVectorINTEL"; + case DecorationVectorComputeCallableFunctionINTEL: return "VectorComputeCallableFunctionINTEL"; + case DecorationMediaBlockIOINTEL: return "MediaBlockIOINTEL"; + case DecorationStallFreeINTEL: return "StallFreeINTEL"; + case DecorationFPMaxErrorDecorationINTEL: return "FPMaxErrorDecorationINTEL"; + case DecorationLatencyControlLabelINTEL: return "LatencyControlLabelINTEL"; + case DecorationLatencyControlConstraintINTEL: return "LatencyControlConstraintINTEL"; + case DecorationConduitKernelArgumentINTEL: return "ConduitKernelArgumentINTEL"; + case DecorationRegisterMapKernelArgumentINTEL: return "RegisterMapKernelArgumentINTEL"; + case DecorationMMHostInterfaceAddressWidthINTEL: return "MMHostInterfaceAddressWidthINTEL"; + case DecorationMMHostInterfaceDataWidthINTEL: return "MMHostInterfaceDataWidthINTEL"; + case DecorationMMHostInterfaceLatencyINTEL: return "MMHostInterfaceLatencyINTEL"; + case DecorationMMHostInterfaceReadWriteModeINTEL: return "MMHostInterfaceReadWriteModeINTEL"; + 
case DecorationMMHostInterfaceMaxBurstINTEL: return "MMHostInterfaceMaxBurstINTEL"; + case DecorationMMHostInterfaceWaitRequestINTEL: return "MMHostInterfaceWaitRequestINTEL"; + case DecorationStableKernelArgumentINTEL: return "StableKernelArgumentINTEL"; + case DecorationHostAccessINTEL: return "HostAccessINTEL"; + case DecorationInitModeINTEL: return "InitModeINTEL"; + case DecorationImplementInRegisterMapINTEL: return "ImplementInRegisterMapINTEL"; + case DecorationCacheControlLoadINTEL: return "CacheControlLoadINTEL"; + case DecorationCacheControlStoreINTEL: return "CacheControlStoreINTEL"; + default: return "Unknown"; + } +} + +inline const char* BuiltInToString(BuiltIn value) { + switch (value) { + case BuiltInPosition: return "Position"; + case BuiltInPointSize: return "PointSize"; + case BuiltInClipDistance: return "ClipDistance"; + case BuiltInCullDistance: return "CullDistance"; + case BuiltInVertexId: return "VertexId"; + case BuiltInInstanceId: return "InstanceId"; + case BuiltInPrimitiveId: return "PrimitiveId"; + case BuiltInInvocationId: return "InvocationId"; + case BuiltInLayer: return "Layer"; + case BuiltInViewportIndex: return "ViewportIndex"; + case BuiltInTessLevelOuter: return "TessLevelOuter"; + case BuiltInTessLevelInner: return "TessLevelInner"; + case BuiltInTessCoord: return "TessCoord"; + case BuiltInPatchVertices: return "PatchVertices"; + case BuiltInFragCoord: return "FragCoord"; + case BuiltInPointCoord: return "PointCoord"; + case BuiltInFrontFacing: return "FrontFacing"; + case BuiltInSampleId: return "SampleId"; + case BuiltInSamplePosition: return "SamplePosition"; + case BuiltInSampleMask: return "SampleMask"; + case BuiltInFragDepth: return "FragDepth"; + case BuiltInHelperInvocation: return "HelperInvocation"; + case BuiltInNumWorkgroups: return "NumWorkgroups"; + case BuiltInWorkgroupSize: return "WorkgroupSize"; + case BuiltInWorkgroupId: return "WorkgroupId"; + case BuiltInLocalInvocationId: return "LocalInvocationId"; + case BuiltInGlobalInvocationId: return "GlobalInvocationId"; + case BuiltInLocalInvocationIndex: return "LocalInvocationIndex"; + case BuiltInWorkDim: return "WorkDim"; + case BuiltInGlobalSize: return "GlobalSize"; + case BuiltInEnqueuedWorkgroupSize: return "EnqueuedWorkgroupSize"; + case BuiltInGlobalOffset: return "GlobalOffset"; + case BuiltInGlobalLinearId: return "GlobalLinearId"; + case BuiltInSubgroupSize: return "SubgroupSize"; + case BuiltInSubgroupMaxSize: return "SubgroupMaxSize"; + case BuiltInNumSubgroups: return "NumSubgroups"; + case BuiltInNumEnqueuedSubgroups: return "NumEnqueuedSubgroups"; + case BuiltInSubgroupId: return "SubgroupId"; + case BuiltInSubgroupLocalInvocationId: return "SubgroupLocalInvocationId"; + case BuiltInVertexIndex: return "VertexIndex"; + case BuiltInInstanceIndex: return "InstanceIndex"; + case BuiltInCoreIDARM: return "CoreIDARM"; + case BuiltInCoreCountARM: return "CoreCountARM"; + case BuiltInCoreMaxIDARM: return "CoreMaxIDARM"; + case BuiltInWarpIDARM: return "WarpIDARM"; + case BuiltInWarpMaxIDARM: return "WarpMaxIDARM"; + case BuiltInSubgroupEqMask: return "SubgroupEqMask"; + case BuiltInSubgroupGeMask: return "SubgroupGeMask"; + case BuiltInSubgroupGtMask: return "SubgroupGtMask"; + case BuiltInSubgroupLeMask: return "SubgroupLeMask"; + case BuiltInSubgroupLtMask: return "SubgroupLtMask"; + case BuiltInBaseVertex: return "BaseVertex"; + case BuiltInBaseInstance: return "BaseInstance"; + case BuiltInDrawIndex: return "DrawIndex"; + case BuiltInPrimitiveShadingRateKHR: return 
"PrimitiveShadingRateKHR"; + case BuiltInDeviceIndex: return "DeviceIndex"; + case BuiltInViewIndex: return "ViewIndex"; + case BuiltInShadingRateKHR: return "ShadingRateKHR"; + case BuiltInTileOffsetQCOM: return "TileOffsetQCOM"; + case BuiltInTileDimensionQCOM: return "TileDimensionQCOM"; + case BuiltInTileApronSizeQCOM: return "TileApronSizeQCOM"; + case BuiltInBaryCoordNoPerspAMD: return "BaryCoordNoPerspAMD"; + case BuiltInBaryCoordNoPerspCentroidAMD: return "BaryCoordNoPerspCentroidAMD"; + case BuiltInBaryCoordNoPerspSampleAMD: return "BaryCoordNoPerspSampleAMD"; + case BuiltInBaryCoordSmoothAMD: return "BaryCoordSmoothAMD"; + case BuiltInBaryCoordSmoothCentroidAMD: return "BaryCoordSmoothCentroidAMD"; + case BuiltInBaryCoordSmoothSampleAMD: return "BaryCoordSmoothSampleAMD"; + case BuiltInBaryCoordPullModelAMD: return "BaryCoordPullModelAMD"; + case BuiltInFragStencilRefEXT: return "FragStencilRefEXT"; + case BuiltInRemainingRecursionLevelsAMDX: return "RemainingRecursionLevelsAMDX"; + case BuiltInShaderIndexAMDX: return "ShaderIndexAMDX"; + case BuiltInViewportMaskNV: return "ViewportMaskNV"; + case BuiltInSecondaryPositionNV: return "SecondaryPositionNV"; + case BuiltInSecondaryViewportMaskNV: return "SecondaryViewportMaskNV"; + case BuiltInPositionPerViewNV: return "PositionPerViewNV"; + case BuiltInViewportMaskPerViewNV: return "ViewportMaskPerViewNV"; + case BuiltInFullyCoveredEXT: return "FullyCoveredEXT"; + case BuiltInTaskCountNV: return "TaskCountNV"; + case BuiltInPrimitiveCountNV: return "PrimitiveCountNV"; + case BuiltInPrimitiveIndicesNV: return "PrimitiveIndicesNV"; + case BuiltInClipDistancePerViewNV: return "ClipDistancePerViewNV"; + case BuiltInCullDistancePerViewNV: return "CullDistancePerViewNV"; + case BuiltInLayerPerViewNV: return "LayerPerViewNV"; + case BuiltInMeshViewCountNV: return "MeshViewCountNV"; + case BuiltInMeshViewIndicesNV: return "MeshViewIndicesNV"; + case BuiltInBaryCoordKHR: return "BaryCoordKHR"; + case BuiltInBaryCoordNoPerspKHR: return "BaryCoordNoPerspKHR"; + case BuiltInFragSizeEXT: return "FragSizeEXT"; + case BuiltInFragInvocationCountEXT: return "FragInvocationCountEXT"; + case BuiltInPrimitivePointIndicesEXT: return "PrimitivePointIndicesEXT"; + case BuiltInPrimitiveLineIndicesEXT: return "PrimitiveLineIndicesEXT"; + case BuiltInPrimitiveTriangleIndicesEXT: return "PrimitiveTriangleIndicesEXT"; + case BuiltInCullPrimitiveEXT: return "CullPrimitiveEXT"; + case BuiltInLaunchIdKHR: return "LaunchIdKHR"; + case BuiltInLaunchSizeKHR: return "LaunchSizeKHR"; + case BuiltInWorldRayOriginKHR: return "WorldRayOriginKHR"; + case BuiltInWorldRayDirectionKHR: return "WorldRayDirectionKHR"; + case BuiltInObjectRayOriginKHR: return "ObjectRayOriginKHR"; + case BuiltInObjectRayDirectionKHR: return "ObjectRayDirectionKHR"; + case BuiltInRayTminKHR: return "RayTminKHR"; + case BuiltInRayTmaxKHR: return "RayTmaxKHR"; + case BuiltInInstanceCustomIndexKHR: return "InstanceCustomIndexKHR"; + case BuiltInObjectToWorldKHR: return "ObjectToWorldKHR"; + case BuiltInWorldToObjectKHR: return "WorldToObjectKHR"; + case BuiltInHitTNV: return "HitTNV"; + case BuiltInHitKindKHR: return "HitKindKHR"; + case BuiltInCurrentRayTimeNV: return "CurrentRayTimeNV"; + case BuiltInHitTriangleVertexPositionsKHR: return "HitTriangleVertexPositionsKHR"; + case BuiltInHitMicroTriangleVertexPositionsNV: return "HitMicroTriangleVertexPositionsNV"; + case BuiltInHitMicroTriangleVertexBarycentricsNV: return "HitMicroTriangleVertexBarycentricsNV"; + case BuiltInIncomingRayFlagsKHR: 
return "IncomingRayFlagsKHR"; + case BuiltInRayGeometryIndexKHR: return "RayGeometryIndexKHR"; + case BuiltInHitIsSphereNV: return "HitIsSphereNV"; + case BuiltInHitIsLSSNV: return "HitIsLSSNV"; + case BuiltInHitSpherePositionNV: return "HitSpherePositionNV"; + case BuiltInWarpsPerSMNV: return "WarpsPerSMNV"; + case BuiltInSMCountNV: return "SMCountNV"; + case BuiltInWarpIDNV: return "WarpIDNV"; + case BuiltInSMIDNV: return "SMIDNV"; + case BuiltInHitLSSPositionsNV: return "HitLSSPositionsNV"; + case BuiltInHitKindFrontFacingMicroTriangleNV: return "HitKindFrontFacingMicroTriangleNV"; + case BuiltInHitKindBackFacingMicroTriangleNV: return "HitKindBackFacingMicroTriangleNV"; + case BuiltInHitSphereRadiusNV: return "HitSphereRadiusNV"; + case BuiltInHitLSSRadiiNV: return "HitLSSRadiiNV"; + case BuiltInClusterIDNV: return "ClusterIDNV"; + case BuiltInCullMaskKHR: return "CullMaskKHR"; + default: return "Unknown"; + } +} + +inline const char* ScopeToString(Scope value) { + switch (value) { + case ScopeCrossDevice: return "CrossDevice"; + case ScopeDevice: return "Device"; + case ScopeWorkgroup: return "Workgroup"; + case ScopeSubgroup: return "Subgroup"; + case ScopeInvocation: return "Invocation"; + case ScopeQueueFamily: return "QueueFamily"; + case ScopeShaderCallKHR: return "ShaderCallKHR"; + default: return "Unknown"; + } +} + +inline const char* GroupOperationToString(GroupOperation value) { + switch (value) { + case GroupOperationReduce: return "Reduce"; + case GroupOperationInclusiveScan: return "InclusiveScan"; + case GroupOperationExclusiveScan: return "ExclusiveScan"; + case GroupOperationClusteredReduce: return "ClusteredReduce"; + case GroupOperationPartitionedReduceNV: return "PartitionedReduceNV"; + case GroupOperationPartitionedInclusiveScanNV: return "PartitionedInclusiveScanNV"; + case GroupOperationPartitionedExclusiveScanNV: return "PartitionedExclusiveScanNV"; + default: return "Unknown"; + } +} + +inline const char* KernelEnqueueFlagsToString(KernelEnqueueFlags value) { + switch (value) { + case KernelEnqueueFlagsNoWait: return "NoWait"; + case KernelEnqueueFlagsWaitKernel: return "WaitKernel"; + case KernelEnqueueFlagsWaitWorkGroup: return "WaitWorkGroup"; + default: return "Unknown"; + } +} + +inline const char* CapabilityToString(Capability value) { + switch (value) { + case CapabilityMatrix: return "Matrix"; + case CapabilityShader: return "Shader"; + case CapabilityGeometry: return "Geometry"; + case CapabilityTessellation: return "Tessellation"; + case CapabilityAddresses: return "Addresses"; + case CapabilityLinkage: return "Linkage"; + case CapabilityKernel: return "Kernel"; + case CapabilityVector16: return "Vector16"; + case CapabilityFloat16Buffer: return "Float16Buffer"; + case CapabilityFloat16: return "Float16"; + case CapabilityFloat64: return "Float64"; + case CapabilityInt64: return "Int64"; + case CapabilityInt64Atomics: return "Int64Atomics"; + case CapabilityImageBasic: return "ImageBasic"; + case CapabilityImageReadWrite: return "ImageReadWrite"; + case CapabilityImageMipmap: return "ImageMipmap"; + case CapabilityPipes: return "Pipes"; + case CapabilityGroups: return "Groups"; + case CapabilityDeviceEnqueue: return "DeviceEnqueue"; + case CapabilityLiteralSampler: return "LiteralSampler"; + case CapabilityAtomicStorage: return "AtomicStorage"; + case CapabilityInt16: return "Int16"; + case CapabilityTessellationPointSize: return "TessellationPointSize"; + case CapabilityGeometryPointSize: return "GeometryPointSize"; + case 
CapabilityImageGatherExtended: return "ImageGatherExtended"; + case CapabilityStorageImageMultisample: return "StorageImageMultisample"; + case CapabilityUniformBufferArrayDynamicIndexing: return "UniformBufferArrayDynamicIndexing"; + case CapabilitySampledImageArrayDynamicIndexing: return "SampledImageArrayDynamicIndexing"; + case CapabilityStorageBufferArrayDynamicIndexing: return "StorageBufferArrayDynamicIndexing"; + case CapabilityStorageImageArrayDynamicIndexing: return "StorageImageArrayDynamicIndexing"; + case CapabilityClipDistance: return "ClipDistance"; + case CapabilityCullDistance: return "CullDistance"; + case CapabilityImageCubeArray: return "ImageCubeArray"; + case CapabilitySampleRateShading: return "SampleRateShading"; + case CapabilityImageRect: return "ImageRect"; + case CapabilitySampledRect: return "SampledRect"; + case CapabilityGenericPointer: return "GenericPointer"; + case CapabilityInt8: return "Int8"; + case CapabilityInputAttachment: return "InputAttachment"; + case CapabilitySparseResidency: return "SparseResidency"; + case CapabilityMinLod: return "MinLod"; + case CapabilitySampled1D: return "Sampled1D"; + case CapabilityImage1D: return "Image1D"; + case CapabilitySampledCubeArray: return "SampledCubeArray"; + case CapabilitySampledBuffer: return "SampledBuffer"; + case CapabilityImageBuffer: return "ImageBuffer"; + case CapabilityImageMSArray: return "ImageMSArray"; + case CapabilityStorageImageExtendedFormats: return "StorageImageExtendedFormats"; + case CapabilityImageQuery: return "ImageQuery"; + case CapabilityDerivativeControl: return "DerivativeControl"; + case CapabilityInterpolationFunction: return "InterpolationFunction"; + case CapabilityTransformFeedback: return "TransformFeedback"; + case CapabilityGeometryStreams: return "GeometryStreams"; + case CapabilityStorageImageReadWithoutFormat: return "StorageImageReadWithoutFormat"; + case CapabilityStorageImageWriteWithoutFormat: return "StorageImageWriteWithoutFormat"; + case CapabilityMultiViewport: return "MultiViewport"; + case CapabilitySubgroupDispatch: return "SubgroupDispatch"; + case CapabilityNamedBarrier: return "NamedBarrier"; + case CapabilityPipeStorage: return "PipeStorage"; + case CapabilityGroupNonUniform: return "GroupNonUniform"; + case CapabilityGroupNonUniformVote: return "GroupNonUniformVote"; + case CapabilityGroupNonUniformArithmetic: return "GroupNonUniformArithmetic"; + case CapabilityGroupNonUniformBallot: return "GroupNonUniformBallot"; + case CapabilityGroupNonUniformShuffle: return "GroupNonUniformShuffle"; + case CapabilityGroupNonUniformShuffleRelative: return "GroupNonUniformShuffleRelative"; + case CapabilityGroupNonUniformClustered: return "GroupNonUniformClustered"; + case CapabilityGroupNonUniformQuad: return "GroupNonUniformQuad"; + case CapabilityShaderLayer: return "ShaderLayer"; + case CapabilityShaderViewportIndex: return "ShaderViewportIndex"; + case CapabilityUniformDecoration: return "UniformDecoration"; + case CapabilityCoreBuiltinsARM: return "CoreBuiltinsARM"; + case CapabilityTileImageColorReadAccessEXT: return "TileImageColorReadAccessEXT"; + case CapabilityTileImageDepthReadAccessEXT: return "TileImageDepthReadAccessEXT"; + case CapabilityTileImageStencilReadAccessEXT: return "TileImageStencilReadAccessEXT"; + case CapabilityTensorsARM: return "TensorsARM"; + case CapabilityStorageTensorArrayDynamicIndexingARM: return "StorageTensorArrayDynamicIndexingARM"; + case CapabilityStorageTensorArrayNonUniformIndexingARM: return 
"StorageTensorArrayNonUniformIndexingARM"; + case CapabilityGraphARM: return "GraphARM"; + case CapabilityCooperativeMatrixLayoutsARM: return "CooperativeMatrixLayoutsARM"; + case CapabilityFloat8EXT: return "Float8EXT"; + case CapabilityFloat8CooperativeMatrixEXT: return "Float8CooperativeMatrixEXT"; + case CapabilityFragmentShadingRateKHR: return "FragmentShadingRateKHR"; + case CapabilitySubgroupBallotKHR: return "SubgroupBallotKHR"; + case CapabilityDrawParameters: return "DrawParameters"; + case CapabilityWorkgroupMemoryExplicitLayoutKHR: return "WorkgroupMemoryExplicitLayoutKHR"; + case CapabilityWorkgroupMemoryExplicitLayout8BitAccessKHR: return "WorkgroupMemoryExplicitLayout8BitAccessKHR"; + case CapabilityWorkgroupMemoryExplicitLayout16BitAccessKHR: return "WorkgroupMemoryExplicitLayout16BitAccessKHR"; + case CapabilitySubgroupVoteKHR: return "SubgroupVoteKHR"; + case CapabilityStorageBuffer16BitAccess: return "StorageBuffer16BitAccess"; + case CapabilityStorageUniform16: return "StorageUniform16"; + case CapabilityStoragePushConstant16: return "StoragePushConstant16"; + case CapabilityStorageInputOutput16: return "StorageInputOutput16"; + case CapabilityDeviceGroup: return "DeviceGroup"; + case CapabilityMultiView: return "MultiView"; + case CapabilityVariablePointersStorageBuffer: return "VariablePointersStorageBuffer"; + case CapabilityVariablePointers: return "VariablePointers"; + case CapabilityAtomicStorageOps: return "AtomicStorageOps"; + case CapabilitySampleMaskPostDepthCoverage: return "SampleMaskPostDepthCoverage"; + case CapabilityStorageBuffer8BitAccess: return "StorageBuffer8BitAccess"; + case CapabilityUniformAndStorageBuffer8BitAccess: return "UniformAndStorageBuffer8BitAccess"; + case CapabilityStoragePushConstant8: return "StoragePushConstant8"; + case CapabilityDenormPreserve: return "DenormPreserve"; + case CapabilityDenormFlushToZero: return "DenormFlushToZero"; + case CapabilitySignedZeroInfNanPreserve: return "SignedZeroInfNanPreserve"; + case CapabilityRoundingModeRTE: return "RoundingModeRTE"; + case CapabilityRoundingModeRTZ: return "RoundingModeRTZ"; + case CapabilityRayQueryProvisionalKHR: return "RayQueryProvisionalKHR"; + case CapabilityRayQueryKHR: return "RayQueryKHR"; + case CapabilityUntypedPointersKHR: return "UntypedPointersKHR"; + case CapabilityRayTraversalPrimitiveCullingKHR: return "RayTraversalPrimitiveCullingKHR"; + case CapabilityRayTracingKHR: return "RayTracingKHR"; + case CapabilityTextureSampleWeightedQCOM: return "TextureSampleWeightedQCOM"; + case CapabilityTextureBoxFilterQCOM: return "TextureBoxFilterQCOM"; + case CapabilityTextureBlockMatchQCOM: return "TextureBlockMatchQCOM"; + case CapabilityTileShadingQCOM: return "TileShadingQCOM"; + case CapabilityTextureBlockMatch2QCOM: return "TextureBlockMatch2QCOM"; + case CapabilityFloat16ImageAMD: return "Float16ImageAMD"; + case CapabilityImageGatherBiasLodAMD: return "ImageGatherBiasLodAMD"; + case CapabilityFragmentMaskAMD: return "FragmentMaskAMD"; + case CapabilityStencilExportEXT: return "StencilExportEXT"; + case CapabilityImageReadWriteLodAMD: return "ImageReadWriteLodAMD"; + case CapabilityInt64ImageEXT: return "Int64ImageEXT"; + case CapabilityShaderClockKHR: return "ShaderClockKHR"; + case CapabilityShaderEnqueueAMDX: return "ShaderEnqueueAMDX"; + case CapabilityQuadControlKHR: return "QuadControlKHR"; + case CapabilityInt4TypeINTEL: return "Int4TypeINTEL"; + case CapabilityInt4CooperativeMatrixINTEL: return "Int4CooperativeMatrixINTEL"; + case CapabilityBFloat16TypeKHR: 
return "BFloat16TypeKHR"; + case CapabilityBFloat16DotProductKHR: return "BFloat16DotProductKHR"; + case CapabilityBFloat16CooperativeMatrixKHR: return "BFloat16CooperativeMatrixKHR"; + case CapabilitySampleMaskOverrideCoverageNV: return "SampleMaskOverrideCoverageNV"; + case CapabilityGeometryShaderPassthroughNV: return "GeometryShaderPassthroughNV"; + case CapabilityShaderViewportIndexLayerEXT: return "ShaderViewportIndexLayerEXT"; + case CapabilityShaderViewportMaskNV: return "ShaderViewportMaskNV"; + case CapabilityShaderStereoViewNV: return "ShaderStereoViewNV"; + case CapabilityPerViewAttributesNV: return "PerViewAttributesNV"; + case CapabilityFragmentFullyCoveredEXT: return "FragmentFullyCoveredEXT"; + case CapabilityMeshShadingNV: return "MeshShadingNV"; + case CapabilityImageFootprintNV: return "ImageFootprintNV"; + case CapabilityMeshShadingEXT: return "MeshShadingEXT"; + case CapabilityFragmentBarycentricKHR: return "FragmentBarycentricKHR"; + case CapabilityComputeDerivativeGroupQuadsKHR: return "ComputeDerivativeGroupQuadsKHR"; + case CapabilityFragmentDensityEXT: return "FragmentDensityEXT"; + case CapabilityGroupNonUniformPartitionedNV: return "GroupNonUniformPartitionedNV"; + case CapabilityShaderNonUniform: return "ShaderNonUniform"; + case CapabilityRuntimeDescriptorArray: return "RuntimeDescriptorArray"; + case CapabilityInputAttachmentArrayDynamicIndexing: return "InputAttachmentArrayDynamicIndexing"; + case CapabilityUniformTexelBufferArrayDynamicIndexing: return "UniformTexelBufferArrayDynamicIndexing"; + case CapabilityStorageTexelBufferArrayDynamicIndexing: return "StorageTexelBufferArrayDynamicIndexing"; + case CapabilityUniformBufferArrayNonUniformIndexing: return "UniformBufferArrayNonUniformIndexing"; + case CapabilitySampledImageArrayNonUniformIndexing: return "SampledImageArrayNonUniformIndexing"; + case CapabilityStorageBufferArrayNonUniformIndexing: return "StorageBufferArrayNonUniformIndexing"; + case CapabilityStorageImageArrayNonUniformIndexing: return "StorageImageArrayNonUniformIndexing"; + case CapabilityInputAttachmentArrayNonUniformIndexing: return "InputAttachmentArrayNonUniformIndexing"; + case CapabilityUniformTexelBufferArrayNonUniformIndexing: return "UniformTexelBufferArrayNonUniformIndexing"; + case CapabilityStorageTexelBufferArrayNonUniformIndexing: return "StorageTexelBufferArrayNonUniformIndexing"; + case CapabilityRayTracingPositionFetchKHR: return "RayTracingPositionFetchKHR"; + case CapabilityRayTracingNV: return "RayTracingNV"; + case CapabilityRayTracingMotionBlurNV: return "RayTracingMotionBlurNV"; + case CapabilityVulkanMemoryModel: return "VulkanMemoryModel"; + case CapabilityVulkanMemoryModelDeviceScope: return "VulkanMemoryModelDeviceScope"; + case CapabilityPhysicalStorageBufferAddresses: return "PhysicalStorageBufferAddresses"; + case CapabilityComputeDerivativeGroupLinearKHR: return "ComputeDerivativeGroupLinearKHR"; + case CapabilityRayTracingProvisionalKHR: return "RayTracingProvisionalKHR"; + case CapabilityCooperativeMatrixNV: return "CooperativeMatrixNV"; + case CapabilityFragmentShaderSampleInterlockEXT: return "FragmentShaderSampleInterlockEXT"; + case CapabilityFragmentShaderShadingRateInterlockEXT: return "FragmentShaderShadingRateInterlockEXT"; + case CapabilityShaderSMBuiltinsNV: return "ShaderSMBuiltinsNV"; + case CapabilityFragmentShaderPixelInterlockEXT: return "FragmentShaderPixelInterlockEXT"; + case CapabilityDemoteToHelperInvocation: return "DemoteToHelperInvocation"; + case CapabilityDisplacementMicromapNV: 
return "DisplacementMicromapNV"; + case CapabilityRayTracingOpacityMicromapEXT: return "RayTracingOpacityMicromapEXT"; + case CapabilityShaderInvocationReorderNV: return "ShaderInvocationReorderNV"; + case CapabilityBindlessTextureNV: return "BindlessTextureNV"; + case CapabilityRayQueryPositionFetchKHR: return "RayQueryPositionFetchKHR"; + case CapabilityCooperativeVectorNV: return "CooperativeVectorNV"; + case CapabilityAtomicFloat16VectorNV: return "AtomicFloat16VectorNV"; + case CapabilityRayTracingDisplacementMicromapNV: return "RayTracingDisplacementMicromapNV"; + case CapabilityRawAccessChainsNV: return "RawAccessChainsNV"; + case CapabilityRayTracingSpheresGeometryNV: return "RayTracingSpheresGeometryNV"; + case CapabilityRayTracingLinearSweptSpheresGeometryNV: return "RayTracingLinearSweptSpheresGeometryNV"; + case CapabilityCooperativeMatrixReductionsNV: return "CooperativeMatrixReductionsNV"; + case CapabilityCooperativeMatrixConversionsNV: return "CooperativeMatrixConversionsNV"; + case CapabilityCooperativeMatrixPerElementOperationsNV: return "CooperativeMatrixPerElementOperationsNV"; + case CapabilityCooperativeMatrixTensorAddressingNV: return "CooperativeMatrixTensorAddressingNV"; + case CapabilityCooperativeMatrixBlockLoadsNV: return "CooperativeMatrixBlockLoadsNV"; + case CapabilityCooperativeVectorTrainingNV: return "CooperativeVectorTrainingNV"; + case CapabilityRayTracingClusterAccelerationStructureNV: return "RayTracingClusterAccelerationStructureNV"; + case CapabilityTensorAddressingNV: return "TensorAddressingNV"; + case CapabilitySubgroupShuffleINTEL: return "SubgroupShuffleINTEL"; + case CapabilitySubgroupBufferBlockIOINTEL: return "SubgroupBufferBlockIOINTEL"; + case CapabilitySubgroupImageBlockIOINTEL: return "SubgroupImageBlockIOINTEL"; + case CapabilitySubgroupImageMediaBlockIOINTEL: return "SubgroupImageMediaBlockIOINTEL"; + case CapabilityRoundToInfinityINTEL: return "RoundToInfinityINTEL"; + case CapabilityFloatingPointModeINTEL: return "FloatingPointModeINTEL"; + case CapabilityIntegerFunctions2INTEL: return "IntegerFunctions2INTEL"; + case CapabilityFunctionPointersINTEL: return "FunctionPointersINTEL"; + case CapabilityIndirectReferencesINTEL: return "IndirectReferencesINTEL"; + case CapabilityAsmINTEL: return "AsmINTEL"; + case CapabilityAtomicFloat32MinMaxEXT: return "AtomicFloat32MinMaxEXT"; + case CapabilityAtomicFloat64MinMaxEXT: return "AtomicFloat64MinMaxEXT"; + case CapabilityAtomicFloat16MinMaxEXT: return "AtomicFloat16MinMaxEXT"; + case CapabilityVectorComputeINTEL: return "VectorComputeINTEL"; + case CapabilityVectorAnyINTEL: return "VectorAnyINTEL"; + case CapabilityExpectAssumeKHR: return "ExpectAssumeKHR"; + case CapabilitySubgroupAvcMotionEstimationINTEL: return "SubgroupAvcMotionEstimationINTEL"; + case CapabilitySubgroupAvcMotionEstimationIntraINTEL: return "SubgroupAvcMotionEstimationIntraINTEL"; + case CapabilitySubgroupAvcMotionEstimationChromaINTEL: return "SubgroupAvcMotionEstimationChromaINTEL"; + case CapabilityVariableLengthArrayINTEL: return "VariableLengthArrayINTEL"; + case CapabilityFunctionFloatControlINTEL: return "FunctionFloatControlINTEL"; + case CapabilityFPGAMemoryAttributesINTEL: return "FPGAMemoryAttributesINTEL"; + case CapabilityFPFastMathModeINTEL: return "FPFastMathModeINTEL"; + case CapabilityArbitraryPrecisionIntegersINTEL: return "ArbitraryPrecisionIntegersINTEL"; + case CapabilityArbitraryPrecisionFloatingPointINTEL: return "ArbitraryPrecisionFloatingPointINTEL"; + case CapabilityUnstructuredLoopControlsINTEL: 
return "UnstructuredLoopControlsINTEL"; + case CapabilityFPGALoopControlsINTEL: return "FPGALoopControlsINTEL"; + case CapabilityKernelAttributesINTEL: return "KernelAttributesINTEL"; + case CapabilityFPGAKernelAttributesINTEL: return "FPGAKernelAttributesINTEL"; + case CapabilityFPGAMemoryAccessesINTEL: return "FPGAMemoryAccessesINTEL"; + case CapabilityFPGAClusterAttributesINTEL: return "FPGAClusterAttributesINTEL"; + case CapabilityLoopFuseINTEL: return "LoopFuseINTEL"; + case CapabilityFPGADSPControlINTEL: return "FPGADSPControlINTEL"; + case CapabilityMemoryAccessAliasingINTEL: return "MemoryAccessAliasingINTEL"; + case CapabilityFPGAInvocationPipeliningAttributesINTEL: return "FPGAInvocationPipeliningAttributesINTEL"; + case CapabilityFPGABufferLocationINTEL: return "FPGABufferLocationINTEL"; + case CapabilityArbitraryPrecisionFixedPointINTEL: return "ArbitraryPrecisionFixedPointINTEL"; + case CapabilityUSMStorageClassesINTEL: return "USMStorageClassesINTEL"; + case CapabilityRuntimeAlignedAttributeINTEL: return "RuntimeAlignedAttributeINTEL"; + case CapabilityIOPipesINTEL: return "IOPipesINTEL"; + case CapabilityBlockingPipesINTEL: return "BlockingPipesINTEL"; + case CapabilityFPGARegINTEL: return "FPGARegINTEL"; + case CapabilityDotProductInputAll: return "DotProductInputAll"; + case CapabilityDotProductInput4x8Bit: return "DotProductInput4x8Bit"; + case CapabilityDotProductInput4x8BitPacked: return "DotProductInput4x8BitPacked"; + case CapabilityDotProduct: return "DotProduct"; + case CapabilityRayCullMaskKHR: return "RayCullMaskKHR"; + case CapabilityCooperativeMatrixKHR: return "CooperativeMatrixKHR"; + case CapabilityReplicatedCompositesEXT: return "ReplicatedCompositesEXT"; + case CapabilityBitInstructions: return "BitInstructions"; + case CapabilityGroupNonUniformRotateKHR: return "GroupNonUniformRotateKHR"; + case CapabilityFloatControls2: return "FloatControls2"; + case CapabilityAtomicFloat32AddEXT: return "AtomicFloat32AddEXT"; + case CapabilityAtomicFloat64AddEXT: return "AtomicFloat64AddEXT"; + case CapabilityLongCompositesINTEL: return "LongCompositesINTEL"; + case CapabilityOptNoneEXT: return "OptNoneEXT"; + case CapabilityAtomicFloat16AddEXT: return "AtomicFloat16AddEXT"; + case CapabilityDebugInfoModuleINTEL: return "DebugInfoModuleINTEL"; + case CapabilityBFloat16ConversionINTEL: return "BFloat16ConversionINTEL"; + case CapabilitySplitBarrierINTEL: return "SplitBarrierINTEL"; + case CapabilityArithmeticFenceEXT: return "ArithmeticFenceEXT"; + case CapabilityFPGAClusterAttributesV2INTEL: return "FPGAClusterAttributesV2INTEL"; + case CapabilityFPGAKernelAttributesv2INTEL: return "FPGAKernelAttributesv2INTEL"; + case CapabilityTaskSequenceINTEL: return "TaskSequenceINTEL"; + case CapabilityFPMaxErrorINTEL: return "FPMaxErrorINTEL"; + case CapabilityFPGALatencyControlINTEL: return "FPGALatencyControlINTEL"; + case CapabilityFPGAArgumentInterfacesINTEL: return "FPGAArgumentInterfacesINTEL"; + case CapabilityGlobalVariableHostAccessINTEL: return "GlobalVariableHostAccessINTEL"; + case CapabilityGlobalVariableFPGADecorationsINTEL: return "GlobalVariableFPGADecorationsINTEL"; + case CapabilitySubgroupBufferPrefetchINTEL: return "SubgroupBufferPrefetchINTEL"; + case CapabilitySubgroup2DBlockIOINTEL: return "Subgroup2DBlockIOINTEL"; + case CapabilitySubgroup2DBlockTransformINTEL: return "Subgroup2DBlockTransformINTEL"; + case CapabilitySubgroup2DBlockTransposeINTEL: return "Subgroup2DBlockTransposeINTEL"; + case CapabilitySubgroupMatrixMultiplyAccumulateINTEL: return 
"SubgroupMatrixMultiplyAccumulateINTEL"; + case CapabilityTernaryBitwiseFunctionINTEL: return "TernaryBitwiseFunctionINTEL"; + case CapabilityGroupUniformArithmeticKHR: return "GroupUniformArithmeticKHR"; + case CapabilityTensorFloat32RoundingINTEL: return "TensorFloat32RoundingINTEL"; + case CapabilityMaskedGatherScatterINTEL: return "MaskedGatherScatterINTEL"; + case CapabilityCacheControlsINTEL: return "CacheControlsINTEL"; + case CapabilityRegisterLimitsINTEL: return "RegisterLimitsINTEL"; + case CapabilityBindlessImagesINTEL: return "BindlessImagesINTEL"; + default: return "Unknown"; + } +} + +inline const char* RayQueryIntersectionToString(RayQueryIntersection value) { + switch (value) { + case RayQueryIntersectionRayQueryCandidateIntersectionKHR: return "RayQueryCandidateIntersectionKHR"; + case RayQueryIntersectionRayQueryCommittedIntersectionKHR: return "RayQueryCommittedIntersectionKHR"; + default: return "Unknown"; + } +} + +inline const char* RayQueryCommittedIntersectionTypeToString(RayQueryCommittedIntersectionType value) { + switch (value) { + case RayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionNoneKHR: return "RayQueryCommittedIntersectionNoneKHR"; + case RayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionTriangleKHR: return "RayQueryCommittedIntersectionTriangleKHR"; + case RayQueryCommittedIntersectionTypeRayQueryCommittedIntersectionGeneratedKHR: return "RayQueryCommittedIntersectionGeneratedKHR"; + default: return "Unknown"; + } +} + +inline const char* RayQueryCandidateIntersectionTypeToString(RayQueryCandidateIntersectionType value) { + switch (value) { + case RayQueryCandidateIntersectionTypeRayQueryCandidateIntersectionTriangleKHR: return "RayQueryCandidateIntersectionTriangleKHR"; + case RayQueryCandidateIntersectionTypeRayQueryCandidateIntersectionAABBKHR: return "RayQueryCandidateIntersectionAABBKHR"; + default: return "Unknown"; + } +} + +inline const char* FPDenormModeToString(FPDenormMode value) { + switch (value) { + case FPDenormModePreserve: return "Preserve"; + case FPDenormModeFlushToZero: return "FlushToZero"; + default: return "Unknown"; + } +} + +inline const char* FPOperationModeToString(FPOperationMode value) { + switch (value) { + case FPOperationModeIEEE: return "IEEE"; + case FPOperationModeALT: return "ALT"; + default: return "Unknown"; + } +} + +inline const char* QuantizationModesToString(QuantizationModes value) { + switch (value) { + case QuantizationModesTRN: return "TRN"; + case QuantizationModesTRN_ZERO: return "TRN_ZERO"; + case QuantizationModesRND: return "RND"; + case QuantizationModesRND_ZERO: return "RND_ZERO"; + case QuantizationModesRND_INF: return "RND_INF"; + case QuantizationModesRND_MIN_INF: return "RND_MIN_INF"; + case QuantizationModesRND_CONV: return "RND_CONV"; + case QuantizationModesRND_CONV_ODD: return "RND_CONV_ODD"; + default: return "Unknown"; + } +} + +inline const char* OverflowModesToString(OverflowModes value) { + switch (value) { + case OverflowModesWRAP: return "WRAP"; + case OverflowModesSAT: return "SAT"; + case OverflowModesSAT_ZERO: return "SAT_ZERO"; + case OverflowModesSAT_SYM: return "SAT_SYM"; + default: return "Unknown"; + } +} + +inline const char* PackedVectorFormatToString(PackedVectorFormat value) { + switch (value) { + case PackedVectorFormatPackedVectorFormat4x8Bit: return "PackedVectorFormat4x8Bit"; + default: return "Unknown"; + } +} + +inline const char* CooperativeMatrixLayoutToString(CooperativeMatrixLayout value) { + switch (value) { + case 
CooperativeMatrixLayoutRowMajorKHR: return "RowMajorKHR"; + case CooperativeMatrixLayoutColumnMajorKHR: return "ColumnMajorKHR"; + case CooperativeMatrixLayoutRowBlockedInterleavedARM: return "RowBlockedInterleavedARM"; + case CooperativeMatrixLayoutColumnBlockedInterleavedARM: return "ColumnBlockedInterleavedARM"; + default: return "Unknown"; + } +} + +inline const char* CooperativeMatrixUseToString(CooperativeMatrixUse value) { + switch (value) { + case CooperativeMatrixUseMatrixAKHR: return "MatrixAKHR"; + case CooperativeMatrixUseMatrixBKHR: return "MatrixBKHR"; + case CooperativeMatrixUseMatrixAccumulatorKHR: return "MatrixAccumulatorKHR"; + default: return "Unknown"; + } +} + +inline const char* TensorClampModeToString(TensorClampMode value) { + switch (value) { + case TensorClampModeUndefined: return "Undefined"; + case TensorClampModeConstant: return "Constant"; + case TensorClampModeClampToEdge: return "ClampToEdge"; + case TensorClampModeRepeat: return "Repeat"; + case TensorClampModeRepeatMirrored: return "RepeatMirrored"; + default: return "Unknown"; + } +} + +inline const char* InitializationModeQualifierToString(InitializationModeQualifier value) { + switch (value) { + case InitializationModeQualifierInitOnDeviceReprogramINTEL: return "InitOnDeviceReprogramINTEL"; + case InitializationModeQualifierInitOnDeviceResetINTEL: return "InitOnDeviceResetINTEL"; + default: return "Unknown"; + } +} + +inline const char* HostAccessQualifierToString(HostAccessQualifier value) { + switch (value) { + case HostAccessQualifierNoneINTEL: return "NoneINTEL"; + case HostAccessQualifierReadINTEL: return "ReadINTEL"; + case HostAccessQualifierWriteINTEL: return "WriteINTEL"; + case HostAccessQualifierReadWriteINTEL: return "ReadWriteINTEL"; + default: return "Unknown"; + } +} + +inline const char* LoadCacheControlToString(LoadCacheControl value) { + switch (value) { + case LoadCacheControlUncachedINTEL: return "UncachedINTEL"; + case LoadCacheControlCachedINTEL: return "CachedINTEL"; + case LoadCacheControlStreamingINTEL: return "StreamingINTEL"; + case LoadCacheControlInvalidateAfterReadINTEL: return "InvalidateAfterReadINTEL"; + case LoadCacheControlConstCachedINTEL: return "ConstCachedINTEL"; + default: return "Unknown"; + } +} + +inline const char* StoreCacheControlToString(StoreCacheControl value) { + switch (value) { + case StoreCacheControlUncachedINTEL: return "UncachedINTEL"; + case StoreCacheControlWriteThroughINTEL: return "WriteThroughINTEL"; + case StoreCacheControlWriteBackINTEL: return "WriteBackINTEL"; + case StoreCacheControlStreamingINTEL: return "StreamingINTEL"; + default: return "Unknown"; + } +} + +inline const char* NamedMaximumNumberOfRegistersToString(NamedMaximumNumberOfRegisters value) { + switch (value) { + case NamedMaximumNumberOfRegistersAutoINTEL: return "AutoINTEL"; + default: return "Unknown"; + } +} + +inline const char* FPEncodingToString(FPEncoding value) { + switch (value) { + case FPEncodingBFloat16KHR: return "BFloat16KHR"; + case FPEncodingFloat8E4M3EXT: return "Float8E4M3EXT"; + case FPEncodingFloat8E5M2EXT: return "Float8E5M2EXT"; + default: return "Unknown"; + } +} + +inline const char* CooperativeVectorMatrixLayoutToString(CooperativeVectorMatrixLayout value) { + switch (value) { + case CooperativeVectorMatrixLayoutRowMajorNV: return "RowMajorNV"; + case CooperativeVectorMatrixLayoutColumnMajorNV: return "ColumnMajorNV"; + case CooperativeVectorMatrixLayoutInferencingOptimalNV: return "InferencingOptimalNV"; + case 
CooperativeVectorMatrixLayoutTrainingOptimalNV: return "TrainingOptimalNV"; + default: return "Unknown"; + } +} + +inline const char* ComponentTypeToString(ComponentType value) { + switch (value) { + case ComponentTypeFloat16NV: return "Float16NV"; + case ComponentTypeFloat32NV: return "Float32NV"; + case ComponentTypeFloat64NV: return "Float64NV"; + case ComponentTypeSignedInt8NV: return "SignedInt8NV"; + case ComponentTypeSignedInt16NV: return "SignedInt16NV"; + case ComponentTypeSignedInt32NV: return "SignedInt32NV"; + case ComponentTypeSignedInt64NV: return "SignedInt64NV"; + case ComponentTypeUnsignedInt8NV: return "UnsignedInt8NV"; + case ComponentTypeUnsignedInt16NV: return "UnsignedInt16NV"; + case ComponentTypeUnsignedInt32NV: return "UnsignedInt32NV"; + case ComponentTypeUnsignedInt64NV: return "UnsignedInt64NV"; + case ComponentTypeSignedInt8PackedNV: return "SignedInt8PackedNV"; + case ComponentTypeUnsignedInt8PackedNV: return "UnsignedInt8PackedNV"; + case ComponentTypeFloatE4M3NV: return "FloatE4M3NV"; + case ComponentTypeFloatE5M2NV: return "FloatE5M2NV"; + default: return "Unknown"; + } +} + +inline const char* OpToString(Op value) { + switch (value) { + case OpNop: return "OpNop"; + case OpUndef: return "OpUndef"; + case OpSourceContinued: return "OpSourceContinued"; + case OpSource: return "OpSource"; + case OpSourceExtension: return "OpSourceExtension"; + case OpName: return "OpName"; + case OpMemberName: return "OpMemberName"; + case OpString: return "OpString"; + case OpLine: return "OpLine"; + case OpExtension: return "OpExtension"; + case OpExtInstImport: return "OpExtInstImport"; + case OpExtInst: return "OpExtInst"; + case OpMemoryModel: return "OpMemoryModel"; + case OpEntryPoint: return "OpEntryPoint"; + case OpExecutionMode: return "OpExecutionMode"; + case OpCapability: return "OpCapability"; + case OpTypeVoid: return "OpTypeVoid"; + case OpTypeBool: return "OpTypeBool"; + case OpTypeInt: return "OpTypeInt"; + case OpTypeFloat: return "OpTypeFloat"; + case OpTypeVector: return "OpTypeVector"; + case OpTypeMatrix: return "OpTypeMatrix"; + case OpTypeImage: return "OpTypeImage"; + case OpTypeSampler: return "OpTypeSampler"; + case OpTypeSampledImage: return "OpTypeSampledImage"; + case OpTypeArray: return "OpTypeArray"; + case OpTypeRuntimeArray: return "OpTypeRuntimeArray"; + case OpTypeStruct: return "OpTypeStruct"; + case OpTypeOpaque: return "OpTypeOpaque"; + case OpTypePointer: return "OpTypePointer"; + case OpTypeFunction: return "OpTypeFunction"; + case OpTypeEvent: return "OpTypeEvent"; + case OpTypeDeviceEvent: return "OpTypeDeviceEvent"; + case OpTypeReserveId: return "OpTypeReserveId"; + case OpTypeQueue: return "OpTypeQueue"; + case OpTypePipe: return "OpTypePipe"; + case OpTypeForwardPointer: return "OpTypeForwardPointer"; + case OpConstantTrue: return "OpConstantTrue"; + case OpConstantFalse: return "OpConstantFalse"; + case OpConstant: return "OpConstant"; + case OpConstantComposite: return "OpConstantComposite"; + case OpConstantSampler: return "OpConstantSampler"; + case OpConstantNull: return "OpConstantNull"; + case OpSpecConstantTrue: return "OpSpecConstantTrue"; + case OpSpecConstantFalse: return "OpSpecConstantFalse"; + case OpSpecConstant: return "OpSpecConstant"; + case OpSpecConstantComposite: return "OpSpecConstantComposite"; + case OpSpecConstantOp: return "OpSpecConstantOp"; + case OpFunction: return "OpFunction"; + case OpFunctionParameter: return "OpFunctionParameter"; + case OpFunctionEnd: return "OpFunctionEnd"; + case 
OpFunctionCall: return "OpFunctionCall"; + case OpVariable: return "OpVariable"; + case OpImageTexelPointer: return "OpImageTexelPointer"; + case OpLoad: return "OpLoad"; + case OpStore: return "OpStore"; + case OpCopyMemory: return "OpCopyMemory"; + case OpCopyMemorySized: return "OpCopyMemorySized"; + case OpAccessChain: return "OpAccessChain"; + case OpInBoundsAccessChain: return "OpInBoundsAccessChain"; + case OpPtrAccessChain: return "OpPtrAccessChain"; + case OpArrayLength: return "OpArrayLength"; + case OpGenericPtrMemSemantics: return "OpGenericPtrMemSemantics"; + case OpInBoundsPtrAccessChain: return "OpInBoundsPtrAccessChain"; + case OpDecorate: return "OpDecorate"; + case OpMemberDecorate: return "OpMemberDecorate"; + case OpDecorationGroup: return "OpDecorationGroup"; + case OpGroupDecorate: return "OpGroupDecorate"; + case OpGroupMemberDecorate: return "OpGroupMemberDecorate"; + case OpVectorExtractDynamic: return "OpVectorExtractDynamic"; + case OpVectorInsertDynamic: return "OpVectorInsertDynamic"; + case OpVectorShuffle: return "OpVectorShuffle"; + case OpCompositeConstruct: return "OpCompositeConstruct"; + case OpCompositeExtract: return "OpCompositeExtract"; + case OpCompositeInsert: return "OpCompositeInsert"; + case OpCopyObject: return "OpCopyObject"; + case OpTranspose: return "OpTranspose"; + case OpSampledImage: return "OpSampledImage"; + case OpImageSampleImplicitLod: return "OpImageSampleImplicitLod"; + case OpImageSampleExplicitLod: return "OpImageSampleExplicitLod"; + case OpImageSampleDrefImplicitLod: return "OpImageSampleDrefImplicitLod"; + case OpImageSampleDrefExplicitLod: return "OpImageSampleDrefExplicitLod"; + case OpImageSampleProjImplicitLod: return "OpImageSampleProjImplicitLod"; + case OpImageSampleProjExplicitLod: return "OpImageSampleProjExplicitLod"; + case OpImageSampleProjDrefImplicitLod: return "OpImageSampleProjDrefImplicitLod"; + case OpImageSampleProjDrefExplicitLod: return "OpImageSampleProjDrefExplicitLod"; + case OpImageFetch: return "OpImageFetch"; + case OpImageGather: return "OpImageGather"; + case OpImageDrefGather: return "OpImageDrefGather"; + case OpImageRead: return "OpImageRead"; + case OpImageWrite: return "OpImageWrite"; + case OpImage: return "OpImage"; + case OpImageQueryFormat: return "OpImageQueryFormat"; + case OpImageQueryOrder: return "OpImageQueryOrder"; + case OpImageQuerySizeLod: return "OpImageQuerySizeLod"; + case OpImageQuerySize: return "OpImageQuerySize"; + case OpImageQueryLod: return "OpImageQueryLod"; + case OpImageQueryLevels: return "OpImageQueryLevels"; + case OpImageQuerySamples: return "OpImageQuerySamples"; + case OpConvertFToU: return "OpConvertFToU"; + case OpConvertFToS: return "OpConvertFToS"; + case OpConvertSToF: return "OpConvertSToF"; + case OpConvertUToF: return "OpConvertUToF"; + case OpUConvert: return "OpUConvert"; + case OpSConvert: return "OpSConvert"; + case OpFConvert: return "OpFConvert"; + case OpQuantizeToF16: return "OpQuantizeToF16"; + case OpConvertPtrToU: return "OpConvertPtrToU"; + case OpSatConvertSToU: return "OpSatConvertSToU"; + case OpSatConvertUToS: return "OpSatConvertUToS"; + case OpConvertUToPtr: return "OpConvertUToPtr"; + case OpPtrCastToGeneric: return "OpPtrCastToGeneric"; + case OpGenericCastToPtr: return "OpGenericCastToPtr"; + case OpGenericCastToPtrExplicit: return "OpGenericCastToPtrExplicit"; + case OpBitcast: return "OpBitcast"; + case OpSNegate: return "OpSNegate"; + case OpFNegate: return "OpFNegate"; + case OpIAdd: return "OpIAdd"; + case OpFAdd: return 
"OpFAdd"; + case OpISub: return "OpISub"; + case OpFSub: return "OpFSub"; + case OpIMul: return "OpIMul"; + case OpFMul: return "OpFMul"; + case OpUDiv: return "OpUDiv"; + case OpSDiv: return "OpSDiv"; + case OpFDiv: return "OpFDiv"; + case OpUMod: return "OpUMod"; + case OpSRem: return "OpSRem"; + case OpSMod: return "OpSMod"; + case OpFRem: return "OpFRem"; + case OpFMod: return "OpFMod"; + case OpVectorTimesScalar: return "OpVectorTimesScalar"; + case OpMatrixTimesScalar: return "OpMatrixTimesScalar"; + case OpVectorTimesMatrix: return "OpVectorTimesMatrix"; + case OpMatrixTimesVector: return "OpMatrixTimesVector"; + case OpMatrixTimesMatrix: return "OpMatrixTimesMatrix"; + case OpOuterProduct: return "OpOuterProduct"; + case OpDot: return "OpDot"; + case OpIAddCarry: return "OpIAddCarry"; + case OpISubBorrow: return "OpISubBorrow"; + case OpUMulExtended: return "OpUMulExtended"; + case OpSMulExtended: return "OpSMulExtended"; + case OpAny: return "OpAny"; + case OpAll: return "OpAll"; + case OpIsNan: return "OpIsNan"; + case OpIsInf: return "OpIsInf"; + case OpIsFinite: return "OpIsFinite"; + case OpIsNormal: return "OpIsNormal"; + case OpSignBitSet: return "OpSignBitSet"; + case OpLessOrGreater: return "OpLessOrGreater"; + case OpOrdered: return "OpOrdered"; + case OpUnordered: return "OpUnordered"; + case OpLogicalEqual: return "OpLogicalEqual"; + case OpLogicalNotEqual: return "OpLogicalNotEqual"; + case OpLogicalOr: return "OpLogicalOr"; + case OpLogicalAnd: return "OpLogicalAnd"; + case OpLogicalNot: return "OpLogicalNot"; + case OpSelect: return "OpSelect"; + case OpIEqual: return "OpIEqual"; + case OpINotEqual: return "OpINotEqual"; + case OpUGreaterThan: return "OpUGreaterThan"; + case OpSGreaterThan: return "OpSGreaterThan"; + case OpUGreaterThanEqual: return "OpUGreaterThanEqual"; + case OpSGreaterThanEqual: return "OpSGreaterThanEqual"; + case OpULessThan: return "OpULessThan"; + case OpSLessThan: return "OpSLessThan"; + case OpULessThanEqual: return "OpULessThanEqual"; + case OpSLessThanEqual: return "OpSLessThanEqual"; + case OpFOrdEqual: return "OpFOrdEqual"; + case OpFUnordEqual: return "OpFUnordEqual"; + case OpFOrdNotEqual: return "OpFOrdNotEqual"; + case OpFUnordNotEqual: return "OpFUnordNotEqual"; + case OpFOrdLessThan: return "OpFOrdLessThan"; + case OpFUnordLessThan: return "OpFUnordLessThan"; + case OpFOrdGreaterThan: return "OpFOrdGreaterThan"; + case OpFUnordGreaterThan: return "OpFUnordGreaterThan"; + case OpFOrdLessThanEqual: return "OpFOrdLessThanEqual"; + case OpFUnordLessThanEqual: return "OpFUnordLessThanEqual"; + case OpFOrdGreaterThanEqual: return "OpFOrdGreaterThanEqual"; + case OpFUnordGreaterThanEqual: return "OpFUnordGreaterThanEqual"; + case OpShiftRightLogical: return "OpShiftRightLogical"; + case OpShiftRightArithmetic: return "OpShiftRightArithmetic"; + case OpShiftLeftLogical: return "OpShiftLeftLogical"; + case OpBitwiseOr: return "OpBitwiseOr"; + case OpBitwiseXor: return "OpBitwiseXor"; + case OpBitwiseAnd: return "OpBitwiseAnd"; + case OpNot: return "OpNot"; + case OpBitFieldInsert: return "OpBitFieldInsert"; + case OpBitFieldSExtract: return "OpBitFieldSExtract"; + case OpBitFieldUExtract: return "OpBitFieldUExtract"; + case OpBitReverse: return "OpBitReverse"; + case OpBitCount: return "OpBitCount"; + case OpDPdx: return "OpDPdx"; + case OpDPdy: return "OpDPdy"; + case OpFwidth: return "OpFwidth"; + case OpDPdxFine: return "OpDPdxFine"; + case OpDPdyFine: return "OpDPdyFine"; + case OpFwidthFine: return "OpFwidthFine"; + case OpDPdxCoarse: 
return "OpDPdxCoarse"; + case OpDPdyCoarse: return "OpDPdyCoarse"; + case OpFwidthCoarse: return "OpFwidthCoarse"; + case OpEmitVertex: return "OpEmitVertex"; + case OpEndPrimitive: return "OpEndPrimitive"; + case OpEmitStreamVertex: return "OpEmitStreamVertex"; + case OpEndStreamPrimitive: return "OpEndStreamPrimitive"; + case OpControlBarrier: return "OpControlBarrier"; + case OpMemoryBarrier: return "OpMemoryBarrier"; + case OpAtomicLoad: return "OpAtomicLoad"; + case OpAtomicStore: return "OpAtomicStore"; + case OpAtomicExchange: return "OpAtomicExchange"; + case OpAtomicCompareExchange: return "OpAtomicCompareExchange"; + case OpAtomicCompareExchangeWeak: return "OpAtomicCompareExchangeWeak"; + case OpAtomicIIncrement: return "OpAtomicIIncrement"; + case OpAtomicIDecrement: return "OpAtomicIDecrement"; + case OpAtomicIAdd: return "OpAtomicIAdd"; + case OpAtomicISub: return "OpAtomicISub"; + case OpAtomicSMin: return "OpAtomicSMin"; + case OpAtomicUMin: return "OpAtomicUMin"; + case OpAtomicSMax: return "OpAtomicSMax"; + case OpAtomicUMax: return "OpAtomicUMax"; + case OpAtomicAnd: return "OpAtomicAnd"; + case OpAtomicOr: return "OpAtomicOr"; + case OpAtomicXor: return "OpAtomicXor"; + case OpPhi: return "OpPhi"; + case OpLoopMerge: return "OpLoopMerge"; + case OpSelectionMerge: return "OpSelectionMerge"; + case OpLabel: return "OpLabel"; + case OpBranch: return "OpBranch"; + case OpBranchConditional: return "OpBranchConditional"; + case OpSwitch: return "OpSwitch"; + case OpKill: return "OpKill"; + case OpReturn: return "OpReturn"; + case OpReturnValue: return "OpReturnValue"; + case OpUnreachable: return "OpUnreachable"; + case OpLifetimeStart: return "OpLifetimeStart"; + case OpLifetimeStop: return "OpLifetimeStop"; + case OpGroupAsyncCopy: return "OpGroupAsyncCopy"; + case OpGroupWaitEvents: return "OpGroupWaitEvents"; + case OpGroupAll: return "OpGroupAll"; + case OpGroupAny: return "OpGroupAny"; + case OpGroupBroadcast: return "OpGroupBroadcast"; + case OpGroupIAdd: return "OpGroupIAdd"; + case OpGroupFAdd: return "OpGroupFAdd"; + case OpGroupFMin: return "OpGroupFMin"; + case OpGroupUMin: return "OpGroupUMin"; + case OpGroupSMin: return "OpGroupSMin"; + case OpGroupFMax: return "OpGroupFMax"; + case OpGroupUMax: return "OpGroupUMax"; + case OpGroupSMax: return "OpGroupSMax"; + case OpReadPipe: return "OpReadPipe"; + case OpWritePipe: return "OpWritePipe"; + case OpReservedReadPipe: return "OpReservedReadPipe"; + case OpReservedWritePipe: return "OpReservedWritePipe"; + case OpReserveReadPipePackets: return "OpReserveReadPipePackets"; + case OpReserveWritePipePackets: return "OpReserveWritePipePackets"; + case OpCommitReadPipe: return "OpCommitReadPipe"; + case OpCommitWritePipe: return "OpCommitWritePipe"; + case OpIsValidReserveId: return "OpIsValidReserveId"; + case OpGetNumPipePackets: return "OpGetNumPipePackets"; + case OpGetMaxPipePackets: return "OpGetMaxPipePackets"; + case OpGroupReserveReadPipePackets: return "OpGroupReserveReadPipePackets"; + case OpGroupReserveWritePipePackets: return "OpGroupReserveWritePipePackets"; + case OpGroupCommitReadPipe: return "OpGroupCommitReadPipe"; + case OpGroupCommitWritePipe: return "OpGroupCommitWritePipe"; + case OpEnqueueMarker: return "OpEnqueueMarker"; + case OpEnqueueKernel: return "OpEnqueueKernel"; + case OpGetKernelNDrangeSubGroupCount: return "OpGetKernelNDrangeSubGroupCount"; + case OpGetKernelNDrangeMaxSubGroupSize: return "OpGetKernelNDrangeMaxSubGroupSize"; + case OpGetKernelWorkGroupSize: return 
"OpGetKernelWorkGroupSize"; + case OpGetKernelPreferredWorkGroupSizeMultiple: return "OpGetKernelPreferredWorkGroupSizeMultiple"; + case OpRetainEvent: return "OpRetainEvent"; + case OpReleaseEvent: return "OpReleaseEvent"; + case OpCreateUserEvent: return "OpCreateUserEvent"; + case OpIsValidEvent: return "OpIsValidEvent"; + case OpSetUserEventStatus: return "OpSetUserEventStatus"; + case OpCaptureEventProfilingInfo: return "OpCaptureEventProfilingInfo"; + case OpGetDefaultQueue: return "OpGetDefaultQueue"; + case OpBuildNDRange: return "OpBuildNDRange"; + case OpImageSparseSampleImplicitLod: return "OpImageSparseSampleImplicitLod"; + case OpImageSparseSampleExplicitLod: return "OpImageSparseSampleExplicitLod"; + case OpImageSparseSampleDrefImplicitLod: return "OpImageSparseSampleDrefImplicitLod"; + case OpImageSparseSampleDrefExplicitLod: return "OpImageSparseSampleDrefExplicitLod"; + case OpImageSparseSampleProjImplicitLod: return "OpImageSparseSampleProjImplicitLod"; + case OpImageSparseSampleProjExplicitLod: return "OpImageSparseSampleProjExplicitLod"; + case OpImageSparseSampleProjDrefImplicitLod: return "OpImageSparseSampleProjDrefImplicitLod"; + case OpImageSparseSampleProjDrefExplicitLod: return "OpImageSparseSampleProjDrefExplicitLod"; + case OpImageSparseFetch: return "OpImageSparseFetch"; + case OpImageSparseGather: return "OpImageSparseGather"; + case OpImageSparseDrefGather: return "OpImageSparseDrefGather"; + case OpImageSparseTexelsResident: return "OpImageSparseTexelsResident"; + case OpNoLine: return "OpNoLine"; + case OpAtomicFlagTestAndSet: return "OpAtomicFlagTestAndSet"; + case OpAtomicFlagClear: return "OpAtomicFlagClear"; + case OpImageSparseRead: return "OpImageSparseRead"; + case OpSizeOf: return "OpSizeOf"; + case OpTypePipeStorage: return "OpTypePipeStorage"; + case OpConstantPipeStorage: return "OpConstantPipeStorage"; + case OpCreatePipeFromPipeStorage: return "OpCreatePipeFromPipeStorage"; + case OpGetKernelLocalSizeForSubgroupCount: return "OpGetKernelLocalSizeForSubgroupCount"; + case OpGetKernelMaxNumSubgroups: return "OpGetKernelMaxNumSubgroups"; + case OpTypeNamedBarrier: return "OpTypeNamedBarrier"; + case OpNamedBarrierInitialize: return "OpNamedBarrierInitialize"; + case OpMemoryNamedBarrier: return "OpMemoryNamedBarrier"; + case OpModuleProcessed: return "OpModuleProcessed"; + case OpExecutionModeId: return "OpExecutionModeId"; + case OpDecorateId: return "OpDecorateId"; + case OpGroupNonUniformElect: return "OpGroupNonUniformElect"; + case OpGroupNonUniformAll: return "OpGroupNonUniformAll"; + case OpGroupNonUniformAny: return "OpGroupNonUniformAny"; + case OpGroupNonUniformAllEqual: return "OpGroupNonUniformAllEqual"; + case OpGroupNonUniformBroadcast: return "OpGroupNonUniformBroadcast"; + case OpGroupNonUniformBroadcastFirst: return "OpGroupNonUniformBroadcastFirst"; + case OpGroupNonUniformBallot: return "OpGroupNonUniformBallot"; + case OpGroupNonUniformInverseBallot: return "OpGroupNonUniformInverseBallot"; + case OpGroupNonUniformBallotBitExtract: return "OpGroupNonUniformBallotBitExtract"; + case OpGroupNonUniformBallotBitCount: return "OpGroupNonUniformBallotBitCount"; + case OpGroupNonUniformBallotFindLSB: return "OpGroupNonUniformBallotFindLSB"; + case OpGroupNonUniformBallotFindMSB: return "OpGroupNonUniformBallotFindMSB"; + case OpGroupNonUniformShuffle: return "OpGroupNonUniformShuffle"; + case OpGroupNonUniformShuffleXor: return "OpGroupNonUniformShuffleXor"; + case OpGroupNonUniformShuffleUp: return "OpGroupNonUniformShuffleUp"; + 
case OpGroupNonUniformShuffleDown: return "OpGroupNonUniformShuffleDown"; + case OpGroupNonUniformIAdd: return "OpGroupNonUniformIAdd"; + case OpGroupNonUniformFAdd: return "OpGroupNonUniformFAdd"; + case OpGroupNonUniformIMul: return "OpGroupNonUniformIMul"; + case OpGroupNonUniformFMul: return "OpGroupNonUniformFMul"; + case OpGroupNonUniformSMin: return "OpGroupNonUniformSMin"; + case OpGroupNonUniformUMin: return "OpGroupNonUniformUMin"; + case OpGroupNonUniformFMin: return "OpGroupNonUniformFMin"; + case OpGroupNonUniformSMax: return "OpGroupNonUniformSMax"; + case OpGroupNonUniformUMax: return "OpGroupNonUniformUMax"; + case OpGroupNonUniformFMax: return "OpGroupNonUniformFMax"; + case OpGroupNonUniformBitwiseAnd: return "OpGroupNonUniformBitwiseAnd"; + case OpGroupNonUniformBitwiseOr: return "OpGroupNonUniformBitwiseOr"; + case OpGroupNonUniformBitwiseXor: return "OpGroupNonUniformBitwiseXor"; + case OpGroupNonUniformLogicalAnd: return "OpGroupNonUniformLogicalAnd"; + case OpGroupNonUniformLogicalOr: return "OpGroupNonUniformLogicalOr"; + case OpGroupNonUniformLogicalXor: return "OpGroupNonUniformLogicalXor"; + case OpGroupNonUniformQuadBroadcast: return "OpGroupNonUniformQuadBroadcast"; + case OpGroupNonUniformQuadSwap: return "OpGroupNonUniformQuadSwap"; + case OpCopyLogical: return "OpCopyLogical"; + case OpPtrEqual: return "OpPtrEqual"; + case OpPtrNotEqual: return "OpPtrNotEqual"; + case OpPtrDiff: return "OpPtrDiff"; + case OpColorAttachmentReadEXT: return "OpColorAttachmentReadEXT"; + case OpDepthAttachmentReadEXT: return "OpDepthAttachmentReadEXT"; + case OpStencilAttachmentReadEXT: return "OpStencilAttachmentReadEXT"; + case OpTypeTensorARM: return "OpTypeTensorARM"; + case OpTensorReadARM: return "OpTensorReadARM"; + case OpTensorWriteARM: return "OpTensorWriteARM"; + case OpTensorQuerySizeARM: return "OpTensorQuerySizeARM"; + case OpGraphConstantARM: return "OpGraphConstantARM"; + case OpGraphEntryPointARM: return "OpGraphEntryPointARM"; + case OpGraphARM: return "OpGraphARM"; + case OpGraphInputARM: return "OpGraphInputARM"; + case OpGraphSetOutputARM: return "OpGraphSetOutputARM"; + case OpGraphEndARM: return "OpGraphEndARM"; + case OpTypeGraphARM: return "OpTypeGraphARM"; + case OpTerminateInvocation: return "OpTerminateInvocation"; + case OpTypeUntypedPointerKHR: return "OpTypeUntypedPointerKHR"; + case OpUntypedVariableKHR: return "OpUntypedVariableKHR"; + case OpUntypedAccessChainKHR: return "OpUntypedAccessChainKHR"; + case OpUntypedInBoundsAccessChainKHR: return "OpUntypedInBoundsAccessChainKHR"; + case OpSubgroupBallotKHR: return "OpSubgroupBallotKHR"; + case OpSubgroupFirstInvocationKHR: return "OpSubgroupFirstInvocationKHR"; + case OpUntypedPtrAccessChainKHR: return "OpUntypedPtrAccessChainKHR"; + case OpUntypedInBoundsPtrAccessChainKHR: return "OpUntypedInBoundsPtrAccessChainKHR"; + case OpUntypedArrayLengthKHR: return "OpUntypedArrayLengthKHR"; + case OpUntypedPrefetchKHR: return "OpUntypedPrefetchKHR"; + case OpSubgroupAllKHR: return "OpSubgroupAllKHR"; + case OpSubgroupAnyKHR: return "OpSubgroupAnyKHR"; + case OpSubgroupAllEqualKHR: return "OpSubgroupAllEqualKHR"; + case OpGroupNonUniformRotateKHR: return "OpGroupNonUniformRotateKHR"; + case OpSubgroupReadInvocationKHR: return "OpSubgroupReadInvocationKHR"; + case OpExtInstWithForwardRefsKHR: return "OpExtInstWithForwardRefsKHR"; + case OpTraceRayKHR: return "OpTraceRayKHR"; + case OpExecuteCallableKHR: return "OpExecuteCallableKHR"; + case OpConvertUToAccelerationStructureKHR: return 
"OpConvertUToAccelerationStructureKHR"; + case OpIgnoreIntersectionKHR: return "OpIgnoreIntersectionKHR"; + case OpTerminateRayKHR: return "OpTerminateRayKHR"; + case OpSDot: return "OpSDot"; + case OpUDot: return "OpUDot"; + case OpSUDot: return "OpSUDot"; + case OpSDotAccSat: return "OpSDotAccSat"; + case OpUDotAccSat: return "OpUDotAccSat"; + case OpSUDotAccSat: return "OpSUDotAccSat"; + case OpTypeCooperativeMatrixKHR: return "OpTypeCooperativeMatrixKHR"; + case OpCooperativeMatrixLoadKHR: return "OpCooperativeMatrixLoadKHR"; + case OpCooperativeMatrixStoreKHR: return "OpCooperativeMatrixStoreKHR"; + case OpCooperativeMatrixMulAddKHR: return "OpCooperativeMatrixMulAddKHR"; + case OpCooperativeMatrixLengthKHR: return "OpCooperativeMatrixLengthKHR"; + case OpConstantCompositeReplicateEXT: return "OpConstantCompositeReplicateEXT"; + case OpSpecConstantCompositeReplicateEXT: return "OpSpecConstantCompositeReplicateEXT"; + case OpCompositeConstructReplicateEXT: return "OpCompositeConstructReplicateEXT"; + case OpTypeRayQueryKHR: return "OpTypeRayQueryKHR"; + case OpRayQueryInitializeKHR: return "OpRayQueryInitializeKHR"; + case OpRayQueryTerminateKHR: return "OpRayQueryTerminateKHR"; + case OpRayQueryGenerateIntersectionKHR: return "OpRayQueryGenerateIntersectionKHR"; + case OpRayQueryConfirmIntersectionKHR: return "OpRayQueryConfirmIntersectionKHR"; + case OpRayQueryProceedKHR: return "OpRayQueryProceedKHR"; + case OpRayQueryGetIntersectionTypeKHR: return "OpRayQueryGetIntersectionTypeKHR"; + case OpImageSampleWeightedQCOM: return "OpImageSampleWeightedQCOM"; + case OpImageBoxFilterQCOM: return "OpImageBoxFilterQCOM"; + case OpImageBlockMatchSSDQCOM: return "OpImageBlockMatchSSDQCOM"; + case OpImageBlockMatchSADQCOM: return "OpImageBlockMatchSADQCOM"; + case OpImageBlockMatchWindowSSDQCOM: return "OpImageBlockMatchWindowSSDQCOM"; + case OpImageBlockMatchWindowSADQCOM: return "OpImageBlockMatchWindowSADQCOM"; + case OpImageBlockMatchGatherSSDQCOM: return "OpImageBlockMatchGatherSSDQCOM"; + case OpImageBlockMatchGatherSADQCOM: return "OpImageBlockMatchGatherSADQCOM"; + case OpGroupIAddNonUniformAMD: return "OpGroupIAddNonUniformAMD"; + case OpGroupFAddNonUniformAMD: return "OpGroupFAddNonUniformAMD"; + case OpGroupFMinNonUniformAMD: return "OpGroupFMinNonUniformAMD"; + case OpGroupUMinNonUniformAMD: return "OpGroupUMinNonUniformAMD"; + case OpGroupSMinNonUniformAMD: return "OpGroupSMinNonUniformAMD"; + case OpGroupFMaxNonUniformAMD: return "OpGroupFMaxNonUniformAMD"; + case OpGroupUMaxNonUniformAMD: return "OpGroupUMaxNonUniformAMD"; + case OpGroupSMaxNonUniformAMD: return "OpGroupSMaxNonUniformAMD"; + case OpFragmentMaskFetchAMD: return "OpFragmentMaskFetchAMD"; + case OpFragmentFetchAMD: return "OpFragmentFetchAMD"; + case OpReadClockKHR: return "OpReadClockKHR"; + case OpAllocateNodePayloadsAMDX: return "OpAllocateNodePayloadsAMDX"; + case OpEnqueueNodePayloadsAMDX: return "OpEnqueueNodePayloadsAMDX"; + case OpTypeNodePayloadArrayAMDX: return "OpTypeNodePayloadArrayAMDX"; + case OpFinishWritingNodePayloadAMDX: return "OpFinishWritingNodePayloadAMDX"; + case OpNodePayloadArrayLengthAMDX: return "OpNodePayloadArrayLengthAMDX"; + case OpIsNodePayloadValidAMDX: return "OpIsNodePayloadValidAMDX"; + case OpConstantStringAMDX: return "OpConstantStringAMDX"; + case OpSpecConstantStringAMDX: return "OpSpecConstantStringAMDX"; + case OpGroupNonUniformQuadAllKHR: return "OpGroupNonUniformQuadAllKHR"; + case OpGroupNonUniformQuadAnyKHR: return "OpGroupNonUniformQuadAnyKHR"; + case 
OpHitObjectRecordHitMotionNV: return "OpHitObjectRecordHitMotionNV"; + case OpHitObjectRecordHitWithIndexMotionNV: return "OpHitObjectRecordHitWithIndexMotionNV"; + case OpHitObjectRecordMissMotionNV: return "OpHitObjectRecordMissMotionNV"; + case OpHitObjectGetWorldToObjectNV: return "OpHitObjectGetWorldToObjectNV"; + case OpHitObjectGetObjectToWorldNV: return "OpHitObjectGetObjectToWorldNV"; + case OpHitObjectGetObjectRayDirectionNV: return "OpHitObjectGetObjectRayDirectionNV"; + case OpHitObjectGetObjectRayOriginNV: return "OpHitObjectGetObjectRayOriginNV"; + case OpHitObjectTraceRayMotionNV: return "OpHitObjectTraceRayMotionNV"; + case OpHitObjectGetShaderRecordBufferHandleNV: return "OpHitObjectGetShaderRecordBufferHandleNV"; + case OpHitObjectGetShaderBindingTableRecordIndexNV: return "OpHitObjectGetShaderBindingTableRecordIndexNV"; + case OpHitObjectRecordEmptyNV: return "OpHitObjectRecordEmptyNV"; + case OpHitObjectTraceRayNV: return "OpHitObjectTraceRayNV"; + case OpHitObjectRecordHitNV: return "OpHitObjectRecordHitNV"; + case OpHitObjectRecordHitWithIndexNV: return "OpHitObjectRecordHitWithIndexNV"; + case OpHitObjectRecordMissNV: return "OpHitObjectRecordMissNV"; + case OpHitObjectExecuteShaderNV: return "OpHitObjectExecuteShaderNV"; + case OpHitObjectGetCurrentTimeNV: return "OpHitObjectGetCurrentTimeNV"; + case OpHitObjectGetAttributesNV: return "OpHitObjectGetAttributesNV"; + case OpHitObjectGetHitKindNV: return "OpHitObjectGetHitKindNV"; + case OpHitObjectGetPrimitiveIndexNV: return "OpHitObjectGetPrimitiveIndexNV"; + case OpHitObjectGetGeometryIndexNV: return "OpHitObjectGetGeometryIndexNV"; + case OpHitObjectGetInstanceIdNV: return "OpHitObjectGetInstanceIdNV"; + case OpHitObjectGetInstanceCustomIndexNV: return "OpHitObjectGetInstanceCustomIndexNV"; + case OpHitObjectGetWorldRayDirectionNV: return "OpHitObjectGetWorldRayDirectionNV"; + case OpHitObjectGetWorldRayOriginNV: return "OpHitObjectGetWorldRayOriginNV"; + case OpHitObjectGetRayTMaxNV: return "OpHitObjectGetRayTMaxNV"; + case OpHitObjectGetRayTMinNV: return "OpHitObjectGetRayTMinNV"; + case OpHitObjectIsEmptyNV: return "OpHitObjectIsEmptyNV"; + case OpHitObjectIsHitNV: return "OpHitObjectIsHitNV"; + case OpHitObjectIsMissNV: return "OpHitObjectIsMissNV"; + case OpReorderThreadWithHitObjectNV: return "OpReorderThreadWithHitObjectNV"; + case OpReorderThreadWithHintNV: return "OpReorderThreadWithHintNV"; + case OpTypeHitObjectNV: return "OpTypeHitObjectNV"; + case OpImageSampleFootprintNV: return "OpImageSampleFootprintNV"; + case OpTypeCooperativeVectorNV: return "OpTypeCooperativeVectorNV"; + case OpCooperativeVectorMatrixMulNV: return "OpCooperativeVectorMatrixMulNV"; + case OpCooperativeVectorOuterProductAccumulateNV: return "OpCooperativeVectorOuterProductAccumulateNV"; + case OpCooperativeVectorReduceSumAccumulateNV: return "OpCooperativeVectorReduceSumAccumulateNV"; + case OpCooperativeVectorMatrixMulAddNV: return "OpCooperativeVectorMatrixMulAddNV"; + case OpCooperativeMatrixConvertNV: return "OpCooperativeMatrixConvertNV"; + case OpEmitMeshTasksEXT: return "OpEmitMeshTasksEXT"; + case OpSetMeshOutputsEXT: return "OpSetMeshOutputsEXT"; + case OpGroupNonUniformPartitionNV: return "OpGroupNonUniformPartitionNV"; + case OpWritePackedPrimitiveIndices4x8NV: return "OpWritePackedPrimitiveIndices4x8NV"; + case OpFetchMicroTriangleVertexPositionNV: return "OpFetchMicroTriangleVertexPositionNV"; + case OpFetchMicroTriangleVertexBarycentricNV: return "OpFetchMicroTriangleVertexBarycentricNV"; + case 
OpCooperativeVectorLoadNV: return "OpCooperativeVectorLoadNV"; + case OpCooperativeVectorStoreNV: return "OpCooperativeVectorStoreNV"; + case OpReportIntersectionKHR: return "OpReportIntersectionKHR"; + case OpIgnoreIntersectionNV: return "OpIgnoreIntersectionNV"; + case OpTerminateRayNV: return "OpTerminateRayNV"; + case OpTraceNV: return "OpTraceNV"; + case OpTraceMotionNV: return "OpTraceMotionNV"; + case OpTraceRayMotionNV: return "OpTraceRayMotionNV"; + case OpRayQueryGetIntersectionTriangleVertexPositionsKHR: return "OpRayQueryGetIntersectionTriangleVertexPositionsKHR"; + case OpTypeAccelerationStructureKHR: return "OpTypeAccelerationStructureKHR"; + case OpExecuteCallableNV: return "OpExecuteCallableNV"; + case OpRayQueryGetClusterIdNV: return "OpRayQueryGetClusterIdNV"; + case OpHitObjectGetClusterIdNV: return "OpHitObjectGetClusterIdNV"; + case OpTypeCooperativeMatrixNV: return "OpTypeCooperativeMatrixNV"; + case OpCooperativeMatrixLoadNV: return "OpCooperativeMatrixLoadNV"; + case OpCooperativeMatrixStoreNV: return "OpCooperativeMatrixStoreNV"; + case OpCooperativeMatrixMulAddNV: return "OpCooperativeMatrixMulAddNV"; + case OpCooperativeMatrixLengthNV: return "OpCooperativeMatrixLengthNV"; + case OpBeginInvocationInterlockEXT: return "OpBeginInvocationInterlockEXT"; + case OpEndInvocationInterlockEXT: return "OpEndInvocationInterlockEXT"; + case OpCooperativeMatrixReduceNV: return "OpCooperativeMatrixReduceNV"; + case OpCooperativeMatrixLoadTensorNV: return "OpCooperativeMatrixLoadTensorNV"; + case OpCooperativeMatrixStoreTensorNV: return "OpCooperativeMatrixStoreTensorNV"; + case OpCooperativeMatrixPerElementOpNV: return "OpCooperativeMatrixPerElementOpNV"; + case OpTypeTensorLayoutNV: return "OpTypeTensorLayoutNV"; + case OpTypeTensorViewNV: return "OpTypeTensorViewNV"; + case OpCreateTensorLayoutNV: return "OpCreateTensorLayoutNV"; + case OpTensorLayoutSetDimensionNV: return "OpTensorLayoutSetDimensionNV"; + case OpTensorLayoutSetStrideNV: return "OpTensorLayoutSetStrideNV"; + case OpTensorLayoutSliceNV: return "OpTensorLayoutSliceNV"; + case OpTensorLayoutSetClampValueNV: return "OpTensorLayoutSetClampValueNV"; + case OpCreateTensorViewNV: return "OpCreateTensorViewNV"; + case OpTensorViewSetDimensionNV: return "OpTensorViewSetDimensionNV"; + case OpTensorViewSetStrideNV: return "OpTensorViewSetStrideNV"; + case OpDemoteToHelperInvocation: return "OpDemoteToHelperInvocation"; + case OpIsHelperInvocationEXT: return "OpIsHelperInvocationEXT"; + case OpTensorViewSetClipNV: return "OpTensorViewSetClipNV"; + case OpTensorLayoutSetBlockSizeNV: return "OpTensorLayoutSetBlockSizeNV"; + case OpCooperativeMatrixTransposeNV: return "OpCooperativeMatrixTransposeNV"; + case OpConvertUToImageNV: return "OpConvertUToImageNV"; + case OpConvertUToSamplerNV: return "OpConvertUToSamplerNV"; + case OpConvertImageToUNV: return "OpConvertImageToUNV"; + case OpConvertSamplerToUNV: return "OpConvertSamplerToUNV"; + case OpConvertUToSampledImageNV: return "OpConvertUToSampledImageNV"; + case OpConvertSampledImageToUNV: return "OpConvertSampledImageToUNV"; + case OpSamplerImageAddressingModeNV: return "OpSamplerImageAddressingModeNV"; + case OpRawAccessChainNV: return "OpRawAccessChainNV"; + case OpRayQueryGetIntersectionSpherePositionNV: return "OpRayQueryGetIntersectionSpherePositionNV"; + case OpRayQueryGetIntersectionSphereRadiusNV: return "OpRayQueryGetIntersectionSphereRadiusNV"; + case OpRayQueryGetIntersectionLSSPositionsNV: return "OpRayQueryGetIntersectionLSSPositionsNV"; + case 
OpRayQueryGetIntersectionLSSRadiiNV: return "OpRayQueryGetIntersectionLSSRadiiNV"; + case OpRayQueryGetIntersectionLSSHitValueNV: return "OpRayQueryGetIntersectionLSSHitValueNV"; + case OpHitObjectGetSpherePositionNV: return "OpHitObjectGetSpherePositionNV"; + case OpHitObjectGetSphereRadiusNV: return "OpHitObjectGetSphereRadiusNV"; + case OpHitObjectGetLSSPositionsNV: return "OpHitObjectGetLSSPositionsNV"; + case OpHitObjectGetLSSRadiiNV: return "OpHitObjectGetLSSRadiiNV"; + case OpHitObjectIsSphereHitNV: return "OpHitObjectIsSphereHitNV"; + case OpHitObjectIsLSSHitNV: return "OpHitObjectIsLSSHitNV"; + case OpRayQueryIsSphereHitNV: return "OpRayQueryIsSphereHitNV"; + case OpRayQueryIsLSSHitNV: return "OpRayQueryIsLSSHitNV"; + case OpSubgroupShuffleINTEL: return "OpSubgroupShuffleINTEL"; + case OpSubgroupShuffleDownINTEL: return "OpSubgroupShuffleDownINTEL"; + case OpSubgroupShuffleUpINTEL: return "OpSubgroupShuffleUpINTEL"; + case OpSubgroupShuffleXorINTEL: return "OpSubgroupShuffleXorINTEL"; + case OpSubgroupBlockReadINTEL: return "OpSubgroupBlockReadINTEL"; + case OpSubgroupBlockWriteINTEL: return "OpSubgroupBlockWriteINTEL"; + case OpSubgroupImageBlockReadINTEL: return "OpSubgroupImageBlockReadINTEL"; + case OpSubgroupImageBlockWriteINTEL: return "OpSubgroupImageBlockWriteINTEL"; + case OpSubgroupImageMediaBlockReadINTEL: return "OpSubgroupImageMediaBlockReadINTEL"; + case OpSubgroupImageMediaBlockWriteINTEL: return "OpSubgroupImageMediaBlockWriteINTEL"; + case OpUCountLeadingZerosINTEL: return "OpUCountLeadingZerosINTEL"; + case OpUCountTrailingZerosINTEL: return "OpUCountTrailingZerosINTEL"; + case OpAbsISubINTEL: return "OpAbsISubINTEL"; + case OpAbsUSubINTEL: return "OpAbsUSubINTEL"; + case OpIAddSatINTEL: return "OpIAddSatINTEL"; + case OpUAddSatINTEL: return "OpUAddSatINTEL"; + case OpIAverageINTEL: return "OpIAverageINTEL"; + case OpUAverageINTEL: return "OpUAverageINTEL"; + case OpIAverageRoundedINTEL: return "OpIAverageRoundedINTEL"; + case OpUAverageRoundedINTEL: return "OpUAverageRoundedINTEL"; + case OpISubSatINTEL: return "OpISubSatINTEL"; + case OpUSubSatINTEL: return "OpUSubSatINTEL"; + case OpIMul32x16INTEL: return "OpIMul32x16INTEL"; + case OpUMul32x16INTEL: return "OpUMul32x16INTEL"; + case OpConstantFunctionPointerINTEL: return "OpConstantFunctionPointerINTEL"; + case OpFunctionPointerCallINTEL: return "OpFunctionPointerCallINTEL"; + case OpAsmTargetINTEL: return "OpAsmTargetINTEL"; + case OpAsmINTEL: return "OpAsmINTEL"; + case OpAsmCallINTEL: return "OpAsmCallINTEL"; + case OpAtomicFMinEXT: return "OpAtomicFMinEXT"; + case OpAtomicFMaxEXT: return "OpAtomicFMaxEXT"; + case OpAssumeTrueKHR: return "OpAssumeTrueKHR"; + case OpExpectKHR: return "OpExpectKHR"; + case OpDecorateString: return "OpDecorateString"; + case OpMemberDecorateString: return "OpMemberDecorateString"; + case OpVmeImageINTEL: return "OpVmeImageINTEL"; + case OpTypeVmeImageINTEL: return "OpTypeVmeImageINTEL"; + case OpTypeAvcImePayloadINTEL: return "OpTypeAvcImePayloadINTEL"; + case OpTypeAvcRefPayloadINTEL: return "OpTypeAvcRefPayloadINTEL"; + case OpTypeAvcSicPayloadINTEL: return "OpTypeAvcSicPayloadINTEL"; + case OpTypeAvcMcePayloadINTEL: return "OpTypeAvcMcePayloadINTEL"; + case OpTypeAvcMceResultINTEL: return "OpTypeAvcMceResultINTEL"; + case OpTypeAvcImeResultINTEL: return "OpTypeAvcImeResultINTEL"; + case OpTypeAvcImeResultSingleReferenceStreamoutINTEL: return "OpTypeAvcImeResultSingleReferenceStreamoutINTEL"; + case OpTypeAvcImeResultDualReferenceStreamoutINTEL: return 
"OpTypeAvcImeResultDualReferenceStreamoutINTEL"; + case OpTypeAvcImeSingleReferenceStreaminINTEL: return "OpTypeAvcImeSingleReferenceStreaminINTEL"; + case OpTypeAvcImeDualReferenceStreaminINTEL: return "OpTypeAvcImeDualReferenceStreaminINTEL"; + case OpTypeAvcRefResultINTEL: return "OpTypeAvcRefResultINTEL"; + case OpTypeAvcSicResultINTEL: return "OpTypeAvcSicResultINTEL"; + case OpSubgroupAvcMceGetDefaultInterBaseMultiReferencePenaltyINTEL: return "OpSubgroupAvcMceGetDefaultInterBaseMultiReferencePenaltyINTEL"; + case OpSubgroupAvcMceSetInterBaseMultiReferencePenaltyINTEL: return "OpSubgroupAvcMceSetInterBaseMultiReferencePenaltyINTEL"; + case OpSubgroupAvcMceGetDefaultInterShapePenaltyINTEL: return "OpSubgroupAvcMceGetDefaultInterShapePenaltyINTEL"; + case OpSubgroupAvcMceSetInterShapePenaltyINTEL: return "OpSubgroupAvcMceSetInterShapePenaltyINTEL"; + case OpSubgroupAvcMceGetDefaultInterDirectionPenaltyINTEL: return "OpSubgroupAvcMceGetDefaultInterDirectionPenaltyINTEL"; + case OpSubgroupAvcMceSetInterDirectionPenaltyINTEL: return "OpSubgroupAvcMceSetInterDirectionPenaltyINTEL"; + case OpSubgroupAvcMceGetDefaultIntraLumaShapePenaltyINTEL: return "OpSubgroupAvcMceGetDefaultIntraLumaShapePenaltyINTEL"; + case OpSubgroupAvcMceGetDefaultInterMotionVectorCostTableINTEL: return "OpSubgroupAvcMceGetDefaultInterMotionVectorCostTableINTEL"; + case OpSubgroupAvcMceGetDefaultHighPenaltyCostTableINTEL: return "OpSubgroupAvcMceGetDefaultHighPenaltyCostTableINTEL"; + case OpSubgroupAvcMceGetDefaultMediumPenaltyCostTableINTEL: return "OpSubgroupAvcMceGetDefaultMediumPenaltyCostTableINTEL"; + case OpSubgroupAvcMceGetDefaultLowPenaltyCostTableINTEL: return "OpSubgroupAvcMceGetDefaultLowPenaltyCostTableINTEL"; + case OpSubgroupAvcMceSetMotionVectorCostFunctionINTEL: return "OpSubgroupAvcMceSetMotionVectorCostFunctionINTEL"; + case OpSubgroupAvcMceGetDefaultIntraLumaModePenaltyINTEL: return "OpSubgroupAvcMceGetDefaultIntraLumaModePenaltyINTEL"; + case OpSubgroupAvcMceGetDefaultNonDcLumaIntraPenaltyINTEL: return "OpSubgroupAvcMceGetDefaultNonDcLumaIntraPenaltyINTEL"; + case OpSubgroupAvcMceGetDefaultIntraChromaModeBasePenaltyINTEL: return "OpSubgroupAvcMceGetDefaultIntraChromaModeBasePenaltyINTEL"; + case OpSubgroupAvcMceSetAcOnlyHaarINTEL: return "OpSubgroupAvcMceSetAcOnlyHaarINTEL"; + case OpSubgroupAvcMceSetSourceInterlacedFieldPolarityINTEL: return "OpSubgroupAvcMceSetSourceInterlacedFieldPolarityINTEL"; + case OpSubgroupAvcMceSetSingleReferenceInterlacedFieldPolarityINTEL: return "OpSubgroupAvcMceSetSingleReferenceInterlacedFieldPolarityINTEL"; + case OpSubgroupAvcMceSetDualReferenceInterlacedFieldPolaritiesINTEL: return "OpSubgroupAvcMceSetDualReferenceInterlacedFieldPolaritiesINTEL"; + case OpSubgroupAvcMceConvertToImePayloadINTEL: return "OpSubgroupAvcMceConvertToImePayloadINTEL"; + case OpSubgroupAvcMceConvertToImeResultINTEL: return "OpSubgroupAvcMceConvertToImeResultINTEL"; + case OpSubgroupAvcMceConvertToRefPayloadINTEL: return "OpSubgroupAvcMceConvertToRefPayloadINTEL"; + case OpSubgroupAvcMceConvertToRefResultINTEL: return "OpSubgroupAvcMceConvertToRefResultINTEL"; + case OpSubgroupAvcMceConvertToSicPayloadINTEL: return "OpSubgroupAvcMceConvertToSicPayloadINTEL"; + case OpSubgroupAvcMceConvertToSicResultINTEL: return "OpSubgroupAvcMceConvertToSicResultINTEL"; + case OpSubgroupAvcMceGetMotionVectorsINTEL: return "OpSubgroupAvcMceGetMotionVectorsINTEL"; + case OpSubgroupAvcMceGetInterDistortionsINTEL: return "OpSubgroupAvcMceGetInterDistortionsINTEL"; + case 
OpSubgroupAvcMceGetBestInterDistortionsINTEL: return "OpSubgroupAvcMceGetBestInterDistortionsINTEL"; + case OpSubgroupAvcMceGetInterMajorShapeINTEL: return "OpSubgroupAvcMceGetInterMajorShapeINTEL"; + case OpSubgroupAvcMceGetInterMinorShapeINTEL: return "OpSubgroupAvcMceGetInterMinorShapeINTEL"; + case OpSubgroupAvcMceGetInterDirectionsINTEL: return "OpSubgroupAvcMceGetInterDirectionsINTEL"; + case OpSubgroupAvcMceGetInterMotionVectorCountINTEL: return "OpSubgroupAvcMceGetInterMotionVectorCountINTEL"; + case OpSubgroupAvcMceGetInterReferenceIdsINTEL: return "OpSubgroupAvcMceGetInterReferenceIdsINTEL"; + case OpSubgroupAvcMceGetInterReferenceInterlacedFieldPolaritiesINTEL: return "OpSubgroupAvcMceGetInterReferenceInterlacedFieldPolaritiesINTEL"; + case OpSubgroupAvcImeInitializeINTEL: return "OpSubgroupAvcImeInitializeINTEL"; + case OpSubgroupAvcImeSetSingleReferenceINTEL: return "OpSubgroupAvcImeSetSingleReferenceINTEL"; + case OpSubgroupAvcImeSetDualReferenceINTEL: return "OpSubgroupAvcImeSetDualReferenceINTEL"; + case OpSubgroupAvcImeRefWindowSizeINTEL: return "OpSubgroupAvcImeRefWindowSizeINTEL"; + case OpSubgroupAvcImeAdjustRefOffsetINTEL: return "OpSubgroupAvcImeAdjustRefOffsetINTEL"; + case OpSubgroupAvcImeConvertToMcePayloadINTEL: return "OpSubgroupAvcImeConvertToMcePayloadINTEL"; + case OpSubgroupAvcImeSetMaxMotionVectorCountINTEL: return "OpSubgroupAvcImeSetMaxMotionVectorCountINTEL"; + case OpSubgroupAvcImeSetUnidirectionalMixDisableINTEL: return "OpSubgroupAvcImeSetUnidirectionalMixDisableINTEL"; + case OpSubgroupAvcImeSetEarlySearchTerminationThresholdINTEL: return "OpSubgroupAvcImeSetEarlySearchTerminationThresholdINTEL"; + case OpSubgroupAvcImeSetWeightedSadINTEL: return "OpSubgroupAvcImeSetWeightedSadINTEL"; + case OpSubgroupAvcImeEvaluateWithSingleReferenceINTEL: return "OpSubgroupAvcImeEvaluateWithSingleReferenceINTEL"; + case OpSubgroupAvcImeEvaluateWithDualReferenceINTEL: return "OpSubgroupAvcImeEvaluateWithDualReferenceINTEL"; + case OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminINTEL: return "OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminINTEL"; + case OpSubgroupAvcImeEvaluateWithDualReferenceStreaminINTEL: return "OpSubgroupAvcImeEvaluateWithDualReferenceStreaminINTEL"; + case OpSubgroupAvcImeEvaluateWithSingleReferenceStreamoutINTEL: return "OpSubgroupAvcImeEvaluateWithSingleReferenceStreamoutINTEL"; + case OpSubgroupAvcImeEvaluateWithDualReferenceStreamoutINTEL: return "OpSubgroupAvcImeEvaluateWithDualReferenceStreamoutINTEL"; + case OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminoutINTEL: return "OpSubgroupAvcImeEvaluateWithSingleReferenceStreaminoutINTEL"; + case OpSubgroupAvcImeEvaluateWithDualReferenceStreaminoutINTEL: return "OpSubgroupAvcImeEvaluateWithDualReferenceStreaminoutINTEL"; + case OpSubgroupAvcImeConvertToMceResultINTEL: return "OpSubgroupAvcImeConvertToMceResultINTEL"; + case OpSubgroupAvcImeGetSingleReferenceStreaminINTEL: return "OpSubgroupAvcImeGetSingleReferenceStreaminINTEL"; + case OpSubgroupAvcImeGetDualReferenceStreaminINTEL: return "OpSubgroupAvcImeGetDualReferenceStreaminINTEL"; + case OpSubgroupAvcImeStripSingleReferenceStreamoutINTEL: return "OpSubgroupAvcImeStripSingleReferenceStreamoutINTEL"; + case OpSubgroupAvcImeStripDualReferenceStreamoutINTEL: return "OpSubgroupAvcImeStripDualReferenceStreamoutINTEL"; + case OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeMotionVectorsINTEL: return "OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeMotionVectorsINTEL"; + case 
OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeDistortionsINTEL: return "OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeDistortionsINTEL"; + case OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeReferenceIdsINTEL: return "OpSubgroupAvcImeGetStreamoutSingleReferenceMajorShapeReferenceIdsINTEL"; + case OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeMotionVectorsINTEL: return "OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeMotionVectorsINTEL"; + case OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeDistortionsINTEL: return "OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeDistortionsINTEL"; + case OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeReferenceIdsINTEL: return "OpSubgroupAvcImeGetStreamoutDualReferenceMajorShapeReferenceIdsINTEL"; + case OpSubgroupAvcImeGetBorderReachedINTEL: return "OpSubgroupAvcImeGetBorderReachedINTEL"; + case OpSubgroupAvcImeGetTruncatedSearchIndicationINTEL: return "OpSubgroupAvcImeGetTruncatedSearchIndicationINTEL"; + case OpSubgroupAvcImeGetUnidirectionalEarlySearchTerminationINTEL: return "OpSubgroupAvcImeGetUnidirectionalEarlySearchTerminationINTEL"; + case OpSubgroupAvcImeGetWeightingPatternMinimumMotionVectorINTEL: return "OpSubgroupAvcImeGetWeightingPatternMinimumMotionVectorINTEL"; + case OpSubgroupAvcImeGetWeightingPatternMinimumDistortionINTEL: return "OpSubgroupAvcImeGetWeightingPatternMinimumDistortionINTEL"; + case OpSubgroupAvcFmeInitializeINTEL: return "OpSubgroupAvcFmeInitializeINTEL"; + case OpSubgroupAvcBmeInitializeINTEL: return "OpSubgroupAvcBmeInitializeINTEL"; + case OpSubgroupAvcRefConvertToMcePayloadINTEL: return "OpSubgroupAvcRefConvertToMcePayloadINTEL"; + case OpSubgroupAvcRefSetBidirectionalMixDisableINTEL: return "OpSubgroupAvcRefSetBidirectionalMixDisableINTEL"; + case OpSubgroupAvcRefSetBilinearFilterEnableINTEL: return "OpSubgroupAvcRefSetBilinearFilterEnableINTEL"; + case OpSubgroupAvcRefEvaluateWithSingleReferenceINTEL: return "OpSubgroupAvcRefEvaluateWithSingleReferenceINTEL"; + case OpSubgroupAvcRefEvaluateWithDualReferenceINTEL: return "OpSubgroupAvcRefEvaluateWithDualReferenceINTEL"; + case OpSubgroupAvcRefEvaluateWithMultiReferenceINTEL: return "OpSubgroupAvcRefEvaluateWithMultiReferenceINTEL"; + case OpSubgroupAvcRefEvaluateWithMultiReferenceInterlacedINTEL: return "OpSubgroupAvcRefEvaluateWithMultiReferenceInterlacedINTEL"; + case OpSubgroupAvcRefConvertToMceResultINTEL: return "OpSubgroupAvcRefConvertToMceResultINTEL"; + case OpSubgroupAvcSicInitializeINTEL: return "OpSubgroupAvcSicInitializeINTEL"; + case OpSubgroupAvcSicConfigureSkcINTEL: return "OpSubgroupAvcSicConfigureSkcINTEL"; + case OpSubgroupAvcSicConfigureIpeLumaINTEL: return "OpSubgroupAvcSicConfigureIpeLumaINTEL"; + case OpSubgroupAvcSicConfigureIpeLumaChromaINTEL: return "OpSubgroupAvcSicConfigureIpeLumaChromaINTEL"; + case OpSubgroupAvcSicGetMotionVectorMaskINTEL: return "OpSubgroupAvcSicGetMotionVectorMaskINTEL"; + case OpSubgroupAvcSicConvertToMcePayloadINTEL: return "OpSubgroupAvcSicConvertToMcePayloadINTEL"; + case OpSubgroupAvcSicSetIntraLumaShapePenaltyINTEL: return "OpSubgroupAvcSicSetIntraLumaShapePenaltyINTEL"; + case OpSubgroupAvcSicSetIntraLumaModeCostFunctionINTEL: return "OpSubgroupAvcSicSetIntraLumaModeCostFunctionINTEL"; + case OpSubgroupAvcSicSetIntraChromaModeCostFunctionINTEL: return "OpSubgroupAvcSicSetIntraChromaModeCostFunctionINTEL"; + case OpSubgroupAvcSicSetBilinearFilterEnableINTEL: return "OpSubgroupAvcSicSetBilinearFilterEnableINTEL"; + case OpSubgroupAvcSicSetSkcForwardTransformEnableINTEL: return 
"OpSubgroupAvcSicSetSkcForwardTransformEnableINTEL"; + case OpSubgroupAvcSicSetBlockBasedRawSkipSadINTEL: return "OpSubgroupAvcSicSetBlockBasedRawSkipSadINTEL"; + case OpSubgroupAvcSicEvaluateIpeINTEL: return "OpSubgroupAvcSicEvaluateIpeINTEL"; + case OpSubgroupAvcSicEvaluateWithSingleReferenceINTEL: return "OpSubgroupAvcSicEvaluateWithSingleReferenceINTEL"; + case OpSubgroupAvcSicEvaluateWithDualReferenceINTEL: return "OpSubgroupAvcSicEvaluateWithDualReferenceINTEL"; + case OpSubgroupAvcSicEvaluateWithMultiReferenceINTEL: return "OpSubgroupAvcSicEvaluateWithMultiReferenceINTEL"; + case OpSubgroupAvcSicEvaluateWithMultiReferenceInterlacedINTEL: return "OpSubgroupAvcSicEvaluateWithMultiReferenceInterlacedINTEL"; + case OpSubgroupAvcSicConvertToMceResultINTEL: return "OpSubgroupAvcSicConvertToMceResultINTEL"; + case OpSubgroupAvcSicGetIpeLumaShapeINTEL: return "OpSubgroupAvcSicGetIpeLumaShapeINTEL"; + case OpSubgroupAvcSicGetBestIpeLumaDistortionINTEL: return "OpSubgroupAvcSicGetBestIpeLumaDistortionINTEL"; + case OpSubgroupAvcSicGetBestIpeChromaDistortionINTEL: return "OpSubgroupAvcSicGetBestIpeChromaDistortionINTEL"; + case OpSubgroupAvcSicGetPackedIpeLumaModesINTEL: return "OpSubgroupAvcSicGetPackedIpeLumaModesINTEL"; + case OpSubgroupAvcSicGetIpeChromaModeINTEL: return "OpSubgroupAvcSicGetIpeChromaModeINTEL"; + case OpSubgroupAvcSicGetPackedSkcLumaCountThresholdINTEL: return "OpSubgroupAvcSicGetPackedSkcLumaCountThresholdINTEL"; + case OpSubgroupAvcSicGetPackedSkcLumaSumThresholdINTEL: return "OpSubgroupAvcSicGetPackedSkcLumaSumThresholdINTEL"; + case OpSubgroupAvcSicGetInterRawSadsINTEL: return "OpSubgroupAvcSicGetInterRawSadsINTEL"; + case OpVariableLengthArrayINTEL: return "OpVariableLengthArrayINTEL"; + case OpSaveMemoryINTEL: return "OpSaveMemoryINTEL"; + case OpRestoreMemoryINTEL: return "OpRestoreMemoryINTEL"; + case OpArbitraryFloatSinCosPiINTEL: return "OpArbitraryFloatSinCosPiINTEL"; + case OpArbitraryFloatCastINTEL: return "OpArbitraryFloatCastINTEL"; + case OpArbitraryFloatCastFromIntINTEL: return "OpArbitraryFloatCastFromIntINTEL"; + case OpArbitraryFloatCastToIntINTEL: return "OpArbitraryFloatCastToIntINTEL"; + case OpArbitraryFloatAddINTEL: return "OpArbitraryFloatAddINTEL"; + case OpArbitraryFloatSubINTEL: return "OpArbitraryFloatSubINTEL"; + case OpArbitraryFloatMulINTEL: return "OpArbitraryFloatMulINTEL"; + case OpArbitraryFloatDivINTEL: return "OpArbitraryFloatDivINTEL"; + case OpArbitraryFloatGTINTEL: return "OpArbitraryFloatGTINTEL"; + case OpArbitraryFloatGEINTEL: return "OpArbitraryFloatGEINTEL"; + case OpArbitraryFloatLTINTEL: return "OpArbitraryFloatLTINTEL"; + case OpArbitraryFloatLEINTEL: return "OpArbitraryFloatLEINTEL"; + case OpArbitraryFloatEQINTEL: return "OpArbitraryFloatEQINTEL"; + case OpArbitraryFloatRecipINTEL: return "OpArbitraryFloatRecipINTEL"; + case OpArbitraryFloatRSqrtINTEL: return "OpArbitraryFloatRSqrtINTEL"; + case OpArbitraryFloatCbrtINTEL: return "OpArbitraryFloatCbrtINTEL"; + case OpArbitraryFloatHypotINTEL: return "OpArbitraryFloatHypotINTEL"; + case OpArbitraryFloatSqrtINTEL: return "OpArbitraryFloatSqrtINTEL"; + case OpArbitraryFloatLogINTEL: return "OpArbitraryFloatLogINTEL"; + case OpArbitraryFloatLog2INTEL: return "OpArbitraryFloatLog2INTEL"; + case OpArbitraryFloatLog10INTEL: return "OpArbitraryFloatLog10INTEL"; + case OpArbitraryFloatLog1pINTEL: return "OpArbitraryFloatLog1pINTEL"; + case OpArbitraryFloatExpINTEL: return "OpArbitraryFloatExpINTEL"; + case OpArbitraryFloatExp2INTEL: return "OpArbitraryFloatExp2INTEL"; + case 
OpArbitraryFloatExp10INTEL: return "OpArbitraryFloatExp10INTEL"; + case OpArbitraryFloatExpm1INTEL: return "OpArbitraryFloatExpm1INTEL"; + case OpArbitraryFloatSinINTEL: return "OpArbitraryFloatSinINTEL"; + case OpArbitraryFloatCosINTEL: return "OpArbitraryFloatCosINTEL"; + case OpArbitraryFloatSinCosINTEL: return "OpArbitraryFloatSinCosINTEL"; + case OpArbitraryFloatSinPiINTEL: return "OpArbitraryFloatSinPiINTEL"; + case OpArbitraryFloatCosPiINTEL: return "OpArbitraryFloatCosPiINTEL"; + case OpArbitraryFloatASinINTEL: return "OpArbitraryFloatASinINTEL"; + case OpArbitraryFloatASinPiINTEL: return "OpArbitraryFloatASinPiINTEL"; + case OpArbitraryFloatACosINTEL: return "OpArbitraryFloatACosINTEL"; + case OpArbitraryFloatACosPiINTEL: return "OpArbitraryFloatACosPiINTEL"; + case OpArbitraryFloatATanINTEL: return "OpArbitraryFloatATanINTEL"; + case OpArbitraryFloatATanPiINTEL: return "OpArbitraryFloatATanPiINTEL"; + case OpArbitraryFloatATan2INTEL: return "OpArbitraryFloatATan2INTEL"; + case OpArbitraryFloatPowINTEL: return "OpArbitraryFloatPowINTEL"; + case OpArbitraryFloatPowRINTEL: return "OpArbitraryFloatPowRINTEL"; + case OpArbitraryFloatPowNINTEL: return "OpArbitraryFloatPowNINTEL"; + case OpLoopControlINTEL: return "OpLoopControlINTEL"; + case OpAliasDomainDeclINTEL: return "OpAliasDomainDeclINTEL"; + case OpAliasScopeDeclINTEL: return "OpAliasScopeDeclINTEL"; + case OpAliasScopeListDeclINTEL: return "OpAliasScopeListDeclINTEL"; + case OpFixedSqrtINTEL: return "OpFixedSqrtINTEL"; + case OpFixedRecipINTEL: return "OpFixedRecipINTEL"; + case OpFixedRsqrtINTEL: return "OpFixedRsqrtINTEL"; + case OpFixedSinINTEL: return "OpFixedSinINTEL"; + case OpFixedCosINTEL: return "OpFixedCosINTEL"; + case OpFixedSinCosINTEL: return "OpFixedSinCosINTEL"; + case OpFixedSinPiINTEL: return "OpFixedSinPiINTEL"; + case OpFixedCosPiINTEL: return "OpFixedCosPiINTEL"; + case OpFixedSinCosPiINTEL: return "OpFixedSinCosPiINTEL"; + case OpFixedLogINTEL: return "OpFixedLogINTEL"; + case OpFixedExpINTEL: return "OpFixedExpINTEL"; + case OpPtrCastToCrossWorkgroupINTEL: return "OpPtrCastToCrossWorkgroupINTEL"; + case OpCrossWorkgroupCastToPtrINTEL: return "OpCrossWorkgroupCastToPtrINTEL"; + case OpReadPipeBlockingINTEL: return "OpReadPipeBlockingINTEL"; + case OpWritePipeBlockingINTEL: return "OpWritePipeBlockingINTEL"; + case OpFPGARegINTEL: return "OpFPGARegINTEL"; + case OpRayQueryGetRayTMinKHR: return "OpRayQueryGetRayTMinKHR"; + case OpRayQueryGetRayFlagsKHR: return "OpRayQueryGetRayFlagsKHR"; + case OpRayQueryGetIntersectionTKHR: return "OpRayQueryGetIntersectionTKHR"; + case OpRayQueryGetIntersectionInstanceCustomIndexKHR: return "OpRayQueryGetIntersectionInstanceCustomIndexKHR"; + case OpRayQueryGetIntersectionInstanceIdKHR: return "OpRayQueryGetIntersectionInstanceIdKHR"; + case OpRayQueryGetIntersectionInstanceShaderBindingTableRecordOffsetKHR: return "OpRayQueryGetIntersectionInstanceShaderBindingTableRecordOffsetKHR"; + case OpRayQueryGetIntersectionGeometryIndexKHR: return "OpRayQueryGetIntersectionGeometryIndexKHR"; + case OpRayQueryGetIntersectionPrimitiveIndexKHR: return "OpRayQueryGetIntersectionPrimitiveIndexKHR"; + case OpRayQueryGetIntersectionBarycentricsKHR: return "OpRayQueryGetIntersectionBarycentricsKHR"; + case OpRayQueryGetIntersectionFrontFaceKHR: return "OpRayQueryGetIntersectionFrontFaceKHR"; + case OpRayQueryGetIntersectionCandidateAABBOpaqueKHR: return "OpRayQueryGetIntersectionCandidateAABBOpaqueKHR"; + case OpRayQueryGetIntersectionObjectRayDirectionKHR: return 
"OpRayQueryGetIntersectionObjectRayDirectionKHR"; + case OpRayQueryGetIntersectionObjectRayOriginKHR: return "OpRayQueryGetIntersectionObjectRayOriginKHR"; + case OpRayQueryGetWorldRayDirectionKHR: return "OpRayQueryGetWorldRayDirectionKHR"; + case OpRayQueryGetWorldRayOriginKHR: return "OpRayQueryGetWorldRayOriginKHR"; + case OpRayQueryGetIntersectionObjectToWorldKHR: return "OpRayQueryGetIntersectionObjectToWorldKHR"; + case OpRayQueryGetIntersectionWorldToObjectKHR: return "OpRayQueryGetIntersectionWorldToObjectKHR"; + case OpAtomicFAddEXT: return "OpAtomicFAddEXT"; + case OpTypeBufferSurfaceINTEL: return "OpTypeBufferSurfaceINTEL"; + case OpTypeStructContinuedINTEL: return "OpTypeStructContinuedINTEL"; + case OpConstantCompositeContinuedINTEL: return "OpConstantCompositeContinuedINTEL"; + case OpSpecConstantCompositeContinuedINTEL: return "OpSpecConstantCompositeContinuedINTEL"; + case OpCompositeConstructContinuedINTEL: return "OpCompositeConstructContinuedINTEL"; + case OpConvertFToBF16INTEL: return "OpConvertFToBF16INTEL"; + case OpConvertBF16ToFINTEL: return "OpConvertBF16ToFINTEL"; + case OpControlBarrierArriveINTEL: return "OpControlBarrierArriveINTEL"; + case OpControlBarrierWaitINTEL: return "OpControlBarrierWaitINTEL"; + case OpArithmeticFenceEXT: return "OpArithmeticFenceEXT"; + case OpTaskSequenceCreateINTEL: return "OpTaskSequenceCreateINTEL"; + case OpTaskSequenceAsyncINTEL: return "OpTaskSequenceAsyncINTEL"; + case OpTaskSequenceGetINTEL: return "OpTaskSequenceGetINTEL"; + case OpTaskSequenceReleaseINTEL: return "OpTaskSequenceReleaseINTEL"; + case OpTypeTaskSequenceINTEL: return "OpTypeTaskSequenceINTEL"; + case OpSubgroupBlockPrefetchINTEL: return "OpSubgroupBlockPrefetchINTEL"; + case OpSubgroup2DBlockLoadINTEL: return "OpSubgroup2DBlockLoadINTEL"; + case OpSubgroup2DBlockLoadTransformINTEL: return "OpSubgroup2DBlockLoadTransformINTEL"; + case OpSubgroup2DBlockLoadTransposeINTEL: return "OpSubgroup2DBlockLoadTransposeINTEL"; + case OpSubgroup2DBlockPrefetchINTEL: return "OpSubgroup2DBlockPrefetchINTEL"; + case OpSubgroup2DBlockStoreINTEL: return "OpSubgroup2DBlockStoreINTEL"; + case OpSubgroupMatrixMultiplyAccumulateINTEL: return "OpSubgroupMatrixMultiplyAccumulateINTEL"; + case OpBitwiseFunctionINTEL: return "OpBitwiseFunctionINTEL"; + case OpGroupIMulKHR: return "OpGroupIMulKHR"; + case OpGroupFMulKHR: return "OpGroupFMulKHR"; + case OpGroupBitwiseAndKHR: return "OpGroupBitwiseAndKHR"; + case OpGroupBitwiseOrKHR: return "OpGroupBitwiseOrKHR"; + case OpGroupBitwiseXorKHR: return "OpGroupBitwiseXorKHR"; + case OpGroupLogicalAndKHR: return "OpGroupLogicalAndKHR"; + case OpGroupLogicalOrKHR: return "OpGroupLogicalOrKHR"; + case OpGroupLogicalXorKHR: return "OpGroupLogicalXorKHR"; + case OpRoundFToTF32INTEL: return "OpRoundFToTF32INTEL"; + case OpMaskedGatherINTEL: return "OpMaskedGatherINTEL"; + case OpMaskedScatterINTEL: return "OpMaskedScatterINTEL"; + case OpConvertHandleToImageINTEL: return "OpConvertHandleToImageINTEL"; + case OpConvertHandleToSamplerINTEL: return "OpConvertHandleToSamplerINTEL"; + case OpConvertHandleToSampledImageINTEL: return "OpConvertHandleToSampledImageINTEL"; + default: return "Unknown"; + } +} + #endif /* SPV_ENABLE_UTILITY_CODE */ -// Overload operator| for mask bit combining +// Overload bitwise operators for mask bit combining inline ImageOperandsMask operator|(ImageOperandsMask a, ImageOperandsMask b) { return ImageOperandsMask(unsigned(a) | unsigned(b)); } +inline ImageOperandsMask operator&(ImageOperandsMask a, ImageOperandsMask b) { 
return ImageOperandsMask(unsigned(a) & unsigned(b)); } +inline ImageOperandsMask operator^(ImageOperandsMask a, ImageOperandsMask b) { return ImageOperandsMask(unsigned(a) ^ unsigned(b)); } +inline ImageOperandsMask operator~(ImageOperandsMask a) { return ImageOperandsMask(~unsigned(a)); } inline FPFastMathModeMask operator|(FPFastMathModeMask a, FPFastMathModeMask b) { return FPFastMathModeMask(unsigned(a) | unsigned(b)); } +inline FPFastMathModeMask operator&(FPFastMathModeMask a, FPFastMathModeMask b) { return FPFastMathModeMask(unsigned(a) & unsigned(b)); } +inline FPFastMathModeMask operator^(FPFastMathModeMask a, FPFastMathModeMask b) { return FPFastMathModeMask(unsigned(a) ^ unsigned(b)); } +inline FPFastMathModeMask operator~(FPFastMathModeMask a) { return FPFastMathModeMask(~unsigned(a)); } inline SelectionControlMask operator|(SelectionControlMask a, SelectionControlMask b) { return SelectionControlMask(unsigned(a) | unsigned(b)); } +inline SelectionControlMask operator&(SelectionControlMask a, SelectionControlMask b) { return SelectionControlMask(unsigned(a) & unsigned(b)); } +inline SelectionControlMask operator^(SelectionControlMask a, SelectionControlMask b) { return SelectionControlMask(unsigned(a) ^ unsigned(b)); } +inline SelectionControlMask operator~(SelectionControlMask a) { return SelectionControlMask(~unsigned(a)); } inline LoopControlMask operator|(LoopControlMask a, LoopControlMask b) { return LoopControlMask(unsigned(a) | unsigned(b)); } +inline LoopControlMask operator&(LoopControlMask a, LoopControlMask b) { return LoopControlMask(unsigned(a) & unsigned(b)); } +inline LoopControlMask operator^(LoopControlMask a, LoopControlMask b) { return LoopControlMask(unsigned(a) ^ unsigned(b)); } +inline LoopControlMask operator~(LoopControlMask a) { return LoopControlMask(~unsigned(a)); } inline FunctionControlMask operator|(FunctionControlMask a, FunctionControlMask b) { return FunctionControlMask(unsigned(a) | unsigned(b)); } +inline FunctionControlMask operator&(FunctionControlMask a, FunctionControlMask b) { return FunctionControlMask(unsigned(a) & unsigned(b)); } +inline FunctionControlMask operator^(FunctionControlMask a, FunctionControlMask b) { return FunctionControlMask(unsigned(a) ^ unsigned(b)); } +inline FunctionControlMask operator~(FunctionControlMask a) { return FunctionControlMask(~unsigned(a)); } inline MemorySemanticsMask operator|(MemorySemanticsMask a, MemorySemanticsMask b) { return MemorySemanticsMask(unsigned(a) | unsigned(b)); } +inline MemorySemanticsMask operator&(MemorySemanticsMask a, MemorySemanticsMask b) { return MemorySemanticsMask(unsigned(a) & unsigned(b)); } +inline MemorySemanticsMask operator^(MemorySemanticsMask a, MemorySemanticsMask b) { return MemorySemanticsMask(unsigned(a) ^ unsigned(b)); } +inline MemorySemanticsMask operator~(MemorySemanticsMask a) { return MemorySemanticsMask(~unsigned(a)); } inline MemoryAccessMask operator|(MemoryAccessMask a, MemoryAccessMask b) { return MemoryAccessMask(unsigned(a) | unsigned(b)); } +inline MemoryAccessMask operator&(MemoryAccessMask a, MemoryAccessMask b) { return MemoryAccessMask(unsigned(a) & unsigned(b)); } +inline MemoryAccessMask operator^(MemoryAccessMask a, MemoryAccessMask b) { return MemoryAccessMask(unsigned(a) ^ unsigned(b)); } +inline MemoryAccessMask operator~(MemoryAccessMask a) { return MemoryAccessMask(~unsigned(a)); } inline KernelProfilingInfoMask operator|(KernelProfilingInfoMask a, KernelProfilingInfoMask b) { return KernelProfilingInfoMask(unsigned(a) | unsigned(b)); } 
+inline KernelProfilingInfoMask operator&(KernelProfilingInfoMask a, KernelProfilingInfoMask b) { return KernelProfilingInfoMask(unsigned(a) & unsigned(b)); } +inline KernelProfilingInfoMask operator^(KernelProfilingInfoMask a, KernelProfilingInfoMask b) { return KernelProfilingInfoMask(unsigned(a) ^ unsigned(b)); } +inline KernelProfilingInfoMask operator~(KernelProfilingInfoMask a) { return KernelProfilingInfoMask(~unsigned(a)); } inline RayFlagsMask operator|(RayFlagsMask a, RayFlagsMask b) { return RayFlagsMask(unsigned(a) | unsigned(b)); } +inline RayFlagsMask operator&(RayFlagsMask a, RayFlagsMask b) { return RayFlagsMask(unsigned(a) & unsigned(b)); } +inline RayFlagsMask operator^(RayFlagsMask a, RayFlagsMask b) { return RayFlagsMask(unsigned(a) ^ unsigned(b)); } +inline RayFlagsMask operator~(RayFlagsMask a) { return RayFlagsMask(~unsigned(a)); } inline FragmentShadingRateMask operator|(FragmentShadingRateMask a, FragmentShadingRateMask b) { return FragmentShadingRateMask(unsigned(a) | unsigned(b)); } +inline FragmentShadingRateMask operator&(FragmentShadingRateMask a, FragmentShadingRateMask b) { return FragmentShadingRateMask(unsigned(a) & unsigned(b)); } +inline FragmentShadingRateMask operator^(FragmentShadingRateMask a, FragmentShadingRateMask b) { return FragmentShadingRateMask(unsigned(a) ^ unsigned(b)); } +inline FragmentShadingRateMask operator~(FragmentShadingRateMask a) { return FragmentShadingRateMask(~unsigned(a)); } +inline CooperativeMatrixOperandsMask operator|(CooperativeMatrixOperandsMask a, CooperativeMatrixOperandsMask b) { return CooperativeMatrixOperandsMask(unsigned(a) | unsigned(b)); } +inline CooperativeMatrixOperandsMask operator&(CooperativeMatrixOperandsMask a, CooperativeMatrixOperandsMask b) { return CooperativeMatrixOperandsMask(unsigned(a) & unsigned(b)); } +inline CooperativeMatrixOperandsMask operator^(CooperativeMatrixOperandsMask a, CooperativeMatrixOperandsMask b) { return CooperativeMatrixOperandsMask(unsigned(a) ^ unsigned(b)); } +inline CooperativeMatrixOperandsMask operator~(CooperativeMatrixOperandsMask a) { return CooperativeMatrixOperandsMask(~unsigned(a)); } +inline CooperativeMatrixReduceMask operator|(CooperativeMatrixReduceMask a, CooperativeMatrixReduceMask b) { return CooperativeMatrixReduceMask(unsigned(a) | unsigned(b)); } +inline CooperativeMatrixReduceMask operator&(CooperativeMatrixReduceMask a, CooperativeMatrixReduceMask b) { return CooperativeMatrixReduceMask(unsigned(a) & unsigned(b)); } +inline CooperativeMatrixReduceMask operator^(CooperativeMatrixReduceMask a, CooperativeMatrixReduceMask b) { return CooperativeMatrixReduceMask(unsigned(a) ^ unsigned(b)); } +inline CooperativeMatrixReduceMask operator~(CooperativeMatrixReduceMask a) { return CooperativeMatrixReduceMask(~unsigned(a)); } +inline TensorAddressingOperandsMask operator|(TensorAddressingOperandsMask a, TensorAddressingOperandsMask b) { return TensorAddressingOperandsMask(unsigned(a) | unsigned(b)); } +inline TensorAddressingOperandsMask operator&(TensorAddressingOperandsMask a, TensorAddressingOperandsMask b) { return TensorAddressingOperandsMask(unsigned(a) & unsigned(b)); } +inline TensorAddressingOperandsMask operator^(TensorAddressingOperandsMask a, TensorAddressingOperandsMask b) { return TensorAddressingOperandsMask(unsigned(a) ^ unsigned(b)); } +inline TensorAddressingOperandsMask operator~(TensorAddressingOperandsMask a) { return TensorAddressingOperandsMask(~unsigned(a)); } +inline TensorOperandsMask operator|(TensorOperandsMask a, TensorOperandsMask 
b) { return TensorOperandsMask(unsigned(a) | unsigned(b)); }
+inline TensorOperandsMask operator&(TensorOperandsMask a, TensorOperandsMask b) { return TensorOperandsMask(unsigned(a) & unsigned(b)); }
+inline TensorOperandsMask operator^(TensorOperandsMask a, TensorOperandsMask b) { return TensorOperandsMask(unsigned(a) ^ unsigned(b)); }
+inline TensorOperandsMask operator~(TensorOperandsMask a) { return TensorOperandsMask(~unsigned(a)); }
+inline MatrixMultiplyAccumulateOperandsMask operator|(MatrixMultiplyAccumulateOperandsMask a, MatrixMultiplyAccumulateOperandsMask b) { return MatrixMultiplyAccumulateOperandsMask(unsigned(a) | unsigned(b)); }
+inline MatrixMultiplyAccumulateOperandsMask operator&(MatrixMultiplyAccumulateOperandsMask a, MatrixMultiplyAccumulateOperandsMask b) { return MatrixMultiplyAccumulateOperandsMask(unsigned(a) & unsigned(b)); }
+inline MatrixMultiplyAccumulateOperandsMask operator^(MatrixMultiplyAccumulateOperandsMask a, MatrixMultiplyAccumulateOperandsMask b) { return MatrixMultiplyAccumulateOperandsMask(unsigned(a) ^ unsigned(b)); }
+inline MatrixMultiplyAccumulateOperandsMask operator~(MatrixMultiplyAccumulateOperandsMask a) { return MatrixMultiplyAccumulateOperandsMask(~unsigned(a)); }
+inline RawAccessChainOperandsMask operator|(RawAccessChainOperandsMask a, RawAccessChainOperandsMask b) { return RawAccessChainOperandsMask(unsigned(a) | unsigned(b)); }
+inline RawAccessChainOperandsMask operator&(RawAccessChainOperandsMask a, RawAccessChainOperandsMask b) { return RawAccessChainOperandsMask(unsigned(a) & unsigned(b)); }
+inline RawAccessChainOperandsMask operator^(RawAccessChainOperandsMask a, RawAccessChainOperandsMask b) { return RawAccessChainOperandsMask(unsigned(a) ^ unsigned(b)); }
+inline RawAccessChainOperandsMask operator~(RawAccessChainOperandsMask a) { return RawAccessChainOperandsMask(~unsigned(a)); }
 } // end namespace spv
diff --git a/thirdparty/spirv-cross/spirv_common.hpp b/thirdparty/spirv-cross/spirv_common.hpp
index b70536d9ec..4780d2750f 100644
--- a/thirdparty/spirv-cross/spirv_common.hpp
+++ b/thirdparty/spirv-cross/spirv_common.hpp
@@ -580,7 +580,10 @@ struct SPIRType : IVariant
 		Interpolant,
 		Char,
 		// MSL specific type, that is used by 'object'(analog of 'task' from glsl) shader.
-		MeshGridProperties
+		MeshGridProperties,
+		BFloat16,
+		FloatE4M3,
+		FloatE5M2
 	};

 	// Scalar/vector/matrix support.
@@ -605,6 +608,14 @@ struct SPIRType : IVariant
 	bool pointer = false;
 	bool forward_pointer = false;

+	struct
+	{
+		uint32_t use_id = 0;
+		uint32_t rows_id = 0;
+		uint32_t columns_id = 0;
+		uint32_t scope_id = 0;
+	} cooperative;
+
 	spv::StorageClass storage = spv::StorageClassGeneric;

 	SmallVector<TypeID> member_types;
@@ -686,6 +697,7 @@ struct SPIREntryPoint
 	FunctionID self = 0;
 	std::string name;
 	std::string orig_name;
+	std::unordered_map<uint32_t, uint32_t> fp_fast_math_defaults;
 	SmallVector<VariableID> interface_variables;

 	Bitset flags;
@@ -1026,6 +1038,9 @@ struct SPIRFunction : IVariant
 	// consider arrays value types.
 	SmallVector<ID> constant_arrays_needed_on_stack;

+	// Does this function (or any function called by it) emit geometry?
+	bool emits_geometry = false;
+
 	bool active = false;
 	bool flush_undeclared = true;
 	bool do_combined_parameters = true;
@@ -1226,6 +1241,26 @@ struct SPIRConstant : IVariant
 		return u.f32;
 	}

+	static inline float fe4m3_to_f32(uint8_t v)
+	{
+		if ((v & 0x7f) == 0x7f)
+		{
+			union
+			{
+				float f32;
+				uint32_t u32;
+			} u;
+
+			u.u32 = (v & 0x80) ? 0xffffffffu : 0x7fffffffu;
+			return u.f32;
+		}
+		else
+		{
+			// Reuse the FP16 to FP32 code. Cute bit-hackery.
+			return f16_to_f32((int16_t(int8_t(v)) << 7) & (0xffff ^ 0x4000)) * 256.0f;
+		}
+	}
+
 	inline uint32_t specialization_constant_id(uint32_t col, uint32_t row) const
 	{
 		return m.c[col].id[row];
@@ -1266,6 +1301,24 @@ struct SPIRConstant : IVariant
 		return f16_to_f32(scalar_u16(col, row));
 	}

+	inline float scalar_bf16(uint32_t col = 0, uint32_t row = 0) const
+	{
+		uint32_t v = scalar_u16(col, row) << 16;
+		float fp32;
+		memcpy(&fp32, &v, sizeof(float));
+		return fp32;
+	}
+
+	inline float scalar_floate4m3(uint32_t col = 0, uint32_t row = 0) const
+	{
+		return fe4m3_to_f32(scalar_u8(col, row));
+	}
+
+	inline float scalar_bf8(uint32_t col = 0, uint32_t row = 0) const
+	{
+		return f16_to_f32(scalar_u8(col, row) << 8);
+	}
+
 	inline float scalar_f32(uint32_t col = 0, uint32_t row = 0) const
 	{
 		return m.c[col].r[row].f32;
@@ -1336,9 +1389,10 @@ struct SPIRConstant : IVariant

 	SPIRConstant() = default;

-	SPIRConstant(TypeID constant_type_, const uint32_t *elements, uint32_t num_elements, bool specialized)
+	SPIRConstant(TypeID constant_type_, const uint32_t *elements, uint32_t num_elements, bool specialized, bool replicated_ = false)
 	    : constant_type(constant_type_)
 	    , specialization(specialized)
+	    , replicated(replicated_)
 	{
 		subconstants.reserve(num_elements);
 		for (uint32_t i = 0; i < num_elements; i++)
@@ -1410,9 +1464,16 @@ struct SPIRConstant : IVariant
 	// If true, this is a LUT, and should always be declared in the outer scope.
 	bool is_used_as_lut = false;

+	// If this is a null constant of array type with specialized length.
+	// May require special handling in initializer.
+	bool is_null_array_specialized_length = false;
+
 	// For composites which are constant arrays, etc.
 	SmallVector<ConstantID> subconstants;

+	// Whether the subconstants are intended to be replicated (e.g. OpConstantCompositeReplicateEXT)
+	bool replicated = false;
+
 	// Non-Vulkan GLSL, HLSL and sometimes MSL emits defines for each specialization constant,
 	// and uses them to initialize the constant. This allows the user
 	// to still be able to specialize the value by supplying corresponding
@@ -1708,6 +1769,7 @@ struct Meta
 		uint32_t spec_id = 0;
 		uint32_t index = 0;
 		spv::FPRoundingMode fp_rounding_mode = spv::FPRoundingModeMax;
+		spv::FPFastMathModeMask fp_fast_math_mode = spv::FPFastMathModeMaskNone;
 		bool builtin = false;
 		bool qualified_alias_explicit_override = false;
diff --git a/thirdparty/spirv-cross/spirv_cross.cpp b/thirdparty/spirv-cross/spirv_cross.cpp
index 3492f0b3ed..350eff3429 100644
--- a/thirdparty/spirv-cross/spirv_cross.cpp
+++ b/thirdparty/spirv-cross/spirv_cross.cpp
@@ -82,7 +82,7 @@ bool Compiler::variable_storage_is_aliased(const SPIRVariable &v)
 	            ir.meta[type.self].decoration.decoration_flags.get(DecorationBufferBlock);
 	bool image = type.basetype == SPIRType::Image;
 	bool counter = type.basetype == SPIRType::AtomicCounter;
-	bool buffer_reference = type.storage == StorageClassPhysicalStorageBufferEXT;
+	bool buffer_reference = type.storage == StorageClassPhysicalStorageBuffer;
 	bool is_restrict;
 	if (ssbo)
@@ -171,6 +171,7 @@ bool Compiler::block_is_control_dependent(const SPIRBlock &block)
 	case OpGroupNonUniformLogicalXor:
 	case OpGroupNonUniformQuadBroadcast:
 	case OpGroupNonUniformQuadSwap:
+	case OpGroupNonUniformRotateKHR:

 	// Control barriers
 	case OpControlBarrier:
@@ -210,6 +211,7 @@ bool Compiler::block_is_pure(const SPIRBlock &block)

 	case OpCopyMemory:
 	case OpStore:
+	case OpCooperativeMatrixStoreKHR:
 	{
 		auto &type = expression_type(ops[0]);
 		if (type.storage != StorageClassFunction)
@@ -370,6 +372,7 @@ void Compiler::register_global_read_dependencies(const SPIRBlock &block, uint32_
 	}

 	case OpLoad:
+	case OpCooperativeMatrixLoadKHR:
 	case OpImageRead:
 	{
 		// If we're in a storage class which does not get invalidated, adding dependencies here is no big deal.
@@ -481,7 +484,7 @@ void Compiler::register_write(uint32_t chain)
 		}
 	}

-	if (type.storage == StorageClassPhysicalStorageBufferEXT || variable_storage_is_aliased(*var))
+	if (type.storage == StorageClassPhysicalStorageBuffer || variable_storage_is_aliased(*var))
 		flush_all_aliased_variables();
 	else if (var)
 		flush_dependees(*var);
@@ -587,6 +590,7 @@ const SPIRType &Compiler::expression_type(uint32_t id) const
 bool Compiler::expression_is_lvalue(uint32_t id) const
 {
 	auto &type = expression_type(id);
+
 	switch (type.basetype)
 	{
 	case SPIRType::SampledImage:
@@ -818,6 +822,7 @@ bool Compiler::InterfaceVariableAccessHandler::handle(Op opcode, const uint32_t

 	case OpAtomicStore:
 	case OpStore:
+	case OpCooperativeMatrixStoreKHR:
 		// Invalid SPIR-V.
 		if (length < 1)
 			return false;
@@ -910,6 +915,7 @@ bool Compiler::InterfaceVariableAccessHandler::handle(Op opcode, const uint32_t
 	case OpInBoundsAccessChain:
 	case OpPtrAccessChain:
 	case OpLoad:
+	case OpCooperativeMatrixLoadKHR:
 	case OpCopyObject:
 	case OpImageTexelPointer:
 	case OpAtomicLoad:
@@ -2364,6 +2370,10 @@ void Compiler::set_execution_mode(ExecutionMode mode, uint32_t arg0, uint32_t ar
 		execution.output_primitives = arg0;
 		break;

+	case ExecutionModeFPFastMathDefault:
+		execution.fp_fast_math_defaults[arg0] = arg1;
+		break;
+
 	default:
 		break;
 	}
@@ -3461,6 +3471,7 @@ bool Compiler::AnalyzeVariableScopeAccessHandler::handle(spv::Op op, const uint3
 	switch (op)
 	{
 	case OpStore:
+	case OpCooperativeMatrixStoreKHR:
 	{
 		if (length < 2)
 			return false;
@@ -3581,6 +3592,7 @@ bool Compiler::AnalyzeVariableScopeAccessHandler::handle(spv::Op op, const uint3
 	}

 	case OpLoad:
+	case OpCooperativeMatrixLoadKHR:
 	{
 		if (length < 3)
 			return false;
@@ -3800,6 +3812,7 @@ bool Compiler::StaticExpressionAccessHandler::handle(spv::Op op, const uint32_t
 	switch (op)
 	{
 	case OpStore:
+	case OpCooperativeMatrixStoreKHR:
 		if (length < 2)
 			return false;
 		if (args[0] == variable_id)
@@ -3810,6 +3823,7 @@ bool Compiler::StaticExpressionAccessHandler::handle(spv::Op op, const uint32_t
 		break;

 	case OpLoad:
+	case OpCooperativeMatrixLoadKHR:
 		if (length < 3)
 			return false;
 		if (args[2] == variable_id && static_expression == 0) // Tried to read from variable before it was initialized.
@@ -4285,6 +4299,7 @@ bool Compiler::may_read_undefined_variable_in_block(const SPIRBlock &block, uint
 		switch (op.op)
 		{
 		case OpStore:
+		case OpCooperativeMatrixStoreKHR:
 		case OpCopyMemory:
 			if (ops[0] == var)
 				return false;
@@ -4323,6 +4338,7 @@ bool Compiler::may_read_undefined_variable_in_block(const SPIRBlock &block, uint
 		case OpCopyObject:
 		case OpLoad:
+		case OpCooperativeMatrixLoadKHR:
 			if (ops[2] == var)
 				return true;
 			break;
@@ -4350,6 +4366,39 @@ bool Compiler::may_read_undefined_variable_in_block(const SPIRBlock &block, uint
 	return true;
 }

+bool Compiler::GeometryEmitDisocveryHandler::handle(spv::Op opcode, const uint32_t *, uint32_t)
+{
+	if (opcode == OpEmitVertex || opcode == OpEndPrimitive)
+	{
+		for (auto *func : function_stack)
+			func->emits_geometry = true;
+	}
+
+	return true;
+}
+
+bool Compiler::GeometryEmitDisocveryHandler::begin_function_scope(const uint32_t *stream, uint32_t)
+{
+	auto &callee = compiler.get<SPIRFunction>(stream[2]);
+	function_stack.push_back(&callee);
+	return true;
+}
+
+bool Compiler::GeometryEmitDisocveryHandler::end_function_scope([[maybe_unused]] const uint32_t *stream, uint32_t)
+{
+	assert(function_stack.back() == &compiler.get<SPIRFunction>(stream[2]));
+	function_stack.pop_back();
+
+	return true;
+}
+
+void Compiler::discover_geometry_emitters()
+{
+	GeometryEmitDisocveryHandler handler(*this);
+
+	traverse_all_reachable_opcodes(get<SPIRFunction>(ir.default_entry_point), handler);
+}
+
 Bitset Compiler::get_buffer_block_flags(VariableID id) const
 {
 	return ir.get_buffer_block_flags(get<SPIRVariable>(id));
@@ -4462,6 +4511,7 @@ bool Compiler::ActiveBuiltinHandler::handle(spv::Op opcode, const uint32_t *args
 	switch (opcode)
 	{
 	case OpStore:
+	case OpCooperativeMatrixStoreKHR:
 		if (length < 1)
 			return false;

@@ -4478,6 +4528,7 @@ bool Compiler::ActiveBuiltinHandler::handle(spv::Op opcode, const uint32_t *args
 	case OpCopyObject:
 	case OpLoad:
+	case OpCooperativeMatrixLoadKHR:
 		if (length < 3)
 			return false;

@@ -4910,13 +4961,16 @@ void Compiler::make_constant_null(uint32_t id, uint32_t type)
 		uint32_t parent_id = ir.increase_bound_by(1);
 		make_constant_null(parent_id, constant_type.parent_type);
constant_type.parent_type); - if (!constant_type.array_size_literal.back()) - SPIRV_CROSS_THROW("Array size of OpConstantNull must be a literal."); - - SmallVector elements(constant_type.array.back()); - for (uint32_t i = 0; i < constant_type.array.back(); i++) + // The array size of OpConstantNull can be either literal or specialization constant. + // In the latter case, we cannot take the value as-is, as it can be changed to anything. + // Rather, we assume it to be *one* for the sake of initializer. + bool is_literal_array_size = constant_type.array_size_literal.back(); + uint32_t count = is_literal_array_size ? constant_type.array.back() : 1; + SmallVector elements(count); + for (uint32_t i = 0; i < count; i++) elements[i] = parent_id; - set(id, type, elements.data(), uint32_t(elements.size()), false); + auto &constant = set(id, type, elements.data(), uint32_t(elements.size()), false); + constant.is_null_array_specialized_length = !is_literal_array_size; } else if (!constant_type.member_types.empty()) { @@ -5177,7 +5231,7 @@ bool Compiler::PhysicalStorageBufferPointerHandler::type_is_bda_block_entry(uint uint32_t Compiler::PhysicalStorageBufferPointerHandler::get_minimum_scalar_alignment(const SPIRType &type) const { - if (type.storage == spv::StorageClassPhysicalStorageBufferEXT) + if (type.storage == spv::StorageClassPhysicalStorageBuffer) return 8; else if (type.basetype == SPIRType::Struct) { @@ -5252,6 +5306,13 @@ bool Compiler::PhysicalStorageBufferPointerHandler::handle(Op op, const uint32_t break; } + case OpCooperativeMatrixLoadKHR: + case OpCooperativeMatrixStoreKHR: + { + // TODO: Can we meaningfully deal with this? + break; + } + default: break; } @@ -5274,6 +5335,10 @@ uint32_t Compiler::PhysicalStorageBufferPointerHandler::get_base_non_block_type_ void Compiler::PhysicalStorageBufferPointerHandler::analyze_non_block_types_from_block(const SPIRType &type) { + if (analyzed_type_ids.count(type.self)) + return; + analyzed_type_ids.insert(type.self); + for (auto &member : type.member_types) { auto &subtype = compiler.get(member); @@ -5407,6 +5472,7 @@ bool Compiler::InterlockedResourceAccessHandler::handle(Op opcode, const uint32_ switch (opcode) { case OpLoad: + case OpCooperativeMatrixLoadKHR: { if (length < 3) return false; @@ -5484,6 +5550,7 @@ bool Compiler::InterlockedResourceAccessHandler::handle(Op opcode, const uint32_ case OpStore: case OpImageWrite: case OpAtomicStore: + case OpCooperativeMatrixStoreKHR: { if (length < 1) return false; diff --git a/thirdparty/spirv-cross/spirv_cross.hpp b/thirdparty/spirv-cross/spirv_cross.hpp index e9062b485c..b65b5ac77a 100644 --- a/thirdparty/spirv-cross/spirv_cross.hpp +++ b/thirdparty/spirv-cross/spirv_cross.hpp @@ -1054,6 +1054,7 @@ protected: std::unordered_set non_block_types; std::unordered_map physical_block_type_meta; std::unordered_map access_chain_to_physical_block; + std::unordered_set analyzed_type_ids; void mark_aligned_access(uint32_t id, const uint32_t *args, uint32_t length); PhysicalBlockMeta *find_block_meta(uint32_t id) const; @@ -1072,6 +1073,22 @@ protected: bool single_function); bool may_read_undefined_variable_in_block(const SPIRBlock &block, uint32_t var); + struct GeometryEmitDisocveryHandler : OpcodeHandler + { + explicit GeometryEmitDisocveryHandler(Compiler &compiler_) + : compiler(compiler_) + { + } + Compiler &compiler; + + bool handle(spv::Op opcode, const uint32_t *args, uint32_t length) override; + bool begin_function_scope(const uint32_t *, uint32_t) override; + bool end_function_scope(const 
uint32_t *, uint32_t) override; + SmallVector function_stack; + }; + + void discover_geometry_emitters(); + // Finds all resources that are written to from inside the critical section, if present. // The critical section is delimited by OpBeginInvocationInterlockEXT and // OpEndInvocationInterlockEXT instructions. In MSL and HLSL, any resources written diff --git a/thirdparty/spirv-cross/spirv_cross_parsed_ir.cpp b/thirdparty/spirv-cross/spirv_cross_parsed_ir.cpp index b05afeb3f5..760b8037d4 100644 --- a/thirdparty/spirv-cross/spirv_cross_parsed_ir.cpp +++ b/thirdparty/spirv-cross/spirv_cross_parsed_ir.cpp @@ -452,6 +452,10 @@ void ParsedIR::set_decoration(ID id, Decoration decoration, uint32_t argument) dec.fp_rounding_mode = static_cast(argument); break; + case DecorationFPFastMathMode: + dec.fp_fast_math_mode = static_cast(argument); + break; + default: break; } @@ -643,6 +647,8 @@ uint32_t ParsedIR::get_decoration(ID id, Decoration decoration) const return dec.index; case DecorationFPRoundingMode: return dec.fp_rounding_mode; + case DecorationFPFastMathMode: + return dec.fp_fast_math_mode; default: return 1; } @@ -730,6 +736,10 @@ void ParsedIR::unset_decoration(ID id, Decoration decoration) dec.fp_rounding_mode = FPRoundingModeMax; break; + case DecorationFPFastMathMode: + dec.fp_fast_math_mode = FPFastMathModeMaskNone; + break; + case DecorationHlslCounterBufferGOOGLE: { auto &counter = meta[id].hlsl_magic_counter_buffer; @@ -1050,16 +1060,21 @@ void ParsedIR::make_constant_null(uint32_t id, uint32_t type, bool add_to_typed_ uint32_t parent_id = increase_bound_by(1); make_constant_null(parent_id, constant_type.parent_type, add_to_typed_id_set); - if (!constant_type.array_size_literal.back()) - SPIRV_CROSS_THROW("Array size of OpConstantNull must be a literal."); + // The array size of OpConstantNull can be either literal or specialization constant. + // In the latter case, we cannot take the value as-is, as it can be changed to anything. + // Rather, we assume it to be *one* for the sake of initializer. + bool is_literal_array_size = constant_type.array_size_literal.back(); + uint32_t count = is_literal_array_size ? constant_type.array.back() : 1; - SmallVector elements(constant_type.array.back()); - for (uint32_t i = 0; i < constant_type.array.back(); i++) + SmallVector elements(count); + for (uint32_t i = 0; i < count; i++) elements[i] = parent_id; if (add_to_typed_id_set) add_typed_id(TypeConstant, id); - variant_set(ids[id], type, elements.data(), uint32_t(elements.size()), false).self = id; + auto& constant = variant_set(ids[id], type, elements.data(), uint32_t(elements.size()), false); + constant.self = id; + constant.is_null_array_specialized_length = !is_literal_array_size; } else if (!constant_type.member_types.empty()) { diff --git a/thirdparty/spirv-cross/spirv_cross_util.cpp b/thirdparty/spirv-cross/spirv_cross_util.cpp deleted file mode 100644 index 7cff010d1c..0000000000 --- a/thirdparty/spirv-cross/spirv_cross_util.cpp +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright 2015-2021 Arm Limited - * SPDX-License-Identifier: Apache-2.0 OR MIT - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * At your option, you may choose to accept this material under either: - * 1. The Apache License, Version 2.0, found at , or - * 2. The MIT License, found at . - */ - -#include "spirv_cross_util.hpp" -#include "spirv_common.hpp" - -using namespace spv; -using namespace SPIRV_CROSS_NAMESPACE; - -namespace spirv_cross_util -{ -void rename_interface_variable(Compiler &compiler, const SmallVector &resources, uint32_t location, - const std::string &name) -{ - for (auto &v : resources) - { - if (!compiler.has_decoration(v.id, spv::DecorationLocation)) - continue; - - auto loc = compiler.get_decoration(v.id, spv::DecorationLocation); - if (loc != location) - continue; - - auto &type = compiler.get_type(v.base_type_id); - - // This is more of a friendly variant. If we need to rename interface variables, we might have to rename - // structs as well and make sure all the names match up. - if (type.basetype == SPIRType::Struct) - { - compiler.set_name(v.base_type_id, join("SPIRV_Cross_Interface_Location", location)); - for (uint32_t i = 0; i < uint32_t(type.member_types.size()); i++) - compiler.set_member_name(v.base_type_id, i, join("InterfaceMember", i)); - } - - compiler.set_name(v.id, name); - } -} - -void inherit_combined_sampler_bindings(Compiler &compiler) -{ - auto &samplers = compiler.get_combined_image_samplers(); - for (auto &s : samplers) - { - if (compiler.has_decoration(s.image_id, spv::DecorationDescriptorSet)) - { - uint32_t set = compiler.get_decoration(s.image_id, spv::DecorationDescriptorSet); - compiler.set_decoration(s.combined_id, spv::DecorationDescriptorSet, set); - } - - if (compiler.has_decoration(s.image_id, spv::DecorationBinding)) - { - uint32_t binding = compiler.get_decoration(s.image_id, spv::DecorationBinding); - compiler.set_decoration(s.combined_id, spv::DecorationBinding, binding); - } - } -} -} // namespace spirv_cross_util diff --git a/thirdparty/spirv-cross/spirv_cross_util.hpp b/thirdparty/spirv-cross/spirv_cross_util.hpp deleted file mode 100644 index e6e3fcdb63..0000000000 --- a/thirdparty/spirv-cross/spirv_cross_util.hpp +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2015-2021 Arm Limited - * SPDX-License-Identifier: Apache-2.0 OR MIT - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - * At your option, you may choose to accept this material under either: - * 1. The Apache License, Version 2.0, found at , or - * 2. The MIT License, found at . 
- */ - -#ifndef SPIRV_CROSS_UTIL_HPP -#define SPIRV_CROSS_UTIL_HPP - -#include "spirv_cross.hpp" - -namespace spirv_cross_util -{ -void rename_interface_variable(SPIRV_CROSS_NAMESPACE::Compiler &compiler, - const SPIRV_CROSS_NAMESPACE::SmallVector &resources, - uint32_t location, const std::string &name); -void inherit_combined_sampler_bindings(SPIRV_CROSS_NAMESPACE::Compiler &compiler); -} // namespace spirv_cross_util - -#endif diff --git a/thirdparty/spirv-cross/spirv_glsl.cpp b/thirdparty/spirv-cross/spirv_glsl.cpp index 6c1d5208b9..a01cef4449 100644 --- a/thirdparty/spirv-cross/spirv_glsl.cpp +++ b/thirdparty/spirv-cross/spirv_glsl.cpp @@ -545,7 +545,7 @@ void CompilerGLSL::find_static_extensions() if (options.separate_shader_objects && !options.es && options.version < 410) require_extension_internal("GL_ARB_separate_shader_objects"); - if (ir.addressing_model == AddressingModelPhysicalStorageBuffer64EXT) + if (ir.addressing_model == AddressingModelPhysicalStorageBuffer64) { if (!options.vulkan_semantics) SPIRV_CROSS_THROW("GL_EXT_buffer_reference is only supported in Vulkan GLSL."); @@ -557,7 +557,7 @@ void CompilerGLSL::find_static_extensions() } else if (ir.addressing_model != AddressingModelLogical) { - SPIRV_CROSS_THROW("Only Logical and PhysicalStorageBuffer64EXT addressing models are supported."); + SPIRV_CROSS_THROW("Only Logical and PhysicalStorageBuffer64 addressing models are supported."); } // Check for nonuniform qualifier and passthrough. @@ -631,6 +631,12 @@ void CompilerGLSL::find_static_extensions() require_extension_internal("GL_OVR_multiview2"); } + if (execution.flags.get(ExecutionModeQuadDerivativesKHR) || + (execution.flags.get(ExecutionModeRequireFullQuadsKHR) && get_execution_model() == ExecutionModelFragment)) + { + require_extension_internal("GL_EXT_shader_quad_control"); + } + // KHR one is likely to get promoted at some point, so if we don't see an explicit SPIR-V extension, assume KHR. for (auto &ext : ir.declared_extensions) if (ext == "SPV_NV_fragment_shader_barycentric") @@ -681,6 +687,8 @@ string CompilerGLSL::compile() backend.requires_relaxed_precision_analysis = options.es || options.vulkan_semantics; backend.support_precise_qualifier = (!options.es && options.version >= 400) || (options.es && options.version >= 320); + backend.constant_null_initializer = "{ }"; + backend.requires_matching_array_initializer = true; if (is_legacy_es()) backend.support_case_fallthrough = false; @@ -700,7 +708,7 @@ string CompilerGLSL::compile() // Shaders might cast unrelated data to pointers of non-block types. // Find all such instances and make sure we can cast the pointers to a synthesized block type. 
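As a rough aside on the null-initializer plumbing above (the "{ }" backend default plus the make_constant_null fallback when an array length is a specialization constant), the following self-contained C++ sketch restates the decision with invented stand-in types rather than the real SPIRType/SPIRConstant:

#include <cstdint>
#include <vector>

struct ArrayInfo {
    uint32_t length = 0;            // meaningful only when length_is_literal is true
    bool length_is_literal = false; // false: the length comes from a specialization constant
};

struct NullArrayConstant {
    std::vector<uint32_t> elements;              // ids of per-element null constants
    bool null_array_specialized_length = false;  // later emitted as "{ }" with GL_EXT_null_initializer
};

NullArrayConstant make_null_array(const ArrayInfo &info, uint32_t element_null_id) {
    NullArrayConstant c;
    // A spec-constant length can change at pipeline creation time, so the initializer
    // cannot enumerate elements; assume one element and flag the constant instead.
    uint32_t count = info.length_is_literal ? info.length : 1;
    c.elements.assign(count, element_null_id);
    c.null_array_specialized_length = !info.length_is_literal;
    return c;
}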
- if (ir.addressing_model == AddressingModelPhysicalStorageBuffer64EXT) + if (ir.addressing_model == AddressingModelPhysicalStorageBuffer64) analyze_non_block_pointer_types(); uint32_t pass_count = 0; @@ -1191,6 +1199,9 @@ void CompilerGLSL::emit_header() else if (!options.es && execution.flags.get(ExecutionModeDepthLess)) statement("layout(depth_less) out float gl_FragDepth;"); + if (execution.flags.get(ExecutionModeRequireFullQuadsKHR)) + statement("layout(full_quads) in;"); + break; default: @@ -1201,6 +1212,9 @@ void CompilerGLSL::emit_header() if (cap == CapabilityRayTraversalPrimitiveCullingKHR) statement("layout(primitive_culling);"); + if (execution.flags.get(ExecutionModeQuadDerivativesKHR)) + statement("layout(quad_derivatives) in;"); + if (!inputs.empty()) statement("layout(", merge(inputs), ") in;"); if (!outputs.empty()) @@ -1515,9 +1529,12 @@ uint32_t CompilerGLSL::type_to_packed_base_size(const SPIRType &type, BufferPack case SPIRType::Half: case SPIRType::Short: case SPIRType::UShort: + case SPIRType::BFloat16: return 2; case SPIRType::SByte: case SPIRType::UByte: + case SPIRType::FloatE4M3: + case SPIRType::FloatE5M2: return 1; default: @@ -1528,14 +1545,14 @@ uint32_t CompilerGLSL::type_to_packed_base_size(const SPIRType &type, BufferPack uint32_t CompilerGLSL::type_to_packed_alignment(const SPIRType &type, const Bitset &flags, BufferPackingStandard packing) { - // If using PhysicalStorageBufferEXT storage class, this is a pointer, + // If using PhysicalStorageBuffer storage class, this is a pointer, // and is 64-bit. if (is_physical_pointer(type)) { if (!type.pointer) - SPIRV_CROSS_THROW("Types in PhysicalStorageBufferEXT must be pointers."); + SPIRV_CROSS_THROW("Types in PhysicalStorageBuffer must be pointers."); - if (ir.addressing_model == AddressingModelPhysicalStorageBuffer64EXT) + if (ir.addressing_model == AddressingModelPhysicalStorageBuffer64) { if (packing_is_vec4_padded(packing) && type_is_array_of_pointers(type)) return 16; @@ -1543,7 +1560,7 @@ uint32_t CompilerGLSL::type_to_packed_alignment(const SPIRType &type, const Bits return 8; } else - SPIRV_CROSS_THROW("AddressingModelPhysicalStorageBuffer64EXT must be used for PhysicalStorageBufferEXT."); + SPIRV_CROSS_THROW("AddressingModelPhysicalStorageBuffer64 must be used for PhysicalStorageBuffer."); } else if (is_array(type)) { @@ -1651,17 +1668,17 @@ uint32_t CompilerGLSL::type_to_packed_array_stride(const SPIRType &type, const B uint32_t CompilerGLSL::type_to_packed_size(const SPIRType &type, const Bitset &flags, BufferPackingStandard packing) { - // If using PhysicalStorageBufferEXT storage class, this is a pointer, + // If using PhysicalStorageBuffer storage class, this is a pointer, // and is 64-bit. 
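The new scalar types slot into the packing rules touched in this hunk; as a condensed restatement (the enum and helper below are stand-ins for SPIRType::BaseType and type_to_packed_base_size, not the actual interfaces):

#include <cstdint>
#include <stdexcept>

enum class BaseType { Short, UShort, Half, BFloat16, SByte, UByte, FloatE4M3, FloatE5M2, Float, UInt64 };

uint32_t packed_scalar_size(BaseType t) {
    switch (t) {
    case BaseType::Short:
    case BaseType::UShort:
    case BaseType::Half:
    case BaseType::BFloat16:   // bfloat16 packs like any other 16-bit scalar
        return 2;
    case BaseType::SByte:
    case BaseType::UByte:
    case BaseType::FloatE4M3:  // the 8-bit float formats pack like bytes
    case BaseType::FloatE5M2:
        return 1;
    case BaseType::Float:
        return 4;
    case BaseType::UInt64:
        return 8;
    }
    throw std::logic_error("unhandled base type");
}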
if (is_physical_pointer(type)) { if (!type.pointer) - SPIRV_CROSS_THROW("Types in PhysicalStorageBufferEXT must be pointers."); + SPIRV_CROSS_THROW("Types in PhysicalStorageBuffer must be pointers."); - if (ir.addressing_model == AddressingModelPhysicalStorageBuffer64EXT) + if (ir.addressing_model == AddressingModelPhysicalStorageBuffer64) return 8; else - SPIRV_CROSS_THROW("AddressingModelPhysicalStorageBuffer64EXT must be used for PhysicalStorageBufferEXT."); + SPIRV_CROSS_THROW("AddressingModelPhysicalStorageBuffer64 must be used for PhysicalStorageBuffer."); } else if (is_array(type)) { @@ -2841,7 +2858,7 @@ void CompilerGLSL::emit_uniform(const SPIRVariable &var) statement(layout_for_variable(var), variable_decl(var), ";"); } -string CompilerGLSL::constant_value_macro_name(uint32_t id) +string CompilerGLSL::constant_value_macro_name(uint32_t id) const { return join("SPIRV_CROSS_CONSTANT_ID_", id); } @@ -3624,6 +3641,36 @@ void CompilerGLSL::emit_resources() bool emitted = false; + if (ir.addressing_model == AddressingModelPhysicalStorageBuffer64) + { + // Output buffer reference block forward declarations. + ir.for_each_typed_id([&](uint32_t id, SPIRType &type) + { + if (is_physical_pointer(type)) + { + bool emit_type = true; + if (!is_physical_pointer_to_buffer_block(type)) + { + // Only forward-declare if we intend to emit it in the non_block_pointer types. + // Otherwise, these are just "benign" pointer types that exist as a result of access chains. + emit_type = std::find(physical_storage_non_block_pointer_types.begin(), + physical_storage_non_block_pointer_types.end(), + id) != physical_storage_non_block_pointer_types.end(); + } + + if (emit_type) + { + emit_buffer_reference_block(id, true); + emitted = true; + } + } + }); + } + + if (emitted) + statement(""); + emitted = false; + // If emitted Vulkan GLSL, // emit specialization constants as actual floats, // spec op expressions will redirect to the constant name. @@ -3733,30 +3780,10 @@ void CompilerGLSL::emit_resources() emitted = false; - if (ir.addressing_model == AddressingModelPhysicalStorageBuffer64EXT) + if (ir.addressing_model == AddressingModelPhysicalStorageBuffer64) { // Output buffer reference blocks. - // Do this in two stages, one with forward declaration, - // and one without. Buffer reference blocks can reference themselves - // to support things like linked lists. - ir.for_each_typed_id([&](uint32_t id, SPIRType &type) { - if (is_physical_pointer(type)) - { - bool emit_type = true; - if (!is_physical_pointer_to_buffer_block(type)) - { - // Only forward-declare if we intend to emit it in the non_block_pointer types. - // Otherwise, these are just "benign" pointer types that exist as a result of access chains. - emit_type = std::find(physical_storage_non_block_pointer_types.begin(), - physical_storage_non_block_pointer_types.end(), - id) != physical_storage_non_block_pointer_types.end(); - } - - if (emit_type) - emit_buffer_reference_block(id, true); - } - }); - + // Buffer reference blocks can reference themselves to support things like linked lists. for (auto type : physical_storage_non_block_pointer_types) emit_buffer_reference_block(type, false); @@ -4955,12 +4982,16 @@ void CompilerGLSL::emit_polyfills(uint32_t polyfills, bool relaxed) // Subclasses may override to modify the return value. string CompilerGLSL::to_func_call_arg(const SPIRFunction::Parameter &, uint32_t id) { + // BDA expects pointers through function interface. 
+ if (is_physical_pointer(expression_type(id))) + return to_pointer_expression(id); + // Make sure that we use the name of the original variable, and not the parameter alias. uint32_t name_id = id; auto *var = maybe_get(id); if (var && var->basevariable) name_id = var->basevariable; - return to_expression(name_id); + return to_unpacked_expression(name_id); } void CompilerGLSL::force_temporary_and_recompile(uint32_t id) @@ -5391,6 +5422,15 @@ string CompilerGLSL::to_non_uniform_aware_expression(uint32_t id) return expr; } +string CompilerGLSL::to_atomic_ptr_expression(uint32_t id) +{ + string expr = to_non_uniform_aware_expression(id); + // If we have naked pointer to POD, we need to dereference to get the proper ".value" resolve. + if (should_dereference(id)) + expr = dereference_expression(expression_type(id), expr); + return expr; +} + string CompilerGLSL::to_expression(uint32_t id, bool register_expression_read) { auto itr = invalid_expressions.find(id); @@ -5898,6 +5938,35 @@ string CompilerGLSL::constant_expression(const SPIRConstant &c, { return backend.null_pointer_literal; } + else if (c.is_null_array_specialized_length && backend.requires_matching_array_initializer) + { + require_extension_internal("GL_EXT_null_initializer"); + return backend.constant_null_initializer; + } + else if (c.replicated && type.op != spv::OpTypeArray) + { + if (type.op == spv::OpTypeMatrix) + { + uint32_t num_elements = type.columns; + // GLSL does not allow the replication constructor for matrices + // mat4(vec4(0.0)) needs to be manually expanded to mat4(vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0)); + std::string res; + res += type_to_glsl(type); + res += "("; + for (uint32_t i = 0; i < num_elements; i++) + { + res += to_expression(c.subconstants[0]); + if (i < num_elements - 1) + res += ", "; + } + res += ")"; + return res; + } + else + { + return join(type_to_glsl(type), "(", to_expression(c.subconstants[0]), ")"); + } + } else if (!c.subconstants.empty()) { // Handles Arrays and structures. @@ -5947,8 +6016,16 @@ string CompilerGLSL::constant_expression(const SPIRConstant &c, } uint32_t subconstant_index = 0; - for (auto &elem : c.subconstants) + size_t num_elements = c.subconstants.size(); + if (c.replicated) { + if (type.array.size() != 1) + SPIRV_CROSS_THROW("Multidimensional arrays not yet supported as replicated constans"); + num_elements = type.array[0]; + } + for (size_t i = 0; i < num_elements; i++) + { + auto &elem = c.subconstants[c.replicated ? 0 : i]; if (auto *op = maybe_get(elem)) { res += constant_op_expression(*op); @@ -5979,7 +6056,7 @@ string CompilerGLSL::constant_expression(const SPIRConstant &c, } } - if (&elem != &c.subconstants.back()) + if (i != num_elements - 1) res += ", "; subconstant_index++; @@ -6003,7 +6080,7 @@ string CompilerGLSL::constant_expression(const SPIRConstant &c, else return join(type_to_glsl(type), "(0)"); } - else if (c.columns() == 1) + else if (c.columns() == 1 && type.op != spv::OpTypeCooperativeMatrixKHR) { auto res = constant_expression_vector(c, 0); @@ -6053,17 +6130,44 @@ string CompilerGLSL::constant_expression(const SPIRConstant &c, #pragma warning(disable : 4996) #endif +string CompilerGLSL::convert_floate4m3_to_string(const SPIRConstant &c, uint32_t col, uint32_t row) +{ + string res; + float float_value = c.scalar_floate4m3(col, row); + + // There is no infinity in e4m3. 
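The replicated-constant path above expands matrix replication by hand because GLSL has no single-argument matrix constructor that broadcasts one column; a minimal stand-alone sketch of that expansion (names invented):

#include <cstdint>
#include <string>

std::string replicate_matrix_constructor(const std::string &matrix_type,
                                         const std::string &column_expr,
                                         uint32_t columns) {
    std::string res = matrix_type + "(";
    for (uint32_t i = 0; i < columns; i++) {
        res += column_expr;
        if (i + 1 < columns)
            res += ", ";
    }
    res += ")";
    return res;
}

// replicate_matrix_constructor("mat4", "vec4(0.0)", 4)
// -> "mat4(vec4(0.0), vec4(0.0), vec4(0.0), vec4(0.0))"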
+ if (std::isnan(float_value)) + { + SPIRType type { OpTypeFloat }; + type.basetype = SPIRType::Half; + type.vecsize = 1; + type.columns = 1; + res = join(type_to_glsl(type), "(0.0 / 0.0)"); + } + else + { + SPIRType type { OpTypeFloat }; + type.basetype = SPIRType::FloatE4M3; + type.vecsize = 1; + type.columns = 1; + res = join(type_to_glsl(type), "(", format_float(float_value), ")"); + } + + return res; +} + string CompilerGLSL::convert_half_to_string(const SPIRConstant &c, uint32_t col, uint32_t row) { string res; - float float_value = c.scalar_f16(col, row); + bool is_bfloat8 = get(c.constant_type).basetype == SPIRType::FloatE5M2; + float float_value = is_bfloat8 ? c.scalar_bf8(col, row) : c.scalar_f16(col, row); // There is no literal "hf" in GL_NV_gpu_shader5, so to avoid lots // of complicated workarounds, just value-cast to the half type always. if (std::isnan(float_value) || std::isinf(float_value)) { SPIRType type { OpTypeFloat }; - type.basetype = SPIRType::Half; + type.basetype = is_bfloat8 ? SPIRType::FloatE5M2 : SPIRType::Half; type.vecsize = 1; type.columns = 1; @@ -6079,7 +6183,7 @@ string CompilerGLSL::convert_half_to_string(const SPIRConstant &c, uint32_t col, else { SPIRType type { OpTypeFloat }; - type.basetype = SPIRType::Half; + type.basetype = is_bfloat8 ? SPIRType::FloatE5M2 : SPIRType::Half; type.vecsize = 1; type.columns = 1; res = join(type_to_glsl(type), "(", format_float(float_value), ")"); @@ -6091,7 +6195,9 @@ string CompilerGLSL::convert_half_to_string(const SPIRConstant &c, uint32_t col, string CompilerGLSL::convert_float_to_string(const SPIRConstant &c, uint32_t col, uint32_t row) { string res; - float float_value = c.scalar_f32(col, row); + + bool is_bfloat16 = get(c.constant_type).basetype == SPIRType::BFloat16; + float float_value = is_bfloat16 ? 
c.scalar_bf16(col, row) : c.scalar_f32(col, row); if (std::isnan(float_value) || std::isinf(float_value)) { @@ -6155,6 +6261,9 @@ string CompilerGLSL::convert_float_to_string(const SPIRConstant &c, uint32_t col res += "f"; } + if (is_bfloat16) + res = join("bfloat16_t(", res, ")"); + return res; } @@ -6311,6 +6420,29 @@ string CompilerGLSL::constant_expression_vector(const SPIRConstant &c, uint32_t switch (type.basetype) { + case SPIRType::FloatE4M3: + if (splat || swizzle_splat) + { + res += convert_floate4m3_to_string(c, vector, 0); + if (swizzle_splat) + res = remap_swizzle(get(c.constant_type), 1, res); + } + else + { + for (uint32_t i = 0; i < c.vector_size(); i++) + { + if (c.vector_size() > 1 && c.specialization_constant_id(vector, i) != 0) + res += to_expression(c.specialization_constant_id(vector, i)); + else + res += convert_floate4m3_to_string(c, vector, i); + + if (i + 1 < c.vector_size()) + res += ", "; + } + } + break; + + case SPIRType::FloatE5M2: case SPIRType::Half: if (splat || swizzle_splat) { @@ -6333,6 +6465,7 @@ string CompilerGLSL::constant_expression_vector(const SPIRConstant &c, uint32_t } break; + case SPIRType::BFloat16: case SPIRType::Float: if (splat || swizzle_splat) { @@ -6988,9 +7121,12 @@ void CompilerGLSL::emit_atomic_func_op(uint32_t result_type, uint32_t result_id, require_extension_internal("GL_EXT_shader_atomic_float"); } + if (type.basetype == SPIRType::UInt64 || type.basetype == SPIRType::Int64) + require_extension_internal("GL_EXT_shader_atomic_int64"); + forced_temporaries.insert(result_id); emit_op(result_type, result_id, - join(op, "(", to_non_uniform_aware_expression(op0), ", ", + join(op, "(", to_atomic_ptr_expression(op0), ", ", to_unpacked_expression(op1), ")"), false); flush_all_atomic_capable_variables(); } @@ -9358,6 +9494,10 @@ void CompilerGLSL::emit_subgroup_op(const Instruction &i) require_extension_internal("GL_KHR_shader_subgroup_shuffle_relative"); break; + case OpGroupNonUniformRotateKHR: + require_extension_internal("GL_KHR_shader_subgroup_rotate"); + break; + case OpGroupNonUniformAll: case OpGroupNonUniformAny: case OpGroupNonUniformAllEqual: @@ -9429,6 +9569,13 @@ void CompilerGLSL::emit_subgroup_op(const Instruction &i) require_extension_internal("GL_KHR_shader_subgroup_quad"); break; + case OpGroupNonUniformQuadAllKHR: + case OpGroupNonUniformQuadAnyKHR: + // Require both extensions to be enabled. + require_extension_internal("GL_KHR_shader_subgroup_vote"); + require_extension_internal("GL_EXT_shader_quad_control"); + break; + default: SPIRV_CROSS_THROW("Invalid opcode for subgroup."); } @@ -9436,9 +9583,13 @@ void CompilerGLSL::emit_subgroup_op(const Instruction &i) uint32_t result_type = ops[0]; uint32_t id = ops[1]; - auto scope = static_cast(evaluate_constant_u32(ops[2])); - if (scope != ScopeSubgroup) - SPIRV_CROSS_THROW("Only subgroup scope is supported."); + // These quad ops do not have a scope parameter. 
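The atomic hunks route pointer operands through the new to_atomic_ptr_expression() and pull in GL_EXT_shader_atomic_int64 for 64-bit operands; a condensed sketch of those two decisions, using stand-in types rather than the compiler's own:

#include <set>
#include <string>

struct Operand {
    std::string expr;
    bool is_64bit_integer = false;
    bool is_bare_physical_pointer = false; // buffer_reference pointer to a plain scalar
};

void require_atomic_extensions(const Operand &op, std::set<std::string> &extensions) {
    if (op.is_64bit_integer)
        extensions.insert("GL_EXT_shader_atomic_int64");
}

std::string atomic_ptr_expression(const Operand &op) {
    // Stand-in for dereference_expression(): a bare BDA pointer must be dereferenced,
    // which for a non-block reference type resolves to its synthesized ".value" member.
    return op.is_bare_physical_pointer ? op.expr + ".value" : op.expr;
}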
+ if (op != OpGroupNonUniformQuadAllKHR && op != OpGroupNonUniformQuadAnyKHR) + { + auto scope = static_cast(evaluate_constant_u32(ops[2])); + if (scope != ScopeSubgroup) + SPIRV_CROSS_THROW("Only subgroup scope is supported."); + } switch (op) { @@ -9504,6 +9655,13 @@ void CompilerGLSL::emit_subgroup_op(const Instruction &i) emit_binary_func_op(result_type, id, ops[3], ops[4], "subgroupShuffleDown"); break; + case OpGroupNonUniformRotateKHR: + if (i.length > 5) + emit_trinary_func_op(result_type, id, ops[3], ops[4], ops[5], "subgroupClusteredRotate"); + else + emit_binary_func_op(result_type, id, ops[3], ops[4], "subgroupRotate"); + break; + case OpGroupNonUniformAll: emit_unary_func_op(result_type, id, ops[3], "subgroupAll"); break; @@ -9591,6 +9749,14 @@ case OpGroupNonUniform##op: \ break; } + case OpGroupNonUniformQuadAllKHR: + emit_unary_func_op(result_type, id, ops[2], "subgroupQuadAll"); + break; + + case OpGroupNonUniformQuadAnyKHR: + emit_unary_func_op(result_type, id, ops[2], "subgroupQuadAny"); + break; + default: SPIRV_CROSS_THROW("Invalid opcode for subgroup."); } @@ -9706,6 +9872,30 @@ string CompilerGLSL::bitcast_glsl_op(const SPIRType &out_type, const SPIRType &i return "packUint4x16"; else if (out_type.basetype == SPIRType::UShort && in_type.basetype == SPIRType::UInt64 && in_type.vecsize == 1) return "unpackUint4x16"; + else if (out_type.basetype == SPIRType::BFloat16 && in_type.basetype == SPIRType::UShort) + return "uintBitsToBFloat16EXT"; + else if (out_type.basetype == SPIRType::BFloat16 && in_type.basetype == SPIRType::Short) + return "intBitsToBFloat16EXT"; + else if (out_type.basetype == SPIRType::UShort && in_type.basetype == SPIRType::BFloat16) + return "bfloat16BitsToUintEXT"; + else if (out_type.basetype == SPIRType::Short && in_type.basetype == SPIRType::BFloat16) + return "bfloat16BitsToIntEXT"; + else if (out_type.basetype == SPIRType::FloatE4M3 && in_type.basetype == SPIRType::UByte) + return "uintBitsToFloate4m3EXT"; + else if (out_type.basetype == SPIRType::FloatE4M3 && in_type.basetype == SPIRType::SByte) + return "intBitsToFloate4m3EXT"; + else if (out_type.basetype == SPIRType::UByte && in_type.basetype == SPIRType::FloatE4M3) + return "floate4m3BitsToUintEXT"; + else if (out_type.basetype == SPIRType::SByte && in_type.basetype == SPIRType::FloatE4M3) + return "floate4m3BitsToIntEXT"; + else if (out_type.basetype == SPIRType::FloatE5M2 && in_type.basetype == SPIRType::UByte) + return "uintBitsToFloate5m2EXT"; + else if (out_type.basetype == SPIRType::FloatE5M2 && in_type.basetype == SPIRType::SByte) + return "intBitsToFloate5m2EXT"; + else if (out_type.basetype == SPIRType::UByte && in_type.basetype == SPIRType::FloatE5M2) + return "floate5m2BitsToUintEXT"; + else if (out_type.basetype == SPIRType::SByte && in_type.basetype == SPIRType::FloatE5M2) + return "floate5m2BitsToIntEXT"; return ""; } @@ -9824,7 +10014,17 @@ string CompilerGLSL::builtin_to_glsl(BuiltIn builtin, StorageClass storage) case BuiltInInvocationId: return "gl_InvocationID"; case BuiltInLayer: + { + auto model = get_execution_model(); + if (model == ExecutionModelVertex || model == ExecutionModelTessellationEvaluation) + { + if (options.es) + require_extension_internal("GL_NV_viewport_array2"); + else + require_extension_internal("GL_ARB_shader_viewport_layer_array"); + } return "gl_Layer"; + } case BuiltInViewportIndex: return "gl_ViewportIndex"; case BuiltInTessLevelOuter: @@ -10228,7 +10428,8 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice if 
(!is_ptr_chain) mod_flags &= ~ACCESS_CHAIN_PTR_CHAIN_BIT; access_chain_internal_append_index(expr, base, type, mod_flags, access_chain_is_arrayed, index); - check_physical_type_cast(expr, type, physical_type); + if (check_physical_type_cast(expr, type, physical_type)) + physical_type = 0; }; for (uint32_t i = 0; i < count; i++) @@ -10572,7 +10773,7 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice type = &get(type_id); } // Vector -> Scalar - else if (type->vecsize > 1) + else if (type->op == OpTypeCooperativeMatrixKHR || type->vecsize > 1) { string deferred_index; if (row_major_matrix_needs_conversion) @@ -10634,9 +10835,9 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice if (is_literal) { - bool out_of_bounds = (index >= type->vecsize); + bool out_of_bounds = index >= type->vecsize && type->op != OpTypeCooperativeMatrixKHR; - if (!is_packed && !row_major_matrix_needs_conversion) + if (!is_packed && !row_major_matrix_needs_conversion && type->op != OpTypeCooperativeMatrixKHR) { expr += "."; expr += index_to_swizzle(out_of_bounds ? 0 : index); @@ -10736,8 +10937,9 @@ string CompilerGLSL::access_chain_internal(uint32_t base, const uint32_t *indice return expr; } -void CompilerGLSL::check_physical_type_cast(std::string &, const SPIRType *, uint32_t) +bool CompilerGLSL::check_physical_type_cast(std::string &, const SPIRType *, uint32_t) { + return false; } bool CompilerGLSL::prepare_access_chain_for_scalar_access(std::string &, const SPIRType &, spv::StorageClass, bool &) @@ -11239,7 +11441,7 @@ bool CompilerGLSL::should_dereference(uint32_t id) { const auto &type = expression_type(id); // Non-pointer expressions don't need to be dereferenced. - if (!type.pointer) + if (!is_pointer(type)) return false; // Handles shouldn't be dereferenced either. @@ -11247,8 +11449,9 @@ bool CompilerGLSL::should_dereference(uint32_t id) return false; // If id is a variable but not a phi variable, we should not dereference it. + // BDA passed around as parameters are always pointers. if (auto *var = maybe_get(id)) - return var->phi_variable; + return (var->parameter && is_physical_pointer(type)) || var->phi_variable; if (auto *expr = maybe_get(id)) { @@ -11281,6 +11484,16 @@ bool CompilerGLSL::should_dereference(uint32_t id) return true; } +bool CompilerGLSL::should_dereference_caller_param(uint32_t id) +{ + const auto &type = expression_type(id); + // BDA is always passed around as pointers. + if (is_physical_pointer(type)) + return false; + + return should_dereference(id); +} + bool CompilerGLSL::should_forward(uint32_t id) const { // If id is a variable we will try to forward it regardless of force_temporary check below @@ -11575,7 +11788,8 @@ string CompilerGLSL::build_composite_combiner(uint32_t return_type, const uint32 // Can only merge swizzles for vectors. 
auto &type = get(return_type); - bool can_apply_swizzle_opt = type.basetype != SPIRType::Struct && type.array.empty() && type.columns == 1; + bool can_apply_swizzle_opt = type.basetype != SPIRType::Struct && type.array.empty() && type.columns == 1 && + type.op != spv::OpTypeCooperativeMatrixKHR; bool swizzle_optimization = false; for (uint32_t i = 0; i < length; i++) @@ -12132,6 +12346,33 @@ CompilerGLSL::TemporaryCopy CompilerGLSL::handle_instruction_precision(const Ins return {}; } +static pair split_coopmat_pointer(const string &expr) +{ + auto ptr_expr = expr; + string index_expr; + + if (ptr_expr.back() != ']') + SPIRV_CROSS_THROW("Access chain for coopmat must be indexed into an array."); + + // Strip the access chain. + ptr_expr.pop_back(); + uint32_t counter = 1; + while (counter && !ptr_expr.empty()) + { + if (ptr_expr.back() == ']') + counter++; + else if (ptr_expr.back() == '[') + counter--; + ptr_expr.pop_back(); + } + + if (ptr_expr.empty()) + SPIRV_CROSS_THROW("Invalid pointer expression for coopmat."); + + index_expr = expr.substr(ptr_expr.size() + 1, expr.size() - (ptr_expr.size() + 1) - 1); + return { std::move(ptr_expr), std::move(index_expr) }; +} + void CompilerGLSL::emit_instruction(const Instruction &instruction) { auto ops = stream(instruction); @@ -12675,6 +12916,9 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) if (composite_type_is_complex) allow_base_expression = false; + if (composite_type.op == spv::OpTypeCooperativeMatrixKHR) + allow_base_expression = false; + // Packed expressions or physical ID mapped expressions cannot be split up. if (has_extended_decoration(ops[2], SPIRVCrossDecorationPhysicalTypePacked) || has_extended_decoration(ops[2], SPIRVCrossDecorationPhysicalTypeID)) @@ -13618,13 +13862,42 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) break; } + case OpCooperativeMatrixConvertNV: + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("CooperativeMatrixConvertNV requires vulkan semantics."); + require_extension_internal("GL_NV_cooperative_matrix2"); + // fallthrough case OpFConvert: { uint32_t result_type = ops[0]; uint32_t id = ops[1]; - auto func = type_to_glsl_constructor(get(result_type)); - emit_unary_func_op(result_type, id, ops[2], func.c_str()); + auto &type = get(result_type); + + if (type.op == OpTypeCooperativeMatrixKHR && opcode == OpFConvert) + { + auto &expr_type = expression_type(ops[2]); + if (get(type.cooperative.use_id).scalar() != + get(expr_type.cooperative.use_id).scalar()) + { + // Somewhat questionable with spec constant uses. + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("NV_cooperative_matrix2 requires vulkan semantics."); + require_extension_internal("GL_NV_cooperative_matrix2"); + } + } + + if ((type.basetype == SPIRType::FloatE4M3 || type.basetype == SPIRType::FloatE5M2) && + has_decoration(id, spv::DecorationSaturatedToLargestFloat8NormalConversionEXT)) + { + emit_uninitialized_temporary_expression(result_type, id); + statement("saturatedConvertEXT(", to_expression(id), ", ", to_unpacked_expression(ops[2]), ");"); + } + else + { + auto func = type_to_glsl_constructor(type); + emit_unary_func_op(result_type, id, ops[2], func.c_str()); + } break; } @@ -13843,8 +14116,11 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) const char *increment = unsigned_type ? 
"0u" : "0"; emit_op(ops[0], ops[1], join(op, "(", - to_non_uniform_aware_expression(ops[2]), ", ", increment, ")"), false); + to_atomic_ptr_expression(ops[2]), ", ", increment, ")"), false); flush_all_atomic_capable_variables(); + + if (type.basetype == SPIRType::UInt64 || type.basetype == SPIRType::Int64) + require_extension_internal("GL_EXT_shader_atomic_int64"); break; } @@ -13856,8 +14132,12 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) // Ignore semantics for now, probably only relevant to CL. uint32_t val = ops[3]; const char *op = check_atomic_image(ptr) ? "imageAtomicExchange" : "atomicExchange"; - statement(op, "(", to_non_uniform_aware_expression(ptr), ", ", to_expression(val), ");"); + statement(op, "(", to_atomic_ptr_expression(ptr), ", ", to_expression(val), ");"); flush_all_atomic_capable_variables(); + + auto &type = expression_type(ptr); + if (type.basetype == SPIRType::UInt64 || type.basetype == SPIRType::Int64) + require_extension_internal("GL_EXT_shader_atomic_int64"); break; } @@ -13892,7 +14172,10 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) increment = "-1"; emit_op(ops[0], ops[1], - join(op, "(", to_non_uniform_aware_expression(ops[2]), ", ", increment, ")"), false); + join(op, "(", to_atomic_ptr_expression(ops[2]), ", ", increment, ")"), false); + + if (type.basetype == SPIRType::UInt64 || type.basetype == SPIRType::Int64) + require_extension_internal("GL_EXT_shader_atomic_int64"); } flush_all_atomic_capable_variables(); @@ -13911,9 +14194,13 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) { const char *op = check_atomic_image(ops[2]) ? "imageAtomicAdd" : "atomicAdd"; forced_temporaries.insert(ops[1]); - auto expr = join(op, "(", to_non_uniform_aware_expression(ops[2]), ", -", to_enclosed_expression(ops[5]), ")"); + auto expr = join(op, "(", to_atomic_ptr_expression(ops[2]), ", -", to_enclosed_expression(ops[5]), ")"); emit_op(ops[0], ops[1], expr, should_forward(ops[2]) && should_forward(ops[5])); flush_all_atomic_capable_variables(); + + auto &type = get(ops[0]); + if (type.basetype == SPIRType::UInt64 || type.basetype == SPIRType::Int64) + require_extension_internal("GL_EXT_shader_atomic_int64"); break; } @@ -14717,6 +15004,20 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) break; } + case OpExtInstWithForwardRefsKHR: + { + uint32_t extension_set = ops[2]; + auto ext = get(extension_set).ext; + if (ext != SPIRExtension::SPV_debug_info && + ext != SPIRExtension::NonSemanticShaderDebugInfo && + ext != SPIRExtension::NonSemanticGeneric) + { + SPIRV_CROSS_THROW("Unexpected use of ExtInstWithForwardRefsKHR."); + } + + break; + } + case OpExtInst: { uint32_t extension_set = ops[2]; @@ -14757,7 +15058,7 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) SPIRV_CROSS_THROW("Debug printf is only supported in Vulkan GLSL.\n"); require_extension_internal("GL_EXT_debug_printf"); auto &format_string = get(ops[4]).str; - string expr = join("debugPrintfEXT(\"", format_string, "\""); + string expr = join(backend.printf_function, "(\"", format_string, "\""); for (uint32_t i = 5; i < length; i++) { expr += ", "; @@ -14956,6 +15257,9 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) case OpGroupNonUniformLogicalXor: case OpGroupNonUniformQuadSwap: case OpGroupNonUniformQuadBroadcast: + case OpGroupNonUniformQuadAllKHR: + case OpGroupNonUniformQuadAnyKHR: + case OpGroupNonUniformRotateKHR: emit_subgroup_op(instruction); break; @@ -15175,8 +15479,8 
@@ void CompilerGLSL::emit_instruction(const Instruction &instruction) case OpConvertUToPtr: { auto &type = get(ops[0]); - if (type.storage != StorageClassPhysicalStorageBufferEXT) - SPIRV_CROSS_THROW("Only StorageClassPhysicalStorageBufferEXT is supported by OpConvertUToPtr."); + if (type.storage != StorageClassPhysicalStorageBuffer) + SPIRV_CROSS_THROW("Only StorageClassPhysicalStorageBuffer is supported by OpConvertUToPtr."); auto &in_type = expression_type(ops[2]); if (in_type.vecsize == 2) @@ -15191,8 +15495,8 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) { auto &type = get(ops[0]); auto &ptr_type = expression_type(ops[2]); - if (ptr_type.storage != StorageClassPhysicalStorageBufferEXT) - SPIRV_CROSS_THROW("Only StorageClassPhysicalStorageBufferEXT is supported by OpConvertPtrToU."); + if (ptr_type.storage != StorageClassPhysicalStorageBuffer) + SPIRV_CROSS_THROW("Only StorageClassPhysicalStorageBuffer is supported by OpConvertPtrToU."); if (type.vecsize == 2) require_extension_internal("GL_EXT_buffer_reference_uvec2"); @@ -15291,6 +15595,169 @@ void CompilerGLSL::emit_instruction(const Instruction &instruction) break; } + case OpCooperativeMatrixLengthKHR: + { + // Need to synthesize a dummy temporary, since the SPIR-V opcode is based on the type. + uint32_t result_type = ops[0]; + uint32_t id = ops[1]; + set( + id, join(type_to_glsl(get(result_type)), + "(", type_to_glsl(get(ops[2])), "(0).length())"), + result_type, true); + break; + } + + case OpCooperativeMatrixLoadKHR: + { + // Spec contradicts itself if stride is optional or not. + if (length < 5) + SPIRV_CROSS_THROW("Stride is not provided."); + + uint32_t result_type = ops[0]; + uint32_t id = ops[1]; + emit_uninitialized_temporary_expression(result_type, id); + + auto expr = to_expression(ops[2]); + pair split_expr; + if (!is_forcing_recompilation()) + split_expr = split_coopmat_pointer(expr); + + string layout_expr; + if (const auto *layout = maybe_get(ops[3])) + { + if (!layout->specialization) + { + if (layout->scalar() == spv::CooperativeMatrixLayoutColumnMajorKHR) + layout_expr = "gl_CooperativeMatrixLayoutColumnMajor"; + else + layout_expr = "gl_CooperativeMatrixLayoutRowMajor"; + } + } + + if (layout_expr.empty()) + layout_expr = join("int(", to_expression(ops[3]), ")"); + + statement("coopMatLoad(", + to_expression(id), ", ", + split_expr.first, ", ", + split_expr.second, ", ", + to_expression(ops[4]), ", ", + layout_expr, ");"); + + register_read(id, ops[2], false); + break; + } + + case OpCooperativeMatrixStoreKHR: + { + // Spec contradicts itself if stride is optional or not. + if (length < 4) + SPIRV_CROSS_THROW("Stride is not provided."); + + // SPIR-V and GLSL don't agree how to pass the expression. + // In SPIR-V it's a pointer, but in GLSL it's reference to array + index. 
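That mismatch is exactly what split_coopmat_pointer() papers over: the SPIR-V pointer expression ends in an array index, while coopMatLoad/coopMatStore want the array and the element index as separate arguments. A self-contained restatement of the splitting, with an assumed sample input and simplified error handling:

#include <iostream>
#include <stdexcept>
#include <string>
#include <utility>

std::pair<std::string, std::string> split_trailing_index(const std::string &expr) {
    if (expr.empty() || expr.back() != ']')
        throw std::runtime_error("expression must end with an array index");

    size_t depth = 0;
    size_t open = std::string::npos;
    for (size_t i = expr.size(); i-- > 0;) {
        if (expr[i] == ']')
            depth++;
        else if (expr[i] == '[' && --depth == 0) {
            open = i;
            break;
        }
    }
    if (open == std::string::npos || open == 0)
        throw std::runtime_error("no array expression before the index");

    return { expr.substr(0, open), expr.substr(open + 1, expr.size() - open - 2) };
}

int main() {
    auto split = split_trailing_index("ssbo.data[gl_WorkGroupID.x * 16u]");
    std::cout << split.first << " | " << split.second << "\n"; // ssbo.data | gl_WorkGroupID.x * 16u
}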
+ + auto expr = to_expression(ops[0]); + pair split_expr; + if (!is_forcing_recompilation()) + split_expr = split_coopmat_pointer(expr); + + string layout_expr; + if (const auto *layout = maybe_get(ops[2])) + { + if (!layout->specialization) + { + if (layout->scalar() == spv::CooperativeMatrixLayoutColumnMajorKHR) + layout_expr = "gl_CooperativeMatrixLayoutColumnMajor"; + else + layout_expr = "gl_CooperativeMatrixLayoutRowMajor"; + } + } + + if (layout_expr.empty()) + layout_expr = join("int(", to_expression(ops[2]), ")"); + + statement("coopMatStore(", + to_expression(ops[1]), ", ", + split_expr.first, ", ", + split_expr.second, ", ", + to_expression(ops[3]), ", ", + layout_expr, ");"); + + // TODO: Do we care about memory operands? + + register_write(ops[0]); + break; + } + + case OpCooperativeMatrixMulAddKHR: + { + uint32_t result_type = ops[0]; + uint32_t id = ops[1]; + uint32_t A = ops[2]; + uint32_t B = ops[3]; + uint32_t C = ops[4]; + bool forward = should_forward(A) && should_forward(B) && should_forward(C); + emit_op(result_type, id, + join("coopMatMulAdd(", + to_unpacked_expression(A), ", ", + to_unpacked_expression(B), ", ", + to_unpacked_expression(C), ", ", + (length >= 6 ? ops[5] : 0), + ")"), + forward); + + inherit_expression_dependencies(id, A); + inherit_expression_dependencies(id, B); + inherit_expression_dependencies(id, C); + break; + } + + case OpCompositeConstructReplicateEXT: + { + uint32_t result_type = ops[0]; + uint32_t id = ops[1]; + + auto &type = get(result_type); + auto value_to_replicate = to_expression(ops[2]); + std::string rhs; + // Matrices don't have a replicating constructor for vectors. Need to manually replicate + if (type.op == spv::OpTypeMatrix || type.op == spv::OpTypeArray) + { + if (type.op == spv::OpTypeArray && type.array.size() != 1) + { + SPIRV_CROSS_THROW( + "Multi-dimensional arrays currently not supported for OpCompositeConstructReplicateEXT"); + } + uint32_t num_elements = type.op == spv::OpTypeMatrix ? type.columns : type.array[0]; + if (backend.use_initializer_list && type.op == spv::OpTypeArray) + { + rhs += "{"; + } + else + { + rhs += type_to_glsl_constructor(type); + rhs += "("; + } + for (uint32_t i = 0; i < num_elements; i++) + { + rhs += value_to_replicate; + if (i < num_elements - 1) + rhs += ", "; + } + if (backend.use_initializer_list && type.op == spv::OpTypeArray) + rhs += "}"; + else + rhs += ")"; + } + else + { + rhs = join(type_to_glsl(type), "(", to_expression(ops[2]), ")"); + } + emit_op(result_type, id, rhs, true); + break; + } + default: statement("// unimplemented op ", instruction.op); break; @@ -15689,7 +16156,10 @@ string CompilerGLSL::argument_decl(const SPIRFunction::Parameter &arg) auto &type = expression_type(arg.id); const char *direction = ""; - if (type.pointer) + if (is_pointer(type) && + (type.storage == StorageClassFunction || + type.storage == StorageClassPrivate || + type.storage == StorageClassOutput)) { // If we're passing around block types to function, we really mean reference in a pointer sense, // but DXC does not like inout for mesh blocks, so workaround that. 
out is technically not correct, @@ -15763,13 +16233,24 @@ string CompilerGLSL::variable_decl(const SPIRVariable &variable) else if (options.force_zero_initialized_variables && type_can_zero_initialize(type)) res += join(" = ", to_zero_initialized_expression(get_variable_data_type_id(variable))); } - else if (variable.initializer && !variable_decl_is_remapped_storage(variable, StorageClassWorkgroup)) + else if (variable.initializer) { - uint32_t expr = variable.initializer; - if (ir.ids[expr].get_type() != TypeUndef) - res += join(" = ", to_initializer_expression(variable)); - else if (options.force_zero_initialized_variables && type_can_zero_initialize(type)) - res += join(" = ", to_zero_initialized_expression(get_variable_data_type_id(variable))); + if (!variable_decl_is_remapped_storage(variable, StorageClassWorkgroup)) + { + uint32_t expr = variable.initializer; + if (ir.ids[expr].get_type() != TypeUndef) + res += join(" = ", to_initializer_expression(variable)); + else if (options.force_zero_initialized_variables && type_can_zero_initialize(type)) + res += join(" = ", to_zero_initialized_expression(get_variable_data_type_id(variable))); + } + else + { + // Workgroup memory requires special handling. First, it can only be Null-Initialized. + // GLSL will handle this with null initializer, while others require more work after the decl + require_extension_internal("GL_EXT_null_initializer"); + if (!backend.constant_null_initializer.empty()) + res += join(" = ", backend.constant_null_initializer); + } } return res; @@ -15849,7 +16330,7 @@ string CompilerGLSL::to_array_size(const SPIRType &type, uint32_t index) string CompilerGLSL::type_to_array_glsl(const SPIRType &type, uint32_t) { - if (type.pointer && type.storage == StorageClassPhysicalStorageBufferEXT && type.basetype != SPIRType::Struct) + if (type.pointer && type.storage == StorageClassPhysicalStorageBuffer && type.basetype != SPIRType::Struct) { // We are using a wrapped pointer type, and we should not emit any array declarations here. return ""; @@ -16124,6 +16605,61 @@ string CompilerGLSL::type_to_glsl(const SPIRType &type, uint32_t id) require_extension_internal("GL_ARB_shader_atomic_counters"); } + const SPIRType *coop_type = &type; + while (is_pointer(*coop_type) || is_array(*coop_type)) + coop_type = &get(coop_type->parent_type); + + if (coop_type->op == spv::OpTypeCooperativeMatrixKHR) + { + require_extension_internal("GL_KHR_cooperative_matrix"); + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("Cooperative matrix only available in Vulkan."); + // GLSL doesn't support this as spec constant, which makes sense ... 
+ uint32_t use_type = get(coop_type->cooperative.use_id).scalar(); + + const char *use = nullptr; + switch (use_type) + { + case CooperativeMatrixUseMatrixAKHR: + use = "gl_MatrixUseA"; + break; + + case CooperativeMatrixUseMatrixBKHR: + use = "gl_MatrixUseB"; + break; + + case CooperativeMatrixUseMatrixAccumulatorKHR: + use = "gl_MatrixUseAccumulator"; + break; + + default: + SPIRV_CROSS_THROW("Invalid matrix use."); + } + + string scope_expr; + if (const auto *scope = maybe_get(coop_type->cooperative.scope_id)) + { + if (!scope->specialization) + { + require_extension_internal("GL_KHR_memory_scope_semantics"); + if (scope->scalar() == spv::ScopeSubgroup) + scope_expr = "gl_ScopeSubgroup"; + else if (scope->scalar() == spv::ScopeWorkgroup) + scope_expr = "gl_ScopeWorkgroup"; + else + SPIRV_CROSS_THROW("Invalid scope for cooperative matrix."); + } + } + + if (scope_expr.empty()) + scope_expr = to_expression(coop_type->cooperative.scope_id); + + return join("coopmat<", type_to_glsl(get(coop_type->parent_type)), ", ", + scope_expr, ", ", + to_expression(coop_type->cooperative.rows_id), ", ", + to_expression(coop_type->cooperative.columns_id), ", ", use, ">"); + } + if (type.vecsize == 1 && type.columns == 1) // Scalar builtin { switch (type.basetype) @@ -16146,6 +16682,21 @@ string CompilerGLSL::type_to_glsl(const SPIRType &type, uint32_t id) return "atomic_uint"; case SPIRType::Half: return "float16_t"; + case SPIRType::BFloat16: + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("bfloat16 requires Vulkan semantics."); + require_extension_internal("GL_EXT_bfloat16"); + return "bfloat16_t"; + case SPIRType::FloatE4M3: + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("floate4m3_t requires Vulkan semantics."); + require_extension_internal("GL_EXT_float_e4m3"); + return "floate4m3_t"; + case SPIRType::FloatE5M2: + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("floate5m2_t requires Vulkan semantics."); + require_extension_internal("GL_EXT_float_e5m2"); + return "floate5m2_t"; case SPIRType::Float: return "float"; case SPIRType::Double: @@ -16178,6 +16729,21 @@ string CompilerGLSL::type_to_glsl(const SPIRType &type, uint32_t id) return join("uvec", type.vecsize); case SPIRType::Half: return join("f16vec", type.vecsize); + case SPIRType::BFloat16: + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("bfloat16 requires Vulkan semantics."); + require_extension_internal("GL_EXT_bfloat16"); + return join("bf16vec", type.vecsize); + case SPIRType::FloatE4M3: + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("floate4m3_t requires Vulkan semantics."); + require_extension_internal("GL_EXT_float_e4m3"); + return join("fe4m3vec", type.vecsize); + case SPIRType::FloatE5M2: + if (!options.vulkan_semantics) + SPIRV_CROSS_THROW("floate5m2_t requires Vulkan semantics."); + require_extension_internal("GL_EXT_float_e5m2"); + return join("fe5m2vec", type.vecsize); case SPIRType::Float: return join("vec", type.vecsize); case SPIRType::Double: @@ -16353,6 +16919,11 @@ void CompilerGLSL::add_function_overload(const SPIRFunction &func) // but that will not change the signature in GLSL/HLSL, // so strip the pointer type before hashing. uint32_t type_id = get_pointee_type_id(arg.type); + + // Workaround glslang bug. It seems to only consider the base type when resolving overloads. 
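Put together, the scope, row/column and use operands from this hunk end up as a GL_KHR_cooperative_matrix type name; a small sketch of the assembled string (the helper and enum names are invented):

#include <cstdint>
#include <string>

enum class MatrixUse { A, B, Accumulator };

std::string coopmat_type_name(const std::string &component, const std::string &scope,
                              uint32_t rows, uint32_t cols, MatrixUse use) {
    const char *use_str = use == MatrixUse::A ? "gl_MatrixUseA" :
                          use == MatrixUse::B ? "gl_MatrixUseB" : "gl_MatrixUseAccumulator";
    return "coopmat<" + component + ", " + scope + ", " +
           std::to_string(rows) + ", " + std::to_string(cols) + ", " + use_str + ">";
}

// coopmat_type_name("float16_t", "gl_ScopeSubgroup", 16, 16, MatrixUse::A)
// -> "coopmat<float16_t, gl_ScopeSubgroup, 16, 16, gl_MatrixUseA>"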
+ if (get(type_id).op == spv::OpTypeCooperativeMatrixKHR) + type_id = get(type_id).parent_type; + auto &type = get(type_id); if (!combined_image_samplers.empty()) @@ -16492,6 +17063,7 @@ void CompilerGLSL::emit_function(SPIRFunction &func, const Bitset &return_flags) { // Recursively emit functions which are called. uint32_t id = ops[2]; + emit_function(get(id), ir.meta[ops[1]].decoration.decoration_flags); } } @@ -16530,6 +17102,12 @@ void CompilerGLSL::emit_function(SPIRFunction &func, const Bitset &return_flags) // Comes from MSL which can push global variables as local variables in main function. add_local_variable_name(var.self); statement(variable_decl(var), ";"); + + // "Real" workgroup variables in compute shaders needs extra caretaking. + // They need to be initialized with an extra routine as they come in arbitrary form. + if (var.storage == StorageClassWorkgroup && var.initializer) + emit_workgroup_initialization(var); + var.deferred_declaration = false; } else if (var.storage == StorageClassPrivate) @@ -16636,6 +17214,10 @@ void CompilerGLSL::emit_fixup() } } +void CompilerGLSL::emit_workgroup_initialization(const SPIRVariable &) +{ +} + void CompilerGLSL::flush_phi(BlockID from, BlockID to) { auto &child = get(to); @@ -17869,6 +18451,14 @@ void CompilerGLSL::emit_block_chain(SPIRBlock &block) case SPIRBlock::Unreachable: { + // If the entry point ends with unreachable and has a return value, insert a return + // statement to avoid potential compiler errors from non-void functions without a return value. + if (block.return_value) + { + statement("return ", to_unpacked_expression(block.return_value), ";"); + break; + } + // Avoid emitting false fallthrough, which can happen for // if (cond) break; else discard; inside a case label. // Discard is not always implementable as a terminator. 
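emit_workgroup_initialization() is deliberately a no-op in the GLSL backend, since GL_EXT_null_initializer already covers shared variables in the declaration itself; for backends that need the "more work after the decl" mentioned above, a hypothetical override might emit something along these lines (the emitted text below is illustrative and not taken from the patch):

#include <iostream>
#include <string>

// Stand-in for CompilerGLSL::statement(): prints one emitted source line.
void statement(const std::string &s) { std::cout << s << "\n"; }

// Hypothetical zero-initialization routine for one workgroup variable:
// one invocation writes the zero value, then everyone synchronizes.
void emit_workgroup_zero_init(const std::string &var_name, const std::string &zero_expr) {
    statement("if (gl_LocalInvocationIndex == 0u)");
    statement("    " + var_name + " = " + zero_expr + ";");
    statement("barrier();");
}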
diff --git a/thirdparty/spirv-cross/spirv_glsl.hpp b/thirdparty/spirv-cross/spirv_glsl.hpp index 8a00263234..03ff330ccf 100644 --- a/thirdparty/spirv-cross/spirv_glsl.hpp +++ b/thirdparty/spirv-cross/spirv_glsl.hpp @@ -297,6 +297,9 @@ public: float_formatter = formatter; } + // Returns the macro name corresponding to constant id + std::string constant_value_macro_name(uint32_t id) const; + protected: struct ShaderSubgroupSupportHelper { @@ -450,6 +453,7 @@ protected: virtual std::string variable_decl(const SPIRType &type, const std::string &name, uint32_t id = 0); virtual bool variable_decl_is_remapped_storage(const SPIRVariable &var, spv::StorageClass storage) const; virtual std::string to_func_call_arg(const SPIRFunction::Parameter &arg, uint32_t id); + virtual void emit_workgroup_initialization(const SPIRVariable &var); struct TextureFunctionBaseArguments { @@ -622,6 +626,8 @@ protected: const char *uint16_t_literal_suffix = "us"; const char *nonuniform_qualifier = "nonuniformEXT"; const char *boolean_mix_function = "mix"; + const char *printf_function = "debugPrintfEXT"; + std::string constant_null_initializer = ""; SPIRType::BaseType boolean_in_struct_remapped_type = SPIRType::Boolean; bool swizzle_is_function = false; bool shared_is_implied = false; @@ -629,6 +635,7 @@ protected: bool explicit_struct_type = false; bool use_initializer_list = false; bool use_typed_initializer_list = false; + bool requires_matching_array_initializer = false; bool can_declare_struct_inline = true; bool can_declare_arrays_inline = true; bool native_row_major_matrix = true; @@ -679,7 +686,6 @@ protected: const SmallVector &indices); void emit_block_chain(SPIRBlock &block); void emit_hoisted_temporaries(SmallVector> &temporaries); - std::string constant_value_macro_name(uint32_t id); int get_constant_mapping_to_workgroup_component(const SPIRConstant &constant) const; void emit_constant(const SPIRConstant &constant); void emit_specialization_constant_op(const SPIRConstantOp &constant); @@ -695,6 +701,7 @@ protected: void emit_variable_temporary_copies(const SPIRVariable &var); bool should_dereference(uint32_t id); + bool should_dereference_caller_param(uint32_t id); bool should_forward(uint32_t id) const; bool should_suppress_usage_tracking(uint32_t id) const; void emit_mix_op(uint32_t result_type, uint32_t id, uint32_t left, uint32_t right, uint32_t lerp); @@ -762,7 +769,7 @@ protected: spv::StorageClass get_expression_effective_storage_class(uint32_t ptr); virtual bool access_chain_needs_stage_io_builtin_translation(uint32_t base); - virtual void check_physical_type_cast(std::string &expr, const SPIRType *type, uint32_t physical_type); + virtual bool check_physical_type_cast(std::string &expr, const SPIRType *type, uint32_t physical_type); virtual bool prepare_access_chain_for_scalar_access(std::string &expr, const SPIRType &type, spv::StorageClass storage, bool &is_packed); @@ -792,8 +799,9 @@ protected: std::string declare_temporary(uint32_t type, uint32_t id); void emit_uninitialized_temporary(uint32_t type, uint32_t id); SPIRExpression &emit_uninitialized_temporary_expression(uint32_t type, uint32_t id); - void append_global_func_args(const SPIRFunction &func, uint32_t index, SmallVector &arglist); + virtual void append_global_func_args(const SPIRFunction &func, uint32_t index, SmallVector &arglist); std::string to_non_uniform_aware_expression(uint32_t id); + std::string to_atomic_ptr_expression(uint32_t id); std::string to_expression(uint32_t id, bool register_expression_read = true); std::string 
to_composite_constructor_expression(const SPIRType &parent_type, uint32_t id, bool block_like_type); std::string to_rerolled_array_expression(const SPIRType &parent_type, const std::string &expr, const SPIRType &type); @@ -1009,6 +1017,8 @@ protected: const Instruction *get_next_instruction_in_block(const Instruction &instr); static uint32_t mask_relevant_memory_semantics(uint32_t semantics); + std::string convert_floate4m3_to_string(const SPIRConstant &value, uint32_t col, uint32_t row); + std::string convert_floate5m2_to_string(const SPIRConstant &value, uint32_t col, uint32_t row); std::string convert_half_to_string(const SPIRConstant &value, uint32_t col, uint32_t row); std::string convert_float_to_string(const SPIRConstant &value, uint32_t col, uint32_t row); std::string convert_double_to_string(const SPIRConstant &value, uint32_t col, uint32_t row); diff --git a/thirdparty/spirv-cross/spirv_msl.cpp b/thirdparty/spirv-cross/spirv_msl.cpp index 642fcfa59a..1c4f1ed13a 100644 --- a/thirdparty/spirv-cross/spirv_msl.cpp +++ b/thirdparty/spirv-cross/spirv_msl.cpp @@ -272,16 +272,22 @@ void CompilerMSL::build_implicit_builtins() (active_input_builtins.get(BuiltInVertexId) || active_input_builtins.get(BuiltInVertexIndex) || active_input_builtins.get(BuiltInBaseVertex) || active_input_builtins.get(BuiltInInstanceId) || active_input_builtins.get(BuiltInInstanceIndex) || active_input_builtins.get(BuiltInBaseInstance)); - bool need_local_invocation_index = (msl_options.emulate_subgroups && active_input_builtins.get(BuiltInSubgroupId)) || is_mesh_shader(); + bool need_local_invocation_index = + (msl_options.emulate_subgroups && active_input_builtins.get(BuiltInSubgroupId)) || is_mesh_shader() || + needs_workgroup_zero_init || needs_local_invocation_index; bool need_workgroup_size = msl_options.emulate_subgroups && active_input_builtins.get(BuiltInNumSubgroups); bool force_frag_depth_passthrough = get_execution_model() == ExecutionModelFragment && !uses_explicit_early_fragment_test() && need_subpass_input && msl_options.enable_frag_depth_builtin && msl_options.input_attachment_is_ds_attachment; + bool need_point_size = + msl_options.enable_point_size_builtin && msl_options.enable_point_size_default && + get_execution_model() == ExecutionModelVertex; if (need_subpass_input || need_sample_pos || need_subgroup_mask || need_vertex_params || need_tesc_params || need_tese_params || need_multiview || need_dispatch_base || need_vertex_base_params || need_grid_params || needs_sample_id || needs_subgroup_invocation_id || needs_subgroup_size || needs_helper_invocation || - has_additional_fixed_sample_mask() || need_local_invocation_index || need_workgroup_size || force_frag_depth_passthrough || is_mesh_shader()) + has_additional_fixed_sample_mask() || need_local_invocation_index || need_workgroup_size || + force_frag_depth_passthrough || need_point_size || is_mesh_shader()) { bool has_frag_coord = false; bool has_sample_id = false; @@ -299,6 +305,7 @@ void CompilerMSL::build_implicit_builtins() bool has_local_invocation_index = false; bool has_workgroup_size = false; bool has_frag_depth = false; + bool has_point_size = false; uint32_t workgroup_id_type = 0; ir.for_each_typed_id([&](uint32_t, SPIRVariable &var) { @@ -306,6 +313,22 @@ void CompilerMSL::build_implicit_builtins() return; if (!interface_variable_exists_in_entry_point(var.self)) return; + + auto &type = this->get(var.basetype); + if (need_point_size && has_decoration(type.self, DecorationBlock)) + { + const auto member_count = 
static_cast(type.member_types.size()); + for (uint32_t i = 0; i < member_count; i++) + { + if (get_member_decoration(type.self, i, DecorationBuiltIn) == BuiltInPointSize) + { + has_point_size = true; + active_output_builtins.set(BuiltInPointSize); + break; + } + } + } + if (!has_decoration(var.self, DecorationBuiltIn)) return; @@ -328,6 +351,12 @@ void CompilerMSL::build_implicit_builtins() } } + if (builtin == BuiltInPointSize) + { + has_point_size = true; + active_output_builtins.set(BuiltInPointSize); + } + if (builtin == BuiltInPrimitivePointIndicesEXT || builtin == BuiltInPrimitiveLineIndicesEXT || builtin == BuiltInPrimitiveTriangleIndicesEXT) @@ -481,7 +510,7 @@ void CompilerMSL::build_implicit_builtins() has_local_invocation_index = true; } - if (need_workgroup_size && builtin == BuiltInLocalInvocationId) + if (need_workgroup_size && builtin == BuiltInWorkgroupSize) { builtin_workgroup_size_id = var.self; mark_implicit_builtin(StorageClassInput, BuiltInWorkgroupSize, var.self); @@ -903,25 +932,55 @@ void CompilerMSL::build_implicit_builtins() if (need_workgroup_size && !has_workgroup_size) { - uint32_t offset = ir.increase_bound_by(2); - uint32_t type_ptr_id = offset; - uint32_t var_id = offset + 1; + auto &execution = get_entry_point(); + // First, check if the workgroup size _constant_ were defined. + // If it were, we don't need to do--in fact, shouldn't do--anything. + builtin_workgroup_size_id = execution.workgroup_size.constant; + if (builtin_workgroup_size_id == 0) + { + uint32_t var_id = ir.increase_bound_by(1); - // Create gl_WorkgroupSize. - uint32_t type_id = build_extended_vector_type(get_uint_type_id(), 3); - SPIRType uint_type_ptr = get(type_id); - uint_type_ptr.op = OpTypePointer; - uint_type_ptr.pointer = true; - uint_type_ptr.pointer_depth++; - uint_type_ptr.parent_type = type_id; - uint_type_ptr.storage = StorageClassInput; + // Create gl_WorkgroupSize. + uint32_t type_id = build_extended_vector_type(get_uint_type_id(), 3); + // If we have LocalSize or LocalSizeId, use those to define the workgroup size. 
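+ // For example (illustrative numbers): a compute shader declared with
+ // 'layout(local_size_x = 64, local_size_y = 1, local_size_z = 1) in;' carries ExecutionModeLocalSize,
+ // so the second branch below packs the literals into three uint constants and a (64u, 1u, 1u)
+ // constant composite that then stands in for gl_WorkgroupSize; the LocalSizeId branch does the
+ // same with the referenced spec constants.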
+ if (execution.flags.get(ExecutionModeLocalSizeId)) + { + const SPIRConstant *init[] = { &get(execution.workgroup_size.id_x), + &get(execution.workgroup_size.id_y), + &get(execution.workgroup_size.id_z) }; + bool specialized = init[0]->specialization || init[1]->specialization || init[2]->specialization; + set(var_id, type_id, init, 3, specialized); + execution.workgroup_size.constant = var_id; + } + else if (execution.flags.get(ExecutionModeLocalSize)) + { + uint32_t offset = ir.increase_bound_by(3); + const SPIRConstant *init[] = { + &set(offset, get_uint_type_id(), execution.workgroup_size.x, false), + &set(offset + 1, get_uint_type_id(), execution.workgroup_size.y, false), + &set(offset + 2, get_uint_type_id(), execution.workgroup_size.z, false) + }; + set(var_id, type_id, init, 3, false); + execution.workgroup_size.constant = var_id; + } + else + { + uint32_t type_ptr_id = ir.increase_bound_by(1); + SPIRType uint_type_ptr = get(type_id); + uint_type_ptr.op = OpTypePointer; + uint_type_ptr.pointer = true; + uint_type_ptr.pointer_depth++; + uint_type_ptr.parent_type = type_id; + uint_type_ptr.storage = StorageClassInput; - auto &ptr_type = set(type_ptr_id, uint_type_ptr); - ptr_type.self = type_id; - set(var_id, type_ptr_id, StorageClassInput); - set_decoration(var_id, DecorationBuiltIn, BuiltInWorkgroupSize); - builtin_workgroup_size_id = var_id; - mark_implicit_builtin(StorageClassInput, BuiltInWorkgroupSize, var_id); + auto &ptr_type = set(type_ptr_id, uint_type_ptr); + ptr_type.self = type_id; + set(var_id, type_ptr_id, StorageClassInput); + mark_implicit_builtin(StorageClassInput, BuiltInWorkgroupSize, var_id); + } + set_decoration(var_id, DecorationBuiltIn, BuiltInWorkgroupSize); + builtin_workgroup_size_id = var_id; + } } if (!has_frag_depth && force_frag_depth_passthrough) @@ -953,6 +1012,34 @@ void CompilerMSL::build_implicit_builtins() mark_implicit_builtin(StorageClassOutput, BuiltInFragDepth, var_id); active_output_builtins.set(BuiltInFragDepth); } + + if (!has_point_size && need_point_size) + { + uint32_t offset = ir.increase_bound_by(3); + uint32_t type_id = offset; + uint32_t type_ptr_id = offset + 1; + uint32_t var_id = offset + 2; + + // Create gl_PointSize + SPIRType float_type { OpTypeFloat }; + float_type.basetype = SPIRType::Float; + float_type.width = 32; + float_type.vecsize = 1; + set(type_id, float_type); + + SPIRType float_type_ptr_in = float_type; + float_type_ptr_in.op = spv::OpTypePointer; + float_type_ptr_in.pointer = true; + float_type_ptr_in.pointer_depth++; + float_type_ptr_in.parent_type = type_id; + float_type_ptr_in.storage = StorageClassOutput; + + auto &ptr_in_type = set(type_ptr_id, float_type_ptr_in); + ptr_in_type.self = type_id; + set(var_id, type_ptr_id, StorageClassOutput); + set_decoration(var_id, DecorationBuiltIn, BuiltInPointSize); + mark_implicit_builtin(StorageClassOutput, BuiltInPointSize, var_id); + } } if (needs_swizzle_buffer_def) @@ -1003,6 +1090,7 @@ void CompilerMSL::build_implicit_builtins() // If we're returning a struct from a vertex-like entry point, we must return a position attribute. 
bool need_position = (get_execution_model() == ExecutionModelVertex || is_tese_shader()) && !capture_output_to_buffer && !get_is_rasterization_disabled() && + !msl_options.auto_disable_rasterization && !active_output_builtins.get(BuiltInPosition); if (need_position) @@ -1039,6 +1127,10 @@ void CompilerMSL::build_implicit_builtins() }); need_position = has_output && !active_output_builtins.get(BuiltInPosition); } + else if (!active_output_builtins.get(BuiltInPosition) && msl_options.auto_disable_rasterization) + { + is_rasterization_disabled = true; + } if (need_position) { @@ -1593,6 +1685,7 @@ string CompilerMSL::compile() backend.basic_int16_type = "short"; backend.basic_uint16_type = "ushort"; backend.boolean_mix_function = "select"; + backend.printf_function = "os_log_default.log"; backend.swizzle_is_function = false; backend.shared_is_implied = false; backend.use_initializer_list = true; @@ -1606,7 +1699,7 @@ string CompilerMSL::compile() backend.nonuniform_qualifier = ""; backend.support_small_type_sampling_result = true; backend.force_merged_mesh_block = false; - backend.force_gl_in_out_block = get_execution_model() == ExecutionModelMeshEXT; + backend.force_gl_in_out_block = false; backend.supports_empty_struct = true; backend.support_64bit_switch = true; backend.boolean_in_struct_remapped_type = SPIRType::Short; @@ -1644,6 +1737,7 @@ string CompilerMSL::compile() analyze_image_and_sampler_usage(); analyze_sampled_image_usage(); analyze_interlocked_resource_usage(); + analyze_workgroup_variables(); preprocess_op_codes(); build_implicit_builtins(); @@ -1777,7 +1871,7 @@ void CompilerMSL::preprocess_op_codes() if (preproc.uses_atomics) { add_header_line("#include "); - add_pragma_line("#pragma clang diagnostic ignored \"-Wunused-variable\""); + add_pragma_line("#pragma clang diagnostic ignored \"-Wunused-variable\"", false); } // Before MSL 2.1 (2.2 for textures), Metal vertex functions that write to @@ -1794,6 +1888,8 @@ void CompilerMSL::preprocess_op_codes() capture_output_to_buffer = true; } + if (preproc.needs_local_invocation_index) + needs_local_invocation_index = true; if (preproc.needs_subgroup_invocation_id) needs_subgroup_invocation_id = true; if (preproc.needs_subgroup_size) @@ -2147,6 +2243,36 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std:: break; } + case OpGroupNonUniformRotateKHR: + { + // Add the correct invocation ID for calculating clustered rotate case. + if (i.length > 5) + added_arg_ids.insert(static_cast(evaluate_constant_u32(ops[2])) == ScopeSubgroup + ? 
builtin_subgroup_invocation_id_id : builtin_local_invocation_index_id); + break; + } + + case OpGroupNonUniformFAdd: + case OpGroupNonUniformFMul: + case OpGroupNonUniformFMin: + case OpGroupNonUniformFMax: + case OpGroupNonUniformIAdd: + case OpGroupNonUniformIMul: + case OpGroupNonUniformSMin: + case OpGroupNonUniformSMax: + case OpGroupNonUniformUMin: + case OpGroupNonUniformUMax: + case OpGroupNonUniformBitwiseAnd: + case OpGroupNonUniformBitwiseOr: + case OpGroupNonUniformBitwiseXor: + case OpGroupNonUniformLogicalAnd: + case OpGroupNonUniformLogicalOr: + case OpGroupNonUniformLogicalXor: + if ((get_execution_model() != ExecutionModelFragment || msl_options.supports_msl_version(2, 2)) && + ops[3] == GroupOperationClusteredReduce) + added_arg_ids.insert(builtin_subgroup_invocation_id_id); + break; + case OpDemoteToHelperInvocation: if (needs_manual_helper_invocation_updates() && needs_helper_invocation) added_arg_ids.insert(builtin_helper_invocation_id); @@ -2317,7 +2443,14 @@ void CompilerMSL::extract_global_variables_from_function(uint32_t func_id, std:: v.storage = StorageClassWorkgroup; // Ensure the existing variable has a valid name and the new variable has all the same meta info - set_name(arg_id, ensure_valid_name(to_name(arg_id), "v")); + if (ir.meta[arg_id].decoration.builtin) + { + set_name(arg_id, builtin_to_glsl(bi_type, var.storage)); + } + else + { + set_name(arg_id, ensure_valid_name(to_name(arg_id), "v")); + } ir.meta[next_id] = ir.meta[arg_id]; } else if (is_builtin && has_decoration(p_type->self, DecorationBlock)) @@ -3182,41 +3315,62 @@ void CompilerMSL::add_composite_member_variable_to_interface_block(StorageClass string mbr_name = ensure_valid_name(append_member_name(mbr_name_qual, var_type, mbr_idx) + (mbr_is_indexable ? join("_", i) : ""), "m"); set_member_name(ib_type.self, ib_mbr_idx, mbr_name); + // The SPIRV location of interface variable, used to obtain the initial + // MSL location (the location variable) and interface matching + uint32_t ir_location = UINT32_MAX; + bool has_member_loc_decor = has_member_decoration(var_type.self, mbr_idx, DecorationLocation); + bool has_var_loc_decor = has_decoration(var.self, DecorationLocation); + uint32_t orig_vecsize = UINT32_MAX; + + // If we haven't established a location base yet, do so here. + if (location == UINT32_MAX) + { + if (has_member_loc_decor) + ir_location = get_member_decoration(var_type.self, mbr_idx, DecorationLocation); + else if (has_var_loc_decor) + ir_location = get_accumulated_member_location(var, mbr_idx, meta.strip_array); + else if (is_builtin) + { + if (is_tessellation_shader() && storage == StorageClassInput && inputs_by_builtin.count(builtin)) + ir_location = inputs_by_builtin[builtin].location; + else if (capture_output_to_buffer && storage == StorageClassOutput && outputs_by_builtin.count(builtin)) + ir_location = outputs_by_builtin[builtin].location; + } + } + // Once we determine the location of the first member within nested structures, // from a var of the topmost structure, the remaining flattened members of // the nested structures will have consecutive location values. At this point, // we've recursively tunnelled into structs, arrays, and matrices, and are // down to a single location for each member now. 
- if (!is_builtin && location != UINT32_MAX) + if (location == UINT32_MAX && ir_location != UINT32_MAX) + location = ir_location + i; + + if (storage == StorageClassInput && (has_member_loc_decor || has_var_loc_decor)) { - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, location); - mark_location_as_used_by_shader(location, *usable_type, storage); - location++; + uint32_t component = 0; + uint32_t orig_mbr_type_id = usable_type->self; + + if (has_member_loc_decor) + component = get_member_decoration(var_type.self, mbr_idx, DecorationComponent); + + var.basetype = ensure_correct_input_type(var.basetype, location, component, 0, meta.strip_array); + mbr_type_id = ensure_correct_input_type(usable_type->self, location, component, 0, meta.strip_array); + + // For members of the composite interface block, we only change the interface block type + // when interface matching happens. In the meantime, we store the original vector size + // and insert a swizzle when loading from metal interface block (see fixup below) + if (mbr_type_id != orig_mbr_type_id) + orig_vecsize = get(orig_mbr_type_id).vecsize; + + if (storage == StorageClassInput && pull_model_inputs.count(var.self)) + ib_type.member_types[ib_mbr_idx] = build_msl_interpolant_type(mbr_type_id, is_noperspective); + else + ib_type.member_types[ib_mbr_idx] = mbr_type_id; } - else if (has_member_decoration(var_type.self, mbr_idx, DecorationLocation)) + + if ((!is_builtin && location != UINT32_MAX) || (is_builtin && ir_location != UINT32_MAX)) { - location = get_member_decoration(var_type.self, mbr_idx, DecorationLocation) + i; - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, location); - mark_location_as_used_by_shader(location, *usable_type, storage); - location++; - } - else if (has_decoration(var.self, DecorationLocation)) - { - location = get_accumulated_member_location(var, mbr_idx, meta.strip_array) + i; - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, location); - mark_location_as_used_by_shader(location, *usable_type, storage); - location++; - } - else if (is_builtin && is_tessellation_shader() && storage == StorageClassInput && inputs_by_builtin.count(builtin)) - { - location = inputs_by_builtin[builtin].location + i; - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, location); - mark_location_as_used_by_shader(location, *usable_type, storage); - location++; - } - else if (is_builtin && capture_output_to_buffer && storage == StorageClassOutput && outputs_by_builtin.count(builtin)) - { - location = outputs_by_builtin[builtin].location + i; set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, location); mark_location_as_used_by_shader(location, *usable_type, storage); location++; @@ -3256,6 +3410,7 @@ void CompilerMSL::add_composite_member_variable_to_interface_block(StorageClass case StorageClassInput: entry_func.fixup_hooks_in.push_back([=, &var]() { string lerp_call; + string swizzle; if (pull_model_inputs.count(var.self)) { if (is_centroid) @@ -3265,7 +3420,9 @@ void CompilerMSL::add_composite_member_variable_to_interface_block(StorageClass else lerp_call = ".interpolate_at_center()"; } - statement(var_chain, " = ", ib_var_ref, ".", mbr_name, lerp_call, ";"); + if (orig_vecsize != UINT32_MAX) + swizzle = vector_swizzle(orig_vecsize, 0); + statement(var_chain, " = ", ib_var_ref, ".", mbr_name, lerp_call, swizzle, ";"); }); break; @@ -3333,6 +3490,55 @@ void CompilerMSL::add_plain_member_variable_to_interface_block(StorageClass stor 
qual_var_name += ".interpolate_at_center()"; } + // The SPIRV location of interface variable, used to obtain the initial + // MSL location (the location variable) and interface matching + uint32_t ir_location = UINT32_MAX; + bool has_member_loc_decor = has_member_decoration(var_type.self, mbr_idx, DecorationLocation); + bool has_var_loc_decor = has_decoration(var.self, DecorationLocation); + uint32_t orig_vecsize = UINT32_MAX; + + if (has_member_loc_decor) + ir_location = get_member_decoration(var_type.self, mbr_idx, DecorationLocation); + else if (has_var_loc_decor) + ir_location = get_accumulated_member_location(var, mbr_idx, meta.strip_array); + else if (is_builtin) + { + if (is_tessellation_shader() && storage == StorageClassInput && inputs_by_builtin.count(builtin)) + ir_location = inputs_by_builtin[builtin].location; + else if (capture_output_to_buffer && storage == StorageClassOutput && outputs_by_builtin.count(builtin)) + ir_location = outputs_by_builtin[builtin].location; + } + + // Once we determine the location of the first member within nested structures, + // from a var of the topmost structure, the remaining flattened members of + // the nested structures will have consecutive location values. At this point, + // we've recursively tunnelled into structs, arrays, and matrices, and are + // down to a single location for each member now. + if (location == UINT32_MAX && ir_location != UINT32_MAX) + location = ir_location; + + if (storage == StorageClassInput && (has_member_loc_decor || has_var_loc_decor)) + { + uint32_t component = 0; + uint32_t orig_mbr_type_id = mbr_type_id; + + if (has_member_loc_decor) + component = get_member_decoration(var_type.self, mbr_idx, DecorationComponent); + + mbr_type_id = ensure_correct_input_type(mbr_type_id, location, component, 0, meta.strip_array); + + // For members of the composite interface block, we only change the interface block type + // when interface matching happens. In the meantime, we store the original vector size + // and insert a swizzle when loading from metal interface block (see fixup below) + if (mbr_type_id != orig_mbr_type_id) + orig_vecsize = get(orig_mbr_type_id).vecsize; + + if (storage == StorageClassInput && pull_model_inputs.count(var.self)) + ib_type.member_types[ib_mbr_idx] = build_msl_interpolant_type(mbr_type_id, is_noperspective); + else + ib_type.member_types[ib_mbr_idx] = mbr_type_id; + } + bool flatten_stage_out = false; string var_chain = var_chain_qual + "." + to_member_name(var_type, mbr_idx); if (is_builtin && !meta.strip_array) @@ -3348,7 +3554,11 @@ void CompilerMSL::add_plain_member_variable_to_interface_block(StorageClass stor { case StorageClassInput: entry_func.fixup_hooks_in.push_back([=]() { - statement(var_chain, " = ", qual_var_name, ";"); + string swizzle; + // Insert swizzle for widened interface block vector from interface matching + if (orig_vecsize != UINT32_MAX) + swizzle = vector_swizzle(orig_vecsize, 0); + statement(var_chain, " = ", qual_var_name, swizzle, ";"); }); break; @@ -3364,64 +3574,12 @@ void CompilerMSL::add_plain_member_variable_to_interface_block(StorageClass stor } } - // Once we determine the location of the first member within nested structures, - // from a var of the topmost structure, the remaining flattened members of - // the nested structures will have consecutive location values. At this point, - // we've recursively tunnelled into structs, arrays, and matrices, and are - // down to a single location for each member now. 
- if (!is_builtin && location != UINT32_MAX) + if ((!is_builtin && location != UINT32_MAX) || (is_builtin && ir_location != UINT32_MAX)) { set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, location); mark_location_as_used_by_shader(location, get(mbr_type_id), storage); location += type_to_location_count(get(mbr_type_id)); } - else if (has_member_decoration(var_type.self, mbr_idx, DecorationLocation)) - { - location = get_member_decoration(var_type.self, mbr_idx, DecorationLocation); - uint32_t comp = get_member_decoration(var_type.self, mbr_idx, DecorationComponent); - if (storage == StorageClassInput) - { - mbr_type_id = ensure_correct_input_type(mbr_type_id, location, comp, 0, meta.strip_array); - var_type.member_types[mbr_idx] = mbr_type_id; - if (storage == StorageClassInput && pull_model_inputs.count(var.self)) - ib_type.member_types[ib_mbr_idx] = build_msl_interpolant_type(mbr_type_id, is_noperspective); - else - ib_type.member_types[ib_mbr_idx] = mbr_type_id; - } - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, location); - mark_location_as_used_by_shader(location, get(mbr_type_id), storage); - location += type_to_location_count(get(mbr_type_id)); - } - else if (has_decoration(var.self, DecorationLocation)) - { - location = get_accumulated_member_location(var, mbr_idx, meta.strip_array); - if (storage == StorageClassInput) - { - mbr_type_id = ensure_correct_input_type(mbr_type_id, location, 0, 0, meta.strip_array); - var_type.member_types[mbr_idx] = mbr_type_id; - if (storage == StorageClassInput && pull_model_inputs.count(var.self)) - ib_type.member_types[ib_mbr_idx] = build_msl_interpolant_type(mbr_type_id, is_noperspective); - else - ib_type.member_types[ib_mbr_idx] = mbr_type_id; - } - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, location); - mark_location_as_used_by_shader(location, get(mbr_type_id), storage); - location += type_to_location_count(get(mbr_type_id)); - } - else if (is_builtin && is_tessellation_shader() && storage == StorageClassInput && inputs_by_builtin.count(builtin)) - { - location = inputs_by_builtin[builtin].location; - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, location); - mark_location_as_used_by_shader(location, get(mbr_type_id), storage); - location += type_to_location_count(get(mbr_type_id)); - } - else if (is_builtin && capture_output_to_buffer && storage == StorageClassOutput && outputs_by_builtin.count(builtin)) - { - location = outputs_by_builtin[builtin].location; - set_member_decoration(ib_type.self, ib_mbr_idx, DecorationLocation, location); - mark_location_as_used_by_shader(location, get(mbr_type_id), storage); - location += type_to_location_count(get(mbr_type_id)); - } // Copy the component location, if present. if (has_member_decoration(var_type.self, mbr_idx, DecorationComponent)) @@ -3720,6 +3878,20 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st return; } + // Tesselation stages pass I/O via buffer content which may contain nested structs. + // Ensure the vector sizes of any nested struct members within these input variables match + // the vector sizes of the corresponding output variables from the previous pipeline stage. + // This adjustment is handled here instead of ensure_correct_input_type() in order to + // perform the necessary recursive processing. 
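+ // For example (illustrative values): a nested member declared as a 3-component vector at
+ // location 5, while the stage-input description supplied through the API records a 4-component
+ // vector at location 5 from the previous stage, gets widened to a 4-component vector so the
+ // raw buffer layout matches what that stage actually wrote.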
+ if (storage == StorageClassInput && var_type.basetype == SPIRType::Struct && + ((is_tesc_shader() && msl_options.multi_patch_workgroup) || + (is_tese_shader() && msl_options.raw_buffer_tese_input)) && + has_decoration(var.self, DecorationLocation)) + { + uint32_t locn = get_decoration(var.self, DecorationLocation); + ensure_struct_members_valid_vecsizes(get_variable_data_type(var), locn); + } + if (storage == StorageClassInput && has_decoration(var.self, DecorationPerVertexKHR)) SPIRV_CROSS_THROW("PerVertexKHR decoration is not supported in MSL."); @@ -3915,6 +4087,43 @@ void CompilerMSL::add_variable_to_interface_block(StorageClass storage, const st } } +// Recursively iterate into the input struct type, and adjust the vecsize +// of any nested members, based on location info provided through the API. +// The location parameter is modified recursively. +void CompilerMSL::ensure_struct_members_valid_vecsizes(SPIRType &struct_type, uint32_t &location) +{ + assert(struct_type.basetype == SPIRType::Struct); + + auto mbr_cnt = struct_type.member_types.size(); + for (size_t mbr_idx = 0; mbr_idx < mbr_cnt; mbr_idx++) + { + auto mbr_type_id = struct_type.member_types[mbr_idx]; + auto &mbr_type = get(mbr_type_id); + + if (mbr_type.basetype == SPIRType::Struct) + ensure_struct_members_valid_vecsizes(mbr_type, location); + else + { + auto p_va = inputs_by_location.find({location, 0}); + if (p_va != end(inputs_by_location) && p_va->second.vecsize > mbr_type.vecsize) + { + // Set a new member type into the struct type, and all its parent types. + auto new_mbr_type_id = build_extended_vector_type(mbr_type_id, p_va->second.vecsize); + for (auto *p_type = &struct_type; p_type; p_type = maybe_get(p_type->parent_type)) + p_type->member_types[mbr_idx] = new_mbr_type_id; + } + + // Calc location of next member + uint32_t loc_cnt = mbr_type.columns; + auto dim_cnt = mbr_type.array.size(); + for (uint32_t i = 0; i < dim_cnt; i++) + loc_cnt *= to_array_size_literal(mbr_type, i); + + location += loc_cnt; + } + } +} + // Fix up the mapping of variables to interface member indices, which is used to compile access chains // for per-vertex variables in a tessellation control shader. void CompilerMSL::fix_up_interface_member_indices(StorageClass storage, uint32_t ib_type_id) @@ -4219,8 +4428,9 @@ uint32_t CompilerMSL::add_interface_block(StorageClass storage, bool patch) // If the entry point should return the output struct, set the entry function // to return the output interface struct, otherwise to return nothing. // Watch out for the rare case where the terminator of the last entry point block is a - // Kill, instead of a Return. Based on SPIR-V's block-domination rules, we assume that - // any block that has a Kill will also have a terminating Return, except the last block. + // Kill or Unreachable, instead of a Return. Based on SPIR-V's block-domination rules, + // we assume that any block that has a Kill will also have a terminating Return, except + // the last block. // Indicate the output var requires early initialization. bool ep_should_return_output = !get_is_rasterization_disabled(); uint32_t rtn_id = ep_should_return_output ? 
ib_var_id : 0; @@ -4230,7 +4440,8 @@ uint32_t CompilerMSL::add_interface_block(StorageClass storage, bool patch) for (auto &blk_id : entry_func.blocks) { auto &blk = get(blk_id); - if (blk.terminator == SPIRBlock::Return || (blk.terminator == SPIRBlock::Kill && blk_id == entry_func.blocks.back())) + auto last_blk_return = blk.terminator == SPIRBlock::Kill || blk.terminator == SPIRBlock::Unreachable; + if (blk.terminator == SPIRBlock::Return || (last_blk_return && blk_id == entry_func.blocks.back())) blk.return_value = rtn_id; } vars_needing_early_declaration.push_back(ib_var_id); @@ -5539,18 +5750,44 @@ void CompilerMSL::emit_header() { // This particular line can be overridden during compilation, so make it a flag and not a pragma line. if (suppress_missing_prototypes) - statement("#pragma clang diagnostic ignored \"-Wmissing-prototypes\""); + add_pragma_line("#pragma clang diagnostic ignored \"-Wmissing-prototypes\"", false); if (suppress_incompatible_pointer_types_discard_qualifiers) - statement("#pragma clang diagnostic ignored \"-Wincompatible-pointer-types-discards-qualifiers\""); + add_pragma_line("#pragma clang diagnostic ignored \"-Wincompatible-pointer-types-discards-qualifiers\"", false); + + // Disable warning about "sometimes unitialized" when zero-initializing simple threadgroup variables + if (suppress_sometimes_unitialized) + add_pragma_line("#pragma clang diagnostic ignored \"-Wsometimes-uninitialized\"", false); // Disable warning about missing braces for array template to make arrays a value type if (spv_function_implementations.count(SPVFuncImplUnsafeArray) != 0) - statement("#pragma clang diagnostic ignored \"-Wmissing-braces\""); + add_pragma_line("#pragma clang diagnostic ignored \"-Wmissing-braces\"", false); + + // Floating point fast math compile declarations + if (msl_options.use_fast_math_pragmas && msl_options.supports_msl_version(3, 2)) + { + uint32_t contract_mask = FPFastMathModeAllowContractMask; + uint32_t relax_mask = (FPFastMathModeNSZMask | FPFastMathModeAllowRecipMask | FPFastMathModeAllowReassocMask); + uint32_t fast_mask = (relax_mask | FPFastMathModeNotNaNMask | FPFastMathModeNotInfMask); + + // FP math mode + uint32_t fp_flags = get_fp_fast_math_flags(true); + const char *math_mode = "safe"; + if ((fp_flags & fast_mask) == fast_mask) // Must have all flags + math_mode = "fast"; + else if ((fp_flags & relax_mask) == relax_mask) // Must have all flags + math_mode = "relaxed"; + + add_pragma_line(join("#pragma metal fp math_mode(", math_mode, ")"), false); + + // FP contraction + const char *contract_mode = ((fp_flags & contract_mask) == contract_mask) ? 
"fast" : "off"; + add_pragma_line(join("#pragma metal fp contract(", contract_mode, ")"), false); + } for (auto &pragma : pragma_lines) statement(pragma); - if (!pragma_lines.empty() || suppress_missing_prototypes) + if (!pragma_lines.empty()) statement(""); statement("#include "); @@ -5570,18 +5807,23 @@ void CompilerMSL::emit_header() statement(""); } -void CompilerMSL::add_pragma_line(const string &line) +void CompilerMSL::add_pragma_line(const string &line, bool recompile_on_unique) { - auto rslt = pragma_lines.insert(line); - if (rslt.second) - force_recompile(); + if (std::find(pragma_lines.begin(), pragma_lines.end(), line) == pragma_lines.end()) + { + pragma_lines.push_back(line); + if (recompile_on_unique) + force_recompile(); + } } void CompilerMSL::add_typedef_line(const string &line) { - auto rslt = typedef_lines.insert(line); - if (rslt.second) + if (std::find(typedef_lines.begin(), typedef_lines.end(), line) == typedef_lines.end()) + { + typedef_lines.push_back(line); force_recompile(); + } } // Template struct like spvUnsafeArray<> need to be declared *before* any resources are declared @@ -5789,7 +6031,6 @@ void CompilerMSL::emit_custom_functions() if (!msl_options.supports_msl_version(2)) SPIRV_CROSS_THROW( "spvDynamicImageSampler requires default-constructible texture objects, which require MSL 2.0."); - spv_function_implementations.insert(SPVFuncImplForwardArgs); spv_function_implementations.insert(SPVFuncImplTextureSwizzle); if (msl_options.swizzle_texture_samples) spv_function_implementations.insert(SPVFuncImplGatherSwizzle); @@ -5803,16 +6044,22 @@ void CompilerMSL::emit_custom_functions() spv_function_implementations.insert(SPVFuncImplConvertYCbCrBT2020); } - for (uint32_t i = SPVFuncImplChromaReconstructNearest2Plane; - i <= SPVFuncImplChromaReconstructLinear420XMidpointYMidpoint3Plane; i++) - if (spv_function_implementations.count(static_cast(i))) - spv_function_implementations.insert(SPVFuncImplForwardArgs); + if (spv_function_implementations.count(SPVFuncImplGatherSwizzle) || + spv_function_implementations.count(SPVFuncImplGatherConstOffsets)) + { + spv_function_implementations.insert(SPVFuncImplGatherReturn); + } + + if (spv_function_implementations.count(SPVFuncImplGatherCompareSwizzle) || + spv_function_implementations.count(SPVFuncImplGatherCompareConstOffsets)) + { + spv_function_implementations.insert(SPVFuncImplGatherCompareReturn); + } if (spv_function_implementations.count(SPVFuncImplTextureSwizzle) || spv_function_implementations.count(SPVFuncImplGatherSwizzle) || spv_function_implementations.count(SPVFuncImplGatherCompareSwizzle)) { - spv_function_implementations.insert(SPVFuncImplForwardArgs); spv_function_implementations.insert(SPVFuncImplGetSwizzle); } @@ -5820,6 +6067,17 @@ void CompilerMSL::emit_custom_functions() { switch (spv_func) { + case SPVFuncImplSMod: + statement("// Implementation of signed integer mod accurate to SPIR-V specification"); + statement("template"); + statement("inline Tx spvSMod(Tx x, Ty y)"); + begin_scope(); + statement("Tx remainder = x - y * (x / y);"); + statement("return select(Tx(remainder + y), remainder, remainder == 0 || (x >= 0) == (y >= 0));"); + end_scope(); + statement(""); + break; + case SPVFuncImplMod: statement("// Implementation of the GLSL mod() function, which is slightly different than Metal fmod()"); statement("template"); @@ -6282,23 +6540,6 @@ void CompilerMSL::emit_custom_functions() statement(""); break; - case SPVFuncImplForwardArgs: - statement("template struct spvRemoveReference { typedef T 
type; };"); - statement("template struct spvRemoveReference { typedef T type; };"); - statement("template struct spvRemoveReference { typedef T type; };"); - statement("template inline constexpr thread T&& spvForward(thread typename " - "spvRemoveReference::type& x)"); - begin_scope(); - statement("return static_cast(x);"); - end_scope(); - statement("template inline constexpr thread T&& spvForward(thread typename " - "spvRemoveReference::type&& x)"); - begin_scope(); - statement("return static_cast(x);"); - end_scope(); - statement(""); - break; - case SPVFuncImplGetSwizzle: statement("enum class spvSwizzle : uint"); begin_scope(); @@ -6356,11 +6597,22 @@ void CompilerMSL::emit_custom_functions() statement(""); break; + case SPVFuncImplGatherReturn: + statement("template"); + statement("using spvGatherReturn = decltype(declval().gather(declval(), declval()...));"); + statement(""); + break; + + case SPVFuncImplGatherCompareReturn: + statement("template"); + statement("using spvGatherCompareReturn = decltype(declval().gather_compare(declval(), declval()...));"); + statement(""); + break; + case SPVFuncImplGatherSwizzle: statement("// Wrapper function that swizzles texture gathers."); - statement("template class Tex, " - "typename... Ts>"); - statement("inline vec spvGatherSwizzle(const thread Tex& t, sampler s, " + statement("template"); + statement("inline spvGatherReturn spvGatherSwizzle(const thread Tex& t, sampler s, " "uint sw, component c, Ts... params) METAL_CONST_ARG(c)"); begin_scope(); statement("if (sw)"); @@ -6370,17 +6622,17 @@ void CompilerMSL::emit_custom_functions() statement("case spvSwizzle::none:"); statement(" break;"); statement("case spvSwizzle::zero:"); - statement(" return vec(0, 0, 0, 0);"); + statement(" return spvGatherReturn(0, 0, 0, 0);"); statement("case spvSwizzle::one:"); - statement(" return vec(1, 1, 1, 1);"); + statement(" return spvGatherReturn(1, 1, 1, 1);"); statement("case spvSwizzle::red:"); - statement(" return t.gather(s, spvForward(params)..., component::x);"); + statement(" return t.gather(s, params..., component::x);"); statement("case spvSwizzle::green:"); - statement(" return t.gather(s, spvForward(params)..., component::y);"); + statement(" return t.gather(s, params..., component::y);"); statement("case spvSwizzle::blue:"); - statement(" return t.gather(s, spvForward(params)..., component::z);"); + statement(" return t.gather(s, params..., component::z);"); statement("case spvSwizzle::alpha:"); - statement(" return t.gather(s, spvForward(params)..., component::w);"); + statement(" return t.gather(s, params..., component::w);"); end_scope(); end_scope(); // texture::gather insists on its component parameter being a constant @@ -6388,13 +6640,13 @@ void CompilerMSL::emit_custom_functions() statement("switch (c)"); begin_scope(); statement("case component::x:"); - statement(" return t.gather(s, spvForward(params)..., component::x);"); + statement(" return t.gather(s, params..., component::x);"); statement("case component::y:"); - statement(" return t.gather(s, spvForward(params)..., component::y);"); + statement(" return t.gather(s, params..., component::y);"); statement("case component::z:"); - statement(" return t.gather(s, spvForward(params)..., component::z);"); + statement(" return t.gather(s, params..., component::z);"); statement("case component::w:"); - statement(" return t.gather(s, spvForward(params)..., component::w);"); + statement(" return t.gather(s, params..., component::w);"); end_scope(); end_scope(); statement(""); @@ -6402,10 
+6654,8 @@ void CompilerMSL::emit_custom_functions() case SPVFuncImplGatherCompareSwizzle: statement("// Wrapper function that swizzles depth texture gathers."); - statement("template class Tex, " - "typename... Ts>"); - statement("inline vec spvGatherCompareSwizzle(const thread Tex& t, sampler " - "s, uint sw, Ts... params) "); + statement("template"); + statement("inline spvGatherCompareReturn spvGatherCompareSwizzle(const thread Tex& t, sampler s, uint sw, Ts... params)"); begin_scope(); statement("if (sw)"); begin_scope(); @@ -6418,12 +6668,12 @@ void CompilerMSL::emit_custom_functions() statement("case spvSwizzle::green:"); statement("case spvSwizzle::blue:"); statement("case spvSwizzle::alpha:"); - statement(" return vec(0, 0, 0, 0);"); + statement(" return spvGatherCompareReturn(0, 0, 0, 0);"); statement("case spvSwizzle::one:"); - statement(" return vec(1, 1, 1, 1);"); + statement(" return spvGatherCompareReturn(1, 1, 1, 1);"); end_scope(); end_scope(); - statement("return t.gather_compare(s, spvForward(params)...);"); + statement("return t.gather_compare(s, params...);"); end_scope(); statement(""); break; @@ -6433,33 +6683,32 @@ void CompilerMSL::emit_custom_functions() for (uint32_t i = 0; i < texture_addr_space_count; i++) { statement("// Wrapper function that processes a ", texture_addr_spaces[i], " texture gather with a constant offset array."); - statement("template class Tex, " - "typename Toff, typename... Tp>"); - statement("inline vec spvGatherConstOffsets(const ", texture_addr_spaces[i], " Tex& t, sampler s, " + statement("template"); + statement("inline spvGatherReturn spvGatherConstOffsets(const ", texture_addr_spaces[i], " Tex& t, sampler s, " "Toff coffsets, component c, Tp... params) METAL_CONST_ARG(c)"); begin_scope(); - statement("vec rslts[4];"); + statement("spvGatherReturn rslts[4];"); statement("for (uint i = 0; i < 4; i++)"); begin_scope(); statement("switch (c)"); begin_scope(); // Work around texture::gather() requiring its component parameter to be a constant expression statement("case component::x:"); - statement(" rslts[i] = t.gather(s, spvForward(params)..., coffsets[i], component::x);"); + statement(" rslts[i] = t.gather(s, params..., coffsets[i], component::x);"); statement(" break;"); statement("case component::y:"); - statement(" rslts[i] = t.gather(s, spvForward(params)..., coffsets[i], component::y);"); + statement(" rslts[i] = t.gather(s, params..., coffsets[i], component::y);"); statement(" break;"); statement("case component::z:"); - statement(" rslts[i] = t.gather(s, spvForward(params)..., coffsets[i], component::z);"); + statement(" rslts[i] = t.gather(s, params..., coffsets[i], component::z);"); statement(" break;"); statement("case component::w:"); - statement(" rslts[i] = t.gather(s, spvForward(params)..., coffsets[i], component::w);"); + statement(" rslts[i] = t.gather(s, params..., coffsets[i], component::w);"); statement(" break;"); end_scope(); end_scope(); // Pull all values from the i0j0 component of each gather footprint - statement("return vec(rslts[0].w, rslts[1].w, rslts[2].w, rslts[3].w);"); + statement("return spvGatherReturn(rslts[0].w, rslts[1].w, rslts[2].w, rslts[3].w);"); end_scope(); statement(""); } @@ -6470,18 +6719,17 @@ void CompilerMSL::emit_custom_functions() for (uint32_t i = 0; i < texture_addr_space_count; i++) { statement("// Wrapper function that processes a ", texture_addr_spaces[i], " texture gather with a constant offset array."); - statement("template class Tex, " - "typename Toff, typename... 
Tp>"); - statement("inline vec spvGatherCompareConstOffsets(const ", texture_addr_spaces[i], " Tex& t, sampler s, " + statement("template"); + statement("inline spvGatherCompareReturn spvGatherCompareConstOffsets(const ", texture_addr_spaces[i], " Tex& t, sampler s, " "Toff coffsets, Tp... params)"); begin_scope(); - statement("vec rslts[4];"); + statement("spvGatherCompareReturn rslts[4];"); statement("for (uint i = 0; i < 4; i++)"); begin_scope(); - statement(" rslts[i] = t.gather_compare(s, spvForward(params)..., coffsets[i]);"); + statement(" rslts[i] = t.gather_compare(s, params..., coffsets[i]);"); end_scope(); // Pull all values from the i0j0 component of each gather footprint - statement("return vec(rslts[0].w, rslts[1].w, rslts[2].w, rslts[3].w);"); + statement("return spvGatherCompareReturn(rslts[0].w, rslts[1].w, rslts[2].w, rslts[3].w);"); end_scope(); statement(""); } @@ -6826,6 +7074,135 @@ void CompilerMSL::emit_custom_functions() statement(""); break; + case SPVFuncImplSubgroupRotate: + statement("template"); + statement("inline T spvSubgroupRotate(T value, ushort delta)"); + begin_scope(); + if (msl_options.use_quadgroup_operation()) + statement("return quad_shuffle_rotate_down(value, delta);"); + else + statement("return simd_shuffle_rotate_down(value, delta);"); + end_scope(); + statement(""); + statement("template<>"); + statement("inline bool spvSubgroupRotate(bool value, ushort delta)"); + begin_scope(); + if (msl_options.use_quadgroup_operation()) + statement("return !!quad_shuffle_rotate_down((ushort)value, delta);"); + else + statement("return !!simd_shuffle_rotate_down((ushort)value, delta);"); + end_scope(); + statement(""); + statement("template"); + statement("inline vec spvSubgroupRotate(vec value, ushort delta)"); + begin_scope(); + if (msl_options.use_quadgroup_operation()) + statement("return (vec)quad_shuffle_rotate_down((vec)value, delta);"); + else + statement("return (vec)simd_shuffle_rotate_down((vec)value, delta);"); + end_scope(); + statement(""); + break; + + // C++ disallows partial specializations of function templates, + // hence the use of a struct. 
+ // clang-format off +#define FUNC_SUBGROUP_CLUSTERED(spv, msl, combine, op, ident) \ + case SPVFuncImplSubgroupClustered##spv: \ + statement("template"); \ + statement("struct spvClustered" #spv "Detail;"); \ + statement(""); \ + statement("// Base cases"); \ + statement("template<>"); \ + statement("struct spvClustered" #spv "Detail<1, 0>"); \ + begin_scope(); \ + statement("template"); \ + statement("static T op(T value, uint)"); \ + begin_scope(); \ + statement("return value;"); \ + end_scope(); \ + end_scope_decl(); \ + statement(""); \ + statement("template"); \ + statement("struct spvClustered" #spv "Detail<1, offset>"); \ + begin_scope(); \ + statement("template"); \ + statement("static T op(T value, uint lid)"); \ + begin_scope(); \ + statement("// If the target lane is inactive, then return identity."); \ + if (msl_options.use_quadgroup_operation()) \ + statement("if (!extract_bits((quad_vote::vote_t)quad_active_threads_mask(), (lid ^ offset), 1))"); \ + else \ + statement("if (!extract_bits(as_type((simd_vote::vote_t)simd_active_threads_mask())[(lid ^ offset) / 32], (lid ^ offset) % 32, 1))"); \ + statement(" return " #ident ";"); \ + if (msl_options.use_quadgroup_operation()) \ + statement("return quad_shuffle_xor(value, offset);"); \ + else \ + statement("return simd_shuffle_xor(value, offset);"); \ + end_scope(); \ + end_scope_decl(); \ + statement(""); \ + statement("template<>"); \ + statement("struct spvClustered" #spv "Detail<4, 0>"); \ + begin_scope(); \ + statement("template"); \ + statement("static T op(T value, uint)"); \ + begin_scope(); \ + statement("return quad_" #msl "(value);"); \ + end_scope(); \ + end_scope_decl(); \ + statement(""); \ + statement("template"); \ + statement("struct spvClustered" #spv "Detail<4, offset>"); \ + begin_scope(); \ + statement("template"); \ + statement("static T op(T value, uint lid)"); \ + begin_scope(); \ + statement("// Here, we care if any of the lanes in the quad are active."); \ + statement("uint quad_mask = extract_bits(as_type((simd_vote::vote_t)simd_active_threads_mask())[(lid ^ offset) / 32], ((lid ^ offset) % 32) & ~3, 4);"); \ + statement("if (!quad_mask)"); \ + statement(" return " #ident ";"); \ + statement("// But we need to make sure we shuffle from an active lane."); \ + if (msl_options.use_quadgroup_operation()) \ + SPIRV_CROSS_THROW("Subgroup size with quadgroup operation cannot exceed 4."); \ + else \ + statement("return simd_shuffle(quad_" #msl "(value), ((lid ^ offset) & ~3) | ctz(quad_mask));"); \ + end_scope(); \ + end_scope_decl(); \ + statement(""); \ + statement("// General case"); \ + statement("template"); \ + statement("struct spvClustered" #spv "Detail"); \ + begin_scope(); \ + statement("template"); \ + statement("static T op(T value, uint lid)"); \ + begin_scope(); \ + statement("return " combine(msl, op, "spvClustered" #spv "Detail::op(value, lid)", "spvClustered" #spv "Detail::op(value, lid)") ";"); \ + end_scope(); \ + end_scope_decl(); \ + statement(""); \ + statement("template"); \ + statement("T spvClustered_" #msl "(T value, uint lid)"); \ + begin_scope(); \ + statement("return spvClustered" #spv "Detail::op(value, lid);"); \ + end_scope(); \ + statement(""); \ + break +#define BINOP(msl, op, l, r) l " " #op " " r +#define BINFUNC(msl, op, l, r) #msl "(" l ", " r ")" + + FUNC_SUBGROUP_CLUSTERED(Add, sum, BINOP, +, 0); + FUNC_SUBGROUP_CLUSTERED(Mul, product, BINOP, *, 1); + FUNC_SUBGROUP_CLUSTERED(Min, min, BINFUNC, , numeric_limits::max()); + FUNC_SUBGROUP_CLUSTERED(Max, max, BINFUNC, , 
numeric_limits::min()); + FUNC_SUBGROUP_CLUSTERED(And, and, BINOP, &, ~T(0)); + FUNC_SUBGROUP_CLUSTERED(Or, or, BINOP, |, 0); + FUNC_SUBGROUP_CLUSTERED(Xor, xor, BINOP, ^, 0); + // clang-format on +#undef FUNC_SUBGROUP_CLUSTERED +#undef BINOP +#undef BINFUNC + case SPVFuncImplQuadBroadcast: statement("template"); statement("inline T spvQuadBroadcast(T value, uint lane)"); @@ -6925,8 +7302,8 @@ void CompilerMSL::emit_custom_functions() "samp, float2 coord, LodOptions... options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); - statement("ycbcr.br = plane1.sample(samp, coord, spvForward(options)...).rg;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); + statement("ycbcr.br = plane1.sample(samp, coord, options...).rg;"); statement("return ycbcr;"); end_scope(); statement(""); @@ -6938,9 +7315,9 @@ void CompilerMSL::emit_custom_functions() "texture2d plane2, sampler samp, float2 coord, LodOptions... options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); - statement("ycbcr.b = plane1.sample(samp, coord, spvForward(options)...).r;"); - statement("ycbcr.r = plane2.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); + statement("ycbcr.b = plane1.sample(samp, coord, options...).r;"); + statement("ycbcr.r = plane2.sample(samp, coord, options...).r;"); statement("return ycbcr;"); end_scope(); statement(""); @@ -6952,15 +7329,15 @@ void CompilerMSL::emit_custom_functions() "plane1, sampler samp, float2 coord, LodOptions... options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); statement("if (fract(coord.x * plane1.get_width()) != 0.0)"); begin_scope(); - statement("ycbcr.br = vec(mix(plane1.sample(samp, coord, spvForward(options)...), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 0)), 0.5).rg);"); + statement("ycbcr.br = vec(mix(plane1.sample(samp, coord, options...), " + "plane1.sample(samp, coord, options..., int2(1, 0)), 0.5).rg);"); end_scope(); statement("else"); begin_scope(); - statement("ycbcr.br = plane1.sample(samp, coord, spvForward(options)...).rg;"); + statement("ycbcr.br = plane1.sample(samp, coord, options...).rg;"); end_scope(); statement("return ycbcr;"); end_scope(); @@ -6973,18 +7350,18 @@ void CompilerMSL::emit_custom_functions() "plane1, texture2d plane2, sampler samp, float2 coord, LodOptions... 
options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); statement("if (fract(coord.x * plane1.get_width()) != 0.0)"); begin_scope(); - statement("ycbcr.b = T(mix(plane1.sample(samp, coord, spvForward(options)...), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 0)), 0.5).r);"); - statement("ycbcr.r = T(mix(plane2.sample(samp, coord, spvForward(options)...), " - "plane2.sample(samp, coord, spvForward(options)..., int2(1, 0)), 0.5).r);"); + statement("ycbcr.b = T(mix(plane1.sample(samp, coord, options...), " + "plane1.sample(samp, coord, options..., int2(1, 0)), 0.5).r);"); + statement("ycbcr.r = T(mix(plane2.sample(samp, coord, options...), " + "plane2.sample(samp, coord, options..., int2(1, 0)), 0.5).r);"); end_scope(); statement("else"); begin_scope(); - statement("ycbcr.b = plane1.sample(samp, coord, spvForward(options)...).r;"); - statement("ycbcr.r = plane2.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.b = plane1.sample(samp, coord, options...).r;"); + statement("ycbcr.r = plane2.sample(samp, coord, options...).r;"); end_scope(); statement("return ycbcr;"); end_scope(); @@ -6997,10 +7374,10 @@ void CompilerMSL::emit_custom_functions() "plane1, sampler samp, float2 coord, LodOptions... options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); statement("int2 offs = int2(fract(coord.x * plane1.get_width()) != 0.0 ? 1 : -1, 0);"); - statement("ycbcr.br = vec(mix(plane1.sample(samp, coord, spvForward(options)...), " - "plane1.sample(samp, coord, spvForward(options)..., offs), 0.25).rg);"); + statement("ycbcr.br = vec(mix(plane1.sample(samp, coord, options...), " + "plane1.sample(samp, coord, options..., offs), 0.25).rg);"); statement("return ycbcr;"); end_scope(); statement(""); @@ -7012,12 +7389,12 @@ void CompilerMSL::emit_custom_functions() "plane1, texture2d plane2, sampler samp, float2 coord, LodOptions... options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); statement("int2 offs = int2(fract(coord.x * plane1.get_width()) != 0.0 ? 1 : -1, 0);"); - statement("ycbcr.b = T(mix(plane1.sample(samp, coord, spvForward(options)...), " - "plane1.sample(samp, coord, spvForward(options)..., offs), 0.25).r);"); - statement("ycbcr.r = T(mix(plane2.sample(samp, coord, spvForward(options)...), " - "plane2.sample(samp, coord, spvForward(options)..., offs), 0.25).r);"); + statement("ycbcr.b = T(mix(plane1.sample(samp, coord, options...), " + "plane1.sample(samp, coord, options..., offs), 0.25).r);"); + statement("ycbcr.r = T(mix(plane2.sample(samp, coord, options...), " + "plane2.sample(samp, coord, options..., offs), 0.25).r);"); statement("return ycbcr;"); end_scope(); statement(""); @@ -7029,12 +7406,12 @@ void CompilerMSL::emit_custom_functions() "texture2d plane1, sampler samp, float2 coord, LodOptions... 
options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); statement("float2 ab = fract(round(coord * float2(plane0.get_width(), plane0.get_height())) * 0.5);"); - statement("ycbcr.br = vec(mix(mix(plane1.sample(samp, coord, spvForward(options)...), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 0)), ab.x), " - "mix(plane1.sample(samp, coord, spvForward(options)..., int2(0, 1)), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 1)), ab.x), ab.y).rg);"); + statement("ycbcr.br = vec(mix(mix(plane1.sample(samp, coord, options...), " + "plane1.sample(samp, coord, options..., int2(1, 0)), ab.x), " + "mix(plane1.sample(samp, coord, options..., int2(0, 1)), " + "plane1.sample(samp, coord, options..., int2(1, 1)), ab.x), ab.y).rg);"); statement("return ycbcr;"); end_scope(); statement(""); @@ -7046,16 +7423,16 @@ void CompilerMSL::emit_custom_functions() "texture2d plane1, texture2d plane2, sampler samp, float2 coord, LodOptions... options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); statement("float2 ab = fract(round(coord * float2(plane0.get_width(), plane0.get_height())) * 0.5);"); - statement("ycbcr.b = T(mix(mix(plane1.sample(samp, coord, spvForward(options)...), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 0)), ab.x), " - "mix(plane1.sample(samp, coord, spvForward(options)..., int2(0, 1)), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 1)), ab.x), ab.y).r);"); - statement("ycbcr.r = T(mix(mix(plane2.sample(samp, coord, spvForward(options)...), " - "plane2.sample(samp, coord, spvForward(options)..., int2(1, 0)), ab.x), " - "mix(plane2.sample(samp, coord, spvForward(options)..., int2(0, 1)), " - "plane2.sample(samp, coord, spvForward(options)..., int2(1, 1)), ab.x), ab.y).r);"); + statement("ycbcr.b = T(mix(mix(plane1.sample(samp, coord, options...), " + "plane1.sample(samp, coord, options..., int2(1, 0)), ab.x), " + "mix(plane1.sample(samp, coord, options..., int2(0, 1)), " + "plane1.sample(samp, coord, options..., int2(1, 1)), ab.x), ab.y).r);"); + statement("ycbcr.r = T(mix(mix(plane2.sample(samp, coord, options...), " + "plane2.sample(samp, coord, options..., int2(1, 0)), ab.x), " + "mix(plane2.sample(samp, coord, options..., int2(0, 1)), " + "plane2.sample(samp, coord, options..., int2(1, 1)), ab.x), ab.y).r);"); statement("return ycbcr;"); end_scope(); statement(""); @@ -7067,13 +7444,13 @@ void CompilerMSL::emit_custom_functions() "texture2d plane1, sampler samp, float2 coord, LodOptions... 
options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); statement("float2 ab = fract((round(coord * float2(plane0.get_width(), plane0.get_height())) - float2(0.5, " "0)) * 0.5);"); - statement("ycbcr.br = vec(mix(mix(plane1.sample(samp, coord, spvForward(options)...), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 0)), ab.x), " - "mix(plane1.sample(samp, coord, spvForward(options)..., int2(0, 1)), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 1)), ab.x), ab.y).rg);"); + statement("ycbcr.br = vec(mix(mix(plane1.sample(samp, coord, options...), " + "plane1.sample(samp, coord, options..., int2(1, 0)), ab.x), " + "mix(plane1.sample(samp, coord, options..., int2(0, 1)), " + "plane1.sample(samp, coord, options..., int2(1, 1)), ab.x), ab.y).rg);"); statement("return ycbcr;"); end_scope(); statement(""); @@ -7085,17 +7462,17 @@ void CompilerMSL::emit_custom_functions() "texture2d plane1, texture2d plane2, sampler samp, float2 coord, LodOptions... options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); statement("float2 ab = fract((round(coord * float2(plane0.get_width(), plane0.get_height())) - float2(0.5, " "0)) * 0.5);"); - statement("ycbcr.b = T(mix(mix(plane1.sample(samp, coord, spvForward(options)...), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 0)), ab.x), " - "mix(plane1.sample(samp, coord, spvForward(options)..., int2(0, 1)), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 1)), ab.x), ab.y).r);"); - statement("ycbcr.r = T(mix(mix(plane2.sample(samp, coord, spvForward(options)...), " - "plane2.sample(samp, coord, spvForward(options)..., int2(1, 0)), ab.x), " - "mix(plane2.sample(samp, coord, spvForward(options)..., int2(0, 1)), " - "plane2.sample(samp, coord, spvForward(options)..., int2(1, 1)), ab.x), ab.y).r);"); + statement("ycbcr.b = T(mix(mix(plane1.sample(samp, coord, options...), " + "plane1.sample(samp, coord, options..., int2(1, 0)), ab.x), " + "mix(plane1.sample(samp, coord, options..., int2(0, 1)), " + "plane1.sample(samp, coord, options..., int2(1, 1)), ab.x), ab.y).r);"); + statement("ycbcr.r = T(mix(mix(plane2.sample(samp, coord, options...), " + "plane2.sample(samp, coord, options..., int2(1, 0)), ab.x), " + "mix(plane2.sample(samp, coord, options..., int2(0, 1)), " + "plane2.sample(samp, coord, options..., int2(1, 1)), ab.x), ab.y).r);"); statement("return ycbcr;"); end_scope(); statement(""); @@ -7107,13 +7484,13 @@ void CompilerMSL::emit_custom_functions() "texture2d plane1, sampler samp, float2 coord, LodOptions... 
options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); statement("float2 ab = fract((round(coord * float2(plane0.get_width(), plane0.get_height())) - float2(0, " "0.5)) * 0.5);"); - statement("ycbcr.br = vec(mix(mix(plane1.sample(samp, coord, spvForward(options)...), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 0)), ab.x), " - "mix(plane1.sample(samp, coord, spvForward(options)..., int2(0, 1)), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 1)), ab.x), ab.y).rg);"); + statement("ycbcr.br = vec(mix(mix(plane1.sample(samp, coord, options...), " + "plane1.sample(samp, coord, options..., int2(1, 0)), ab.x), " + "mix(plane1.sample(samp, coord, options..., int2(0, 1)), " + "plane1.sample(samp, coord, options..., int2(1, 1)), ab.x), ab.y).rg);"); statement("return ycbcr;"); end_scope(); statement(""); @@ -7125,17 +7502,17 @@ void CompilerMSL::emit_custom_functions() "texture2d plane1, texture2d plane2, sampler samp, float2 coord, LodOptions... options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); statement("float2 ab = fract((round(coord * float2(plane0.get_width(), plane0.get_height())) - float2(0, " "0.5)) * 0.5);"); - statement("ycbcr.b = T(mix(mix(plane1.sample(samp, coord, spvForward(options)...), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 0)), ab.x), " - "mix(plane1.sample(samp, coord, spvForward(options)..., int2(0, 1)), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 1)), ab.x), ab.y).r);"); - statement("ycbcr.r = T(mix(mix(plane2.sample(samp, coord, spvForward(options)...), " - "plane2.sample(samp, coord, spvForward(options)..., int2(1, 0)), ab.x), " - "mix(plane2.sample(samp, coord, spvForward(options)..., int2(0, 1)), " - "plane2.sample(samp, coord, spvForward(options)..., int2(1, 1)), ab.x), ab.y).r);"); + statement("ycbcr.b = T(mix(mix(plane1.sample(samp, coord, options...), " + "plane1.sample(samp, coord, options..., int2(1, 0)), ab.x), " + "mix(plane1.sample(samp, coord, options..., int2(0, 1)), " + "plane1.sample(samp, coord, options..., int2(1, 1)), ab.x), ab.y).r);"); + statement("ycbcr.r = T(mix(mix(plane2.sample(samp, coord, options...), " + "plane2.sample(samp, coord, options..., int2(1, 0)), ab.x), " + "mix(plane2.sample(samp, coord, options..., int2(0, 1)), " + "plane2.sample(samp, coord, options..., int2(1, 1)), ab.x), ab.y).r);"); statement("return ycbcr;"); end_scope(); statement(""); @@ -7147,13 +7524,13 @@ void CompilerMSL::emit_custom_functions() "texture2d plane1, sampler samp, float2 coord, LodOptions... 
options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); statement("float2 ab = fract((round(coord * float2(plane0.get_width(), plane0.get_height())) - float2(0.5, " "0.5)) * 0.5);"); - statement("ycbcr.br = vec(mix(mix(plane1.sample(samp, coord, spvForward(options)...), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 0)), ab.x), " - "mix(plane1.sample(samp, coord, spvForward(options)..., int2(0, 1)), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 1)), ab.x), ab.y).rg);"); + statement("ycbcr.br = vec(mix(mix(plane1.sample(samp, coord, options...), " + "plane1.sample(samp, coord, options..., int2(1, 0)), ab.x), " + "mix(plane1.sample(samp, coord, options..., int2(0, 1)), " + "plane1.sample(samp, coord, options..., int2(1, 1)), ab.x), ab.y).rg);"); statement("return ycbcr;"); end_scope(); statement(""); @@ -7165,17 +7542,17 @@ void CompilerMSL::emit_custom_functions() "texture2d plane1, texture2d plane2, sampler samp, float2 coord, LodOptions... options)"); begin_scope(); statement("vec ycbcr = vec(0, 0, 0, 1);"); - statement("ycbcr.g = plane0.sample(samp, coord, spvForward(options)...).r;"); + statement("ycbcr.g = plane0.sample(samp, coord, options...).r;"); statement("float2 ab = fract((round(coord * float2(plane0.get_width(), plane0.get_height())) - float2(0.5, " "0.5)) * 0.5);"); - statement("ycbcr.b = T(mix(mix(plane1.sample(samp, coord, spvForward(options)...), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 0)), ab.x), " - "mix(plane1.sample(samp, coord, spvForward(options)..., int2(0, 1)), " - "plane1.sample(samp, coord, spvForward(options)..., int2(1, 1)), ab.x), ab.y).r);"); - statement("ycbcr.r = T(mix(mix(plane2.sample(samp, coord, spvForward(options)...), " - "plane2.sample(samp, coord, spvForward(options)..., int2(1, 0)), ab.x), " - "mix(plane2.sample(samp, coord, spvForward(options)..., int2(0, 1)), " - "plane2.sample(samp, coord, spvForward(options)..., int2(1, 1)), ab.x), ab.y).r);"); + statement("ycbcr.b = T(mix(mix(plane1.sample(samp, coord, options...), " + "plane1.sample(samp, coord, options..., int2(1, 0)), ab.x), " + "mix(plane1.sample(samp, coord, options..., int2(0, 1)), " + "plane1.sample(samp, coord, options..., int2(1, 1)), ab.x), ab.y).r);"); + statement("ycbcr.r = T(mix(mix(plane2.sample(samp, coord, options...), " + "plane2.sample(samp, coord, options..., int2(1, 0)), ab.x), " + "mix(plane2.sample(samp, coord, options..., int2(0, 1)), " + "plane2.sample(samp, coord, options..., int2(1, 1)), ab.x), ab.y).r);"); statement("return ycbcr;"); end_scope(); statement(""); @@ -7457,10 +7834,8 @@ void CompilerMSL::emit_custom_functions() statement(" ycbcr_samp.get_chroma_filter() == spvChromaFilter::nearest)"); begin_scope(); statement("if (!is_null_texture(plane2))"); - statement(" return spvChromaReconstructNearest(plane0, plane1, plane2, samp, coord,"); - statement(" spvForward(options)...);"); - statement( - "return spvChromaReconstructNearest(plane0, plane1, samp, coord, spvForward(options)...);"); + statement(" return spvChromaReconstructNearest(plane0, plane1, plane2, samp, coord, options...);"); + statement("return spvChromaReconstructNearest(plane0, plane1, samp, coord, options...);"); end_scope(); // if (resolution == 422 || chroma_filter == nearest) statement("switch (ycbcr_samp.get_resolution())"); begin_scope(); @@ -7473,18 +7848,18 @@ void 
CompilerMSL::emit_custom_functions() statement(" if (!is_null_texture(plane2))"); statement(" return spvChromaReconstructLinear422CositedEven("); statement(" plane0, plane1, plane2, samp,"); - statement(" coord, spvForward(options)...);"); + statement(" coord, options...);"); statement(" return spvChromaReconstructLinear422CositedEven("); statement(" plane0, plane1, samp, coord,"); - statement(" spvForward(options)...);"); + statement(" options...);"); statement("case spvXChromaLocation::midpoint:"); statement(" if (!is_null_texture(plane2))"); statement(" return spvChromaReconstructLinear422Midpoint("); statement(" plane0, plane1, plane2, samp,"); - statement(" coord, spvForward(options)...);"); + statement(" coord, options...);"); statement(" return spvChromaReconstructLinear422Midpoint("); statement(" plane0, plane1, samp, coord,"); - statement(" spvForward(options)...);"); + statement(" options...);"); end_scope(); // switch (x_chroma_offset) end_scope(); // case 422: statement("case spvFormatResolution::_420:"); @@ -7499,18 +7874,18 @@ void CompilerMSL::emit_custom_functions() statement(" if (!is_null_texture(plane2))"); statement(" return spvChromaReconstructLinear420XCositedEvenYCositedEven("); statement(" plane0, plane1, plane2, samp,"); - statement(" coord, spvForward(options)...);"); + statement(" coord, options...);"); statement(" return spvChromaReconstructLinear420XCositedEvenYCositedEven("); statement(" plane0, plane1, samp, coord,"); - statement(" spvForward(options)...);"); + statement(" options...);"); statement("case spvYChromaLocation::midpoint:"); statement(" if (!is_null_texture(plane2))"); statement(" return spvChromaReconstructLinear420XCositedEvenYMidpoint("); statement(" plane0, plane1, plane2, samp,"); - statement(" coord, spvForward(options)...);"); + statement(" coord, options...);"); statement(" return spvChromaReconstructLinear420XCositedEvenYMidpoint("); statement(" plane0, plane1, samp, coord,"); - statement(" spvForward(options)...);"); + statement(" options...);"); end_scope(); // switch (y_chroma_offset) end_scope(); // case x::cosited_even: statement("case spvXChromaLocation::midpoint:"); @@ -7521,31 +7896,30 @@ void CompilerMSL::emit_custom_functions() statement(" if (!is_null_texture(plane2))"); statement(" return spvChromaReconstructLinear420XMidpointYCositedEven("); statement(" plane0, plane1, plane2, samp,"); - statement(" coord, spvForward(options)...);"); + statement(" coord, options...);"); statement(" return spvChromaReconstructLinear420XMidpointYCositedEven("); statement(" plane0, plane1, samp, coord,"); - statement(" spvForward(options)...);"); + statement(" options...);"); statement("case spvYChromaLocation::midpoint:"); statement(" if (!is_null_texture(plane2))"); statement(" return spvChromaReconstructLinear420XMidpointYMidpoint("); statement(" plane0, plane1, plane2, samp,"); - statement(" coord, spvForward(options)...);"); + statement(" coord, options...);"); statement(" return spvChromaReconstructLinear420XMidpointYMidpoint("); statement(" plane0, plane1, samp, coord,"); - statement(" spvForward(options)...);"); + statement(" options...);"); end_scope(); // switch (y_chroma_offset) end_scope(); // case x::midpoint end_scope(); // switch (x_chroma_offset) end_scope(); // case 420: end_scope(); // switch (resolution) end_scope(); // if (multiplanar) - statement("return plane0.sample(samp, coord, spvForward(options)...);"); + statement("return plane0.sample(samp, coord, options...);"); end_scope(); // do_sample() statement("template "); 
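// Why spvForward can be dropped throughout these generated helpers: the variadic
// LodOptions arguments (bias(), level(), gradient2d(), min_lod_clamp(), ...) are small
// value types, so plain pack expansion with `options...` forwards them just as well as
// the old perfect-forwarding shim did. A minimal sketch of the pattern in generated
// MSL, using a hypothetical helper name for illustration only:

template <typename T, typename... LodOptions>
inline vec<T, 4> sample_plane(texture2d<T> plane, sampler samp, float2 coord,
                              LodOptions... options)
{
    // Re-expand the pack by value; no forwarding helper is required.
    return plane.sample(samp, coord, options...);
}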
statement("vec sample(float2 coord, LodOptions... options) const thread"); begin_scope(); - statement( - "vec s = spvTextureSwizzle(do_sample(coord, spvForward(options)...), swizzle);"); + statement("vec s = spvTextureSwizzle(do_sample(coord, options...), swizzle);"); statement("if (ycbcr_samp.get_ycbcr_model() == spvYCbCrModelConversion::rgb_identity)"); statement(" return s;"); statement(""); @@ -7784,6 +8158,26 @@ void CompilerMSL::emit_custom_functions() statement(""); break; + case SPVFuncImplAssume: + statement_no_indent("#if defined(__has_builtin)"); + statement_no_indent("#if !defined(SPV_ASSUME) && __has_builtin(__builtin_assume)"); + statement_no_indent("#define SPV_ASSUME(x) __builtin_assume(x);"); + statement_no_indent("#endif"); + statement_no_indent("#if !defined(SPV_EXPECT) && __has_builtin(__builtin_expect)"); + statement_no_indent("#define SPV_EXPECT(x, y) __builtin_expect(x, y);"); + statement_no_indent("#endif"); + statement_no_indent("#endif"); + + statement_no_indent("#ifndef SPV_ASSUME"); + statement_no_indent("#define SPV_ASSUME(x)"); + statement_no_indent("#endif"); + + statement_no_indent("#ifndef SPV_EXPECT"); + statement_no_indent("#define SPV_EXPECT(x, y) x"); + statement_no_indent("#endif"); + + break; + default: break; } @@ -7909,8 +8303,16 @@ void CompilerMSL::emit_specialization_constants_and_structs() { SpecializationConstant wg_x, wg_y, wg_z; ID workgroup_size_id = get_work_group_size_specialization_constants(wg_x, wg_y, wg_z); - bool emitted = false; + if (workgroup_size_id == 0 && is_mesh_shader()) + { + auto &execution = get_entry_point(); + statement("constant uint3 ", builtin_to_glsl(BuiltInWorkgroupSize, StorageClassWorkgroup), + " [[maybe_unused]] = ", "uint3(", execution.workgroup_size.x, ", ", execution.workgroup_size.y, ", ", + execution.workgroup_size.z, ");"); + statement(""); + } + bool emitted = false; unordered_set declared_structs; unordered_set aligned_structs; @@ -8015,14 +8417,18 @@ void CompilerMSL::emit_specialization_constants_and_structs() else if (has_decoration(c.self, DecorationSpecId)) { // Fallback to macro overrides. + uint32_t constant_id = get_decoration(c.self, DecorationSpecId); c.specialization_constant_macro_name = - constant_value_macro_name(get_decoration(c.self, DecorationSpecId)); + constant_value_macro_name(constant_id); statement("#ifndef ", c.specialization_constant_macro_name); statement("#define ", c.specialization_constant_macro_name, " ", constant_expression(c)); statement("#endif"); statement("constant ", sc_type_name, " ", sc_name, " = ", c.specialization_constant_macro_name, ";"); + + // Record the usage of macro + constant_macro_ids.insert(constant_id); } else { @@ -8897,7 +9303,7 @@ void CompilerMSL::fix_up_interpolant_access_chain(const uint32_t *ops, uint32_t // If the physical type of a physical buffer pointer has been changed // to a ulong or ulongn vector, add a cast back to the pointer type. 
-void CompilerMSL::check_physical_type_cast(std::string &expr, const SPIRType *type, uint32_t physical_type) +bool CompilerMSL::check_physical_type_cast(std::string &expr, const SPIRType *type, uint32_t physical_type) { auto *p_physical_type = maybe_get(physical_type); if (p_physical_type && @@ -8908,7 +9314,10 @@ void CompilerMSL::check_physical_type_cast(std::string &expr, const SPIRType *ty expr += ".x"; expr = join("((", type_to_glsl(*type), ")", expr, ")"); + return true; } + + return false; } // Override for MSL-specific syntax instructions @@ -9124,6 +9533,10 @@ void CompilerMSL::emit_instruction(const Instruction &instruction) break; } + case OpSMod: + MSL_BFOP(spvSMod); + break; + case OpFRem: MSL_BFOP(fmod); break; @@ -9594,6 +10007,9 @@ void CompilerMSL::emit_instruction(const Instruction &instruction) if (needs_frag_discard_checks() && (type.storage == StorageClassStorageBuffer || type.storage == StorageClassUniform)) end_scope(); + if (has_decoration(ops[0], DecorationBuiltIn) && get_decoration(ops[0], DecorationBuiltIn) == BuiltInPointSize) + writes_to_point_size = true; + break; } @@ -9604,9 +10020,9 @@ void CompilerMSL::emit_instruction(const Instruction &instruction) case OpControlBarrier: // In GLSL a memory barrier is often followed by a control barrier. - // But in MSL, memory barriers are also control barriers, so don't + // But in MSL, memory barriers are also control barriers (before MSL 3.2), so don't // emit a simple control barrier if a memory barrier has just been emitted. - if (previous_instruction_opcode != OpMemoryBarrier) + if (previous_instruction_opcode != OpMemoryBarrier || msl_options.supports_msl_version(3, 2)) emit_barrier(ops[0], ops[1], ops[2]); break; @@ -9971,11 +10387,11 @@ void CompilerMSL::emit_instruction(const Instruction &instruction) auto &type = get(ops[0]); auto &input_type = expression_type(ops[2]); - if (opcode != OpBitcast || type.pointer || input_type.pointer) + if (opcode != OpBitcast || is_pointer(type) || is_pointer(input_type)) { string op; - if (type.vecsize == 1 && input_type.vecsize == 1) + if ((type.vecsize == 1 || is_pointer(type)) && (input_type.vecsize == 1 || is_pointer(input_type))) op = join("reinterpret_cast<", type_to_glsl(type), ">(", to_unpacked_expression(ops[2]), ")"); else if (input_type.vecsize == 2) op = join("reinterpret_cast<", type_to_glsl(type), ">(as_type(", to_unpacked_expression(ops[2]), "))"); @@ -10125,6 +10541,27 @@ void CompilerMSL::emit_instruction(const Instruction &instruction) break; } + case OpAssumeTrueKHR: + { + auto condition = ops[0]; + statement(join("SPV_ASSUME(", to_unpacked_expression(condition), ")")); + break; + } + + case OpExpectKHR: + { + auto result_type = ops[0]; + auto ret = ops[1]; + auto value = ops[2]; + auto exp_value = ops[3]; + + auto exp = join("SPV_EXPECT(", to_unpacked_expression(value), ", ", to_unpacked_expression(exp_value), ")"); + emit_op(result_type, ret, exp, should_forward(value), should_forward(exp_value)); + inherit_expression_dependencies(ret, value); + inherit_expression_dependencies(ret, exp_value); + break; + } + default: CompilerGLSL::emit_instruction(instruction); break; @@ -10184,10 +10621,20 @@ void CompilerMSL::emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uin return; string bar_stmt; - if ((msl_options.is_ios() && msl_options.supports_msl_version(1, 2)) || msl_options.supports_msl_version(2)) - bar_stmt = exe_scope < ScopeSubgroup ? 
"threadgroup_barrier" : "simdgroup_barrier"; + + if (!id_exe_scope && msl_options.supports_msl_version(3, 2)) + { + // Just took 10 years to get a proper barrier, but hey! + bar_stmt = "atomic_thread_fence"; + } else - bar_stmt = "threadgroup_barrier"; + { + if ((msl_options.is_ios() && msl_options.supports_msl_version(1, 2)) || msl_options.supports_msl_version(2)) + bar_stmt = exe_scope < ScopeSubgroup ? "threadgroup_barrier" : "simdgroup_barrier"; + else + bar_stmt = "threadgroup_barrier"; + } + bar_stmt += "("; uint32_t mem_sem = id_mem_sem ? evaluate_constant_u32(id_mem_sem) : uint32_t(MemorySemanticsMaskNone); @@ -10195,7 +10642,8 @@ void CompilerMSL::emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uin // Use the | operator to combine flags if we can. if (msl_options.supports_msl_version(1, 2)) { - string mem_flags = ""; + string mem_flags; + // For tesc shaders, this also affects objects in the Output storage class. // Since in Metal, these are placed in a device buffer, we have to sync device memory here. if (is_tesc_shader() || @@ -10236,6 +10684,55 @@ void CompilerMSL::emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uin bar_stmt += "mem_flags::mem_none"; } + if (!id_exe_scope && msl_options.supports_msl_version(3, 2)) + { + // If there's no device-related memory in the barrier, demote to workgroup scope. + // glslang seems to emit device scope even for memoryBarrierShared(). + if (mem_scope == ScopeDevice && + (mem_sem & (MemorySemanticsUniformMemoryMask | + MemorySemanticsImageMemoryMask | + MemorySemanticsCrossWorkgroupMemoryMask)) == 0) + { + mem_scope = ScopeWorkgroup; + } + + // MSL 3.2 only supports seq_cst or relaxed. + if (mem_sem & (MemorySemanticsAcquireReleaseMask | + MemorySemanticsAcquireMask | + MemorySemanticsReleaseMask | + MemorySemanticsSequentiallyConsistentMask)) + { + bar_stmt += ", memory_order_seq_cst"; + } + else + { + bar_stmt += ", memory_order_relaxed"; + } + + switch (mem_scope) + { + case ScopeDevice: + bar_stmt += ", thread_scope_device"; + break; + + case ScopeWorkgroup: + bar_stmt += ", thread_scope_threadgroup"; + break; + + case ScopeSubgroup: + bar_stmt += ", thread_scope_subgroup"; + break; + + case ScopeInvocation: + bar_stmt += ", thread_scope_thread"; + break; + + default: + // The default argument is device, which is conservative. + break; + } + } + bar_stmt += ");"; statement(bar_stmt); @@ -11337,7 +11834,6 @@ string CompilerMSL::to_function_name(const TextureFunctionNameArguments &args) { bool is_compare = comparison_ids.count(img); add_spv_func_and_recompile(is_compare ? SPVFuncImplGatherCompareConstOffsets : SPVFuncImplGatherConstOffsets); - add_spv_func_and_recompile(SPVFuncImplForwardArgs); return is_compare ? "spvGatherCompareConstOffsets" : "spvGatherConstOffsets"; } @@ -12310,7 +12806,7 @@ string CompilerMSL::to_func_call_arg(const SPIRFunction::Parameter &arg, uint32_ } // Dereference pointer variables where needed. // FIXME: This dereference is actually backwards. We should really just support passing pointer variables between functions. 
- else if (should_dereference(id)) + else if (should_dereference_caller_param(id)) arg_str += dereference_expression(type, CompilerGLSL::to_func_call_arg(arg, id)); else arg_str += CompilerGLSL::to_func_call_arg(arg, id); @@ -12564,6 +13060,9 @@ void CompilerMSL::emit_fixup() { if (is_vertex_like_shader() && stage_out_var_id && !qual_pos_var_name.empty() && !capture_output_to_buffer) { + if (msl_options.enable_point_size_default && !writes_to_point_size) + statement(builtin_to_glsl(BuiltInPointSize, StorageClassOutput), " = ", format_float(msl_options.default_point_size), ";"); + if (options.vertex.fixup_clipspace) statement(qual_pos_var_name, ".z = (", qual_pos_var_name, ".z + ", qual_pos_var_name, ".w) * 0.5; // Adjust clip-space for Metal"); @@ -13398,12 +13897,23 @@ bool CompilerMSL::uses_explicit_early_fragment_test() string CompilerMSL::get_argument_address_space(const SPIRVariable &argument) { const auto &type = get(argument.basetype); + // BDA is always passed around by value. There is no storage class for the argument itself. + if (is_physical_pointer(type)) + return ""; return get_type_address_space(type, argument.self, true); } -bool CompilerMSL::decoration_flags_signal_volatile(const Bitset &flags) +bool CompilerMSL::decoration_flags_signal_volatile(const Bitset &flags) const { - return flags.get(DecorationVolatile) || flags.get(DecorationCoherent); + // Using volatile for coherent pre-3.2 is definitely not correct, but it's something. + // MSL 3.2 adds actual coherent qualifiers. + return flags.get(DecorationVolatile) || + (flags.get(DecorationCoherent) && !msl_options.supports_msl_version(3, 2)); +} + +bool CompilerMSL::decoration_flags_signal_coherent(const Bitset &flags) const +{ + return flags.get(DecorationCoherent) && msl_options.supports_msl_version(3, 2); } string CompilerMSL::get_type_address_space(const SPIRType &type, uint32_t id, bool argument) @@ -13415,8 +13925,17 @@ string CompilerMSL::get_type_address_space(const SPIRType &type, uint32_t id, bo (has_decoration(type.self, DecorationBlock) || has_decoration(type.self, DecorationBufferBlock))) flags = get_buffer_block_flags(id); else + { flags = get_decoration_bitset(id); + if (type.basetype == SPIRType::Struct && + (has_decoration(type.self, DecorationBlock) || + has_decoration(type.self, DecorationBufferBlock))) + { + flags.merge_or(ir.get_buffer_block_type_flags(type)); + } + } + const char *addr_space = nullptr; switch (type.storage) { @@ -13425,7 +13944,6 @@ string CompilerMSL::get_type_address_space(const SPIRType &type, uint32_t id, bo break; case StorageClassStorageBuffer: - case StorageClassPhysicalStorageBuffer: { // For arguments from variable pointers, we use the write count deduction, so // we should not assume any constness here. Only for global SSBOs. @@ -13433,10 +13951,19 @@ string CompilerMSL::get_type_address_space(const SPIRType &type, uint32_t id, bo if (!var || has_decoration(type.self, DecorationBlock)) readonly = flags.get(DecorationNonWritable); + if (decoration_flags_signal_coherent(flags)) + readonly = false; + addr_space = readonly ? "const device" : "device"; break; } + case StorageClassPhysicalStorageBuffer: + // We cannot fully trust NonWritable coming from glslang due to a bug in buffer_reference handling. + // There isn't much gain in emitting const in C++ languages anyway. 
+ addr_space = "device"; + break; + case StorageClassUniform: case StorageClassUniformConstant: case StorageClassPushConstant: @@ -13525,7 +14052,9 @@ string CompilerMSL::get_type_address_space(const SPIRType &type, uint32_t id, bo addr_space = type.pointer || (argument && type.basetype == SPIRType::ControlPointArray) ? "thread" : ""; } - if (decoration_flags_signal_volatile(flags) && 0 != strcmp(addr_space, "thread")) + if (decoration_flags_signal_coherent(flags) && strcmp(addr_space, "device") == 0) + return join("coherent device"); + else if (decoration_flags_signal_volatile(flags) && strcmp(addr_space, "thread") != 0) return join("volatile ", addr_space); else return addr_space; @@ -15133,19 +15662,24 @@ const char *CompilerMSL::descriptor_address_space(uint32_t id, StorageClass stor string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg) { auto &var = get(arg.id); - auto &type = get_variable_data_type(var); auto &var_type = get(arg.type); StorageClass type_storage = var_type.storage; + // Physical pointer types are passed by pointer, not reference. + auto &data_type = get_variable_data_type(var); + bool passed_by_value = is_physical_pointer(var_type); + auto &type = passed_by_value ? var_type : data_type; + // If we need to modify the name of the variable, make sure we use the original variable. // Our alias is just a shadow variable. uint32_t name_id = var.self; if (arg.alias_global_variable && var.basevariable) name_id = var.basevariable; - bool constref = !arg.alias_global_variable && is_pointer(var_type) && arg.write_count == 0; + bool constref = !arg.alias_global_variable && !passed_by_value && is_pointer(var_type) && arg.write_count == 0; // Framebuffer fetch is plain value, const looks out of place, but it is not wrong. - if (type_is_msl_framebuffer_fetch(type)) + // readonly coming from glslang is not reliable in all cases. + if (type_is_msl_framebuffer_fetch(type) || type_storage == StorageClassPhysicalStorageBuffer) constref = false; else if (type_storage == StorageClassUniformConstant) constref = true; @@ -15174,8 +15708,7 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg) if (var.basevariable && (var.basevariable == stage_in_ptr_var_id || var.basevariable == stage_out_ptr_var_id)) decl = join(cv_qualifier, type_to_glsl(type, arg.id)); - else if (builtin && builtin_type != spv::BuiltInPrimitiveTriangleIndicesEXT && - builtin_type != spv::BuiltInPrimitiveLineIndicesEXT && builtin_type != spv::BuiltInPrimitivePointIndicesEXT) + else if (builtin && !is_mesh_shader()) { // Only use templated array for Clip/Cull distance when feasible. // In other scenarios, we need need to override array length for tess levels (if used as outputs), @@ -15227,7 +15760,7 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg) else { // The type is a pointer type we need to emit cv_qualifier late. - if (is_pointer(type)) + if (is_pointer(data_type)) { decl = type_to_glsl(type, arg.id); if (*cv_qualifier != '\0') @@ -15239,8 +15772,8 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg) } } - if (!builtin && !is_pointer(var_type) && - (type_storage == StorageClassFunction || type_storage == StorageClassGeneric)) + if (passed_by_value || (!builtin && !is_pointer(var_type) && + (type_storage == StorageClassFunction || type_storage == StorageClassGeneric))) { // If the argument is a pure value and not an opaque type, we will pass by value. 
if (msl_options.force_native_arrays && is_array(type)) @@ -15341,7 +15874,7 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg) // for the reference has to go before the '&', but after the '*'. if (!address_space.empty()) { - if (is_pointer(type)) + if (is_pointer(data_type)) { if (*cv_qualifier == '\0') decl += ' '; @@ -15350,6 +15883,7 @@ string CompilerMSL::argument_decl(const SPIRFunction::Parameter &arg) else decl = join(address_space, " ", decl); } + decl += "&"; decl += " "; decl += to_restrict(name_id, true); @@ -15584,6 +16118,9 @@ const std::unordered_set &CompilerMSL::get_illegal_func_names() { static const unordered_set illegal_func_names = { "main", + "fragment", + "vertex", + "kernel", "saturate", "assert", "fmin3", @@ -16091,6 +16628,9 @@ string CompilerMSL::constant_op_expression(const SPIRConstantOp &cop) { switch (cop.opcode) { + case OpSMod: + add_spv_func_and_recompile(SPVFuncImplSMod); + return join("spvSMod(", to_expression(cop.arguments[0]), ", ", to_expression(cop.arguments[1]), ")"); case OpQuantizeToF16: add_spv_func_and_recompile(SPVFuncImplQuantizeToF16); return join("spvQuantizeToF16(", to_expression(cop.arguments[0]), ")"); @@ -16367,6 +16907,12 @@ string CompilerMSL::image_type_glsl(const SPIRType &type, uint32_t id, bool memb // Otherwise it may be set based on whether the image is read from or written to within the shader. if (type.basetype == SPIRType::Image && type.image.sampled == 2 && type.image.dim != DimSubpassData) { + auto *p_var = maybe_get_backing_variable(id); + if (p_var && p_var->basevariable) + p_var = maybe_get(p_var->basevariable); + + bool has_access_qualifier = true; + switch (img_type.access) { case AccessQualifierReadOnly: @@ -16383,9 +16929,6 @@ string CompilerMSL::image_type_glsl(const SPIRType &type, uint32_t id, bool memb default: { - auto *p_var = maybe_get_backing_variable(id); - if (p_var && p_var->basevariable) - p_var = maybe_get(p_var->basevariable); if (p_var && !has_decoration(p_var->self, DecorationNonWritable)) { img_type_name += ", access::"; @@ -16395,9 +16938,21 @@ string CompilerMSL::image_type_glsl(const SPIRType &type, uint32_t id, bool memb img_type_name += "write"; } + else + { + has_access_qualifier = false; + } break; } } + + if (p_var && has_decoration(p_var->self, DecorationCoherent) && msl_options.supports_msl_version(3, 2)) + { + // Cannot declare memory_coherence_device without access qualifier. + if (!has_access_qualifier) + img_type_name += ", access::read"; + img_type_name += ", memory_coherence_device"; + } } img_type_name += ">"; @@ -16465,6 +17020,10 @@ void CompilerMSL::emit_subgroup_op(const Instruction &i) if (!msl_options.supports_msl_version(2, 2)) SPIRV_CROSS_THROW("Ballot ops on iOS requires Metal 2.2 and up."); break; + case OpGroupNonUniformRotateKHR: + if (!msl_options.supports_msl_version(2, 2)) + SPIRV_CROSS_THROW("Rotate on iOS requires Metal 2.2 and up."); + break; case OpGroupNonUniformBroadcast: case OpGroupNonUniformShuffle: case OpGroupNonUniformShuffleXor: @@ -16500,13 +17059,16 @@ void CompilerMSL::emit_subgroup_op(const Instruction &i) Scope scope; switch (op) { + // These earlier instructions don't have the scope operand. case OpSubgroupBallotKHR: case OpSubgroupFirstInvocationKHR: case OpSubgroupReadInvocationKHR: case OpSubgroupAllKHR: case OpSubgroupAnyKHR: case OpSubgroupAllEqualKHR: - // These earlier instructions don't have the scope operand. + // These instructions are always quad-scoped and thus do not have a scope operand. 
+ case OpGroupNonUniformQuadAllKHR: + case OpGroupNonUniformQuadAnyKHR: scope = ScopeSubgroup; break; default: @@ -16594,6 +17156,23 @@ void CompilerMSL::emit_subgroup_op(const Instruction &i) emit_binary_func_op(result_type, id, ops[op_idx], ops[op_idx + 1], "spvSubgroupShuffleDown"); break; + case OpGroupNonUniformRotateKHR: + { + if (i.length > 5) + { + // MSL does not have a cluster size parameter, so calculate the invocation ID manually and using a shuffle. + auto delta_expr = enclose_expression(to_unpacked_expression(ops[op_idx + 1])); + auto cluster_size_minus_one = evaluate_constant_u32(ops[op_idx + 2]) - 1; + auto local_id_expr = to_unpacked_expression(scope == ScopeSubgroup + ? builtin_subgroup_invocation_id_id : builtin_local_invocation_index_id); + auto shuffle_idx = join("((", local_id_expr, " + ", delta_expr, ")", " & ", std::to_string(cluster_size_minus_one), + ") + (", local_id_expr, " & ", std::to_string(~cluster_size_minus_one), ")"); + emit_op(result_type, id, join("spvSubgroupShuffle(", to_unpacked_expression(ops[op_idx]), ", ", shuffle_idx, ")"), false); + } else + emit_binary_func_op(result_type, id, ops[op_idx], ops[op_idx + 1], "spvSubgroupRotate"); + break; + } + case OpGroupNonUniformAll: case OpSubgroupAllKHR: if (msl_options.use_quadgroup_operation()) @@ -16628,11 +17207,10 @@ case OpGroupNonUniform##op: \ emit_unary_func_op(result_type, id, ops[op_idx], "simd_prefix_exclusive_" #msl_op); \ else if (operation == GroupOperationClusteredReduce) \ { \ - /* Only cluster sizes of 4 are supported. */ \ uint32_t cluster_size = evaluate_constant_u32(ops[op_idx + 1]); \ - if (cluster_size != 4) \ - SPIRV_CROSS_THROW("Metal only supports quad ClusteredReduce."); \ - emit_unary_func_op(result_type, id, ops[op_idx], "quad_" #msl_op); \ + if (get_execution_model() != ExecutionModelFragment || msl_options.supports_msl_version(2, 2)) \ + add_spv_func_and_recompile(SPVFuncImplSubgroupClustered##op); \ + emit_subgroup_cluster_op(result_type, id, cluster_size, ops[op_idx], #msl_op); \ } \ else \ SPIRV_CROSS_THROW("Invalid group operation."); \ @@ -16657,11 +17235,10 @@ case OpGroupNonUniform##op: \ SPIRV_CROSS_THROW("Metal doesn't support ExclusiveScan for OpGroupNonUniform" #op "."); \ else if (operation == GroupOperationClusteredReduce) \ { \ - /* Only cluster sizes of 4 are supported. */ \ uint32_t cluster_size = evaluate_constant_u32(ops[op_idx + 1]); \ - if (cluster_size != 4) \ - SPIRV_CROSS_THROW("Metal only supports quad ClusteredReduce."); \ - emit_unary_func_op(result_type, id, ops[op_idx], "quad_" #msl_op); \ + if (get_execution_model() != ExecutionModelFragment || msl_options.supports_msl_version(2, 2)) \ + add_spv_func_and_recompile(SPVFuncImplSubgroupClustered##op); \ + emit_subgroup_cluster_op(result_type, id, cluster_size, ops[op_idx], #msl_op); \ } \ else \ SPIRV_CROSS_THROW("Invalid group operation."); \ @@ -16680,11 +17257,10 @@ case OpGroupNonUniform##op: \ SPIRV_CROSS_THROW("Metal doesn't support ExclusiveScan for OpGroupNonUniform" #op "."); \ else if (operation == GroupOperationClusteredReduce) \ { \ - /* Only cluster sizes of 4 are supported. 
*/ \ uint32_t cluster_size = evaluate_constant_u32(ops[op_idx + 1]); \ - if (cluster_size != 4) \ - SPIRV_CROSS_THROW("Metal only supports quad ClusteredReduce."); \ - emit_unary_func_op_cast(result_type, id, ops[op_idx], "quad_" #msl_op, type, type); \ + if (get_execution_model() != ExecutionModelFragment || msl_options.supports_msl_version(2, 2)) \ + add_spv_func_and_recompile(SPVFuncImplSubgroupClustered##op); \ + emit_subgroup_cluster_op_cast(result_type, id, cluster_size, ops[op_idx], #msl_op, type, type); \ } \ else \ SPIRV_CROSS_THROW("Invalid group operation."); \ @@ -16700,9 +17276,11 @@ case OpGroupNonUniform##op: \ MSL_GROUP_OP(BitwiseAnd, and) MSL_GROUP_OP(BitwiseOr, or) MSL_GROUP_OP(BitwiseXor, xor) - MSL_GROUP_OP(LogicalAnd, and) - MSL_GROUP_OP(LogicalOr, or) - MSL_GROUP_OP(LogicalXor, xor) + // Metal doesn't support boolean types in SIMD-group operations, so we + // have to emit some casts. + MSL_GROUP_OP_CAST(LogicalAnd, and, SPIRType::UShort) + MSL_GROUP_OP_CAST(LogicalOr, or, SPIRType::UShort) + MSL_GROUP_OP_CAST(LogicalXor, xor, SPIRType::UShort) // clang-format on #undef MSL_GROUP_OP #undef MSL_GROUP_OP_CAST @@ -16715,6 +17293,14 @@ case OpGroupNonUniform##op: \ emit_binary_func_op(result_type, id, ops[op_idx], ops[op_idx + 1], "spvQuadBroadcast"); break; + case OpGroupNonUniformQuadAllKHR: + emit_unary_func_op(result_type, id, ops[op_idx], "quad_all"); + break; + + case OpGroupNonUniformQuadAnyKHR: + emit_unary_func_op(result_type, id, ops[op_idx], "quad_any"); + break; + default: SPIRV_CROSS_THROW("Invalid opcode for subgroup."); } @@ -16722,6 +17308,83 @@ case OpGroupNonUniform##op: \ register_control_dependent_expression(id); } +void CompilerMSL::emit_subgroup_cluster_op(uint32_t result_type, uint32_t result_id, uint32_t cluster_size, + uint32_t op0, const char *op) +{ + if (get_execution_model() == ExecutionModelFragment && !msl_options.supports_msl_version(2, 2)) + { + if (cluster_size == 4) + { + emit_unary_func_op(result_type, result_id, op0, join("quad_", op).c_str()); + return; + } + SPIRV_CROSS_THROW("Cluster sizes other than 4 in fragment shaders require MSL 2.2."); + } + bool forward = should_forward(op0); + emit_op(result_type, result_id, + join("spvClustered_", op, "<", cluster_size, ">(", to_unpacked_expression(op0), ", ", + to_expression(builtin_subgroup_invocation_id_id), ")"), + forward); + inherit_expression_dependencies(result_id, op0); +} + +void CompilerMSL::emit_subgroup_cluster_op_cast(uint32_t result_type, uint32_t result_id, uint32_t cluster_size, + uint32_t op0, const char *op, SPIRType::BaseType input_type, + SPIRType::BaseType expected_result_type) +{ + if (get_execution_model() == ExecutionModelFragment && !msl_options.supports_msl_version(2, 2)) + { + if (cluster_size == 4) + { + emit_unary_func_op_cast(result_type, result_id, op0, join("quad_", op).c_str(), input_type, + expected_result_type); + return; + } + SPIRV_CROSS_THROW("Cluster sizes other than 4 in fragment shaders require MSL 2.2."); + } + + auto &out_type = get(result_type); + auto &expr_type = expression_type(op0); + auto expected_type = out_type; + + // Bit-widths might be different in unary cases because we use it for SConvert/UConvert and friends. 
+ expected_type.basetype = input_type; + expected_type.width = expr_type.width; + + string cast_op; + if (expr_type.basetype != input_type) + { + if (expr_type.basetype == SPIRType::Boolean) + cast_op = join(type_to_glsl(expected_type), "(", to_unpacked_expression(op0), ")"); + else + cast_op = bitcast_glsl(expected_type, op0); + } + else + cast_op = to_unpacked_expression(op0); + + string sg_op = join("spvClustered_", op, "<", cluster_size, ">"); + string expr; + if (out_type.basetype != expected_result_type) + { + expected_type.basetype = expected_result_type; + expected_type.width = out_type.width; + if (out_type.basetype == SPIRType::Boolean) + expr = type_to_glsl(out_type); + else + expr = bitcast_glsl_op(out_type, expected_type); + expr += '('; + expr += join(sg_op, "(", cast_op, ", ", to_expression(builtin_subgroup_invocation_id_id), ")"); + expr += ')'; + } + else + { + expr += join(sg_op, "(", cast_op, ", ", to_expression(builtin_subgroup_invocation_id_id), ")"); + } + + emit_op(result_type, result_id, expr, should_forward(op0)); + inherit_expression_dependencies(result_id, op0); +} + string CompilerMSL::bitcast_glsl_op(const SPIRType &out_type, const SPIRType &in_type) { if (out_type.basetype == in_type.basetype) @@ -17084,6 +17747,9 @@ string CompilerMSL::builtin_qualifier(BuiltIn builtin) case BuiltInGlobalInvocationId: return "thread_position_in_grid"; + case BuiltInWorkgroupSize: + return "threads_per_threadgroup"; + case BuiltInWorkgroupId: return "threadgroup_position_in_grid"; @@ -17269,6 +17935,7 @@ string CompilerMSL::builtin_type_decl(BuiltIn builtin, uint32_t id) case BuiltInLocalInvocationId: case BuiltInNumWorkgroups: case BuiltInWorkgroupId: + case BuiltInWorkgroupSize: return "uint3"; case BuiltInLocalInvocationIndex: case BuiltInNumSubgroups: @@ -17598,6 +18265,23 @@ void CompilerMSL::analyze_sampled_image_usage() } } +void CompilerMSL::analyze_workgroup_variables() +{ + ir.for_each_typed_id([&](uint32_t, SPIRVariable &var) { + // If workgroup variables have initializer, it can only be ConstantNull (zero init) + if (var.storage == StorageClassWorkgroup && var.initializer) + { + needs_workgroup_zero_init = true; + + // MSL compiler does not like the routine to initialize simple threadgroup variables, + // falsely claiming it is "sometimes uninitialized". Suppress it. + auto &type = get_variable_data_type(var); + if (type.array.empty() && type.member_types.empty()) + suppress_sometimes_unitialized = true; + } + }); +} + bool CompilerMSL::SampledImageScanner::handle(spv::Op opcode, const uint32_t *args, uint32_t length) { switch (opcode) @@ -17658,7 +18342,7 @@ bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t *args, ui // suppress_missing_prototypes to suppress compiler warnings of missing function prototypes. // Mark if the input requires the implementation of an SPIR-V function that does not exist in Metal. - SPVFuncImpl spv_func = get_spv_func_impl(opcode, args); + SPVFuncImpl spv_func = get_spv_func_impl(opcode, args, length); if (spv_func != SPVFuncImplNone) { compiler.spv_function_implementations.insert(spv_func); @@ -17765,6 +18449,39 @@ bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t *args, ui needs_subgroup_invocation_id = true; break; + case OpGroupNonUniformRotateKHR: + // Add the correct invocation ID for calculating clustered rotate case. 
+ if (length > 5) + { + if (static_cast(compiler.evaluate_constant_u32(args[2])) == ScopeSubgroup) + needs_subgroup_invocation_id = true; + else + needs_local_invocation_index = true; + } + break; + + case OpGroupNonUniformFAdd: + case OpGroupNonUniformFMul: + case OpGroupNonUniformFMin: + case OpGroupNonUniformFMax: + case OpGroupNonUniformIAdd: + case OpGroupNonUniformIMul: + case OpGroupNonUniformSMin: + case OpGroupNonUniformSMax: + case OpGroupNonUniformUMin: + case OpGroupNonUniformUMax: + case OpGroupNonUniformBitwiseAnd: + case OpGroupNonUniformBitwiseOr: + case OpGroupNonUniformBitwiseXor: + case OpGroupNonUniformLogicalAnd: + case OpGroupNonUniformLogicalOr: + case OpGroupNonUniformLogicalXor: + if ((compiler.get_execution_model() != ExecutionModelFragment || + compiler.msl_options.supports_msl_version(2, 2)) && + args[3] == GroupOperationClusteredReduce) + needs_subgroup_invocation_id = true; + break; + case OpArrayLength: { auto *var = compiler.maybe_get_backing_variable(args[2]); @@ -17794,7 +18511,8 @@ bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t *args, ui case OpExtInst: { uint32_t extension_set = args[2]; - if (compiler.get(extension_set).ext == SPIRExtension::GLSL) + SPIRExtension::Extension ext = compiler.get(extension_set).ext; + if (ext == SPIRExtension::GLSL) { auto op_450 = static_cast(args[3]); switch (op_450) @@ -17837,6 +18555,12 @@ bool CompilerMSL::OpCodePreprocessor::handle(Op opcode, const uint32_t *args, ui break; } } + else if (ext == SPIRExtension::NonSemanticDebugPrintf) + { + // Operation 1 is printf. + if (args[3] == 1 && !compiler.msl_options.supports_msl_version(3, 2)) + SPIRV_CROSS_THROW("Debug printf requires MSL 3.2."); + } break; } @@ -17867,10 +18591,13 @@ void CompilerMSL::OpCodePreprocessor::check_resource_write(uint32_t var_id) } // Returns an enumeration of a SPIR-V function that needs to be output for certain Op codes. -CompilerMSL::SPVFuncImpl CompilerMSL::OpCodePreprocessor::get_spv_func_impl(Op opcode, const uint32_t *args) +CompilerMSL::SPVFuncImpl CompilerMSL::OpCodePreprocessor::get_spv_func_impl(Op opcode, const uint32_t *args, uint32_t length) { switch (opcode) { + case OpSMod: + return SPVFuncImplSMod; + case OpFMod: return SPVFuncImplMod; @@ -18049,6 +18776,12 @@ CompilerMSL::SPVFuncImpl CompilerMSL::OpCodePreprocessor::get_spv_func_impl(Op o case OpGroupNonUniformShuffleDown: return SPVFuncImplSubgroupShuffleDown; + case OpGroupNonUniformRotateKHR: + // Clustered rotate is performed using shuffle. + if (length > 5) + return SPVFuncImplSubgroupShuffle; + return SPVFuncImplSubgroupRotate; + case OpGroupNonUniformQuadBroadcast: return SPVFuncImplQuadBroadcast; @@ -18067,6 +18800,10 @@ CompilerMSL::SPVFuncImpl CompilerMSL::OpCodePreprocessor::get_spv_func_impl(Op o case OpUMulExtended: return SPVFuncImplMulExtended; + case OpAssumeTrueKHR: + case OpExpectKHR: + return SPVFuncImplAssume; + default: break; } @@ -18907,7 +19644,7 @@ void CompilerMSL::analyze_argument_buffers() SetBindingPair pair = { desc_set, binding }; if (resource.basetype == SPIRType::Image || resource.basetype == SPIRType::Sampler || - resource.basetype == SPIRType::SampledImage) + resource.basetype == SPIRType::SampledImage || resource.basetype == SPIRType::AccelerationStructure) { // Drop pointer information when we emit the resources into a struct. 
buffer_type.member_types.push_back(get_variable_data_type_id(var)); @@ -18997,7 +19734,7 @@ void CompilerMSL::analyze_argument_buffers() set_extended_member_decoration(buffer_type.self, member_index, SPIRVCrossDecorationOverlappingBinding); member_index++; } - + if (msl_options.replace_recursive_inputs && type_contains_recursion(buffer_type)) { recursive_inputs.insert(type_id); @@ -19157,6 +19894,35 @@ const char *CompilerMSL::get_combined_sampler_suffix() const return sampler_name_suffix.c_str(); } +bool CompilerMSL::specialization_constant_is_macro(uint32_t const_id) const +{ + return constant_macro_ids.find(const_id) != constant_macro_ids.end(); +} + +// Start with all fast math flags enabled, and selectively disable based execution modes and float controls +uint32_t CompilerMSL::get_fp_fast_math_flags(bool incl_ops) +{ + uint32_t fp_flags = ~0; + auto &ep = get_entry_point(); + + if (ep.flags.get(ExecutionModeSignedZeroInfNanPreserve)) + fp_flags &= ~(FPFastMathModeNSZMask | FPFastMathModeNotInfMask | FPFastMathModeNotNaNMask); + + if (ep.flags.get(ExecutionModeContractionOff)) + fp_flags &= ~(FPFastMathModeAllowContractMask); + + for (auto &fp_pair : ep.fp_fast_math_defaults) + if (fp_pair.second) + fp_flags &= get(fp_pair.second).scalar(); + + if (incl_ops) + for (auto &p_m : ir.meta) + if (p_m.second.decoration.decoration_flags.get(DecorationFPFastMathMode)) + fp_flags &= p_m.second.decoration.fp_fast_math_mode; + + return fp_flags; +} + void CompilerMSL::emit_block_hints(const SPIRBlock &) { } @@ -19379,6 +20145,70 @@ void CompilerMSL::emit_mesh_tasks(SPIRBlock &block) statement("return;"); } +void CompilerMSL::emit_workgroup_initialization(const SPIRVariable &var) +{ + auto &type = get_variable_data_type(var); + + begin_scope(); + + if (type.array.empty() && type.member_types.empty()) + { + // For simple shared variables, we just initialize it in thread 0 of the block + // We use short to represent bool for threadgroup variable to workaround compiler bug, + // so we do a temporary fixup here. Alas. 
(see the type_to_glsl method) + bool is_boolean = type.basetype == SPIRType::Boolean; + if (is_boolean) + type.basetype = SPIRType::Short; + + statement("if (gl_LocalInvocationIndex == 0)"); + begin_scope(); + statement(to_name(var.self), " = ", to_initializer_expression(var), ";"); + end_scope(); + + if (is_boolean) + type.basetype = SPIRType::Boolean; + } + else + { + // Otherwise, we use a loop to cooperatively initialize the memory within the group + + // First, we define a few variable names; + string var_name = to_name(var.self); + string var_ptr_name = join(var_name, "_ptr"); + string var_size_name = join(var_name, "_sz"); + string var_pos_name = join(var_name, "_pos"); + string var_stride_name = join(var_name, "_stride"); + string var_ptr2_name = join(var_name, "_ptr2"); + + statement("threadgroup uint *", var_ptr_name, " = (threadgroup uint *)&", var_name, ";"); + statement("uint ", var_size_name, " = ", "sizeof(", var_name, ");"); + statement("uint ", var_pos_name, " = gl_LocalInvocationIndex;"); + statement("uint ", var_stride_name, " = gl_WorkGroupSize.x * gl_WorkGroupSize.y * gl_WorkGroupSize.z;"); + + statement("while (sizeof(uint) * ", var_pos_name, " < ", var_size_name, ")"); + begin_scope(); + statement(var_ptr_name, "[", var_pos_name, "] = 0u;"); + statement(var_pos_name, " += ", var_stride_name, ";"); + end_scope(); + + statement("if (gl_LocalInvocationIndex == 0)"); + begin_scope(); + statement(var_pos_name, " = (", var_size_name, " / sizeof(uint)) * sizeof(uint);"); + statement("threadgroup uchar *", var_ptr2_name, " = (threadgroup uchar *)&", var_name, ";"); + + statement("while (", var_pos_name, " < ", var_size_name, ")"); + begin_scope(); + statement(var_ptr2_name, "[", var_pos_name, "] = '\\0';"); + statement(var_pos_name, "++;"); + end_scope(); + end_scope(); + } + + statement("threadgroup_barrier(mem_flags::mem_threadgroup);"); + + end_scope(); +} + string CompilerMSL::additional_fixed_sample_mask_str() const { char print_buffer[32]; diff --git a/thirdparty/spirv-cross/spirv_msl.hpp b/thirdparty/spirv-cross/spirv_msl.hpp index 4aaad01a89..99cbb5e138 100644 --- a/thirdparty/spirv-cross/spirv_msl.hpp +++ b/thirdparty/spirv-cross/spirv_msl.hpp @@ -324,6 +324,8 @@ public: // of the shader with the additional fixed sample mask. uint32_t additional_fixed_sample_mask = 0xffffffff; bool enable_point_size_builtin = true; + bool enable_point_size_default = false; + float default_point_size = 1.0f; bool enable_frag_depth_builtin = true; bool enable_frag_stencil_ref_builtin = true; bool disable_rasterization = false; @@ -536,6 +538,14 @@ public: // if the fragment does not modify the depth value. bool input_attachment_is_ds_attachment = false; + // If BuiltInPosition is not written, automatically disable rasterization. + // The result can be queried with get_is_rasterization_disabled. + bool auto_disable_rasterization = false; + + // Use Fast Math pragmas in MSL code, based on SPIR-V float controls and FP ExecutionModes. + // Requires MSL 3.2 or above, and has no effect with earlier MSL versions. + bool use_fast_math_pragmas = false; + bool is_ios() const { return platform == iOS; @@ -756,6 +766,19 @@ public: void set_combined_sampler_suffix(const char *suffix); const char *get_combined_sampler_suffix() const; + // Information about specialization constants that are translated into MSL macros + // instead of using function constant + // These must only be called after a successful call to CompilerMSL::compile(). 
+ bool specialization_constant_is_macro(uint32_t constant_id) const; + + // Returns a mask of SPIR-V FP Fast Math Mode flags, that represents the set of flags that can be applied + // across all floating-point types. Each FPFastMathDefault execution mode operation identifies the flags + // for one floating-point type, and the value returned here is a bitwise-AND combination across all types. + // If incl_ops is enabled, the FPFastMathMode of any SPIR-V operations are also included in the bitwise-AND + // to determine the minimal fast-math that applies to all default execution modes and all operations. + // The returned value is also affected by execution modes SignedZeroInfNanPreserve and ContractionOff. + uint32_t get_fp_fast_math_flags(bool incl_ops); + protected: // An enum of SPIR-V functions that are implemented in additional // source code that is added to the shader if necessary. @@ -763,6 +786,7 @@ protected: { SPVFuncImplNone, SPVFuncImplMod, + SPVFuncImplSMod, SPVFuncImplRadians, SPVFuncImplDegrees, SPVFuncImplFindILsb, @@ -784,12 +808,11 @@ protected: SPVFuncImplInverse4x4, SPVFuncImplInverse3x3, SPVFuncImplInverse2x2, - // It is very important that this come before *Swizzle and ChromaReconstruct*, to ensure it's - // emitted before them. - SPVFuncImplForwardArgs, - // Likewise, this must come before *Swizzle. + // It is very important that this come before *Swizzle, to ensure it's emitted before them. SPVFuncImplGetSwizzle, SPVFuncImplTextureSwizzle, + SPVFuncImplGatherReturn, + SPVFuncImplGatherCompareReturn, SPVFuncImplGatherSwizzle, SPVFuncImplGatherCompareSwizzle, SPVFuncImplGatherConstOffsets, @@ -806,6 +829,30 @@ protected: SPVFuncImplSubgroupShuffleXor, SPVFuncImplSubgroupShuffleUp, SPVFuncImplSubgroupShuffleDown, + SPVFuncImplSubgroupRotate, + SPVFuncImplSubgroupClusteredAdd, + SPVFuncImplSubgroupClusteredFAdd = SPVFuncImplSubgroupClusteredAdd, + SPVFuncImplSubgroupClusteredIAdd = SPVFuncImplSubgroupClusteredAdd, + SPVFuncImplSubgroupClusteredMul, + SPVFuncImplSubgroupClusteredFMul = SPVFuncImplSubgroupClusteredMul, + SPVFuncImplSubgroupClusteredIMul = SPVFuncImplSubgroupClusteredMul, + SPVFuncImplSubgroupClusteredMin, + SPVFuncImplSubgroupClusteredFMin = SPVFuncImplSubgroupClusteredMin, + SPVFuncImplSubgroupClusteredSMin = SPVFuncImplSubgroupClusteredMin, + SPVFuncImplSubgroupClusteredUMin = SPVFuncImplSubgroupClusteredMin, + SPVFuncImplSubgroupClusteredMax, + SPVFuncImplSubgroupClusteredFMax = SPVFuncImplSubgroupClusteredMax, + SPVFuncImplSubgroupClusteredSMax = SPVFuncImplSubgroupClusteredMax, + SPVFuncImplSubgroupClusteredUMax = SPVFuncImplSubgroupClusteredMax, + SPVFuncImplSubgroupClusteredAnd, + SPVFuncImplSubgroupClusteredBitwiseAnd = SPVFuncImplSubgroupClusteredAnd, + SPVFuncImplSubgroupClusteredLogicalAnd = SPVFuncImplSubgroupClusteredAnd, + SPVFuncImplSubgroupClusteredOr, + SPVFuncImplSubgroupClusteredBitwiseOr = SPVFuncImplSubgroupClusteredOr, + SPVFuncImplSubgroupClusteredLogicalOr = SPVFuncImplSubgroupClusteredOr, + SPVFuncImplSubgroupClusteredXor, + SPVFuncImplSubgroupClusteredBitwiseXor = SPVFuncImplSubgroupClusteredXor, + SPVFuncImplSubgroupClusteredLogicalXor = SPVFuncImplSubgroupClusteredXor, SPVFuncImplQuadBroadcast, SPVFuncImplQuadSwap, SPVFuncImplReflectScalar, @@ -841,6 +888,7 @@ protected: SPVFuncImplTextureCast, SPVFuncImplMulExtended, SPVFuncImplSetMeshOutputsEXT, + SPVFuncImplAssume, }; // If the underlying resource has been used for comparison then duplicate loads of that resource must be too @@ -858,6 +906,11 @@ protected: void 
emit_function_prototype(SPIRFunction &func, const Bitset &return_flags) override; void emit_sampled_image_op(uint32_t result_type, uint32_t result_id, uint32_t image_id, uint32_t samp_id) override; void emit_subgroup_op(const Instruction &i) override; + void emit_subgroup_cluster_op(uint32_t result_type, uint32_t result_id, uint32_t cluster_size, uint32_t op0, + const char *op); + void emit_subgroup_cluster_op_cast(uint32_t result_type, uint32_t result_id, uint32_t cluster_size, uint32_t op0, + const char *op, SPIRType::BaseType input_type, + SPIRType::BaseType expected_result_type); std::string to_texture_op(const Instruction &i, bool sparse, bool *forward, SmallVector &inherited_expressions) override; void emit_fixup() override; @@ -872,6 +925,7 @@ protected: void emit_mesh_entry_point(); void emit_mesh_outputs(); void emit_mesh_tasks(SPIRBlock &block) override; + void emit_workgroup_initialization(const SPIRVariable &var) override; // Allow Metal to use the array template to make arrays a value type std::string type_to_array_glsl(const SPIRType &type, uint32_t variable_id) override; @@ -985,6 +1039,7 @@ protected: void add_tess_level_input_to_interface_block(const std::string &ib_var_ref, SPIRType &ib_type, SPIRVariable &var); void add_tess_level_input(const std::string &base_ref, const std::string &mbr_name, SPIRVariable &var); + void ensure_struct_members_valid_vecsizes(SPIRType &struct_type, uint32_t &location); void fix_up_interface_member_indices(spv::StorageClass storage, uint32_t ib_type_id); void mark_location_as_used_by_shader(uint32_t location, const SPIRType &type, @@ -1069,7 +1124,8 @@ protected: bool validate_member_packing_rules_msl(const SPIRType &type, uint32_t index) const; std::string get_argument_address_space(const SPIRVariable &argument); std::string get_type_address_space(const SPIRType &type, uint32_t id, bool argument = false); - static bool decoration_flags_signal_volatile(const Bitset &flags); + bool decoration_flags_signal_volatile(const Bitset &flags) const; + bool decoration_flags_signal_coherent(const Bitset &flags) const; const char *to_restrict(uint32_t id, bool space); SPIRType &get_stage_in_struct_type(); SPIRType &get_stage_out_struct_type(); @@ -1082,7 +1138,7 @@ protected: uint32_t mem_order_1, uint32_t mem_order_2, bool has_mem_order_2, uint32_t op0, uint32_t op1 = 0, bool op1_is_pointer = false, bool op1_is_literal = false, uint32_t op2 = 0); const char *get_memory_order(uint32_t spv_mem_sem); - void add_pragma_line(const std::string &line); + void add_pragma_line(const std::string &line, bool recompile_on_unique); void add_typedef_line(const std::string &line); void emit_barrier(uint32_t id_exe_scope, uint32_t id_mem_scope, uint32_t id_mem_sem); bool emit_array_copy(const char *expr, uint32_t lhs_id, uint32_t rhs_id, @@ -1133,12 +1189,13 @@ protected: void emit_store_statement(uint32_t lhs_expression, uint32_t rhs_expression) override; void analyze_sampled_image_usage(); + void analyze_workgroup_variables(); bool access_chain_needs_stage_io_builtin_translation(uint32_t base) override; bool prepare_access_chain_for_scalar_access(std::string &expr, const SPIRType &type, spv::StorageClass storage, bool &is_packed) override; void fix_up_interpolant_access_chain(const uint32_t *ops, uint32_t length); - void check_physical_type_cast(std::string &expr, const SPIRType *type, uint32_t physical_type) override; + bool check_physical_type_cast(std::string &expr, const SPIRType *type, uint32_t physical_type) override; bool emit_tessellation_access_chain(const 
uint32_t *ops, uint32_t length); bool emit_tessellation_io_load(uint32_t result_type, uint32_t id, uint32_t ptr); @@ -1164,9 +1221,10 @@ protected: std::unordered_map fragment_output_components; std::unordered_map builtin_to_automatic_input_location; std::unordered_map builtin_to_automatic_output_location; - std::set pragma_lines; - std::set typedef_lines; + std::vector pragma_lines; + std::vector typedef_lines; SmallVector vars_needing_early_declaration; + std::unordered_set constant_macro_ids; std::unordered_map, InternalHasher> resource_bindings; std::unordered_map resource_arg_buff_idx_to_binding_number; @@ -1210,11 +1268,14 @@ protected: bool needs_swizzle_buffer_def = false; bool used_swizzle_buffer = false; bool added_builtin_tess_level = false; + bool needs_local_invocation_index = false; bool needs_subgroup_invocation_id = false; bool needs_subgroup_size = false; bool needs_sample_id = false; bool needs_helper_invocation = false; + bool needs_workgroup_zero_init = false; bool writes_to_depth = false; + bool writes_to_point_size = false; std::string qual_pos_var_name; std::string stage_in_var_name = "in"; std::string stage_out_var_name = "out"; @@ -1276,6 +1337,7 @@ protected: bool suppress_missing_prototypes = false; bool suppress_incompatible_pointer_types_discard_qualifiers = false; + bool suppress_sometimes_unitialized = false; void add_spv_func_and_recompile(SPVFuncImpl spv_func); @@ -1308,7 +1370,7 @@ protected: } bool handle(spv::Op opcode, const uint32_t *args, uint32_t length) override; - CompilerMSL::SPVFuncImpl get_spv_func_impl(spv::Op opcode, const uint32_t *args); + CompilerMSL::SPVFuncImpl get_spv_func_impl(spv::Op opcode, const uint32_t *args, uint32_t length); void check_resource_write(uint32_t var_id); CompilerMSL &compiler; @@ -1319,6 +1381,7 @@ protected: bool uses_image_write = false; bool uses_buffer_write = false; bool uses_discard = false; + bool needs_local_invocation_index = false; bool needs_subgroup_invocation_id = false; bool needs_subgroup_size = false; bool needs_sample_id = false; diff --git a/thirdparty/spirv-cross/spirv_parser.cpp b/thirdparty/spirv-cross/spirv_parser.cpp index 6108dbb653..da1b8a89d1 100644 --- a/thirdparty/spirv-cross/spirv_parser.cpp +++ b/thirdparty/spirv-cross/spirv_parser.cpp @@ -305,6 +305,7 @@ void Parser::parse(const Instruction &instruction) } case OpExtInst: + case OpExtInstWithForwardRefsKHR: { // The SPIR-V debug information extended instructions might come at global scope. 
if (current_block) @@ -380,13 +381,21 @@ void Parser::parse(const Instruction &instruction) auto mode = static_cast(ops[1]); execution.flags.set(mode); - if (mode == ExecutionModeLocalSizeId) + switch (mode) { + case ExecutionModeLocalSizeId: execution.workgroup_size.id_x = ops[2]; execution.workgroup_size.id_y = ops[3]; execution.workgroup_size.id_z = ops[4]; - } + break; + case ExecutionModeFPFastMathDefault: + execution.fp_fast_math_defaults[ops[2]] = ops[3]; + break; + + default: + break; + } break; } @@ -536,12 +545,37 @@ void Parser::parse(const Instruction &instruction) uint32_t id = ops[0]; uint32_t width = ops[1]; auto &type = set(id, op); + + if (width != 16 && width != 8 && length > 2) + SPIRV_CROSS_THROW("Unrecognized FP encoding mode for OpTypeFloat."); + if (width == 64) type.basetype = SPIRType::Double; else if (width == 32) type.basetype = SPIRType::Float; else if (width == 16) - type.basetype = SPIRType::Half; + { + if (length > 2) + { + if (ops[2] == spv::FPEncodingBFloat16KHR) + type.basetype = SPIRType::BFloat16; + else + SPIRV_CROSS_THROW("Unrecognized encoding for OpTypeFloat 16."); + } + else + type.basetype = SPIRType::Half; + } + else if (width == 8) + { + if (length < 2) + SPIRV_CROSS_THROW("Missing encoding for OpTypeFloat 8."); + else if (ops[2] == spv::FPEncodingFloat8E4M3EXT) + type.basetype = SPIRType::FloatE4M3; + else if (ops[2] == spv::FPEncodingFloat8E5M2EXT) + type.basetype = SPIRType::FloatE5M2; + else + SPIRV_CROSS_THROW("Invalid encoding for OpTypeFloat 8."); + } else SPIRV_CROSS_THROW("Unrecognized bit-width of floating point type."); type.width = width; @@ -592,6 +626,22 @@ void Parser::parse(const Instruction &instruction) break; } + case OpTypeCooperativeMatrixKHR: + { + uint32_t id = ops[0]; + auto &base = get(ops[1]); + auto &matrixbase = set(id, base); + + matrixbase.op = op; + matrixbase.cooperative.scope_id = ops[2]; + matrixbase.cooperative.rows_id = ops[3]; + matrixbase.cooperative.columns_id = ops[4]; + matrixbase.cooperative.use_id = ops[5]; + matrixbase.self = id; + matrixbase.parent_type = ops[1]; + break; + } + case OpTypeArray: { uint32_t id = ops[0]; @@ -835,17 +885,27 @@ void Parser::parse(const Instruction &instruction) break; } - // Constants + // Constants case OpSpecConstant: case OpConstant: + case OpConstantCompositeReplicateEXT: + case OpSpecConstantCompositeReplicateEXT: { uint32_t id = ops[1]; auto &type = get(ops[0]); - - if (type.width > 32) - set(id, ops[0], ops[2] | (uint64_t(ops[3]) << 32), op == OpSpecConstant); + if (op == OpConstantCompositeReplicateEXT || op == OpSpecConstantCompositeReplicateEXT) + { + auto subconstant = uint32_t(ops[2]); + set(id, ops[0], &subconstant, 1, op == OpSpecConstantCompositeReplicateEXT, true); + } else - set(id, ops[0], ops[2], op == OpSpecConstant); + { + + if (type.width > 32) + set(id, ops[0], ops[2] | (uint64_t(ops[3]) << 32), op == OpSpecConstant); + else + set(id, ops[0], ops[2], op == OpSpecConstant); + } break; }
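// The wide-constant path above reassembles a 64-bit literal from the two instruction
// words, low word first, as SPIR-V stores them. A standalone sketch of that
// reconstruction (hypothetical helper name, shown only to make the word order explicit):

#include <cstdint>

static inline uint64_t assemble_wide_literal(uint32_t low_word, uint32_t high_word)
{
    // SPIR-V encodes literals wider than 32 bits as consecutive words, low word first.
    return uint64_t(low_word) | (uint64_t(high_word) << 32);
}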