diff --git a/examples/mesh-shader/Cargo.toml b/examples/mesh-shader/Cargo.toml
new file mode 100644
index 000000000..e460ef238
--- /dev/null
+++ b/examples/mesh-shader/Cargo.toml
@@ -0,0 +1,17 @@
+[package]
+name = "mesh-shader"
+version = "0.0.0"
+edition = "2021"
+publish = false
+
+[[bin]]
+name = "mesh-shader"
+path = "main.rs"
+test = false
+bench = false
+doc = false
+
+[dependencies]
+vulkano = { workspace = true, features = ["macros"] }
+vulkano-shaders = { workspace = true }
+winit = { workspace = true }
diff --git a/examples/mesh-shader/frag.glsl b/examples/mesh-shader/frag.glsl
new file mode 100644
index 000000000..2dd47aaf6
--- /dev/null
+++ b/examples/mesh-shader/frag.glsl
@@ -0,0 +1,9 @@
+#version 450
+
+layout(location = 0) in vec4 in_color; // color received from the mesh shader's `out_color` output
+
+layout(location = 0) out vec4 f_color; // color written to the color attachment at location 0
+
+void main() {
+    f_color = in_color; // pass the incoming color through unchanged
+}
diff --git a/examples/mesh-shader/main.rs b/examples/mesh-shader/main.rs
new file mode 100644
index 000000000..43c88091f
--- /dev/null
+++ b/examples/mesh-shader/main.rs
@@ -0,0 +1,503 @@
+// Welcome to the mesh shader example!
+//
+// This is a simple, modified version of the `instancing.rs` example that demonstrates how to use mesh shaders to
+// generate geometry that looks identical to the instancing example's output. Because mesh shaders are complex, we
+// expect you to be familiar with both instancing and compute shaders before approaching them.
+//
+// This example is intentionally kept simple and does not follow the recommended pattern for emitting vertices and
+// indices. That pattern should match what the hardware prefers, and therefore differs from vendor to vendor.
+//
+// See these presentation slides for an overview of mesh shaders and best practices:
+// https://vulkan.org/user/pages/09.events/vulkanised-2023/vulkanised_mesh_best_practices_2023.02.09-1.pdf
+// Presentation: https://www.youtube.com/watch?v=g9FoZcEQlbA
+
+use std::{error::Error, sync::Arc};
+use vulkano::{
+    buffer::{Buffer, BufferContents, BufferCreateInfo, BufferUsage},
+    command_buffer::{
+        allocator::StandardCommandBufferAllocator, CommandBufferBeginInfo, CommandBufferLevel,
+        CommandBufferUsage, RecordingCommandBuffer, RenderPassBeginInfo,
+    },
+    descriptor_set::{
+        allocator::StandardDescriptorSetAllocator, DescriptorSet, WriteDescriptorSet,
+    },
+    device::{
+        physical::PhysicalDeviceType, Device, DeviceCreateInfo, DeviceExtensions, Features,
+        QueueCreateInfo, QueueFlags,
+    },
+    image::{view::ImageView, Image, ImageUsage},
+    instance::{Instance, InstanceCreateFlags, InstanceCreateInfo},
+    memory::allocator::{AllocationCreateInfo, MemoryTypeFilter, StandardMemoryAllocator},
+    padded::Padded,
+    pipeline::{
+        graphics::{
+            color_blend::{ColorBlendAttachmentState, ColorBlendState},
+            multisample::MultisampleState,
+            rasterization::RasterizationState,
+            viewport::{Viewport, ViewportState},
+            GraphicsPipelineCreateInfo,
+        },
+        layout::PipelineDescriptorSetLayoutCreateInfo,
+        DynamicState, GraphicsPipeline, Pipeline, PipelineBindPoint, PipelineLayout,
+        PipelineShaderStageCreateInfo,
+    },
+    render_pass::{Framebuffer, FramebufferCreateInfo, RenderPass, Subpass},
+    single_pass_renderpass,
+    swapchain::{
+        acquire_next_image, Surface, Swapchain, SwapchainCreateInfo, SwapchainPresentInfo,
+    },
+    sync::{self, GpuFuture},
+    DeviceSize, Validated, VulkanError, VulkanLibrary,
+};
+use winit::{
+    event::{Event, WindowEvent},
+    event_loop::{ControlFlow, EventLoop},
+    window::WindowBuilder,
+};
+
+/// The vertex type that will be used to describe the triangle's geometry.
+#[derive(BufferContents)]
+#[repr(C)]
+struct TriangleVertex {
+    position: [f32; 2], // one element of `vec2 position[]` in mesh.glsl's `VertexBuffer`
+}
+
+/// The per-instance data, as generated by `shader!` from the `Instance` struct in `mesh.glsl`.
+type InstanceData = mesh::Instance;
+
+mod mesh {
+    vulkano_shaders::shader! {
+        ty: "mesh", // compile `mesh.glsl` as the mesh shader stage
+        path: "mesh.glsl",
+        vulkan_version: "1.2", // NOTE(review): presumably needed for GL_EXT_mesh_shader — confirm
+    }
+}
+
+mod fs {
+    vulkano_shaders::shader! {
+        ty: "fragment", // compile `frag.glsl` as the fragment shader stage
+        path: "frag.glsl",
+    }
+}
+
+fn main() -> Result<(), impl Error> {
+    let event_loop = EventLoop::new().unwrap();
+
+    let library = VulkanLibrary::new().unwrap();
+    let required_extensions = Surface::required_extensions(&event_loop).unwrap();
+    let instance = Instance::new(
+        library,
+        InstanceCreateInfo {
+            flags: InstanceCreateFlags::ENUMERATE_PORTABILITY,
+            enabled_extensions: required_extensions,
+            ..Default::default()
+        },
+    )
+    .unwrap();
+
+    let window = Arc::new(WindowBuilder::new().build(&event_loop).unwrap());
+    let surface = Surface::from_window(instance.clone(), window.clone()).unwrap();
+
+    let device_extensions = DeviceExtensions {
+        khr_swapchain: true,
+        ext_mesh_shader: true, // provides the mesh shader stage used by the pipeline below
+        ..DeviceExtensions::empty()
+    };
+    let (physical_device, queue_family_index) = instance
+        .enumerate_physical_devices()
+        .unwrap()
+        .filter(|p| p.supported_extensions().contains(&device_extensions))
+        .filter_map(|p| {
+            p.queue_family_properties()
+                .iter()
+                .enumerate()
+                .position(|(i, q)| {
+                    q.queue_flags.intersects(QueueFlags::GRAPHICS)
+                        && p.surface_support(i as u32, &surface).unwrap_or(false)
+                })
+                .map(|i| (p, i as u32))
+        })
+        .min_by_key(|(p, _)| match p.properties().device_type {
+            PhysicalDeviceType::DiscreteGpu => 0,
+            PhysicalDeviceType::IntegratedGpu => 1,
+            PhysicalDeviceType::VirtualGpu => 2,
+            PhysicalDeviceType::Cpu => 3,
+            PhysicalDeviceType::Other => 4,
+            _ => 5,
+        })
+        .unwrap(); // NOTE(review): `mesh_shader` feature support is not checked above
+
+    println!(
+        "Using device: {} (type: {:?})",
+        physical_device.properties().device_name,
+        physical_device.properties().device_type,
+    );
+
+    let (device, mut queues) = Device::new(
+        physical_device,
+        DeviceCreateInfo {
+            enabled_extensions: device_extensions,
+            enabled_features: Features {
+                mesh_shader: true,
+                ..Features::default()
+            },
+            queue_create_infos: vec![QueueCreateInfo {
+                queue_family_index,
+                ..Default::default()
+            }],
+            ..Default::default()
+        },
+    )
+    .unwrap();
+
+    let queue = queues.next().unwrap();
+
+    let (mut swapchain, images) = {
+        let surface_capabilities = device
+            .physical_device()
+            .surface_capabilities(&surface, Default::default())
+            .unwrap();
+        let image_format = device
+            .physical_device()
+            .surface_formats(&surface, Default::default())
+            .unwrap()[0]
+            .0;
+
+        Swapchain::new(
+            device.clone(),
+            surface,
+            SwapchainCreateInfo {
+                min_image_count: surface_capabilities.min_image_count.max(2),
+                image_format,
+                image_extent: window.inner_size().into(),
+                image_usage: ImageUsage::COLOR_ATTACHMENT,
+                composite_alpha: surface_capabilities
+                    .supported_composite_alpha
+                    .into_iter()
+                    .next()
+                    .unwrap(),
+                ..Default::default()
+            },
+        )
+        .unwrap()
+    };
+
+    let memory_allocator = Arc::new(StandardMemoryAllocator::new_default(device.clone()));
+    let descriptor_set_allocator = Arc::new(StandardDescriptorSetAllocator::new(
+        device.clone(),
+        Default::default(),
+    ));
+
+    // We now create a buffer that will store the shape of our triangle. This triangle is identical
+    // to the one in the `triangle.rs` example.
+    let vertices = [
+        TriangleVertex {
+            position: [-0.5, -0.25],
+        },
+        TriangleVertex {
+            position: [0.0, 0.5],
+        },
+        TriangleVertex {
+            position: [0.25, -0.1],
+        },
+    ];
+    let vertex_buffer = Buffer::from_iter(
+        memory_allocator.clone(),
+        BufferCreateInfo {
+            usage: BufferUsage::STORAGE_BUFFER, // read in mesh.glsl via set 0, binding 0
+            ..Default::default()
+        },
+        AllocationCreateInfo {
+            memory_type_filter: MemoryTypeFilter::PREFER_DEVICE
+                | MemoryTypeFilter::HOST_SEQUENTIAL_WRITE,
+            ..Default::default()
+        },
+        vertices,
+    )
+    .unwrap();
+
+    // Now we create another buffer that will store the unique data per instance. For this example,
+    // we'll have the instances form a 10x10 grid whose triangles grow with the instance index.
+    let rows = 10; // keep in sync with `rows` in mesh.glsl
+    let cols = 10; // keep in sync with `cols` in mesh.glsl
+    let instances = {
+        let n_instances = rows * cols;
+        let mut data = Vec::new();
+        for c in 0..cols {
+            for r in 0..rows {
+                let half_cell_w = 0.5 / cols as f32;
+                let half_cell_h = 0.5 / rows as f32;
+                let x = half_cell_w + (c as f32 / cols as f32) * 2.0 - 1.0;
+                let y = half_cell_h + (r as f32 / rows as f32) * 2.0 - 1.0;
+                let position_offset = [x, y];
+                let scale = (2.0 / rows as f32) * (c * rows + r) as f32 / n_instances as f32;
+                data.push(InstanceData {
+                    position_offset,
+                    scale,
+                });
+            }
+        }
+        data
+    };
+    let instance_buffer = Buffer::new_unsized::<mesh::InstanceBuffer>(
+        memory_allocator,
+        BufferCreateInfo {
+            usage: BufferUsage::STORAGE_BUFFER, // read in mesh.glsl via set 0, binding 1
+            ..Default::default()
+        },
+        AllocationCreateInfo {
+            memory_type_filter: MemoryTypeFilter::PREFER_DEVICE
+                | MemoryTypeFilter::HOST_SEQUENTIAL_WRITE,
+            ..Default::default()
+        },
+        instances.len() as DeviceSize,
+    )
+    .unwrap();
+    {
+        let mut guard = instance_buffer.write().unwrap();
+        for (i, instance) in instances.iter().enumerate() {
+            guard.instance[i] = Padded(*instance);
+        }
+    }
+
+    let render_pass = single_pass_renderpass!(
+        device.clone(),
+        attachments: {
+            color: {
+                format: swapchain.image_format(),
+                samples: 1,
+                load_op: Clear,
+                store_op: Store,
+            },
+        },
+        pass: {
+            color: [color],
+            depth_stencil: {},
+        },
+    )
+    .unwrap();
+
+    let pipeline = {
+        let mesh = mesh::load(device.clone())
+            .unwrap()
+            .entry_point("main")
+            .unwrap();
+        let fs = fs::load(device.clone())
+            .unwrap()
+            .entry_point("main")
+            .unwrap();
+        let stages = [
+            PipelineShaderStageCreateInfo::new(mesh),
+            PipelineShaderStageCreateInfo::new(fs),
+        ];
+        let layout = PipelineLayout::new(
+            device.clone(),
+            PipelineDescriptorSetLayoutCreateInfo::from_stages(&stages)
+                .into_pipeline_layout_create_info(device.clone())
+                .unwrap(),
+        )
+        .unwrap();
+        let subpass = Subpass::from(render_pass.clone(), 0).unwrap();
+
+        GraphicsPipeline::new(
+            device.clone(),
+            None,
+            GraphicsPipelineCreateInfo {
+                stages: stages.into_iter().collect(),
+                viewport_state: Some(ViewportState::default()),
+                rasterization_state: Some(RasterizationState::default()),
+                multisample_state: Some(MultisampleState::default()),
+                color_blend_state: Some(ColorBlendState::with_attachment_states(
+                    subpass.num_color_attachments(),
+                    ColorBlendAttachmentState::default(),
+                )),
+                dynamic_state: [DynamicState::Viewport].into_iter().collect(),
+                subpass: Some(subpass.into()),
+                ..GraphicsPipelineCreateInfo::layout(layout)
+            },
+        )
+        .unwrap()
+    };
+
+    let descriptor_set = DescriptorSet::new(
+        descriptor_set_allocator,
+        pipeline.layout().set_layouts()[0].clone(),
+        [
+            WriteDescriptorSet::buffer(0, vertex_buffer.clone()),
+            WriteDescriptorSet::buffer(1, instance_buffer.clone()),
+        ],
+        [],
+    )
+    .unwrap();
+
+    let mut viewport = Viewport {
+        offset: [0.0, 0.0],
+        extent: [0.0, 0.0],
+        depth_range: 0.0..=1.0,
+    };
+    let mut framebuffers = window_size_dependent_setup(&images, render_pass.clone(), &mut viewport);
+    let mut recreate_swapchain = false;
+    let mut previous_frame_end = Some(sync::now(device.clone()).boxed());
+
+    let command_buffer_allocator = Arc::new(StandardCommandBufferAllocator::new(
+        device.clone(),
+        Default::default(),
+    ));
+
+    event_loop.run(move |event, elwt| {
+        elwt.set_control_flow(ControlFlow::Poll);
+
+        match event {
+            Event::WindowEvent {
+                event: WindowEvent::CloseRequested,
+                ..
+            } => {
+                elwt.exit();
+            }
+            Event::WindowEvent {
+                event: WindowEvent::Resized(_),
+                ..
+            } => {
+                recreate_swapchain = true;
+            }
+            Event::WindowEvent {
+                event: WindowEvent::RedrawRequested,
+                ..
+            } => {
+                let image_extent: [u32; 2] = window.inner_size().into();
+
+                if image_extent.contains(&0) {
+                    return;
+                }
+
+                previous_frame_end.as_mut().unwrap().cleanup_finished();
+
+                if recreate_swapchain {
+                    let (new_swapchain, new_images) = swapchain
+                        .recreate(SwapchainCreateInfo {
+                            image_extent,
+                            ..swapchain.create_info()
+                        })
+                        .expect("failed to recreate swapchain");
+
+                    swapchain = new_swapchain;
+                    framebuffers = window_size_dependent_setup(
+                        &new_images,
+                        render_pass.clone(),
+                        &mut viewport,
+                    );
+                    recreate_swapchain = false;
+                }
+
+                let (image_index, suboptimal, acquire_future) =
+                    match acquire_next_image(swapchain.clone(), None).map_err(Validated::unwrap) {
+                        Ok(r) => r,
+                        Err(VulkanError::OutOfDate) => {
+                            recreate_swapchain = true;
+                            return;
+                        }
+                        Err(e) => panic!("failed to acquire next image: {e}"),
+                    };
+
+                if suboptimal {
+                    recreate_swapchain = true;
+                }
+
+                let mut builder = RecordingCommandBuffer::new(
+                    command_buffer_allocator.clone(),
+                    queue.queue_family_index(),
+                    CommandBufferLevel::Primary,
+                    CommandBufferBeginInfo {
+                        usage: CommandBufferUsage::OneTimeSubmit,
+                        ..Default::default()
+                    },
+                )
+                .unwrap();
+
+                builder
+                    .begin_render_pass(
+                        RenderPassBeginInfo {
+                            clear_values: vec![Some([0.0, 0.0, 1.0, 1.0].into())],
+                            ..RenderPassBeginInfo::framebuffer(
+                                framebuffers[image_index as usize].clone(),
+                            )
+                        },
+                        Default::default(),
+                    )
+                    .unwrap()
+                    .set_viewport(0, [viewport.clone()].into_iter().collect())
+                    .unwrap()
+                    .bind_pipeline_graphics(pipeline.clone())
+                    .unwrap()
+                    // Instead of binding vertex attributes, bind buffers as descriptor sets
+                    .bind_descriptor_sets(
+                        PipelineBindPoint::Graphics,
+                        pipeline.layout().clone(),
+                        0,
+                        descriptor_set.clone(),
+                    )
+                    .unwrap();
+
+                unsafe {
+                    builder.draw_mesh_tasks([cols, rows, 1]).unwrap(); // one workgroup per cell
+                }
+
+                builder.end_render_pass(Default::default()).unwrap();
+
+                let command_buffer = builder.end().unwrap();
+                let future = previous_frame_end
+                    .take()
+                    .unwrap()
+                    .join(acquire_future)
+                    .then_execute(queue.clone(), command_buffer)
+                    .unwrap()
+                    .then_swapchain_present(
+                        queue.clone(),
+                        SwapchainPresentInfo::swapchain_image_index(swapchain.clone(), image_index),
+                    )
+                    .then_signal_fence_and_flush();
+
+                match future.map_err(Validated::unwrap) {
+                    Ok(future) => {
+                        previous_frame_end = Some(future.boxed());
+                    }
+                    Err(VulkanError::OutOfDate) => {
+                        recreate_swapchain = true;
+                        previous_frame_end = Some(sync::now(device.clone()).boxed());
+                    }
+                    Err(e) => {
+                        println!("failed to flush future: {e}");
+                        previous_frame_end = Some(sync::now(device.clone()).boxed());
+                    }
+                }
+            }
+            Event::AboutToWait => window.request_redraw(),
+            _ => (),
+        }
+    })
+}
+
+/// Builds one framebuffer per image and stores the image size in `viewport`.
+///
+/// Invoked once while setting up, and once more each time the swapchain is recreated. The
+/// viewport extent is taken from the first image; every image gets a default view that is
+/// used as the only attachment of its framebuffer. Panics if `images` is empty or if view or
+/// framebuffer creation fails.
+fn window_size_dependent_setup(
+    images: &[Arc<Image>],
+    render_pass: Arc<RenderPass>,
+    viewport: &mut Viewport,
+) -> Vec<Arc<Framebuffer>> {
+    let size = images[0].extent();
+    viewport.extent = [size[0] as f32, size[1] as f32];
+
+    let mut framebuffers = Vec::with_capacity(images.len());
+    for image in images {
+        let create_info = FramebufferCreateInfo {
+            attachments: vec![ImageView::new_default(image.clone()).unwrap()],
+            ..Default::default()
+        };
+        framebuffers.push(Framebuffer::new(render_pass.clone(), create_info).unwrap());
+    }
+    framebuffers
+}
diff --git a/examples/mesh-shader/mesh.glsl b/examples/mesh-shader/mesh.glsl
new file mode 100644
index 000000000..2ef27d08d
--- /dev/null
+++ b/examples/mesh-shader/mesh.glsl
@@ -0,0 +1,97 @@
+#version 450
+#extension GL_EXT_mesh_shader : require
+
+// In mesh shaders you have to load all data manually from storage buffers, which are declared just like uniform
+// buffers, but using the `buffer` keyword. You may not use:
+// * `in`: Unlike vertex shaders, mesh shaders do not have an input assembly (IA) stage that pulls data from buffers
+//    and forwards it to the shader as `in` inputs.
+// * `uniform`: Uniform buffers have to be of constant size, but as our buffers may have a varying amount of data,
+//    they have to be storage buffers instead.
+//
+// The triangle vertex positions.
+layout(set = 0, binding = 0) buffer VertexBuffer {
+    vec2 position[];
+} buffer_vertex;
+
+// The per-instance data.
+struct Instance {
+    vec2 position_offset;
+    float scale;
+};
+
+layout(set = 0, binding = 1) buffer InstanceBuffer {
+    Instance instance[];
+} buffer_instance;
+
+// This declaration specifies the workgroup size of the mesh shader, similarly to compute shaders.
+layout(local_size_x = 1, local_size_y = 1, local_size_z = 1) in;
+// This declares the type of primitive you want to emit, typically triangles, as well as the maximum number of vertices
+// and primitives you may emit. Primitives may only be lists, i.e. triangle_strip and triangle_fan are not allowed.
+layout(triangles, max_vertices = 3, max_primitives = 1) out;
+
+// As mesh shaders may emit multiple vertices, all outputs have to be an array. See below, when vertices are emitted.
+layout(location = 0) out vec4 out_color[];
+
+const uint rows = 10; // must match `rows` in main.rs
+const uint cols = 10; // must match `cols` in main.rs
+const uint n_instances = rows * cols;
+
+void main() {
+    vec2 position_offset;
+    float scale;
+    vec4 color;
+
+    // There are two main use-cases for mesh shaders; switch between them here.
+    // They should both draw the same triangles, but with different colors.
+    const bool LOAD_FROM_INSTANCE_BUFFER = false;
+
+    if (LOAD_FROM_INSTANCE_BUFFER) {
+        // Use-case 1: load instance data from buffers, similarly to doing an instanced draw
+        // color triangles red
+        color = vec4(1.0, 0.0, 0.0, 1.0);
+        // The host fills the buffer at index `c * rows + r`; the draw is [cols, rows, 1], so c = x and r = y.
+        Instance instance = buffer_instance.instance[gl_GlobalInvocationID.x * rows + gl_GlobalInvocationID.y];
+        position_offset = instance.position_offset;
+        scale = instance.scale;
+
+    } else {
+        // Use-case 2: generate the geometry dynamically in the mesh shader
+        // color triangles green
+        color = vec4(0.0, 1.0, 0.0, 1.0);
+
+        uint c = gl_GlobalInvocationID.x;
+        uint r = gl_GlobalInvocationID.y;
+
+        // the same algo for generating the triangle data as in the instanced example
+        float half_cell_w = 0.5 / float(cols);
+        float half_cell_h = 0.5 / float(rows);
+        float x = half_cell_w + (c / float(cols)) * 2.0 - 1.0;
+        float y = half_cell_h + (r / float(rows)) * 2.0 - 1.0;
+        position_offset = vec2(x, y);
+        scale = (2.0 / float(rows)) * (c * float(rows) + r) / n_instances;
+    }
+
+    // Dynamically set the number of vertices and primitives that you are going to emit; neither may exceed the
+    // maximum declared above. From the `OpSetMeshOutputsEXT` spec:
+    // The arguments are taken from the first invocation in each workgroup. Behavior is undefined if any invocation
+    // executes this instruction more than once or under non-uniform control flow. Behavior is undefined if there is
+    // any control flow path to an output write that is not preceded by this instruction.
+    SetMeshOutputsEXT(
+        3, // vertices
+        1  // triangles = indices / 3
+    );
+
+    // emit vertex data
+    for (uint i = 0; i < 3; i++) {
+        // As we may emit multiple vertices, all outputs are arrays. You index into them using a unique vertex index
+        // within your work group. In this example the work group has the size (1, 1, 1), so each invocation can
+        // simply use the indices [0-2]. With larger work groups you will have to use the `gl_LocalInvocationID` to
+        // compute indices and make sure they are unique, so results don't get overwritten by other invocations.
+        out_color[i] = color;
+        // just like setting gl_Position in the vertex shader
+        gl_MeshVerticesEXT[i].gl_Position = vec4(buffer_vertex.position[i] * scale + position_offset, 0.0, 1.0);
+    }
+
+    // emit triangle indices
+    gl_PrimitiveTriangleIndicesEXT[0] = uvec3(0, 1, 2);
+}