diff --git a/examples/example-runner/src/main.rs b/examples/example-runner/src/main.rs
index 787afd54db..155e7161b1 100644
--- a/examples/example-runner/src/main.rs
+++ b/examples/example-runner/src/main.rs
@@ -728,7 +728,7 @@ fn main() {
             })
             .collect();
 
-        let index_buffer_data = [0u32, 1, 2];
+        let index_buffer_data = [0u32, 1, 2, 2, 1, 3]; // both triangles share one winding order
         let index_buffer_info = vk::BufferCreateInfo::builder()
             .size(std::mem::size_of_val(&index_buffer_data) as u64)
             .usage(vk::BufferUsageFlags::INDEX_BUFFER)
@@ -772,8 +772,27 @@ fn main() {
             .bind_buffer_memory(index_buffer, index_buffer_memory, 0)
             .unwrap();
 
+        let vertices = [
+            Vertex {
+                pos: [-1.0, 1.0, 0.0, 1.0],
+                color: [0.0, 1.0, 0.0, 1.0],
+            },
+            Vertex {
+                pos: [1.0, 1.0, 0.0, 1.0],
+                color: [0.0, 0.0, 1.0, 1.0],
+            },
+            Vertex {
+                pos: [-1.0, -1.0, 0.0, 1.0],
+                color: [1.0, 0.0, 0.0, 1.0],
+            },
+            Vertex {
+                pos: [1.0, -1.0, 0.0, 1.0],
+                color: [1.0, 1.0, 1.0, 1.0],
+            },
+        ];
+
         let vertex_input_buffer_info = vk::BufferCreateInfo {
-            size: 3 * std::mem::size_of::<Vertex>() as u64,
+            size: std::mem::size_of_val(&vertices) as u64, // byte size of the whole `vertices` array
             usage: vk::BufferUsageFlags::VERTEX_BUFFER,
             sharing_mode: vk::SharingMode::EXCLUSIVE,
             ..Default::default()
@@ -806,21 +825,6 @@ fn main() {
             .allocate_memory(&vertex_buffer_allocate_info, None)
             .unwrap();
 
-        let vertices = [
-            Vertex {
-                pos: [-1.0, 1.0, 0.0, 1.0],
-                color: [0.0, 1.0, 0.0, 1.0],
-            },
-            Vertex {
-                pos: [1.0, 1.0, 0.0, 1.0],
-                color: [0.0, 0.0, 1.0, 1.0],
-            },
-            Vertex {
-                pos: [0.0, -1.0, 0.0, 1.0],
-                color: [1.0, 0.0, 0.0, 1.0],
-            },
-        ];
-
         let vert_ptr = base
             .device
             .map_memory(
@@ -1061,7 +1065,7 @@ fn main() {
                         1,
                         0,
                         0,
-                        1,
+                        0,
                     );
                     // Or draw without the index buffer
                     // device.cmd_draw(draw_command_buffer, 3, 1, 0, 0);
diff --git a/examples/example-shader/src/lib.rs b/examples/example-shader/src/lib.rs
index 5b8fb2e797..ee966b4cf9 100644
--- a/examples/example-shader/src/lib.rs
+++ b/examples/example-shader/src/lib.rs
@@ -1,26 +1,183 @@
+//! Ported to Rust from https://github.com/Tw1ddle/Sky-Shader/blob/master/src/shaders/glsl/sky.fragment
+
 #![no_std]
 #![feature(register_attr)]
 #![register_attr(spirv)]
 
+use core::f32::consts::PI;
 use core::panic::PanicInfo;
-use spirv_std::{f32x4, Input, Output};
+use spirv_std::{f32x4, Input, Mat4, MathExt, Output, Vec3, Vec4};
 
-#[allow(unused_attributes)]
-#[spirv(entry = "fragment")]
-pub fn main_fs(input: Input<f32x4>, mut output: Output<f32x4>) {
-    output.store(input.load());
+const DEPOLARIZATION_FACTOR: f32 = 0.035; // read by `total_rayleigh`
+const LUMINANCE: f32 = 1.0; // feeds the `log2` scale applied before tonemapping in `sky`
+const MIE_COEFFICIENT: f32 = 0.005; // scales the result of `total_mie` in `sky`
+const MIE_DIRECTIONAL_G: f32 = 0.8; // `g` argument of `henyey_greenstein_phase`
+const MIE_K_COEFFICIENT: Vec3 = Vec3::new(0.686, 0.678, 0.666); // `k` argument of `total_mie`
+const MIE_V: f32 = 4.0; // `total_mie` raises to the power `MIE_V - 2.0`
+const MIE_ZENITH_LENGTH: f32 = 1.25e3; // numerator of `s_m` in `sky`
+const NUM_MOLECULES: f32 = 2.542e25f32; // read by `total_rayleigh`
+const PRIMARIES: Vec3 = Vec3::new(6.8e-7f32, 5.5e-7f32, 4.5e-7f32); // `lambda` for both totals
+const RAYLEIGH: f32 = 1.0; // base value of `rayleigh_coefficient` in `sky`
+const RAYLEIGH_ZENITH_LENGTH: f32 = 8.4e3; // numerator of `s_r` in `sky`
+const REFRACTIVE_INDEX: f32 = 1.0003; // read by `total_rayleigh`
+const SUN_ANGULAR_DIAMETER_DEGREES: f32 = 0.0093333; // NOTE(review): passed to `cos()` as-is — confirm units
+const SUN_INTENSITY_FACTOR: f32 = 1000.0; // multiplier in `sun_intensity`
+const SUN_INTENSITY_FALLOFF_STEEPNESS: f32 = 1.5; // divisor inside the `exp` of `sun_intensity`
+const TONEMAP_WEIGHTING: Vec3 = Vec3::splat(9.50); // tonemapped once to derive `white_scale` in `sky`
+const TURBIDITY: f32 = 2.0; // `t` argument of `total_mie`
+
+/// Cheap `acos` approximation, based on: https://seblagarde.wordpress.com/2014/12/01/inverse-trigonometric-functions-gpu-optimization-for-amd-gcn-architecture/
+fn acos_approx(v: f32) -> f32 {
+    let magnitude = v.abs();
+    // p(|v|) * sqrt(1 - |v|): the value returned for non-negative inputs.
+    let positive_branch = (-0.155972 * magnitude + 1.56467) * (1.0f32 - magnitude).sqrt();
+    // 1.0 when `v >= 0.0`, otherwise 0.0.
+    let is_non_negative = (v >= 0.0) as u32 as f32;
+
+    // Branch-free select, since an if-statement can't be used here: negative inputs get PI - p.
+    (positive_branch * is_non_negative) + ((1.0f32 - is_non_negative) * (PI - positive_branch))
+}
+
+/// Scales `x` from `edge0..edge1` to `0..1`, saturates it, and applies a smooth cubic ramp.
+fn smoothstep(edge0: f32, edge1: f32, x: f32) -> f32 {
+    let t = ((x - edge0) / (edge1 - edge0)).clamp(0.0, 1.0);
+    // Cubic Hermite polynomial 3t^2 - 2t^3.
+    t * t * (3.0 - 2.0 * t)
+}
+
+fn total_rayleigh(lambda: Vec3) -> Vec3 {
+    // Scalar numerator divided by a `Vec3` denominator built from `lambda` to the 4th power.
+    let numerator = 8.0 * PI.pow(3.0) * (REFRACTIVE_INDEX.pow(2.0) - 1.0).pow(2.0)
+        * (6.0 + 3.0 * DEPOLARIZATION_FACTOR);
+    numerator / (3.0 * NUM_MOLECULES * lambda.pow(4.0) * (6.0 - 7.0 * DEPOLARIZATION_FACTOR))
+}
+
+fn total_mie(lambda: Vec3, k: Vec3, t: f32) -> Vec3 {
+    let scale = 0.434 * (0.2 * t * 10e-18) * PI; // scalar prefactor, linear in `t`
+    scale * ((2.0 * PI) / lambda).pow(MIE_V - 2.0) * k
+}
+
+fn rayleigh_phase(cos_theta: f32) -> f32 {
+    (cos_theta.pow(2.0) + 1.0) * (3.0 / (16.0 * PI)) // (1 + cos^2) scaled by 3 / (16 * PI)
+}
+
+fn henyey_greenstein_phase(cos_theta: f32, g: f32) -> f32 { // (1 - g^2) / (1 - 2g*cos + g^2)^1.5, over 4*PI
+    ((1.0 - g.pow(2.0)) / (1.0 - 2.0 * g * cos_theta + g.pow(2.0)).pow(1.5)) * (1.0 / (4.0 * PI))
+}
+
+fn sun_intensity(zenith_angle_cos: f32) -> f32 {
+    // Earth shadow hack: the intensity reaches zero once the zenith angle hits PI / 1.95.
+    let cutoff_angle = PI / 1.95;
+    let zenith_angle = acos_approx(zenith_angle_cos);
+    // Exponential falloff in the angular distance to the cutoff.
+    let falloff = (-((cutoff_angle - zenith_angle) / SUN_INTENSITY_FALLOFF_STEEPNESS)).exp();
+    // Beyond the cutoff `1.0 - falloff` turns negative, so `max` pins the result at zero.
+    SUN_INTENSITY_FACTOR * 0.0f32.max(1.0 - falloff)
+}
+
+fn uncharted2_tonemap(w: Vec3) -> Vec3 {
+    let shoulder = Vec3::splat(0.15); // shoulder strength
+    let linear = Vec3::splat(0.50); // linear strength
+    let angle = Vec3::splat(0.10); // linear angle
+    let toe = Vec3::splat(0.20); // toe strength
+    let toe_num = Vec3::splat(0.02); // toe numerator
+    let toe_den = Vec3::splat(0.30); // toe denominator
+    let numerator = w * (shoulder * w + angle * linear) + toe * toe_num;
+    (numerator / (w * (shoulder * w + linear) + toe * toe_den)) - toe_num / toe_den
+}
+
+/// Computes the sky colour for view direction `dir` and a sun placed at `sun_position`.
+///
+/// Rayleigh and Mie scattering terms are combined with a solar disc, then tonemapped and
+/// gamma-adjusted. Only `sun_position.1` and the normalised sun direction are read from the sun.
+fn sky(dir: Vec3, sun_position: Vec3) -> Vec3 {
+    let up = Vec3::new(0.0, 1.0, 0.0);
+
+    // Scattering coefficients; `sunfade` depends only on the sun's second component.
+    let sunfade = 1.0 - (1.0 - (sun_position.1 / 450000.0).exp()).clamp(0.0, 1.0);
+    let rayleigh_coefficient = RAYLEIGH - (1.0 * (1.0 - sunfade));
+    let beta_r = total_rayleigh(PRIMARIES) * rayleigh_coefficient;
+    let beta_m = total_mie(PRIMARIES, MIE_K_COEFFICIENT, TURBIDITY) * MIE_COEFFICIENT;
+
+    // Optical length, cutoff angle at 90 to avoid singularity
+    // (`up . dir` is floored at 0.0 before the arccosine is taken).
+    let zenith_angle = acos_approx(up.dot(dir).max(0.0));
+    let denom = zenith_angle.cos() + 0.15 * (93.885 - ((zenith_angle * 180.0) / PI)).pow(-1.253);
+
+    let rayleigh_path = RAYLEIGH_ZENITH_LENGTH / denom;
+    let mie_path = MIE_ZENITH_LENGTH / denom;
+
+    // Combined extinction factor
+    let extinction = (-(beta_r * rayleigh_path + beta_m * mie_path)).exp();
+
+    // In-scattering
+    let sun_direction = sun_position.normalize();
+    let cos_theta = dir.dot(sun_direction);
+    let beta_r_theta = beta_r * rayleigh_phase(cos_theta * 0.5 + 0.5);
+    let beta_m_theta = beta_m * henyey_greenstein_phase(cos_theta, MIE_DIRECTIONAL_G);
+    let sun_e = sun_intensity(sun_direction.dot(up));
+
+    // `scattered` is the factor both in-scattering terms share; it is computed once here.
+    let scattered = sun_e * ((beta_r_theta + beta_m_theta) / (beta_r + beta_m));
+    let blend = ((1.0 - up.dot(sun_direction)).pow(5.0)).clamp(0.0, 1.0);
+    let mut lin = (scattered * (Vec3::splat(1.0) - extinction)).pow(1.5);
+    lin *= Vec3::splat(1.0).lerp((scattered * extinction).pow(0.5), blend);
+
+    // Composition + solar disc
+    let disc_edge = SUN_ANGULAR_DIAMETER_DEGREES.cos();
+    let sundisk = smoothstep(disc_edge, disc_edge + 0.00002, cos_theta);
+    let mut l0 = 0.1 * extinction;
+    l0 += sun_e * 19000.0 * extinction * sundisk;
+    let mut tex_color = lin + l0;
+    tex_color *= Vec3::splat(0.04);
+    tex_color += Vec3::new(0.0, 0.001, 0.0025) * 0.3;
+
+    // Tonemapping
+    let white_scale = 1.0 / uncharted2_tonemap(TONEMAP_WEIGHTING);
+    let exposure = (2.0 / LUMINANCE.pow(4.0)).log2();
+    let color = uncharted2_tonemap(exposure * tex_color) * white_scale;
+
+    // Gamma
+    color.pow(1.0 / (1.2 + (1.2 * sunfade)))
+}
+
+#[spirv(entry = "fragment")]
+pub fn main_fs(input: Input<Vec4>, mut output: Output<Vec4>) {
+    // Only the first two components of the input are used, as clip-space x/y.
+    let frag = input.load();
+
+    // hard-code information because we can't bind buffers at the moment
+    let eye_pos = Vec3(0.0, 0.0997, 0.2);
+    let sun_pos = Vec3::new(0.0, 75.0, -1000.0);
+    let clip_to_world = Mat4 {
+        x_axis: Vec4(-0.5522849, 0.0, 0.0, 0.0),
+        y_axis: Vec4(0.0, 0.4096309, -0.061444636, 0.0),
+        z_axis: Vec4(0.0, 99.99999, 199.99998, 999.99994),
+        w_axis: Vec4(0.0, -0.14834046, -0.98893654, 0.0),
+    };
+
+    // Negate y, use z = w = 1.0, transform, then divide by the resulting fourth component.
+    let cs_pos = Vec4(frag.0, -frag.1, 1.0, 1.0);
+    let ws_hom = clip_to_world.mul_vec4(cs_pos);
+    let ws_pos = Vec3(
+        ws_hom.0 / ws_hom.3,
+        ws_hom.1 / ws_hom.3,
+        ws_hom.2 / ws_hom.3,
+    );
+    let dir = (ws_pos - eye_pos).normalize();
+
+    output.store(sky(dir, sun_pos).extend(0.0)) // NOTE(review): extended with 0.0 — confirm intended alpha
+}
 
-#[allow(unused_attributes)]
 #[spirv(entry = "vertex")]
 pub fn main_vs(
-    in_pos: Input<f32x4>,
-    in_color: Input<f32x4>,
-    #[spirv(builtin = "position")] mut out_pos: Output<f32x4>,
-    mut out_color: Output<f32x4>,
+    in_pos: Input<Vec4>,
+    in_color: Input<Vec4>, // not read in the body below
+    #[spirv(builtin = "position")] mut out_pos: Output<Vec4>,
+    mut out_color: Output<Vec4>,
 ) {
     out_pos.store(in_pos.load());
-    out_color.store(in_color.load());
+    out_color.store(in_pos.load()); // writes the position, not `in_color`, to the second output
 }
 
 #[panic_handler]
diff --git a/spirv-std/src/lib.rs b/spirv-std/src/lib.rs
index c835d32c57..8a644b8d5a 100644
--- a/spirv-std/src/lib.rs
+++ b/spirv-std/src/lib.rs
@@ -1,7 +1,11 @@
 #![no_std]
-#![feature(register_attr, repr_simd)]
+#![feature(register_attr, repr_simd, core_intrinsics)]
 #![register_attr(spirv)]
 
+pub mod math;
+pub use crate::math::MathExt;
+pub use crate::math::*;
+
 macro_rules! pointer_addrspace_write {
     (false) => {};
     (true) => {
diff --git a/spirv-std/src/math/mat2.rs b/spirv-std/src/math/mat2.rs
new file mode 100644
index 0000000000..44670a1b29
--- /dev/null
+++ b/spirv-std/src/math/mat2.rs
@@ -0,0 +1,242 @@
+use super::{Vec2, Vec4};
+use core::ops::{Add, Mul, Sub};
+
+/// Creates a `Mat2` whose columns are `x_axis` followed by `y_axis`.
+#[inline]
+pub fn mat2(x_axis: Vec2, y_axis: Vec2) -> Mat2 {
+    Mat2(Vec4::new(x_axis.x(), x_axis.y(), y_axis.x(), y_axis.y()))
+}
+
+/// A 2x2 column major matrix, stored in one `Vec4` as `(x_axis.x, x_axis.y, y_axis.x, y_axis.y)`.
+#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
+pub struct Mat2(pub Vec4);
+
+impl Default for Mat2 {
+    #[inline]
+    fn default() -> Mat2 {
+        Mat2::identity() // the default matrix is the identity, not all zeros
+    }
+}
+
+impl Mat2 {
+    /// Creates a 2x2 matrix with all elements set to `0.0`.
+    #[inline]
+    pub const fn zero() -> Self {
+        Self(Vec4::zero())
+    }
+
+    /// Creates a 2x2 identity matrix.
+    #[inline]
+    pub const fn identity() -> Self {
+        Self(Vec4::new(1.0, 0.0, 0.0, 1.0))
+    }
+
+    /// Creates a 2x2 matrix from two column vectors, packed as `x_axis` then `y_axis`.
+    #[inline]
+    pub fn from_cols(x_axis: Vec2, y_axis: Vec2) -> Self {
+        Self(Vec4::new(x_axis.x(), x_axis.y(), y_axis.x(), y_axis.y()))
+    }
+
+    /// Creates a 2x2 matrix from a `[f32; 4]` stored in column major order.  If
+    /// your data is stored in row major you will need to `transpose` the
+    /// returned matrix.
+    #[inline]
+    pub fn from_cols_array(m: &[f32; 4]) -> Self {
+        Mat2(Vec4::new(m[0], m[1], m[2], m[3]))
+    }
+
+    /// Creates a `[f32; 4]` storing data in column major order.
+    /// If you require data in row major order `transpose` the matrix first.
+    #[inline]
+    pub fn to_cols_array(&self) -> [f32; 4] {
+        self.0.into()
+    }
+
+    /// Creates a 2x2 matrix from a `[[f32; 2]; 2]` stored in column major
+    /// order.  If your data is in row major order you will need to `transpose`
+    /// the returned matrix.
+    #[inline]
+    pub fn from_cols_array_2d(m: &[[f32; 2]; 2]) -> Self {
+        Mat2(Vec4::new(m[0][0], m[0][1], m[1][0], m[1][1]))
+    }
+
+    /// Creates a `[[f32; 2]; 2]` storing data in column major order.
+    /// If you require data in row major order `transpose` the matrix first.
+    #[inline]
+    pub fn to_cols_array_2d(&self) -> [[f32; 2]; 2] {
+        let (x0, y0, x1, y1) = self.0.into();
+        [[x0, y0], [x1, y1]]
+    }
+
+    /// Creates a 2x2 matrix containing the given non-uniform `scale`.
+    #[inline]
+    pub fn from_scale(scale: Vec2) -> Self {
+        let (x, y) = scale.into();
+        Self(Vec4::new(x, 0.0, 0.0, y))
+    }
+
+    /// Sets the first column, the `x` axis.
+    #[inline]
+    pub fn set_x_axis(&mut self, x: Vec2) {
+        (self.0).0 = x.x();
+        (self.0).1 = x.y();
+    }
+
+    /// Sets the second column, the `y` axis.
+    #[inline]
+    pub fn set_y_axis(&mut self, y: Vec2) {
+        (self.0).2 = y.x();
+        (self.0).3 = y.y();
+    }
+
+    /// Returns the first column, the `x` axis.
+    #[inline]
+    pub fn x_axis(&self) -> Vec2 {
+        let (x, y, _, _) = self.0.into();
+        Vec2::new(x, y)
+    }
+
+    /// Returns the second column, the `y` axis.
+    #[inline]
+    pub fn y_axis(&self) -> Vec2 {
+        let (_, _, x, y) = self.0.into();
+        Vec2::new(x, y)
+    }
+
+    // #[inline]
+    // pub(crate) fn col(&self, index: usize) -> Vec2 {
+    //     match index {
+    //         0 => self.x_axis(),
+    //         1 => self.y_axis(),
+    //         _ => panic!(
+    //             "index out of bounds: the len is 2 but the index is {}",
+    //             index
+    //         ),
+    //     }
+    // }
+
+    // #[inline]
+    // pub(crate) fn col_mut(&mut self, index: usize) -> &mut Vec2 {
+    //     match index {
+    //         0 => unsafe { &mut *(self.0.as_mut().as_mut_ptr() as *mut Vec2) },
+    //         1 => unsafe { &mut *(self.0.as_mut()[2..].as_mut_ptr() as *mut Vec2) },
+    //         _ => panic!(
+    //             "index out of bounds: the len is 2 but the index is {}",
+    //             index
+    //         ),
+    //     }
+    // }
+
+    /// Returns the transpose of `self`.
+    #[inline]
+    pub fn transpose(&self) -> Self {
+        let (m00, m01, m10, m11) = self.0.into();
+        Self(Vec4::new(m00, m10, m01, m11))
+    }
+
+    /// Returns the determinant of `self`.
+    #[inline]
+    pub fn determinant(&self) -> f32 {
+        let (a, b, c, d) = self.0.into();
+        a * d - b * c
+    }
+
+    /// Returns the inverse of `self`.
+    ///
+    /// If the matrix is not invertible the returned matrix will be invalid; the determinant is not checked.
+    #[inline]
+    pub fn inverse(&self) -> Self {
+        let (a, b, c, d) = self.0.into();
+        let det = a * d - b * c;
+        let tmp = Vec4::new(1.0, -1.0, -1.0, 1.0) / det;
+        Self(Vec4::new(d, b, c, a) * tmp)
+    }
+
+    /// Transforms a `Vec2`.
+    #[inline]
+    pub fn mul_vec2(&self, other: Vec2) -> Vec2 {
+        // TODO: SSE2. `other` is expanded to (x, x, y, y) so that a single multiply covers both columns.
+        let other = Vec4::new(other.x(), other.x(), other.y(), other.y());
+        let tmp = self.0 * other;
+        let (x0, y0, x1, y1) = tmp.into();
+        Vec2::new(x0 + x1, y0 + y1)
+    }
+
+    /// Multiplies two 2x2 matrices.
+    #[inline]
+    pub fn mul_mat2(&self, other: &Self) -> Self {
+        // TODO: SSE2. Each result column is `self` applied to the matching column of `other`.
+        let (x0, y0, x1, y1) = other.0.into();
+        Mat2::from_cols(
+            self.mul_vec2(Vec2::new(x0, y0)),
+            self.mul_vec2(Vec2::new(x1, y1)),
+        )
+    }
+
+    /// Adds two 2x2 matrices.
+    #[inline]
+    pub fn add_mat2(&self, other: &Self) -> Self {
+        Mat2(self.0 + other.0)
+    }
+
+    /// Subtracts two 2x2 matrices.
+    #[inline]
+    pub fn sub_mat2(&self, other: &Self) -> Self {
+        Mat2(self.0 - other.0)
+    }
+
+    /// Multiplies a 2x2 matrix by a scalar.
+    #[inline]
+    pub fn mul_scalar(&self, other: f32) -> Self {
+        let s = Vec4::splat(other);
+        Mat2(self.0 * s)
+    }
+}
+
+impl Add<Mat2> for Mat2 {
+    type Output = Mat2;
+    #[inline]
+    fn add(self, rhs: Mat2) -> Mat2 {
+        Mat2::add_mat2(&self, &rhs) // `add_mat2` performs the actual sum
+    }
+}
+
+impl Sub<Mat2> for Mat2 {
+    type Output = Mat2;
+    #[inline]
+    fn sub(self, rhs: Mat2) -> Mat2 {
+        Mat2::sub_mat2(&self, &rhs) // `sub_mat2` performs the actual difference
+    }
+}
+
+impl Mul<Mat2> for Mat2 {
+    type Output = Mat2;
+    #[inline]
+    fn mul(self, rhs: Mat2) -> Mat2 {
+        Mat2::mul_mat2(&self, &rhs) // `self` is the left operand of the product
+    }
+}
+
+impl Mul<Vec2> for Mat2 {
+    type Output = Vec2;
+    #[inline]
+    fn mul(self, rhs: Vec2) -> Self::Output {
+        Mat2::mul_vec2(&self, rhs) // operator form of `mul_vec2`
+    }
+}
+
+impl Mul<Mat2> for f32 {
+    type Output = Mat2;
+    #[inline]
+    fn mul(self, rhs: Mat2) -> Self::Output {
+        Mat2::mul_scalar(&rhs, self) // scalar on the left; same call as `Mat2 * f32`
+    }
+}
+
+impl Mul<f32> for Mat2 {
+    type Output = Mat2;
+    #[inline]
+    fn mul(self, rhs: f32) -> Mat2 {
+        Mat2::mul_scalar(&self, rhs) // operator form of `mul_scalar`
+    }
+}
diff --git a/spirv-std/src/math/mat3.rs b/spirv-std/src/math/mat3.rs
new file mode 100644
index 0000000000..bd822e517a
--- /dev/null
+++ b/spirv-std/src/math/mat3.rs
@@ -0,0 +1,320 @@
+use super::Vec3;
+use core::ops::{Add, Mul, Sub};
+
+/// Convenience constructor for [`Mat3`].
+///
+/// The arguments become the matrix columns in the order given, exactly as
+/// `Mat3::from_cols` does; the vectors are stored without modification.
+#[inline]
+pub fn mat3(x_axis: Vec3, y_axis: Vec3, z_axis: Vec3) -> Mat3 {
+    // `from_cols` stores each argument in the field of the same name.
+    Mat3::from_cols(x_axis, y_axis, z_axis)
+}
+
+/// A 3x3 column major matrix, held as three `Vec3` columns.
+#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
+pub struct Mat3 {
+    pub x_axis: Vec3, // first column
+    pub y_axis: Vec3, // second column
+    pub z_axis: Vec3, // third column
+}
+
+impl Default for Mat3 {
+    #[inline]
+    fn default() -> Mat3 {
+        Mat3::identity() // the default matrix is the identity, not all zeros
+    }
+}
+
+impl Mat3 {
+    /// Creates a 3x3 matrix with all elements set to `0.0`.
+    #[inline]
+    pub const fn zero() -> Self {
+        Self {
+            x_axis: Vec3::zero(),
+            y_axis: Vec3::zero(),
+            z_axis: Vec3::zero(),
+        }
+    }
+
+    /// Creates a 3x3 identity matrix.
+    #[inline]
+    pub const fn identity() -> Self {
+        Self {
+            x_axis: Vec3::new(1.0, 0.0, 0.0),
+            y_axis: Vec3::new(0.0, 1.0, 0.0),
+            z_axis: Vec3::new(0.0, 0.0, 1.0),
+        }
+    }
+
+    /// Creates a 3x3 matrix from three column vectors.
+    #[inline]
+    pub fn from_cols(x_axis: Vec3, y_axis: Vec3, z_axis: Vec3) -> Self {
+        Self {
+            x_axis,
+            y_axis,
+            z_axis,
+        }
+    }
+
+    /// Creates a 3x3 matrix from a `[f32; 9]` stored in column major order.
+    /// If your data is stored in row major you will need to `transpose` the
+    /// returned matrix.
+    #[inline]
+    pub fn from_cols_array(m: &[f32; 9]) -> Self {
+        Mat3 {
+            x_axis: Vec3::new(m[0], m[1], m[2]),
+            y_axis: Vec3::new(m[3], m[4], m[5]),
+            z_axis: Vec3::new(m[6], m[7], m[8]),
+        }
+    }
+
+    /// Creates a `[f32; 9]` storing data in column major order.
+    /// If you require data in row major order `transpose` the matrix first.
+    #[inline]
+    pub fn to_cols_array(&self) -> [f32; 9] {
+        let (m00, m01, m02) = self.x_axis.into();
+        let (m10, m11, m12) = self.y_axis.into();
+        let (m20, m21, m22) = self.z_axis.into();
+        [m00, m01, m02, m10, m11, m12, m20, m21, m22]
+    }
+
+    /// Creates a 3x3 matrix from a `[[f32; 3]; 3]` stored in column major order.
+    /// If your data is in row major order you will need to `transpose` the
+    /// returned matrix.
+    #[inline]
+    pub fn from_cols_array_2d(m: &[[f32; 3]; 3]) -> Self {
+        Mat3 {
+            x_axis: m[0].into(),
+            y_axis: m[1].into(),
+            z_axis: m[2].into(),
+        }
+    }
+
+    /// Creates a `[[f32; 3]; 3]` storing data in column major order.
+    /// If you require data in row major order `transpose` the matrix first.
+    #[inline]
+    pub fn to_cols_array_2d(&self) -> [[f32; 3]; 3] {
+        [self.x_axis.into(), self.y_axis.into(), self.z_axis.into()]
+    }
+
+    /// Creates a 3x3 non-uniform scale matrix.
+    #[inline]
+    pub fn from_scale(scale: Vec3) -> Self {
+        // TODO: should have an affine 2D scale and a 3D scale?
+        // NOTE(review): no check is performed here; every component is stored as given.
+        let (x, y, z) = scale.into();
+        Self {
+            x_axis: Vec3::new(x, 0.0, 0.0),
+            y_axis: Vec3::new(0.0, y, 0.0),
+            z_axis: Vec3::new(0.0, 0.0, z),
+        }
+    }
+
+    /// Sets the first column, the `x` axis.
+    #[inline]
+    pub fn set_x_axis(&mut self, x: Vec3) {
+        self.x_axis = x;
+    }
+
+    /// Sets the second column, the `y` axis.
+    #[inline]
+    pub fn set_y_axis(&mut self, y: Vec3) {
+        self.y_axis = y;
+    }
+
+    /// Sets the third column, the `z` axis.
+    #[inline]
+    pub fn set_z_axis(&mut self, z: Vec3) {
+        self.z_axis = z;
+    }
+
+    /// Returns the first column, the `x` axis.
+    #[inline]
+    pub fn x_axis(&self) -> Vec3 {
+        self.x_axis
+    }
+
+    /// Returns the second column, the `y` axis.
+    #[inline]
+    pub fn y_axis(&self) -> Vec3 {
+        self.y_axis
+    }
+
+    /// Returns the third column, the `z` axis.
+    #[inline]
+    pub fn z_axis(&self) -> Vec3 {
+        self.z_axis
+    }
+
+    /// Returns a mutable reference to the first column, the `x` axis.
+    #[inline]
+    pub fn x_axis_mut(&mut self) -> &mut Vec3 {
+        &mut self.x_axis
+    }
+
+    /// Returns a mutable reference to the second column, the `y` axis.
+    #[inline]
+    pub fn y_axis_mut(&mut self) -> &mut Vec3 {
+        &mut self.y_axis
+    }
+
+    /// Returns a mutable reference to the third column, the `z` axis.
+    #[inline]
+    pub fn z_axis_mut(&mut self) -> &mut Vec3 {
+        &mut self.z_axis
+    }
+
+    // #[inline]
+    // pub(crate) fn col(&self, index: usize) -> Vec3 {
+    //     match index {
+    //         0 => self.x_axis,
+    //         1 => self.y_axis,
+    //         2 => self.z_axis,
+    //         _ => panic!(
+    //             "index out of bounds: the len is 3 but the index is {}",
+    //             index
+    //         ),
+    //     }
+    // }
+
+    // #[inline]
+    // pub(crate) fn col_mut(&mut self, index: usize) -> &mut Vec3 {
+    //     match index {
+    //         0 => &mut self.x_axis,
+    //         1 => &mut self.y_axis,
+    //         2 => &mut self.z_axis,
+    //         _ => panic!(
+    //             "index out of bounds: the len is 3 but the index is {}",
+    //             index
+    //         ),
+    //     }
+    // }
+
+    /// Returns the transpose of `self`.
+    #[inline]
+    pub fn transpose(&self) -> Self {
+        Self {
+            x_axis: Vec3::new(self.x_axis.0, self.y_axis.0, self.z_axis.0),
+            y_axis: Vec3::new(self.x_axis.1, self.y_axis.1, self.z_axis.1),
+            z_axis: Vec3::new(self.x_axis.2, self.y_axis.2, self.z_axis.2),
+        }
+    }
+
+    /// Returns the determinant of `self`.
+    #[inline]
+    pub fn determinant(&self) -> f32 {
+        self.z_axis.dot(self.x_axis.cross(self.y_axis))
+    }
+
+    /// Returns the inverse of `self`.
+    ///
+    /// If the matrix is not invertible the returned matrix will be invalid; the determinant is not checked.
+    pub fn inverse(&self) -> Self {
+        let tmp0 = self.y_axis.cross(self.z_axis);
+        let tmp1 = self.z_axis.cross(self.x_axis);
+        let tmp2 = self.x_axis.cross(self.y_axis);
+        let det = self.z_axis.dot_as_vec3(tmp2); // presumably the dot product in every lane — confirm
+        let inv_det = det.recip();
+        // TODO: Work out if it's possible to get rid of the transpose
+        Mat3::from_cols(tmp0 * inv_det, tmp1 * inv_det, tmp2 * inv_det).transpose()
+    }
+
+    /// Multiplies two 3x3 matrices.
+    #[inline]
+    pub fn mul_mat3(&self, other: &Self) -> Self {
+        Self {
+            x_axis: self.mul_vec3(other.x_axis),
+            y_axis: self.mul_vec3(other.y_axis),
+            z_axis: self.mul_vec3(other.z_axis),
+        }
+    }
+
+    /// Adds two 3x3 matrices.
+    #[inline]
+    pub fn add_mat3(&self, other: &Self) -> Self {
+        Self {
+            x_axis: self.x_axis + other.x_axis,
+            y_axis: self.y_axis + other.y_axis,
+            z_axis: self.z_axis + other.z_axis,
+        }
+    }
+
+    /// Subtracts two 3x3 matrices.
+    #[inline]
+    pub fn sub_mat3(&self, other: &Self) -> Self {
+        Self {
+            x_axis: self.x_axis - other.x_axis,
+            y_axis: self.y_axis - other.y_axis,
+            z_axis: self.z_axis - other.z_axis,
+        }
+    }
+
+    /// Transforms a `Vec3`.
+    #[inline]
+    pub fn mul_vec3(&self, other: Vec3) -> Vec3 {
+        let mut res = self.x_axis * Vec3::splat(other.x());
+        res = self.y_axis.mul_add(Vec3::splat(other.y()), res); // presumably y_axis * y + res — confirm `mul_add`
+        res = self.z_axis.mul_add(Vec3::splat(other.z()), res);
+        res
+    }
+
+    /// Multiplies a 3x3 matrix by a scalar.
+    #[inline]
+    pub fn mul_scalar(&self, other: f32) -> Self {
+        let s = Vec3::splat(other);
+        Self {
+            x_axis: self.x_axis * s,
+            y_axis: self.y_axis * s,
+            z_axis: self.z_axis * s,
+        }
+    }
+}
+
+impl Add<Mat3> for Mat3 {
+    type Output = Mat3;
+    #[inline]
+    fn add(self, rhs: Mat3) -> Mat3 {
+        Mat3::add_mat3(&self, &rhs) // `add_mat3` performs the actual sum
+    }
+}
+
+impl Sub<Mat3> for Mat3 {
+    type Output = Mat3;
+    #[inline]
+    fn sub(self, rhs: Mat3) -> Mat3 {
+        Mat3::sub_mat3(&self, &rhs) // `sub_mat3` performs the actual difference
+    }
+}
+
+impl Mul<Mat3> for Mat3 {
+    type Output = Mat3;
+    #[inline]
+    fn mul(self, rhs: Mat3) -> Mat3 {
+        Mat3::mul_mat3(&self, &rhs) // `self` is the left operand of the product
+    }
+}
+
+impl Mul<Vec3> for Mat3 {
+    type Output = Vec3;
+    #[inline]
+    fn mul(self, rhs: Vec3) -> Self::Output {
+        Mat3::mul_vec3(&self, rhs) // operator form of `mul_vec3`
+    }
+}
+
+impl Mul<Mat3> for f32 {
+    type Output = Mat3;
+    #[inline]
+    fn mul(self, rhs: Mat3) -> Self::Output {
+        Mat3::mul_scalar(&rhs, self) // scalar on the left; same call as `Mat3 * f32`
+    }
+}
+
+impl Mul<f32> for Mat3 {
+    type Output = Mat3;
+    #[inline]
+    fn mul(self, rhs: f32) -> Mat3 {
+        Mat3::mul_scalar(&self, rhs) // operator form of `mul_scalar`
+    }
+}
diff --git a/spirv-std/src/math/mat4.rs b/spirv-std/src/math/mat4.rs
new file mode 100644
index 0000000000..a96b2f287e
--- /dev/null
+++ b/spirv-std/src/math/mat4.rs
@@ -0,0 +1,464 @@
+use super::{Vec3, Vec4};
+use core::ops::{Add, Mul, Sub};
+
+/// Builds a `Mat4` whose columns are `x_axis`, `y_axis`, `z_axis` and
+/// `w_axis`, in that order.
+#[inline]
+pub fn mat4(x_axis: Vec4, y_axis: Vec4, z_axis: Vec4, w_axis: Vec4) -> Mat4 {
+    Mat4::from_cols(x_axis, y_axis, z_axis, w_axis)
+}
+
+/// A 4x4 column major matrix.
+///
+/// Each field stores one column of the matrix as a `Vec4`.
+///
+/// This type is 16 byte aligned.
+// NOTE(review): no explicit `repr` is declared on this struct, so the
+// alignment claim above depends entirely on how `Vec4` is declared — confirm
+// against `vec4.rs`. The `AsRef<[f32; 16]>` / `AsMut<[f32; 16]>` impls in this
+// file additionally assume the four columns are stored contiguously in
+// declaration order.
+#[derive(Clone, Copy, PartialEq, PartialOrd, Debug)]
+pub struct Mat4 {
+    /// First column of the matrix.
+    pub x_axis: Vec4,
+    /// Second column of the matrix.
+    pub y_axis: Vec4,
+    /// Third column of the matrix.
+    pub z_axis: Vec4,
+    /// Fourth column of the matrix.
+    pub w_axis: Vec4,
+}
+
+impl Default for Mat4 {
+    /// The default `Mat4` is the identity matrix.
+    #[inline]
+    fn default() -> Self {
+        Mat4::identity()
+    }
+}
+
+impl Mat4 {
+    /// Creates a 4x4 matrix in which every element is `0.0`.
+    #[inline]
+    pub const fn zero() -> Self {
+        // All four columns are the same all-zero vector.
+        let column = Vec4::zero();
+        Self {
+            x_axis: column,
+            y_axis: column,
+            z_axis: column,
+            w_axis: column,
+        }
+    }
+
+    /// Creates the 4x4 identity matrix: `1.0` on the diagonal, `0.0` elsewhere.
+    #[inline]
+    pub const fn identity() -> Self {
+        let x_axis = Vec4::new(1.0, 0.0, 0.0, 0.0);
+        let y_axis = Vec4::new(0.0, 1.0, 0.0, 0.0);
+        let z_axis = Vec4::new(0.0, 0.0, 1.0, 0.0);
+        let w_axis = Vec4::new(0.0, 0.0, 0.0, 1.0);
+        Self {
+            x_axis,
+            y_axis,
+            z_axis,
+            w_axis,
+        }
+    }
+
+    /// Assembles a 4x4 matrix from its four columns, given in column order.
+    #[inline]
+    pub fn from_cols(x_axis: Vec4, y_axis: Vec4, z_axis: Vec4, w_axis: Vec4) -> Self {
+        Mat4 {
+            x_axis,
+            y_axis,
+            z_axis,
+            w_axis,
+        }
+    }
+
+    /// Creates a 4x4 matrix from 16 floats stored in column major order:
+    /// elements `0..4` form the first column, `4..8` the second, and so on.
+    ///
+    /// Row major input must be `transpose`d after conversion.
+    #[inline]
+    pub fn from_cols_array(m: &[f32; 16]) -> Self {
+        let [x0, x1, x2, x3, y0, y1, y2, y3, z0, z1, z2, z3, w0, w1, w2, w3] = *m;
+        Self {
+            x_axis: Vec4::new(x0, x1, x2, x3),
+            y_axis: Vec4::new(y0, y1, y2, y3),
+            z_axis: Vec4::new(z0, z1, z2, z3),
+            w_axis: Vec4::new(w0, w1, w2, w3),
+        }
+    }
+
+    /// Creates a `[f32; 16]` storing data in column major order.
+    /// If you require data in row major order `transpose` the matrix first.
+    ///
+    /// The array is assembled element by element from the four columns rather
+    /// than by reinterpreting `self` in memory, so the result does not depend
+    /// on the (unspecified) field layout of `Mat4`.
+    #[inline]
+    pub fn to_cols_array(&self) -> [f32; 16] {
+        // `mCR` is element `R` of column `C`.
+        let (m00, m01, m02, m03) = self.x_axis.into();
+        let (m10, m11, m12, m13) = self.y_axis.into();
+        let (m20, m21, m22, m23) = self.z_axis.into();
+        let (m30, m31, m32, m33) = self.w_axis.into();
+
+        [
+            m00, m01, m02, m03, m10, m11, m12, m13, m20, m21, m22, m23, m30, m31, m32, m33,
+        ]
+    }
+
+    /// Creates a 4x4 matrix from four `[f32; 4]` columns (column major order).
+    ///
+    /// Row major input must be `transpose`d after conversion.
+    #[inline]
+    pub fn from_cols_array_2d(m: &[[f32; 4]; 4]) -> Self {
+        let [x_col, y_col, z_col, w_col] = *m;
+        Self {
+            x_axis: x_col.into(),
+            y_axis: y_col.into(),
+            z_axis: z_col.into(),
+            w_axis: w_col.into(),
+        }
+    }
+
+    /// Converts the matrix into four `[f32; 4]` columns (column major order).
+    ///
+    /// `transpose` the matrix first if row major data is required.
+    #[inline]
+    pub fn to_cols_array_2d(&self) -> [[f32; 4]; 4] {
+        let x_col: [f32; 4] = self.x_axis.into();
+        let y_col: [f32; 4] = self.y_axis.into();
+        let z_col: [f32; 4] = self.z_axis.into();
+        let w_col: [f32; 4] = self.w_axis.into();
+        [x_col, y_col, z_col, w_col]
+    }
+
+    /// Creates a 4x4 homogeneous transformation matrix that translates by
+    /// `translation`.
+    ///
+    /// The first three columns are `Vec4::unit_x()`, `Vec4::unit_y()` and
+    /// `Vec4::unit_z()`; the fourth is `translation` extended with `1.0`.
+    #[inline]
+    pub fn from_translation(translation: Vec3) -> Self {
+        let w_axis = translation.extend(1.0);
+        Self::from_cols(Vec4::unit_x(), Vec4::unit_y(), Vec4::unit_z(), w_axis)
+    }
+
+    /// Creates a 4x4 homogeneous transformation matrix that applies the
+    /// non-uniform `scale` along the x, y and z axes.
+    ///
+    /// `scale` is not validated; its components are placed on the diagonal
+    /// as given.
+    #[inline]
+    pub fn from_scale(scale: Vec3) -> Self {
+        Self {
+            x_axis: Vec4::new(scale.x(), 0.0, 0.0, 0.0),
+            y_axis: Vec4::new(0.0, scale.y(), 0.0, 0.0),
+            z_axis: Vec4::new(0.0, 0.0, scale.z(), 0.0),
+            w_axis: Vec4::unit_w(),
+        }
+    }
+
+    /// Sets the first column, the `x` axis.
+    ///
+    /// The other three columns are left unchanged.
+    #[inline]
+    pub fn set_x_axis(&mut self, x: Vec4) {
+        self.x_axis = x;
+    }
+
+    /// Sets the second column, the `y` axis.
+    ///
+    /// The other three columns are left unchanged.
+    #[inline]
+    pub fn set_y_axis(&mut self, y: Vec4) {
+        self.y_axis = y;
+    }
+
+    /// Sets the third column, the `z` axis.
+    ///
+    /// The other three columns are left unchanged.
+    #[inline]
+    pub fn set_z_axis(&mut self, z: Vec4) {
+        self.z_axis = z;
+    }
+
+    /// Sets the fourth column, the `w` axis.
+    ///
+    /// The other three columns are left unchanged.
+    #[inline]
+    pub fn set_w_axis(&mut self, w: Vec4) {
+        self.w_axis = w;
+    }
+
+    /// Returns the first column, the `x` axis.
+    ///
+    /// The column is returned by value; modifying it does not affect `self`.
+    #[inline]
+    pub fn x_axis(&self) -> Vec4 {
+        self.x_axis
+    }
+
+    /// Returns the second column, the `y` axis.
+    ///
+    /// The column is returned by value; modifying it does not affect `self`.
+    #[inline]
+    pub fn y_axis(&self) -> Vec4 {
+        self.y_axis
+    }
+
+    /// Returns the third column, the `z` axis.
+    ///
+    /// The column is returned by value; modifying it does not affect `self`.
+    #[inline]
+    pub fn z_axis(&self) -> Vec4 {
+        self.z_axis
+    }
+
+    /// Returns the fourth column, the `w` axis.
+    ///
+    /// The column is returned by value; modifying it does not affect `self`.
+    #[inline]
+    pub fn w_axis(&self) -> Vec4 {
+        self.w_axis
+    }
+
+    /// Returns a mutable reference to the first column, the `x` axis.
+    ///
+    /// Writes through the reference modify the matrix in place.
+    #[inline]
+    pub fn x_axis_mut(&mut self) -> &mut Vec4 {
+        &mut self.x_axis
+    }
+
+    /// Returns a mutable reference to the second column, the `y` axis.
+    ///
+    /// Writes through the reference modify the matrix in place.
+    #[inline]
+    pub fn y_axis_mut(&mut self) -> &mut Vec4 {
+        &mut self.y_axis
+    }
+
+    /// Returns a mutable reference to the third column, the `z` axis.
+    ///
+    /// Writes through the reference modify the matrix in place.
+    #[inline]
+    pub fn z_axis_mut(&mut self) -> &mut Vec4 {
+        &mut self.z_axis
+    }
+
+    /// Returns a mutable reference to the fourth column, the `w` axis.
+    ///
+    /// Writes through the reference modify the matrix in place.
+    #[inline]
+    pub fn w_axis_mut(&mut self) -> &mut Vec4 {
+        &mut self.w_axis
+    }
+
+    // #[inline]
+    // pub(crate) fn col(&self, index: usize) -> Vec4 {
+    //     match index {
+    //         0 => self.x_axis,
+    //         1 => self.y_axis,
+    //         2 => self.z_axis,
+    //         3 => self.w_axis,
+    //         _ => panic!(
+    //             "index out of bounds: the len is 4 but the index is {}",
+    //             index
+    //         ),
+    //     }
+    // }
+
+    // #[inline]
+    // pub(crate) fn col_mut(&mut self, index: usize) -> &mut Vec4 {
+    //     match index {
+    //         0 => &mut self.x_axis,
+    //         1 => &mut self.y_axis,
+    //         2 => &mut self.z_axis,
+    //         3 => &mut self.w_axis,
+    //         _ => panic!(
+    //             "index out of bounds: the len is 4 but the index is {}",
+    //             index
+    //         ),
+    //     }
+    // }
+
+    /// Returns the transpose of `self`: column `i` of the result is built
+    /// from element `i` of every column of `self`.
+    #[inline]
+    pub fn transpose(&self) -> Self {
+        let (c0, c1, c2, c3) = (self.x_axis, self.y_axis, self.z_axis, self.w_axis);
+
+        Self {
+            x_axis: Vec4::new(c0.x(), c1.x(), c2.x(), c3.x()),
+            y_axis: Vec4::new(c0.y(), c1.y(), c2.y(), c3.y()),
+            z_axis: Vec4::new(c0.z(), c1.z(), c2.z(), c3.z()),
+            w_axis: Vec4::new(c0.w(), c1.w(), c2.w(), c3.w()),
+        }
+    }
+
+    /// Returns the determinant of `self`.
+    ///
+    /// Computed by cofactor expansion over the four elements of `x_axis`,
+    /// reusing 2x2 sub-determinants built from `z_axis` and `w_axis`.
+    #[inline]
+    pub fn determinant(&self) -> f32 {
+        // `mCR` is element `R` of column `C` (`x_axis` is column 0, and so on).
+        let (m00, m01, m02, m03) = self.x_axis.into();
+        let (m10, m11, m12, m13) = self.y_axis.into();
+        let (m20, m21, m22, m23) = self.z_axis.into();
+        let (m30, m31, m32, m33) = self.w_axis.into();
+
+        // 2x2 sub-determinants taken from columns 2 and 3 (`z_axis`, `w_axis`).
+        // The first two digits of each name are the element indices used.
+        let a2323 = m22 * m33 - m23 * m32;
+        let a1323 = m21 * m33 - m23 * m31;
+        let a1223 = m21 * m32 - m22 * m31;
+        let a0323 = m20 * m33 - m23 * m30;
+        let a0223 = m20 * m32 - m22 * m30;
+        let a0123 = m20 * m31 - m21 * m30;
+
+        // Each parenthesised term is a 3x3 minor expanded over `y_axis`; the
+        // minors are combined with alternating signs and the `x_axis` elements.
+        m00 * (m11 * a2323 - m12 * a1323 + m13 * a1223)
+            - m01 * (m10 * a2323 - m12 * a0323 + m13 * a0223)
+            + m02 * (m10 * a1323 - m11 * a0323 + m13 * a0123)
+            - m03 * (m10 * a1223 - m11 * a0223 + m12 * a0123)
+    }
+
+    /// Returns the inverse of `self`.
+    ///
+    /// If the matrix is not invertible the returned matrix will be invalid.
+    ///
+    /// The inverse is computed from cofactors: the unscaled inverse is built
+    /// first and then multiplied by the reciprocal of the determinant. No check
+    /// is made for a zero determinant, so a singular input divides by zero and
+    /// yields non-finite elements.
+    pub fn inverse(&self) -> Self {
+        // `mCR` is element `R` of column `C` (`x_axis` is column 0, and so on).
+        let (m00, m01, m02, m03) = self.x_axis.into();
+        let (m10, m11, m12, m13) = self.y_axis.into();
+        let (m20, m21, m22, m23) = self.z_axis.into();
+        let (m30, m31, m32, m33) = self.w_axis.into();
+
+        // 2x2 sub-determinants shared between the cofactors below.
+        let coef00 = m22 * m33 - m32 * m23;
+        let coef02 = m12 * m33 - m32 * m13;
+        let coef03 = m12 * m23 - m22 * m13;
+
+        let coef04 = m21 * m33 - m31 * m23;
+        let coef06 = m11 * m33 - m31 * m13;
+        let coef07 = m11 * m23 - m21 * m13;
+
+        let coef08 = m21 * m32 - m31 * m22;
+        let coef10 = m11 * m32 - m31 * m12;
+        let coef11 = m11 * m22 - m21 * m12;
+
+        let coef12 = m20 * m33 - m30 * m23;
+        let coef14 = m10 * m33 - m30 * m13;
+        let coef15 = m10 * m23 - m20 * m13;
+
+        let coef16 = m20 * m32 - m30 * m22;
+        let coef18 = m10 * m32 - m30 * m12;
+        let coef19 = m10 * m22 - m20 * m12;
+
+        let coef20 = m20 * m31 - m30 * m21;
+        let coef22 = m10 * m31 - m30 * m11;
+        let coef23 = m10 * m21 - m20 * m11;
+
+        // Pack the sub-determinants so that four cofactor terms can be
+        // evaluated per vector operation.
+        let fac0 = Vec4::new(coef00, coef00, coef02, coef03);
+        let fac1 = Vec4::new(coef04, coef04, coef06, coef07);
+        let fac2 = Vec4::new(coef08, coef08, coef10, coef11);
+        let fac3 = Vec4::new(coef12, coef12, coef14, coef15);
+        let fac4 = Vec4::new(coef16, coef16, coef18, coef19);
+        let fac5 = Vec4::new(coef20, coef20, coef22, coef23);
+
+        // Matching multipliers taken from the first two columns of `self`.
+        let vec0 = Vec4::new(m10, m00, m00, m00);
+        let vec1 = Vec4::new(m11, m01, m01, m01);
+        let vec2 = Vec4::new(m12, m02, m02, m02);
+        let vec3 = Vec4::new(m13, m03, m03, m03);
+
+        // Unsigned 3x3 minors, one vector per column of the result.
+        let inv0 = vec1 * fac0 - vec2 * fac1 + vec3 * fac2;
+        let inv1 = vec0 * fac0 - vec2 * fac3 + vec3 * fac4;
+        let inv2 = vec0 * fac1 - vec1 * fac3 + vec3 * fac5;
+        let inv3 = vec0 * fac2 - vec1 * fac4 + vec2 * fac5;
+
+        // Checkerboard sign pattern that turns the minors into cofactors.
+        let sign_a = Vec4::new(1.0, -1.0, 1.0, -1.0);
+        let sign_b = Vec4::new(-1.0, 1.0, -1.0, 1.0);
+
+        // Unscaled inverse: still needs to be divided by the determinant.
+        let inverse = Self {
+            x_axis: inv0 * sign_a,
+            y_axis: inv1 * sign_b,
+            z_axis: inv2 * sign_a,
+            w_axis: inv3 * sign_b,
+        };
+
+        // First row of the unscaled inverse (element `x` of every column).
+        let col0 = Vec4::new(
+            inverse.x_axis.x(),
+            inverse.y_axis.x(),
+            inverse.z_axis.x(),
+            inverse.w_axis.x(),
+        );
+
+        // Dot product of the first column of `self` with that row; the sum is
+        // used as the determinant.
+        let dot0 = self.x_axis * col0;
+        let dot1 = dot0.x() + dot0.y() + dot0.z() + dot0.w();
+
+        // Not guarded: `dot1 == 0.0` produces a non-finite reciprocal.
+        let rcp_det = 1.0 / dot1;
+        inverse * rcp_det
+    }
+
+    /// Multiplies this matrix by the column vector `other`.
+    ///
+    /// The result is the sum of the four matrix columns, each scaled by the
+    /// matching component of `other`.
+    #[inline]
+    pub fn mul_vec4(&self, other: Vec4) -> Vec4 {
+        let x_term = self.x_axis * other.dup_x();
+        let xy_terms = self.y_axis.mul_add(other.dup_y(), x_term);
+        let xyz_terms = self.z_axis.mul_add(other.dup_z(), xy_terms);
+        self.w_axis.mul_add(other.dup_w(), xyz_terms)
+    }
+
+    /// Returns the matrix product `self * other`.
+    ///
+    /// Each column of the result is `self` applied to the matching column of
+    /// `other`.
+    #[inline]
+    pub fn mul_mat4(&self, other: &Self) -> Self {
+        let x_axis = self.mul_vec4(other.x_axis);
+        let y_axis = self.mul_vec4(other.y_axis);
+        let z_axis = self.mul_vec4(other.z_axis);
+        let w_axis = self.mul_vec4(other.w_axis);
+        Self {
+            x_axis,
+            y_axis,
+            z_axis,
+            w_axis,
+        }
+    }
+
+    /// Returns the column-wise sum `self + other` of two 4x4 matrices.
+    #[inline]
+    pub fn add_mat4(&self, other: &Self) -> Self {
+        let x_axis = self.x_axis + other.x_axis;
+        let y_axis = self.y_axis + other.y_axis;
+        let z_axis = self.z_axis + other.z_axis;
+        let w_axis = self.w_axis + other.w_axis;
+        Self {
+            x_axis,
+            y_axis,
+            z_axis,
+            w_axis,
+        }
+    }
+
+    /// Returns the column-wise difference `self - other` of two 4x4 matrices.
+    #[inline]
+    pub fn sub_mat4(&self, other: &Self) -> Self {
+        let x_axis = self.x_axis - other.x_axis;
+        let y_axis = self.y_axis - other.y_axis;
+        let z_axis = self.z_axis - other.z_axis;
+        let w_axis = self.w_axis - other.w_axis;
+        Self {
+            x_axis,
+            y_axis,
+            z_axis,
+            w_axis,
+        }
+    }
+
+    /// Scales every element of this 4x4 matrix by `other`.
+    #[inline]
+    pub fn mul_scalar(&self, other: f32) -> Self {
+        let factor = Vec4::splat(other);
+        let x_axis = self.x_axis * factor;
+        let y_axis = self.y_axis * factor;
+        let z_axis = self.z_axis * factor;
+        let w_axis = self.w_axis * factor;
+        Self {
+            x_axis,
+            y_axis,
+            z_axis,
+            w_axis,
+        }
+    }
+}
+
+/// Views a `Mat4` as a flat array of its 16 `f32` elements.
+impl AsRef<[f32; 16]> for Mat4 {
+    #[inline]
+    fn as_ref(&self) -> &[f32; 16] {
+        // NOTE(review): this cast assumes `Mat4` is laid out as 16 contiguous
+        // `f32`s in column order. `Mat4` declares no `repr`, so confirm that
+        // its layout (and that of `Vec4`) actually guarantees this.
+        unsafe { &*(self as *const Self as *const [f32; 16]) }
+    }
+}
+
+/// Views a `Mat4` as a mutable flat array of its 16 `f32` elements.
+impl AsMut<[f32; 16]> for Mat4 {
+    #[inline]
+    fn as_mut(&mut self) -> &mut [f32; 16] {
+        // NOTE(review): this cast assumes `Mat4` is laid out as 16 contiguous
+        // `f32`s in column order. `Mat4` declares no `repr`, so confirm that
+        // its layout (and that of `Vec4`) actually guarantees this.
+        unsafe { &mut *(self as *mut Self as *mut [f32; 16]) }
+    }
+}
+
+/// `Mat4 + Mat4`: forwards to `Mat4::add_mat4`.
+impl Add<Mat4> for Mat4 {
+    type Output = Self;
+    #[inline]
+    fn add(self, rhs: Self) -> Self {
+        Mat4::add_mat4(&self, &rhs)
+    }
+}
+
+/// `Mat4 - Mat4`: forwards to `Mat4::sub_mat4`.
+impl Sub<Mat4> for Mat4 {
+    type Output = Self;
+    #[inline]
+    fn sub(self, rhs: Self) -> Self {
+        Mat4::sub_mat4(&self, &rhs)
+    }
+}
+
+/// `Mat4 * Mat4`: forwards to `Mat4::mul_mat4`.
+impl Mul<Mat4> for Mat4 {
+    type Output = Self;
+    #[inline]
+    fn mul(self, rhs: Self) -> Self {
+        Mat4::mul_mat4(&self, &rhs)
+    }
+}
+
+/// `Mat4 * Vec4`: forwards to `Mat4::mul_vec4`.
+impl Mul<Vec4> for Mat4 {
+    type Output = Vec4;
+    #[inline]
+    fn mul(self, rhs: Vec4) -> Vec4 {
+        Mat4::mul_vec4(&self, rhs)
+    }
+}
+
+/// `f32 * Mat4`: scales the matrix on the right by the scalar on the left.
+impl Mul<Mat4> for f32 {
+    type Output = Mat4;
+    #[inline]
+    fn mul(self, rhs: Mat4) -> Mat4 {
+        Mat4::mul_scalar(&rhs, self)
+    }
+}
+
+/// `Mat4 * f32`: forwards to `Mat4::mul_scalar`.
+impl Mul<f32> for Mat4 {
+    type Output = Self;
+    #[inline]
+    fn mul(self, rhs: f32) -> Self {
+        Mat4::mul_scalar(&self, rhs)
+    }
+}
diff --git a/spirv-std/src/math/mod.rs b/spirv-std/src/math/mod.rs
new file mode 100644
index 0000000000..2aa8120a9c
--- /dev/null
+++ b/spirv-std/src/math/mod.rs
@@ -0,0 +1,70 @@
+//! This math library is heavily borrowed from <https://github.com/bitshifter/glam-rs>.
+//! In the future we hope to be able to use it directly!
+
+pub mod mat2;
+pub mod mat3;
+pub mod mat4;
+pub mod vec2;
+pub mod vec3;
+pub mod vec4;
+pub use mat2::*;
+pub use mat3::*;
+pub use mat4::*;
+pub use vec2::*;
+pub use vec3::*;
+pub use vec4::*;
+
+/// Scalar math operations used by the vector types in this module.
+///
+/// An implementation for `f32` is provided in this file.
+pub trait MathExt {
+    /// Raises `self` to the power `factor`.
+    fn pow(self, factor: Self) -> Self;
+    /// Returns the square root of `self`.
+    fn sqrt(self) -> Self;
+    /// Returns the base-2 logarithm of `self`.
+    fn log2(self) -> Self;
+    /// Returns the absolute value of `self`.
+    fn abs(self) -> Self;
+    /// Returns the cosine of `self`.
+    // NOTE(review): presumably `self` is in radians — confirm.
+    fn cos(self) -> Self;
+    /// Rounds `self` to the nearest integer.
+    fn round(self) -> Self;
+    /// Returns the largest integer less than or equal to `self`.
+    fn floor(self) -> Self;
+    /// Returns the smallest integer greater than or equal to `self`.
+    fn ceil(self) -> Self;
+    /// Returns `e` raised to the power `self`.
+    fn exp(self) -> Self;
+    /// Restricts `self` to the range bounded by `low` and `high`.
+    fn clamp(self, low: Self, high: Self) -> Self;
+}
+
+/// `MathExt` for `f32`, implemented on top of the `core::intrinsics` float
+/// intrinsics.
+///
+/// Every method is marked `#[inline]`, like the rest of this math module, so
+/// that these one-line wrappers can be inlined into downstream crates.
+// NOTE(review): the intrinsic calls are wrapped in `unsafe` because the
+// intrinsics are declared unsafe; they are presumably sound for every `f32`
+// input — confirm against the `core::intrinsics` documentation.
+impl MathExt for f32 {
+    /// Raises `self` to the power `factor` (`powf32`).
+    #[inline]
+    fn pow(self, factor: f32) -> f32 {
+        unsafe { core::intrinsics::powf32(self, factor) }
+    }
+
+    /// Returns the square root of `self` (`sqrtf32`).
+    #[inline]
+    fn sqrt(self) -> f32 {
+        unsafe { core::intrinsics::sqrtf32(self) }
+    }
+
+    /// Returns the base-2 logarithm of `self` (`log2f32`).
+    #[inline]
+    fn log2(self) -> f32 {
+        unsafe { core::intrinsics::log2f32(self) }
+    }
+
+    /// Returns the absolute value of `self` (`fabsf32`).
+    #[inline]
+    fn abs(self) -> f32 {
+        unsafe { core::intrinsics::fabsf32(self) }
+    }
+
+    /// Returns the cosine of `self` (`cosf32`).
+    #[inline]
+    fn cos(self) -> f32 {
+        unsafe { core::intrinsics::cosf32(self) }
+    }
+
+    /// Rounds `self` to the nearest integer (`roundf32`).
+    #[inline]
+    fn round(self) -> f32 {
+        unsafe { core::intrinsics::roundf32(self) }
+    }
+
+    /// Returns the largest integer less than or equal to `self` (`floorf32`).
+    #[inline]
+    fn floor(self) -> f32 {
+        unsafe { core::intrinsics::floorf32(self) }
+    }
+
+    /// Returns the smallest integer greater than or equal to `self` (`ceilf32`).
+    #[inline]
+    fn ceil(self) -> f32 {
+        unsafe { core::intrinsics::ceilf32(self) }
+    }
+
+    /// Returns `e` raised to the power `self` (`expf32`).
+    #[inline]
+    fn exp(self) -> f32 {
+        unsafe { core::intrinsics::expf32(self) }
+    }
+
+    /// Restricts `self` to the range bounded by `low` and `high`.
+    ///
+    /// Computed as `self.max(low).min(high)`, so `high` wins when
+    /// `low > high`.
+    #[inline]
+    fn clamp(self, low: Self, high: Self) -> f32 {
+        self.max(low).min(high)
+    }
+}
diff --git a/spirv-std/src/math/vec2.rs b/spirv-std/src/math/vec2.rs
new file mode 100644
index 0000000000..f26a4567b8
--- /dev/null
+++ b/spirv-std/src/math/vec2.rs
@@ -0,0 +1,409 @@
+use super::Vec3;
+use crate::math::MathExt;
+use core::{f32, ops::*};
+
+/// A 2-dimensional vector.
+///
+/// The two tuple fields hold the `x` and `y` components, in that order. The
+/// type is declared `#[repr(simd)]`.
+#[derive(Clone, Copy, PartialEq, PartialOrd, Debug, Default)]
+#[repr(simd)]
+pub struct Vec2(pub(crate) f32, pub(crate) f32);
+
+/// Shorthand for `Vec2::new(x, y)`.
+#[inline]
+pub fn vec2(x: f32, y: f32) -> Vec2 {
+    Vec2::new(x, y)
+}
+
+impl Vec2 {
+    /// Deprecated alias for `Vec2::recip`.
+    #[deprecated(since = "0.9.5", note = "please use `Vec2::recip` instead")]
+    #[inline(always)]
+    pub fn reciprocal(self) -> Self {
+        Self::recip(self)
+    }
+
+    /// Returns the element-wise reciprocal of `self`, i.e. `1.0/n` for each
+    /// element `n`.
+    #[inline]
+    pub fn recip(self) -> Self {
+        let x = self.0.recip();
+        let y = self.1.recip();
+        Self(x, y)
+    }
+
+    /// Linearly interpolates from `self` towards `other` by the factor `s`.
+    ///
+    /// `s == 0.0` yields `self` and `s == 1.0` yields `other`.
+    #[inline]
+    pub fn lerp(self, other: Self, s: f32) -> Self {
+        let delta = other - self;
+        self + (delta * s)
+    }
+
+    /// Creates a new `Vec2` from its `x` and `y` components.
+    ///
+    /// This and the other constructors below are `const fn`, matching the
+    /// equivalent `Vec3` constructors, so they can be used in constants.
+    #[inline]
+    pub const fn new(x: f32, y: f32) -> Vec2 {
+        Vec2(x, y)
+    }
+
+    /// Creates a `Vec2` with all elements set to `0.0`.
+    #[inline]
+    pub const fn zero() -> Vec2 {
+        Self::splat(0.0)
+    }
+
+    /// Creates a `Vec2` with all elements set to `1.0`.
+    #[inline]
+    pub const fn one() -> Vec2 {
+        Self::splat(1.0)
+    }
+
+    /// Creates a `Vec2` with values `[x: 1.0, y: 0.0]`.
+    #[inline]
+    pub const fn unit_x() -> Vec2 {
+        Self::new(1.0, 0.0)
+    }
+
+    /// Creates a `Vec2` with values `[x: 0.0, y: 1.0]`.
+    #[inline]
+    pub const fn unit_y() -> Vec2 {
+        Self::new(0.0, 1.0)
+    }
+
+    /// Creates a `Vec2` with all elements set to `v`.
+    #[inline]
+    pub const fn splat(v: f32) -> Vec2 {
+        Vec2(v, v)
+    }
+
+    /// Appends `z` to `self`, producing the `Vec3` `[x, y, z]`.
+    #[inline]
+    pub fn extend(self, z: f32) -> Vec3 {
+        let (x, y) = (self.0, self.1);
+        Vec3::new(x, y, z)
+    }
+
+    /// Returns element `x` (the first tuple field).
+    #[inline]
+    pub fn x(self) -> f32 {
+        self.0
+    }
+
+    /// Returns element `y` (the second tuple field).
+    #[inline]
+    pub fn y(self) -> f32 {
+        self.1
+    }
+
+    /// Returns a mutable reference to element `x`.
+    ///
+    /// Writes through the reference modify the vector in place.
+    #[inline]
+    pub fn x_mut(&mut self) -> &mut f32 {
+        &mut self.0
+    }
+
+    /// Returns a mutable reference to element `y`.
+    ///
+    /// Writes through the reference modify the vector in place.
+    #[inline]
+    pub fn y_mut(&mut self) -> &mut f32 {
+        &mut self.1
+    }
+
+    /// Sets element `x`, leaving `y` unchanged.
+    #[inline]
+    pub fn set_x(&mut self, x: f32) {
+        self.0 = x;
+    }
+
+    /// Sets element `y`, leaving `x` unchanged.
+    #[inline]
+    pub fn set_y(&mut self, y: f32) {
+        self.1 = y;
+    }
+
+    /// Returns the dot product of `self` and `other`: the sum of the
+    /// element-wise products.
+    #[inline]
+    pub fn dot(self, other: Vec2) -> f32 {
+        let x_product = self.0 * other.0;
+        let y_product = self.1 * other.1;
+        x_product + y_product
+    }
+
+    /// Returns the Euclidean length of `self`.
+    #[inline]
+    pub fn length(self) -> f32 {
+        self.length_squared().sqrt()
+    }
+
+    /// Returns the squared length of `self`.
+    ///
+    /// Cheaper than `Vec2::length()` because no square root is taken.
+    #[inline]
+    pub fn length_squared(self) -> f32 {
+        Self::dot(self, self)
+    }
+
+    /// Deprecated alias for `Vec2::length_recip`.
+    #[deprecated(since = "0.9.5", note = "please use `Vec2::length_recip` instead")]
+    #[inline(always)]
+    pub fn length_reciprocal(self) -> f32 {
+        Self::length_recip(self)
+    }
+
+    /// Returns `1.0 / Vec2::length()`.
+    ///
+    /// The result is only meaningful when `self` has non-zero length.
+    #[inline]
+    pub fn length_recip(self) -> f32 {
+        let len = self.length();
+        len.recip()
+    }
+
+    /// Returns `self` scaled to length 1.0.
+    ///
+    /// The result is only meaningful when `self` has non-zero length.
+    #[inline]
+    pub fn normalize(self) -> Vec2 {
+        let inv_len = self.length_recip();
+        self * inv_len
+    }
+
+    /// Returns the element-wise ("vertical") minimum of `self` and `other`:
+    /// `[x: min(x1, x2), y: min(y1, y2)]`.
+    #[inline]
+    pub fn min(self, other: Vec2) -> Vec2 {
+        let x = self.0.min(other.0);
+        let y = self.1.min(other.1);
+        Vec2(x, y)
+    }
+
+    /// Returns the element-wise ("vertical") maximum of `self` and `other`:
+    /// `[x: max(x1, x2), y: max(y1, y2)]`.
+    #[inline]
+    pub fn max(self, other: Vec2) -> Vec2 {
+        let x = self.0.max(other.0);
+        let y = self.1.max(other.1);
+        Vec2(x, y)
+    }
+
+    /// Returns the smallest element of `self` (the "horizontal" minimum),
+    /// i.e. `min(x, y)`.
+    #[inline]
+    pub fn min_element(self) -> f32 {
+        f32::min(self.0, self.1)
+    }
+
+    /// Returns the largest element of `self` (the "horizontal" maximum),
+    /// i.e. `max(x, y)`.
+    #[inline]
+    pub fn max_element(self) -> f32 {
+        f32::max(self.0, self.1)
+    }
+
+    /// Reads a `Vec2` from `slice[0]` and `slice[1]`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `slice` has fewer than two elements.
+    #[inline]
+    pub fn from_slice_unaligned(slice: &[f32]) -> Self {
+        let x = slice[0];
+        let y = slice[1];
+        Self(x, y)
+    }
+
+    /// Stores `x` in `slice[0]` and `y` in `slice[1]`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `slice` has fewer than two elements.
+    #[inline]
+    pub fn write_to_slice_unaligned(self, slice: &mut [f32]) {
+        slice[0] = self.x();
+        slice[1] = self.y();
+    }
+
+    /// Returns the element-wise absolute value of `self`.
+    #[inline]
+    pub fn abs(self) -> Self {
+        let (x, y) = (self.0, self.1);
+        Self(x.abs(), y.abs())
+    }
+
+    /// Returns `self` with each element rounded to the nearest integer.
+    /// Half-way cases are rounded away from 0.0.
+    #[inline]
+    pub fn round(self) -> Self {
+        let (x, y) = (self.0, self.1);
+        Self(x.round(), y.round())
+    }
+
+    /// Returns `self` with each element replaced by the largest integer less
+    /// than or equal to it.
+    #[inline]
+    pub fn floor(self) -> Self {
+        let (x, y) = (self.0, self.1);
+        Self(x.floor(), y.floor())
+    }
+
+    /// Returns `self` with each element raised to the power `power`.
+    #[inline]
+    pub fn pow(self, power: f32) -> Self {
+        let (x, y) = (self.0, self.1);
+        Self(x.pow(power), y.pow(power))
+    }
+
+    /// Returns `self` with `exp` applied to each element.
+    #[inline]
+    pub fn exp(self) -> Self {
+        let (x, y) = (self.0, self.1);
+        Self(x.exp(), y.exp())
+    }
+
+    /// Returns `self` with each element replaced by the smallest integer
+    /// greater than or equal to it.
+    #[inline]
+    pub fn ceil(self) -> Self {
+        let (x, y) = (self.0, self.1);
+        Self(x.ceil(), y.ceil())
+    }
+
+    /// Returns the perpendicular dot product of `self` and `other`:
+    /// `self.x * other.y - self.y * other.x`.
+    #[inline]
+    pub fn perp_dot(self, other: Vec2) -> f32 {
+        let forward = self.0 * other.1;
+        let backward = self.1 * other.0;
+        forward - backward
+    }
+}
+
+/// `Vec2 / Vec2`: element-wise division.
+impl Div<Vec2> for Vec2 {
+    type Output = Self;
+    #[inline]
+    fn div(self, rhs: Vec2) -> Self {
+        let x = self.0 / rhs.0;
+        let y = self.1 / rhs.1;
+        Vec2(x, y)
+    }
+}
+
+/// `Vec2 /= Vec2`: element-wise division in place.
+impl DivAssign<Vec2> for Vec2 {
+    #[inline]
+    fn div_assign(&mut self, rhs: Vec2) {
+        self.0 = self.0 / rhs.0;
+        self.1 = self.1 / rhs.1;
+    }
+}
+
+/// `Vec2 / f32`: divides every element by the scalar.
+impl Div<f32> for Vec2 {
+    type Output = Self;
+    #[inline]
+    fn div(self, rhs: f32) -> Self {
+        let x = self.0 / rhs;
+        let y = self.1 / rhs;
+        Vec2(x, y)
+    }
+}
+
+/// `Vec2 /= f32`: divides every element by the scalar in place.
+impl DivAssign<f32> for Vec2 {
+    #[inline]
+    fn div_assign(&mut self, rhs: f32) {
+        self.0 = self.0 / rhs;
+        self.1 = self.1 / rhs;
+    }
+}
+
+/// `f32 / Vec2`: divides the scalar by every element.
+impl Div<Vec2> for f32 {
+    type Output = Vec2;
+    #[inline]
+    fn div(self, rhs: Vec2) -> Vec2 {
+        let x = self / rhs.0;
+        let y = self / rhs.1;
+        Vec2(x, y)
+    }
+}
+
+/// `Vec2 * Vec2`: element-wise multiplication.
+impl Mul<Vec2> for Vec2 {
+    type Output = Self;
+    #[inline]
+    fn mul(self, rhs: Vec2) -> Self {
+        let x = self.0 * rhs.0;
+        let y = self.1 * rhs.1;
+        Vec2(x, y)
+    }
+}
+
+/// `Vec2 *= Vec2`: element-wise multiplication in place.
+impl MulAssign<Vec2> for Vec2 {
+    #[inline]
+    fn mul_assign(&mut self, rhs: Vec2) {
+        self.0 = self.0 * rhs.0;
+        self.1 = self.1 * rhs.1;
+    }
+}
+
+/// `Vec2 * f32`: multiplies every element by the scalar.
+impl Mul<f32> for Vec2 {
+    type Output = Self;
+    #[inline]
+    fn mul(self, rhs: f32) -> Self {
+        let x = self.0 * rhs;
+        let y = self.1 * rhs;
+        Vec2(x, y)
+    }
+}
+
+/// `Vec2 *= f32`: multiplies every element by the scalar in place.
+impl MulAssign<f32> for Vec2 {
+    #[inline]
+    fn mul_assign(&mut self, rhs: f32) {
+        self.0 = self.0 * rhs;
+        self.1 = self.1 * rhs;
+    }
+}
+
+/// `f32 * Vec2`: multiplies every element by the scalar on the left.
+impl Mul<Vec2> for f32 {
+    type Output = Vec2;
+    #[inline]
+    fn mul(self, rhs: Vec2) -> Vec2 {
+        let x = self * rhs.0;
+        let y = self * rhs.1;
+        Vec2(x, y)
+    }
+}
+
+/// `Vec2 + Vec2`: element-wise addition.
+impl Add for Vec2 {
+    type Output = Self;
+    #[inline]
+    fn add(self, rhs: Self) -> Self {
+        let x = self.0 + rhs.0;
+        let y = self.1 + rhs.1;
+        Vec2(x, y)
+    }
+}
+
+/// `Vec2 += Vec2`: element-wise addition in place.
+impl AddAssign for Vec2 {
+    #[inline]
+    fn add_assign(&mut self, rhs: Self) {
+        self.0 = self.0 + rhs.0;
+        self.1 = self.1 + rhs.1;
+    }
+}
+
+/// `Vec2 - Vec2`: element-wise subtraction.
+impl Sub for Vec2 {
+    type Output = Self;
+    #[inline]
+    fn sub(self, rhs: Vec2) -> Self {
+        let x = self.0 - rhs.0;
+        let y = self.1 - rhs.1;
+        Vec2(x, y)
+    }
+}
+
+/// `Vec2 -= Vec2`: element-wise subtraction in place.
+impl SubAssign for Vec2 {
+    #[inline]
+    fn sub_assign(&mut self, rhs: Vec2) {
+        self.0 = self.0 - rhs.0;
+        self.1 = self.1 - rhs.1;
+    }
+}
+
+/// `-Vec2`: negates every element.
+impl Neg for Vec2 {
+    type Output = Self;
+    #[inline]
+    fn neg(self) -> Self {
+        let (x, y) = (self.0, self.1);
+        Vec2(-x, -y)
+    }
+}
+
+/// Builds a `Vec2` from an `(x, y)` tuple.
+impl From<(f32, f32)> for Vec2 {
+    #[inline]
+    fn from(t: (f32, f32)) -> Self {
+        let (x, y) = t;
+        Vec2(x, y)
+    }
+}
+
+/// Converts a `Vec2` into an `(x, y)` tuple.
+impl From<Vec2> for (f32, f32) {
+    #[inline]
+    fn from(v: Vec2) -> Self {
+        let x = v.0;
+        let y = v.1;
+        (x, y)
+    }
+}
+
+/// Builds a `Vec2` from an `[x, y]` array.
+impl From<[f32; 2]> for Vec2 {
+    #[inline]
+    fn from(a: [f32; 2]) -> Self {
+        let [x, y] = a;
+        Vec2(x, y)
+    }
+}
+
+/// Converts a `Vec2` into an `[x, y]` array.
+impl From<Vec2> for [f32; 2] {
+    #[inline]
+    fn from(v: Vec2) -> Self {
+        let x = v.0;
+        let y = v.1;
+        [x, y]
+    }
+}
diff --git a/spirv-std/src/math/vec3.rs b/spirv-std/src/math/vec3.rs
new file mode 100644
index 0000000000..26ee5a4aa1
--- /dev/null
+++ b/spirv-std/src/math/vec3.rs
@@ -0,0 +1,510 @@
+use super::{Vec2, Vec4};
+use crate::math::MathExt;
+use core::ops::*;
+
+/// A 3-dimensional vector.
+///
+/// The three public tuple fields hold the `x`, `y` and `z` components, in that
+/// order. The type is declared `#[repr(simd)]`.
+#[derive(Clone, Copy, PartialEq, PartialOrd, Debug, Default)]
+#[repr(simd)]
+pub struct Vec3(pub f32, pub f32, pub f32);
+
+/// Shorthand for building a `Vec3` from its `x`, `y` and `z` components.
+#[inline]
+pub fn vec3(x: f32, y: f32, z: f32) -> Vec3 {
+    Vec3(x, y, z)
+}
+
+impl Vec3 {
+    /// Creates a new `Vec3` from its `x`, `y` and `z` components.
+    #[inline]
+    pub const fn new(x: f32, y: f32, z: f32) -> Self {
+        Vec3(x, y, z)
+    }
+
+    /// Creates a `Vec3` whose elements are all `0.0`.
+    #[inline]
+    pub const fn zero() -> Self {
+        Self(0.0, 0.0, 0.0)
+    }
+
+    /// Creates a `Vec3` whose elements are all `1.0`.
+    #[inline]
+    pub const fn one() -> Self {
+        Self(1.0, 1.0, 1.0)
+    }
+
+    /// Creates the unit vector along x: `[x: 1.0, y: 0.0, z: 0.0]`.
+    #[inline]
+    pub const fn unit_x() -> Self {
+        Self(1.0, 0.0, 0.0)
+    }
+
+    /// Creates the unit vector along y: `[x: 0.0, y: 1.0, z: 0.0]`.
+    #[inline]
+    pub const fn unit_y() -> Self {
+        Self(0.0, 1.0, 0.0)
+    }
+
+    /// Creates the unit vector along z: `[x: 0.0, y: 0.0, z: 1.0]`.
+    #[inline]
+    pub const fn unit_z() -> Self {
+        Self(0.0, 0.0, 1.0)
+    }
+
+    /// Creates a `Vec3` whose elements are all `v`.
+    #[inline]
+    pub const fn splat(v: f32) -> Self {
+        Self::new(v, v, v)
+    }
+
+    /// Appends `w` to `self`, producing the `Vec4` `[x, y, z, w]`.
+    #[inline]
+    pub fn extend(self, w: f32) -> Vec4 {
+        let (x, y, z) = (self.0, self.1, self.2);
+        Vec4::new(x, y, z, w)
+    }
+
+    /// Creates a `Vec2` from the first two elements of `self`,
+    /// dropping `z`.
+    #[inline]
+    pub fn truncate(self) -> Vec2 {
+        let (x, y) = (self.0, self.1);
+        Vec2::new(x, y)
+    }
+
+    /// Returns element `x` (the first tuple field).
+    #[inline]
+    pub fn x(self) -> f32 {
+        self.0
+    }
+
+    /// Returns element `y` (the second tuple field).
+    #[inline]
+    pub fn y(self) -> f32 {
+        self.1
+    }
+
+    /// Returns element `z` (the third tuple field).
+    #[inline]
+    pub fn z(self) -> f32 {
+        self.2
+    }
+
+    /// Returns a mutable reference to element `x`.
+    ///
+    /// Writes through the reference modify the vector in place.
+    #[inline]
+    pub fn x_mut(&mut self) -> &mut f32 {
+        &mut self.0
+    }
+
+    /// Returns a mutable reference to element `y`.
+    ///
+    /// Writes through the reference modify the vector in place.
+    #[inline]
+    pub fn y_mut(&mut self) -> &mut f32 {
+        &mut self.1
+    }
+
+    /// Returns a mutable reference to element `z`.
+    ///
+    /// Writes through the reference modify the vector in place.
+    #[inline]
+    pub fn z_mut(&mut self) -> &mut f32 {
+        &mut self.2
+    }
+
+    /// Sets element `x`, leaving the other elements unchanged.
+    #[inline]
+    pub fn set_x(&mut self, x: f32) {
+        self.0 = x;
+    }
+
+    /// Sets element `y`, leaving the other elements unchanged.
+    #[inline]
+    pub fn set_y(&mut self, y: f32) {
+        self.1 = y;
+    }
+
+    /// Sets element `z`, leaving the other elements unchanged.
+    #[inline]
+    pub fn set_z(&mut self, z: f32) {
+        self.2 = z;
+    }
+
+    /// Broadcasts element `x` of `self` into every element of a new `Vec3`.
+    #[inline]
+    #[allow(dead_code)]
+    pub(crate) fn dup_x(self) -> Self {
+        Self::splat(self.0)
+    }
+
+    /// Broadcasts element `y` of `self` into every element of a new `Vec3`.
+    #[inline]
+    #[allow(dead_code)]
+    pub(crate) fn dup_y(self) -> Self {
+        Self::splat(self.1)
+    }
+
+    /// Broadcasts element `z` of `self` into every element of a new `Vec3`.
+    #[inline]
+    #[allow(dead_code)]
+    pub(crate) fn dup_z(self) -> Self {
+        Self::splat(self.2)
+    }
+
+    /// Returns the dot product of `self` and `other`: the sum of the
+    /// element-wise products.
+    #[inline]
+    pub fn dot(self, other: Self) -> f32 {
+        let x_product = self.0 * other.0;
+        let y_product = self.1 * other.1;
+        let z_product = self.2 * other.2;
+        x_product + y_product + z_product
+    }
+
+    /// Returns a `Vec3` whose three elements all hold the dot product of
+    /// `self` and `other`.
+    #[inline]
+    #[allow(dead_code)]
+    pub(crate) fn dot_as_vec3(self, other: Self) -> Self {
+        Self::splat(self.dot(other))
+    }
+
+    /// Returns the cross product `self x other`.
+    #[inline]
+    pub fn cross(self, other: Self) -> Self {
+        let (ax, ay, az) = (self.0, self.1, self.2);
+        let (bx, by, bz) = (other.0, other.1, other.2);
+        Self(
+            ay * bz - by * az,
+            az * bx - bz * ax,
+            ax * by - bx * ay,
+        )
+    }
+
+    /// Returns the Euclidean length of `self`.
+    #[inline]
+    pub fn length(self) -> f32 {
+        self.length_squared().sqrt()
+    }
+
+    /// Returns the squared length of `self`.
+    ///
+    /// Cheaper than `Vec3::length()` because no square root is taken.
+    #[inline]
+    pub fn length_squared(self) -> f32 {
+        Self::dot(self, self)
+    }
+
+    /// Deprecated alias for `Vec3::length_recip`.
+    #[deprecated(since = "0.9.5", note = "please use `Vec3::length_recip` instead")]
+    #[inline(always)]
+    pub fn length_reciprocal(self) -> f32 {
+        Self::length_recip(self)
+    }
+
+    /// Returns `1.0 / Vec3::length()`.
+    ///
+    /// The result is only meaningful when `self` has non-zero length.
+    #[inline]
+    pub fn length_recip(self) -> f32 {
+        let len = self.length();
+        len.recip()
+    }
+
+    /// Returns `self` scaled to length 1.0.
+    ///
+    /// The result is only meaningful when `self` has non-zero length.
+    #[inline]
+    pub fn normalize(self) -> Self {
+        let inv_len = self.length_recip();
+        self * inv_len
+    }
+
+    /// Returns the vertical minimum of `self` and `other`.
+    ///
+    /// In other words, this computes
+    /// `[x: min(x1, x2), y: min(y1, y2), z: min(z1, z2)]`,
+    /// taking the minimum of each element individually.
+    #[inline]
+    pub fn min(self, other: Self) -> Self {
+        Self(
+            self.0.min(other.0),
+            self.1.min(other.1),
+            self.2.min(other.2),
+        )
+    }
+
+    /// Returns the vertical maximum of `self` and `other`.
+    ///
+    /// In other words, this computes
+    /// `[x: max(x1, x2), y: max(y1, y2), z: max(z1, z2)]`,
+    /// taking the maximum of each element individually.
+    #[inline]
+    pub fn max(self, other: Self) -> Self {
+        Self(
+            self.0.max(other.0),
+            self.1.max(other.1),
+            self.2.max(other.2),
+        )
+    }
+
+    /// Returns the horizontal minimum of `self`'s elements.
+    ///
+    /// In other words, this computes `min(x, y, z)`.
+    #[inline]
+    pub fn min_element(self) -> f32 {
+        self.0.min(self.1.min(self.2))
+    }
+
+    /// Returns the largest of the three elements of `self`, i.e.
+    /// `max(x, y, z)` (the horizontal maximum).
+    #[inline]
+    pub fn max_element(self) -> f32 {
+        let yz = self.1.max(self.2);
+        self.0.max(yz)
+    }
+
+    /// Builds a `Vec3` out of `slice[0]`, `slice[1]` and `slice[2]`.
+    ///
+    /// # Panics
+    ///
+    /// Panics when `slice` holds fewer than three elements.
+    #[inline]
+    pub fn from_slice_unaligned(slice: &[f32]) -> Self {
+        Self(slice[0], slice[1], slice[2])
+    }
+
+    /// Writes the elements of `self` to the first three elements in `slice`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `slice` is less than three elements long.
+    #[inline]
+    pub fn write_to_slice_unaligned(self, slice: &mut [f32]) {
+        slice[0] = self.0; // elements are stored one at a time, in x, y, z order
+        slice[1] = self.1;
+        slice[2] = self.2;
+    }
+
+    /// Multiplies `self` by `a` element by element and then adds `b`, i.e.
+    /// computes `b + (self * a)` for each of the three elements.
+    ///
+    /// The product and the sum are evaluated as two separate operations per
+    /// element.
+    #[inline]
+    #[allow(dead_code)]
+    pub(crate) fn mul_add(self, a: Self, b: Self) -> Self {
+        (self * a) + b
+    }
+
+    /// Takes the absolute value of every element of `self`.
+    #[inline]
+    pub fn abs(self) -> Self {
+        Self::new(self.0.abs(), self.1.abs(), self.2.abs())
+    }
+
+    /// Rounds every element of `self` to the nearest integer, with half-way
+    /// cases rounded away from 0.0.
+    #[inline]
+    pub fn round(self) -> Self {
+        Self::new(self.0.round(), self.1.round(), self.2.round())
+    }
+
+    /// Rounds every element of `self` down to the largest integer that is
+    /// less than or equal to it.
+    #[inline]
+    pub fn floor(self) -> Self {
+        Self::new(self.0.floor(), self.1.floor(), self.2.floor())
+    }
+
+    /// Raises every element of `self` to the power `power`.
+    #[inline]
+    pub fn pow(self, power: f32) -> Self {
+        Self::new(self.0.pow(power), self.1.pow(power), self.2.pow(power))
+    }
+
+    /// Applies `exp` to every element of `self`.
+    #[inline]
+    pub fn exp(self) -> Self {
+        Self::new(self.0.exp(), self.1.exp(), self.2.exp())
+    }
+
+    /// Rounds every element of `self` up to the smallest integer that is
+    /// greater than or equal to it.
+    #[inline]
+    pub fn ceil(self) -> Self {
+        Self::new(self.0.ceil(), self.1.ceil(), self.2.ceil())
+    }
+
+    #[deprecated(since = "0.9.5", note = "please use `Vec3::recip` instead")]
+    #[inline(always)]
+    pub fn reciprocal(self) -> Self {
+        Self::recip(self) // plain forward to the replacement method
+    }
+
+    /// Takes the reciprocal `1.0/n` of every element `n` of `self`.
+    #[inline]
+    pub fn recip(self) -> Self {
+        Self::new(self.0.recip(), self.1.recip(), self.2.recip())
+    }
+
+    /// Linearly interpolates from `self` towards `other` by the factor `s`,
+    /// computed as `self + (other - self) * s`.
+    ///
+    /// `s == 0.0` gives `self` and `s == 1.0` gives `other` (up to rounding).
+    #[inline]
+    pub fn lerp(self, other: Self, s: f32) -> Self {
+        let delta = other - self;
+        self + (delta * s)
+    }
+}
+
+impl Div<Vec3> for Vec3 {
+    type Output = Vec3; // element-wise quotient `self / rhs`
+    #[inline]
+    fn div(self, rhs: Vec3) -> Vec3 {
+        Vec3::new(self.0 / rhs.0, self.1 / rhs.1, self.2 / rhs.2)
+    }
+}
+
+impl DivAssign<Vec3> for Vec3 {
+    /// Divides every element of `self` in place by the element of `rhs` in
+    /// the same position.
+    #[inline]
+    fn div_assign(&mut self, rhs: Vec3) {
+        *self = Vec3::new(self.0 / rhs.0, self.1 / rhs.1, self.2 / rhs.2);
+    }
+}
+
+impl Div<f32> for Vec3 {
+    type Output = Vec3; // every element divided by the same scalar
+    #[inline]
+    fn div(self, rhs: f32) -> Vec3 {
+        Vec3::new(self.0 / rhs, self.1 / rhs, self.2 / rhs)
+    }
+}
+
+impl DivAssign<f32> for Vec3 {
+    /// Divides every element of `self` in place by the same scalar
+    /// `rhs`.
+    #[inline]
+    fn div_assign(&mut self, rhs: f32) {
+        *self = Vec3::new(self.0 / rhs, self.1 / rhs, self.2 / rhs);
+    }
+}
+
+impl Div<Vec3> for f32 {
+    type Output = Vec3; // the scalar divided by each element in turn
+    #[inline]
+    fn div(self, rhs: Vec3) -> Self::Output {
+        Vec3::new(self / rhs.0, self / rhs.1, self / rhs.2)
+    }
+}
+
+impl Mul<Vec3> for Vec3 {
+    type Output = Vec3; // element-wise product `self * rhs`
+    #[inline]
+    fn mul(self, rhs: Vec3) -> Vec3 {
+        Vec3::new(self.0 * rhs.0, self.1 * rhs.1, self.2 * rhs.2)
+    }
+}
+
+impl MulAssign<Vec3> for Vec3 {
+    /// Multiplies every element of `self` in place by the element of `rhs`
+    /// in the same position.
+    #[inline]
+    fn mul_assign(&mut self, rhs: Vec3) {
+        *self = Vec3::new(self.0 * rhs.0, self.1 * rhs.1, self.2 * rhs.2);
+    }
+}
+
+impl Mul<f32> for Vec3 {
+    type Output = Vec3; // every element scaled by the same scalar
+    #[inline]
+    fn mul(self, rhs: f32) -> Vec3 {
+        Vec3::new(self.0 * rhs, self.1 * rhs, self.2 * rhs)
+    }
+}
+
+impl MulAssign<f32> for Vec3 {
+    /// Multiplies every element of `self` in place by the same scalar
+    /// `rhs`.
+    #[inline]
+    fn mul_assign(&mut self, rhs: f32) {
+        *self = Vec3::new(self.0 * rhs, self.1 * rhs, self.2 * rhs);
+    }
+}
+
+impl Mul<Vec3> for f32 {
+    type Output = Vec3; // the scalar multiplied into each element in turn
+    #[inline]
+    fn mul(self, rhs: Vec3) -> Self::Output {
+        Vec3::new(self * rhs.0, self * rhs.1, self * rhs.2)
+    }
+}
+
+impl Add for Vec3 {
+    type Output = Vec3; // element-wise sum `self + rhs`
+    #[inline]
+    fn add(self, rhs: Vec3) -> Vec3 {
+        Vec3::new(self.0 + rhs.0, self.1 + rhs.1, self.2 + rhs.2)
+    }
+}
+
+impl AddAssign for Vec3 {
+    /// Adds the element of `rhs` in the same position to every element of
+    /// `self`, in place.
+    #[inline]
+    fn add_assign(&mut self, rhs: Vec3) {
+        *self = Vec3::new(self.0 + rhs.0, self.1 + rhs.1, self.2 + rhs.2);
+    }
+}
+
+impl Sub for Vec3 {
+    type Output = Vec3; // element-wise difference `self - rhs`
+    #[inline]
+    fn sub(self, rhs: Vec3) -> Vec3 {
+        Vec3::new(self.0 - rhs.0, self.1 - rhs.1, self.2 - rhs.2)
+    }
+}
+
+impl SubAssign for Vec3 {
+    /// Subtracts the element of `rhs` in the same position from every
+    /// element of `self`, in place.
+    #[inline]
+    fn sub_assign(&mut self, rhs: Vec3) {
+        *self = Vec3::new(self.0 - rhs.0, self.1 - rhs.1, self.2 - rhs.2);
+    }
+}
+
+impl Neg for Vec3 {
+    type Output = Vec3; // every element with its sign flipped
+    #[inline]
+    fn neg(self) -> Vec3 {
+        Vec3::new(-self.0, -self.1, -self.2)
+    }
+}
+
+impl From<(f32, f32, f32)> for Vec3 {
+    #[inline]
+    fn from((x, y, z): (f32, f32, f32)) -> Self {
+        Vec3(x, y, z) // tuple order is x, y, z
+    }
+}
+
+impl From<Vec3> for (f32, f32, f32) {
+    #[inline]
+    fn from(Vec3(x, y, z): Vec3) -> Self {
+        (x, y, z) // tuple order is x, y, z
+    }
+}
+
+impl From<[f32; 3]> for Vec3 {
+    #[inline]
+    fn from([x, y, z]: [f32; 3]) -> Self {
+        Vec3(x, y, z) // array order is x, y, z
+    }
+}
+
+impl From<Vec3> for [f32; 3] {
+    #[inline]
+    fn from(Vec3(x, y, z): Vec3) -> Self {
+        [x, y, z] // array order is x, y, z
+    }
+}
+
+#[test]
+fn test_vec3_private() {
+    let (ones, v) = (vec3(1.0, 1.0, 1.0), vec3(1.0, 2.0, 3.0));
+    // mul_add is `b + (self * a)` per element, so this is `a - 1.0`.
+    let expected = vec3(-0.5, 1.0, -5.0);
+    assert_eq!(ones.mul_add(vec3(0.5, 2.0, -4.0), -ones), expected);
+    assert_eq!(v.dup_x(), vec3(1.0, 1.0, 1.0));
+    assert_eq!(v.dup_y(), vec3(2.0, 2.0, 2.0));
+    assert_eq!(v.dup_z(), vec3(3.0, 3.0, 3.0));
+}
diff --git a/spirv-std/src/math/vec4.rs b/spirv-std/src/math/vec4.rs
new file mode 100644
index 0000000000..e25d214e0e
--- /dev/null
+++ b/spirv-std/src/math/vec4.rs
@@ -0,0 +1,612 @@
+use crate::math::MathExt;
+use core::{f32, ops::*};
+
+/// A 4-dimensional vector.
+///
+/// The tuple fields `.0`, `.1`, `.2` and `.3` hold `x`, `y`, `z` and `w`.
+///
+/// This type is 16 byte aligned unless the `scalar-math` feature is enabled.
+#[derive(Clone, Copy, PartialEq, PartialOrd, Debug, Default)]
+// if compiling with simd enabled assume alignment needs to match the simd type
+#[repr(simd)]
+pub struct Vec4(pub f32, pub f32, pub f32, pub f32);
+
+/// Free-function constructor: builds a `Vec4` from `x`, `y`, `z` and `w`.
+#[inline]
+pub fn vec4(x: f32, y: f32, z: f32, w: f32) -> Vec4 {
+    Vec4(x, y, z, w)
+}
+
+impl Vec4 {
+    /// Creates a new `Vec4`.
+    #[inline]
+    pub const fn new(x: f32, y: f32, z: f32, w: f32) -> Self {
+        Self(x, y, z, w)
+    }
+
+    /// Creates a `Vec4` with all elements set to `0.0`.
+    #[inline]
+    pub const fn zero() -> Self {
+        Vec4::splat(0.0)
+    }
+
+    /// Creates a `Vec4` with all elements set to `1.0`.
+    #[inline]
+    pub const fn one() -> Self {
+        Vec4::splat(1.0)
+    }
+
+    /// Creates a `Vec4` with values `[x: 1.0, y: 0.0, z: 0.0, w: 0.0]`.
+    #[inline]
+    pub const fn unit_x() -> Self {
+        Vec4::new(1.0, 0.0, 0.0, 0.0)
+    }
+
+    /// Creates a `Vec4` with values `[x: 0.0, y: 1.0, z: 0.0, w: 0.0]`.
+    #[inline]
+    pub const fn unit_y() -> Self {
+        Vec4::new(0.0, 1.0, 0.0, 0.0)
+    }
+
+    /// Creates a `Vec4` with values `[x: 0.0, y: 0.0, z: 1.0, w: 0.0]`.
+    #[inline]
+    pub const fn unit_z() -> Self {
+        Vec4::new(0.0, 0.0, 1.0, 0.0)
+    }
+
+    /// Creates a `Vec4` with values `[x: 0.0, y: 0.0, z: 0.0, w: 1.0]`.
+    #[inline]
+    pub const fn unit_w() -> Self {
+        Vec4::new(0.0, 0.0, 0.0, 1.0)
+    }
+
+    /// Creates a `Vec4` with all elements set to `v`.
+    #[inline]
+    pub const fn splat(v: f32) -> Self {
+        Self(v, v, v, v)
+    }
+
+    /// Returns element `x`.
+    #[inline]
+    pub fn x(self) -> f32 {
+        self.0
+    }
+
+    /// Returns element `y`.
+    #[inline]
+    pub fn y(self) -> f32 {
+        self.1
+    }
+
+    /// Returns element `z`.
+    #[inline]
+    pub fn z(self) -> f32 {
+        self.2
+    }
+
+    /// Returns element `w`.
+    #[inline]
+    pub fn w(self) -> f32 {
+        self.3
+    }
+
+    /// Returns a mutable reference to element `x`.
+    #[inline]
+    pub fn x_mut(&mut self) -> &mut f32 {
+        &mut self.0
+    }
+
+    /// Returns a mutable reference to element `y`.
+    #[inline]
+    pub fn y_mut(&mut self) -> &mut f32 {
+        &mut self.1
+    }
+
+    /// Returns a mutable reference to element `z`.
+    #[inline]
+    pub fn z_mut(&mut self) -> &mut f32 {
+        &mut self.2
+    }
+
+    /// Returns a mutable reference to element `w`.
+    #[inline]
+    pub fn w_mut(&mut self) -> &mut f32 {
+        &mut self.3
+    }
+
+    /// Sets element `x`.
+    #[inline]
+    pub fn set_x(&mut self, x: f32) {
+        self.0 = x;
+    }
+
+    /// Sets element `y`.
+    #[inline]
+    pub fn set_y(&mut self, y: f32) {
+        self.1 = y;
+    }
+
+    /// Sets element `z`.
+    #[inline]
+    pub fn set_z(&mut self, z: f32) {
+        self.2 = z;
+    }
+
+    /// Sets element `w`.
+    #[inline]
+    pub fn set_w(&mut self, w: f32) {
+        self.3 = w;
+    }
+
+    /// Returns a `Vec4` with all elements set to the value of element `x`.
+    #[inline]
+    pub fn dup_x(self) -> Self {
+        Self(self.0, self.0, self.0, self.0)
+    }
+
+    /// Returns a `Vec4` with all elements set to the value of element `y`.
+    #[inline]
+    pub fn dup_y(self) -> Self {
+        Self(self.1, self.1, self.1, self.1)
+    }
+
+    /// Returns a `Vec4` with all elements set to the value of element `z`.
+    #[inline]
+    pub fn dup_z(self) -> Self {
+        Self(self.2, self.2, self.2, self.2)
+    }
+
+    /// Returns a `Vec4` with all elements set to the value of element `w`.
+    #[inline]
+    pub fn dup_w(self) -> Self {
+        Self(self.3, self.3, self.3, self.3)
+    }
+
+    /// Computes the 4D dot product of `self` and `other`.
+    #[inline]
+    pub fn dot(self, other: Self) -> f32 {
+        (self.0 * other.0) + (self.1 * other.1) + (self.2 * other.2) + (self.3 * other.3)
+    }
+
+    /// Computes the 4D length of `self`.
+    #[inline]
+    pub fn length(self) -> f32 {
+        self.dot(self).sqrt()
+    }
+
+    /// Computes the squared 4D length of `self`.
+    ///
+    /// This is generally faster than `Vec4::length()` as it avoids a square
+    /// root operation.
+    #[inline]
+    pub fn length_squared(self) -> f32 {
+        self.dot(self)
+    }
+
+    /// Deprecated alias for `Vec4::length_recip`.
+    #[deprecated(since = "0.9.5", note = "please use `Vec4::length_recip` instead")]
+    #[inline(always)]
+    pub fn length_reciprocal(self) -> f32 {
+        self.length_recip()
+    }
+
+    /// Computes `1.0 / Vec4::length()`.
+    ///
+    /// For valid results, `self` must _not_ be of length zero.
+    #[inline]
+    pub fn length_recip(self) -> f32 {
+        self.length().recip()
+    }
+
+    /// Returns `self` normalized to length 1.0.
+    ///
+    /// For valid results, `self` must _not_ be of length zero.
+    #[inline]
+    pub fn normalize(self) -> Self {
+        self * self.length_recip()
+    }
+
+    /// Returns the vertical (element-wise) minimum of `self` and `other`.
+    ///
+    /// In other words, this computes
+    /// `[x: min(x1, x2), y: min(y1, y2), z: min(z1, z2), w: min(w1, w2)]`.
+    #[inline]
+    pub fn min(self, other: Self) -> Self {
+        Self(
+            self.0.min(other.0),
+            self.1.min(other.1),
+            self.2.min(other.2),
+            self.3.min(other.3),
+        )
+    }
+
+    /// Returns the vertical (element-wise) maximum of `self` and `other`.
+    ///
+    /// In other words, this computes
+    /// `[x: max(x1, x2), y: max(y1, y2), z: max(z1, z2), w: max(w1, w2)]`.
+    #[inline]
+    pub fn max(self, other: Self) -> Self {
+        Self(
+            self.0.max(other.0),
+            self.1.max(other.1),
+            self.2.max(other.2),
+            self.3.max(other.3),
+        )
+    }
+
+    /// Returns the horizontal minimum of `self`'s elements.
+    ///
+    /// In other words, this computes `min(x, y, z, w)`.
+    #[inline]
+    pub fn min_element(self) -> f32 {
+        self.0.min(self.1.min(self.2.min(self.3)))
+    }
+
+    /// Returns the horizontal maximum of `self`'s elements.
+    ///
+    /// In other words, this computes `max(x, y, z, w)`.
+    #[inline]
+    pub fn max_element(self) -> f32 {
+        self.0.max(self.1.max(self.2.max(self.3)))
+    }
+
+    /// Creates a `Vec4` from the first four values in `slice`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `slice` is less than four elements long.
+    #[inline]
+    pub fn from_slice_unaligned(slice: &[f32]) -> Self {
+        Self(slice[0], slice[1], slice[2], slice[3])
+    }
+
+    /// Writes the elements of `self` to the first four elements in `slice`.
+    ///
+    /// # Panics
+    ///
+    /// Panics if `slice` is less than four elements long.
+    #[inline]
+    pub fn write_to_slice_unaligned(self, slice: &mut [f32]) {
+        slice[0] = self.0;
+        slice[1] = self.1;
+        slice[2] = self.2;
+        slice[3] = self.3;
+    }
+
+    /// Per element multiplication/addition of the three inputs: b + (self * a)
+    #[inline]
+    pub fn mul_add(self, a: Self, b: Self) -> Self {
+        Self(
+            (self.0 * a.0) + b.0,
+            (self.1 * a.1) + b.1,
+            (self.2 * a.2) + b.2,
+            (self.3 * a.3) + b.3,
+        )
+    }
+
+    /// Returns a `Vec4` containing the absolute value of each element of `self`.
+    #[inline]
+    pub fn abs(self) -> Self {
+        Self(self.0.abs(), self.1.abs(), self.2.abs(), self.3.abs())
+    }
+
+    /// Returns a `Vec4` containing the nearest integer to a number for each element of `self`.
+    /// Round half-way cases away from 0.0.
+    #[inline]
+    pub fn round(self) -> Self {
+        Self(
+            self.0.round(),
+            self.1.round(),
+            self.2.round(),
+            self.3.round(),
+        )
+    }
+
+    /// Returns a `Vec4` containing the largest integer less than or equal to a number for each
+    /// element of `self`.
+    #[inline]
+    pub fn floor(self) -> Self {
+        Self(
+            self.0.floor(),
+            self.1.floor(),
+            self.2.floor(),
+            self.3.floor(),
+        )
+    }
+
+    /// Returns a `Vec4` containing each element of `self` raised to the power of `power`.
+    #[inline]
+    pub fn pow(self, power: f32) -> Self {
+        Self(
+            self.0.pow(power),
+            self.1.pow(power),
+            self.2.pow(power),
+            self.3.pow(power),
+        )
+    }
+
+    /// Returns a `Vec4` containing the result of `exp` applied to each element of `self`.
+    #[inline]
+    pub fn exp(self) -> Self {
+        Self(self.0.exp(), self.1.exp(), self.2.exp(), self.3.exp())
+    }
+
+    /// Returns a `Vec4` containing the smallest integer greater than or equal to a number for each
+    /// element of `self`.
+    #[inline]
+    pub fn ceil(self) -> Self {
+        Self(self.0.ceil(), self.1.ceil(), self.2.ceil(), self.3.ceil())
+    }
+
+    /// Deprecated alias for `Vec4::recip`.
+    #[deprecated(since = "0.9.5", note = "please use `Vec4::recip` instead")]
+    #[inline(always)]
+    pub fn reciprocal(self) -> Self {
+        self.recip()
+    }
+
+    /// Returns a `Vec4` containing the reciprocal `1.0/n` of each element of `self`.
+    #[inline]
+    pub fn recip(self) -> Self {
+        // TODO: Optimize
+        Self::one() / self
+    }
+
+    /// Performs a linear interpolation between `self` and `other` based on
+    /// the value `s`.
+    ///
+    /// When `s` is `0.0`, the result will be equal to `self`.  When `s`
+    /// is `1.0`, the result will be equal to `other`.
+    #[inline]
+    pub fn lerp(self, other: Self, s: f32) -> Self {
+        self + ((other - self) * s)
+    }
+}
+
+impl Div<Vec4> for Vec4 {
+    type Output = Vec4;
+    /// Divides every element of `self` by the element of `rhs` in the same
+    /// position.
+    #[inline]
+    fn div(self, rhs: Vec4) -> Vec4 {
+        Vec4(
+            self.0 / rhs.0,
+            self.1 / rhs.1,
+            self.2 / rhs.2,
+            self.3 / rhs.3,
+        )
+    }
+}
+
+impl DivAssign<Vec4> for Vec4 {
+    /// Divides every element of `self` in place by the element of `rhs` in
+    /// the same position.
+    #[inline]
+    fn div_assign(&mut self, rhs: Vec4) {
+        self.0 /= rhs.0;
+        self.1 /= rhs.1;
+        self.2 /= rhs.2;
+        self.3 /= rhs.3;
+    }
+}
+
+impl Div<f32> for Vec4 {
+    type Output = Vec4;
+    /// Divides every element of `self` by the same value,
+    /// `scalar`.
+    #[inline]
+    fn div(self, scalar: f32) -> Vec4 {
+        Vec4(
+            self.0 / scalar,
+            self.1 / scalar,
+            self.2 / scalar,
+            self.3 / scalar,
+        )
+    }
+}
+
+impl DivAssign<f32> for Vec4 {
+    /// Divides every element of `self` in place by the same value,
+    /// `scalar`.
+    #[inline]
+    fn div_assign(&mut self, scalar: f32) {
+        self.0 /= scalar;
+        self.1 /= scalar;
+        self.2 /= scalar;
+        self.3 /= scalar;
+    }
+}
+
+impl Div<Vec4> for f32 {
+    type Output = Vec4;
+    /// Divides the scalar `self` by every element of `divisor` in turn,
+    /// giving one quotient per element.
+    #[inline]
+    fn div(self, divisor: Vec4) -> Self::Output {
+        Vec4(
+            self / divisor.0,
+            self / divisor.1,
+            self / divisor.2,
+            self / divisor.3,
+        )
+    }
+}
+
+impl Mul<Vec4> for Vec4 {
+    type Output = Vec4;
+    /// Multiplies every element of `self` by the element of `rhs` in the
+    /// same position.
+    #[inline]
+    fn mul(self, rhs: Vec4) -> Vec4 {
+        Vec4(
+            self.0 * rhs.0,
+            self.1 * rhs.1,
+            self.2 * rhs.2,
+            self.3 * rhs.3,
+        )
+    }
+}
+
+impl MulAssign<Vec4> for Vec4 {
+    /// Multiplies every element of `self` in place by the element of `rhs`
+    /// in the same position.
+    #[inline]
+    fn mul_assign(&mut self, rhs: Vec4) {
+        self.0 *= rhs.0;
+        self.1 *= rhs.1;
+        self.2 *= rhs.2;
+        self.3 *= rhs.3;
+    }
+}
+
+impl Mul<f32> for Vec4 {
+    type Output = Vec4;
+    /// Multiplies every element of `self` by the same value,
+    /// `scalar`.
+    #[inline]
+    fn mul(self, scalar: f32) -> Vec4 {
+        Vec4(
+            self.0 * scalar,
+            self.1 * scalar,
+            self.2 * scalar,
+            self.3 * scalar,
+        )
+    }
+}
+
+impl MulAssign<f32> for Vec4 {
+    /// Multiplies every element of `self` in place by the same value,
+    /// `scalar`.
+    #[inline]
+    fn mul_assign(&mut self, scalar: f32) {
+        self.0 *= scalar;
+        self.1 *= scalar;
+        self.2 *= scalar;
+        self.3 *= scalar;
+    }
+}
+
+impl Mul<Vec4> for f32 {
+    type Output = Vec4;
+    /// Multiplies the scalar `self` into every element of `vector`,
+    /// giving one product per element.
+    #[inline]
+    fn mul(self, vector: Vec4) -> Self::Output {
+        Vec4(
+            self * vector.0,
+            self * vector.1,
+            self * vector.2,
+            self * vector.3,
+        )
+    }
+}
+
+impl Add for Vec4 {
+    type Output = Vec4;
+    /// Adds the element of `rhs` in the same position to every element of
+    /// `self`.
+    #[inline]
+    fn add(self, rhs: Vec4) -> Vec4 {
+        Vec4(
+            self.0 + rhs.0,
+            self.1 + rhs.1,
+            self.2 + rhs.2,
+            self.3 + rhs.3,
+        )
+    }
+}
+
+impl AddAssign for Vec4 {
+    /// Adds the element of `rhs` in the same position to every element of
+    /// `self`, in place.
+    #[inline]
+    fn add_assign(&mut self, rhs: Vec4) {
+        self.0 += rhs.0;
+        self.1 += rhs.1;
+        self.2 += rhs.2;
+        self.3 += rhs.3;
+    }
+}
+
+impl Sub for Vec4 {
+    type Output = Vec4;
+    /// Subtracts the element of `rhs` in the same position from every
+    /// element of `self`.
+    #[inline]
+    fn sub(self, rhs: Vec4) -> Vec4 {
+        Vec4(
+            self.0 - rhs.0,
+            self.1 - rhs.1,
+            self.2 - rhs.2,
+            self.3 - rhs.3,
+        )
+    }
+}
+
+impl SubAssign for Vec4 {
+    /// Subtracts the element of `rhs` in the same position from every
+    /// element of `self`, in place.
+    #[inline]
+    fn sub_assign(&mut self, rhs: Vec4) {
+        self.0 -= rhs.0;
+        self.1 -= rhs.1;
+        self.2 -= rhs.2;
+        self.3 -= rhs.3;
+    }
+}
+
+impl Neg for Vec4 {
+    type Output = Vec4;
+    /// Flips the sign of every element of `self`, leaving the magnitudes
+    /// as they were.
+    #[inline]
+    fn neg(self) -> Vec4 {
+        Vec4(-self.0, -self.1, -self.2, -self.3)
+    }
+}
+
+impl From<(f32, f32, f32, f32)> for Vec4 {
+    #[inline]
+    fn from((x, y, z, w): (f32, f32, f32, f32)) -> Self {
+        Vec4(x, y, z, w) // tuple order is x, y, z, w
+    }
+}
+
+impl From<Vec4> for (f32, f32, f32, f32) {
+    /// Unpacks the vector into a tuple holding its `x`, `y`, `z` and `w`
+    /// elements, in that order.
+    #[inline]
+    fn from(Vec4(x, y, z, w): Vec4) -> Self {
+        (x, y, z, w)
+    }
+}
+
+impl From<[f32; 4]> for Vec4 {
+    /// Builds a `Vec4` whose `x`, `y`, `z` and `w` are the array's four
+    /// elements, in that order.
+    #[inline]
+    fn from([x, y, z, w]: [f32; 4]) -> Self {
+        Vec4(x, y, z, w)
+    }
+}
+
+impl From<Vec4> for [f32; 4] {
+    /// Unpacks the vector into an array holding its `x`, `y`, `z` and `w`
+    /// elements, in that order.
+    #[inline]
+    fn from(Vec4(x, y, z, w): Vec4) -> Self {
+        [x, y, z, w]
+    }
+}
+
+#[test]
+fn test_vec4_private() {
+    let (ones, v) = (vec4(1.0, 1.0, 1.0, 1.0), vec4(1.0, 2.0, 3.0, 4.0));
+    let expected = vec4(-0.5, 1.0, -5.0, -1.0);
+    assert_eq!(ones.mul_add(vec4(0.5, 2.0, -4.0, 0.0), -ones), expected);
+    // Distinct elements, so each dup_* can only pass by reading its own lane.
+    assert_eq!(v.dup_x(), vec4(1.0, 1.0, 1.0, 1.0));
+    assert_eq!(v.dup_y(), vec4(2.0, 2.0, 2.0, 2.0));
+    assert_eq!(v.dup_z(), vec4(3.0, 3.0, 3.0, 3.0));
+    assert_eq!(v.dup_w(), vec4(4.0, 4.0, 4.0, 4.0));
+}