diff --git a/wgpu-core/src/command/render.rs b/wgpu-core/src/command/render.rs
index 66c0965fa..0acba4a77 100644
--- a/wgpu-core/src/command/render.rs
+++ b/wgpu-core/src/command/render.rs
@@ -1171,6 +1171,7 @@ impl Global {
                     .inputs
                     .extend(iter::repeat(VertexBufferState::EMPTY).take(empty_slots));
                 let vertex_state = &mut state.vertex.inputs[slot as usize];
+                //TODO: where are we checking that the offset is in bounds?
                 vertex_state.total_size = match size {
                     Some(s) => s.get(),
                     None => buffer.size - offset,
diff --git a/wgpu-core/src/device/mod.rs b/wgpu-core/src/device/mod.rs
index 56ea142f9..4d972ad68 100644
--- a/wgpu-core/src/device/mod.rs
+++ b/wgpu-core/src/device/mod.rs
@@ -2518,7 +2518,7 @@ impl Global {
         let (device_guard, _) = hub.devices.read(&mut token);
         let device = device_guard.get(device_id).map_err(|_| InvalidDevice)?;

-        Ok(device.downlevel)
+        Ok(device.downlevel.clone())
     }

     pub fn device_create_buffer(
@@ -3640,7 +3640,7 @@ impl Global {
                 encoder,
                 dev_stored,
                 device.limits.clone(),
-                device.downlevel,
+                device.downlevel.clone(),
                 device.features,
                 #[cfg(feature = "trace")]
                 device.trace.is_some(),
diff --git a/wgpu-core/src/instance.rs b/wgpu-core/src/instance.rs
index 6a16a6633..2d6cd1368 100644
--- a/wgpu-core/src/instance.rs
+++ b/wgpu-core/src/instance.rs
@@ -216,6 +216,7 @@ impl Adapter {
                 missing_flags,
                 DOWNLEVEL_WARNING_MESSAGE
             );
+            log::info!("{:#?}", caps.downlevel);
         }

         // Verify feature preconditions
@@ -257,7 +258,7 @@ impl Adapter {
                     ref_count: self.life_guard.add_ref(),
                 },
                 caps.alignments.clone(),
-                caps.downlevel,
+                caps.downlevel.clone(),
                 desc,
                 trace_path,
             )
@@ -658,7 +659,7 @@ impl Global {
         let (adapter_guard, _) = hub.adapters.read(&mut token);
         adapter_guard
             .get(adapter_id)
-            .map(|adapter| adapter.raw.capabilities.downlevel)
+            .map(|adapter| adapter.raw.capabilities.downlevel.clone())
             .map_err(|_| InvalidAdapter)
     }
diff --git a/wgpu-hal/src/gles/adapter.rs b/wgpu-hal/src/gles/adapter.rs
index 1a1717083..9230da3d4 100644
--- a/wgpu-hal/src/gles/adapter.rs
+++ b/wgpu-hal/src/gles/adapter.rs
@@ -261,6 +261,12 @@ impl super::Adapter {
             extensions.contains("GL_EXT_texture_shadow_lod"),
         );
         private_caps.set(super::PrivateCapability::MEMORY_BARRIERS, ver >= (3, 1));
+        private_caps.set(
+            super::PrivateCapability::VERTEX_BUFFER_LAYOUT,
+            ver >= (3, 1),
+        );
+
+        let downlevel_limits = wgt::DownlevelLimits {};

         Some(crate::ExposedAdapter {
             adapter: super::Adapter {
@@ -276,6 +282,7 @@ impl super::Adapter {
                 limits,
                 downlevel: wgt::DownlevelCapabilities {
                     flags: downlevel_flags,
+                    limits: downlevel_limits,
                     shader_model: wgt::ShaderModel::Sm5,
                 },
                 alignments: crate::Alignments {
diff --git a/wgpu-hal/src/gles/command.rs b/wgpu-hal/src/gles/command.rs
index 8ba5fbebb..859fc7e10 100644
--- a/wgpu-hal/src/gles/command.rs
+++ b/wgpu-hal/src/gles/command.rs
@@ -2,13 +2,6 @@ use super::{conv, Command as C};
 use arrayvec::ArrayVec;
 use std::{mem, ops::Range};

-bitflags::bitflags! {
-    #[derive(Default)]
-    struct Dirty: u32 {
-        const VERTEX_BUFFERS = 0x0001;
-    }
-}
-
 #[derive(Clone, Copy, Debug, Default)]
 struct TextureSlotDesc {
     tex_target: super::BindTarget,
@@ -32,7 +25,8 @@ pub(super) struct State {
     resolve_attachments: ArrayVec<[(u32, super::TextureView); crate::MAX_COLOR_TARGETS]>,
     invalidate_attachments: ArrayVec<[u32; crate::MAX_COLOR_TARGETS + 2]>,
     has_pass_label: bool,
-    dirty: Dirty,
+    instance_vbuf_mask: usize,
+    dirty_vbuf_mask: usize,
 }

 impl super::CommandBuffer {
@@ -75,21 +69,48 @@ impl super::CommandEncoder {
         }
     }

-    fn rebind_vertex_attributes(&mut self, first_instance: u32) {
-        for attribute in self.state.vertex_attributes.iter() {
-            let (buffer_desc, buffer) =
-                self.state.vertex_buffers[attribute.buffer_index as usize].clone();
-
-            let mut attribute_desc = attribute.clone();
-            if buffer_desc.step == wgt::InputStepMode::Instance {
-                attribute_desc.offset += buffer_desc.stride * first_instance;
+    fn rebind_vertex_data(&mut self, first_instance: u32) {
+        if self
+            .private_caps
+            .contains(super::PrivateCapability::VERTEX_BUFFER_LAYOUT)
+        {
+            for (index, &(ref vb_desc, ref vb)) in self.state.vertex_buffers.iter().enumerate() {
+                if self.state.dirty_vbuf_mask & (1 << index) == 0 {
+                    continue;
+                }
+                let instance_offset = match vb_desc.step {
+                    wgt::InputStepMode::Vertex => 0,
+                    wgt::InputStepMode::Instance => first_instance * vb_desc.stride,
+                };
+                self.cmd_buffer.commands.push(C::SetVertexBuffer {
+                    index: index as u32,
+                    buffer: super::BufferBinding {
+                        raw: vb.raw,
+                        offset: vb.offset + instance_offset as wgt::BufferAddress,
+                    },
+                    buffer_desc: vb_desc.clone(),
+                });
             }
+        } else {
+            for attribute in self.state.vertex_attributes.iter() {
+                if self.state.dirty_vbuf_mask & (1 << attribute.buffer_index) == 0 {
+                    continue;
+                }
+                let (buffer_desc, buffer) =
+                    self.state.vertex_buffers[attribute.buffer_index as usize].clone();

-            self.cmd_buffer.commands.push(C::SetVertexAttribute {
-                buffer_desc,
-                buffer,
-                attribute_desc,
-            });
+                let mut attribute_desc = attribute.clone();
+                attribute_desc.offset += buffer.offset as u32;
+                if buffer_desc.step == wgt::InputStepMode::Instance {
+                    attribute_desc.offset += buffer_desc.stride * first_instance;
+                }
+
+                self.cmd_buffer.commands.push(C::SetVertexAttribute {
+                    buffer: Some(buffer.raw),
+                    buffer_desc,
+                    attribute_desc,
+                });
+            }
         }
     }
@@ -111,11 +132,13 @@ impl super::CommandEncoder {
     }

     fn prepare_draw(&mut self, first_instance: u32) {
         if first_instance != 0 {
-            self.rebind_vertex_attributes(first_instance);
-            self.state.dirty.set(Dirty::VERTEX_BUFFERS, true);
-        } else if self.state.dirty.contains(Dirty::VERTEX_BUFFERS) {
-            self.rebind_vertex_attributes(0);
-            self.state.dirty.set(Dirty::VERTEX_BUFFERS, false);
+            self.state.dirty_vbuf_mask = self.state.instance_vbuf_mask;
+        }
+        if self.state.dirty_vbuf_mask != 0 {
+            self.rebind_vertex_data(first_instance);
+            if first_instance == 0 {
+                self.state.dirty_vbuf_mask = 0;
+            }
         }
     }
@@ -488,7 +511,8 @@ impl crate::CommandEncoder for super::CommandEncoder {
             self.cmd_buffer.commands.push(C::PopDebugGroup);
             self.state.has_pass_label = false;
         }
-        self.state.dirty = Dirty::empty();
+        self.state.instance_vbuf_mask = 0;
+        self.state.dirty_vbuf_mask = 0;
         self.state.color_targets.clear();
         self.state.vertex_attributes.clear();
         self.state.primitive = super::PrimitiveState::default();
@@ -591,25 +615,56 @@
     unsafe fn set_render_pipeline(&mut self, pipeline: &super::RenderPipeline) {
         self.state.topology = conv::map_primitive_topology(pipeline.primitive.topology);
-        self.state.dirty |= Dirty::VERTEX_BUFFERS;
-        self.set_pipeline_inner(&pipeline.inner);
-
-        // set vertex state
-        self.state.vertex_attributes.clear();
-        for vat in pipeline.vertex_attributes.iter() {
-            self.state.vertex_attributes.push(vat.clone());
+        for index in self.state.vertex_attributes.len()..pipeline.vertex_attributes.len() {
+            self.cmd_buffer
+                .commands
+                .push(C::UnsetVertexAttribute(index as u32));
         }
-        for (&mut (ref mut state_desc, _), pipe_desc) in self
+
+        if self
+            .private_caps
+            .contains(super::PrivateCapability::VERTEX_BUFFER_LAYOUT)
+        {
+            for vat in pipeline.vertex_attributes.iter() {
+                let vb = &pipeline.vertex_buffers[vat.buffer_index as usize];
+                // set the layout
+                self.cmd_buffer.commands.push(C::SetVertexAttribute {
+                    buffer: None,
+                    buffer_desc: vb.clone(),
+                    attribute_desc: vat.clone(),
+                });
+            }
+        } else {
+            self.state.dirty_vbuf_mask = 0;
+            // copy vertex attributes
+            for vat in pipeline.vertex_attributes.iter() {
+                //Note: we can invalidate more carefully here.
+                self.state.dirty_vbuf_mask |= 1 << vat.buffer_index;
+                self.state.vertex_attributes.push(vat.clone());
+            }
+        }
+
+        self.state.instance_vbuf_mask = 0;
+        // copy vertex state
+        for (index, (&mut (ref mut state_desc, _), pipe_desc)) in self
             .state
             .vertex_buffers
             .iter_mut()
             .zip(pipeline.vertex_buffers.iter())
+            .enumerate()
         {
-            state_desc.step = pipe_desc.step;
-            state_desc.stride = pipe_desc.stride;
+            if pipe_desc.step == wgt::InputStepMode::Instance {
+                self.state.instance_vbuf_mask |= 1 << index;
+            }
+            if state_desc != pipe_desc {
+                self.state.dirty_vbuf_mask |= 1 << index;
+                *state_desc = pipe_desc.clone();
+            }
         }

+        self.set_pipeline_inner(&pipeline.inner);
+
         // set primitive state
         let prim_state = conv::map_primitive_state(&pipeline.primitive);
         if prim_state != self.state.primitive {
@@ -703,8 +758,8 @@ impl crate::CommandEncoder for super::CommandEncoder {
         index: u32,
         binding: crate::BufferBinding<'a, super::Api>,
     ) {
-        self.state.dirty |= Dirty::VERTEX_BUFFERS;
-        let vb = &mut self.state.vertex_buffers[index as usize].1;
+        self.state.dirty_vbuf_mask |= 1 << index;
+        let (_, ref mut vb) = self.state.vertex_buffers[index as usize];
         vb.raw = binding.buffer.raw;
         vb.offset = binding.offset;
     }
@@ -854,7 +909,6 @@ impl crate::CommandEncoder for super::CommandEncoder {
             self.cmd_buffer.commands.push(C::PopDebugGroup);
             self.state.has_pass_label = false;
         }
-        self.state.dirty = Dirty::empty();
     }

     unsafe fn set_compute_pipeline(&mut self, pipeline: &super::ComputePipeline) {
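Note: the `prepare_draw` rework above replaces the single `VERTEX_BUFFERS` dirty bit with two per-slot bitmasks. `dirty_vbuf_mask` tracks which vertex-buffer slots need (re)binding, while `instance_vbuf_mask` remembers which slots step per instance, so a non-zero `first_instance` can force just those slots to re-bind at shifted offsets. A minimal standalone sketch of the pattern (the `rebind` callback stands in for `rebind_vertex_data`; the patch assigns the mask directly, while `|=` here keeps any already-dirty slots):

```rust
struct VertexMasks {
    dirty_vbuf_mask: usize,    // slots whose binding must be re-issued
    instance_vbuf_mask: usize, // slots with per-instance step mode
}

impl VertexMasks {
    fn prepare_draw(&mut self, first_instance: u32, mut rebind: impl FnMut(usize, u32)) {
        if first_instance != 0 {
            // Instance-rate buffers must re-bind with a shifted offset.
            self.dirty_vbuf_mask |= self.instance_vbuf_mask;
        }
        if self.dirty_vbuf_mask != 0 {
            for index in 0..usize::BITS as usize {
                if self.dirty_vbuf_mask & (1 << index) != 0 {
                    rebind(index, first_instance);
                }
            }
            // Only a draw with `first_instance == 0` leaves the bindings in
            // canonical state; otherwise the slots stay dirty for the next draw.
            if first_instance == 0 {
                self.dirty_vbuf_mask = 0;
            }
        }
    }
}
```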
diff --git a/wgpu-hal/src/gles/mod.rs b/wgpu-hal/src/gles/mod.rs
index 7fc8d9ecc..4258e585b 100644
--- a/wgpu-hal/src/gles/mod.rs
+++ b/wgpu-hal/src/gles/mod.rs
@@ -1,3 +1,61 @@
+/*!
+# OpenGL ES3 API (aka GLES3).
+
+Designed to work on Linux and Android, with context provided by EGL.
+
+## Texture views
+
+GLES3 doesn't really have separate texture view objects. We have to remember the
+original texture and the sub-range into it. The problem, however, is that there is
+no way to expose a subset of array layers or mip levels of a sampled texture.
+
+## Binding model
+
+The binding model is very different from WebGPU's, especially with regard to samplers.
+GLES3 has sampler objects, but they aren't separately bindable to the shaders.
+Each sampled texture is exposed to the shader as a combined texture-sampler binding.
+
+When building the pipeline layout, we linearize binding entries based on the groups
+(uniform/storage buffers, uniform/storage textures), and record the mapping into
+`BindGroupLayoutInfo`.
+When a pipeline gets created, we track all the texture-sampler associations
+from their static use in the shader.
+We only support at most one sampler used with each texture so far. The linear index
+of this sampler is stored per texture slot in the `SamplerBindMap` array.
+
+The texture-sampler pairs get potentially invalidated in two places:
+  - when a new pipeline is set, we update the linear indices of associated samplers
+  - when a new bind group is set, we update both the textures and the samplers
+
+We expect that the changes to sampler states between any two pipelines of the same layout
+will be minimal, if any.
+
+## Vertex data
+
+Generally, vertex buffers are marked as dirty and lazily bound on draw.
+
+GLES3 doesn't support "base instance" semantics. However, it's easy to support,
+since we are forced to do late binding anyway. We just adjust the offsets
+into the vertex data.
+
+### Old path
+
+In GLES-3.0 and WebGL2, the vertex buffer layout is provided
+together with the actual buffer binding.
+We invalidate the attributes on any vertex buffer change, and re-bind them.
+
+### New path
+
+In GLES-3.1 and higher, the vertex buffer layout can be declared separately
+from the vertex data itself. This mostly matches WebGPU; however, there is a catch:
+`stride` needs to be specified with the data, not as a part of the layout.
+
+To address this, we invalidate the vertex buffers based on:
+  - whether `start_instance` is used, and
+  - whether the stride has changed
+
+*/
+
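To make the two documented paths concrete, here is a hypothetical side-by-side using `glow` (the GL bindings wgpu-hal builds on); the attribute location, binding slot, format, and stride values are made up for illustration:

```rust
use glow::HasContext;

// Old path (GLES 3.0 / WebGL2): layout and data travel together. Changing
// the buffer (or emulating `first_instance` via an offset) means
// re-specifying the attribute pointer against the bound ARRAY_BUFFER.
unsafe fn bind_old_path(gl: &glow::Context, vbo: glow::Buffer) {
    gl.bind_buffer(glow::ARRAY_BUFFER, Some(vbo));
    gl.enable_vertex_attrib_array(0);
    gl.vertex_attrib_pointer_f32(0, 4, glow::FLOAT, true, 16, 0);
    gl.vertex_attrib_divisor(0, 1); // instance-rate attribute
}

// New path (GLES 3.1+): the format is declared once against a binding slot;
// later only the (buffer, offset, stride) triple is rebound. Note that the
// stride still accompanies the data, which is why a stride change has to
// re-dirty the buffer slot.
unsafe fn bind_new_path(gl: &glow::Context, vbo: glow::Buffer) {
    gl.enable_vertex_attrib_array(0);
    gl.vertex_attrib_format_f32(0, 4, glow::FLOAT, true, 0);
    gl.vertex_attrib_binding(0, 0);
    gl.vertex_binding_divisor(0, 1); // instance-rate binding slot
    gl.bind_vertex_buffer(0, Some(vbo), 0, 16);
}
```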
 #[cfg(not(target_arch = "wasm32"))]
 mod egl;

@@ -60,6 +118,8 @@ bitflags::bitflags! {
         const SHADER_TEXTURE_SHADOW_LOD = 0x0002;
         /// Support memory barriers.
         const MEMORY_BARRIERS = 0x0004;
+        /// Vertex buffer layouts separate from the data.
+        const VERTEX_BUFFER_LAYOUT = 0x0008;
     }
 }

@@ -254,7 +314,7 @@ struct ImageBinding {
     format: u32,
 }

-#[derive(Clone, Debug, Default)]
+#[derive(Clone, Debug, Default, PartialEq)]
 struct VertexBufferDesc {
     step: wgt::InputStepMode,
     stride: u32,
@@ -534,10 +594,16 @@ enum Command {
     SetDepthBias(wgt::DepthBiasState),
     ConfigureDepthStencil(crate::FormatAspect),
     SetVertexAttribute {
-        buffer: BufferBinding,
+        buffer: Option<glow::Buffer>,
         buffer_desc: VertexBufferDesc,
         attribute_desc: AttributeDesc,
     },
+    UnsetVertexAttribute(u32),
+    SetVertexBuffer {
+        index: u32,
+        buffer: BufferBinding,
+        buffer_desc: VertexBufferDesc,
+    },
     SetProgram(glow::Program),
     SetPrimitive(PrimitiveState),
     SetBlendConstant([f32; 4]),
diff --git a/wgpu-hal/src/gles/queue.rs b/wgpu-hal/src/gles/queue.rs
index 04e34d48c..65595a352 100644
--- a/wgpu-hal/src/gles/queue.rs
+++ b/wgpu-hal/src/gles/queue.rs
@@ -590,31 +590,69 @@ impl super::Queue {
                 gl.stencil_op_separate(face, ops.fail, ops.depth_fail, ops.pass);
             }
             C::SetVertexAttribute {
+                buffer,
                 ref buffer_desc,
-                ref buffer,
                 attribute_desc: ref vat,
             } => {
-                gl.bind_buffer(glow::ARRAY_BUFFER, Some(buffer.raw));
-                let offset = vat.offset as i32 + buffer.offset as i32;
-                match vat.format_desc.attrib_kind {
-                    super::VertexAttribKind::Float => gl.vertex_attrib_pointer_f32(
-                        vat.location,
-                        vat.format_desc.element_count,
-                        vat.format_desc.element_format,
-                        true, // always normalized
-                        buffer_desc.stride as i32,
-                        offset,
-                    ),
-                    super::VertexAttribKind::Integer => gl.vertex_attrib_pointer_i32(
-                        vat.location,
-                        vat.format_desc.element_count,
-                        vat.format_desc.element_format,
-                        buffer_desc.stride as i32,
-                        offset,
-                    ),
-                }
-                gl.vertex_attrib_divisor(vat.location, buffer_desc.step as u32);
+                gl.bind_buffer(glow::ARRAY_BUFFER, buffer);
                 gl.enable_vertex_attrib_array(vat.location);
+
+                if buffer.is_none() {
+                    match vat.format_desc.attrib_kind {
+                        super::VertexAttribKind::Float => gl.vertex_attrib_format_f32(
+                            vat.location,
+                            vat.format_desc.element_count,
+                            vat.format_desc.element_format,
+                            true, // always normalized
+                            vat.offset,
+                        ),
+                        super::VertexAttribKind::Integer => gl.vertex_attrib_format_i32(
+                            vat.location,
+                            vat.format_desc.element_count,
+                            vat.format_desc.element_format,
+                            vat.offset,
+                        ),
+                    }
+
+                    //Note: there is apparently a bug on AMD 3500U:
+                    // this call is ignored if the current array is disabled.
+                    gl.vertex_attrib_binding(vat.location, vat.buffer_index);
+                } else {
+                    match vat.format_desc.attrib_kind {
+                        super::VertexAttribKind::Float => gl.vertex_attrib_pointer_f32(
+                            vat.location,
+                            vat.format_desc.element_count,
+                            vat.format_desc.element_format,
+                            true, // always normalized
+                            buffer_desc.stride as i32,
+                            vat.offset as i32,
+                        ),
+                        super::VertexAttribKind::Integer => gl.vertex_attrib_pointer_i32(
+                            vat.location,
+                            vat.format_desc.element_count,
+                            vat.format_desc.element_format,
+                            buffer_desc.stride as i32,
+                            vat.offset as i32,
+                        ),
+                    }
+                    gl.vertex_attrib_divisor(vat.location, buffer_desc.step as u32);
+                }
+            }
+            C::UnsetVertexAttribute(location) => {
+                gl.disable_vertex_attrib_array(location);
+            }
+            C::SetVertexBuffer {
+                index,
+                ref buffer,
+                ref buffer_desc,
+            } => {
+                gl.vertex_binding_divisor(index, buffer_desc.step as u32);
+                gl.bind_vertex_buffer(
+                    index,
+                    Some(buffer.raw),
+                    buffer.offset as i32,
+                    buffer_desc.stride as i32,
+                );
             }
             C::SetDepth(ref depth) => {
                 gl.depth_func(depth.function);
diff --git a/wgpu-info/src/main.rs b/wgpu-info/src/main.rs
index 0f79cfd74..c1cdd4221 100644
--- a/wgpu-info/src/main.rs
+++ b/wgpu-info/src/main.rs
@@ -70,7 +70,8 @@ fn print_info_from_adapter(adapter: &wgpu::Adapter, idx: usize) {
     println!("\tDownlevel Properties:");
     let wgpu::DownlevelCapabilities {
         shader_model,
-        flags
+        limits: _,
+        flags,
     } = downlevel;
     println!("\t\tShader Model: {:?}", shader_model);
     for i in 0..(size_of::<wgpu::DownlevelFlags>() * 8) {
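The wgpu-types hunk below is also why the earlier wgpu-core changes swap `device.downlevel` for `device.downlevel.clone()`: once `DownlevelCapabilities` carries a `DownlevelLimits` field (empty today, but expected to grow non-`Copy` contents), it drops its `Copy` derive. A reduced illustration of the knock-on effect, with stand-in types:

```rust
#[derive(Clone, Debug)] // previously also `Copy`
struct DownlevelCapabilities {
    flags: u32, // stand-in for `DownlevelFlags`
}

struct Device {
    downlevel: DownlevelCapabilities,
}

impl Device {
    // With `Copy`, returning `self.downlevel` compiled as an implicit copy;
    // without it, moving out of `&self` is an error, so we clone explicitly.
    fn downlevel_properties(&self) -> DownlevelCapabilities {
        self.downlevel.clone()
    }
}
```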
diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs
index 94016d881..a732ef7aa 100644
--- a/wgpu-types/src/lib.rs
+++ b/wgpu-types/src/lib.rs
@@ -586,11 +586,24 @@ impl Default for Limits {
     }
 }

+/// Represents the set of additional limits on an adapter,
+/// which apply when running on downlevel backends.
+#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+pub struct DownlevelLimits {}
+
+impl Default for DownlevelLimits {
+    fn default() -> Self {
+        DownlevelLimits {}
+    }
+}
+
 /// Lists various ways the underlying platform does not conform to the WebGPU standard.
-#[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
+#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)]
 pub struct DownlevelCapabilities {
     /// Combined boolean flags.
     pub flags: DownlevelFlags,
+    /// Additional limits.
+    pub limits: DownlevelLimits,
     /// Which collections of features shaders support. Defined in terms of D3D's shader models.
     pub shader_model: ShaderModel,
 }
@@ -599,6 +612,7 @@ impl Default for DownlevelCapabilities {
     fn default() -> Self {
         Self {
             flags: DownlevelFlags::COMPLIANT,
+            limits: DownlevelLimits::default(),
             shader_model: ShaderModel::Sm5,
         }
     }
@@ -609,8 +623,10 @@ impl DownlevelCapabilities {
     ///
     /// If this returns false, some parts of the API will result in validation errors where they would not normally.
     /// These parts can be determined by the values in this structure.
-    pub fn is_webgpu_compliant(self) -> bool {
-        self.flags.contains(DownlevelFlags::COMPLIANT) && self.shader_model >= ShaderModel::Sm5
+    pub fn is_webgpu_compliant(&self) -> bool {
+        self.flags.contains(DownlevelFlags::COMPLIANT)
+            && self.limits == DownlevelLimits::default()
+            && self.shader_model >= ShaderModel::Sm5
     }
 }
diff --git a/wgpu/tests/common/mod.rs b/wgpu/tests/common/mod.rs
index 4b4422a5e..330484920 100644
--- a/wgpu/tests/common/mod.rs
+++ b/wgpu/tests/common/mod.rs
@@ -66,6 +66,7 @@ pub fn lowest_reasonable_limits() -> Limits {

 fn lowest_downlevel_properties() -> DownlevelCapabilities {
     DownlevelCapabilities {
         flags: wgt::DownlevelFlags::empty(),
+        limits: wgt::DownlevelLimits {},
         shader_model: wgt::ShaderModel::Sm2,
     }
 }
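For context, this is roughly how a client consumes the reworked capabilities (a sketch against the wgpu API of this era, where `get_downlevel_properties` was the adapter accessor; it was later renamed):

```rust
fn report_downlevel(adapter: &wgpu::Adapter) {
    let caps = adapter.get_downlevel_properties();
    if !caps.is_webgpu_compliant() {
        // On the GLES3 backend this mirrors the new `log::info!` output in
        // `wgpu-core/src/instance.rs`: the flags, limits, and shader model
        // that fall short of full WebGPU semantics.
        println!("downlevel capabilities: {:#?}", caps);
    }
}
```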