32 files changed, 442 insertions, 187 deletions
diff --git a/src/mesa/drivers/dri/i915/intel_context.c b/src/mesa/drivers/dri/i915/intel_context.c
index e5a3f00..5607d5b 100644
--- a/src/mesa/drivers/dri/i915/intel_context.c
+++ b/src/mesa/drivers/dri/i915/intel_context.c
@@ -858,6 +858,7 @@ intel_update_image_buffers(struct intel_context *intel, __DRIdrawable *drawable)
    struct __DRIimageList images;
    unsigned int format;
    uint32_t buffer_mask = 0;
+   int ret;
 
    front_rb = intel_get_renderbuffer(fb, BUFFER_FRONT_LEFT);
    back_rb = intel_get_renderbuffer(fb, BUFFER_BACK_LEFT);
@@ -877,12 +878,14 @@ intel_update_image_buffers(struct intel_context *intel, __DRIdrawable *drawable)
    if (back_rb)
       buffer_mask |= __DRI_IMAGE_BUFFER_BACK;
 
-   (*screen->image.loader->getBuffers) (drawable,
-                                        driGLFormatToImageFormat(format),
-                                        &drawable->dri2.stamp,
-                                        drawable->loaderPrivate,
-                                        buffer_mask,
-                                        &images);
+   ret = screen->image.loader->getBuffers(drawable,
+                                          driGLFormatToImageFormat(format),
+                                          &drawable->dri2.stamp,
+                                          drawable->loaderPrivate,
+                                          buffer_mask,
+                                          &images);
+   if (!ret)
+      return;
 
    if (images.image_mask & __DRI_IMAGE_BUFFER_FRONT) {
       drawable->w = images.front->width;
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index f448551..194b412 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -48,6 +48,7 @@ i965_compiler_FILES = \
 	brw_nir_attribute_workarounds.c \
 	brw_nir_intrinsics.c \
 	brw_nir_opt_peephole_ffma.c \
+	brw_nir_tcs_workarounds.c \
 	brw_packed_float.c \
 	brw_predicated_break.cpp \
 	brw_reg.h \
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.c b/src/mesa/drivers/dri/i965/brw_blorp.c
index 9590968..6be82c5 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.c
+++ b/src/mesa/drivers/dri/i965/brw_blorp.c
@@ -167,7 +167,8 @@ nir_uniform_type_size(const struct glsl_type *type)
 }
 
 const unsigned *
-brw_blorp_compile_nir_shader(struct brw_context *brw, struct nir_shader *nir,
+brw_blorp_compile_nir_shader(struct brw_context *brw, void *mem_ctx,
+                             struct nir_shader *nir,
                              const struct brw_wm_prog_key *wm_key,
                              bool use_repclear,
                              struct brw_blorp_prog_data *prog_data,
@@ -175,13 +176,6 @@ brw_blorp_compile_nir_shader(struct brw_context *brw, struct nir_shader *nir,
 {
    const struct brw_compiler *compiler = brw->intelScreen->compiler;
 
-   void *mem_ctx = ralloc_context(NULL);
-
-   /* Calling brw_preprocess_nir and friends is destructive and, if cloning is
-    * enabled, may end up completely replacing the nir_shader.  Therefore, we
-    * own it and might as well put it in our context for easy cleanup.
-    */
-   ralloc_steal(mem_ctx, nir);
    nir->options =
       compiler->glsl_compiler_options[MESA_SHADER_FRAGMENT].NirOptions;
 
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h b/src/mesa/drivers/dri/i965/brw_blorp.h
index 7ec5875..133a8ac 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp.h
@@ -366,7 +366,8 @@ struct brw_blorp_blit_prog_key
 void brw_blorp_init_wm_prog_key(struct brw_wm_prog_key *wm_key);
 
 const unsigned *
-brw_blorp_compile_nir_shader(struct brw_context *brw, struct nir_shader *nir,
+brw_blorp_compile_nir_shader(struct brw_context *brw, void *mem_ctx,
+                             struct nir_shader *nir,
                              const struct brw_wm_prog_key *wm_key,
                              bool use_repclear,
                              struct brw_blorp_prog_data *prog_data,
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index 782d285..db94f33 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -1296,7 +1296,7 @@ blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos,
  * of samples).
  */
 static nir_shader *
-brw_blorp_build_nir_shader(struct brw_context *brw,
+brw_blorp_build_nir_shader(struct brw_context *brw, void *mem_ctx,
                            const brw_blorp_blit_prog_key *key)
 {
    nir_ssa_def *src_pos, *dst_pos, *color;
@@ -1342,7 +1342,7 @@ brw_blorp_build_nir_shader(struct brw_context *brw,
           (key->dst_samples == 0));
 
    nir_builder b;
-   nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
+   nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, NULL);
 
    struct brw_blorp_blit_vars v;
    brw_blorp_blit_vars_init(&b, &v, key);
@@ -1505,6 +1505,8 @@ brw_blorp_get_blit_kernel(struct brw_context *brw,
                         &params->wm_prog_kernel, &params->wm_prog_data))
       return;
 
+   void *mem_ctx = ralloc_context(NULL);
+
    const unsigned *program;
    unsigned program_size;
    struct brw_blorp_prog_data prog_data;
@@ -1512,7 +1514,7 @@ brw_blorp_get_blit_kernel(struct brw_context *brw,
    /* Try and compile with NIR first.  If that fails, fall back to the old
     * method of building shaders manually.
     */
-   nir_shader *nir = brw_blorp_build_nir_shader(brw, prog_key);
+   nir_shader *nir = brw_blorp_build_nir_shader(brw, mem_ctx, prog_key);
    struct brw_wm_prog_key wm_key;
    brw_blorp_init_wm_prog_key(&wm_key);
    wm_key.tex.compressed_multisample_layout_mask =
@@ -1520,7 +1522,7 @@ brw_blorp_get_blit_kernel(struct brw_context *brw,
    wm_key.tex.msaa_16 = prog_key->tex_samples == 16;
    wm_key.multisample_fbo = prog_key->rt_samples > 1;
 
-   program = brw_blorp_compile_nir_shader(brw, nir, &wm_key, false,
+   program = brw_blorp_compile_nir_shader(brw, mem_ctx, nir, &wm_key, false,
                                           &prog_data, &program_size);
 
    brw_upload_cache(&brw->cache, BRW_CACHE_BLORP_PROG,
@@ -1528,6 +1530,8 @@ brw_blorp_get_blit_kernel(struct brw_context *brw,
                     program, program_size,
                     &prog_data, sizeof(prog_data),
                     &params->wm_prog_kernel, &params->wm_prog_data);
+
+   ralloc_free(mem_ctx);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_clear.cpp b/src/mesa/drivers/dri/i965/brw_blorp_clear.cpp
index 2515a04..6400218 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_clear.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_clear.cpp
@@ -64,7 +64,7 @@ brw_blorp_params_get_clear_kernel(struct brw_context *brw,
    void *mem_ctx = ralloc_context(NULL);
 
    nir_builder b;
-   nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
+   nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, NULL);
    b.shader->info.name = ralloc_strdup(b.shader, "BLORP-clear");
 
    nir_variable *u_color = nir_variable_create(b.shader, nir_var_uniform,
@@ -84,7 +84,8 @@ brw_blorp_params_get_clear_kernel(struct brw_context *brw,
    struct brw_blorp_prog_data prog_data;
    unsigned program_size;
    const unsigned *program =
-      brw_blorp_compile_nir_shader(brw, b.shader, &wm_key, use_replicated_data,
+      brw_blorp_compile_nir_shader(brw, mem_ctx,
+                                   b.shader, &wm_key, use_replicated_data,
                                    &prog_data, &program_size);
 
    brw_upload_cache(&brw->cache, BRW_CACHE_BLORP_PROG,
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 10e9f47..7d15c28 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -220,6 +220,8 @@ struct brw_tcs_prog_key
    /** A bitfield of per-vertex outputs written. */
    uint64_t outputs_written;
 
+   bool quads_workaround;
+
    struct brw_sampler_prog_key_data tex;
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 1cb99da..2af42e0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -5681,7 +5681,7 @@ fs_visitor::setup_gs_payload()
     * have to multiply by VerticesIn to obtain the total storage requirement.
     */
    if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
-       max_push_components) {
+       max_push_components || gs_prog_data->invocations > 1) {
       gs_prog_data->base.include_vue_handles = true;
 
       /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 11c078a..91763d3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -2322,23 +2322,23 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
          break;
 
       fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-      fs_reg m0_2 = byte_offset(m0, 2 * sizeof(uint32_t));
+      fs_reg m0_2 = component(m0, 2);
 
-      const fs_builder fwa_bld = bld.exec_all();
+      const fs_builder chanbld = bld.exec_all().group(1, 0);
 
       /* Zero the message header */
-      fwa_bld.MOV(m0, brw_imm_ud(0u));
+      bld.exec_all().MOV(m0, brw_imm_ud(0u));
 
       /* Copy "Barrier ID" from r0.2, bits 16:13 */
-      fwa_bld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
+      chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
                   brw_imm_ud(INTEL_MASK(16, 13)));
 
       /* Shift it up to bits 27:24. */
-      fwa_bld.SHL(m0_2, m0_2, brw_imm_ud(11));
+      chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
 
       /* Set the Barrier Count and the enable bit */
-      fwa_bld.OR(m0_2, m0_2,
-                 brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
+      chanbld.OR(m0_2, m0_2,
+                 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
 
       bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
       break;
@@ -4060,12 +4060,23 @@ fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
       dest = get_nir_dest(instr->dest);
 
    fs_reg surface = brw_imm_ud(GEN7_BTI_SLM);
-   fs_reg offset = get_nir_src(instr->src[0]);
+   fs_reg offset;
    fs_reg data1 = get_nir_src(instr->src[1]);
    fs_reg data2;
    if (op == BRW_AOP_CMPWR)
       data2 = get_nir_src(instr->src[2]);
 
+   /* Get the offset */
+   nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+   if (const_offset) {
+      offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
+   } else {
+      offset = vgrf(glsl_type::uint_type);
+      bld.ADD(offset,
+	      retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
+	      brw_imm_ud(instr->const_index[0]));
+   }
+
    /* Emit the actual atomic operation operation */
 
    fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h
index 74c354f..6185310 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -117,6 +117,8 @@ bool brw_nir_apply_attribute_workarounds(nir_shader *nir,
 
 bool brw_nir_apply_trig_workarounds(nir_shader *nir);
 
+void brw_nir_apply_tcs_quads_workaround(nir_shader *nir);
+
 nir_shader *brw_nir_apply_sampler_key(nir_shader *nir,
                                       const struct brw_device_info *devinfo,
                                       const struct brw_sampler_prog_key_data *key,
diff --git a/src/mesa/drivers/dri/i965/brw_nir_tcs_workarounds.c b/src/mesa/drivers/dri/i965/brw_nir_tcs_workarounds.c
new file mode 100644
index 0000000..0626981
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_nir_tcs_workarounds.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "brw_nir.h"
+
+/**
+ * Implements the WaPreventHSTessLevelsInterference workaround (for Gen7-8).
+ *
+ * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU), Page 494 (below the
+ * definition of the patch header layouts):
+ *
+ *    "HW Bug: The Tessellation stage will incorrectly add domain points
+ *     along patch edges under the following conditions, which may result
+ *     in conformance failures and/or cracking artifacts:
+ *
+ *       * QUAD domain
+ *       * INTEGER partitioning
+ *       * All three TessFactors in a given U or V direction (e.g., V
+ *         direction: UEQ0, InsideV, UEQ1) are all exactly 1.0
+ *       * All three TessFactors in the other direction are > 1.0 and all
+ *         round up to the same integer value (e.g, U direction:
+ *         VEQ0 = 3.1, InsideU = 3.7, VEQ1 = 3.4)
+ *
+ *     The suggested workaround (to be implemented as part of the postamble
+ *     to the HS shader in the HS kernel) is:
+ *
+ *     if (
+ *        (TF[UEQ0] > 1.0) ||
+ *        (TF[VEQ0] > 1.0) ||
+ *        (TF[UEQ1] > 1.0) ||
+ *        (TF[VEQ1] > 1.0) ||
+ *        (TF[INSIDE_U] > 1.0) ||
+ *        (TF[INSIDE_V] > 1.0) )
+ *     {
+ *        TF[INSIDE_U] = (TF[INSIDE_U] == 1.0) ? 2.0 : TF[INSIDE_U];
+ *        TF[INSIDE_V] = (TF[INSIDE_V] == 1.0) ? 2.0 : TF[INSIDE_V];
+ *     }"
+ *
+ * There's a subtlety here.  Intel internal HSD-ES bug 1208668495 notes
+ * that the above workaround fails to fix certain GL/ES CTS tests which
+ * have inside tessellation factors of -1.0.  This can be explained by
+ * a quote from the ARB_tessellation_shader specification:
+ *
+ *    "If "equal_spacing" is used, the floating-point tessellation level is
+ *     first clamped to the range [1,<max>], where <max> is implementation-
+ *     dependent maximum tessellation level (MAX_TESS_GEN_LEVEL)."
+ *
+ * In other words, the actual inner tessellation factor used is
+ * clamp(TF[INSIDE_*], 1.0, 64.0).  So we want to compare the clamped
+ * value against 1.0.  To accomplish this, we change the comparison from
+ * (TF[INSIDE_*] == 1.0) to (TF[INSIDE_*] <= 1.0).
+ */
+
+static inline nir_ssa_def *
+load_output(nir_builder *b, int num_components, int offset)
+{
+   nir_intrinsic_instr *load =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_output);
+   nir_ssa_dest_init(&load->instr, &load->dest, num_components, 32, NULL);
+   load->num_components = num_components;
+   load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
+   nir_intrinsic_set_base(load, offset);
+
+   nir_builder_instr_insert(b, &load->instr);
+
+   return &load->dest.ssa;
+}
+
+static inline void
+store_output(nir_builder *b, nir_ssa_def *value, int offset, unsigned comps)
+{
+   nir_intrinsic_instr *store =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+   store->num_components = comps;
+   nir_intrinsic_set_write_mask(store, (1u << comps) - 1);
+   store->src[0] = nir_src_for_ssa(value);
+   store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
+   nir_builder_instr_insert(b, &store->instr);
+}
+
+static void
+emit_quads_workaround(nir_builder *b, nir_block *block)
+{
+   /* We're going to insert a new if-statement in a predecessor of the end
+    * block.  This would normally create a new block (after the if) which
+    * would then become the predecessor of the end block, causing our set
+    * walking to get screwed up.  To avoid this, just emit a constant at
+    * the end of our current block, and insert the if before that.
+    */
+   b->cursor = nir_after_block_before_jump(block);
+   b->cursor = nir_before_instr(nir_imm_int(b, 0)->parent_instr);
+
+   nir_ssa_def *inner = load_output(b, 2, 0);
+   nir_ssa_def *outer = load_output(b, 4, 1);
+
+   nir_ssa_def *any_greater_than_1 =
+       nir_ior(b, nir_bany(b, nir_flt(b, nir_imm_float(b, 1.0f), outer)),
+                  nir_bany(b, nir_flt(b, nir_imm_float(b, 1.0f), inner)));
+
+   nir_if *if_stmt = nir_if_create(b->shader);
+   if_stmt->condition = nir_src_for_ssa(any_greater_than_1);
+   nir_builder_cf_insert(b, &if_stmt->cf_node);
+
+   /* Fill out the new then-block */
+   b->cursor = nir_after_cf_list(&if_stmt->then_list);
+
+   store_output(b, nir_bcsel(b, nir_fge(b, nir_imm_float(b, 1.0f), inner),
+                                nir_imm_float(b, 2.0f), inner), 0, 2);
+}
+
+void
+brw_nir_apply_tcs_quads_workaround(nir_shader *nir)
+{
+   assert(nir->stage == MESA_SHADER_TESS_CTRL);
+
+   nir_foreach_function(func, nir) {
+      if (!func->impl)
+         continue;
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      struct set_entry *entry;
+      set_foreach(func->impl->end_block->predecessors, entry) {
+         nir_block *pred = (nir_block *) entry->key;
+         emit_quads_workaround(&b, pred);
+      }
+
+      nir_metadata_preserve(func->impl, 0);
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
index a91c6e2..a42a322 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
@@ -686,12 +686,12 @@ stop_oa_counters(struct brw_context *brw)
  * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot,
  * including the required PIPE_CONTROL flushes.
  *
- * Sandybridge is the worst case scenario: brw_emit_mi_flush
- * expands to three PIPE_CONTROLs which are 4 DWords each.  We have to flush
- * before and after MI_REPORT_PERF_COUNT, so multiply by two.  Finally, add
- * the 3 DWords for MI_REPORT_PERF_COUNT itself.
+ * Sandybridge is the worst case scenario: brw_emit_mi_flush expands to four
+ * PIPE_CONTROLs which are 5 DWords each.  We have to flush before and after
+ * MI_REPORT_PERF_COUNT, so multiply by two.  Finally, add the 3 DWords for
+ * MI_REPORT_PERF_COUNT itself.
  */
-#define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (3 * 4) + 3)
+#define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (4 * 5) + 3)
 
 /**
  * Emit an MI_REPORT_PERF_COUNT command packet.
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
index 4672efd..d51cf1b 100644
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -96,10 +96,38 @@ gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
 void
 brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
 {
+   if (brw->gen >= 6 &&
+       (flags & PIPE_CONTROL_CACHE_FLUSH_BITS) &&
+       (flags & PIPE_CONTROL_CACHE_INVALIDATE_BITS)) {
+      /* A pipe control command with flush and invalidate bits set
+       * simultaneously is an inherently racy operation on Gen6+ if the
+       * contents of the flushed caches were intended to become visible from
+       * any of the invalidated caches.  Split it in two PIPE_CONTROLs, the
+       * first one should stall the pipeline to make sure that the flushed R/W
+       * caches are coherent with memory once the specified R/O caches are
+       * invalidated.  On pre-Gen6 hardware the (implicit) R/O cache
+       * invalidation seems to happen at the bottom of the pipeline together
+       * with any write cache flush, so this shouldn't be a concern.
+       */
+      brw_emit_pipe_control_flush(brw, (flags & PIPE_CONTROL_CACHE_FLUSH_BITS) |
+                                       PIPE_CONTROL_CS_STALL);
+      flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL);
+   }
+
    if (brw->gen >= 8) {
       if (brw->gen == 8)
          gen8_add_cs_stall_workaround_bits(&flags);
 
+      if (brw->gen == 9 &&
+          (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
+         /* Hardware workaround: SKL
+          *
+          * Emit Pipe Control with all bits set to zero before emitting
+          * a Pipe Control with VF Cache Invalidate set.
+          */
+         brw_emit_pipe_control_flush(brw, 0);
+      }
+
       BEGIN_BATCH(6);
       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
       OUT_BATCH(flags);
@@ -311,15 +339,6 @@ brw_emit_mi_flush(struct brw_context *brw)
    } else {
       int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
       if (brw->gen >= 6) {
-         if (brw->gen == 9) {
-            /* Hardware workaround: SKL
-             *
-             * Emit Pipe Control with all bits set to zero before emitting
-             * a Pipe Control with VF Cache Invalidate set.
-             */
-            brw_emit_pipe_control_flush(brw, 0);
-         }
-
          flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                   PIPE_CONTROL_VF_CACHE_INVALIDATE |
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
index 8a5dd7e..6b7fde2 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -153,6 +153,8 @@ brw_tcs_debug_recompile(struct brw_context *brw,
                       key->patch_outputs_written);
    found |= key_debug(brw, "TES primitive mode", old_key->tes_primitive_mode,
                       key->tes_primitive_mode);
+   found |= key_debug(brw, "quads and equal_spacing workaround",
+                      old_key->quads_workaround, key->quads_workaround);
    found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex);
 
    if (!found) {
@@ -346,6 +348,9 @@ brw_upload_tcs_prog(struct brw_context *brw,
     * based on the domain the DS is expecting to tessellate.
     */
    key.tes_primitive_mode = tep->program.PrimitiveMode;
+   key.quads_workaround = brw->gen < 9 &&
+                          tep->program.PrimitiveMode == GL_QUADS &&
+                          tep->program.Spacing == GL_EQUAL;
 
    if (tcp) {
       key.program_string_id = tcp->id;
@@ -383,6 +388,8 @@ brw_tcs_precompile(struct gl_context *ctx,
 
    struct gl_tess_ctrl_program *tcp = (struct gl_tess_ctrl_program *)prog;
    struct brw_tess_ctrl_program *btcp = brw_tess_ctrl_program(tcp);
+   const struct gl_shader *tes =
+      shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
 
    memset(&key, 0, sizeof(key));
 
@@ -393,9 +400,14 @@ brw_tcs_precompile(struct gl_context *ctx,
    if (brw->gen < 8)
       key.input_vertices = shader_prog->TessCtrl.VerticesOut;
 
-   key.tes_primitive_mode =
-      shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL] ?
-      shader_prog->TessEval.PrimitiveMode : GL_TRIANGLES;
+   if (tes) {
+      key.tes_primitive_mode = shader_prog->TessEval.PrimitiveMode;
+      key.quads_workaround = brw->gen < 9 &&
+                             shader_prog->TessEval.PrimitiveMode == GL_QUADS &&
+                             shader_prog->TessEval.Spacing == GL_EQUAL;
+   } else {
+      key.tes_primitive_mode = GL_TRIANGLES;
+   }
 
    key.outputs_written = prog->OutputsWritten;
    key.patch_outputs_written = prog->PatchOutputsWritten;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 162b481..a7398a7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -354,95 +354,97 @@ vec4_visitor::opt_vector_float()
 {
    bool progress = false;
 
-   int last_reg = -1, last_reg_offset = -1;
-   enum brw_reg_file last_reg_file = BAD_FILE;
+   foreach_block(block, cfg) {
+      int last_reg = -1, last_reg_offset = -1;
+      enum brw_reg_file last_reg_file = BAD_FILE;
+
+      uint8_t imm[4] = { 0 };
+      int inst_count = 0;
+      vec4_instruction *imm_inst[4];
+      unsigned writemask = 0;
+      enum brw_reg_type dest_type = BRW_REGISTER_TYPE_F;
+
+      foreach_inst_in_block_safe(vec4_instruction, inst, block) {
+         int vf = -1;
+         enum brw_reg_type need_type;
+
+         /* Look for unconditional MOVs from an immediate with a partial
+          * writemask.  Skip type-conversion MOVs other than integer 0,
+          * where the type doesn't matter.  See if the immediate can be
+          * represented as a VF.
+          */
+         if (inst->opcode == BRW_OPCODE_MOV &&
+             inst->src[0].file == IMM &&
+             inst->predicate == BRW_PREDICATE_NONE &&
+             inst->dst.writemask != WRITEMASK_XYZW &&
+             (inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) {
+
+            vf = brw_float_to_vf(inst->src[0].d);
+            need_type = BRW_REGISTER_TYPE_D;
+
+            if (vf == -1) {
+               vf = brw_float_to_vf(inst->src[0].f);
+               need_type = BRW_REGISTER_TYPE_F;
+            }
+         } else {
+            last_reg = -1;
+         }
 
-   uint8_t imm[4] = { 0 };
-   int inst_count = 0;
-   vec4_instruction *imm_inst[4];
-   unsigned writemask = 0;
-   enum brw_reg_type dest_type = BRW_REGISTER_TYPE_F;
+         /* If this wasn't a MOV, or the destination register doesn't match,
+          * or we have to switch destination types, then this breaks our
+          * sequence.  Combine anything we've accumulated so far.
+          */
+         if (last_reg != inst->dst.nr ||
+             last_reg_offset != inst->dst.reg_offset ||
+             last_reg_file != inst->dst.file ||
+             (vf > 0 && dest_type != need_type)) {
+
+            if (inst_count > 1) {
+               unsigned vf;
+               memcpy(&vf, imm, sizeof(vf));
+               vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf));
+               mov->dst.type = dest_type;
+               mov->dst.writemask = writemask;
+               inst->insert_before(block, mov);
+
+               for (int i = 0; i < inst_count; i++) {
+                  imm_inst[i]->remove(block);
+               }
 
-   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
-      int vf = -1;
-      enum brw_reg_type need_type;
+               progress = true;
+            }
 
-      /* Look for unconditional MOVs from an immediate with a partial
-       * writemask.  Skip type-conversion MOVs other than integer 0,
-       * where the type doesn't matter.  See if the immediate can be
-       * represented as a VF.
-       */
-      if (inst->opcode == BRW_OPCODE_MOV &&
-          inst->src[0].file == IMM &&
-          inst->predicate == BRW_PREDICATE_NONE &&
-          inst->dst.writemask != WRITEMASK_XYZW &&
-          (inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) {
-
-         vf = brw_float_to_vf(inst->src[0].d);
-         need_type = BRW_REGISTER_TYPE_D;
-
-         if (vf == -1) {
-            vf = brw_float_to_vf(inst->src[0].f);
-            need_type = BRW_REGISTER_TYPE_F;
-         }
-      } else {
-         last_reg = -1;
-      }
+            inst_count = 0;
+            last_reg = -1;
+            writemask = 0;
+            dest_type = BRW_REGISTER_TYPE_F;
 
-      /* If this wasn't a MOV, or the destination register doesn't match,
-       * or we have to switch destination types, then this breaks our
-       * sequence.  Combine anything we've accumulated so far.
-       */
-      if (last_reg != inst->dst.nr ||
-          last_reg_offset != inst->dst.reg_offset ||
-          last_reg_file != inst->dst.file ||
-          (vf > 0 && dest_type != need_type)) {
-
-         if (inst_count > 1) {
-            unsigned vf;
-            memcpy(&vf, imm, sizeof(vf));
-            vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf));
-            mov->dst.type = dest_type;
-            mov->dst.writemask = writemask;
-            inst->insert_before(block, mov);
-
-            for (int i = 0; i < inst_count; i++) {
-               imm_inst[i]->remove(block);
+            for (int i = 0; i < 4; i++) {
+               imm[i] = 0;
             }
-
-            progress = true;
          }
 
-         inst_count = 0;
-         last_reg = -1;
-         writemask = 0;
-         dest_type = BRW_REGISTER_TYPE_F;
-
-         for (int i = 0; i < 4; i++) {
-            imm[i] = 0;
+         /* Record this instruction's value (if it was representable). */
+         if (vf != -1) {
+            if ((inst->dst.writemask & WRITEMASK_X) != 0)
+               imm[0] = vf;
+            if ((inst->dst.writemask & WRITEMASK_Y) != 0)
+               imm[1] = vf;
+            if ((inst->dst.writemask & WRITEMASK_Z) != 0)
+               imm[2] = vf;
+            if ((inst->dst.writemask & WRITEMASK_W) != 0)
+               imm[3] = vf;
+
+            writemask |= inst->dst.writemask;
+            imm_inst[inst_count++] = inst;
+
+            last_reg = inst->dst.nr;
+            last_reg_offset = inst->dst.reg_offset;
+            last_reg_file = inst->dst.file;
+            if (vf > 0)
+               dest_type = need_type;
          }
       }
-
-      /* Record this instruction's value (if it was representable). */
-      if (vf != -1) {
-         if ((inst->dst.writemask & WRITEMASK_X) != 0)
-            imm[0] = vf;
-         if ((inst->dst.writemask & WRITEMASK_Y) != 0)
-            imm[1] = vf;
-         if ((inst->dst.writemask & WRITEMASK_Z) != 0)
-            imm[2] = vf;
-         if ((inst->dst.writemask & WRITEMASK_W) != 0)
-            imm[3] = vf;
-
-         writemask |= inst->dst.writemask;
-         imm_inst[inst_count++] = inst;
-
-         last_reg = inst->dst.nr;
-         last_reg_offset = inst->dst.reg_offset;
-         last_reg_file = inst->dst.file;
-         if (vf > 0)
-            dest_type = need_type;
-      }
    }
 
    if (progress)
@@ -1109,7 +1111,7 @@ vec4_visitor::opt_register_coalesce()
       /* Can't coalesce this GRF if someone else was going to
        * read it later.
        */
-      if (var_range_end(var_from_reg(alloc, inst->src[0]), 4) > ip)
+      if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 4) > ip)
 	 continue;
 
       /* We need to check interference with the final destination between this
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
index 0c1f0c3..10898a5 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
@@ -246,7 +246,7 @@ vec4_visitor::opt_cse_local(bblock_t *block)
              * more -- a sure sign they'll fail operands_match().
              */
             if (src->file == VGRF) {
-               if (var_range_end(var_from_reg(alloc, *src), 4) < ip) {
+               if (var_range_end(var_from_reg(alloc, dst_reg(*src)), 4) < ip) {
                   entry->remove();
                   ralloc_free(entry);
                   break;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 927438f..26a910c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -59,7 +59,10 @@ vec4_gs_visitor::make_reg_for_system_value(int location)
    switch (location) {
    case SYSTEM_VALUE_INVOCATION_ID:
       this->current_annotation = "initialize gl_InvocationID";
-      emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
+      if (gs_prog_data->invocations > 1)
+         emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
+      else
+         emit(MOV(*reg, brw_imm_ud(0)));
       break;
    default:
       unreachable("not reached");
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index f61c612..5440dba 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -451,6 +451,9 @@ brw_compile_tcs(const struct brw_compiler *compiler,
    nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
    brw_nir_lower_vue_inputs(nir, is_scalar, &input_vue_map);
    brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map);
+   if (key->quads_workaround)
+      brw_nir_apply_tcs_quads_workaround(nir);
+
    nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
 
    if (is_scalar)
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 609285e..61ada53 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -1443,10 +1443,12 @@ brw_upload_cs_work_groups_surface(struct brw_context *brw)
    /* _NEW_PROGRAM */
    struct gl_shader_program *prog =
       ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
+   /* BRW_NEW_CS_PROG_DATA */
+   const struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data;
 
-   if (prog && brw->cs.prog_data->uses_num_work_groups) {
+   if (prog && cs_prog_data->uses_num_work_groups) {
       const unsigned surf_idx =
-         brw->cs.prog_data->binding_table.work_groups_start;
+         cs_prog_data->binding_table.work_groups_start;
       uint32_t *surf_offset = &brw->cs.base.surf_offset[surf_idx];
       drm_intel_bo *bo;
       uint32_t bo_offset;
@@ -1475,6 +1477,7 @@ brw_upload_cs_work_groups_surface(struct brw_context *brw)
 const struct brw_tracked_state brw_cs_work_groups_surface = {
    .dirty = {
       .brw = BRW_NEW_BLORP |
+             BRW_NEW_CS_PROG_DATA |
              BRW_NEW_CS_WORK_GROUPS
    },
    .emit = brw_upload_cs_work_groups_surface,
diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index 26de633..64ccdb6 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -50,6 +50,7 @@ upload_clip_state(struct brw_context *brw)
       dw2 |= GEN6_CLIP_NON_PERSPECTIVE_BARYCENTRIC_ENABLE;
    }
 
+   /* BRW_NEW_VS_PROG_DATA */
    dw1 |= brw->vs.prog_data->base.cull_distance_mask;
 
    if (brw->gen >= 7)
@@ -224,6 +225,7 @@ const struct brw_tracked_state gen7_clip_state = {
                BRW_NEW_CONTEXT |
                BRW_NEW_FS_PROG_DATA |
                BRW_NEW_GEOMETRY_PROGRAM |
+               BRW_NEW_VS_PROG_DATA |
                BRW_NEW_META_IN_PROGRESS |
                BRW_NEW_PRIMITIVE |
                BRW_NEW_RASTERIZER_DISCARD |
diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c
index 5427fa5..b245226 100644
--- a/src/mesa/drivers/dri/i965/gen7_cs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c
@@ -283,7 +283,7 @@ gen7_upload_cs_push_constants(struct brw_context *brw)
       (struct brw_compute_program *) brw->compute_program;
 
    if (cp) {
-      /* CACHE_NEW_CS_PROG */
+      /* BRW_NEW_CS_PROG_DATA */
       struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data;
 
       brw_upload_cs_push_constants(brw, &cp->program.Base, cs_prog_data,
@@ -297,6 +297,7 @@ const struct brw_tracked_state gen7_cs_push_constants = {
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
              BRW_NEW_COMPUTE_PROGRAM |
+             BRW_NEW_CS_PROG_DATA |
              BRW_NEW_PUSH_CONSTANT_ALLOCATION,
    },
    .emit = gen7_upload_cs_push_constants,
diff --git a/src/mesa/drivers/dri/i965/gen8_ds_state.c b/src/mesa/drivers/dri/i965/gen8_ds_state.c
index 6f01abb..3b79b55 100644
--- a/src/mesa/drivers/dri/i965/gen8_ds_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ds_state.c
@@ -69,6 +69,7 @@ gen8_upload_ds_state(struct brw_context *brw)
                  GEN7_DS_SIMD8_DISPATCH_ENABLE : 0) |
                 (tes_prog_data->domain == BRW_TESS_DOMAIN_TRI ?
                  GEN7_DS_COMPUTE_W_COORDINATE_ENABLE : 0));
+      /* _NEW_TRANSFORM */
       OUT_BATCH(SET_FIELD(ctx->Transform.ClipPlanesEnabled,
                           GEN8_DS_USER_CLIP_DISTANCE) |
                 SET_FIELD(vue_prog_data->cull_distance_mask,
@@ -106,7 +107,7 @@ gen8_upload_ds_state(struct brw_context *brw)
 
 const struct brw_tracked_state gen8_ds_state = {
    .dirty = {
-      .mesa  = 0,
+      .mesa  = _NEW_TRANSFORM,
       .brw   = BRW_NEW_BATCH |
                BRW_NEW_BLORP |
                BRW_NEW_TESS_PROGRAMS |
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index 8a904fe..f916d99 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -124,6 +124,7 @@ const struct brw_tracked_state gen8_ps_extra = {
       .mesa  = _NEW_BUFFERS | _NEW_COLOR,
       .brw   = BRW_NEW_BLORP |
                BRW_NEW_CONTEXT |
+               BRW_NEW_FRAGMENT_PROGRAM |
                BRW_NEW_FS_PROG_DATA,
    },
    .emit = upload_ps_extra,
@@ -283,7 +284,6 @@ const struct brw_tracked_state gen8_ps_state = {
       .mesa  = _NEW_MULTISAMPLE,
       .brw   = BRW_NEW_BATCH |
                BRW_NEW_BLORP |
-               BRW_NEW_FRAGMENT_PROGRAM |
                BRW_NEW_FS_PROG_DATA,
    },
    .emit = upload_ps_state,
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index aa1dc38..67e8e8f 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -21,13 +21,13 @@ extern "C" {
  *   - Gen4-5 record ending occlusion query values (4 * 4 = 16 bytes)
  *   - Disabling OA counters on Gen6+ (3 DWords = 12 bytes)
  *   - Ending MI_REPORT_PERF_COUNT on Gen5+, plus associated PIPE_CONTROLs:
- *     - Two sets of PIPE_CONTROLs, which become 3 PIPE_CONTROLs each on SNB,
- *       which are 5 DWords each ==> 2 * 3 * 5 * 4 = 120 bytes
+ *     - Two sets of PIPE_CONTROLs, which become 4 PIPE_CONTROLs each on SNB,
+ *       which are 5 DWords each ==> 2 * 4 * 5 * 4 = 160 bytes
  *     - 3 DWords for MI_REPORT_PERF_COUNT itself on Gen6+.  ==> 12 bytes.
  *       On Ironlake, it's 6 DWords, but we have some slack due to the lack of
  *       Sandybridge PIPE_CONTROL madness.
- *   - CC_STATE workaround on HSW (12 * 4 = 48 bytes)
- *     - 5 dwords for initial mi_flush
+ *   - CC_STATE workaround on HSW (17 * 4 = 68 bytes)
+ *     - 10 dwords for initial mi_flush
  *     - 2 dwords for CC state setup
  *     - 5 dwords for the required pipe control at the end
  *   - Restoring L3 configuration: (24 dwords = 96 bytes)
@@ -35,7 +35,7 @@ extern "C" {
  *     - 7 dwords for L3 configuration set-up.
  *     - 5 dwords for L3 atomic set-up (on HSW).
  */
-#define BATCH_RESERVED 248
+#define BATCH_RESERVED 308
 
 struct intel_batchbuffer;
 
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index 939f9a0..8a0d2ad 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -374,6 +374,19 @@ intel_image_target_renderbuffer_storage(struct gl_context *ctx,
    if (!irb->mt)
       return;
 
+   /* Adjust the miptree's upper-left coordinate.
+    *
+    * FIXME: Adjusting the miptree's layout outside of
+    * intel_miptree_create_layout() is fragile. Plumb the adjustment through
+    * intel_miptree_create_layout() and brw_tex_layout().
+    */
+   irb->mt->level[0].level_x = image->tile_x;
+   irb->mt->level[0].level_y = image->tile_y;
+   irb->mt->level[0].slice[0].x_offset = image->tile_x;
+   irb->mt->level[0].slice[0].y_offset = image->tile_y;
+   irb->mt->total_width += image->tile_x;
+   irb->mt->total_height += image->tile_y;
+
    rb->InternalFormat = image->internal_format;
    rb->Width = image->width;
    rb->Height = image->height;
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index b6265dc..e74a2dc 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -366,25 +366,8 @@ intel_miptree_create_layout(struct brw_context *brw,
        _mesa_get_format_name(format),
        first_level, last_level, depth0, mt);
 
-   if (target == GL_TEXTURE_1D_ARRAY) {
-      /* For a 1D Array texture the OpenGL API will treat the height0
-       * parameter as the number of array slices. For Intel hardware, we treat
-       * the 1D array as a 2D Array with a height of 1.
-       *
-       * So, when we first come through this path to create a 1D Array
-       * texture, height0 stores the number of slices, and depth0 is 1. In
-       * this case, we want to swap height0 and depth0.
-       *
-       * Since some miptrees will be created based on the base miptree, we may
-       * come through this path and see height0 as 1 and depth0 being the
-       * number of slices. In this case we don't need to do the swap.
-       */
-      assert(height0 == 1 || depth0 == 1);
-      if (height0 > 1) {
-         depth0 = height0;
-         height0 = 1;
-      }
-   }
+   if (target == GL_TEXTURE_1D_ARRAY)
+      assert(height0 == 1);
 
    mt->target = target;
    mt->format = format;
@@ -1050,6 +1033,7 @@ intel_get_image_dims(struct gl_texture_image *image,
        * as a 2D Array with a height of 1. So, here we want to swap image
        * height and depth.
        */
+      assert(image->Depth == 1);
       *width = image->Width;
       *height = 1;
       *depth = image->Height;
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
index a486d6e..cacd7e2 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -110,22 +110,6 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
    if (ctx->_ImageTransferState)
       return false;
 
-   /* This renderbuffer can come from a texture.  In this case, we impose
-    * some of the same restrictions we have for textures and adjust for
-    * miplevels.
-    */
-   if (rb->TexImage) {
-      if (rb->TexImage->TexObject->Target != GL_TEXTURE_2D &&
-          rb->TexImage->TexObject->Target != GL_TEXTURE_RECTANGLE)
-         return false;
-
-      int level = rb->TexImage->Level + rb->TexImage->TexObject->MinLevel;
-
-      /* Adjust x and y offset based on miplevel */
-      xoffset += irb->mt->level[level].level_x;
-      yoffset += irb->mt->level[level].level_y;
-   }
-
    /* It is possible that the renderbuffer (or underlying texture) is
     * multisampled.  Since ReadPixels from a multisampled buffer requires a
     * multisample resolve, we can't handle this here
@@ -169,6 +153,9 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
       return false;
    }
 
+   xoffset += irb->mt->level[irb->mt_level].slice[irb->mt_layer].x_offset;
+   yoffset += irb->mt->level[irb->mt_level].slice[irb->mt_layer].y_offset;
+
    dst_pitch = _mesa_image_row_stride(pack, width, format, type);
 
    /* For a window-system renderbuffer, the buffer is actually flipped
@@ -201,7 +188,7 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
       xoffset * cpp, (xoffset + width) * cpp,
       yoffset, yoffset + height,
       pixels - (ptrdiff_t) yoffset * dst_pitch - (ptrdiff_t) xoffset * cpp,
-      bo->virtual,
+      bo->virtual + irb->mt->offset,
       dst_pitch, irb->mt->pitch,
       brw->has_swizzling,
       irb->mt->tiling,
diff --git a/src/mesa/drivers/dri/i965/intel_reg.h b/src/mesa/drivers/dri/i965/intel_reg.h
index 95365fe..7a82be4 100644
--- a/src/mesa/drivers/dri/i965/intel_reg.h
+++ b/src/mesa/drivers/dri/i965/intel_reg.h
@@ -134,6 +134,15 @@
 #define PIPE_CONTROL_PPGTT_WRITE	(0 << 2)
 #define PIPE_CONTROL_GLOBAL_GTT_WRITE	(1 << 2)
 
+#define PIPE_CONTROL_CACHE_FLUSH_BITS \
+   (PIPE_CONTROL_DEPTH_CACHE_FLUSH | PIPE_CONTROL_DATA_CACHE_FLUSH | \
+    PIPE_CONTROL_RENDER_TARGET_FLUSH)
+
+#define PIPE_CONTROL_CACHE_INVALIDATE_BITS \
+   (PIPE_CONTROL_STATE_CACHE_INVALIDATE | PIPE_CONTROL_CONST_CACHE_INVALIDATE | \
+    PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | \
+    PIPE_CONTROL_INSTRUCTION_INVALIDATE)
+
 /** @} */
 
 #define XY_SETUP_BLT_CMD		(CMD_2D | (0x01 << 22))
diff --git a/src/mesa/drivers/dri/i965/intel_syncobj.c b/src/mesa/drivers/dri/i965/intel_syncobj.c
index 3e359a5..39c9636 100644
--- a/src/mesa/drivers/dri/i965/intel_syncobj.c
+++ b/src/mesa/drivers/dri/i965/intel_syncobj.c
@@ -49,6 +49,7 @@ struct brw_fence {
    /** The fence waits for completion of this batch. */
    drm_intel_bo *batch_bo;
 
+   mtx_t mutex;
    bool signalled;
 };
 
@@ -58,10 +59,20 @@ struct intel_gl_sync_object {
 };
 
 static void
+brw_fence_init(struct brw_context *brw, struct brw_fence *fence)
+{
+   fence->brw = brw;
+   fence->batch_bo = NULL;
+   mtx_init(&fence->mutex, mtx_plain);
+}
+
+static void
 brw_fence_finish(struct brw_fence *fence)
 {
    if (fence->batch_bo)
       drm_intel_bo_unreference(fence->batch_bo);
+
+   mtx_destroy(&fence->mutex);
 }
 
 static void
@@ -77,7 +88,7 @@ brw_fence_insert(struct brw_context *brw, struct brw_fence *fence)
 }
 
 static bool
-brw_fence_has_completed(struct brw_fence *fence)
+brw_fence_has_completed_locked(struct brw_fence *fence)
 {
    if (fence->signalled)
       return true;
@@ -92,13 +103,21 @@ brw_fence_has_completed(struct brw_fence *fence)
    return false;
 }
 
-/**
- * Return true if the function successfully signals or has already signalled.
- * (This matches the behavior expected from __DRI2fence::client_wait_sync).
- */
 static bool
-brw_fence_client_wait(struct brw_context *brw, struct brw_fence *fence,
-                      uint64_t timeout)
+brw_fence_has_completed(struct brw_fence *fence)
+{
+   bool ret;
+
+   mtx_lock(&fence->mutex);
+   ret = brw_fence_has_completed_locked(fence);
+   mtx_unlock(&fence->mutex);
+
+   return ret;
+}
+
+static bool
+brw_fence_client_wait_locked(struct brw_context *brw, struct brw_fence *fence,
+                             uint64_t timeout)
 {
    if (fence->signalled)
       return true;
@@ -123,6 +142,23 @@ brw_fence_client_wait(struct brw_context *brw, struct brw_fence *fence,
    return true;
 }
 
+/**
+ * Return true if the function successfully signals or has already signalled.
+ * (This matches the behavior expected from __DRI2fence::client_wait_sync).
+ */
+static bool
+brw_fence_client_wait(struct brw_context *brw, struct brw_fence *fence,
+                      uint64_t timeout)
+{
+   bool ret;
+
+   mtx_lock(&fence->mutex);
+   ret = brw_fence_client_wait_locked(brw, fence, timeout);
+   mtx_unlock(&fence->mutex);
+
+   return ret;
+}
+
 static void
 brw_fence_server_wait(struct brw_context *brw, struct brw_fence *fence)
 {
@@ -161,6 +197,7 @@ intel_gl_fence_sync(struct gl_context *ctx, struct gl_sync_object *s,
    struct brw_context *brw = brw_context(ctx);
    struct intel_gl_sync_object *sync = (struct intel_gl_sync_object *)s;
 
+   brw_fence_init(brw, &sync->fence);
    brw_fence_insert(brw, &sync->fence);
 }
 
@@ -215,7 +252,7 @@ intel_dri_create_fence(__DRIcontext *ctx)
    if (!fence)
       return NULL;
 
-   fence->brw = brw;
+   brw_fence_init(brw, fence);
    brw_fence_insert(brw, fence);
 
    return fence;
@@ -244,6 +281,12 @@ intel_dri_server_wait_sync(__DRIcontext *ctx, void *driver_fence, unsigned flags
 {
    struct brw_fence *fence = driver_fence;
 
+   /* We might be called here with a NULL fence as a result of WaitSyncKHR
+    * on a EGL_KHR_reusable_sync fence. Nothing to do here in such case.
+    */
+   if (!fence)
+      return;
+
    brw_fence_server_wait(fence->brw, fence);
 }
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex.c b/src/mesa/drivers/dri/i965/intel_tex.c
index cac33ac..a1364b9 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.c
+++ b/src/mesa/drivers/dri/i965/intel_tex.c
@@ -140,6 +140,8 @@ intel_alloc_texture_storage(struct gl_context *ctx,
        !intel_miptree_match_image(intel_texobj->mt, first_image) ||
        intel_texobj->mt->last_level != levels - 1) {
       intel_miptree_release(&intel_texobj->mt);
+
+      intel_get_image_dims(first_image, &width, &height, &depth);
       intel_texobj->mt = intel_miptree_create(brw, texobj->Target,
                                               first_image->TexFormat,
                                               0, levels - 1,
diff --git a/src/mesa/drivers/dri/nouveau/nv20_state_frag.c b/src/mesa/drivers/dri/nouveau/nv20_state_frag.c
index 492ecdc..2c5c2db 100644
--- a/src/mesa/drivers/dri/nouveau/nv20_state_frag.c
+++ b/src/mesa/drivers/dri/nouveau/nv20_state_frag.c
@@ -67,5 +67,5 @@ nv20_emit_frag(struct gl_context *ctx, int emit)
 	PUSH_DATA (push, in >> 32);
 
 	BEGIN_NV04(push, NV20_3D(RC_ENABLE), 1);
-	PUSH_DATA (push, n);
+	PUSH_DATA (push, MAX2(1, n));
 }
diff --git a/src/mesa/drivers/dri/swrast/swrast.c b/src/mesa/drivers/dri/swrast/swrast.c
index 2d4bb70..6e006f8 100644
--- a/src/mesa/drivers/dri/swrast/swrast.c
+++ b/src/mesa/drivers/dri/swrast/swrast.c
@@ -484,14 +484,14 @@ swrast_map_renderbuffer(struct gl_context *ctx,
 
       xrb->map_mode = mode;
       xrb->map_x = x;
-      xrb->map_y = y;
+      xrb->map_y = rb->Height - y - h;
       xrb->map_w = w;
       xrb->map_h = h;
 
       stride = w * cpp;
       xrb->Base.Buffer = malloc(h * stride);
 
-      sPriv->swrast_loader->getImage(dPriv, x, rb->Height - y - h, w, h,
+      sPriv->swrast_loader->getImage(dPriv, x, xrb->map_y, w, h,
 				     (char *) xrb->Base.Buffer,
 				     dPriv->loaderPrivate);