Merge remote-tracking branch 'androidx86/marshmallow-x86' into replicant-6.0replicant-6.0-beta-0001 replicant-6.0-alpha-0006 replicant-6.0-old

author: Wolfgang Wiedmeyer <wolfgit@wiedmeyer.de> 2016-12-17 03:40:28 +0100
committer: Wolfgang Wiedmeyer <wolfgit@wiedmeyer.de> 2016-12-17 03:40:28 +0100
commit: ef9a82038acd73936830671dbe43205c28a2151d (patch)
tree: 90be2cdd9f48750c18b669ca2ab9553575d9f822
parent: f84f60446aebaeee8a1df741328cbd4a30dd24ea (diff)
parent: 743c2327b167b95046e02af4c7b2f7a282a0943d (diff)
download: external_mesa3d-replicant-6.0-old.zip
external_mesa3d-replicant-6.0-old.tar.gz
external_mesa3d-replicant-6.0-old.tar.bz2
248 files changed, 4655 insertions, 2004 deletions
diff --git a/.travis.yml b/.travis.yml
index da1d81e..5f489a4 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,6 +1,7 @@
 language: c
 
-sudo: false
+sudo: true
+dist: trusty
 
 cache:
   directories:
@@ -15,7 +16,11 @@ addons:
       - libexpat1-dev
       - libxcb-dri2-0-dev
       - libx11-xcb-dev
-      - llvm-3.4-dev
+      - llvm-3.5-dev
+      # llvm-config is not in the dev package?
+      - llvm-3.5
+      # LLVM packaging is broken and misses this dep.
+      - libedit-dev
       - scons
 
 env:
@@ -41,6 +46,16 @@ install:
   - export PATH="/usr/lib/ccache:$PATH"
   - pip install --user mako
 
+  # Since libdrm gets updated in configure.ac regularly, try to pick up the
+  # latest version from there.
+  - for line in `grep "^LIBDRM_.*_REQUIRED=" configure.ac`; do
+      old_ver=`echo $LIBDRM_VERSION | sed 's/libdrm-//'`;
+      new_ver=`echo $line | sed 's/.*REQUIRED=//'`;
+      if `echo "$old_ver,$new_ver" | tr ',' '\n' | sort -Vc 2> /dev/null`; then
+        export LIBDRM_VERSION="libdrm-$new_ver";
+      fi;
+    done
+
   # Install dependencies where we require specific versions (or where
   # disallowed by Travis CI's package whitelisting).
 
@@ -78,22 +93,19 @@ install:
 
   - wget http://dri.freedesktop.org/libdrm/$LIBDRM_VERSION.tar.bz2
   - tar -jxvf $LIBDRM_VERSION.tar.bz2
-  - (cd $LIBDRM_VERSION && ./configure --prefix=$HOME/prefix && make install)
+  - (cd $LIBDRM_VERSION && ./configure --prefix=$HOME/prefix --enable-vc4 && make install)
 
   - wget $XORG_RELEASES/lib/$LIBXSHMFENCE_VERSION.tar.bz2
   - tar -jxvf $LIBXSHMFENCE_VERSION.tar.bz2
   - (cd $LIBXSHMFENCE_VERSION && ./configure --prefix=$HOME/prefix && make install)
 
-# Disabled LLVM (and therefore r300 and r600) because the build fails
-# with "undefined reference to `clock_gettime'" and "undefined
-# reference to `setupterm'" in llvmpipe.
 script:
   - if test "x$BUILD" = xmake; then
       ./autogen.sh --enable-debug
-        --disable-gallium-llvm
         --with-egl-platforms=x11,drm
         --with-dri-drivers=i915,i965,radeon,r200,swrast,nouveau
-        --with-gallium-drivers=svga,swrast,vc4,virgl
+        --with-gallium-drivers=svga,swrast,vc4,virgl,r300,r600
+        --disable-llvm-shared-libs
         ;
       make && make check;
     elif test x$BUILD = xscons; then
diff --git a/Makefile.am b/Makefile.am
index b0fbed6..2027a28 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -62,6 +62,7 @@ noinst_HEADERS = \
 	include/c99_math.h \
 	include/c11 \
 	include/D3D9 \
+	include/GL/wglext.h \
 	include/HaikuGL \
 	include/no_extern_c.h \
 	include/pci_ids
diff --git a/VERSION b/VERSION
index b700dc1..f0ad792 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-12.0.1
+12.0.4
diff --git a/appveyor.yml b/appveyor.yml
index 2e9b9d6..6e69cbf 100644
--- a/appveyor.yml
+++ b/appveyor.yml
@@ -37,6 +37,8 @@ cache:
 - win_flex_bison-2.4.5.zip
 - llvm-3.3.1-msvc2013-mtd.7z
 
+os: Visual Studio 2013
+
 environment:
   WINFLEXBISON_ARCHIVE: win_flex_bison-2.4.5.zip
   LLVM_ARCHIVE: llvm-3.3.1-msvc2013-mtd.7z
@@ -47,11 +49,13 @@ install:
 - python -m pip --version
 # Install Mako
 - python -m pip install --egg Mako
+# Install pywin32 extensions, needed by SCons
+- python -m pip install pypiwin32
 # Install SCons
 - python -m pip install --egg scons==2.4.1
 - scons --version
 # Install flex/bison
-- if not exist "%WINFLEXBISON_ARCHIVE%" appveyor DownloadFile "http://downloads.sourceforge.net/project/winflexbison/%WINFLEXBISON_ARCHIVE%"
+- if not exist "%WINFLEXBISON_ARCHIVE%" appveyor DownloadFile "https://downloads.sourceforge.net/project/winflexbison/old_versions/%WINFLEXBISON_ARCHIVE%"
 - 7z x -y -owinflexbison\ "%WINFLEXBISON_ARCHIVE%" > nul
 - set Path=%CD%\winflexbison;%Path%
 - win_flex --version
diff --git a/bin/.cherry-ignore b/bin/.cherry-ignore
index 5b0b89c..005e1e3 100644
--- a/bin/.cherry-ignore
+++ b/bin/.cherry-ignore
@@ -1,2 +1,25 @@
 # The offending commit that this patch (part) reverts isn't in 12.0
 be32a2132785fbc119f17e62070e007ee7d17af7 i965/compiler: Bring back the INTEL_PRECISE_TRIG environment variable
+
+# The patch depends on the batch_cache work at least.
+89f00f749fda4c1beca38f362c7f86bdc6e32785 a4xx: make sure to actually clamp depth as requested
+
+# The patch depends on the 'generic' interoplation and location
+# implementation introduced with 2d6dd30a9b30
+114874b22beafb2d07006b197c62d717fc7f80cc i965/fs: Use sample interpolation for interpolateAtCentroid in persample mode
+
+# VAAPI encode landed after the branch point.
+a5993022275c20061ac025d9adc26c5f9d02afee st/va Avoid VBR bitrate calculation overflow v2
+
+# EGL_KHR_debug landed after the branch point.
+17084b6f9340f798111e53e08f5d35c7630cee48 egl: Fix missing unlock in eglGetSyncAttribKHR
+
+# Depends on update_renderbuffer_read_surfaces at least
+f2b9b0c730e345bcffa9eadabb25af3ab02642f2 i965: Add missing BRW_NEW_FS_PROG_DATA to render target reads.
+
+# The commit in question hasn't landed in branch
+1ef787339774bc7f1cc9c1615722f944005e070c Revert "egl/android: Set EGL_MAX_PBUFFER_WIDTH and EGL_MAX_PBUFFER_HEIGHT"
+
+# Patches depend on the fence_finish() gallium API change and corresponding driver work
+f240ad98bc05281ea7013d91973cb5f932ae9434 st/mesa: unduplicate st_check_sync code
+b687f766fddb7b39479cd9ee0427984029ea3559 st/mesa: allow multiple concurrent waiters in ClientWaitSync
diff --git a/bin/get-pick-list.sh b/bin/get-pick-list.sh
index 0902fd0..4515837 100755
--- a/bin/get-pick-list.sh
+++ b/bin/get-pick-list.sh
@@ -14,7 +14,7 @@ git log --reverse --grep="cherry picked from commit" origin/master..HEAD |\
 	sed -e 's/^[[:space:]]*(cherry picked from commit[[:space:]]*//' -e 's/)//' > already_picked
 
 # Grep for commits that were marked as a candidate for the stable tree.
-git log --reverse --pretty=%H -i --grep='^\([[:space:]]*NOTE: .*[Cc]andidate\|CC:.*mesa-stable\)' HEAD..origin/master |\
+git log --reverse --pretty=%H -i --grep='^\([[:space:]]*NOTE: .*[Cc]andidate\|CC:.*12\.0.*mesa-stable\)' HEAD..origin/master |\
 while read sha
 do
 	# Check to see whether the patch is on the ignore list.
diff --git a/configure.ac b/configure.ac
index 535a2e3..78cf178 100644
--- a/configure.ac
+++ b/configure.ac
@@ -225,6 +225,7 @@ AX_GCC_FUNC_ATTRIBUTE([packed])
 AX_GCC_FUNC_ATTRIBUTE([pure])
 AX_GCC_FUNC_ATTRIBUTE([returns_nonnull])
 AX_GCC_FUNC_ATTRIBUTE([unused])
+AX_GCC_FUNC_ATTRIBUTE([visibility])
 AX_GCC_FUNC_ATTRIBUTE([warn_unused_result])
 AX_GCC_FUNC_ATTRIBUTE([weak])
 
@@ -783,6 +784,7 @@ if test "x$enable_asm" = xyes; then
     esac
 fi
 
+AC_HEADER_MAJOR
 AC_CHECK_HEADER([xlocale.h], [DEFINES="$DEFINES -DHAVE_XLOCALE_H"])
 AC_CHECK_HEADER([sys/sysctl.h], [DEFINES="$DEFINES -DHAVE_SYS_SYSCTL_H"])
 AC_CHECK_FUNC([strtof], [DEFINES="$DEFINES -DHAVE_STRTOF"])
@@ -1639,7 +1641,7 @@ esac
 
 AC_ARG_WITH([vulkan-icddir],
     [AS_HELP_STRING([--with-vulkan-icddir=DIR],
-        [directory for the Vulkan driver icd files @<:@${sysconfdir}/vulkan/icd.d@:>@])],
+        [directory for the Vulkan driver icd files @<:@${datarootdir}/vulkan/icd.d@:>@])],
     [VULKAN_ICD_INSTALL_DIR="$withval"],
     [VULKAN_ICD_INSTALL_DIR='${datarootdir}/vulkan/icd.d'])
 AC_SUBST([VULKAN_ICD_INSTALL_DIR])
@@ -1997,8 +1999,8 @@ if test "x$with_egl_platforms" != "x" -a "x$enable_egl" != xyes; then
     AC_MSG_ERROR([cannot build egl state tracker without EGL library])
 fi
 
-PKG_CHECK_MODULES([WAYLAND_SCANNER], [wayland_scanner],
-        WAYLAND_SCANNER=`$PKG_CONFIG --variable=wayland_scanner wayland_scanner`,
+PKG_CHECK_MODULES([WAYLAND_SCANNER], [wayland-scanner],
+        WAYLAND_SCANNER=`$PKG_CONFIG --variable=wayland_scanner wayland-scanner`,
         WAYLAND_SCANNER='')
 if test "x$WAYLAND_SCANNER" = x; then
     AC_PATH_PROG([WAYLAND_SCANNER], [wayland-scanner])
@@ -2182,6 +2184,10 @@ if test "x$enable_gallium_llvm" = xyes; then
 
         LLVM_COMPONENTS="engine bitwriter mcjit mcdisassembler"
 
+        if $LLVM_CONFIG --components | grep -q inteljitevents ; then
+            LLVM_COMPONENTS="${LLVM_COMPONENTS} inteljitevents"
+        fi
+
         if test "x$enable_opencl" = xyes; then
             llvm_check_version_for "3" "5" "0" "opencl"
 
@@ -2331,6 +2337,45 @@ swr_llvm_check() {
     fi
 }
 
+swr_require_cxx_feature_flags() {
+    feature_name="$1"
+    preprocessor_test="$2"
+    option_list="$3"
+    output_var="$4"
+
+    AC_MSG_CHECKING([whether $CXX supports $feature_name])
+    AC_LANG_PUSH([C++])
+    save_CXXFLAGS="$CXXFLAGS"
+    save_IFS="$IFS"
+    IFS=","
+    found=0
+    for opts in $option_list
+    do
+        unset IFS
+        CXXFLAGS="$opts $save_CXXFLAGS"
+        AC_COMPILE_IFELSE(
+            [AC_LANG_PROGRAM(
+                [   #if !($preprocessor_test)
+                    #error
+                    #endif
+                ])],
+            [found=1; break],
+            [])
+        IFS=","
+    done
+    IFS="$save_IFS"
+    CXXFLAGS="$save_CXXFLAGS"
+    AC_LANG_POP([C++])
+    if test $found -eq 1; then
+        AC_MSG_RESULT([$opts])
+        eval "$output_var=\$opts"
+        return 0
+    fi
+    AC_MSG_RESULT([no])
+    AC_MSG_ERROR([swr requires $feature_name support])
+    return 1
+}
+
 dnl Duplicates in GALLIUM_DRIVERS_DIRS are removed by sorting it after this block
 if test -n "$with_gallium_drivers"; then
     gallium_drivers=`IFS=', '; echo $with_gallium_drivers`
@@ -2400,29 +2445,20 @@ if test -n "$with_gallium_drivers"; then
         xswr)
             swr_llvm_check "swr"
 
-            AC_MSG_CHECKING([whether $CXX supports c++11/AVX/AVX2])
-            AVX_CXXFLAGS="-march=core-avx-i"
-            AVX2_CXXFLAGS="-march=core-avx2"
-
-            AC_LANG_PUSH([C++])
-            save_CXXFLAGS="$CXXFLAGS"
-            CXXFLAGS="-std=c++11 $CXXFLAGS"
-            AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],[],
-                              [AC_MSG_ERROR([c++11 compiler support not detected])])
-            CXXFLAGS="$save_CXXFLAGS"
-
-            save_CXXFLAGS="$CXXFLAGS"
-            CXXFLAGS="$AVX_CXXFLAGS $CXXFLAGS"
-            AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],[],
-                              [AC_MSG_ERROR([AVX compiler support not detected])])
-            CXXFLAGS="$save_CXXFLAGS"
-
-            save_CFLAGS="$CXXFLAGS"
-            CXXFLAGS="$AVX2_CXXFLAGS $CXXFLAGS"
-            AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],[],
-                              [AC_MSG_ERROR([AVX2 compiler support not detected])])
-            CXXFLAGS="$save_CXXFLAGS"
-            AC_LANG_POP([C++])
+            swr_require_cxx_feature_flags "C++11" "__cplusplus >= 201103L" \
+                ",-std=c++11" \
+                SWR_CXX11_CXXFLAGS
+            AC_SUBST([SWR_CXX11_CXXFLAGS])
+
+            swr_require_cxx_feature_flags "AVX" "defined(__AVX__)" \
+                ",-mavx,-march=core-avx" \
+                SWR_AVX_CXXFLAGS
+            AC_SUBST([SWR_AVX_CXXFLAGS])
+
+            swr_require_cxx_feature_flags "AVX2" "defined(__AVX2__)" \
+                ",-mavx2 -mfma -mbmi2 -mf16c,-march=core-avx2" \
+                SWR_AVX2_CXXFLAGS
+            AC_SUBST([SWR_AVX2_CXXFLAGS])
 
             HAVE_GALLIUM_SWR=yes
             ;;
@@ -2560,6 +2596,8 @@ fi
 AM_CONDITIONAL(HAVE_LIBDRM, test "x$have_libdrm" = xyes)
 AM_CONDITIONAL(HAVE_OSMESA, test "x$enable_osmesa" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_OSMESA, test "x$enable_gallium_osmesa" = xyes)
+AM_CONDITIONAL(HAVE_COMMON_OSMESA, test "x$enable_osmesa" = xyes -o \
+                                        "x$enable_gallium_osmesa" = xyes)
 
 AM_CONDITIONAL(HAVE_X86_ASM, test "x$asm_arch" = xx86 -o "x$asm_arch" = xx86_64)
 AM_CONDITIONAL(HAVE_X86_64_ASM, test "x$asm_arch" = xx86_64)
diff --git a/docs/relnotes/12.0.2.html b/docs/relnotes/12.0.2.html
new file mode 100644
index 0000000..385ef08
--- /dev/null
+++ b/docs/relnotes/12.0.2.html
@@ -0,0 +1,403 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 12.0.2 Release Notes / September 2, 2016</h1>
+
+<p>
+Mesa 12.0.2 is a bug fix release which fixes bugs found since the 12.0.1 release.
+</p>
+<p>
+Mesa 12.0.2 implements the OpenGL 4.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.3.  OpenGL
+4.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+a08565ab1273751ebe2ffa928cbf785056594c803077c9719d0763da780f2918  mesa-12.0.2.tar.gz
+d957a5cc371dcd7ff2aa0d87492f263aece46f79352f4520039b58b1f32552cb  mesa-12.0.2.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=69622">Bug 69622</a> - eglTerminate then eglMakeCurrent crahes</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=89599">Bug 89599</a> - symbol 'x86_64_entry_start' is already defined when building with LLVM/clang</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=91342">Bug 91342</a> - Very dark textures on some objects in indoors environments in Postal 2</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=92306">Bug 92306</a> - GL Excess demo renders incorrectly on nv43</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=94148">Bug 94148</a> - Framebuffer considered invalid when a draw call is done before glCheckFramebufferStatus</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96274">Bug 96274</a> - [NVC0] Failure when compiling compute shader: Assertion `bb-&gt;getFirst()-&gt;serial &lt;= bb-&gt;getExit()-&gt;serial' failed</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96358">Bug 96358</a> - SSO: wrong interface validation between GS and VS (regresion due to latest gles 3.1)</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96381">Bug 96381</a> - Texture artifacts with immutable texture storage and mipmaps</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96762">Bug 96762</a> - [radeonsi,apitrace] Firewatch: nothing rendered in scrollable (text) areas</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96835">Bug 96835</a> - &quot;gallium: Force blend color to 16-byte alignment&quot; crash with &quot;-march=native -O3&quot; causes some 32bit games to crash</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96850">Bug 96850</a> - Crucible tests fail for 32bit mesa</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96908">Bug 96908</a> - [radeonsi] MSAA causes graphical artifacts</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96911">Bug 96911</a> - webgl2 conformance2/textures/misc/tex-mipmap-levels.html crashes 12.1 Intel driver</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96971">Bug 96971</a> - invariant qualifier is not valid for shader inputs</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97039">Bug 97039</a> - The Talos Principle and Serious Sam 3 GPU faults</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97207">Bug 97207</a> - [IVY BRIDGE] Fragment shader discard writing to depth</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97214">Bug 97214</a> - X not running with error &quot;Failed to make EGL context current&quot;</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97225">Bug 97225</a> - [i965 on HD4600 Haswell] xcom switch to ingame cinematics cause segmentation fault</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97231">Bug 97231</a> - GL_DEPTH_CLAMP doesn't clamp to the far plane</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97307">Bug 97307</a> - glsl/glcpp/tests/glcpp-test regression</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97331">Bug 97331</a> - glDrawElementsBaseVertex doesn't work in display list on i915</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97351">Bug 97351</a> - DrawElementsBaseVertex with VBO ignores base vertex on Intel GMA 9xx in some cases</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97426">Bug 97426</a> - glScissor gives vertically inverted result</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97476">Bug 97476</a> - Shader binaries should not be stored in the PipelineCache</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97567">Bug 97567</a> - [SNB, ILK] ctl, piglit regressions in mesa 12.0.2rc1</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Andreas Boll (1):</p>
+<ul>
+  <li>configure.ac: Use ${datarootdir} for --with-vulkan-icddir help string too</li>
+</ul>
+
+<p>Bernard Kilarski (1):</p>
+<ul>
+  <li>glx: fix error code when there is no context bound</li>
+</ul>
+
+<p>Brian Paul (4):</p>
+<ul>
+  <li>svga: handle mismatched number of samplers, sampler views</li>
+  <li>mesa: use _mesa_clear_texture_image() in clear_texture_fields()</li>
+  <li>swrast: fix incorrectly positioned putImage() in swrast driver</li>
+  <li>mesa: fix format conversion bug in get_tex_rgba_uncompressed()</li>
+</ul>
+
+<p>Chad Versace (2):</p>
+<ul>
+  <li>i965: Fix miptree layout for EGLImage-based renderbuffers</li>
+  <li>i965: Respect miptree offsets in intel_readpixels_tiled_memcpy()</li>
+</ul>
+
+<p>Christian König (1):</p>
+<ul>
+  <li>st/mesa: fix reference counting bug in st_vdpau</li>
+</ul>
+
+<p>Chuck Atkins (1):</p>
+<ul>
+  <li>swr: Refactor checks for compiler feature flags</li>
+</ul>
+
+<p>Daniel Scharrer (1):</p>
+<ul>
+  <li>mesa: Fix fixed function spot lighting on newer hardware (again)</li>
+</ul>
+
+<p>Dave Airlie (2):</p>
+<ul>
+  <li>anv: fix writemask on blit fragment shader.</li>
+  <li>st/glsl_to_tgsi: fix st_src_reg_for_double constant.</li>
+</ul>
+
+<p>Emil Velikov (15):</p>
+<ul>
+  <li>docs: add sha256 checksums for 12.0.1</li>
+  <li>mesa: automake: list builddir before srcdir</li>
+  <li>mesa: scons: list builddir before srcdir</li>
+  <li>i965: store reference to the context within struct brw_fence (v2)</li>
+  <li>anv: remove internal 'validate' layer</li>
+  <li>anv: automake: use VISIBILITY_CFLAGS to restrict symbol visibility</li>
+  <li>anv: automake: build with -Bsymbolic</li>
+  <li>anv: do not export the Vulkan API</li>
+  <li>anv: remove dummy VK_DEBUG_MARKER_EXT entry points</li>
+  <li>isl: automake: use VISIBILITY_CFLAGS to restrict symbol visibility</li>
+  <li>cherry-ignore: temporary(?) drop "a4xx: make sure to actually clamp depth"</li>
+  <li>i915: Check return value of screen-&gt;image.loader-&gt;getBuffers</li>
+  <li>Revert "i965/miptree: Set logical_depth0 == 6 for cube maps"</li>
+  <li>glx/glvnd: list the strcmp arguments in correct order</li>
+  <li>Update version to 12.0.2</li>
+</ul>
+
+<p>Eric Anholt (4):</p>
+<ul>
+  <li>vc4: Close our screen's fd on screen close.</li>
+  <li>vc4: Disable early Z with computed depth.</li>
+  <li>vc4: Fix a leak of the src[] array of VPM reads in optimization.</li>
+  <li>vc4: Fix leak of the bo_handles table.</li>
+</ul>
+
+<p>Francisco Jerez (3):</p>
+<ul>
+  <li>i965: Emit SKL VF cache invalidation W/A from brw_emit_pipe_control_flush.</li>
+  <li>i965: Make room in the batch epilogue for three more pipe controls.</li>
+  <li>i965: Fix remaining flush vs invalidate race conditions in brw_emit_pipe_control_flush.</li>
+</ul>
+
+<p>Haixia Shi (1):</p>
+<ul>
+  <li>platform_android: prevent deadlock in droid_swap_buffers</li>
+</ul>
+
+<p>Ian Romanick (5):</p>
+<ul>
+  <li>mesa: Strip arrayness from interface block names in some IO validation</li>
+  <li>glsl: Pack integer and double varyings as flat even if interpolation mode is none</li>
+  <li>glcpp: Track the actual version instead of just the version_resolved flag</li>
+  <li>glcpp: Only disallow #undef of pre-defined macros on GLSL ES &gt;= 3.00 shaders</li>
+  <li>glsl: Mark cube map array sampler types as reserved in GLSL ES 3.10</li>
+</ul>
+
+<p>Ilia Mirkin (16):</p>
+<ul>
+  <li>mesa: etc2 online compression is unsupported, don't attempt it</li>
+  <li>st/mesa: return appropriate mesa format for ETC texture formats</li>
+  <li>mesa: set _NEW_BUFFERS when updating texture bound to current buffers</li>
+  <li>nv50,nvc0: srgb rendering is only available for rgba/bgra</li>
+  <li>vbo: allow DrawElementsBaseVertex in display lists</li>
+  <li>gallium/util: add helper to compute zmin/zmax for a viewport state</li>
+  <li>nv50,nvc0: fix depth range when halfz is enabled</li>
+  <li>nv50/ir: fix bb positions after exit instructions</li>
+  <li>vbo: add basevertex when looking up elements for vbo splitting</li>
+  <li>a4xx: only disable depth clipping, not all clipping, when requested</li>
+  <li>nv50/ir: make sure cfg iterator always hits all blocks</li>
+  <li>main: add missing EXTRA_END in OES_sample_variables get check</li>
+  <li>nouveau: always enable at least one RC</li>
+  <li>nv30: only bail on color/depth bpp mismatch when surfaces are swizzled</li>
+  <li>a4xx: make sure to actually clamp depth as requested</li>
+  <li>gk110/ir: fix quadop dall emission</li>
+</ul>
+
+<p>Jan Ziak (2):</p>
+<ul>
+  <li>egl/x11: avoid using freed memory if dri2 init fails</li>
+  <li>loader: fix memory leak in loader_dri3_open</li>
+</ul>
+
+<p>Jason Ekstrand (31):</p>
+<ul>
+  <li>nir/spirv: Don't multiply the push constant block size by 4</li>
+  <li>anv: Add a stub for CmdCopyQueryPoolResults on Ivy Bridge</li>
+  <li>glsl/types: Fix function type comparison function</li>
+  <li>glsl/types: Use _mesa_hash_data for hashing function types</li>
+  <li>genxml: Make gen6-7 blending look more like gen8</li>
+  <li>anv/pipeline: Unify blend state setup between gen7 and gen8</li>
+  <li>anv: Enable independentBlend on gen7</li>
+  <li>anv: Add an align_down_npot_u32 helper</li>
+  <li>anv: Handle VK_WHOLE_SIZE properly for buffer views</li>
+  <li>i965/miptree: Enforce that height == 1 for 1-D array textures</li>
+  <li>i965/miptree: Set logical_depth0 == 6 for cube maps</li>
+  <li>nir: Add a nir_deref_foreach_leaf helper</li>
+  <li>nir/inline: Constant-initialize local variables in the callee if needed</li>
+  <li>anv/pipeline: Set up point coord enables</li>
+  <li>i965/miptree: Stop multiplying cube depth by 6 in HiZ calculations</li>
+  <li>i965/vec4: Make opt_vector_float reset at the top of each block</li>
+  <li>anv/blit2d: Add a format parameter to bind_dst and create_iview</li>
+  <li>anv/blit2d: Add support for RGB destinations</li>
+  <li>anv/clear: Make cmd_clear_image take an actual VkClearValue</li>
+  <li>anv/clear: Clear E5B9G9R9 images as R32_UINT</li>
+  <li>anv: Include the pipeline layout in the shader hash</li>
+  <li>isl: Allow multisampled array textures</li>
+  <li>anv/descriptor_set: memset anv_descriptor_set_layout</li>
+  <li>anv/pipeline: Fix bind maps for fragment output arrays</li>
+  <li>anv/allocator: Correctly set the number of buckets</li>
+  <li>anv/pipeline: Properly handle OOM during shader compilation</li>
+  <li>anv: Remove unused fields from anv_pipeline_bind_map</li>
+  <li>anv: Add pipeline_has_stage guards a few places</li>
+  <li>anv: Add a struct for storing a compiled shader</li>
+  <li>anv/pipeline: Add support for caching the push constant map</li>
+  <li>anv: Rework pipeline caching</li>
+</ul>
+
+<p>José Fonseca (2):</p>
+<ul>
+  <li>appveyor: Install pywin32 extensions.</li>
+  <li>appveyor: Force Visual Studio 2013 image.</li>
+</ul>
+
+<p>Kenneth Graunke (21):</p>
+<ul>
+  <li>genxml: Add CLIPMODE_* prefix to 3DSTATE_CLIP's "Clip Mode" enum values.</li>
+  <li>genxml: Add APIMODE_D3D missing enum values and improve consistency.</li>
+  <li>anv: Fix near plane clipping on Gen7/7.5.</li>
+  <li>anv: Enable early culling on Gen7.</li>
+  <li>anv: Unify 3DSTATE_CLIP code across generations.</li>
+  <li>genxml: Rename "API Rendering Disable" to "Rendering Disable".</li>
+  <li>anv: Properly call gen75_emit_state_base_address on Haswell.</li>
+  <li>i965: Include VUE handles for GS with invocations &gt; 1.</li>
+  <li>nir: Add a base const_index to shared atomic intrinsics.</li>
+  <li>i965: Fix shared atomic intrinsics to pay attention to base.</li>
+  <li>mesa: Add GL_BGRA_EXT to the list of GenerateMipmap internal formats.</li>
+  <li>mesa: Don't call GenerateMipmap if Width or Height == 0.</li>
+  <li>glsl: Delete bogus ir_set_program_inouts assert.</li>
+  <li>glsl: Fix the program resource names of gl_TessLevelOuter/Inner[].</li>
+  <li>glsl: Fix location bias for patch variables.</li>
+  <li>glsl: Fix invariant matching in GLSL 4.30 and GLSL ES 1.00.</li>
+  <li>mesa: Fix uf10_to_f32() scale factor in the E == 0 and M != 0 case.</li>
+  <li>nir/builder: Add bany_inequal and bany helpers.</li>
+  <li>i965: Implement the WaPreventHSTessLevelsInterference workaround.</li>
+  <li>i965: Fix execution size of scalar TCS barrier setup code.</li>
+  <li>i965: Fix barrier count shift in scalar TCS backend.</li>
+</ul>
+
+<p>Leo Liu (2):</p>
+<ul>
+  <li>st/omx/enc: check uninitialized list from task release</li>
+  <li>vl/dri3: fix a memory leak from front buffer</li>
+</ul>
+
+<p>Marek Olšák (7):</p>
+<ul>
+  <li>glsl_to_tgsi: don't use the negate modifier in integer ops after bitcast</li>
+  <li>radeonsi: add a workaround for a compute VGPR-usage LLVM bug</li>
+  <li>winsys/amdgpu: disallow DCC with mipmaps</li>
+  <li>gallium/util: fix align64</li>
+  <li>radeonsi: only set dual source blending for MRT0</li>
+  <li>radeonsi: fix VM faults due NULL internal const buffers on CIK</li>
+  <li>radeonsi: disable SDMA texture copying on Carrizo</li>
+</ul>
+
+<p>Matt Turner (4):</p>
+<ul>
+  <li>mapi: Massage code to allow clang to compile.</li>
+  <li>i965/vec4: Ignore swizzle of VGRF for use by var_range_end().</li>
+  <li>mesa: Use AC_HEADER_MAJOR to include correct header for major().</li>
+  <li>nir: Walk blocks in source code order in lower_vars_to_ssa.</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>glx: Don't use current context in __glXSendError</li>
+</ul>
+
+<p>Miklós Máté (1):</p>
+<ul>
+  <li>vbo: set draw_id</li>
+</ul>
+
+<p>Nanley Chery (5):</p>
+<ul>
+  <li>anv/descriptor_set: Fix binding partly undefined descriptor sets</li>
+  <li>isl: Fix assert on raw buffer surface state size</li>
+  <li>anv/device: Fix max buffer range limits</li>
+  <li>isl: Fix isl_tiling_is_any_y()</li>
+  <li>anv/gen7_pipeline: Set PixelShaderKillPixel for discards</li>
+</ul>
+
+<p>Nicolai Hähnle (7):</p>
+<ul>
+  <li>radeonsi: explicitly choose center locations for 1xAA on Polaris</li>
+  <li>radeonsi: fix Polaris MSAA regression</li>
+  <li>radeonsi: ensure sample locations are set for line and polygon smoothing</li>
+  <li>st_glsl_to_tgsi: only skip over slots of an input array that are present</li>
+  <li>glsl: fix optimization of discard nested multiple levels</li>
+  <li>radeonsi: flush TC L2 cache for indirect draw data</li>
+  <li>radeonsi: add si_set_rw_buffer to be used for internal descriptors</li>
+</ul>
+
+<p>Nicolas Boichat (6):</p>
+<ul>
+  <li>egl/dri2: dri2_make_current: Set EGL error if bindContext fails</li>
+  <li>egl/wayland: Set disp-&gt;DriverData to NULL on error</li>
+  <li>egl/surfaceless: Set disp-&gt;DriverData to NULL on error</li>
+  <li>egl/drm: Set disp-&gt;DriverData to NULL on error</li>
+  <li>egl/android: Set dpy-&gt;DriverData to NULL on error</li>
+  <li>egl/dri2: Add reference count for dri2_egl_display</li>
+</ul>
+
+<p>Rob Herring (3):</p>
+<ul>
+  <li>Android: add missing u_math.h include path for libmesa_isl</li>
+  <li>vc4: fix vc4_resource_from_handle() stride calculation</li>
+  <li>vc4: add hash table look-up for exported dmabufs</li>
+</ul>
+
+<p>Samuel Pitoiset (7):</p>
+<ul>
+  <li>nvc0/ir: fix images indirect access on Fermi</li>
+  <li>nvc0: fix the driver cb size when draw parameters are used</li>
+  <li>gm107/ir: add missing NEG modifier for IADD32I</li>
+  <li>gm107/ir: make use of ADD32I for all immediates</li>
+  <li>nvc0: upload sample locations on GM20x</li>
+  <li>nvc0: invalidate textures/samplers on GK104+</li>
+  <li>nv50/ir: always emit the NDV bit for OP_QUADOP</li>
+</ul>
+
+<p>Stefan Dirsch (1):</p>
+<ul>
+  <li>Avoid overflow in 'last' variable of FindGLXFunction(...)</li>
+</ul>
+
+<p>Stencel, Joanna (1):</p>
+<ul>
+  <li>egl/wayland-egl: Fix for segfault in dri2_wl_destroy_surface.</li>
+</ul>
+
+<p>Tim Rowley (2):</p>
+<ul>
+  <li>Revert "gallium: Force blend color to 16-byte alignment"</li>
+  <li>swr: switch from overriding -march to selecting features</li>
+</ul>
+
+<p>Tomasz Figa (8):</p>
+<ul>
+  <li>gallium/dri: Add shared glapi to LIBADD on Android</li>
+  <li>egl/android: Remove unused variables</li>
+  <li>egl/android: Check return value of dri2_get_dri_config()</li>
+  <li>egl/android: Stop leaking DRI images</li>
+  <li>gallium/winsys/kms: Fix double refcount when importing from prime FD (v2)</li>
+  <li>gallium/winsys/kms: Fully initialize kms_sw_dt at prime import time (v2)</li>
+  <li>gallium/winsys/kms: Move display target handle lookup to separate function</li>
+  <li>gallium/winsys/kms: Look up the GEM handle after importing a prime FD</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/12.0.3.html b/docs/relnotes/12.0.3.html
new file mode 100644
index 0000000..70e704b
--- /dev/null
+++ b/docs/relnotes/12.0.3.html
@@ -0,0 +1,71 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 12.0.3 Release Notes / September 15, 2016</h1>
+
+<p>
+Mesa 12.0.3 is a bug fix release which fixes bugs found since the 12.0.3 release.
+</p>
+<p>
+Mesa 12.0.3 implements the OpenGL 4.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.3.  OpenGL
+4.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+79abcfab3de30dbd416d1582a3cf6b1be308466231488775f1b7bb43be353602 mesa-12.0.3.tar.gz
+1dc86dd9b51272eee1fad3df65e18cda2e556ef1bc0b6e07cd750b9757f493b1 mesa-12.0.3.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97781">Bug 97781</a> - [HSW, BYT, IVB] es2-cts.gtf.gl2extensiontests.depth_texture_cube_map.depth_texture_cube_map</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Emil Velikov (3):</p>
+<ul>
+  <li>docs: add sha256 checksums for 12.0.2</li>
+  <li>Revert "i965/miptree: Stop multiplying cube depth by 6 in HiZ calculations"</li>
+  <li>Update version to 12.0.3</li>
+</ul>
+
+<p>José Fonseca (1):</p>
+<ul>
+  <li>appveyor: Update winflexbison download URL.</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/docs/relnotes/12.0.4.html b/docs/relnotes/12.0.4.html
new file mode 100644
index 0000000..eaa9ba5
--- /dev/null
+++ b/docs/relnotes/12.0.4.html
@@ -0,0 +1,321 @@
+<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
+<html lang="en">
+<head>
+  <meta http-equiv="content-type" content="text/html; charset=utf-8">
+  <title>Mesa Release Notes</title>
+  <link rel="stylesheet" type="text/css" href="../mesa.css">
+</head>
+<body>
+
+<div class="header">
+  <h1>The Mesa 3D Graphics Library</h1>
+</div>
+
+<iframe src="../contents.html"></iframe>
+<div class="content">
+
+<h1>Mesa 12.0.4 Release Notes / November 10, 2016</h1>
+
+<p>
+Mesa 12.0.4 is a bug fix release which fixes bugs found since the 12.0.4 release.
+</p>
+<p>
+Mesa 12.0.4 implements the OpenGL 4.3 API, but the version reported by
+glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
+glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
+Some drivers don't support all the features required in OpenGL 4.3.  OpenGL
+4.3 is <strong>only</strong> available if requested at context creation
+because compatibility contexts are not supported.
+</p>
+
+
+<h2>SHA256 checksums</h2>
+<pre>
+22026ce4f1c6a7908b0d10ff057decec0a5633afe7f38a0cef5c08d0689f02a6 mesa-12.0.4.tar.gz
+5d6003da867d3f54e5000b4acdfc37e6cce5b6a4459274fdad73e24bd2f0065e mesa-12.0.4.tar.xz
+</pre>
+
+
+<h2>New features</h2>
+<p>None</p>
+
+
+<h2>Bug fixes</h2>
+
+<p>This list is likely incomplete.</p>
+
+<ul>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=71759">Bug 71759</a> - Intel driver fails with &quot;intel_do_flush_locked failed: No such file or directory&quot; if buffer imported with EGL_NATIVE_PIXMAP_KHR</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=94354">Bug 94354</a> - R9285 Unigine Valley perf regression since radeonsi: use re-Z</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=96770">Bug 96770</a> - include/GL/mesa_glinterop.h:62: error: redefinition of typedef ‘GLXContext’</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97231">Bug 97231</a> - GL_DEPTH_CLAMP doesn't clamp to the far plane</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97233">Bug 97233</a> - vkQuake VkSpecializationMapEntry related bug</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97260">Bug 97260</a> - R9 290 low performance in Linux 4.7</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97549">Bug 97549</a> - [SNB, BXT] up to 40% perf drop from &quot;loader/dri3: Overhaul dri3_update_num_back&quot; commit</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=97887">Bug 97887</a> - llvm segfault in janusvr -render vive</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98025">Bug 98025</a> - [radeonsi] incorrect primitive restart index used</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98134">Bug 98134</a> - dEQP-GLES31.functional.debug.negative_coverage.get_error.buffer.draw_buffers wants a different GL error code</li>
+
+<li><a href="https://bugs.freedesktop.org/show_bug.cgi?id=98326">Bug 98326</a> - [dEQP, EGL] pbuffer depth/stencil tests fail</li>
+
+</ul>
+
+
+<h2>Changes</h2>
+
+<p>Axel Davy (4):</p>
+<ul>
+  <li>gallium/util: Really allow aliasing of dst for u_box_union_*</li>
+  <li>st/nine: Fix the calculation of the number of vs inputs</li>
+  <li>st/nine: Fix mistake in Volume9 UnlockBox</li>
+  <li>st/nine: Fix locking CubeTexture surfaces.</li>
+</ul>
+
+<p>Brendan King (1):</p>
+<ul>
+  <li>configure.ac: fix the name of the Wayland Scanner pc file</li>
+</ul>
+
+<p>Brian Paul (1):</p>
+<ul>
+  <li>st/mesa: fix swizzle issue in st_create_sampler_view_from_stobj()</li>
+</ul>
+
+<p>Chad Versace (3):</p>
+<ul>
+  <li>egl: Fix truncation error in _eglParseSyncAttribList64</li>
+  <li>i965/sync: Fix uninitalized usage and leak of mutex</li>
+  <li>egl: Don't advertise unsupported platform extensions</li>
+</ul>
+
+<p>Chuanbo Weng (1):</p>
+<ul>
+  <li>gbm: fix potential NULL deref of mapImage/unmapImage.</li>
+</ul>
+
+<p>Chuck Atkins (1):</p>
+<ul>
+  <li>autoconf: Make header install distinct for various APIs (v2)</li>
+</ul>
+
+<p>Dave Airlie (3):</p>
+<ul>
+  <li>anv: initialise and increment send_sbc</li>
+  <li>anv/wsi: fix apps that acquire multiple images up front</li>
+  <li>Revert "st/vdpau: use linear layout for output surfaces"</li>
+</ul>
+
+<p>Emil Velikov (12):</p>
+<ul>
+  <li>docs: add sha256 checksums for 12.0.3</li>
+  <li>cherry-ignore: add non-applicable i965 commit</li>
+  <li>cherry-ignore: add vaapi encode fix</li>
+  <li>cherry-ignore: add EGL_KHR_debug fix</li>
+  <li>cherry-ignore: add update_renderbuffer_read_surfaces()</li>
+  <li>isl/gen6: correctly check msaa layout samples count</li>
+  <li>egl/x11: don't crash if dri2_dpy-&gt;conn is NULL</li>
+  <li>get-pick-list.sh: Require explicit "12.0" for nominating stable patches</li>
+  <li>automake: don't forget to pick wglext.h in the tarball</li>
+  <li>cherry-ignore: add N/A EGL revert</li>
+  <li>cherry-ignore: add ClientWaitSync fixes</li>
+  <li>Update version to 12.0.4</li>
+</ul>
+
+<p>Eric Anholt (5):</p>
+<ul>
+  <li>travis: Parse configure.ac to pick an updated LIBDRM_VERSION.</li>
+  <li>travis: Update to the Ubuntu Trusty image.</li>
+  <li>travis: Enable vc4 in libdrm to satisfy vc4 test build dependency.</li>
+  <li>travis: Upgrade LLVM dependency to 3.5 and enable LLVM drivers.</li>
+  <li>gallium: Fix install-gallium-links.mk on non-bash /bin/sh</li>
+</ul>
+
+<p>Hans de Goede (1):</p>
+<ul>
+  <li>pipe_loader_sw: Fix fd leak when instantiated via pipe_loader_sw_probe_kms</li>
+</ul>
+
+<p>Ian Romanick (1):</p>
+<ul>
+  <li>glsl: Fix cut-and-paste bug in hierarchical visitor ir_expression::accept</li>
+</ul>
+
+<p>Ilia Mirkin (16):</p>
+<ul>
+  <li>nv30: set usage to staging so that the buffer is allocated in GART</li>
+  <li>a3xx: make sure to actually clamp depth as requested</li>
+  <li>a3xx: make use of software clipping when hw can't handle it</li>
+  <li>a3xx: use window scissor to simulate viewport xy clip</li>
+  <li>main: GL_RGB10_A2UI does not come with GL 3.0/EXT_texture_integer</li>
+  <li>mesa/formatquery: limit ES target support, fix core context support</li>
+  <li>nir: fix definition of pack_uvec2_to_uint</li>
+  <li>gm107/ir: AL2P writes to a predicate register</li>
+  <li>st/mesa: fix is_scissor_enabled when X/Y are negative</li>
+  <li>nvc0/ir: fix overwriting of value backing non-constant gather offset</li>
+  <li>nv50/ir: copy over value's register id when resolving merge of a phi</li>
+  <li>nvc0/ir: fix textureGather with a single offset</li>
+  <li>gm107/ir: fix texturing with indirect samplers</li>
+  <li>gm107/ir: fix bit offset of tex lod setting for indirect texturing</li>
+  <li>nv50,nvc0: avoid reading out of bounds when getting bogus so info</li>
+  <li>nv50/ir: process texture offset sources as regular sources</li>
+</ul>
+
+<p>James Legg (1):</p>
+<ul>
+  <li>radeonsi: Fix primitive restart when index changes</li>
+</ul>
+
+<p>Jason Ekstrand (9):</p>
+<ul>
+  <li>nir/spirv: Swap the argument order for AtomicCompareExchange</li>
+  <li>nir/spirv: Use the correct sources for CompareExchange on images</li>
+  <li>nir/spirv: Break variable decoration handling into a helper</li>
+  <li>nir/spirv: Refactor variable deocration handling</li>
+  <li>nir/spirv/cfg: Handle switches whose break block is a loop continue</li>
+  <li>nir/spirv/cfg: Detect switch_break after loop_break/continue</li>
+  <li>nir: Add a nop intrinsic</li>
+  <li>nir/spirv/cfg: Use a nop intrinsic for tagging the ends of blocks</li>
+  <li>intel/blorp: Rework our usage of ralloc when compiling shaders</li>
+</ul>
+
+<p>Jonathan Gray (3):</p>
+<ul>
+  <li>genxml: add generated headers to EXTRA_DIST</li>
+  <li>mapi: automake: set VISIBILITY_CFLAGS for shared glapi</li>
+  <li>mesa: automake: include mesa_glinterop.h in distfile</li>
+</ul>
+
+<p>Julien Isorce (1):</p>
+<ul>
+  <li>st/va: also honors interlaced preference when providing a video format</li>
+</ul>
+
+<p>Kenneth Graunke (8):</p>
+<ul>
+  <li>nir: Call nir_metadata_preserve from nir_lower_alu_to_scalar().</li>
+  <li>mesa: Expose RESET_NOTIFICATION_STRATEGY with KHR_robustness.</li>
+  <li>i965: Fix missing _NEW_TRANSFORM in Gen8+ 3DSTATE_DS atom.</li>
+  <li>i965: Add missing BRW_NEW_VS_PROG_DATA to 3DSTATE_CLIP.</li>
+  <li>i965: Move BRW_NEW_FRAGMENT_PROGRAM from 3DSTATE_PS to PS_EXTRA.</li>
+  <li>i965: Add missing BRW_NEW_CS_PROG_DATA to compute constant atom.</li>
+  <li>i965: Add missing BRW_CS_PROG_DATA to CS work group surface atom.</li>
+  <li>i965: Fix gl_InvocationID in dual object GS where invocations == 1.</li>
+</ul>
+
+<p>Marek Olšák (12):</p>
+<ul>
+  <li>radeonsi: fix cubemaps viewed as 2D</li>
+  <li>radeonsi: take compute shader and dispatch indirect memory usage into account</li>
+  <li>radeonsi: fix FP64 UBO loads with indirect uniform block indexing</li>
+  <li>mesa: fix glGetFramebufferAttachmentParameteriv w/ on-demand FRONT_BACK alloc</li>
+  <li>radeonsi: fix interpolateAt opcodes for .zw components</li>
+  <li>radeonsi: fix texture border colors for compute shaders</li>
+  <li>radeonsi: disable ReZ</li>
+  <li>gallium/radeon: make sure the address of separate CMASK is aligned properly</li>
+  <li>winsys/amdgpu: fix radeon_surf::macro_tile_index for imported textures</li>
+  <li>egl: use util/macros.h</li>
+  <li>egl: make interop ABI visible again</li>
+  <li>glx: make interop ABI visible again</li>
+</ul>
+
+<p>Mario Kleiner (1):</p>
+<ul>
+  <li>glx: Perform check for valid fbconfig against proper X-Screen.</li>
+</ul>
+
+<p>Martin Peres (2):</p>
+<ul>
+  <li>loader/dri3: add get_dri_screen() to the vtable</li>
+  <li>loader/dri3: import prime buffers in the currently-bound screen</li>
+</ul>
+
+<p>Matt Whitlock (5):</p>
+<ul>
+  <li>egl/android: replace call to dup(2) with fcntl(F_DUPFD_CLOEXEC)</li>
+  <li>gallium/auxiliary: replace call to dup(2) with fcntl(F_DUPFD_CLOEXEC)</li>
+  <li>st/dri: replace calls to dup(2) with fcntl(F_DUPFD_CLOEXEC)</li>
+  <li>st/xa: replace call to dup(2) with fcntl(F_DUPFD_CLOEXEC)</li>
+  <li>gallium/winsys: replace calls to dup(2) with fcntl(F_DUPFD_CLOEXEC)</li>
+</ul>
+
+<p>Max Staudt (1):</p>
+<ul>
+  <li>r300g: Set R300_VAP_CNTL on RSxxx to avoid triangle flickering</li>
+</ul>
+
+<p>Michel Dänzer (1):</p>
+<ul>
+  <li>loader/dri3: Overhaul dri3_update_num_back</li>
+</ul>
+
+<p>Nicholas Bishop (2):</p>
+<ul>
+  <li>gbm: return appropriate error when queryImage() fails</li>
+  <li>st/dri: check pipe_screen-&gt;resource_get_handle() return value</li>
+</ul>
+
+<p>Nicolai Hähnle (10):</p>
+<ul>
+  <li>gallium/radeon: cleanup and fix branch emits</li>
+  <li>st/glsl_to_tgsi: disable on-the-fly peephole for 64-bit operations</li>
+  <li>st/glsl_to_tgsi: simplify translate_tex_offset</li>
+  <li>st/glsl_to_tgsi: fix textureGatherOffset with indirectly loaded offsets</li>
+  <li>st/mesa: fix vertex elements setup for doubles</li>
+  <li>radeonsi: fix indirect loads of 64 bit constants</li>
+  <li>st/glsl_to_tgsi: fix atomic counter addressing</li>
+  <li>st/glsl_to_tgsi: fix block copies of arrays of doubles</li>
+  <li>st/mesa: only set primitive_restart when the restart index is in range</li>
+  <li>radeonsi: fix 64-bit loads from LDS</li>
+</ul>
+
+<p>Samuel Pitoiset (4):</p>
+<ul>
+  <li>nvc0/ir: fix subops for IMAD</li>
+  <li>gk110/ir: fix wrong emission of OP_NOT</li>
+  <li>nvc0: use correct bufctx when invalidating CP textures</li>
+  <li>nvc0/ir: fix emission of IMAD with NEG modifiers</li>
+</ul>
+
+<p>Stencel, Joanna (1):</p>
+<ul>
+  <li>egl/wayland: add missing destroy_window callback</li>
+</ul>
+
+<p>Tapani Pälli (5):</p>
+<ul>
+  <li>egl: stop claiming support for pbuffer + msaa</li>
+  <li>egl/dri2: set max values for pbuffer width and height</li>
+  <li>egl: add check that eglCreateContext gets a valid config</li>
+  <li>mesa: fix error handling in DrawBuffers</li>
+  <li>egl: set preserved behavior for surface only if config supports it</li>
+</ul>
+
+<p>Tim Rowley (1):</p>
+<ul>
+  <li>configure.ac: add llvm inteljitevents component if enabled</li>
+</ul>
+
+<p>Vedran Miletić (1):</p>
+<ul>
+  <li>clover: Fix build against clang SVN &gt;= r273191</li>
+</ul>
+
+<p>Vinson Lee (1):</p>
+<ul>
+  <li>Revert "mesa_glinterop: remove inclusion of GLX header"</li>
+</ul>
+
+
+</div>
+</body>
+</html>
diff --git a/include/GL/mesa_glinterop.h b/include/GL/mesa_glinterop.h
index c0c20d6..0b373c1 100644
--- a/include/GL/mesa_glinterop.h
+++ b/include/GL/mesa_glinterop.h
@@ -58,8 +58,8 @@ extern "C" {
 #endif
 
 /* Forward declarations to avoid inclusion of GL/glx.h */
-typedef struct _XDisplay Display;
-typedef struct __GLXcontextRec *GLXContext;
+struct _XDisplay;
+struct __GLXcontextRec;
 
 /* Forward declarations to avoid inclusion of EGL/egl.h */
 typedef void *EGLDisplay;
@@ -246,7 +246,7 @@ struct mesa_glinterop_export_out {
  * \return MESA_GLINTEROP_SUCCESS or MESA_GLINTEROP_* != 0 on error
  */
 int
-MesaGLInteropGLXQueryDeviceInfo(Display *dpy, GLXContext context,
+MesaGLInteropGLXQueryDeviceInfo(struct _XDisplay *dpy, struct __GLXcontextRec *context,
                                 struct mesa_glinterop_device_info *out);
 
 
@@ -271,7 +271,7 @@ MesaGLInteropEGLQueryDeviceInfo(EGLDisplay dpy, EGLContext context,
  * \return MESA_GLINTEROP_SUCCESS or MESA_GLINTEROP_* != 0 on error
  */
 int
-MesaGLInteropGLXExportObject(Display *dpy, GLXContext context,
+MesaGLInteropGLXExportObject(struct _XDisplay *dpy, struct __GLXcontextRec *context,
                              struct mesa_glinterop_export_in *in,
                              struct mesa_glinterop_export_out *out);
 
@@ -286,11 +286,11 @@ MesaGLInteropEGLExportObject(EGLDisplay dpy, EGLContext context,
                              struct mesa_glinterop_export_out *out);
 
 
-typedef int (PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC)(Display *dpy, GLXContext context,
+typedef int (PFNMESAGLINTEROPGLXQUERYDEVICEINFOPROC)(struct _XDisplay *dpy, struct __GLXcontextRec *context,
                                                      struct mesa_glinterop_device_info *out);
 typedef int (PFNMESAGLINTEROPEGLQUERYDEVICEINFOPROC)(EGLDisplay dpy, EGLContext context,
                                                      struct mesa_glinterop_device_info *out);
-typedef int (PFNMESAGLINTEROPGLXEXPORTOBJECTPROC)(Display *dpy, GLXContext context,
+typedef int (PFNMESAGLINTEROPGLXEXPORTOBJECTPROC)(struct _XDisplay *dpy, struct __GLXcontextRec *context,
                                                   struct mesa_glinterop_export_in *in,
                                                   struct mesa_glinterop_export_out *out);
 typedef int (PFNMESAGLINTEROPEGLEXPORTOBJECTPROC)(EGLDisplay dpy, EGLContext context,
diff --git a/install-gallium-links.mk b/install-gallium-links.mk
index ac5a499..fc2f75d 100644
--- a/install-gallium-links.mk
+++ b/install-gallium-links.mk
@@ -13,8 +13,8 @@ all-local : .install-gallium-links
 	fi;							\
 	$(MKDIR_P) $$link_dir;					\
 	file_list="$(dri_LTLIBRARIES:%.la=.libs/%.so)";		\
-	file_list+="$(egl_LTLIBRARIES:%.la=.libs/%.$(LIB_EXT)*)";	\
-	file_list+="$(lib_LTLIBRARIES:%.la=.libs/%.$(LIB_EXT)*)";	\
+	file_list="$$file_list$(egl_LTLIBRARIES:%.la=.libs/%.$(LIB_EXT)*)";	\
+	file_list="$$file_list$(lib_LTLIBRARIES:%.la=.libs/%.$(LIB_EXT)*)";	\
 	for f in $$file_list; do 				\
 		if test -h .libs/$$f; then			\
 			cp -d $$f $$link_dir;			\
diff --git a/src/Makefile.am b/src/Makefile.am
index b130f5b..c0aa115 100644
--- a/src/Makefile.am
+++ b/src/Makefile.am
@@ -47,6 +47,30 @@ CLEANFILES = $(BUILT_SOURCES)
 
 SUBDIRS = . gtest util mapi/glapi/gen mapi
 
+if HAVE_OPENGL
+gldir = $(includedir)/GL
+gl_HEADERS = \
+  $(top_srcdir)/include/GL/gl.h \
+  $(top_srcdir)/include/GL/glext.h \
+  $(top_srcdir)/include/GL/glcorearb.h \
+  $(top_srcdir)/include/GL/gl_mangle.h
+endif
+
+if HAVE_GLX
+glxdir = $(includedir)/GL
+glx_HEADERS = \
+  $(top_srcdir)/include/GL/glx.h \
+  $(top_srcdir)/include/GL/glxext.h \
+  $(top_srcdir)/include/GL/glx_mangle.h
+pkgconfigdir = $(libdir)/pkgconfig
+pkgconfig_DATA = mesa/gl.pc
+endif
+
+if HAVE_COMMON_OSMESA
+osmesadir = $(includedir)/GL
+osmesa_HEADERS = $(top_srcdir)/include/GL/osmesa.h
+endif
+
 # include only conditionally ?
 SUBDIRS += compiler
 
@@ -93,7 +117,8 @@ SUBDIRS += gallium
 endif
 
 EXTRA_DIST = \
-	getopt hgl SConscript
+	getopt hgl SConscript \
+	$(top_srcdir)/include/GL/mesa_glinterop.h
 
 AM_CFLAGS = $(VISIBILITY_CFLAGS)
 AM_CXXFLAGS = $(VISIBILITY_CXXFLAGS)
diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y
index 4022727..68544ae 100644
--- a/src/compiler/glsl/glcpp/glcpp-parse.y
+++ b/src/compiler/glsl/glcpp/glcpp-parse.y
@@ -278,10 +278,34 @@ control_line_success:
 	HASH_TOKEN DEFINE_TOKEN define
 |	HASH_TOKEN UNDEF IDENTIFIER NEWLINE {
 		macro_t *macro;
-		if (strcmp("__LINE__", $3) == 0
-		    || strcmp("__FILE__", $3) == 0
-		    || strcmp("__VERSION__", $3) == 0
-		    || strncmp("GL_", $3, 3) == 0)
+
+                /* Section 3.4 (Preprocessor) of the GLSL ES 3.00 spec says:
+                 *
+                 *    It is an error to undefine or to redefine a built-in
+                 *    (pre-defined) macro name.
+                 *
+                 * The GLSL ES 1.00 spec does not contain this text.
+                 *
+                 * Section 3.3 (Preprocessor) of the GLSL 1.30 spec says:
+                 *
+                 *    #define and #undef functionality are defined as is
+                 *    standard for C++ preprocessors for macro definitions
+                 *    both with and without macro parameters.
+                 *
+                 * At least as far as I can tell GCC allow '#undef __FILE__'.
+                 * Furthermore, there are desktop OpenGL conformance tests
+                 * that expect '#undef __VERSION__' and '#undef
+                 * GL_core_profile' to work.
+                 *
+                 * Only disallow #undef of pre-defined macros on GLSL ES >=
+                 * 3.00 shaders.
+                 */
+		if (parser->is_gles &&
+                    parser->version >= 300 &&
+                    (strcmp("__LINE__", $3) == 0
+                     || strcmp("__FILE__", $3) == 0
+                     || strcmp("__VERSION__", $3) == 0
+                     || strncmp("GL_", $3, 3) == 0))
 			glcpp_error(& @1, parser, "Built-in (pre-defined)"
 				    " macro names cannot be undefined.");
 
@@ -396,13 +420,13 @@ control_line_success:
 		_glcpp_parser_skip_stack_pop (parser, & @1);
 	} NEWLINE
 |	HASH_TOKEN VERSION_TOKEN integer_constant NEWLINE {
-		if (parser->version_resolved) {
+		if (parser->version != 0) {
 			glcpp_error(& @1, parser, "#version must appear on the first line");
 		}
 		_glcpp_parser_handle_version_declaration(parser, $3, NULL, true);
 	}
 |	HASH_TOKEN VERSION_TOKEN integer_constant IDENTIFIER NEWLINE {
-		if (parser->version_resolved) {
+		if (parser->version != 0) {
 			glcpp_error(& @1, parser, "#version must appear on the first line");
 		}
 		_glcpp_parser_handle_version_declaration(parser, $3, $4, true);
@@ -1345,7 +1369,7 @@ glcpp_parser_create(const struct gl_extensions *extensions, gl_api api)
 
    parser->extensions = extensions;
    parser->api = api;
-   parser->version_resolved = false;
+   parser->version = 0;
 
    parser->has_new_line_number = 0;
    parser->new_line_number = 1;
@@ -2281,10 +2305,10 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 {
    const struct gl_extensions *extensions = parser->extensions;
 
-   if (parser->version_resolved)
+   if (parser->version != 0)
       return;
 
-   parser->version_resolved = true;
+   parser->version = version;
 
    add_builtin_define (parser, "__VERSION__", version);
 
diff --git a/src/compiler/glsl/glcpp/glcpp.h b/src/compiler/glsl/glcpp/glcpp.h
index d87e6b7..06f3521 100644
--- a/src/compiler/glsl/glcpp/glcpp.h
+++ b/src/compiler/glsl/glcpp/glcpp.h
@@ -196,7 +196,7 @@ struct glcpp_parser {
 	int error;
 	const struct gl_extensions *extensions;
 	gl_api api;
-	bool version_resolved;
+	unsigned version;
 	bool has_new_line_number;
 	int new_line_number;
 	bool has_new_source_number;
diff --git a/src/compiler/glsl/glcpp/tests/120-undef-builtin.c b/src/compiler/glsl/glcpp/tests/120-undef-builtin.c
index 49e7696..f8ade19 100644
--- a/src/compiler/glsl/glcpp/tests/120-undef-builtin.c
+++ b/src/compiler/glsl/glcpp/tests/120-undef-builtin.c
@@ -1,3 +1,4 @@
+#version 300 es
 #undef __LINE__
 #undef __FILE__
 #undef __VERSION__
diff --git a/src/compiler/glsl/glcpp/tests/120-undef-builtin.c.expected b/src/compiler/glsl/glcpp/tests/120-undef-builtin.c.expected
index 3b736df..498dc0f 100644
--- a/src/compiler/glsl/glcpp/tests/120-undef-builtin.c.expected
+++ b/src/compiler/glsl/glcpp/tests/120-undef-builtin.c.expected
@@ -1,6 +1,7 @@
-0:1(1): preprocessor error: Built-in (pre-defined) macro names cannot be undefined.
 0:2(1): preprocessor error: Built-in (pre-defined) macro names cannot be undefined.
 0:3(1): preprocessor error: Built-in (pre-defined) macro names cannot be undefined.
+0:4(1): preprocessor error: Built-in (pre-defined) macro names cannot be undefined.
+#version 300 es
 
 
 
diff --git a/src/compiler/glsl/glcpp/tests/147-undef-builtin-allowed.c b/src/compiler/glsl/glcpp/tests/147-undef-builtin-allowed.c
new file mode 100644
index 0000000..e3af10d
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/147-undef-builtin-allowed.c
@@ -0,0 +1,4 @@
+#version 110
+#undef __LINE__
+#undef __FILE__
+#undef __VERSION__
diff --git a/src/compiler/glsl/glcpp/tests/147-undef-builtin-allowed.c.expected b/src/compiler/glsl/glcpp/tests/147-undef-builtin-allowed.c.expected
new file mode 100644
index 0000000..cd0071f
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/147-undef-builtin-allowed.c.expected
@@ -0,0 +1,4 @@
+#version 110
+
+
+
diff --git a/src/compiler/glsl/glsl_lexer.ll b/src/compiler/glsl/glsl_lexer.ll
index 11711ee..c31958b 100644
--- a/src/compiler/glsl/glsl_lexer.ll
+++ b/src/compiler/glsl/glsl_lexer.ll
@@ -348,10 +348,10 @@ isampler2DMSArray  KEYWORD_WITH_ALT(150, 300, 150, 320, yyextra->ARB_texture_mul
 usampler2DMSArray  KEYWORD_WITH_ALT(150, 300, 150, 320, yyextra->ARB_texture_multisample_enable || yyextra->OES_texture_storage_multisample_2d_array_enable, USAMPLER2DMSARRAY);
 
    /* keywords available with ARB_texture_cube_map_array_enable extension on desktop GLSL */
-samplerCubeArray   KEYWORD_WITH_ALT(400, 0, 400, 0, yyextra->ARB_texture_cube_map_array_enable, SAMPLERCUBEARRAY);
-isamplerCubeArray KEYWORD_WITH_ALT(400, 0, 400, 0, yyextra->ARB_texture_cube_map_array_enable, ISAMPLERCUBEARRAY);
-usamplerCubeArray KEYWORD_WITH_ALT(400, 0, 400, 0, yyextra->ARB_texture_cube_map_array_enable, USAMPLERCUBEARRAY);
-samplerCubeArrayShadow   KEYWORD_WITH_ALT(400, 0, 400, 0, yyextra->ARB_texture_cube_map_array_enable, SAMPLERCUBEARRAYSHADOW);
+samplerCubeArray   KEYWORD_WITH_ALT(400, 310, 400, 0, yyextra->ARB_texture_cube_map_array_enable, SAMPLERCUBEARRAY);
+isamplerCubeArray KEYWORD_WITH_ALT(400, 310, 400, 0, yyextra->ARB_texture_cube_map_array_enable, ISAMPLERCUBEARRAY);
+usamplerCubeArray KEYWORD_WITH_ALT(400, 310, 400, 0, yyextra->ARB_texture_cube_map_array_enable, USAMPLERCUBEARRAY);
+samplerCubeArrayShadow   KEYWORD_WITH_ALT(400, 310, 400, 0, yyextra->ARB_texture_cube_map_array_enable, SAMPLERCUBEARRAYSHADOW);
 
 samplerExternalOES		{
 			  if (yyextra->OES_EGL_image_external_enable)
diff --git a/src/compiler/glsl/glsl_parser.yy b/src/compiler/glsl/glsl_parser.yy
index 3885688..c72f119 100644
--- a/src/compiler/glsl/glsl_parser.yy
+++ b/src/compiler/glsl/glsl_parser.yy
@@ -1784,8 +1784,10 @@ type_qualifier:
        * variables. As only outputs can be declared as invariant, an invariant
        * output from one shader stage will still match an input of a subsequent
        * stage without the input being declared as invariant."
+       *
+       * On the desktop side, this text first appears in GLSL 4.30.
        */
-      if (state->es_shader && state->language_version >= 300 && $$.flags.q.in)
+      if (state->is_version(430, 300) && $$.flags.q.in)
          _mesa_glsl_error(&@1, state, "invariant qualifiers cannot be used with shader inputs");
    }
    | interpolation_qualifier type_qualifier
diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h
index 93716c4..3809270 100644
--- a/src/compiler/glsl/ir.h
+++ b/src/compiler/glsl/ir.h
@@ -586,6 +586,13 @@ public:
       return this->u.state_slots;
    }
 
+   inline bool is_interpolation_flat() const
+   {
+      return this->data.interpolation == INTERP_QUALIFIER_FLAT ||
+             this->type->contains_integer() ||
+             this->type->contains_double();
+   }
+
    inline bool is_name_ralloced() const
    {
       return this->name != ir_variable::tmp_name;
diff --git a/src/compiler/glsl/ir_hv_accept.cpp b/src/compiler/glsl/ir_hv_accept.cpp
index 213992a..5cc6a34 100644
--- a/src/compiler/glsl/ir_hv_accept.cpp
+++ b/src/compiler/glsl/ir_hv_accept.cpp
@@ -147,7 +147,7 @@ ir_expression::accept(ir_hierarchical_visitor *v)
 	 goto done;
 
       case visit_stop:
-	 return s;
+	 return visit_stop;
       }
    }
 
diff --git a/src/compiler/glsl/ir_set_program_inouts.cpp b/src/compiler/glsl/ir_set_program_inouts.cpp
index 183b13b..bca1e0a 100644
--- a/src/compiler/glsl/ir_set_program_inouts.cpp
+++ b/src/compiler/glsl/ir_set_program_inouts.cpp
@@ -260,15 +260,19 @@ ir_set_program_inouts_visitor::try_mark_partial_variable(ir_variable *var,
     * lowering passes (do_vec_index_to_swizzle() gets rid of indexing into
     * vectors, and lower_packed_varyings() gets rid of structs that occur in
     * varyings).
+    *
+    * However, we don't use varying packing in all cases - tessellation
+    * shaders bypass it.  This means we'll see varying structs and arrays
+    * of structs here.  For now, we just give up so the caller marks the
+    * entire variable as used.
     */
    if (!(type->is_matrix() ||
         (type->is_array() &&
          (type->fields.array->is_numeric() ||
           type->fields.array->is_boolean())))) {
-      assert(!"Unexpected indexing in ir_set_program_inouts");
 
-      /* For safety in release builds, in case we ever encounter unexpected
-       * indexing, give up and let the caller mark the whole variable as used.
+      /* If we don't know how to handle this case, give up and let the
+       * caller mark the whole variable as used.
        */
       return false;
    }
diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 5a5adc0..ddf6aa2 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -308,7 +308,25 @@ cross_validate_types_and_qualifiers(struct gl_shader_program *prog,
       return;
    }
 
-   if (!prog->IsES && input->data.invariant != output->data.invariant) {
+   /* The GLSL 4.30 and GLSL ES 3.00 specifications say:
+    *
+    *    "As only outputs need be declared with invariant, an output from
+    *     one shader stage will still match an input of a subsequent stage
+    *     without the input being declared as invariant."
+    *
+    * while GLSL 4.20 says:
+    *
+    *    "For variables leaving one shader and coming into another shader,
+    *     the invariant keyword has to be used in both shaders, or a link
+    *     error will result."
+    *
+    * and GLSL ES 1.00 section 4.6.4 "Invariance and Linking" says:
+    *
+    *    "The invariance of varyings that are declared in both the vertex
+    *     and fragment shaders must match."
+    */
+   if (input->data.invariant != output->data.invariant &&
+       prog->Version < (prog->IsES ? 300 : 430)) {
       linker_error(prog,
                    "%s shader output `%s' %s invariant qualifier, "
                    "but %s shader input %s invariant qualifier\n",
@@ -1610,7 +1628,8 @@ varying_matches::compute_packing_class(const ir_variable *var)
    unsigned packing_class = var->data.centroid | (var->data.sample << 1) |
                             (var->data.patch << 2);
    packing_class *= 4;
-   packing_class += var->data.interpolation;
+   packing_class += var->is_interpolation_flat()
+      ? unsigned(INTERP_QUALIFIER_FLAT) : var->data.interpolation;
    return packing_class;
 }
 
diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index 6379ed2..02b3e00 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -3687,6 +3687,18 @@ create_shader_variable(struct gl_shader_program *shProg,
    if (in->data.mode == ir_var_system_value &&
        in->data.location == SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) {
       out->name = ralloc_strdup(shProg, "gl_VertexID");
+   } else if ((in->data.mode == ir_var_shader_out &&
+               in->data.location == VARYING_SLOT_TESS_LEVEL_OUTER) ||
+              (in->data.mode == ir_var_system_value &&
+               in->data.location == SYSTEM_VALUE_TESS_LEVEL_OUTER)) {
+      out->name = ralloc_strdup(shProg, "gl_TessLevelOuter");
+      type = glsl_type::get_array_instance(glsl_type::float_type, 4);
+   } else if ((in->data.mode == ir_var_shader_out &&
+               in->data.location == VARYING_SLOT_TESS_LEVEL_INNER) ||
+              (in->data.mode == ir_var_system_value &&
+               in->data.location == SYSTEM_VALUE_TESS_LEVEL_INNER)) {
+      out->name = ralloc_strdup(shProg, "gl_TessLevelInner");
+      type = glsl_type::get_array_instance(glsl_type::float_type, 2);
    } else {
       out->name = ralloc_strdup(shProg, name);
    }
@@ -3839,6 +3851,9 @@ add_interface_variables(struct gl_shader_program *shProg,
          continue;
       };
 
+      if (var->data.patch)
+         loc_bias = int(VARYING_SLOT_PATCH0);
+
       /* Skip packed varyings, packed varyings are handled separately
        * by add_packed_varyings.
        */
diff --git a/src/compiler/glsl/lower_packed_varyings.cpp b/src/compiler/glsl/lower_packed_varyings.cpp
index 41edada..1e7a8c2 100644
--- a/src/compiler/glsl/lower_packed_varyings.cpp
+++ b/src/compiler/glsl/lower_packed_varyings.cpp
@@ -273,11 +273,11 @@ lower_packed_varyings_visitor::run(struct gl_shader *shader)
          continue;
 
       /* This lowering pass is only capable of packing floats and ints
-       * together when their interpolation mode is "flat".  Therefore, to be
-       * safe, caller should ensure that integral varyings always use flat
-       * interpolation, even when this is not required by GLSL.
+       * together when their interpolation mode is "flat".  Treat integers as
+       * being flat when the interpolation mode is none.
        */
       assert(var->data.interpolation == INTERP_QUALIFIER_FLAT ||
+             var->data.interpolation == INTERP_QUALIFIER_NONE ||
              !var->type->contains_integer());
 
       /* Clone the variable for program resource list before
@@ -607,7 +607,7 @@ lower_packed_varyings_visitor::get_packed_varying_deref(
    if (this->packed_varyings[slot] == NULL) {
       char *packed_name = ralloc_asprintf(this->mem_ctx, "packed:%s", name);
       const glsl_type *packed_type;
-      if (unpacked_var->data.interpolation == INTERP_QUALIFIER_FLAT)
+      if (unpacked_var->is_interpolation_flat())
          packed_type = glsl_type::ivec4_type;
       else
          packed_type = glsl_type::vec4_type;
@@ -627,7 +627,8 @@ lower_packed_varyings_visitor::get_packed_varying_deref(
       packed_var->data.centroid = unpacked_var->data.centroid;
       packed_var->data.sample = unpacked_var->data.sample;
       packed_var->data.patch = unpacked_var->data.patch;
-      packed_var->data.interpolation = unpacked_var->data.interpolation;
+      packed_var->data.interpolation = packed_type == glsl_type::ivec4_type
+         ? unsigned(INTERP_QUALIFIER_FLAT) : unpacked_var->data.interpolation;
       packed_var->data.location = location;
       packed_var->data.precision = unpacked_var->data.precision;
       packed_var->data.always_active_io = unpacked_var->data.always_active_io;
diff --git a/src/compiler/glsl/opt_conditional_discard.cpp b/src/compiler/glsl/opt_conditional_discard.cpp
index 1ca8803..a27bead 100644
--- a/src/compiler/glsl/opt_conditional_discard.cpp
+++ b/src/compiler/glsl/opt_conditional_discard.cpp
@@ -72,7 +72,14 @@ opt_conditional_discard_visitor::visit_leave(ir_if *ir)
 
    /* Move the condition and replace the ir_if with the ir_discard. */
    ir_discard *discard = (ir_discard *) ir->then_instructions.head;
-   discard->condition = ir->condition;
+   if (!discard->condition)
+      discard->condition = ir->condition;
+   else {
+      void *ctx = ralloc_parent(ir);
+      discard->condition = new(ctx) ir_expression(ir_binop_logic_and,
+                                                  ir->condition,
+                                                  discard->condition);
+   }
    ir->replace_with(discard);
 
    progress = true;
diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp
index 11f1e85..83ce35e 100644
--- a/src/compiler/glsl_types.cpp
+++ b/src/compiler/glsl_types.cpp
@@ -1079,7 +1079,7 @@ function_key_compare(const void *a, const void *b)
    const glsl_type *const key2 = (glsl_type *) b;
 
    if (key1->length != key2->length)
-      return 1;
+      return false;
 
    return memcmp(key1->fields.parameters, key2->fields.parameters,
                  (key1->length + 1) * sizeof(*key1->fields.parameters)) == 0;
@@ -1090,20 +1090,8 @@ static uint32_t
 function_key_hash(const void *a)
 {
    const glsl_type *const key = (glsl_type *) a;
-   char hash_key[128];
-   unsigned size = 0;
-
-   size = snprintf(hash_key, sizeof(hash_key), "%08x", key->length);
-
-   for (unsigned i = 0; i < key->length; i++) {
-      if (size >= sizeof(hash_key))
-	 break;
-
-      size += snprintf(& hash_key[size], sizeof(hash_key) - size,
-		       "%p", (void *) key->fields.structure[i].type);
-   }
-
-   return _mesa_hash_string(hash_key);
+   return _mesa_hash_data(key->fields.parameters,
+                          (key->length + 1) * sizeof(*key->fields.parameters));
 }
 
 const glsl_type *
diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c
index 3c8b4e0..158ccc1 100644
--- a/src/compiler/nir/nir.c
+++ b/src/compiler/nir/nir.c
@@ -659,6 +659,122 @@ nir_copy_deref(void *mem_ctx, nir_deref *deref)
    return NULL;
 }
 
+/* This is the second step in the recursion.  We've found the tail and made a
+ * copy.  Now we need to iterate over all possible leaves and call the
+ * callback on each one.
+ */
+static bool
+deref_foreach_leaf_build_recur(nir_deref_var *deref, nir_deref *tail,
+                               nir_deref_foreach_leaf_cb cb, void *state)
+{
+   unsigned length;
+   union {
+      nir_deref_array arr;
+      nir_deref_struct str;
+   } tmp;
+
+   assert(tail->child == NULL);
+   switch (glsl_get_base_type(tail->type)) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_BOOL:
+      if (glsl_type_is_vector_or_scalar(tail->type))
+         return cb(deref, state);
+      /* Fall Through */
+
+   case GLSL_TYPE_ARRAY:
+      tmp.arr.deref.deref_type = nir_deref_type_array;
+      tmp.arr.deref.type = glsl_get_array_element(tail->type);
+      tmp.arr.deref_array_type = nir_deref_array_type_direct;
+      tmp.arr.indirect = NIR_SRC_INIT;
+      tail->child = &tmp.arr.deref;
+
+      length = glsl_get_length(tail->type);
+      for (unsigned i = 0; i < length; i++) {
+         tmp.arr.deref.child = NULL;
+         tmp.arr.base_offset = i;
+         if (!deref_foreach_leaf_build_recur(deref, &tmp.arr.deref, cb, state))
+            return false;
+      }
+      return true;
+
+   case GLSL_TYPE_STRUCT:
+      tmp.str.deref.deref_type = nir_deref_type_struct;
+      tail->child = &tmp.str.deref;
+
+      length = glsl_get_length(tail->type);
+      for (unsigned i = 0; i < length; i++) {
+         tmp.arr.deref.child = NULL;
+         tmp.str.deref.type = glsl_get_struct_field(tail->type, i);
+         tmp.str.index = i;
+         if (!deref_foreach_leaf_build_recur(deref, &tmp.arr.deref, cb, state))
+            return false;
+      }
+      return true;
+
+   default:
+      unreachable("Invalid type for dereference");
+   }
+}
+
+/* This is the first step of the foreach_leaf recursion.  In this step we are
+ * walking to the end of the deref chain and making a copy in the stack as we
+ * go.  This is because we don't want to mutate the deref chain that was
+ * passed in by the caller.  The downside is that this deref chain is on the
+ * stack and , if the caller wants to do anything with it, they will have to
+ * make their own copy because this one will go away.
+ */
+static bool
+deref_foreach_leaf_copy_recur(nir_deref_var *deref, nir_deref *tail,
+                              nir_deref_foreach_leaf_cb cb, void *state)
+{
+   union {
+      nir_deref_array arr;
+      nir_deref_struct str;
+   } c;
+
+   if (tail->child) {
+      switch (tail->child->deref_type) {
+      case nir_deref_type_array:
+         c.arr = *nir_deref_as_array(tail->child);
+         tail->child = &c.arr.deref;
+         return deref_foreach_leaf_copy_recur(deref, &c.arr.deref, cb, state);
+
+      case nir_deref_type_struct:
+         c.str = *nir_deref_as_struct(tail->child);
+         tail->child = &c.str.deref;
+         return deref_foreach_leaf_copy_recur(deref, &c.str.deref, cb, state);
+
+      case nir_deref_type_var:
+      default:
+         unreachable("Invalid deref type for a child");
+      }
+   } else {
+      /* We've gotten to the end of the original deref.  Time to start
+       * building our own derefs.
+       */
+      return deref_foreach_leaf_build_recur(deref, tail, cb, state);
+   }
+}
+
+/**
+ * This function iterates over all of the possible derefs that can be created
+ * with the given deref as the head.  It then calls the provided callback with
+ * a full deref for each one.
+ *
+ * The deref passed to the callback will be allocated on the stack.  You will
+ * need to make a copy if you want it to hang around.
+ */
+bool
+nir_deref_foreach_leaf(nir_deref_var *deref,
+                       nir_deref_foreach_leaf_cb cb, void *state)
+{
+   nir_deref_var copy = *deref;
+   return deref_foreach_leaf_copy_recur(&copy, &copy.deref, cb, state);
+}
+
 /* Returns a load_const instruction that represents the constant
  * initializer for the given deref chain.  The caller is responsible for
  * ensuring that there actually is a constant initializer.
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 9816ed6..dc03918 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1923,6 +1923,10 @@ nir_deref_struct *nir_deref_struct_create(void *mem_ctx, unsigned field_index);
 
 nir_deref *nir_copy_deref(void *mem_ctx, nir_deref *deref);
 
+typedef bool (*nir_deref_foreach_leaf_cb)(nir_deref_var *deref, void *state);
+bool nir_deref_foreach_leaf(nir_deref_var *deref,
+                            nir_deref_foreach_leaf_cb cb, void *state);
+
 nir_load_const_instr *
 nir_deref_get_const_initializer_load(nir_shader *shader, nir_deref_var *deref);
 
diff --git a/src/compiler/nir/nir_builder.h b/src/compiler/nir/nir_builder.h
index 09cdf72..7497efc 100644
--- a/src/compiler/nir/nir_builder.h
+++ b/src/compiler/nir/nir_builder.h
@@ -318,6 +318,25 @@ nir_fdot(nir_builder *build, nir_ssa_def *src0, nir_ssa_def *src1)
 }
 
 static inline nir_ssa_def *
+nir_bany_inequal(nir_builder *b, nir_ssa_def *src0, nir_ssa_def *src1)
+{
+   switch (src0->num_components) {
+   case 1: return nir_ine(b, src0, src1);
+   case 2: return nir_bany_inequal2(b, src0, src1);
+   case 3: return nir_bany_inequal3(b, src0, src1);
+   case 4: return nir_bany_inequal4(b, src0, src1);
+   default:
+      unreachable("bad component size");
+   }
+}
+
+static inline nir_ssa_def *
+nir_bany(nir_builder *b, nir_ssa_def *src)
+{
+   return nir_bany_inequal(b, src, nir_imm_int(b, 0));
+}
+
+static inline nir_ssa_def *
 nir_channel(nir_builder *b, nir_ssa_def *def, unsigned c)
 {
    unsigned swizzle[4] = {c, c, c, c};
diff --git a/src/compiler/nir/nir_inline_functions.c b/src/compiler/nir/nir_inline_functions.c
index c36748d..cf31e14 100644
--- a/src/compiler/nir/nir_inline_functions.c
+++ b/src/compiler/nir/nir_inline_functions.c
@@ -25,6 +25,20 @@
 #include "nir_builder.h"
 #include "nir_control_flow.h"
 
+static bool
+deref_apply_constant_initializer(nir_deref_var *deref, void *state)
+{
+   struct nir_builder *b = state;
+
+   nir_load_const_instr *initializer =
+      nir_deref_get_const_initializer_load(b->shader, deref);
+   nir_builder_instr_insert(b, &initializer->instr);
+
+   nir_store_deref_var(b, deref, &initializer->def, 0xf);
+
+   return true;
+}
+
 static bool inline_function_impl(nir_function_impl *impl, struct set *inlined);
 
 static void
@@ -174,11 +188,35 @@ inline_functions_block(nir_block *block, nir_builder *b,
       /* Add copies of all in parameters */
       assert(call->num_params == callee_copy->num_params);
 
+      b->cursor = nir_before_instr(&call->instr);
+
+      /* Before we insert the copy of the function, we need to lower away
+       * constant initializers on local variables.  This is because constant
+       * initializers happen (effectively) at the top of the function and,
+       * since these are about to become locals of the calling function,
+       * initialization will happen at the top of the caller rather than at
+       * the top of the callee.  This isn't usually a problem, but if we are
+       * being inlined inside of a loop, it can result in the variable not
+       * getting re-initialized properly for all loop iterations.
+       */
+      nir_foreach_variable(local, &callee_copy->locals) {
+         if (!local->constant_initializer)
+            continue;
+
+         nir_deref_var deref;
+         deref.deref.deref_type = nir_deref_type_var,
+         deref.deref.child = NULL;
+         deref.deref.type = local->type,
+         deref.var = local;
+
+         nir_deref_foreach_leaf(&deref, deref_apply_constant_initializer, b);
+
+         local->constant_initializer = NULL;
+      }
+
       exec_list_append(&b->impl->locals, &callee_copy->locals);
       exec_list_append(&b->impl->registers, &callee_copy->registers);
 
-      b->cursor = nir_before_instr(&call->instr);
-
       /* We now need to tie the two functions together using the
        * parameters.  There are two ways we do this: One is to turn the
        * parameter into a local variable and do a shadow-copy.  The other
diff --git a/src/compiler/nir/nir_intrinsics.h b/src/compiler/nir/nir_intrinsics.h
index 6f86c9f..9479060 100644
--- a/src/compiler/nir/nir_intrinsics.h
+++ b/src/compiler/nir/nir_intrinsics.h
@@ -41,6 +41,8 @@
 
 #define ARR(...) { __VA_ARGS__ }
 
+INTRINSIC(nop, 0, ARR(0), false, 0, 0, 0, xx, xx, xx,
+          NIR_INTRINSIC_CAN_ELIMINATE)
 
 INTRINSIC(load_var, 0, ARR(0), true, 0, 1, 0, xx, xx, xx, NIR_INTRINSIC_CAN_ELIMINATE)
 INTRINSIC(store_var, 1, ARR(0), false, 0, 1, 1, WRMASK, xx, xx, 0)
@@ -266,16 +268,16 @@ INTRINSIC(ssbo_atomic_comp_swap, 4, ARR(1, 1, 1, 1), true, 1, 0, 0, xx, xx, xx,
  *    in shared_atomic_add, etc).
  * 2: For CompSwap only: the second data parameter.
  */
-INTRINSIC(shared_atomic_add, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
-INTRINSIC(shared_atomic_imin, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
-INTRINSIC(shared_atomic_umin, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
-INTRINSIC(shared_atomic_imax, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
-INTRINSIC(shared_atomic_umax, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
-INTRINSIC(shared_atomic_and, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
-INTRINSIC(shared_atomic_or, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
-INTRINSIC(shared_atomic_xor, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
-INTRINSIC(shared_atomic_exchange, 2, ARR(1, 1), true, 1, 0, 0, xx, xx, xx, 0)
-INTRINSIC(shared_atomic_comp_swap, 3, ARR(1, 1, 1), true, 1, 0, 0, xx, xx, xx, 0)
+INTRINSIC(shared_atomic_add, 2, ARR(1, 1), true, 1, 0, 1, BASE, xx, xx, 0)
+INTRINSIC(shared_atomic_imin, 2, ARR(1, 1), true, 1, 0, 1, BASE, xx, xx, 0)
+INTRINSIC(shared_atomic_umin, 2, ARR(1, 1), true, 1, 0, 1, BASE, xx, xx, 0)
+INTRINSIC(shared_atomic_imax, 2, ARR(1, 1), true, 1, 0, 1, BASE, xx, xx, 0)
+INTRINSIC(shared_atomic_umax, 2, ARR(1, 1), true, 1, 0, 1, BASE, xx, xx, 0)
+INTRINSIC(shared_atomic_and, 2, ARR(1, 1), true, 1, 0, 1, BASE, xx, xx, 0)
+INTRINSIC(shared_atomic_or, 2, ARR(1, 1), true, 1, 0, 1, BASE, xx, xx, 0)
+INTRINSIC(shared_atomic_xor, 2, ARR(1, 1), true, 1, 0, 1, BASE, xx, xx, 0)
+INTRINSIC(shared_atomic_exchange, 2, ARR(1, 1), true, 1, 0, 1, BASE, xx, xx, 0)
+INTRINSIC(shared_atomic_comp_swap, 3, ARR(1, 1, 1), true, 1, 0, 1, BASE, xx, xx, 0)
 
 #define SYSTEM_VALUE(name, components, num_indices, idx0, idx1, idx2) \
    INTRINSIC(load_##name, 0, ARR(0), true, components, 0, num_indices, \
diff --git a/src/compiler/nir/nir_lower_alu_to_scalar.c b/src/compiler/nir/nir_lower_alu_to_scalar.c
index 4f72cf7..a84fbdf 100644
--- a/src/compiler/nir/nir_lower_alu_to_scalar.c
+++ b/src/compiler/nir/nir_lower_alu_to_scalar.c
@@ -254,6 +254,9 @@ nir_lower_alu_to_scalar_impl(nir_function_impl *impl)
             lower_alu_instr_scalar(nir_instr_as_alu(instr), &builder);
       }
    }
+
+   nir_metadata_preserve(impl, nir_metadata_block_index |
+                               nir_metadata_dominance);
 }
 
 void
diff --git a/src/compiler/nir/nir_lower_vars_to_ssa.c b/src/compiler/nir/nir_lower_vars_to_ssa.c
index d62cec0..937f8b3 100644
--- a/src/compiler/nir/nir_lower_vars_to_ssa.c
+++ b/src/compiler/nir/nir_lower_vars_to_ssa.c
@@ -471,7 +471,7 @@ lower_copies_to_load_store(struct deref_node *node,
    return true;
 }
 
-/* Performs variable renaming by doing a DFS of the dominance tree
+/* Performs variable renaming
  *
  * This algorithm is very similar to the one outlined in "Efficiently
  * Computing Static Single Assignment Form and the Control Dependence
@@ -479,133 +479,132 @@ lower_copies_to_load_store(struct deref_node *node,
  * SSA def on the stack per block.
  */
 static bool
-rename_variables_block(nir_block *block, struct lower_variables_state *state)
+rename_variables(struct lower_variables_state *state)
 {
    nir_builder b;
    nir_builder_init(&b, state->impl);
 
-   nir_foreach_instr_safe(instr, block) {
-      if (instr->type != nir_instr_type_intrinsic)
-         continue;
-
-      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
-
-      switch (intrin->intrinsic) {
-      case nir_intrinsic_load_var: {
-         struct deref_node *node =
-            get_deref_node(intrin->variables[0], state);
-
-         if (node == NULL) {
-            /* If we hit this path then we are referencing an invalid
-             * value.  Most likely, we unrolled something and are
-             * reading past the end of some array.  In any case, this
-             * should result in an undefined value.
-             */
-            nir_ssa_undef_instr *undef =
-               nir_ssa_undef_instr_create(state->shader,
-                                          intrin->num_components,
-                                          intrin->dest.ssa.bit_size);
-
-            nir_instr_insert_before(&intrin->instr, &undef->instr);
-            nir_instr_remove(&intrin->instr);
-
-            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
-                                     nir_src_for_ssa(&undef->def));
+   nir_foreach_block(block, state->impl) {
+      nir_foreach_instr_safe(instr, block) {
+         if (instr->type != nir_instr_type_intrinsic)
             continue;
-         }
 
-         if (!node->lower_to_ssa)
-            continue;
+         nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+         switch (intrin->intrinsic) {
+         case nir_intrinsic_load_var: {
+            struct deref_node *node =
+               get_deref_node(intrin->variables[0], state);
+
+            if (node == NULL) {
+               /* If we hit this path then we are referencing an invalid
+                * value.  Most likely, we unrolled something and are
+                * reading past the end of some array.  In any case, this
+                * should result in an undefined value.
+                */
+               nir_ssa_undef_instr *undef =
+                  nir_ssa_undef_instr_create(state->shader,
+                                             intrin->num_components,
+                                             intrin->dest.ssa.bit_size);
+
+               nir_instr_insert_before(&intrin->instr, &undef->instr);
+               nir_instr_remove(&intrin->instr);
+
+               nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                        nir_src_for_ssa(&undef->def));
+               continue;
+            }
 
-         nir_alu_instr *mov = nir_alu_instr_create(state->shader,
-                                                   nir_op_imov);
-         mov->src[0].src = nir_src_for_ssa(
-            nir_phi_builder_value_get_block_def(node->pb_value, block));
-         for (unsigned i = intrin->num_components; i < 4; i++)
-            mov->src[0].swizzle[i] = 0;
+            if (!node->lower_to_ssa)
+               continue;
 
-         assert(intrin->dest.is_ssa);
+            nir_alu_instr *mov = nir_alu_instr_create(state->shader,
+                                                      nir_op_imov);
+            mov->src[0].src = nir_src_for_ssa(
+               nir_phi_builder_value_get_block_def(node->pb_value, block));
+            for (unsigned i = intrin->num_components; i < 4; i++)
+               mov->src[0].swizzle[i] = 0;
 
-         mov->dest.write_mask = (1 << intrin->num_components) - 1;
-         nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
-                           intrin->num_components,
-                           intrin->dest.ssa.bit_size, NULL);
+            assert(intrin->dest.is_ssa);
 
-         nir_instr_insert_before(&intrin->instr, &mov->instr);
-         nir_instr_remove(&intrin->instr);
+            mov->dest.write_mask = (1 << intrin->num_components) - 1;
+            nir_ssa_dest_init(&mov->instr, &mov->dest.dest,
+                              intrin->num_components,
+                              intrin->dest.ssa.bit_size, NULL);
 
-         nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
-                                  nir_src_for_ssa(&mov->dest.dest.ssa));
-         break;
-      }
-
-      case nir_intrinsic_store_var: {
-         struct deref_node *node =
-            get_deref_node(intrin->variables[0], state);
-
-         if (node == NULL) {
-            /* Probably an out-of-bounds array store.  That should be a
-             * no-op. */
+            nir_instr_insert_before(&intrin->instr, &mov->instr);
             nir_instr_remove(&intrin->instr);
-            continue;
-         }
 
-         if (!node->lower_to_ssa)
-            continue;
-
-         assert(intrin->num_components ==
-                glsl_get_vector_elements(node->type));
-
-         assert(intrin->src[0].is_ssa);
+            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                     nir_src_for_ssa(&mov->dest.dest.ssa));
+            break;
+         }
 
-         nir_ssa_def *new_def;
-         b.cursor = nir_before_instr(&intrin->instr);
+         case nir_intrinsic_store_var: {
+            struct deref_node *node =
+               get_deref_node(intrin->variables[0], state);
 
-         unsigned wrmask = nir_intrinsic_write_mask(intrin);
-         if (wrmask == (1 << intrin->num_components) - 1) {
-            /* Whole variable store - just copy the source.  Note that
-             * intrin->num_components and intrin->src[0].ssa->num_components
-             * may differ.
-             */
-            unsigned swiz[4];
-            for (unsigned i = 0; i < 4; i++)
-               swiz[i] = i < intrin->num_components ? i : 0;
+            if (node == NULL) {
+               /* Probably an out-of-bounds array store.  That should be a
+                * no-op. */
+               nir_instr_remove(&intrin->instr);
+               continue;
+            }
 
-            new_def = nir_swizzle(&b, intrin->src[0].ssa, swiz,
-                                  intrin->num_components, false);
-         } else {
-            nir_ssa_def *old_def =
-               nir_phi_builder_value_get_block_def(node->pb_value, block);
-            /* For writemasked store_var intrinsics, we combine the newly
-             * written values with the existing contents of unwritten
-             * channels, creating a new SSA value for the whole vector.
-             */
-            nir_ssa_def *srcs[4];
-            for (unsigned i = 0; i < intrin->num_components; i++) {
-               if (wrmask & (1 << i)) {
-                  srcs[i] = nir_channel(&b, intrin->src[0].ssa, i);
-               } else {
-                  srcs[i] = nir_channel(&b, old_def, i);
+            if (!node->lower_to_ssa)
+               continue;
+
+            assert(intrin->num_components ==
+                   glsl_get_vector_elements(node->type));
+
+            assert(intrin->src[0].is_ssa);
+
+            nir_ssa_def *new_def;
+            b.cursor = nir_before_instr(&intrin->instr);
+
+            unsigned wrmask = nir_intrinsic_write_mask(intrin);
+            if (wrmask == (1 << intrin->num_components) - 1) {
+               /* Whole variable store - just copy the source.  Note that
+                * intrin->num_components and intrin->src[0].ssa->num_components
+                * may differ.
+                */
+               unsigned swiz[4];
+               for (unsigned i = 0; i < 4; i++)
+                  swiz[i] = i < intrin->num_components ? i : 0;
+
+               new_def = nir_swizzle(&b, intrin->src[0].ssa, swiz,
+                                     intrin->num_components, false);
+            } else {
+               nir_ssa_def *old_def =
+                  nir_phi_builder_value_get_block_def(node->pb_value, block);
+               /* For writemasked store_var intrinsics, we combine the newly
+                * written values with the existing contents of unwritten
+                * channels, creating a new SSA value for the whole vector.
+                */
+               nir_ssa_def *srcs[4];
+               for (unsigned i = 0; i < intrin->num_components; i++) {
+                  if (wrmask & (1 << i)) {
+                     srcs[i] = nir_channel(&b, intrin->src[0].ssa, i);
+                  } else {
+                     srcs[i] = nir_channel(&b, old_def, i);
+                  }
                }
+               new_def = nir_vec(&b, srcs, intrin->num_components);
             }
-            new_def = nir_vec(&b, srcs, intrin->num_components);
-         }
 
-         assert(new_def->num_components == intrin->num_components);
+            assert(new_def->num_components == intrin->num_components);
 
-         nir_phi_builder_value_set_block_def(node->pb_value, block, new_def);
-         nir_instr_remove(&intrin->instr);
-         break;
-      }
+            nir_phi_builder_value_set_block_def(node->pb_value, block, new_def);
+            nir_instr_remove(&intrin->instr);
+            break;
+         }
 
-      default:
-         break;
+         default:
+            break;
+         }
       }
    }
 
-   for (unsigned i = 0; i < block->num_dom_children; ++i)
-      rename_variables_block(block->dom_children[i], state);
-
    return true;
 }
 
@@ -737,7 +736,7 @@ nir_lower_vars_to_ssa_impl(nir_function_impl *impl)
       }
    }
 
-   rename_variables_block(nir_start_block(impl), &state);
+   rename_variables(&state);
 
    nir_phi_builder_finish(state.phi_builder);
 
diff --git a/src/compiler/nir/nir_opcodes.py b/src/compiler/nir/nir_opcodes.py
index 15066c2..7045c95 100644
--- a/src/compiler/nir/nir_opcodes.py
+++ b/src/compiler/nir/nir_opcodes.py
@@ -257,7 +257,7 @@ unpack_4x8("unorm")
 unpack_2x16("half")
 
 unop_horiz("pack_uvec2_to_uint", 1, tuint32, 2, tuint32, """
-dst.x = (src0.x & 0xffff) | (src0.y >> 16);
+dst.x = (src0.x & 0xffff) | (src0.y << 16);
 """)
 
 unop_horiz("pack_uvec4_to_uint", 1, tuint32, 4, tuint32, """
diff --git a/src/compiler/nir/nir_phi_builder.h b/src/compiler/nir/nir_phi_builder.h
index edc5302..a4dc18a 100644
--- a/src/compiler/nir/nir_phi_builder.h
+++ b/src/compiler/nir/nir_phi_builder.h
@@ -44,7 +44,8 @@
  *         var.pb_val = nir_phi_builder_add_value(pb, var.defs)
  *
  *     // Visit each block.  This needs to visit dominators first;
- *     // nir_for_each_block() will be ok.
+ *     // nir_foreach_block() will be ok.
+ *
  *     foreach block:
  *         foreach instruction:
  *             foreach use of variable var:
diff --git a/src/compiler/spirv/spirv_to_nir.c b/src/compiler/spirv/spirv_to_nir.c
index f1bbfd5..0763cb8 100644
--- a/src/compiler/spirv/spirv_to_nir.c
+++ b/src/compiler/spirv/spirv_to_nir.c
@@ -1718,8 +1718,8 @@ vtn_handle_image(struct vtn_builder *b, SpvOp opcode,
       break;
 
    case SpvOpAtomicCompareExchange:
-      intrin->src[2] = nir_src_for_ssa(vtn_ssa_value(b, w[7])->def);
-      intrin->src[3] = nir_src_for_ssa(vtn_ssa_value(b, w[6])->def);
+      intrin->src[2] = nir_src_for_ssa(vtn_ssa_value(b, w[8])->def);
+      intrin->src[3] = nir_src_for_ssa(vtn_ssa_value(b, w[7])->def);
       break;
 
    case SpvOpAtomicISub:
@@ -1816,8 +1816,8 @@ fill_common_atomic_sources(struct vtn_builder *b, SpvOp opcode,
       break;
 
    case SpvOpAtomicCompareExchange:
-      src[0] = nir_src_for_ssa(vtn_ssa_value(b, w[7])->def);
-      src[1] = nir_src_for_ssa(vtn_ssa_value(b, w[8])->def);
+      src[0] = nir_src_for_ssa(vtn_ssa_value(b, w[8])->def);
+      src[1] = nir_src_for_ssa(vtn_ssa_value(b, w[7])->def);
       break;
       /* Fall through */
 
diff --git a/src/compiler/spirv/vtn_cfg.c b/src/compiler/spirv/vtn_cfg.c
index d9096f4..62b9056 100644
--- a/src/compiler/spirv/vtn_cfg.c
+++ b/src/compiler/spirv/vtn_cfg.c
@@ -239,12 +239,12 @@ vtn_get_branch_type(struct vtn_block *block,
              swcase->fallthrough == block->switch_case);
       swcase->fallthrough = block->switch_case;
       return vtn_branch_type_switch_fallthrough;
-   } else if (block == switch_break) {
-      return vtn_branch_type_switch_break;
    } else if (block == loop_break) {
       return vtn_branch_type_loop_break;
    } else if (block == loop_cont) {
       return vtn_branch_type_loop_continue;
+   } else if (block == switch_break) {
+      return vtn_branch_type_switch_break;
    } else {
       return vtn_branch_type_none;
    }
@@ -443,6 +443,19 @@ vtn_cfg_walk_blocks(struct vtn_builder *b, struct list_head *cf_list,
             vtn_order_case(swtch, case_block->switch_case);
          }
 
+         enum vtn_branch_type branch_type =
+            vtn_get_branch_type(break_block, switch_case, NULL,
+                                loop_break, loop_cont);
+
+         if (branch_type != vtn_branch_type_none) {
+            /* It is possible that the break is actually the continue block
+             * for the containing loop.  In this case, we need to bail and let
+             * the loop parsing code handle the continue properly.
+             */
+            assert(branch_type == vtn_branch_type_loop_continue);
+            return;
+         }
+
          block = break_block;
          continue;
       }
@@ -518,7 +531,7 @@ vtn_handle_phi_second_pass(struct vtn_builder *b, SpvOp opcode,
       struct vtn_block *pred =
          vtn_value(b, w[i + 1], vtn_value_type_block)->block;
 
-      b->nb.cursor = nir_after_block_before_jump(pred->end_block);
+      b->nb.cursor = nir_after_instr(&pred->end_nop->instr);
 
       vtn_local_store(b, src, nir_deref_var_create(b, phi_var));
    }
@@ -576,7 +589,9 @@ vtn_emit_cf_list(struct vtn_builder *b, struct list_head *cf_list,
 
          vtn_foreach_instruction(b, block_start, block_end, handler);
 
-         block->end_block = nir_cursor_current_block(b->nb.cursor);
+         block->end_nop = nir_intrinsic_instr_create(b->nb.shader,
+                                                     nir_intrinsic_nop);
+         nir_builder_instr_insert(&b->nb, &block->end_nop->instr);
 
          if ((*block->branch & SpvOpCodeMask) == SpvOpReturnValue) {
             struct vtn_ssa_value *src = vtn_ssa_value(b, block->branch[1]);
diff --git a/src/compiler/spirv/vtn_private.h b/src/compiler/spirv/vtn_private.h
index 7f5444e..6f34f09 100644
--- a/src/compiler/spirv/vtn_private.h
+++ b/src/compiler/spirv/vtn_private.h
@@ -149,8 +149,8 @@ struct vtn_block {
    /** Points to the switch case started by this block (if any) */
    struct vtn_case *switch_case;
 
-   /** The last block in this SPIR-V block. */
-   nir_block *end_block;
+   /** Every block ends in a nop intrinsic so that we can find it again */
+   nir_intrinsic_instr *end_nop;
 };
 
 struct vtn_function {
diff --git a/src/compiler/spirv/vtn_variables.c b/src/compiler/spirv/vtn_variables.c
index fe2494b..459e573 100644
--- a/src/compiler/spirv/vtn_variables.c
+++ b/src/compiler/spirv/vtn_variables.c
@@ -889,81 +889,9 @@ vtn_get_builtin_location(struct vtn_builder *b,
 }
 
 static void
-var_decoration_cb(struct vtn_builder *b, struct vtn_value *val, int member,
-                  const struct vtn_decoration *dec, void *void_var)
+apply_var_decoration(struct vtn_builder *b, nir_variable *nir_var,
+                     const struct vtn_decoration *dec)
 {
-   struct vtn_variable *vtn_var = void_var;
-
-   /* Handle decorations that apply to a vtn_variable as a whole */
-   switch (dec->decoration) {
-   case SpvDecorationBinding:
-      vtn_var->binding = dec->literals[0];
-      return;
-   case SpvDecorationDescriptorSet:
-      vtn_var->descriptor_set = dec->literals[0];
-      return;
-   default:
-      break;
-   }
-
-   /* Now we handle decorations that apply to a particular nir_variable */
-   nir_variable *nir_var = vtn_var->var;
-   if (val->value_type == vtn_value_type_access_chain) {
-      assert(val->access_chain->length == 0);
-      assert(val->access_chain->var == void_var);
-      assert(member == -1);
-   } else {
-      assert(val->value_type == vtn_value_type_type);
-      if (member != -1)
-         nir_var = vtn_var->members[member];
-   }
-
-   /* Location is odd in that it can apply in three different cases: To a
-    * non-split variable, to a whole split variable, or to one structure
-    * member of a split variable.
-    */
-   if (dec->decoration == SpvDecorationLocation) {
-      unsigned location = dec->literals[0];
-      bool is_vertex_input;
-      if (b->shader->stage == MESA_SHADER_FRAGMENT &&
-          vtn_var->mode == vtn_variable_mode_output) {
-         is_vertex_input = false;
-         location += FRAG_RESULT_DATA0;
-      } else if (b->shader->stage == MESA_SHADER_VERTEX &&
-                 vtn_var->mode == vtn_variable_mode_input) {
-         is_vertex_input = true;
-         location += VERT_ATTRIB_GENERIC0;
-      } else if (vtn_var->mode == vtn_variable_mode_input ||
-                 vtn_var->mode == vtn_variable_mode_output) {
-         is_vertex_input = false;
-         location += VARYING_SLOT_VAR0;
-      } else {
-         assert(!"Location must be on input or output variable");
-      }
-
-      if (nir_var) {
-         /* This handles the member and lone variable cases */
-         nir_var->data.location = location;
-         nir_var->data.explicit_location = true;
-      } else {
-         /* This handles the structure member case */
-         assert(vtn_var->members);
-         unsigned length =
-            glsl_get_length(glsl_without_array(vtn_var->type->type));
-         for (unsigned i = 0; i < length; i++) {
-            vtn_var->members[i]->data.location = location;
-            vtn_var->members[i]->data.explicit_location = true;
-            location +=
-               glsl_count_attribute_slots(vtn_var->members[i]->interface_type,
-                                          is_vertex_input);
-         }
-      }
-      return;
-   }
-
-   if (nir_var == NULL)
-      return;
-
    switch (dec->decoration) {
    case SpvDecorationRelaxedPrecision:
       break; /* FIXME: Do nothing with this for now. */
@@ -1080,6 +1008,99 @@ var_decoration_cb(struct vtn_builder *b, struct vtn_value *val, int member,
    }
 }
 
+static void
+var_decoration_cb(struct vtn_builder *b, struct vtn_value *val, int member,
+                  const struct vtn_decoration *dec, void *void_var)
+{
+   struct vtn_variable *vtn_var = void_var;
+
+   /* Handle decorations that apply to a vtn_variable as a whole */
+   switch (dec->decoration) {
+   case SpvDecorationBinding:
+      vtn_var->binding = dec->literals[0];
+      return;
+   case SpvDecorationDescriptorSet:
+      vtn_var->descriptor_set = dec->literals[0];
+      return;
+   default:
+      break;
+   }
+
+   if (val->value_type == vtn_value_type_access_chain) {
+      assert(val->access_chain->length == 0);
+      assert(val->access_chain->var == void_var);
+      assert(member == -1);
+   } else {
+      assert(val->value_type == vtn_value_type_type);
+   }
+
+   /* Location is odd.  If applied to a split structure, we have to walk the
+    * whole thing and accumulate the location.  It's easier to handle as a
+    * special case.
+    */
+   if (dec->decoration == SpvDecorationLocation) {
+      unsigned location = dec->literals[0];
+      bool is_vertex_input;
+      if (b->shader->stage == MESA_SHADER_FRAGMENT &&
+          vtn_var->mode == vtn_variable_mode_output) {
+         is_vertex_input = false;
+         location += FRAG_RESULT_DATA0;
+      } else if (b->shader->stage == MESA_SHADER_VERTEX &&
+                 vtn_var->mode == vtn_variable_mode_input) {
+         is_vertex_input = true;
+         location += VERT_ATTRIB_GENERIC0;
+      } else if (vtn_var->mode == vtn_variable_mode_input ||
+                 vtn_var->mode == vtn_variable_mode_output) {
+         is_vertex_input = false;
+         location += VARYING_SLOT_VAR0;
+      } else {
+         assert(!"Location must be on input or output variable");
+      }
+
+      if (vtn_var->var) {
+         /* This handles the member and lone variable cases */
+         vtn_var->var->data.location = location;
+         vtn_var->var->data.explicit_location = true;
+      } else {
+         /* This handles the structure member case */
+         assert(vtn_var->members);
+         unsigned length =
+            glsl_get_length(glsl_without_array(vtn_var->type->type));
+         for (unsigned i = 0; i < length; i++) {
+            vtn_var->members[i]->data.location = location;
+            vtn_var->members[i]->data.explicit_location = true;
+            location +=
+               glsl_count_attribute_slots(vtn_var->members[i]->interface_type,
+                                          is_vertex_input);
+         }
+      }
+      return;
+   } else {
+      if (vtn_var->var) {
+         assert(member <= 0);
+         apply_var_decoration(b, vtn_var->var, dec);
+      } else if (vtn_var->members) {
+         if (member >= 0) {
+            assert(vtn_var->members);
+            apply_var_decoration(b, vtn_var->members[member], dec);
+         } else {
+            unsigned length =
+               glsl_get_length(glsl_without_array(vtn_var->type->type));
+            for (unsigned i = 0; i < length; i++)
+               apply_var_decoration(b, vtn_var->members[i], dec);
+         }
+      } else {
+         /* A few variables, those with external storage, have no actual
+          * nir_variables associated with them.  Fortunately, all decorations
+          * we care about for those variables are on the type only.
+          */
+         assert(vtn_var->mode == vtn_variable_mode_ubo ||
+                vtn_var->mode == vtn_variable_mode_ssbo ||
+                vtn_var->mode == vtn_variable_mode_push_constant);
+      }
+   }
+}
+
 /* Tries to compute the size of an interface block based on the strides and
  * offsets that are provided to us in the SPIR-V source.
  */
@@ -1173,7 +1194,7 @@ vtn_handle_variables(struct vtn_builder *b, SpvOp opcode,
       case SpvStorageClassPushConstant:
          var->mode = vtn_variable_mode_push_constant;
          assert(b->shader->num_uniforms == 0);
-         b->shader->num_uniforms = vtn_type_block_size(var->type) * 4;
+         b->shader->num_uniforms = vtn_type_block_size(var->type);
          break;
       case SpvStorageClassInput:
          var->mode = vtn_variable_mode_input;
diff --git a/src/egl/drivers/dri2/egl_dri2.c b/src/egl/drivers/dri2/egl_dri2.c
index fe33ecd..c6d4fb5 100644
--- a/src/egl/drivers/dri2/egl_dri2.c
+++ b/src/egl/drivers/dri2/egl_dri2.c
@@ -242,6 +242,15 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
             return NULL;
          break;
 
+      case __DRI_ATTRIB_MAX_PBUFFER_WIDTH:
+         _eglSetConfigKey(&base, EGL_MAX_PBUFFER_WIDTH,
+                          _EGL_MAX_PBUFFER_WIDTH);
+         break;
+      case __DRI_ATTRIB_MAX_PBUFFER_HEIGHT:
+         _eglSetConfigKey(&base, EGL_MAX_PBUFFER_HEIGHT,
+                          _EGL_MAX_PBUFFER_HEIGHT);
+         break;
+
       default:
 	 key = dri2_to_egl_attribute_map[attrib];
 	 if (key != 0)
@@ -320,6 +329,15 @@ dri2_add_config(_EGLDisplay *disp, const __DRIconfig *dri_config, int id,
       surface_type &= ~EGL_PIXMAP_BIT;
    }
 
+   /* No support for pbuffer + MSAA for now.
+    *
+    * XXX TODO: pbuffer + MSAA does not work and causes crashes.
+    * See QT bugreport: https://bugreports.qt.io/browse/QTBUG-47509
+    */
+   if (base.Samples) {
+      surface_type &= ~EGL_PBUFFER_BIT;
+   }
+
    conf->base.SurfaceType |= surface_type;
 
    return conf;
@@ -758,64 +776,99 @@ dri2_create_screen(_EGLDisplay *disp)
 
 /**
  * Called via eglInitialize(), GLX_drv->API.Initialize().
+ *
+ * This must be guaranteed to be called exactly once, even if eglInitialize is
+ * called many times (without a eglTerminate in between).
  */
 static EGLBoolean
 dri2_initialize(_EGLDriver *drv, _EGLDisplay *disp)
 {
+   EGLBoolean ret = EGL_FALSE;
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+
+   /* In the case where the application calls eglMakeCurrent(context1),
+    * eglTerminate, then eglInitialize again (without a call to eglReleaseThread
+    * or eglMakeCurrent(NULL) before that), dri2_dpy structure is still
+    * initialized, as we need it to be able to free context1 correctly.
+    *
+    * It would probably be safest to forcibly release the display with
+    * dri2_display_release, to make sure the display is reinitialized correctly.
+    * However, the EGL spec states that we need to keep a reference to the
+    * current context (so we cannot call dri2_make_current(NULL)), and therefore
+    * we would leak context1 as we would be missing the old display connection
+    * to free it up correctly.
+    */
+   if (dri2_dpy) {
+      dri2_dpy->ref_count++;
+      return EGL_TRUE;
+   }
+
    /* not until swrast_dri is supported */
    if (disp->Options.UseFallback)
       return EGL_FALSE;
 
+   /* Nothing to initialize for a test only display */
+   if (disp->Options.TestOnly)
+      return EGL_TRUE;
+
    switch (disp->Platform) {
 #ifdef HAVE_SURFACELESS_PLATFORM
    case _EGL_PLATFORM_SURFACELESS:
-      if (disp->Options.TestOnly)
-         return EGL_TRUE;
-      return dri2_initialize_surfaceless(drv, disp);
+      ret = dri2_initialize_surfaceless(drv, disp);
+      break;
 #endif
-
 #ifdef HAVE_X11_PLATFORM
    case _EGL_PLATFORM_X11:
-      if (disp->Options.TestOnly)
-         return EGL_TRUE;
-      return dri2_initialize_x11(drv, disp);
+      ret = dri2_initialize_x11(drv, disp);
+      break;
 #endif
-
 #ifdef HAVE_DRM_PLATFORM
    case _EGL_PLATFORM_DRM:
-      if (disp->Options.TestOnly)
-         return EGL_TRUE;
-      return dri2_initialize_drm(drv, disp);
+      ret = dri2_initialize_drm(drv, disp);
+      break;
 #endif
 #ifdef HAVE_WAYLAND_PLATFORM
    case _EGL_PLATFORM_WAYLAND:
-      if (disp->Options.TestOnly)
-         return EGL_TRUE;
-      return dri2_initialize_wayland(drv, disp);
+      ret = dri2_initialize_wayland(drv, disp);
+      break;
 #endif
 #ifdef HAVE_ANDROID_PLATFORM
    case _EGL_PLATFORM_ANDROID:
-      if (disp->Options.TestOnly)
-         return EGL_TRUE;
-      return dri2_initialize_android(drv, disp);
+      ret = dri2_initialize_android(drv, disp);
+      break;
 #endif
-
    default:
       _eglLog(_EGL_WARNING, "No EGL platform enabled.");
       return EGL_FALSE;
    }
+
+   if (ret) {
+      dri2_dpy = dri2_egl_display(disp);
+
+      if (!dri2_dpy) {
+         return EGL_FALSE;
+      }
+
+      dri2_dpy->ref_count++;
+   }
+
+   return ret;
 }
 
 /**
- * Called via eglTerminate(), drv->API.Terminate().
+ * Decrement display reference count, and free up display if necessary.
  */
-static EGLBoolean
-dri2_terminate(_EGLDriver *drv, _EGLDisplay *disp)
-{
+static void
+dri2_display_release(_EGLDisplay *disp) {
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    unsigned i;
 
-   _eglReleaseDisplayResources(drv, disp);
+   assert(dri2_dpy->ref_count > 0);
+   dri2_dpy->ref_count--;
+
+   if (dri2_dpy->ref_count > 0)
+      return;
+
    _eglCleanupDisplay(disp);
 
    if (dri2_dpy->own_dri_screen)
@@ -870,6 +923,21 @@ dri2_terminate(_EGLDriver *drv, _EGLDisplay *disp)
    }
    free(dri2_dpy);
    disp->DriverData = NULL;
+}
+
+/**
+ * Called via eglTerminate(), drv->API.Terminate().
+ *
+ * This must be guaranteed to be called exactly once, even if eglTerminate is
+ * called many times (without a eglInitialize in between).
+ */
+static EGLBoolean
+dri2_terminate(_EGLDriver *drv, _EGLDisplay *disp)
+{
+   /* Release all non-current Context/Surfaces. */
+   _eglReleaseDisplayResources(drv, disp);
+
+   dri2_display_release(disp);
 
    return EGL_TRUE;
 }
@@ -1189,10 +1257,16 @@ dri2_make_current(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *dsurf,
    _EGLSurface *tmp_dsurf, *tmp_rsurf;
    __DRIdrawable *ddraw, *rdraw;
    __DRIcontext *cctx;
+   EGLBoolean unbind;
+
+   if (!dri2_dpy)
+      return _eglError(EGL_NOT_INITIALIZED, "eglMakeCurrent");
 
    /* make new bindings */
-   if (!_eglBindContext(ctx, dsurf, rsurf, &old_ctx, &old_dsurf, &old_rsurf))
+   if (!_eglBindContext(ctx, dsurf, rsurf, &old_ctx, &old_dsurf, &old_rsurf)) {
+      /* _eglBindContext already sets the EGL error (in _eglCheckMakeCurrent) */
       return EGL_FALSE;
+   }
 
    /* flush before context switch */
    if (old_ctx && dri2_drv->glFlush)
@@ -1207,14 +1281,21 @@ dri2_make_current(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *dsurf,
       dri2_dpy->core->unbindContext(old_cctx);
    }
 
-   if ((cctx == NULL && ddraw == NULL && rdraw == NULL) ||
-       dri2_dpy->core->bindContext(cctx, ddraw, rdraw)) {
+   unbind = (cctx == NULL && ddraw == NULL && rdraw == NULL);
+
+   if (unbind || dri2_dpy->core->bindContext(cctx, ddraw, rdraw)) {
       if (old_dsurf)
          drv->API.DestroySurface(drv, disp, old_dsurf);
       if (old_rsurf)
          drv->API.DestroySurface(drv, disp, old_rsurf);
-      if (old_ctx)
+
+      if (!unbind)
+         dri2_dpy->ref_count++;
+      if (old_ctx) {
+         EGLDisplay old_disp = _eglGetDisplayHandle(old_ctx->Resource.Display);
          drv->API.DestroyContext(drv, disp, old_ctx);
+         dri2_display_release(old_disp);
+      }
 
       return EGL_TRUE;
    } else {
@@ -1232,7 +1313,11 @@ dri2_make_current(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *dsurf,
       _eglPutSurface(old_rsurf);
       _eglPutContext(old_ctx);
 
-      return EGL_FALSE;
+      /* dri2_dpy->core->bindContext failed. We cannot tell for sure why, but
+       * setting the error to EGL_BAD_MATCH is surely better than leaving it
+       * as EGL_SUCCESS.
+       */
+      return _eglError(EGL_BAD_MATCH, "eglMakeCurrent");
    }
 }
 
diff --git a/src/egl/drivers/dri2/egl_dri2.h b/src/egl/drivers/dri2/egl_dri2.h
index 925294b..6099bc2 100644
--- a/src/egl/drivers/dri2/egl_dri2.h
+++ b/src/egl/drivers/dri2/egl_dri2.h
@@ -80,8 +80,6 @@
 #include "eglimage.h"
 #include "eglsync.h"
 
-#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
-
 struct wl_buffer;
 
 struct dri2_egl_driver
@@ -177,6 +175,10 @@ struct dri2_egl_display
    const __DRI2interopExtension *interop;
    int                       fd;
 
+   /* dri2_initialize/dri2_terminate increment/decrement this count, so does
+    * dri2_make_current (tracks if there are active contexts/surfaces). */
+   int                       ref_count;
+
    int                       own_device;
    int                       swap_available;
    int                       invalidate_available;
diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c
index 87bd19b..351fd0f 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -29,7 +29,7 @@
 
 #include <errno.h>
 #include <dlfcn.h>
-
+#include <fcntl.h>
 #if 0
 #include <xf86drm.h>
 #endif
@@ -170,6 +170,8 @@ droid_window_dequeue_buffer(struct dri2_egl_surface *dri2_surf)
 static EGLBoolean
 droid_window_enqueue_buffer(_EGLDisplay *disp, struct dri2_egl_surface *dri2_surf)
 {
+   struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
+
    /* To avoid blocking other EGL calls, release the display mutex before
     * we enter droid_window_enqueue_buffer() and re-acquire the mutex upon
     * return.
@@ -200,6 +202,12 @@ droid_window_enqueue_buffer(_EGLDisplay *disp, struct dri2_egl_surface *dri2_sur
    dri2_surf->buffer = NULL;
 
    mtx_lock(&disp->Mutex);
+
+   if (dri2_surf->dri_image) {
+      dri2_dpy->image->destroyImage(dri2_surf->dri_image);
+      dri2_surf->dri_image = NULL;
+   }
+
    return EGL_TRUE;
 }
 
@@ -291,6 +299,8 @@ droid_create_surface(_EGLDriver *drv, _EGLDisplay *disp, EGLint type,
 
    config = dri2_get_dri_config(dri2_conf, EGL_WINDOW_BIT,
                                 dri2_surf->base.GLColorspace);
+   if (!config)
+      goto cleanup_surface;
 
    if (dri2_dpy->dri2) {
       dri2_surf->dri_drawable =
@@ -384,6 +394,9 @@ get_back_bo(struct dri2_egl_surface *dri2_surf)
    int fourcc, pitch;
    int offset = 0, fd;
 
+   if (dri2_surf->dri_image)
+	   return 0;
+
    if (!dri2_surf->buffer)
       return -1;
 
@@ -442,10 +455,8 @@ droid_image_get_buffers(__DRIdrawable *driDrawable,
 static EGLBoolean
 droid_swap_buffers(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *draw)
 {
-   struct dri2_egl_driver *dri2_drv = dri2_egl_driver(drv);
    struct dri2_egl_display *dri2_dpy = dri2_egl_display(disp);
    struct dri2_egl_surface *dri2_surf = dri2_egl_surface(draw);
-   _EGLContext *ctx;
 
    if (dri2_surf->base.Type != EGL_WINDOW_BIT)
       return EGL_TRUE;
@@ -986,7 +997,7 @@ droid_open_device(void)
       fd = -1;
    }
 
-   return (fd >= 0) ? dup(fd) : -1;
+   return (fd >= 0) ? fcntl(fd, F_DUPFD_CLOEXEC, 3) : -1;
 }
 
 /* support versions < JellyBean */
@@ -1134,6 +1145,7 @@ cleanup_device:
    close(dri2_dpy->fd);
 cleanup_display:
    free(dri2_dpy);
+   dpy->DriverData = NULL;
 
    return _eglError(EGL_NOT_INITIALIZED, err);
 }
diff --git a/src/egl/drivers/dri2/platform_drm.c b/src/egl/drivers/dri2/platform_drm.c
index 9373496..1ce282f 100644
--- a/src/egl/drivers/dri2/platform_drm.c
+++ b/src/egl/drivers/dri2/platform_drm.c
@@ -726,5 +726,6 @@ cleanup:
       close(fd);
 
    free(dri2_dpy);
+   disp->DriverData = NULL;
    return EGL_FALSE;
 }
diff --git a/src/egl/drivers/dri2/platform_surfaceless.c b/src/egl/drivers/dri2/platform_surfaceless.c
index e0ddc12..323a8d7 100644
--- a/src/egl/drivers/dri2/platform_surfaceless.c
+++ b/src/egl/drivers/dri2/platform_surfaceless.c
@@ -157,6 +157,7 @@ cleanup_driver:
    close(dri2_dpy->fd);
 cleanup_display:
    free(dri2_dpy);
+   disp->DriverData = NULL;
 
    return _eglError(EGL_NOT_INITIALIZED, err);
 }
diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c
index ff0d5c8..1a295d5 100644
--- a/src/egl/drivers/dri2/platform_wayland.c
+++ b/src/egl/drivers/dri2/platform_wayland.c
@@ -118,6 +118,13 @@ resize_callback(struct wl_egl_window *wl_win, void *data)
    (*dri2_dpy->flush->invalidate)(dri2_surf->dri_drawable);
 }
 
+static void
+destroy_window_callback(void *data)
+{
+   struct dri2_egl_surface *dri2_surf = data;
+   dri2_surf->wl_win = NULL;
+}
+
 /**
  * Called via eglCreateWindowSurface(), drv->API.CreateWindowSurface().
  */
@@ -159,6 +166,7 @@ dri2_wl_create_surface(_EGLDriver *drv, _EGLDisplay *disp,
 
    dri2_surf->wl_win->private = dri2_surf;
    dri2_surf->wl_win->resize_callback = resize_callback;
+   dri2_surf->wl_win->destroy_window_callback = destroy_window_callback;
 
    dri2_surf->base.Width =  -1;
    dri2_surf->base.Height = -1;
@@ -257,8 +265,11 @@ dri2_wl_destroy_surface(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf)
    if (dri2_surf->throttle_callback)
       wl_callback_destroy(dri2_surf->throttle_callback);
 
-   dri2_surf->wl_win->private = NULL;
-   dri2_surf->wl_win->resize_callback = NULL;
+   if (dri2_surf->wl_win) {
+      dri2_surf->wl_win->private = NULL;
+      dri2_surf->wl_win->resize_callback = NULL;
+      dri2_surf->wl_win->destroy_window_callback = NULL;
+   }
 
    free(surf);
 
@@ -1238,6 +1249,7 @@ dri2_initialize_wayland_drm(_EGLDriver *drv, _EGLDisplay *disp)
    wl_event_queue_destroy(dri2_dpy->wl_queue);
  cleanup_dpy:
    free(dri2_dpy);
+   disp->DriverData = NULL;
 
    return EGL_FALSE;
 }
@@ -1883,6 +1895,7 @@ dri2_initialize_wayland_swrast(_EGLDriver *drv, _EGLDisplay *disp)
    wl_event_queue_destroy(dri2_dpy->wl_queue);
  cleanup_dpy:
    free(dri2_dpy);
+   disp->DriverData = NULL;
 
    return EGL_FALSE;
 }
diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c
index c0a4005..792cabe 100644
--- a/src/egl/drivers/dri2/platform_x11.c
+++ b/src/egl/drivers/dri2/platform_x11.c
@@ -1231,6 +1231,7 @@ dri2_initialize_x11_swrast(_EGLDriver *drv, _EGLDisplay *disp)
       xcb_disconnect(dri2_dpy->conn);
  cleanup_dpy:
    free(dri2_dpy);
+   disp->DriverData = NULL;
 
    return EGL_FALSE;
 }
@@ -1302,15 +1303,13 @@ dri2_initialize_x11_dri3(_EGLDriver *drv, _EGLDisplay *disp)
       dri2_dpy->screen = DefaultScreen(dpy);
    }
 
-   if (xcb_connection_has_error(dri2_dpy->conn)) {
+   if (!dri2_dpy->conn || xcb_connection_has_error(dri2_dpy->conn)) {
       _eglLog(_EGL_WARNING, "DRI3: xcb_connect failed");
       goto cleanup_dpy;
    }
 
-   if (dri2_dpy->conn) {
-      if (!dri3_x11_connect(dri2_dpy))
-         goto cleanup_conn;
-   }
+   if (!dri3_x11_connect(dri2_dpy))
+      goto cleanup_conn;
 
    if (!dri2_load_driver_dri3(disp))
       goto cleanup_conn;
@@ -1338,10 +1337,8 @@ dri2_initialize_x11_dri3(_EGLDriver *drv, _EGLDisplay *disp)
    disp->Extensions.WL_bind_wayland_display = EGL_TRUE;
 #endif
 
-   if (dri2_dpy->conn) {
-      if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp, false))
-         goto cleanup_configs;
-   }
+   if (!dri2_x11_add_configs_for_visuals(dri2_dpy, disp, false))
+      goto cleanup_configs;
 
    dri2_dpy->loader_dri3_ext.core = dri2_dpy->core;
    dri2_dpy->loader_dri3_ext.image_driver = dri2_dpy->image_driver;
@@ -1370,6 +1367,7 @@ dri2_initialize_x11_dri3(_EGLDriver *drv, _EGLDisplay *disp)
       xcb_disconnect(dri2_dpy->conn);
  cleanup_dpy:
    free(dri2_dpy);
+   disp->DriverData = NULL;
 
    return EGL_FALSE;
 }
@@ -1467,6 +1465,7 @@ dri2_initialize_x11_dri2(_EGLDriver *drv, _EGLDisplay *disp)
       xcb_disconnect(dri2_dpy->conn);
  cleanup_dpy:
    free(dri2_dpy);
+   disp->DriverData = NULL;
 
    return EGL_FALSE;
 }
diff --git a/src/egl/drivers/dri2/platform_x11_dri3.c b/src/egl/drivers/dri2/platform_x11_dri3.c
index 9363a8a..69bfcd8 100644
--- a/src/egl/drivers/dri2/platform_x11_dri3.c
+++ b/src/egl/drivers/dri2/platform_x11_dri3.c
@@ -103,6 +103,17 @@ egl_dri3_get_dri_context(struct loader_dri3_drawable *draw)
    return dri2_ctx->dri_context;
 }
 
+static __DRIscreen *
+egl_dri3_get_dri_screen(struct loader_dri3_drawable *draw)
+{
+   _EGLContext *ctx = _eglGetCurrentContext();
+   struct dri2_egl_context *dri2_ctx;
+   if (!ctx)
+      return NULL;
+   dri2_ctx = dri2_egl_context(ctx);
+   return dri2_egl_display(dri2_ctx->base.Resource.Display)->dri_screen;
+}
+
 static void
 egl_dri3_flush_drawable(struct loader_dri3_drawable *draw, unsigned flags)
 {
@@ -119,6 +130,7 @@ static struct loader_dri3_vtable egl_dri3_vtable = {
    .set_drawable_size = egl_dri3_set_drawable_size,
    .in_current_context = egl_dri3_in_current_context,
    .get_dri_context = egl_dri3_get_dri_context,
+   .get_dri_screen = egl_dri3_get_dri_screen,
    .flush_drawable = egl_dri3_flush_drawable,
    .show_fps = NULL,
 };
diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index 4700dbe..127ca1e 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -627,7 +627,9 @@ eglCreateContext(EGLDisplay dpy, EGLConfig config, EGLContext share_list,
 
    _EGL_CHECK_DISPLAY(disp, EGL_NO_CONTEXT, drv);
 
-   if (!config && !disp->Extensions.MESA_configless_context)
+   if (config)
+      _EGL_CHECK_CONFIG(disp, conf, EGL_NO_CONTEXT, drv);
+   else if (!disp->Extensions.MESA_configless_context)
       RETURN_EGL_ERROR(disp, EGL_BAD_CONFIG, EGL_NO_CONTEXT);
 
    if (!share && share_list != EGL_NO_CONTEXT)
@@ -1937,7 +1939,7 @@ _eglLockDisplayInterop(EGLDisplay dpy, EGLContext context,
    return MESA_GLINTEROP_SUCCESS;
 }
 
-int
+PUBLIC int
 MesaGLInteropEGLQueryDeviceInfo(EGLDisplay dpy, EGLContext context,
                                 struct mesa_glinterop_device_info *out)
 {
@@ -1959,7 +1961,7 @@ MesaGLInteropEGLQueryDeviceInfo(EGLDisplay dpy, EGLContext context,
    return ret;
 }
 
-int
+PUBLIC int
 MesaGLInteropEGLExportObject(EGLDisplay dpy, EGLContext context,
                              struct mesa_glinterop_export_in *in,
                              struct mesa_glinterop_export_out *out)
diff --git a/src/egl/main/egldefines.h b/src/egl/main/egldefines.h
index 13a7563..6090fc3 100644
--- a/src/egl/main/egldefines.h
+++ b/src/egl/main/egldefines.h
@@ -34,6 +34,8 @@
 #ifndef EGLDEFINES_INCLUDED
 #define EGLDEFINES_INCLUDED
 
+#include "util/macros.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -48,7 +50,6 @@ extern "C" {
 
 #define _EGL_VENDOR_STRING "Mesa Project"
 
-#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
 #define MIN2(A, B)  (((A) < (B)) ? (A) : (B))
 
 #ifdef __cplusplus
diff --git a/src/egl/main/eglglobals.c b/src/egl/main/eglglobals.c
index 938d953..1be6797 100644
--- a/src/egl/main/eglglobals.c
+++ b/src/egl/main/eglglobals.c
@@ -53,10 +53,16 @@ struct _egl_global _eglGlobal =
    /* ClientExtensionsString */
    "EGL_EXT_client_extensions"
    " EGL_EXT_platform_base"
+#ifdef HAVE_WAYLAND_PLATFORM
    " EGL_EXT_platform_wayland"
+#endif
+#ifdef HAVE_X11_PLATFORM
    " EGL_EXT_platform_x11"
-   " EGL_KHR_client_get_all_proc_addresses"
+#endif
+#ifdef HAVE_DRM_PLATFORM
    " EGL_MESA_platform_gbm"
+#endif
+   " EGL_KHR_client_get_all_proc_addresses"
 };
 
 
diff --git a/src/egl/main/eglsurface.c b/src/egl/main/eglsurface.c
index 17d7907..e8ee49c 100644
--- a/src/egl/main/eglsurface.c
+++ b/src/egl/main/eglsurface.c
@@ -262,9 +262,13 @@ _eglInitSurface(_EGLSurface *surf, _EGLDisplay *dpy, EGLint type,
 {
    const char *func;
    EGLint renderBuffer = EGL_BACK_BUFFER;
-   EGLint swapBehavior = EGL_BUFFER_PRESERVED;
+   EGLint swapBehavior = EGL_BUFFER_DESTROYED;
    EGLint err;
 
+   /* Swap behavior can be preserved only if config supports this. */
+   if (conf->SurfaceType & EGL_SWAP_BEHAVIOR_PRESERVED_BIT)
+      swapBehavior = EGL_BUFFER_PRESERVED;
+
    switch (type) {
    case EGL_WINDOW_BIT:
       func = "eglCreateWindowSurface";
diff --git a/src/egl/main/eglsync.c b/src/egl/main/eglsync.c
index 33625e9..f325031 100644
--- a/src/egl/main/eglsync.c
+++ b/src/egl/main/eglsync.c
@@ -26,6 +26,7 @@
  **************************************************************************/
 
 
+#include <inttypes.h>
 #include <string.h>
 
 #include "eglsync.h"
@@ -75,8 +76,8 @@ _eglParseSyncAttribList64(_EGLSync *sync, const EGLAttrib *attrib_list)
       return EGL_SUCCESS;
 
    for (i = 0; attrib_list[i] != EGL_NONE; i++) {
-      EGLint attr = attrib_list[i++];
-      EGLint val = attrib_list[i];
+      EGLAttrib attr = attrib_list[i++];
+      EGLAttrib val = attrib_list[i];
 
       switch (attr) {
       case EGL_CL_EVENT_HANDLE_KHR:
@@ -92,7 +93,7 @@ _eglParseSyncAttribList64(_EGLSync *sync, const EGLAttrib *attrib_list)
       }
 
       if (err != EGL_SUCCESS) {
-         _eglLog(_EGL_DEBUG, "bad sync attribute 0x%04x", attr);
+         _eglLog(_EGL_DEBUG, "bad sync attribute 0x%" PRIxPTR, attr);
          break;
       }
    }
diff --git a/src/egl/wayland/wayland-egl/wayland-egl-priv.h b/src/egl/wayland/wayland-egl/wayland-egl-priv.h
index f1e3ba2..c91f9cd 100644
--- a/src/egl/wayland/wayland-egl/wayland-egl-priv.h
+++ b/src/egl/wayland/wayland-egl/wayland-egl-priv.h
@@ -27,6 +27,7 @@ struct wl_egl_window {
 
 	void *private;
 	void (*resize_callback)(struct wl_egl_window *, void *);
+	void (*destroy_window_callback)(void *);
 };
 
 #ifdef  __cplusplus
diff --git a/src/egl/wayland/wayland-egl/wayland-egl.c b/src/egl/wayland/wayland-egl/wayland-egl.c
index 80a5be5..4a4701a 100644
--- a/src/egl/wayland/wayland-egl/wayland-egl.c
+++ b/src/egl/wayland/wayland-egl/wayland-egl.c
@@ -66,6 +66,7 @@ wl_egl_window_create(struct wl_surface *surface,
 	egl_window->surface = surface;
 	egl_window->private = NULL;
 	egl_window->resize_callback = NULL;
+	egl_window->destroy_window_callback = NULL;
 	wl_egl_window_resize(egl_window, width, height, 0, 0);
 	egl_window->attached_width  = 0;
 	egl_window->attached_height = 0;
@@ -76,6 +77,8 @@ wl_egl_window_create(struct wl_surface *surface,
 WL_EGL_EXPORT void
 wl_egl_window_destroy(struct wl_egl_window *egl_window)
 {
+	if (egl_window->destroy_window_callback)
+		egl_window->destroy_window_callback(egl_window->private);
 	free(egl_window);
 }
 
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 6077976..013bc88 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -310,7 +310,8 @@ C_SOURCES := \
 	util/u_upload_mgr.h \
 	util/u_vbuf.c \
 	util/u_vbuf.h \
-	util/u_video.h
+	util/u_video.h \
+	util/u_viewport.h
 
 NIR_SOURCES := \
 	nir/tgsi_to_nir.c \
diff --git a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
index c8e1f13..0fbc78e 100644
--- a/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
+++ b/src/gallium/auxiliary/pipe-loader/pipe_loader_sw.c
@@ -45,6 +45,7 @@ struct pipe_loader_sw_device {
    struct util_dl_library *lib;
 #endif
    struct sw_winsys *ws;
+   int fd;
 };
 
 #define pipe_loader_sw_device(dev) ((struct pipe_loader_sw_device *)dev)
@@ -92,6 +93,7 @@ pipe_loader_sw_probe_init_common(struct pipe_loader_sw_device *sdev)
    sdev->base.type = PIPE_LOADER_DEVICE_SOFTWARE;
    sdev->base.driver_name = "swrast";
    sdev->base.ops = &pipe_loader_sw_ops;
+   sdev->fd = -1;
 
 #ifdef GALLIUM_STATIC_TARGETS
    sdev->dd = &driver_descriptors;
@@ -169,6 +171,8 @@ pipe_loader_sw_probe_kms(struct pipe_loader_device **devs, int fd)
    if (!pipe_loader_sw_probe_init_common(sdev))
       goto fail;
 
+   sdev->fd = fd;
+
    for (i = 0; sdev->dd->winsys[i].name; i++) {
       if (strcmp(sdev->dd->winsys[i].name, "kms_dri") == 0) {
          sdev->ws = sdev->dd->winsys[i].create_winsys(fd);
@@ -273,6 +277,11 @@ pipe_loader_sw_release(struct pipe_loader_device **dev)
       util_dl_close(sdev->lib);
 #endif
 
+#ifdef HAVE_PIPE_LOADER_KMS
+   if (sdev->fd != -1)
+      close(sdev->fd);
+#endif
+
    FREE(sdev);
    *dev = NULL;
 }
diff --git a/src/gallium/auxiliary/util/u_box.h b/src/gallium/auxiliary/util/u_box.h
index 00f231d..55da21f 100644
--- a/src/gallium/auxiliary/util/u_box.h
+++ b/src/gallium/auxiliary/util/u_box.h
@@ -140,11 +140,15 @@ static inline void
 u_box_union_2d(struct pipe_box *dst,
                const struct pipe_box *a, const struct pipe_box *b)
 {
-   dst->x = MIN2(a->x, b->x);
-   dst->y = MIN2(a->y, b->y);
+   int x, y;
 
-   dst->width = MAX2(a->x + a->width, b->x + b->width) - dst->x;
-   dst->height = MAX2(a->y + a->height, b->y + b->height) - dst->y;
+   x = MIN2(a->x, b->x);
+   y = MIN2(a->y, b->y);
+
+   dst->width = MAX2(a->x + a->width, b->x + b->width) - x;
+   dst->height = MAX2(a->y + a->height, b->y + b->height) - y;
+   dst->x = x;
+   dst->y = y;
 }
 
 /* Aliasing of @dst permitted. */
@@ -152,13 +156,18 @@ static inline void
 u_box_union_3d(struct pipe_box *dst,
                const struct pipe_box *a, const struct pipe_box *b)
 {
-   dst->x = MIN2(a->x, b->x);
-   dst->y = MIN2(a->y, b->y);
-   dst->z = MIN2(a->z, b->z);
-
-   dst->width = MAX2(a->x + a->width, b->x + b->width) - dst->x;
-   dst->height = MAX2(a->y + a->height, b->y + b->height) - dst->y;
-   dst->depth = MAX2(a->z + a->depth, b->z + b->depth) - dst->z;
+   int x, y, z;
+
+   x = MIN2(a->x, b->x);
+   y = MIN2(a->y, b->y);
+   z = MIN2(a->z, b->z);
+
+   dst->width = MAX2(a->x + a->width, b->x + b->width) - x;
+   dst->height = MAX2(a->y + a->height, b->y + b->height) - y;
+   dst->depth = MAX2(a->z + a->depth, b->z + b->depth) - z;
+   dst->x = x;
+   dst->y = y;
+   dst->z = z;
 }
 
 static inline boolean
diff --git a/src/gallium/auxiliary/util/u_format_r11g11b10f.h b/src/gallium/auxiliary/util/u_format_r11g11b10f.h
index 218822b..074783a 100644
--- a/src/gallium/auxiliary/util/u_format_r11g11b10f.h
+++ b/src/gallium/auxiliary/util/u_format_r11g11b10f.h
@@ -194,7 +194,7 @@ static inline float uf10_to_f32(uint16_t val)
 
    if (exponent == 0) {
       if (mantissa != 0) {
-         const float scale = 1.0 / (1 << 20);
+         const float scale = 1.0 / (1 << 19);
          f32.f = scale * mantissa;
       }
    }
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 8916a96..55343e8 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -831,7 +831,7 @@ align(int value, int alignment)
 static inline uint64_t
 align64(uint64_t value, unsigned alignment)
 {
-   return (value + alignment - 1) & ~(alignment - 1);
+   return (value + alignment - 1) & ~((uint64_t)alignment - 1);
 }
 
 /**
diff --git a/src/gallium/auxiliary/util/u_viewport.h b/src/gallium/auxiliary/util/u_viewport.h
new file mode 100644
index 0000000..a731b34
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_viewport.h
@@ -0,0 +1,59 @@
+/**************************************************************************
+ *
+ * Copyright 2016 Ilia Mirkin.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef U_VIEWPORT_H
+#define U_VIEWPORT_H
+
+#include "c99_compat.h"
+#include "pipe/p_state.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static inline void
+util_viewport_zmin_zmax(const struct pipe_viewport_state *vp, bool halfz,
+                        float *zmin, float *zmax)
+{
+   float a, b;
+   if (halfz) {
+      a = vp->translate[2];
+      b = vp->translate[2] + vp->scale[2];
+   } else {
+      a = vp->translate[2] - vp->scale[2];
+      b = vp->translate[2] + vp->scale[2];
+   }
+
+   *zmin = a < b ? a : b;
+   *zmax = a < b ? b : a;
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/gallium/auxiliary/vl/vl_winsys_dri3.c b/src/gallium/auxiliary/vl/vl_winsys_dri3.c
index f7f572e..493e645 100644
--- a/src/gallium/auxiliary/vl/vl_winsys_dri3.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_dri3.c
@@ -89,6 +89,7 @@ dri3_free_front_buffer(struct vl_dri3_screen *scrn,
 {
    xcb_sync_destroy_fence(scrn->conn, buffer->sync_fence);
    xshmfence_unmap_shm(buffer->shm_fence);
+   pipe_resource_reference(&buffer->texture, NULL);
    FREE(buffer);
 }
 
diff --git a/src/gallium/auxiliary/vl/vl_winsys_drm.c b/src/gallium/auxiliary/vl/vl_winsys_drm.c
index 6a759ae..df8809c 100644
--- a/src/gallium/auxiliary/vl/vl_winsys_drm.c
+++ b/src/gallium/auxiliary/vl/vl_winsys_drm.c
@@ -26,6 +26,7 @@
  **************************************************************************/
 
 #include <assert.h>
+#include <fcntl.h>
 
 #include "pipe/p_screen.h"
 #include "pipe-loader/pipe_loader.h"
@@ -47,7 +48,7 @@ vl_drm_screen_create(int fd)
    if (!vscreen)
       return NULL;
 
-   if (fd < 0 || (new_fd = dup(fd)) < 0)
+   if (fd < 0 || (new_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3)) < 0)
       goto free_screen;
 
    if (pipe_loader_drm_probe_fd(&vscreen->dev, new_fd))
diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
index dcb6dfb..bf787d1 100644
--- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
+++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h
@@ -1472,7 +1472,7 @@ static inline uint32_t A3XX_RB_DEPTH_CONTROL_ZFUNC(enum adreno_compare_func val)
 {
 	return ((val) << A3XX_RB_DEPTH_CONTROL_ZFUNC__SHIFT) & A3XX_RB_DEPTH_CONTROL_ZFUNC__MASK;
 }
-#define A3XX_RB_DEPTH_CONTROL_BF_ENABLE				0x00000080
+#define A3XX_RB_DEPTH_CONTROL_Z_CLAMP_ENABLE			0x00000080
 #define A3XX_RB_DEPTH_CONTROL_Z_TEST_ENABLE			0x80000000
 
 #define REG_A3XX_RB_DEPTH_CLEAR					0x00002101
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
index 6723941..51fe4bc 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c
@@ -158,6 +158,9 @@ fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info)
 		.sprite_coord_mode = ctx->rasterizer->sprite_coord_mode,
 	};
 
+	if (fd3_needs_manual_clipping(ctx->prog.vp, ctx->rasterizer))
+		emit.key.ucp_enables = ctx->rasterizer->clip_plane_enable;
+
 	fixup_shader_state(ctx, &emit.key);
 
 	unsigned dirty = ctx->dirty;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
index 4a5242a..45bab58 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c
@@ -31,6 +31,7 @@
 #include "util/u_memory.h"
 #include "util/u_helpers.h"
 #include "util/u_format.h"
+#include "util/u_viewport.h"
 
 #include "freedreno_resource.h"
 #include "freedreno_query_hw.h"
@@ -529,7 +530,7 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 				A3XX_RB_STENCILREFMASK_BF_STENCILREF(sr->ref_value[1]));
 	}
 
-	if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) {
+	if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) {
 		uint32_t val = fd3_zsa_stateobj(ctx->zsa)->rb_depth_control;
 		if (fp->writes_pos) {
 			val |= A3XX_RB_DEPTH_CONTROL_FRAG_WRITES_Z;
@@ -538,6 +539,9 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		if (fp->has_kill) {
 			val |= A3XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE;
 		}
+		if (!ctx->rasterizer->depth_clip) {
+			val |= A3XX_RB_DEPTH_CONTROL_Z_CLAMP_ENABLE;
+		}
 		OUT_PKT0(ring, REG_A3XX_RB_DEPTH_CONTROL, 1);
 		OUT_RING(ring, val);
 	}
@@ -561,20 +565,24 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 	if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) {
 		uint32_t val = fd3_rasterizer_stateobj(ctx->rasterizer)
 				->gras_cl_clip_cntl;
+		uint8_t planes = ctx->rasterizer->clip_plane_enable;
 		val |= COND(fp->writes_pos, A3XX_GRAS_CL_CLIP_CNTL_ZCLIP_DISABLE);
 		val |= COND(fp->frag_coord, A3XX_GRAS_CL_CLIP_CNTL_ZCOORD |
 				A3XX_GRAS_CL_CLIP_CNTL_WCOORD);
-		/* TODO only use if prog doesn't use clipvertex/clipdist */
-		val |= A3XX_GRAS_CL_CLIP_CNTL_NUM_USER_CLIP_PLANES(
-				MIN2(util_bitcount(ctx->rasterizer->clip_plane_enable), 6));
+		if (!emit->key.ucp_enables)
+			val |= A3XX_GRAS_CL_CLIP_CNTL_NUM_USER_CLIP_PLANES(
+					MIN2(util_bitcount(planes), 6));
 		OUT_PKT0(ring, REG_A3XX_GRAS_CL_CLIP_CNTL, 1);
 		OUT_RING(ring, val);
 	}
 
-	if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_UCP)) {
+	if (dirty & (FD_DIRTY_RASTERIZER | FD_DIRTY_PROG | FD_DIRTY_UCP)) {
 		uint32_t planes = ctx->rasterizer->clip_plane_enable;
 		int count = 0;
 
+		if (emit->key.ucp_enables)
+			planes = 0;
+
 		while (planes && count < 6) {
 			int i = ffs(planes) - 1;
 
@@ -615,19 +623,35 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		OUT_RING(ring, val);
 	}
 
-	if (dirty & FD_DIRTY_SCISSOR) {
+	if (dirty & (FD_DIRTY_SCISSOR | FD_DIRTY_RASTERIZER | FD_DIRTY_VIEWPORT)) {
 		struct pipe_scissor_state *scissor = fd_context_get_scissor(ctx);
+		int minx = scissor->minx;
+		int miny = scissor->miny;
+		int maxx = scissor->maxx;
+		int maxy = scissor->maxy;
+
+		/* Unfortunately there is no separate depth clip disable, only an all
+		 * or nothing deal. So when we disable clipping, we must handle the
+		 * viewport clip via scissors.
+		 */
+		if (!ctx->rasterizer->depth_clip) {
+			struct pipe_viewport_state *vp = &ctx->viewport;
+			minx = MAX2(minx, (int)floorf(vp->translate[0] - fabsf(vp->scale[0])));
+			miny = MAX2(miny, (int)floorf(vp->translate[1] - fabsf(vp->scale[1])));
+			maxx = MIN2(maxx, (int)ceilf(vp->translate[0] + fabsf(vp->scale[0])));
+			maxy = MIN2(maxy, (int)ceilf(vp->translate[1] + fabsf(vp->scale[1])));
+		}
 
 		OUT_PKT0(ring, REG_A3XX_GRAS_SC_WINDOW_SCISSOR_TL, 2);
-		OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(scissor->minx) |
-				A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(scissor->miny));
-		OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(scissor->maxx - 1) |
-				A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(scissor->maxy - 1));
-
-		ctx->max_scissor.minx = MIN2(ctx->max_scissor.minx, scissor->minx);
-		ctx->max_scissor.miny = MIN2(ctx->max_scissor.miny, scissor->miny);
-		ctx->max_scissor.maxx = MAX2(ctx->max_scissor.maxx, scissor->maxx);
-		ctx->max_scissor.maxy = MAX2(ctx->max_scissor.maxy, scissor->maxy);
+		OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_TL_X(minx) |
+				A3XX_GRAS_SC_WINDOW_SCISSOR_TL_Y(miny));
+		OUT_RING(ring, A3XX_GRAS_SC_WINDOW_SCISSOR_BR_X(maxx - 1) |
+				A3XX_GRAS_SC_WINDOW_SCISSOR_BR_Y(maxy - 1));
+
+		ctx->max_scissor.minx = MIN2(ctx->max_scissor.minx, minx);
+		ctx->max_scissor.miny = MIN2(ctx->max_scissor.miny, miny);
+		ctx->max_scissor.maxx = MAX2(ctx->max_scissor.maxx, maxx);
+		ctx->max_scissor.maxy = MAX2(ctx->max_scissor.maxy, maxy);
 	}
 
 	if (dirty & FD_DIRTY_VIEWPORT) {
@@ -641,6 +665,30 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		OUT_RING(ring, A3XX_GRAS_CL_VPORT_ZSCALE(ctx->viewport.scale[2]));
 	}
 
+	if (dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER | FD_DIRTY_FRAMEBUFFER)) {
+		float zmin, zmax;
+		int depth = 24;
+		if (ctx->framebuffer.zsbuf) {
+			depth = util_format_get_component_bits(
+					pipe_surface_format(ctx->framebuffer.zsbuf),
+					UTIL_FORMAT_COLORSPACE_ZS, 0);
+		}
+		util_viewport_zmin_zmax(&ctx->viewport, ctx->rasterizer->clip_halfz,
+								&zmin, &zmax);
+
+		OUT_PKT0(ring, REG_A3XX_RB_Z_CLAMP_MIN, 2);
+		if (depth == 32) {
+			OUT_RING(ring, (uint32_t)(zmin * 0xffffffff));
+			OUT_RING(ring, (uint32_t)(zmax * 0xffffffff));
+		} else if (depth == 16) {
+			OUT_RING(ring, (uint32_t)(zmin * 0xffff));
+			OUT_RING(ring, (uint32_t)(zmax * 0xffff));
+		} else {
+			OUT_RING(ring, (uint32_t)(zmin * 0xffffff));
+			OUT_RING(ring, (uint32_t)(zmax * 0xffffff));
+		}
+	}
+
 	if (dirty & (FD_DIRTY_PROG | FD_DIRTY_FRAMEBUFFER | FD_DIRTY_BLEND_DUAL)) {
 		struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 		int nr_cbufs = pfb->nr_cbufs;
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
index 8152f8f..e9059ce 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c
@@ -28,6 +28,7 @@
 
 #include "pipe/p_state.h"
 #include "util/u_string.h"
+#include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
 #include "util/u_format.h"
@@ -85,6 +86,20 @@ fd3_vp_state_delete(struct pipe_context *pctx, void *hwcso)
 	delete_shader_stateobj(so);
 }
 
+bool
+fd3_needs_manual_clipping(const struct fd3_shader_stateobj *so,
+						  const struct pipe_rasterizer_state *rast)
+{
+	uint64_t outputs = ir3_shader_outputs(so->shader);
+
+	return (!rast->depth_clip ||
+			util_bitcount(rast->clip_plane_enable) > 6 ||
+			outputs & ((1ULL << VARYING_SLOT_CLIP_VERTEX) |
+					   (1ULL << VARYING_SLOT_CLIP_DIST0) |
+					   (1ULL << VARYING_SLOT_CLIP_DIST1)));
+}
+
+
 static void
 emit_shader(struct fd_ringbuffer *ring, const struct ir3_shader_variant *so)
 {
diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.h b/src/gallium/drivers/freedreno/a3xx/fd3_program.h
index b3fcc0c..b95df4c 100644
--- a/src/gallium/drivers/freedreno/a3xx/fd3_program.h
+++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.h
@@ -44,4 +44,7 @@ void fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit,
 
 void fd3_prog_init(struct pipe_context *pctx);
 
+bool fd3_needs_manual_clipping(const struct fd3_shader_stateobj *,
+							   const struct pipe_rasterizer_state *);
+
 #endif /* FD3_PROGRAM_H_ */
diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
index d9a7bb5..aeb61e7 100644
--- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
+++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h
@@ -1376,7 +1376,7 @@ static inline uint32_t A4XX_RB_DEPTH_CONTROL_ZFUNC(enum adreno_compare_func val)
 {
 	return ((val) << A4XX_RB_DEPTH_CONTROL_ZFUNC__SHIFT) & A4XX_RB_DEPTH_CONTROL_ZFUNC__MASK;
 }
-#define A4XX_RB_DEPTH_CONTROL_BF_ENABLE				0x00000080
+#define A4XX_RB_DEPTH_CONTROL_Z_CLAMP_ENABLE			0x00000080
 #define A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE			0x00010000
 #define A4XX_RB_DEPTH_CONTROL_FORCE_FRAGZ_TO_FS			0x00020000
 #define A4XX_RB_DEPTH_CONTROL_Z_TEST_ENABLE			0x80000000
@@ -3145,6 +3145,8 @@ static inline uint32_t A4XX_TPL1_TP_TEX_COUNT_GS(uint32_t val)
 
 #define REG_A4XX_GRAS_CL_CLIP_CNTL				0x00002000
 #define A4XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE			0x00008000
+#define A4XX_GRAS_CL_CLIP_CNTL_ZNEAR_CLIP_DISABLE		0x00010000
+#define A4XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE		0x00020000
 #define A4XX_GRAS_CL_CLIP_CNTL_ZERO_GB_SCALE_Z			0x00400000
 
 #define REG_A4XX_GRAS_CLEAR_CNTL				0x00002003
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
index 00e985d..8b350ae 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c
@@ -31,6 +31,7 @@
 #include "util/u_memory.h"
 #include "util/u_helpers.h"
 #include "util/u_format.h"
+#include "util/u_viewport.h"
 
 #include "freedreno_resource.h"
 #include "freedreno_query_hw.h"
@@ -544,12 +545,14 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 				A4XX_RB_STENCILREFMASK_BF_STENCILREF(sr->ref_value[1]));
 	}
 
-	if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) {
+	if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_RASTERIZER | FD_DIRTY_PROG)) {
 		struct fd4_zsa_stateobj *zsa = fd4_zsa_stateobj(ctx->zsa);
 		bool fragz = fp->has_kill | fp->writes_pos;
+		bool clamp = !ctx->rasterizer->depth_clip;
 
 		OUT_PKT0(ring, REG_A4XX_RB_DEPTH_CONTROL, 1);
 		OUT_RING(ring, zsa->rb_depth_control |
+				COND(clamp, A4XX_RB_DEPTH_CONTROL_Z_CLAMP_ENABLE) |
 				COND(fragz, A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE) |
 				COND(fragz && fp->frag_coord, A4XX_RB_DEPTH_CONTROL_FORCE_FRAGZ_TO_FS));
 
@@ -636,6 +639,30 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring,
 		OUT_RING(ring, A4XX_GRAS_CL_VPORT_ZSCALE_0(ctx->viewport.scale[2]));
 	}
 
+	if (dirty & (FD_DIRTY_VIEWPORT | FD_DIRTY_RASTERIZER | FD_DIRTY_FRAMEBUFFER)) {
+		float zmin, zmax;
+		int depth = 24;
+		if (ctx->framebuffer.zsbuf) {
+			depth = util_format_get_component_bits(
+					pipe_surface_format(ctx->framebuffer.zsbuf),
+					UTIL_FORMAT_COLORSPACE_ZS, 0);
+		}
+		util_viewport_zmin_zmax(&ctx->viewport, ctx->rasterizer->clip_halfz,
+								&zmin, &zmax);
+
+		OUT_PKT0(ring, REG_A4XX_RB_VPORT_Z_CLAMP(0), 2);
+		if (depth == 32) {
+			OUT_RING(ring, fui(zmin));
+			OUT_RING(ring, fui(zmax));
+		} else if (depth == 16) {
+			OUT_RING(ring, (uint32_t)(zmin * 0xffff));
+			OUT_RING(ring, (uint32_t)(zmax * 0xffff));
+		} else {
+			OUT_RING(ring, (uint32_t)(zmin * 0xffffff));
+			OUT_RING(ring, (uint32_t)(zmax * 0xffffff));
+		}
+	}
+
 	if (dirty & (FD_DIRTY_PROG | FD_DIRTY_FRAMEBUFFER)) {
 		struct pipe_framebuffer_state *pfb = &ctx->framebuffer;
 		unsigned n = pfb->nr_cbufs;
diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c
index 7456c63..b3a4292 100644
--- a/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c
+++ b/src/gallium/drivers/freedreno/a4xx/fd4_rasterizer.c
@@ -98,7 +98,8 @@ fd4_rasterizer_state_create(struct pipe_context *pctx,
 		so->gras_su_mode_control |= A4XX_GRAS_SU_MODE_CONTROL_POLY_OFFSET;
 
 	if (!cso->depth_clip)
-		so->gras_cl_clip_cntl |= A4XX_GRAS_CL_CLIP_CNTL_CLIP_DISABLE;
+		so->gras_cl_clip_cntl |= A4XX_GRAS_CL_CLIP_CNTL_ZNEAR_CLIP_DISABLE |
+			A4XX_GRAS_CL_CLIP_CNTL_ZFAR_CLIP_DISABLE;
 	if (cso->clip_halfz)
 		so->gras_cl_clip_cntl |= A4XX_GRAS_CL_CLIP_CNTL_ZERO_GB_SCALE_Z;
 
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
index ee0018f..fc423ec 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c
@@ -469,6 +469,12 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin)
 	debug_printf("\n");
 }
 
+uint64_t
+ir3_shader_outputs(const struct ir3_shader *so)
+{
+	return so->nir->info.outputs_written;
+}
+
 /* This has to reach into the fd_context a bit more than the rest of
  * ir3, but it needs to be aligned with the compiler, so both agree
  * on which const regs hold what.  And the logic is identical between
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
index c17a76b..f430b6b 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h
+++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h
@@ -272,6 +272,7 @@ void ir3_shader_destroy(struct ir3_shader *shader);
 struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader,
 		struct ir3_shader_key key, struct pipe_debug_callback *debug);
 void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin);
+uint64_t ir3_shader_outputs(const struct ir3_shader *so);
 
 struct fd_ringbuffer;
 struct fd_context;
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
index 80e0990..3a23a9a 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp
@@ -726,7 +726,7 @@ void
 CodeEmitterGK110::emitIMAD(const Instruction *i)
 {
    uint8_t addOp =
-      (i->src(2).mod.neg() << 1) | (i->src(0).mod.neg() ^ i->src(1).mod.neg());
+      i->src(2).mod.neg() | ((i->src(0).mod.neg() ^ i->src(1).mod.neg()) << 1);
 
    emitForm_21(i, 0x100, 0xa00);
 
@@ -773,7 +773,7 @@ CodeEmitterGK110::emitNOT(const Instruction *i)
       break;
    case FILE_MEMORY_CONST:
       code[1] |= 0x4 << 28;
-      setCAddress14(i->src(1));
+      setCAddress14(i->src(0));
       break;
    default:
       assert(0);
@@ -1321,15 +1321,12 @@ void
 CodeEmitterGK110::emitQUADOP(const Instruction *i, uint8_t qOp, uint8_t laneMask)
 {
    code[0] = 0x00000002 | ((qOp & 1) << 31);
-   code[1] = 0x7fc00000 | (qOp >> 1) | (laneMask << 12);
+   code[1] = 0x7fc00200 | (qOp >> 1) | (laneMask << 12); // dall
 
    defId(i->def(0), 2);
    srcId(i->src(0), 10);
    srcId((i->srcExists(1) && i->predSrc != 1) ? i->src(1) : i->src(0), 23);
 
-   if (i->op == OP_QUADOP && progType != Program::TYPE_FRAGMENT)
-      code[1] |= 1 << 9; // dall
-
    emitPredicate(i);
 }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
index e62d807..d3e1708 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp
@@ -1682,7 +1682,7 @@ CodeEmitterGM107::emitNOT()
 void
 CodeEmitterGM107::emitIADD()
 {
-   if (!longIMMD(insn->src(1))) {
+   if (insn->src(1).getFile() != FILE_IMMEDIATE) {
       switch (insn->src(1).getFile()) {
       case FILE_GPR:
          emitInsn(0x5c100000);
@@ -1707,6 +1707,7 @@ CodeEmitterGM107::emitIADD()
       emitX  (0x2b);
    } else {
       emitInsn(0x1c000000);
+      emitNEG (0x38, insn->src(0));
       emitSAT (0x36);
       emitX   (0x35);
       emitCC  (0x34);
@@ -2300,6 +2301,7 @@ CodeEmitterGM107::emitAL2P()
 {
    emitInsn (0xefa00000);
    emitField(0x2f, 2, (insn->getDef(0)->reg.size / 4) - 1);
+   emitPRED (0x2c);
    emitO    (0x20);
    emitField(0x14, 11, insn->src(0).get()->reg.data.offset);
    emitGPR  (0x08, insn->src(0).getIndirect(0));
@@ -2523,7 +2525,7 @@ CodeEmitterGM107::emitTEX()
 
    if (insn->tex.rIndirectSrc >= 0) {
       emitInsn (0xdeb80000);
-      emitField(0x35, 2, lodm);
+      emitField(0x25, 2, lodm);
       emitField(0x24, 1, insn->tex.useOffsets == 1);
    } else {
       emitInsn (0xc0380000);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
index 5d68e99..ca10848 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nv50.cpp
@@ -2112,7 +2112,7 @@ makeInstructionLong(Instruction *insn)
    insn->encSize = 8;
 
    for (int i = fn->bbCount - 1; i >= 0 && fn->bbArray[i] != insn->bb; --i) {
-      fn->bbArray[i]->binPos += 4;
+      fn->bbArray[i]->binPos += adj;
    }
    fn->binSize += adj;
    insn->bb->binSize += adj;
@@ -2164,9 +2164,16 @@ replaceExitWithModifier(Function *func)
             return;
       }
    }
-   epilogue->binSize -= 8;
-   func->binSize -= 8;
+
+   int adj = epilogue->getExit()->encSize;
+   epilogue->binSize -= adj;
+   func->binSize -= adj;
    delete_Instruction(func->getProgram(), epilogue->getExit());
+
+   // There may be BB's that are laid out after the exit block
+   for (int i = func->bbCount - 1; i >= 0 && func->bbArray[i] != epilogue; --i) {
+      func->bbArray[i]->binPos -= adj;
+   }
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index bc94285..be5ee4f 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -736,9 +736,15 @@ CodeEmitterNVC0::emitUADD(const Instruction *i)
 void
 CodeEmitterNVC0::emitIMAD(const Instruction *i)
 {
+   uint8_t addOp =
+      i->src(2).mod.neg() | ((i->src(0).mod.neg() ^ i->src(1).mod.neg()) << 1);
+
    assert(i->encSize == 8);
    emitForm_A(i, HEX64(20000000, 00000003));
 
+   assert(addOp != 3);
+   code[0] |= addOp << 8;
+
    if (isSignedType(i->dType))
       code[0] |= 1 << 7;
    if (isSignedType(i->sType))
@@ -749,10 +755,6 @@ CodeEmitterNVC0::emitIMAD(const Instruction *i)
    if (i->flagsDef >= 0) code[1] |= 1 << 16;
    if (i->flagsSrc >= 0) code[1] |= 1 << 23;
 
-   if (i->src(2).mod.neg()) code[0] |= 0x10;
-   if (i->src(1).mod.neg() ^
-       i->src(0).mod.neg()) code[0] |= 0x20;
-
    if (i->subOp == NV50_IR_SUBOP_MUL_HIGH)
       code[0] |= 1 << 6;
 }
@@ -1356,16 +1358,13 @@ CodeEmitterNVC0::emitTXQ(const TexInstruction *i)
 void
 CodeEmitterNVC0::emitQUADOP(const Instruction *i, uint8_t qOp, uint8_t laneMask)
 {
-   code[0] = 0x00000000 | (laneMask << 6);
+   code[0] = 0x00000200 | (laneMask << 6); // dall
    code[1] = 0x48000000 | qOp;
 
    defId(i->def(0), 14);
    srcId(i->src(0), 20);
    srcId((i->srcExists(1) && i->predSrc != 1) ? i->src(1) : i->src(0), 26);
 
-   if (i->op == OP_QUADOP && progType != Program::TYPE_FRAGMENT)
-      code[0] |= 1 << 9; // dall
-
    emitPredicate(i);
 }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index beb7b53..899a5cd 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -182,6 +182,7 @@ public:
 
    // mask of used components of source s
    unsigned int srcMask(unsigned int s) const;
+   unsigned int texOffsetMask() const;
 
    SrcRegister getSrc(unsigned int s) const
    {
@@ -234,6 +235,35 @@ private:
    const struct tgsi_full_instruction *insn;
 };
 
+unsigned int Instruction::texOffsetMask() const
+{
+   const struct tgsi_instruction_texture *tex = &insn->Texture;
+   assert(insn->Instruction.Texture);
+
+   switch (tex->Texture) {
+   case TGSI_TEXTURE_BUFFER:
+   case TGSI_TEXTURE_1D:
+   case TGSI_TEXTURE_SHADOW1D:
+   case TGSI_TEXTURE_1D_ARRAY:
+   case TGSI_TEXTURE_SHADOW1D_ARRAY:
+      return 0x1;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_2D_ARRAY:
+   case TGSI_TEXTURE_SHADOW2D_ARRAY:
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_SHADOWRECT:
+   case TGSI_TEXTURE_2D_MSAA:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      return 0x3;
+   case TGSI_TEXTURE_3D:
+      return 0x7;
+   default:
+      assert(!"Unexpected texture target");
+      return 0xf;
+   }
+}
+
 unsigned int Instruction::srcMask(unsigned int s) const
 {
    unsigned int mask = insn->Dst[0].Register.WriteMask;
@@ -942,6 +972,9 @@ private:
    int inferSysValDirection(unsigned sn) const;
    bool scanDeclaration(const struct tgsi_full_declaration *);
    bool scanInstruction(const struct tgsi_full_instruction *);
+   void scanInstructionSrc(const Instruction& insn,
+                           const Instruction::SrcRegister& src,
+                           unsigned mask);
    void scanProperty(const struct tgsi_full_property *);
    void scanImmediate(const struct tgsi_full_immediate *);
 
@@ -1351,6 +1384,61 @@ inline bool Source::isEdgeFlagPassthrough(const Instruction& insn) const
       insn.getSrc(0).getFile() == TGSI_FILE_INPUT;
 }
 
+void Source::scanInstructionSrc(const Instruction& insn,
+                                const Instruction::SrcRegister& src,
+                                unsigned mask)
+{
+   if (src.getFile() == TGSI_FILE_TEMPORARY) {
+      if (src.isIndirect(0))
+         indirectTempArrays.insert(src.getArrayId());
+   } else
+   if (src.getFile() == TGSI_FILE_BUFFER ||
+       src.getFile() == TGSI_FILE_IMAGE ||
+       (src.getFile() == TGSI_FILE_MEMORY &&
+        memoryFiles[src.getIndex(0)].mem_type == TGSI_MEMORY_TYPE_GLOBAL)) {
+      info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
+         0x1 : 0x2;
+   } else
+   if (src.getFile() == TGSI_FILE_OUTPUT) {
+      if (src.isIndirect(0)) {
+         // We don't know which one is accessed, just mark everything for
+         // reading. This is an extremely unlikely occurrence.
+         for (unsigned i = 0; i < info->numOutputs; ++i)
+            info->out[i].oread = 1;
+      } else {
+         info->out[src.getIndex(0)].oread = 1;
+      }
+   }
+   if (src.getFile() != TGSI_FILE_INPUT)
+      return;
+
+   if (src.isIndirect(0)) {
+      for (unsigned i = 0; i < info->numInputs; ++i)
+         info->in[i].mask = 0xf;
+   } else {
+      const int i = src.getIndex(0);
+      for (unsigned c = 0; c < 4; ++c) {
+         if (!(mask & (1 << c)))
+            continue;
+         int k = src.getSwizzle(c);
+         if (k <= TGSI_SWIZZLE_W)
+            info->in[i].mask |= 1 << k;
+      }
+      switch (info->in[i].sn) {
+      case TGSI_SEMANTIC_PSIZE:
+      case TGSI_SEMANTIC_PRIMID:
+      case TGSI_SEMANTIC_FOG:
+         info->in[i].mask &= 0x1;
+         break;
+      case TGSI_SEMANTIC_PCOORD:
+         info->in[i].mask &= 0x3;
+         break;
+      default:
+         break;
+      }
+   }
+}
+
 bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
 {
    Instruction insn(inst);
@@ -1383,66 +1471,19 @@ bool Source::scanInstruction(const struct tgsi_full_instruction *inst)
             indirectTempArrays.insert(dst.getArrayId());
       } else
       if (dst.getFile() == TGSI_FILE_BUFFER ||
-          dst.getFile() == TGSI_FILE_IMAGE || 
+          dst.getFile() == TGSI_FILE_IMAGE ||
           (dst.getFile() == TGSI_FILE_MEMORY &&
            memoryFiles[dst.getIndex(0)].mem_type == TGSI_MEMORY_TYPE_GLOBAL)) {
          info->io.globalAccess |= 0x2;
       }
    }
 
-   for (unsigned s = 0; s < insn.srcCount(); ++s) {
-      Instruction::SrcRegister src = insn.getSrc(s);
-      if (src.getFile() == TGSI_FILE_TEMPORARY) {
-         if (src.isIndirect(0))
-            indirectTempArrays.insert(src.getArrayId());
-      } else
-      if (src.getFile() == TGSI_FILE_BUFFER ||
-          src.getFile() == TGSI_FILE_IMAGE ||
-          (src.getFile() == TGSI_FILE_MEMORY &&
-           memoryFiles[src.getIndex(0)].mem_type == TGSI_MEMORY_TYPE_GLOBAL)) {
-         info->io.globalAccess |= (insn.getOpcode() == TGSI_OPCODE_LOAD) ?
-               0x1 : 0x2;
-      } else
-      if (src.getFile() == TGSI_FILE_OUTPUT) {
-         if (src.isIndirect(0)) {
-            // We don't know which one is accessed, just mark everything for
-            // reading. This is an extremely unlikely occurrence.
-            for (unsigned i = 0; i < info->numOutputs; ++i)
-               info->out[i].oread = 1;
-         } else {
-            info->out[src.getIndex(0)].oread = 1;
-         }
-      }
-      if (src.getFile() != TGSI_FILE_INPUT)
-         continue;
-      unsigned mask = insn.srcMask(s);
+   for (unsigned s = 0; s < insn.srcCount(); ++s)
+      scanInstructionSrc(insn, insn.getSrc(s), insn.srcMask(s));
+
+   for (unsigned s = 0; s < insn.getNumTexOffsets(); ++s)
+      scanInstructionSrc(insn, insn.getTexOffset(s), insn.texOffsetMask());
 
-      if (src.isIndirect(0)) {
-         for (unsigned i = 0; i < info->numInputs; ++i)
-            info->in[i].mask = 0xf;
-      } else {
-         const int i = src.getIndex(0);
-         for (unsigned c = 0; c < 4; ++c) {
-            if (!(mask & (1 << c)))
-               continue;
-            int k = src.getSwizzle(c);
-            if (k <= TGSI_SWIZZLE_W)
-               info->in[i].mask |= 1 << k;
-         }
-         switch (info->in[i].sn) {
-         case TGSI_SEMANTIC_PSIZE:
-         case TGSI_SEMANTIC_PRIMID:
-         case TGSI_SEMANTIC_FOG:
-            info->in[i].mask &= 0x1;
-            break;
-         case TGSI_SEMANTIC_PCOORD:
-            info->in[i].mask &= 0x3;
-            break;
-         default:
-            break;
-         }
-      }
-   }
    return true;
 }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
index 23414d5..b1076cf 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_graph.cpp
@@ -287,7 +287,10 @@ private:
 
       bb.push(node);
 
-      while (bb.getSize()) {
+      while (bb.getSize() || cross.getSize()) {
+         if (bb.getSize() == 0)
+            cross.moveTo(bb);
+
          node = reinterpret_cast<Graph::Node *>(bb.pop().u.p);
          assert(node);
          if (!node->visit(sequence))
@@ -314,9 +317,6 @@ private:
             }
          }
          nodes[count++] = node;
-
-         if (bb.getSize() == 0)
-            cross.moveTo(bb);
       }
    }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index 6a6b44c..4e60b1c 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -750,6 +750,16 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
          i->tex.rIndirectSrc = 0;
          i->tex.sIndirectSrc = -1;
       }
+      // Move the indirect reference to right after the coords
+      else if (i->tex.rIndirectSrc >= 0 && chipset >= NVISA_GM107_CHIPSET) {
+         Value *hnd = i->getIndirectR();
+
+         i->setIndirectR(NULL);
+         i->moveSources(arg, 1);
+         i->setSrc(arg, hnd);
+         i->tex.rIndirectSrc = 0;
+         i->tex.sIndirectSrc = -1;
+      }
    } else
    // (nvc0) generate and move the tsc/tic/array source to the front
    if (i->tex.target.isArray() || i->tex.rIndirectSrc >= 0 || i->tex.sIndirectSrc >= 0) {
@@ -823,7 +833,7 @@ NVC0LoweringPass::handleTEX(TexInstruction *i)
          for (n = 0; n < i->tex.useOffsets; n++) {
             for (c = 0; c < 2; ++c) {
                if ((n % 2) == 0 && c == 0)
-                  offs[n / 2] = i->offset[n][c].get();
+                  bld.mkMov(offs[n / 2] = bld.getScratch(), i->offset[n][c].get());
                else
                   bld.mkOp3(OP_INSBF, TYPE_U32,
                             offs[n / 2],
@@ -2056,6 +2066,13 @@ NVC0LoweringPass::processSurfaceCoordsNVC0(TexInstruction *su)
       base = 0;
    }
 
+   if (ind) {
+      Value *ptr;
+      ptr = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), ind, bld.mkImm(su->tex.r));
+      ptr = bld.mkOp2v(OP_AND, TYPE_U32, bld.getSSA(), ptr, bld.mkImm(7));
+      su->setIndirectR(ptr);
+   }
+
    // get surface coordinates
    for (c = 0; c < arg; ++c)
       src[c] = su->getSrc(c);
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index 6b52d7b..bf260bb 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -1902,8 +1902,10 @@ GCRA::resolveSplitsAndMerges()
          // their registers should be identical.
          if (v->getInsn()->op == OP_PHI || v->getInsn()->op == OP_UNION) {
             Instruction *phi = v->getInsn();
-            for (int phis = 0; phi->srcExists(phis); ++phis)
+            for (int phis = 0; phi->srcExists(phis); ++phis) {
                phi->getSrc(phis)->join = v;
+               phi->getSrc(phis)->reg.data.id = v->reg.data.id;
+            }
          }
          reg += v->reg.size;
       }
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_draw.c b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
index 7b0d074..1c71534 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_draw.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_draw.c
@@ -127,6 +127,8 @@ nv30_render_draw_elements(struct vbuf_render *render,
    struct nouveau_pushbuf *push = nv30->screen->base.pushbuf;
    unsigned i;
 
+   pipe_mutex_lock(nv30->screen->base.push_mutex);
+
    BEGIN_NV04(push, NV30_3D(VTXBUF(0)), r->vertex_info.num_attribs);
    for (i = 0; i < r->vertex_info.num_attribs; i++) {
       PUSH_RESRC(push, NV30_3D(VTXBUF(i)), BUFCTX_VTXTMP,
@@ -134,8 +136,10 @@ nv30_render_draw_elements(struct vbuf_render *render,
                        NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1);
    }
 
-   if (!nv30_state_validate(nv30, ~0, false))
+   if (!nv30_state_validate(nv30, ~0, false)) {
+      pipe_mutex_unlock(nv30->screen->base.push_mutex);
       return;
+   }
 
    BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
    PUSH_DATA (push, r->prim);
@@ -160,6 +164,8 @@ nv30_render_draw_elements(struct vbuf_render *render,
    BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
    PUSH_DATA (push, NV30_3D_VERTEX_BEGIN_END_STOP);
    PUSH_RESET(push, BUFCTX_VTXTMP);
+
+   pipe_mutex_unlock(nv30->screen->base.push_mutex);
 }
 
 static void
@@ -172,6 +178,8 @@ nv30_render_draw_arrays(struct vbuf_render *render, unsigned start, uint nr)
    unsigned ps = fn + (pn ? 1 : 0);
    unsigned i;
 
+   pipe_mutex_lock(nv30->screen->base.push_mutex);
+
    BEGIN_NV04(push, NV30_3D(VTXBUF(0)), r->vertex_info.num_attribs);
    for (i = 0; i < r->vertex_info.num_attribs; i++) {
       PUSH_RESRC(push, NV30_3D(VTXBUF(i)), BUFCTX_VTXTMP,
@@ -179,8 +187,10 @@ nv30_render_draw_arrays(struct vbuf_render *render, unsigned start, uint nr)
                        NOUVEAU_BO_LOW | NOUVEAU_BO_RD, 0, NV30_3D_VTXBUF_DMA1);
    }
 
-   if (!nv30_state_validate(nv30, ~0, false))
+   if (!nv30_state_validate(nv30, ~0, false)) {
+      pipe_mutex_unlock(nv30->screen->base.push_mutex);
       return;
+   }
 
    BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
    PUSH_DATA (push, r->prim);
@@ -197,6 +207,8 @@ nv30_render_draw_arrays(struct vbuf_render *render, unsigned start, uint nr)
    BEGIN_NV04(push, NV30_3D(VERTEX_BEGIN_END), 1);
    PUSH_DATA (push, NV30_3D_VERTEX_BEGIN_END_STOP);
    PUSH_RESET(push, BUFCTX_VTXTMP);
+
+   pipe_mutex_unlock(nv30->screen->base.push_mutex);
 }
 
 static void
@@ -383,6 +395,8 @@ nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
 
    nv30_render_validate(nv30);
 
+   pipe_mutex_unlock(nv30->screen->base.push_mutex);
+
    if (nv30->draw_dirty & NV30_NEW_VIEWPORT)
       draw_set_viewport_states(draw, 0, 1, &nv30->viewport);
    if (nv30->draw_dirty & NV30_NEW_RASTERIZER)
@@ -448,6 +462,8 @@ nv30_render_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
       if (transfer[i])
          pipe_buffer_unmap(pipe, transfer[i]);
 
+   pipe_mutex_lock(nv30->screen->base.push_mutex);
+
    nv30->draw_dirty = 0;
    nv30_state_release(nv30);
 }
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
index 6de61bc..fd21f99 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_fragprog.c
@@ -38,6 +38,8 @@ nv30_fragprog_upload(struct nv30_context *nv30)
    struct nv30_fragprog *fp = nv30->fragprog.program;
    struct pipe_context *pipe = &nv30->base.pipe;
 
+   pipe_mutex_unlock(nv->screen->push_mutex);
+
    if (unlikely(!fp->buffer))
       fp->buffer = pipe_buffer_create(pipe->screen, 0, 0, fp->insn_len * 4);
 
@@ -60,6 +62,8 @@ nv30_fragprog_upload(struct nv30_context *nv30)
 
    if (nv04_resource(fp->buffer)->domain != NOUVEAU_BO_VRAM)
       nouveau_buffer_migrate(nv, nv04_resource(fp->buffer), NOUVEAU_BO_VRAM);
+
+   pipe_mutex_lock(nv->screen->push_mutex);
 }
 
 void
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_state.c b/src/gallium/drivers/nouveau/nv30/nv30_state.c
index fd604c2..43ecaac 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_state.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_state.c
@@ -379,8 +379,9 @@ nv30_set_framebuffer_state(struct pipe_context *pipe,
        struct nv30_miptree *zeta_mt = nv30_miptree(fb->zsbuf->texture);
 
        if (color_mt->swizzled != zeta_mt->swizzled ||
-           (util_format_get_blocksize(fb->zsbuf->format) > 2) !=
-           (util_format_get_blocksize(fb->cbufs[0]->format) > 2)) {
+           (color_mt->swizzled &&
+            (util_format_get_blocksize(fb->zsbuf->format) > 2) !=
+            (util_format_get_blocksize(fb->cbufs[0]->format) > 2))) {
           nv30->framebuffer.zsbuf = NULL;
           debug_printf("Mismatched color and zeta formats, ignoring zeta.\n");
        }
diff --git a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
index 9ecbcd1..24fa3bb 100644
--- a/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
+++ b/src/gallium/drivers/nouveau/nv30/nv30_transfer.c
@@ -115,7 +115,8 @@ nv30_transfer_rect_fragprog(struct nv30_context *nv30)
    struct pipe_context *pipe = &nv30->base.pipe;
 
    if (!fp) {
-      nv30->blit_fp = pipe_buffer_create(pipe->screen, 0, 0, 12 * 4);
+      nv30->blit_fp =
+         pipe_buffer_create(pipe->screen, 0, PIPE_USAGE_STAGING, 12 * 4);
       if (nv30->blit_fp) {
          struct pipe_transfer *transfer;
          u32 *map = pipe_buffer_map(pipe, nv30->blit_fp,
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_formats.c b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
index 34d32d1..6ea5a47 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_formats.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_formats.c
@@ -161,7 +161,7 @@ const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] =
    F3(A, R11G11B10_FLOAT, R11G11B10_FLOAT, R, G, B, xx, FLOAT, BF10GF11RF11, IB),
 
    F3(A, L8_UNORM, R8_UNORM, R, R, R, xx, UNORM, R8, TB),
-   F3(A, L8_SRGB, R8_UNORM, R, R, R, xx, UNORM, R8, TB),
+   F3(A, L8_SRGB, NONE, R, R, R, xx, UNORM, R8, T),
    F3(A, L8_SNORM, R8_SNORM, R, R, R, xx, SNORM, R8, TC),
    I3(A, L8_SINT, R8_SINT, R, R, R, xx, SINT, R8, TR),
    I3(A, L8_UINT, R8_UINT, R, R, R, xx, UINT, R8, TR),
@@ -203,7 +203,7 @@ const struct nv50_format nv50_format_table[PIPE_FORMAT_COUNT] =
    C4(A, L4A4_UNORM, NONE, R, R, R, G, UNORM, G4R4, T),
    C4(A, L8A8_UNORM, RG8_UNORM, R, R, R, G, UNORM, G8R8, T),
    C4(A, L8A8_SNORM, RG8_SNORM, R, R, R, G, SNORM, G8R8, T),
-   C4(A, L8A8_SRGB, RG8_UNORM, R, R, R, G, UNORM, G8R8, T),
+   C4(A, L8A8_SRGB, NONE, R, R, R, G, UNORM, G8R8, T),
    C4(A, L8A8_SINT, RG8_SINT, R, R, R, G, SINT, G8R8, T),
    C4(A, L8A8_UINT, RG8_UINT, R, R, R, G, UINT, G8R8, T),
    C4(A, L16A16_UNORM, RG16_UNORM, R, R, R, G, UNORM, R16_G16, T),
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index c764f5c..383bee3 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -307,6 +307,9 @@ nv50_program_create_strmout_state(const struct nv50_ir_prog_info *info,
       const unsigned r = pso->output[i].register_index;
       b = pso->output[i].output_buffer;
 
+      if (r >= info->numOutputs)
+         continue;
+
       for (c = 0; c < pso->output[i].num_components; ++c)
          so->map[base[b] + p + c] = info->out[r].slot[s + c];
    }
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
index 65f7338..a67a390 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state_validate.c
@@ -1,5 +1,6 @@
 
 #include "util/u_format.h"
+#include "util/u_viewport.h"
 
 #include "nv50/nv50_context.h"
 
@@ -265,8 +266,12 @@ nv50_validate_viewport(struct nv50_context *nv50)
       PUSH_DATAf(push, vpt->scale[1]);
       PUSH_DATAf(push, vpt->scale[2]);
 
-      zmin = vpt->translate[2] - fabsf(vpt->scale[2]);
-      zmax = vpt->translate[2] + fabsf(vpt->scale[2]);
+      /* If the halfz setting ever changes, the viewports will also get
+       * updated. The rast will get updated before the validate function has a
+       * chance to hit, so we can just use it directly without an atom
+       * dependency.
+       */
+      util_viewport_zmin_zmax(vpt, nv50->rast->pipe.clip_halfz, &zmin, &zmax);
 
 #ifdef NV50_SCISSORS_CLIPPING
       BEGIN_NV04(push, NV50_3D(DEPTH_RANGE_NEAR(i)), 2);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
index 28d6fec..12d5b0e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.c
@@ -514,10 +514,8 @@ nvc0_bufctx_fence(struct nvc0_context *nvc0, struct nouveau_bufctx *bufctx,
    NOUVEAU_DRV_STAT(&nvc0->screen->base, resource_validate_count, count);
 }
 
-static void
-nvc0_context_get_sample_position(struct pipe_context *pipe,
-                                 unsigned sample_count, unsigned sample_index,
-                                 float *xy)
+const void *
+nvc0_get_sample_locations(unsigned sample_count)
 {
    static const uint8_t ms1[1][2] = { { 0x8, 0x8 } };
    static const uint8_t ms2[2][2] = {
@@ -549,8 +547,22 @@ nvc0_context_get_sample_position(struct pipe_context *pipe,
    case 8: ptr = ms8; break;
    default:
       assert(0);
-      return; /* bad sample count -> undefined locations */
+      return NULL; /* bad sample count -> undefined locations */
    }
+   return ptr;
+}
+
+static void
+nvc0_context_get_sample_position(struct pipe_context *pipe,
+                                 unsigned sample_count, unsigned sample_index,
+                                 float *xy)
+{
+   const uint8_t (*ptr)[2];
+
+   ptr = nvc0_get_sample_locations(sample_count);
+   if (!ptr)
+      return;
+
    xy[0] = ptr[sample_index][0] * 0.0625f;
    xy[1] = ptr[sample_index][1] * 0.0625f;
 }
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 8d27300..ff5467c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -278,6 +278,7 @@ struct pipe_context *nvc0_create(struct pipe_screen *, void *, unsigned flags);
 void nvc0_bufctx_fence(struct nvc0_context *, struct nouveau_bufctx *,
                        bool on_flush);
 void nvc0_default_kick_notify(struct nouveau_pushbuf *);
+const void *nvc0_get_sample_locations(unsigned);
 
 /* nvc0_draw.c */
 extern struct draw_stage *nvc0_draw_render_stage(struct nvc0_context *);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index aba9511..e1ff3b7 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -500,11 +500,14 @@ nvc0_program_create_tfb_state(const struct nv50_ir_prog_info *info,
    for (i = 0; i < pso->num_outputs; ++i) {
       unsigned s = pso->output[i].start_component;
       unsigned p = pso->output[i].dst_offset;
+      const unsigned r = pso->output[i].register_index;
       b = pso->output[i].output_buffer;
 
+      if (r >= info->numOutputs)
+         continue;
+
       for (c = 0; c < pso->output[i].num_components; ++c)
-         tfb->varying_index[b][p++] =
-            info->out[pso->output[i].register_index].slot[s + c];
+         tfb->varying_index[b][p++] = info->out[r].slot[s + c];
 
       tfb->varying_count[b] = MAX2(tfb->varying_count[b], p);
       tfb->stream[b] = pso->output[i].stream;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
index ad44e85..ce59484 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c
@@ -2,6 +2,7 @@
 #include "util/u_format.h"
 #include "util/u_framebuffer.h"
 #include "util/u_math.h"
+#include "util/u_viewport.h"
 
 #include "nvc0/nvc0_context.h"
 
@@ -211,6 +212,19 @@ nvc0_validate_fb(struct nvc0_context *nvc0)
        PUSH_DATAf(push, xy[1]);
     }
 
+   if (screen->base.class_3d >= GM200_3D_CLASS) {
+      const uint8_t (*ptr)[2] = nvc0_get_sample_locations(ms);
+      uint32_t val[4] = {};
+
+      for (i = 0; i < 16; i++) {
+         val[i / 4] |= ptr[i % ms][0] << (((i % 4) * 8) + 0);
+         val[i / 4] |= ptr[i % ms][1] << (((i % 4) * 8) + 4);
+      }
+
+      BEGIN_NVC0(push, SUBC_3D(0x11e0), 4);
+      PUSH_DATAp(push, val, 4);
+   }
+
     if (serialize)
        IMMED_NVC0(push, NVC0_3D(SERIALIZE), 0);
 
@@ -316,8 +330,12 @@ nvc0_validate_viewport(struct nvc0_context *nvc0)
       PUSH_DATA (push, (w << 16) | x);
       PUSH_DATA (push, (h << 16) | y);
 
-      zmin = vp->translate[2] - fabsf(vp->scale[2]);
-      zmax = vp->translate[2] + fabsf(vp->scale[2]);
+      /* If the halfz setting ever changes, the viewports will also get
+       * updated. The rast will get updated before the validate function has a
+       * chance to hit, so we can just use it directly without an atom
+       * dependency.
+       */
+      util_viewport_zmin_zmax(vp, nvc0->rast->pipe.clip_halfz, &zmin, &zmax);
 
       BEGIN_NVC0(push, NVC0_3D(DEPTH_RANGE_NEAR(i)), 2);
       PUSH_DATAf(push, zmin);
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
index 1a5d8ec..efd90de 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_tex.c
@@ -589,13 +589,11 @@ void nvc0_validate_textures(struct nvc0_context *nvc0)
       PUSH_DATA (nvc0->base.pushbuf, 0);
    }
 
-   if (nvc0->screen->base.class_3d < NVE4_3D_CLASS) {
-      /* Invalidate all CP textures because they are aliased. */
-      for (int i = 0; i < nvc0->num_textures[5]; i++)
-         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_CP_TEX(i));
-      nvc0->textures_dirty[5] = ~0;
-      nvc0->dirty_cp |= NVC0_NEW_CP_TEXTURES;
-   }
+   /* Invalidate all CP textures because they are aliased. */
+   for (int i = 0; i < nvc0->num_textures[5]; i++)
+      nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_TEX(i));
+   nvc0->textures_dirty[5] = ~0;
+   nvc0->dirty_cp |= NVC0_NEW_CP_TEXTURES;
 }
 
 bool
@@ -709,11 +707,9 @@ void nvc0_validate_samplers(struct nvc0_context *nvc0)
       PUSH_DATA (nvc0->base.pushbuf, 0);
    }
 
-   if (nvc0->screen->base.class_3d < NVE4_3D_CLASS) {
-      /* Invalidate all CP samplers because they are aliased. */
-      nvc0->samplers_dirty[5] = ~0;
-      nvc0->dirty_cp |= NVC0_NEW_CP_SAMPLERS;
-   }
+   /* Invalidate all CP samplers because they are aliased. */
+   nvc0->samplers_dirty[5] = ~0;
+   nvc0->dirty_cp |= NVC0_NEW_CP_SAMPLERS;
 }
 
 /* Upload the "diagonal" entries for the possible texture sources ($t == $s).
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
index cae621c..3d20c68 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c
@@ -835,7 +835,7 @@ nvc0_draw_indirect(struct nvc0_context *nvc0, const struct pipe_draw_info *info)
 
    /* Queue things up to let the macros write params to the driver constbuf */
    BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
-   PUSH_DATA (push, 512);
+   PUSH_DATA (push, 2048);
    PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
    PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
    BEGIN_NVC0(push, NVC0_3D(CB_POS), 1);
@@ -981,7 +981,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    if (nvc0->vertprog->vp.need_draw_parameters) {
       PUSH_SPACE(push, 9);
       BEGIN_NVC0(push, NVC0_3D(CB_SIZE), 3);
-      PUSH_DATA (push, 512);
+      PUSH_DATA (push, 2048);
       PUSH_DATAh(push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
       PUSH_DATA (push, screen->uniform_bo->offset + NVC0_CB_AUX_INFO(0));
       if (!info->indirect) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index a1d1d3e..d172d73 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -237,7 +237,13 @@ nve4_compute_validate_samplers(struct nvc0_context *nvc0)
       BEGIN_NVC0(nvc0->base.pushbuf, NVE4_CP(TSC_FLUSH), 1);
       PUSH_DATA (nvc0->base.pushbuf, 0);
    }
+
+   /* Invalidate all 3D samplers because they are aliased. */
+   for (int s = 0; s < 5; s++)
+      nvc0->samplers_dirty[s] = ~0;
+   nvc0->dirty_3d |= NVC0_NEW_3D_SAMPLERS;
 }
+
 /* (Code duplicated at bottom for various non-convincing reasons.
  *  E.g. we might want to use the COMPUTE subchannel to upload TIC/TSC
  *  entries to avoid a subchannel switch.
@@ -690,6 +696,14 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
    }
 
    nvc0->state.num_textures[s] = nvc0->num_textures[s];
+
+   /* Invalidate all 3D textures because they are aliased. */
+   for (int s = 0; s < 5; s++) {
+      for (int i = 0; i < nvc0->num_textures[s]; i++)
+         nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TEX(s, i));
+      nvc0->textures_dirty[s] = ~0;
+   }
+   nvc0->dirty_3d |= NVC0_NEW_3D_TEXTURES;
 }
 
 
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index d100a9d..341f406 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -190,7 +190,7 @@ static boolean r300_setup_atoms(struct r300_context* r300)
     /* VAP. */
     R300_INIT_ATOM(viewport_state, 9);
     R300_INIT_ATOM(pvs_flush, 2);
-    R300_INIT_ATOM(vap_invariant_state, is_r500 ? 11 : 9);
+    R300_INIT_ATOM(vap_invariant_state, is_r500 || !has_tcl ? 11 : 9);
     R300_INIT_ATOM(vertex_stream_state, 0);
     R300_INIT_ATOM(vs_state, 0);
     R300_INIT_ATOM(vs_constants, 0);
@@ -314,6 +314,14 @@ static void r300_init_states(struct pipe_context *pipe)
 
         if (r300->screen->caps.is_r500) {
             OUT_CB_REG(R500_VAP_TEX_TO_COLOR_CNTL, 0);
+        } else if (!r300->screen->caps.has_tcl) {
+            /* RSxxx:
+             * Static VAP setup since r300_emit_vs_state() is never called.
+             */
+            OUT_CB_REG(R300_VAP_CNTL, R300_PVS_NUM_SLOTS(10) |
+                                      R300_PVS_NUM_CNTLRS(5) |
+                                      R300_PVS_NUM_FPUS(2) |
+                                      R300_PVS_VF_MAX_VTX_NUM(5));
         }
         END_CB;
     }
diff --git a/src/gallium/drivers/radeon/cayman_msaa.c b/src/gallium/drivers/radeon/cayman_msaa.c
index 9412e89..6d6998e 100644
--- a/src/gallium/drivers/radeon/cayman_msaa.c
+++ b/src/gallium/drivers/radeon/cayman_msaa.c
@@ -143,6 +143,13 @@ void cayman_init_msaa(struct pipe_context *ctx)
 void cayman_emit_msaa_sample_locs(struct radeon_winsys_cs *cs, int nr_samples)
 {
 	switch (nr_samples) {
+	default:
+	case 1:
+		radeon_set_context_reg(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, 0);
+		radeon_set_context_reg(cs, CM_R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, 0);
+		radeon_set_context_reg(cs, CM_R_028C18_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y1_0, 0);
+		radeon_set_context_reg(cs, CM_R_028C28_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y1_0, 0);
+		break;
 	case 2:
 		radeon_set_context_reg(cs, CM_R_028BF8_PA_SC_AA_SAMPLE_LOCS_PIXEL_X0Y0_0, eg_sample_locs_2x[0]);
 		radeon_set_context_reg(cs, CM_R_028C08_PA_SC_AA_SAMPLE_LOCS_PIXEL_X1Y0_0, eg_sample_locs_2x[1]);
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 23ddff4..8de3b18 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -703,8 +703,9 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen
 	}
 
 	rtex->cmask_buffer = (struct r600_resource *)
-		pipe_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM,
-				   PIPE_USAGE_DEFAULT, rtex->cmask.size);
+		r600_aligned_buffer_create(&rscreen->b, 0, PIPE_USAGE_DEFAULT,
+					   rtex->cmask.size,
+					   rtex->cmask.alignment);
 	if (rtex->cmask_buffer == NULL) {
 		rtex->cmask.size = 0;
 		return;
diff --git a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
index 74b36ec..9ab5af9 100644
--- a/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
+++ b/src/gallium/drivers/radeon/radeon_setup_tgsi_llvm.c
@@ -513,6 +513,16 @@ void radeon_llvm_emit_store(
 	}
 }
 
+/* Emit a branch to the given default target for the current block if
+ * applicable -- that is, if the current block does not already contain a
+ * branch from a break or continue.
+ */
+static void emit_default_branch(LLVMBuilderRef builder, LLVMBasicBlockRef target)
+{
+	if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(builder)))
+		 LLVMBuildBr(builder, target);
+}
+
 static void bgnloop_emit(
 	const struct lp_build_tgsi_action * action,
 	struct lp_build_tgsi_context * bld_base,
@@ -577,28 +587,8 @@ static void else_emit(
 	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
 	struct gallivm_state * gallivm = bld_base->base.gallivm;
 	struct radeon_llvm_branch * current_branch = get_current_branch(ctx);
-	LLVMBasicBlockRef current_block = LLVMGetInsertBlock(gallivm->builder);
-
-	/* We need to add a terminator to the current block if the previous
-	 * instruction was an ENDIF.Example:
-	 * IF
-	 *   [code]
-	 *   IF
-	 *     [code]
-	 *   ELSE
-	 *    [code]
-	 *   ENDIF <--
-	 * ELSE<--
-	 *   [code]
-	 * ENDIF
-	 */
 
-	if (current_block != current_branch->if_block) {
-		LLVMBuildBr(gallivm->builder, current_branch->endif_block);
-	}
-	if (!LLVMGetBasicBlockTerminator(current_branch->if_block)) {
-		LLVMBuildBr(gallivm->builder, current_branch->endif_block);
-	}
+	emit_default_branch(gallivm->builder, current_branch->endif_block);
 	current_branch->has_else = 1;
 	LLVMPositionBuilderAtEnd(gallivm->builder, current_branch->else_block);
 }
@@ -611,26 +601,15 @@ static void endif_emit(
 	struct radeon_llvm_context * ctx = radeon_llvm_context(bld_base);
 	struct gallivm_state * gallivm = bld_base->base.gallivm;
 	struct radeon_llvm_branch * current_branch = get_current_branch(ctx);
-	LLVMBasicBlockRef current_block = LLVMGetInsertBlock(gallivm->builder);
 
-	/* If we have consecutive ENDIF instructions, then the first ENDIF
-	 * will not have a terminator, so we need to add one. */
-	if (current_block != current_branch->if_block
-			&& current_block != current_branch->else_block
-			&& !LLVMGetBasicBlockTerminator(current_block)) {
+	emit_default_branch(gallivm->builder, current_branch->endif_block);
 
-		 LLVMBuildBr(gallivm->builder, current_branch->endif_block);
-	}
+	/* Need to fixup an empty else block if there was no ELSE opcode. */
 	if (!LLVMGetBasicBlockTerminator(current_branch->else_block)) {
 		LLVMPositionBuilderAtEnd(gallivm->builder, current_branch->else_block);
 		LLVMBuildBr(gallivm->builder, current_branch->endif_block);
 	}
 
-	if (!LLVMGetBasicBlockTerminator(current_branch->if_block)) {
-		LLVMPositionBuilderAtEnd(gallivm->builder, current_branch->if_block);
-		LLVMBuildBr(gallivm->builder, current_branch->endif_block);
-	}
-
 	LLVMPositionBuilderAtEnd(gallivm->builder, current_branch->endif_block);
 	ctx->branch_depth--;
 }
@@ -644,9 +623,7 @@ static void endloop_emit(
 	struct gallivm_state * gallivm = bld_base->base.gallivm;
 	struct radeon_llvm_loop * current_loop = get_current_loop(ctx);
 
-	if (!LLVMGetBasicBlockTerminator(LLVMGetInsertBlock(gallivm->builder))) {
-		 LLVMBuildBr(gallivm->builder, current_loop->loop_block);
-	}
+	emit_default_branch(gallivm->builder, current_loop->loop_block);
 
 	LLVMPositionBuilderAtEnd(gallivm->builder, current_loop->endloop_block);
 	ctx->loop_depth--;
@@ -1326,23 +1303,32 @@ static void emit_lsb(const struct lp_build_tgsi_action * action,
 		     struct lp_build_emit_data * emit_data)
 {
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
+	LLVMBuilderRef builder = gallivm->builder;
 	LLVMValueRef args[2] = {
 		emit_data->args[0],
 
 		/* The value of 1 means that ffs(x=0) = undef, so LLVM won't
 		 * add special code to check for x=0. The reason is that
 		 * the LLVM behavior for x=0 is different from what we
-		 * need here.
-		 *
-		 * The hardware already implements the correct behavior.
+		 * need here. However, LLVM also assumes that ffs(x) is
+		 * in [0, 31], but GLSL expects that ffs(0) = -1, so
+		 * a conditional assignment to handle 0 is still required.
 		 */
-		lp_build_const_int32(gallivm, 1)
+		LLVMConstInt(LLVMInt1TypeInContext(gallivm->context), 1, 0)
 	};
 
-	emit_data->output[emit_data->chan] =
+	LLVMValueRef lsb =
 		lp_build_intrinsic(gallivm->builder, "llvm.cttz.i32",
 				emit_data->dst_type, args, ARRAY_SIZE(args),
 				LLVMReadNoneAttribute);
+
+	/* TODO: We need an intrinsic to skip this conditional. */
+	/* Check for zero: */
+	emit_data->output[emit_data->chan] =
+		LLVMBuildSelect(builder,
+				LLVMBuildICmp(builder, LLVMIntEQ, args[0],
+					      bld_base->uint_bld.zero, ""),
+				lp_build_const_int32(gallivm, -1), lsb, "");
 }
 
 /* Find the last bit set. */
diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c
index d8ec2a3..8e23b61 100644
--- a/src/gallium/drivers/radeonsi/cik_sdma.c
+++ b/src/gallium/drivers/radeonsi/cik_sdma.c
@@ -519,6 +519,12 @@ static void cik_sdma_copy(struct pipe_context *ctx,
 		return;
 	}
 
+	/* Carrizo SDMA texture copying is very broken for some users.
+	 * https://bugs.freedesktop.org/show_bug.cgi?id=97029
+	 */
+	if (sctx->b.family == CHIP_CARRIZO)
+		goto fallback;
+
 	if (cik_sdma_copy_texture(sctx, dst, dst_level, dstx, dsty, dstz,
 				  src, src_level, src_box))
 		return;
diff --git a/src/gallium/drivers/radeonsi/si_compute.c b/src/gallium/drivers/radeonsi/si_compute.c
index 484b252..c279c77 100644
--- a/src/gallium/drivers/radeonsi/si_compute.c
+++ b/src/gallium/drivers/radeonsi/si_compute.c
@@ -158,6 +158,7 @@ static void si_set_global_binding(
 static void si_initialize_compute(struct si_context *sctx)
 {
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
+	uint64_t bc_va;
 
 	radeon_set_sh_reg_seq(cs, R_00B810_COMPUTE_START_X, 3);
 	radeon_emit(cs, 0);
@@ -193,6 +194,17 @@ static void si_initialize_compute(struct si_context *sctx)
 		                  0x190 /* Default value */);
 	}
 
+	/* Set the pointer to border colors. */
+	bc_va = sctx->border_color_buffer->gpu_address;
+
+	if (sctx->b.chip_class >= CIK) {
+		radeon_set_uconfig_reg_seq(cs, R_030E00_TA_CS_BC_BASE_ADDR, 2);
+		radeon_emit(cs, bc_va >> 8);  /* R_030E00_TA_CS_BC_BASE_ADDR */
+		radeon_emit(cs, bc_va >> 40); /* R_030E04_TA_CS_BC_BASE_ADDR_HI */
+	} else {
+		radeon_set_config_reg(cs, R_00950C_TA_CS_BC_BASE_ADDR, bc_va >> 8);
+	}
+
 	sctx->cs_shader_state.emitted_program = NULL;
 	sctx->cs_shader_state.initialized = true;
 }
@@ -459,6 +471,20 @@ static void si_launch_grid(
 
 	si_decompress_compute_textures(sctx);
 
+	/* Add buffer sizes for memory checking in need_cs_space. */
+	r600_context_add_resource_size(ctx, &program->shader.bo->b.b);
+	/* TODO: add the scratch buffer */
+
+	if (info->indirect) {
+		r600_context_add_resource_size(ctx, info->indirect);
+
+		/* The hw doesn't read the indirect buffer via TC L2. */
+		if (r600_resource(info->indirect)->TC_L2_dirty) {
+			sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+			r600_resource(info->indirect)->TC_L2_dirty = false;
+		}
+	}
+
 	si_need_cs_space(sctx);
 
 	if (!sctx->cs_shader_state.initialized)
diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
index bbd02e9..fe4cb29 100644
--- a/src/gallium/drivers/radeonsi/si_descriptors.c
+++ b/src/gallium/drivers/radeonsi/si_descriptors.c
@@ -819,9 +819,9 @@ void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuf
 		util_memcpy_cpu_to_le32(tmp, ptr, size);
 }
 
-void si_set_constant_buffer(struct si_context *sctx,
-			    struct si_buffer_resources *buffers,
-			    uint slot, struct pipe_constant_buffer *input)
+static void si_set_constant_buffer(struct si_context *sctx,
+				   struct si_buffer_resources *buffers,
+				   uint slot, struct pipe_constant_buffer *input)
 {
 	assert(slot < buffers->desc.num_elements);
 	pipe_resource_reference(&buffers->buffers[slot], NULL);
@@ -881,6 +881,12 @@ void si_set_constant_buffer(struct si_context *sctx,
 	buffers->desc.dirty_mask |= 1u << slot;
 }
 
+void si_set_rw_buffer(struct si_context *sctx,
+		      uint slot, struct pipe_constant_buffer *input)
+{
+	si_set_constant_buffer(sctx, &sctx->rw_buffers, slot, input);
+}
+
 static void si_pipe_set_constant_buffer(struct pipe_context *ctx,
 					uint shader, uint slot,
 					struct pipe_constant_buffer *input)
@@ -1052,10 +1058,10 @@ static void si_set_streamout_targets(struct pipe_context *ctx,
 		 * and most other clients can use TC L2 as well, we don't need
 		 * to flush it.
 		 *
-		 * The only case which requires flushing it is VGT DMA index
-		 * fetching, which is a rare case. Thus, flag the TC L2
-		 * dirtiness in the resource and handle it when index fetching
-		 * is used.
+		 * The only cases which requires flushing it is VGT DMA index
+		 * fetching (on <= CIK) and indirect draw data, which are rare
+		 * cases. Thus, flag the TC L2 dirtiness in the resource and
+		 * handle it at draw call time.
 		 */
 		for (i = 0; i < sctx->b.streamout.num_targets; i++)
 			if (sctx->b.streamout.targets[i])
@@ -1177,8 +1183,7 @@ static void si_set_polygon_stipple(struct pipe_context *ctx,
 	cb.user_buffer = stipple;
 	cb.buffer_size = sizeof(stipple);
 
-	si_set_constant_buffer(sctx, &sctx->rw_buffers,
-			       SI_PS_CONST_POLY_STIPPLE, &cb);
+	si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE, &cb);
 }
 
 /* TEXTURE METADATA ENABLE/DISABLE */
diff --git a/src/gallium/drivers/radeonsi/si_hw_context.c b/src/gallium/drivers/radeonsi/si_hw_context.c
index e8e0403..69478a8 100644
--- a/src/gallium/drivers/radeonsi/si_hw_context.c
+++ b/src/gallium/drivers/radeonsi/si_hw_context.c
@@ -216,7 +216,8 @@ void si_begin_new_cs(struct si_context *ctx)
 
 	si_mark_atom_dirty(ctx, &ctx->clip_regs);
 	si_mark_atom_dirty(ctx, &ctx->clip_state.atom);
-	si_mark_atom_dirty(ctx, &ctx->msaa_sample_locs);
+	ctx->msaa_sample_locs.nr_samples = 0;
+	si_mark_atom_dirty(ctx, &ctx->msaa_sample_locs.atom);
 	si_mark_atom_dirty(ctx, &ctx->msaa_config);
 	si_mark_atom_dirty(ctx, &ctx->sample_mask.atom);
 	si_mark_atom_dirty(ctx, &ctx->cb_render_state);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 88f4f20..9dd9ef5 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -212,8 +212,8 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 	si_begin_new_cs(sctx);
 	r600_query_init_backend_mask(&sctx->b); /* this emits commands and must be last */
 
-	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD is buggy
-	 * with a NULL buffer). We need to use a dummy buffer instead. */
+	/* CIK cannot unbind a constant buffer (S_BUFFER_LOAD doesn't skip loads
+	 * if NUM_RECORDS == 0). We need to use a dummy buffer instead. */
 	if (sctx->b.chip_class == CIK) {
 		sctx->null_const_buf.buffer = pipe_buffer_create(screen, PIPE_BIND_CONSTANT_BUFFER,
 								 PIPE_USAGE_DEFAULT, 16);
@@ -228,6 +228,15 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
 			}
 		}
 
+		si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS,
+				 &sctx->null_const_buf);
+		si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES,
+				 &sctx->null_const_buf);
+		si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE,
+				 &sctx->null_const_buf);
+		si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS,
+				 &sctx->null_const_buf);
+
 		/* Clear the NULL constant buffer, because loads should return zeros. */
 		sctx->b.clear_buffer(&sctx->b.b, sctx->null_const_buf.buffer, 0,
 				     sctx->null_const_buf.buffer->width0, 0,
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index dbbf9b6..2661972 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -168,6 +168,11 @@ struct si_clip_state {
 	struct pipe_clip_state		state;
 };
 
+struct si_sample_locs {
+	struct r600_atom	atom;
+	unsigned		nr_samples;
+};
+
 struct si_sample_mask {
 	struct r600_atom	atom;
 	uint16_t		sample_mask;
@@ -212,7 +217,7 @@ struct si_context {
 	/* Atom declarations. */
 	struct r600_atom		cache_flush;
 	struct si_framebuffer		framebuffer;
-	struct r600_atom		msaa_sample_locs;
+	struct si_sample_locs		msaa_sample_locs;
 	struct r600_atom		db_render_state;
 	struct r600_atom		msaa_config;
 	struct si_sample_mask		sample_mask;
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 5e5bf68..5ead940 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -1014,7 +1014,7 @@ static LLVMValueRef lds_load(struct lp_build_tgsi_context *bld_base,
 	if (type == TGSI_TYPE_DOUBLE) {
 		LLVMValueRef value2;
 		dw_addr = lp_build_add(&bld_base->uint_bld, dw_addr,
-				       lp_build_const_int32(gallivm, swizzle + 1));
+				       lp_build_const_int32(gallivm, 1));
 		value2 = build_indexed_load(ctx, ctx->lds, dw_addr, false);
 		return radeon_llvm_emit_fetch_double(bld_base, value, value2);
 	}
@@ -1846,13 +1846,13 @@ static LLVMValueRef fetch_constant(
 		result = bitcast(bld_base, type, result);
 	else {
 		LLVMValueRef addr2, result2;
-		addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle + 1];
+		addr2 = ctx->radeon_bld.soa.addr[ireg->Index][ireg->Swizzle];
 		addr2 = LLVMBuildLoad(base->gallivm->builder, addr2, "load addr reg2");
 		addr2 = lp_build_mul_imm(&bld_base->uint_bld, addr2, 16);
 		addr2 = lp_build_add(&bld_base->uint_bld, addr2,
-				     lp_build_const_int32(base->gallivm, idx * 4));
+				     lp_build_const_int32(base->gallivm, (idx + 1) * 4));
 
-		result2 = buffer_load_const(base->gallivm->builder, ctx->const_buffers[buf],
+		result2 = buffer_load_const(base->gallivm->builder, bufp,
 				   addr2, ctx->f32);
 
 		result = radeon_llvm_emit_fetch_double(bld_base,
@@ -5072,7 +5072,7 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
 	}
 
 	intr_name = interp_param ? "llvm.SI.fs.interp" : "llvm.SI.fs.constant";
-	for (chan = 0; chan < 2; chan++) {
+	for (chan = 0; chan < 4; chan++) {
 		LLVMValueRef args[4];
 		LLVMValueRef llvm_chan;
 		unsigned schan;
@@ -6567,6 +6567,41 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
 
 	radeon_llvm_dispose(&ctx.radeon_bld);
 
+	/* Validate SGPR and VGPR usage for compute to detect compiler bugs.
+	 * LLVM 3.9svn has this bug.
+	 */
+	if (sel->type == PIPE_SHADER_COMPUTE) {
+		unsigned *props = sel->info.properties;
+		unsigned wave_size = 64;
+		unsigned max_vgprs = 256;
+		unsigned max_sgprs = sscreen->b.chip_class >= VI ? 800 : 512;
+		unsigned max_sgprs_per_wave = 128;
+		unsigned min_waves_per_cu =
+			DIV_ROUND_UP(props[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH] *
+				     props[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT] *
+				     props[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH],
+				     wave_size);
+		unsigned min_waves_per_simd = DIV_ROUND_UP(min_waves_per_cu, 4);
+
+		max_vgprs = max_vgprs / min_waves_per_simd;
+		max_sgprs = MIN2(max_sgprs / min_waves_per_simd, max_sgprs_per_wave);
+
+		if (shader->config.num_sgprs > max_sgprs ||
+		    shader->config.num_vgprs > max_vgprs) {
+			fprintf(stderr, "LLVM failed to compile a shader correctly: "
+				"SGPR:VGPR usage is %u:%u, but the hw limit is %u:%u\n",
+				shader->config.num_sgprs, shader->config.num_vgprs,
+				max_sgprs, max_vgprs);
+
+			/* Just terminate the process, because dependent
+			 * shaders can hang due to bad input data, but use
+			 * the env var to allow shader-db to work.
+			 */
+			if (!debug_get_bool_option("SI_PASS_BAD_SHADERS", false))
+				abort();
+		}
+	}
+
 	/* Add the scratch offset to input SGPRs. */
 	if (shader->config.scratch_bytes_per_wave)
 		shader->info.num_input_sgprs += 1; /* scratch byte offset */
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 3bbb81a..a641b5d 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -460,6 +460,10 @@ static void *si_create_blend_state_mode(struct pipe_context *ctx,
 			S_028760_COLOR_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED) |
 			S_028760_ALPHA_COMB_FCN(V_028760_OPT_COMB_BLEND_DISABLED);
 
+		/* Only set dual source blending for MRT0 to avoid a hang. */
+		if (i >= 1 && blend->dual_src_blend)
+			continue;
+
 		if (!state->rt[j].colormask)
 			continue;
 
@@ -620,8 +624,7 @@ static void si_set_clip_state(struct pipe_context *ctx,
 	cb.user_buffer = state->ucp;
 	cb.buffer_offset = 0;
 	cb.buffer_size = 4*4*8;
-	si_set_constant_buffer(sctx, &sctx->rw_buffers,
-			       SI_VS_CONST_CLIP_PLANES, &cb);
+	si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES, &cb);
 	pipe_resource_reference(&cb.buffer, NULL);
 }
 
@@ -847,9 +850,13 @@ static void si_bind_rs_state(struct pipe_context *ctx, void *state)
 		return;
 
 	if (sctx->framebuffer.nr_samples > 1 &&
-	    (!old_rs || old_rs->multisample_enable != rs->multisample_enable))
+	    (!old_rs || old_rs->multisample_enable != rs->multisample_enable)) {
 		si_mark_atom_dirty(sctx, &sctx->db_render_state);
 
+		if (sctx->b.family >= CHIP_POLARIS10)
+			si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs.atom);
+	}
+
 	r600_set_scissor_enable(&sctx->b, rs->scissor_enable);
 
 	si_pm4_bind_state(sctx, rasterizer, rs);
@@ -1586,6 +1593,10 @@ static unsigned si_tex_dim(unsigned res_target, unsigned view_target,
 	if (view_target == PIPE_TEXTURE_CUBE ||
 	    view_target == PIPE_TEXTURE_CUBE_ARRAY)
 		res_target = view_target;
+	/* If interpreting cubemaps as something else, set 2D_ARRAY. */
+	else if (res_target == PIPE_TEXTURE_CUBE ||
+		 res_target == PIPE_TEXTURE_CUBE_ARRAY)
+		res_target = PIPE_TEXTURE_2D_ARRAY;
 
 	switch (res_target) {
 	default:
@@ -2395,21 +2406,9 @@ static void si_set_framebuffer_state(struct pipe_context *ctx,
 			assert(0);
 		}
 		constbuf.buffer_size = sctx->framebuffer.nr_samples * 2 * 4;
-		si_set_constant_buffer(sctx, &sctx->rw_buffers,
-				       SI_PS_CONST_SAMPLE_POSITIONS, &constbuf);
+		si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS, &constbuf);
 
-		/* Smoothing (only possible with nr_samples == 1) uses the same
-		 * sample locations as the MSAA it simulates.
-		 *
-		 * Therefore, don't update the sample locations when
-		 * transitioning from no AA to smoothing-equivalent AA, and
-		 * vice versa.
-		 */
-		if ((sctx->framebuffer.nr_samples != 1 ||
-		     old_nr_samples != SI_NUM_SMOOTH_AA_SAMPLES) &&
-		    (sctx->framebuffer.nr_samples != SI_NUM_SMOOTH_AA_SAMPLES ||
-		     old_nr_samples != 1))
-			si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs);
+		si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs.atom);
 	}
 }
 
@@ -2536,8 +2535,37 @@ static void si_emit_msaa_sample_locs(struct si_context *sctx,
 	struct radeon_winsys_cs *cs = sctx->b.gfx.cs;
 	unsigned nr_samples = sctx->framebuffer.nr_samples;
 
-	cayman_emit_msaa_sample_locs(cs, nr_samples > 1 ? nr_samples :
-						SI_NUM_SMOOTH_AA_SAMPLES);
+	/* Smoothing (only possible with nr_samples == 1) uses the same
+	 * sample locations as the MSAA it simulates.
+	 */
+	if (nr_samples <= 1 && sctx->smoothing_enabled)
+		nr_samples = SI_NUM_SMOOTH_AA_SAMPLES;
+
+	/* On Polaris, the small primitive filter uses the sample locations
+	 * even when MSAA is off, so we need to make sure they're set to 0.
+	 */
+	if ((nr_samples > 1 || sctx->b.family >= CHIP_POLARIS10) &&
+	    (nr_samples != sctx->msaa_sample_locs.nr_samples)) {
+		sctx->msaa_sample_locs.nr_samples = nr_samples;
+		cayman_emit_msaa_sample_locs(cs, nr_samples);
+	}
+
+	if (sctx->b.family >= CHIP_POLARIS10) {
+		struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
+		unsigned small_prim_filter_cntl =
+			S_028830_SMALL_PRIM_FILTER_ENABLE(1) |
+			S_028830_LINE_FILTER_DISABLE(1); /* line bug */
+
+		/* The alternative of setting sample locations to 0 would
+		 * require a DB flush to avoid Z errors, see
+		 * https://bugs.freedesktop.org/show_bug.cgi?id=96908
+		 */
+		if (sctx->framebuffer.nr_samples > 1 && rs && !rs->multisample_enable)
+			small_prim_filter_cntl &= C_028830_SMALL_PRIM_FILTER_ENABLE;
+
+		radeon_set_context_reg(cs, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
+				       small_prim_filter_cntl);
+	}
 }
 
 static void si_emit_msaa_config(struct si_context *sctx, struct r600_atom *atom)
@@ -3246,8 +3274,7 @@ static void si_set_tess_state(struct pipe_context *ctx,
 			       (void*)array, sizeof(array),
 			       &cb.buffer_offset);
 
-	si_set_constant_buffer(sctx, &sctx->rw_buffers,
-			       SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb);
+	si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS, &cb);
 	pipe_resource_reference(&cb.buffer, NULL);
 }
 
@@ -3337,7 +3364,7 @@ void si_init_state_functions(struct si_context *sctx)
 
 	si_init_atom(sctx, &sctx->cache_flush, &sctx->atoms.s.cache_flush, si_emit_cache_flush);
 	si_init_atom(sctx, &sctx->framebuffer.atom, &sctx->atoms.s.framebuffer, si_emit_framebuffer_state);
-	si_init_atom(sctx, &sctx->msaa_sample_locs, &sctx->atoms.s.msaa_sample_locs, si_emit_msaa_sample_locs);
+	si_init_atom(sctx, &sctx->msaa_sample_locs.atom, &sctx->atoms.s.msaa_sample_locs, si_emit_msaa_sample_locs);
 	si_init_atom(sctx, &sctx->db_render_state, &sctx->atoms.s.db_render_state, si_emit_db_render_state);
 	si_init_atom(sctx, &sctx->msaa_config, &sctx->atoms.s.msaa_config, si_emit_msaa_config);
 	si_init_atom(sctx, &sctx->sample_mask.atom, &sctx->atoms.s.sample_mask, si_emit_sample_mask);
@@ -3810,11 +3837,6 @@ static void si_init_config(struct si_context *sctx)
 	if (sctx->b.family == CHIP_STONEY)
 		si_pm4_set_reg(pm4, R_028C40_PA_SC_SHADER_CONTROL, 0);
 
-	if (sctx->b.family >= CHIP_POLARIS10)
-		si_pm4_set_reg(pm4, R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL,
-			       S_028830_SMALL_PRIM_FILTER_ENABLE(1) |
-			       S_028830_LINE_FILTER_DISABLE(1)); /* line bug */
-
 	si_pm4_set_reg(pm4, R_028080_TA_BC_BASE_ADDR, border_color_va >> 8);
 	if (sctx->b.chip_class >= CIK)
 		si_pm4_set_reg(pm4, R_028084_TA_BC_BASE_ADDR_HI, border_color_va >> 40);
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index aea98ae..97f6dfa 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -269,9 +269,8 @@ void si_update_compressed_colortex_masks(struct si_context *sctx);
 void si_emit_graphics_shader_userdata(struct si_context *sctx,
                                       struct r600_atom *atom);
 void si_emit_compute_shader_userdata(struct si_context *sctx);
-void si_set_constant_buffer(struct si_context *sctx,
-			    struct si_buffer_resources *buffers,
-			    uint slot, struct pipe_constant_buffer *input);
+void si_set_rw_buffer(struct si_context *sctx,
+		      uint slot, struct pipe_constant_buffer *input);
 
 /* si_state.c */
 struct si_shader_selector;
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 3c11d14..4e1c599 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -494,13 +494,13 @@ static void si_emit_draw_registers(struct si_context *sctx,
 		radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, info->primitive_restart);
 		sctx->last_primitive_restart_en = info->primitive_restart;
 
-		if (info->primitive_restart &&
-		    (info->restart_index != sctx->last_restart_index ||
-		     sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN)) {
-			radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
-					       info->restart_index);
-			sctx->last_restart_index = info->restart_index;
-		}
+	}
+	if (info->primitive_restart &&
+	    (info->restart_index != sctx->last_restart_index ||
+	     sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN)) {
+		radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
+				       info->restart_index);
+		sctx->last_restart_index = info->restart_index;
 	}
 }
 
@@ -963,6 +963,11 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 		r600_resource(ib.buffer)->TC_L2_dirty = false;
 	}
 
+	if (info->indirect && r600_resource(info->indirect)->TC_L2_dirty) {
+		sctx->b.flags |= SI_CONTEXT_INV_GLOBAL_L2;
+		r600_resource(info->indirect)->TC_L2_dirty = false;
+	}
+
 	/* Check flush flags. */
 	if (sctx->b.flags)
 		si_mark_atom_dirty(sctx, sctx->atoms.s.cache_flush);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index a7af76d..932d017 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -759,10 +759,10 @@ static void si_shader_ps(struct si_shader *shader)
 		       S_00B02C_USER_SGPR(SI_PS_NUM_USER_SGPR) |
 		       S_00B32C_SCRATCH_EN(shader->config.scratch_bytes_per_wave > 0));
 
-	/* Prefer RE_Z if the shader is complex enough. The requirement is either:
-	 * - the shader uses at least 2 VMEM instructions, or
-	 * - the code size is at least 50 2-dword instructions or 100 1-dword
-	 *   instructions.
+	/* DON'T USE EARLY_Z_THEN_RE_Z !!!
+	 *
+	 * It decreases performance by 15% in DiRT: Showdown on Ultra settings.
+	 * And it has pretty complex shaders.
 	 *
 	 * Shaders with side effects that must execute independently of the
 	 * depth test require LATE_Z.
@@ -770,9 +770,6 @@ static void si_shader_ps(struct si_shader *shader)
 	if (info->writes_memory &&
 	    !info->properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL])
 		shader->z_order = V_02880C_LATE_Z;
-	else if (info->num_memory_instructions >= 2 ||
-	         shader->binary.code_size > 100*4)
-		shader->z_order = V_02880C_EARLY_Z_THEN_RE_Z;
 	else
 		shader->z_order = V_02880C_EARLY_Z_THEN_LATE_Z;
 }
@@ -2052,6 +2049,9 @@ bool si_update_shaders(struct si_context *sctx)
 
 			if (sctx->b.chip_class == SI)
 				si_mark_atom_dirty(sctx, &sctx->db_render_state);
+
+			if (sctx->framebuffer.nr_samples <= 1)
+				si_mark_atom_dirty(sctx, &sctx->msaa_sample_locs.atom);
 		}
 	}
 
diff --git a/src/gallium/drivers/radeonsi/sid.h b/src/gallium/drivers/radeonsi/sid.h
index 8d49b9d..341ee12 100644
--- a/src/gallium/drivers/radeonsi/sid.h
+++ b/src/gallium/drivers/radeonsi/sid.h
@@ -7209,6 +7209,7 @@
 /*     */
 #define R_028830_PA_SU_SMALL_PRIM_FILTER_CNTL                           0x028830 /* Polaris */
 #define   S_028830_SMALL_PRIM_FILTER_ENABLE(x)                        (((x) & 0x1) << 0)
+#define   C_028830_SMALL_PRIM_FILTER_ENABLE                           0xFFFFFFFE
 #define   S_028830_TRIANGLE_FILTER_DISABLE(x)                         (((x) & 0x1) << 1)
 #define   S_028830_LINE_FILTER_DISABLE(x)                             (((x) & 0x1) << 2)
 #define   S_028830_POINT_FILTER_DISABLE(x)                            (((x) & 0x1) << 3)
diff --git a/src/gallium/drivers/svga/svga_shader.c b/src/gallium/drivers/svga/svga_shader.c
index abfef0f..9e37e23 100644
--- a/src/gallium/drivers/svga/svga_shader.c
+++ b/src/gallium/drivers/svga/svga_shader.c
@@ -173,10 +173,16 @@ svga_init_shader_key_common(const struct svga_context *svga, unsigned shader,
 
    assert(shader < ARRAY_SIZE(svga->curr.num_sampler_views));
 
-   for (i = 0; i < svga->curr.num_sampler_views[shader]; i++) {
+   /* In case the number of samplers and sampler_views doesn't match,
+    * loop over the lower of the two counts.
+    */
+   key->num_textures = MIN2(svga->curr.num_sampler_views[shader],
+                            svga->curr.num_samplers[shader]);
+
+   for (i = 0; i < key->num_textures; i++) {
       struct pipe_sampler_view *view = svga->curr.sampler_views[shader][i];
-      if (view) {
-         assert(svga->curr.sampler[shader][i]);
+      const struct svga_sampler_state *sampler = svga->curr.sampler[shader][i];
+      if (view && sampler) {
          assert(view->texture);
          assert(view->texture->target < (1 << 4)); /* texture_target:4 */
 
@@ -195,7 +201,7 @@ svga_init_shader_key_common(const struct svga_context *svga, unsigned shader,
             }
          }
 
-         if (!svga->curr.sampler[shader][i]->normalized_coords) {
+         if (!sampler->normalized_coords) {
             assert(idx < (1 << 5));  /* width_height_idx:5 bitfield */
             key->tex[i].width_height_idx = idx++;
             key->tex[i].unnormalized = TRUE;
@@ -208,7 +214,6 @@ svga_init_shader_key_common(const struct svga_context *svga, unsigned shader,
          key->tex[i].swizzle_a = view->swizzle_a;
       }
    }
-   key->num_textures = svga->curr.num_sampler_views[shader];
 }
 
 
diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am
index b67413a..3459af3 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -22,7 +22,7 @@
 include Makefile.sources
 include $(top_srcdir)/src/gallium/Automake.inc
 
-AM_CXXFLAGS = $(GALLIUM_DRIVER_CFLAGS) -std=c++11
+AM_CXXFLAGS = $(GALLIUM_DRIVER_CFLAGS) $(SWR_CXX11_CXXFLAGS)
 
 noinst_LTLIBRARIES = libmesaswr.la
 
@@ -31,7 +31,7 @@ libmesaswr_la_SOURCES = $(LOADER_SOURCES)
 COMMON_CXXFLAGS = \
 	$(GALLIUM_DRIVER_CFLAGS) \
 	$(LLVM_CXXFLAGS) \
-	-std=c++11 \
+	$(SWR_CXX11_CXXFLAGS) \
 	-I$(builddir)/rasterizer/scripts \
 	-I$(builddir)/rasterizer/jitter \
 	-I$(srcdir)/rasterizer \
@@ -148,7 +148,7 @@ distclean-local:
 lib_LTLIBRARIES = libswrAVX.la libswrAVX2.la
 
 libswrAVX_la_CXXFLAGS = \
-	-march=core-avx-i \
+	$(SWR_AVX_CXXFLAGS) \
 	-DKNOB_ARCH=KNOB_ARCH_AVX \
 	$(COMMON_CXXFLAGS)
 
@@ -171,7 +171,7 @@ libswrAVX_la_LDFLAGS = \
 	$(COMMON_LDFLAGS)
 
 libswrAVX2_la_CXXFLAGS = \
-	-march=core-avx2 \
+	$(SWR_AVX2_CXXFLAGS) \
 	-DKNOB_ARCH=KNOB_ARCH_AVX2 \
 	$(COMMON_CXXFLAGS)
 
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.c b/src/gallium/drivers/vc4/vc4_bufmgr.c
index 21e3bde..f6bacfd 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.c
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.c
@@ -28,6 +28,7 @@
 #include <xf86drm.h>
 #include <xf86drmMode.h>
 
+#include "util/u_hash_table.h"
 #include "util/u_memory.h"
 #include "util/ralloc.h"
 
@@ -329,10 +330,19 @@ vc4_bo_open_handle(struct vc4_screen *screen,
                    uint32_t winsys_stride,
                    uint32_t handle, uint32_t size)
 {
-        struct vc4_bo *bo = CALLOC_STRUCT(vc4_bo);
+        struct vc4_bo *bo;
 
         assert(size);
 
+        pipe_mutex_lock(screen->bo_handles_mutex);
+
+        bo = util_hash_table_get(screen->bo_handles, (void*)(uintptr_t)handle);
+        if (bo) {
+                pipe_reference(NULL, &bo->reference);
+                goto done;
+        }
+
+        bo = CALLOC_STRUCT(vc4_bo);
         pipe_reference_init(&bo->reference, 1);
         bo->screen = screen;
         bo->handle = handle;
@@ -347,6 +357,10 @@ vc4_bo_open_handle(struct vc4_screen *screen,
         bo->map = malloc(bo->size);
 #endif
 
+        util_hash_table_set(screen->bo_handles, (void *)(uintptr_t)handle, bo);
+
+done:
+        pipe_mutex_unlock(screen->bo_handles_mutex);
         return bo;
 }
 
@@ -399,7 +413,11 @@ vc4_bo_get_dmabuf(struct vc4_bo *bo)
                         bo->handle);
                 return -1;
         }
+
+        pipe_mutex_lock(bo->screen->bo_handles_mutex);
         bo->private = false;
+        util_hash_table_set(bo->screen->bo_handles, (void *)(uintptr_t)bo->handle, bo);
+        pipe_mutex_unlock(bo->screen->bo_handles_mutex);
 
         return fd;
 }
diff --git a/src/gallium/drivers/vc4/vc4_bufmgr.h b/src/gallium/drivers/vc4/vc4_bufmgr.h
index b77506e..71a4426 100644
--- a/src/gallium/drivers/vc4/vc4_bufmgr.h
+++ b/src/gallium/drivers/vc4/vc4_bufmgr.h
@@ -25,6 +25,7 @@
 #define VC4_BUFMGR_H
 
 #include <stdint.h>
+#include "util/u_hash_table.h"
 #include "util/u_inlines.h"
 #include "vc4_qir.h"
 
@@ -87,11 +88,27 @@ vc4_bo_reference(struct vc4_bo *bo)
 static inline void
 vc4_bo_unreference(struct vc4_bo **bo)
 {
+        struct vc4_screen *screen;
         if (!*bo)
                 return;
 
-        if (pipe_reference(&(*bo)->reference, NULL))
-                vc4_bo_last_unreference(*bo);
+        if ((*bo)->private) {
+                /* Avoid the mutex for private BOs */
+                if (pipe_reference(&(*bo)->reference, NULL))
+                        vc4_bo_last_unreference(*bo);
+        } else {
+                screen = (*bo)->screen;
+                pipe_mutex_lock(screen->bo_handles_mutex);
+
+                if (pipe_reference(&(*bo)->reference, NULL)) {
+                        util_hash_table_remove(screen->bo_handles,
+                                               (void *)(uintptr_t)(*bo)->handle);
+                        vc4_bo_last_unreference(*bo);
+                }
+
+                pipe_mutex_unlock(screen->bo_handles_mutex);
+        }
+
         *bo = NULL;
 }
 
diff --git a/src/gallium/drivers/vc4/vc4_context.h b/src/gallium/drivers/vc4/vc4_context.h
index c271a95..77013d9 100644
--- a/src/gallium/drivers/vc4/vc4_context.h
+++ b/src/gallium/drivers/vc4/vc4_context.h
@@ -143,6 +143,8 @@ struct vc4_compiled_shader {
         /** bitmask of which inputs are color inputs, for flat shade handling. */
         uint32_t color_inputs;
 
+        bool disable_early_z;
+
         uint8_t num_inputs;
 
         /* Byte offsets for the start of the vertex attributes 0-7, and the
diff --git a/src/gallium/drivers/vc4/vc4_emit.c b/src/gallium/drivers/vc4/vc4_emit.c
index 5d64797..8b192da 100644
--- a/src/gallium/drivers/vc4/vc4_emit.c
+++ b/src/gallium/drivers/vc4/vc4_emit.c
@@ -71,7 +71,9 @@ vc4_emit_state(struct pipe_context *pctx)
                 vc4->draw_max_y = MAX2(vc4->draw_max_y, maxy);
         }
 
-        if (vc4->dirty & (VC4_DIRTY_RASTERIZER | VC4_DIRTY_ZSA)) {
+        if (vc4->dirty & (VC4_DIRTY_RASTERIZER |
+                          VC4_DIRTY_ZSA |
+                          VC4_DIRTY_COMPILED_FS)) {
                 uint8_t ez_enable_mask_out = ~0;
 
                 /* HW-2905: If the RCL ends up doing a full-res load when
@@ -83,7 +85,7 @@ vc4_emit_state(struct pipe_context *pctx)
                  * was seeing bad rendering on glxgears -samples 4 even in
                  * that case.
                  */
-                if (vc4->msaa)
+                if (vc4->msaa || vc4->prog.fs->disable_early_z)
                         ez_enable_mask_out &= ~VC4_CONFIG_BITS_EARLY_Z;
 
                 cl_u8(&bcl, VC4_PACKET_CONFIGURATION_BITS);
diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm.c b/src/gallium/drivers/vc4/vc4_opt_vpm.c
index d31b673..5df798a 100644
--- a/src/gallium/drivers/vc4/vc4_opt_vpm.c
+++ b/src/gallium/drivers/vc4/vc4_opt_vpm.c
@@ -110,11 +110,12 @@ qir_opt_vpm(struct vc4_compile *c)
                          * sources are independent of previous instructions
                          */
                         if (temps == 1) {
-                                list_del(&inst->link);
                                 inst->src[j] = mov->src[0];
-                                list_replace(&mov->link, &inst->link);
-                                c->defs[temp] = NULL;
-                                free(mov);
+
+                                list_del(&inst->link);
+                                list_addtail(&inst->link, &mov->link);
+                                qir_remove_instruction(c, mov);
+
                                 progress = true;
                                 break;
                         }
diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c
index 5d036eb..3572cf7 100644
--- a/src/gallium/drivers/vc4/vc4_program.c
+++ b/src/gallium/drivers/vc4/vc4_program.c
@@ -2032,6 +2032,11 @@ vc4_get_compiled_shader(struct vc4_context *vc4, enum qstage stage,
                         shader->input_slots[shader->num_inputs] = *slot;
                         shader->num_inputs++;
                 }
+
+                /* Note: the temporary clone in c->s has been freed. */
+                nir_shader *orig_shader = key->shader_state->base.ir.nir;
+                if (orig_shader->info.outputs_written & (1 << FRAG_RESULT_DEPTH))
+                        shader->disable_early_z = true;
         } else {
                 shader->num_inputs = c->num_inputs;
 
diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c
index 20f137a..aabe593 100644
--- a/src/gallium/drivers/vc4/vc4_resource.c
+++ b/src/gallium/drivers/vc4/vc4_resource.c
@@ -534,8 +534,8 @@ vc4_resource_from_handle(struct pipe_screen *pscreen,
         struct vc4_resource *rsc = vc4_resource_setup(pscreen, tmpl);
         struct pipe_resource *prsc = &rsc->base.b;
         struct vc4_resource_slice *slice = &rsc->slices[0];
-        uint32_t expected_stride = align(prsc->width0 / rsc->cpp,
-                                         vc4_utile_width(rsc->cpp));
+        uint32_t expected_stride =
+            align(prsc->width0, vc4_utile_width(rsc->cpp)) * rsc->cpp;
 
         if (!rsc)
                 return NULL;
diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c
index 733275a..0fb2bbc 100644
--- a/src/gallium/drivers/vc4/vc4_screen.c
+++ b/src/gallium/drivers/vc4/vc4_screen.c
@@ -30,6 +30,7 @@
 #include "util/u_debug.h"
 #include "util/u_memory.h"
 #include "util/u_format.h"
+#include "util/u_hash_table.h"
 #include "util/ralloc.h"
 
 #include "vc4_screen.h"
@@ -82,7 +83,11 @@ vc4_screen_get_vendor(struct pipe_screen *pscreen)
 static void
 vc4_screen_destroy(struct pipe_screen *pscreen)
 {
+        struct vc4_screen *screen = vc4_screen(pscreen);
+
+        util_hash_table_destroy(screen->bo_handles);
         vc4_bufmgr_destroy(pscreen);
+        close(screen->fd);
         ralloc_free(pscreen);
 }
 
@@ -488,6 +493,18 @@ vc4_screen_is_format_supported(struct pipe_screen *pscreen,
         return retval == usage;
 }
 
+#define PTR_TO_UINT(x) ((unsigned)((intptr_t)(x)))
+
+static unsigned handle_hash(void *key)
+{
+    return PTR_TO_UINT(key);
+}
+
+static int handle_compare(void *key1, void *key2)
+{
+    return PTR_TO_UINT(key1) != PTR_TO_UINT(key2);
+}
+
 struct pipe_screen *
 vc4_screen_create(int fd)
 {
@@ -505,6 +522,8 @@ vc4_screen_create(int fd)
 
         screen->fd = fd;
         list_inithead(&screen->bo_cache.time_list);
+        pipe_mutex_init(screen->bo_handles_mutex);
+        screen->bo_handles = util_hash_table_create(handle_hash, handle_compare);
 
         vc4_fence_init(screen);
 
diff --git a/src/gallium/drivers/vc4/vc4_screen.h b/src/gallium/drivers/vc4/vc4_screen.h
index 03f76b2..281d254 100644
--- a/src/gallium/drivers/vc4/vc4_screen.h
+++ b/src/gallium/drivers/vc4/vc4_screen.h
@@ -73,6 +73,9 @@ struct vc4_screen {
                 uint32_t bo_count;
         } bo_cache;
 
+        struct util_hash_table *bo_handles;
+        pipe_mutex bo_handles_mutex;
+
         uint32_t bo_size;
         uint32_t bo_count;
 };
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index ca5812ba..396f563 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -318,17 +318,7 @@ struct pipe_blend_state
 
 struct pipe_blend_color
 {
-   /**
-    * Making the color array explicitly 16-byte aligned provides a hint to
-    * compilers to make more efficient auto-vectorization optimizations.
-    * The actual performance gains from vectorizing the blend color array are
-    * fairly minimal, if any, but the alignment is necessary to work around
-    * buggy vectorization in some compilers which fail to generate the correct
-    * unaligned accessors resulting in a segfault.  Specifically several
-    * versions of the Intel compiler are known to be affected but it's likely
-    * others are as well.
-    */
-   PIPE_ALIGN_VAR(16) float color[4];
+   float color[4];
 };
 
 
diff --git a/src/gallium/state_trackers/clover/llvm/invocation.cpp b/src/gallium/state_trackers/clover/llvm/invocation.cpp
index e2cadda..57eaaaa 100644
--- a/src/gallium/state_trackers/clover/llvm/invocation.cpp
+++ b/src/gallium/state_trackers/clover/llvm/invocation.cpp
@@ -207,7 +207,7 @@ namespace {
       c.getDiagnosticOpts().ShowCarets = false;
       c.getInvocation().setLangDefaults(c.getLangOpts(), clang::IK_OpenCL,
 #if HAVE_LLVM >= 0x0309
-                                        llvm::Triple(triple),
+                                        llvm::Triple(triple), c.getPreprocessorOpts(),
 #endif
                                         clang::LangStandard::lang_opencl11);
       c.createDiagnostics(
diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c
index e1afc4d..a07ecd1 100644
--- a/src/gallium/state_trackers/dri/dri2.c
+++ b/src/gallium/state_trackers/dri/dri2.c
@@ -30,6 +30,7 @@
 
 #include <xf86drm.h>
 #include <dlfcn.h>
+#include <fcntl.h>
 #include "GL/mesa_glinterop.h"
 #include "util/u_memory.h"
 #include "util/u_inlines.h"
@@ -979,8 +980,10 @@ dri2_query_image(__DRIimage *image, int attrib, int *value)
       return GL_TRUE;
    case __DRI_IMAGE_ATTRIB_FD:
       whandle.type= DRM_API_HANDLE_TYPE_FD;
-      image->texture->screen->resource_get_handle(image->texture->screen,
-         image->texture, &whandle, usage);
+      if (!image->texture->screen->resource_get_handle(image->texture->screen,
+            image->texture, &whandle, usage))
+         return GL_FALSE;
+
       *value = whandle.handle;
       return GL_TRUE;
    case __DRI_IMAGE_ATTRIB_FORMAT:
@@ -1798,7 +1801,7 @@ dri2_init_screen(__DRIscreen * sPriv)
 
    sPriv->driverPrivate = (void *)screen;
 
-   if (screen->fd < 0 || (fd = dup(screen->fd)) < 0)
+   if (screen->fd < 0 || (fd = fcntl(screen->fd, F_DUPFD_CLOEXEC, 3)) < 0)
       goto free_screen;
 
    pscreen = load_pipe_screen(&screen->dev, screen->fd);
@@ -1879,7 +1882,7 @@ dri_kms_init_screen(__DRIscreen * sPriv)
 
    sPriv->driverPrivate = (void *)screen;
 
-   if (screen->fd < 0 || (fd = dup(screen->fd)) < 0)
+   if (screen->fd < 0 || (fd = fcntl(screen->fd, F_DUPFD_CLOEXEC, 3)) < 0)
       goto free_screen;
 
    if (pipe_loader_sw_probe_kms(&screen->dev, fd))
diff --git a/src/gallium/state_trackers/nine/nine_shader.c b/src/gallium/state_trackers/nine/nine_shader.c
index bd373d7..e2855d7 100644
--- a/src/gallium/state_trackers/nine/nine_shader.c
+++ b/src/gallium/state_trackers/nine/nine_shader.c
@@ -2031,7 +2031,7 @@ DECL_SPECIAL(DCL)
             ureg_DECL_vs_input(ureg, sem.reg.idx);
             assert(sem.reg.idx < ARRAY_SIZE(tx->info->input_map));
             tx->info->input_map[sem.reg.idx] = sm1_to_nine_declusage(&sem);
-            tx->info->num_inputs = sem.reg.idx + 1;
+            tx->info->num_inputs = MAX2(tx->info->num_inputs, sem.reg.idx + 1);
             /* NOTE: preserving order in case of indirect access */
         } else
         if (tx->version.major >= 3) {
diff --git a/src/gallium/state_trackers/nine/surface9.c b/src/gallium/state_trackers/nine/surface9.c
index 2606dbf..aff4d4c 100644
--- a/src/gallium/state_trackers/nine/surface9.c
+++ b/src/gallium/state_trackers/nine/surface9.c
@@ -405,6 +405,7 @@ NineSurface9_LockRect( struct NineSurface9 *This,
     } else {
         u_box_origin_2d(This->desc.Width, This->desc.Height, &box);
     }
+    box.z = This->layer;
 
     user_warn(This->desc.Format == D3DFMT_NULL);
 
diff --git a/src/gallium/state_trackers/nine/volume9.c b/src/gallium/state_trackers/nine/volume9.c
index 1fdc638..faeeec1 100644
--- a/src/gallium/state_trackers/nine/volume9.c
+++ b/src/gallium/state_trackers/nine/volume9.c
@@ -374,7 +374,7 @@ NineVolume9_UnlockBox( struct NineVolume9 *This )
                                         This->layer_stride_conversion,
                                         0, 0, 0,
                                         This->desc.Width, This->desc.Height,
-                                        This->desc.Height);
+                                        This->desc.Depth);
 
         if (!This->data)
             pipe_transfer_unmap(This->pipe, transfer);
diff --git a/src/gallium/state_trackers/omx/vid_enc.c b/src/gallium/state_trackers/omx/vid_enc.c
index d70439a..c9d9ab1 100644
--- a/src/gallium/state_trackers/omx/vid_enc.c
+++ b/src/gallium/state_trackers/omx/vid_enc.c
@@ -874,8 +874,8 @@ static void enc_ReleaseTasks(struct list_head *head)
 {
    struct encode_task *i, *next;
 
-   if (!head)
-	   return;
+   if (!head || !head->next)
+      return;
 
    LIST_FOR_EACH_ENTRY_SAFE(i, next, head, list) {
       pipe_resource_reference(&i->bitstream, NULL);
diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c
index 8a6a397..1ad0d71 100644
--- a/src/gallium/state_trackers/va/surface.c
+++ b/src/gallium/state_trackers/va/surface.c
@@ -584,24 +584,26 @@ vlVaCreateSurfaces2(VADriverContextP ctx, unsigned int format,
 
    memset(&templat, 0, sizeof(templat));
 
+   templat.buffer_format = pscreen->get_video_param(
+      pscreen,
+      PIPE_VIDEO_PROFILE_UNKNOWN,
+      PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
+      PIPE_VIDEO_CAP_PREFERED_FORMAT
+   );
+   templat.interlaced = pscreen->get_video_param(
+      pscreen,
+      PIPE_VIDEO_PROFILE_UNKNOWN,
+      PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
+      PIPE_VIDEO_CAP_PREFERS_INTERLACED
+   );
+
    if (expected_fourcc) {
-      templat.buffer_format = VaFourccToPipeFormat(expected_fourcc);
-      templat.interlaced = 0;
-   } else {
-      templat.buffer_format = pscreen->get_video_param
-            (
-               pscreen,
-               PIPE_VIDEO_PROFILE_UNKNOWN,
-               PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
-               PIPE_VIDEO_CAP_PREFERED_FORMAT
-               );
-      templat.interlaced = pscreen->get_video_param
-            (
-               pscreen,
-               PIPE_VIDEO_PROFILE_UNKNOWN,
-               PIPE_VIDEO_ENTRYPOINT_BITSTREAM,
-               PIPE_VIDEO_CAP_PREFERS_INTERLACED
-               );
+      enum pipe_format expected_format = VaFourccToPipeFormat(expected_fourcc);
+
+      if (expected_format != templat.buffer_format || memory_attibute)
+        templat.interlaced = 0;
+
+      templat.buffer_format = expected_format;
    }
 
    templat.chroma_format = ChromaToPipe(format);
diff --git a/src/gallium/state_trackers/vdpau/output.c b/src/gallium/state_trackers/vdpau/output.c
index c644cc8..b278288 100644
--- a/src/gallium/state_trackers/vdpau/output.c
+++ b/src/gallium/state_trackers/vdpau/output.c
@@ -82,7 +82,7 @@ vlVdpOutputSurfaceCreate(VdpDevice device,
    res_tmpl.depth0 = 1;
    res_tmpl.array_size = 1;
    res_tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET |
-                   PIPE_BIND_LINEAR | PIPE_BIND_SHARED;
+                   PIPE_BIND_SHARED;
    res_tmpl.usage = PIPE_USAGE_DEFAULT;
 
    pipe_mutex_lock(dev->mutex);
diff --git a/src/gallium/state_trackers/xa/xa_tracker.c b/src/gallium/state_trackers/xa/xa_tracker.c
index e091b083..c97c0ff 100644
--- a/src/gallium/state_trackers/xa/xa_tracker.c
+++ b/src/gallium/state_trackers/xa/xa_tracker.c
@@ -27,6 +27,7 @@
  */
 
 #include <unistd.h>
+#include <fcntl.h>
 #include "xa_tracker.h"
 #include "xa_priv.h"
 #include "pipe/p_state.h"
@@ -157,7 +158,7 @@ xa_tracker_create(int drm_fd)
     if (!xa)
 	return NULL;
 
-    if (drm_fd < 0 || (fd = dup(drm_fd)) < 0)
+    if (drm_fd < 0 || (fd = fcntl(drm_fd, F_DUPFD_CLOEXEC, 3)) < 0)
 	goto out_no_fd;
 
     if (pipe_loader_drm_probe_fd(&xa->dev, fd))
diff --git a/src/gallium/targets/dri/Makefile.am b/src/gallium/targets/dri/Makefile.am
index f42dd25..06ade45 100644
--- a/src/gallium/targets/dri/Makefile.am
+++ b/src/gallium/targets/dri/Makefile.am
@@ -1,5 +1,11 @@
 include $(top_srcdir)/src/gallium/Automake.inc
 
+if HAVE_ANDROID
+if HAVE_SHARED_GLAPI
+SHARED_GLAPI_LIB = $(top_builddir)/src/mapi/shared-glapi/libglapi.la
+endif
+endif
+
 AM_CFLAGS = \
 	-I$(top_srcdir)/src/mapi \
 	-I$(top_srcdir)/src/mesa \
@@ -49,6 +55,7 @@ gallium_dri_la_LIBADD = \
 	$(top_builddir)/src/gallium/drivers/noop/libnoop.la \
 	$(top_builddir)/src/gallium/drivers/rbug/librbug.la \
 	$(top_builddir)/src/gallium/drivers/trace/libtrace.la \
+	$(SHARED_GLAPI_LIB) \
 	$(SELINUX_LIBS) \
 	$(EXPAT_LIBS) \
 	$(LIBDRM_LIBS) \
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
index 0ce010e..3f6e280 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_surface.c
@@ -253,6 +253,20 @@ static int compute_level(struct amdgpu_winsys *ws,
    return 0;
 }
 
+static unsigned cik_get_macro_tile_index(struct radeon_surf *surf)
+{
+	unsigned index, tileb;
+
+	tileb = 8 * 8 * surf->bpe;
+	tileb = MIN2(surf->tile_split, tileb);
+
+	for (index = 0; tileb > 64; index++)
+		tileb >>= 1;
+
+	assert(index < 16);
+	return index;
+}
+
 static int amdgpu_surface_init(struct radeon_winsys *rws,
                                struct radeon_surf *surf)
 {
@@ -345,7 +359,8 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
    AddrSurfInfoIn.flags.dccCompatible = !(surf->flags & RADEON_SURF_Z_OR_SBUFFER) &&
                                         !(surf->flags & RADEON_SURF_SCANOUT) &&
                                         !(surf->flags & RADEON_SURF_DISABLE_DCC) &&
-                                        !compressed && AddrDccIn.numSamples <= 1;
+                                        !compressed && AddrDccIn.numSamples <= 1 &&
+                                        surf->last_level == 0;
 
    /* This disables incorrect calculations (hacks) in addrlib. */
    AddrSurfInfoIn.flags.noStencil = 1;
@@ -380,6 +395,9 @@ static int amdgpu_surface_init(struct radeon_winsys *rws,
          AddrSurfInfoIn.tileIndex = 10; /* 2D displayable */
       else
          AddrSurfInfoIn.tileIndex = 14; /* 2D non-displayable */
+
+      /* Addrlib doesn't set this if tileIndex is forced like above. */
+      AddrSurfInfoOut.macroModeIndex = cik_get_macro_tile_index(surf);
    }
 
    surf->bo_size = 0;
diff --git a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
index 598ffcb..52b873f 100644
--- a/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
+++ b/src/gallium/winsys/nouveau/drm/nouveau_drm_winsys.c
@@ -1,5 +1,6 @@
 #include <sys/stat.h>
 #include <unistd.h>
+#include <fcntl.h>
 #include "pipe/p_context.h"
 #include "pipe/p_state.h"
 #include "util/u_format.h"
@@ -91,7 +92,7 @@ nouveau_drm_screen_create(int fd)
 	 * nouveau_device_wrap does not close the fd in case of a device
 	 * creation error.
 	 */
-	dupfd = dup(fd);
+	dupfd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
 
 	ret = nouveau_drm_new(dupfd, &drm);
 	if (ret)
diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
index 5c85c8f..7619873 100644
--- a/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
+++ b/src/gallium/winsys/radeon/drm/radeon_drm_winsys.c
@@ -44,6 +44,7 @@
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <unistd.h>
+#include <fcntl.h>
 #include <radeon_surface.h>
 
 #ifndef RADEON_INFO_ACTIVE_CU_COUNT
@@ -790,7 +791,7 @@ radeon_drm_winsys_create(int fd, radeon_screen_create_t screen_create)
         return NULL;
     }
 
-    ws->fd = dup(fd);
+    ws->fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
 
     if (!do_winsys_init(ws))
         goto fail1;
diff --git a/src/gallium/winsys/svga/drm/vmw_screen.c b/src/gallium/winsys/svga/drm/vmw_screen.c
index 7fcb6d2..d0bfcd7 100644
--- a/src/gallium/winsys/svga/drm/vmw_screen.c
+++ b/src/gallium/winsys/svga/drm/vmw_screen.c
@@ -31,9 +31,15 @@
 #include "util/u_memory.h"
 #include "pipe/p_compiler.h"
 #include "util/u_hash_table.h"
-#include <sys/types.h>
+#ifdef MAJOR_IN_MKDEV
+#include <sys/mkdev.h>
+#endif
+#ifdef MAJOR_IN_SYSMACROS
+#include <sys/sysmacros.h>
+#endif
 #include <sys/stat.h>
 #include <unistd.h>
+#include <fcntl.h>
 
 static struct util_hash_table *dev_hash = NULL;
 
@@ -83,7 +89,7 @@ vmw_winsys_create( int fd )
 
    vws->device = stat_buf.st_rdev;
    vws->open_count = 1;
-   vws->ioctl.drm_fd = dup(fd);
+   vws->ioctl.drm_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
    vws->base.have_gb_dma = TRUE;
    vws->base.need_to_rebind_resources = FALSE;
 
diff --git a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
index 21ac0d7..07eca99 100644
--- a/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
+++ b/src/gallium/winsys/sw/kms-dri/kms_dri_sw_winsys.c
@@ -211,7 +211,29 @@ kms_sw_displaytarget_map(struct sw_winsys *ws,
 }
 
 static struct kms_sw_displaytarget *
-kms_sw_displaytarget_add_from_prime(struct kms_sw_winsys *kms_sw, int fd)
+kms_sw_displaytarget_find_and_ref(struct kms_sw_winsys *kms_sw,
+                                  unsigned int kms_handle)
+{
+   struct kms_sw_displaytarget *kms_sw_dt;
+
+   LIST_FOR_EACH_ENTRY(kms_sw_dt, &kms_sw->bo_list, link) {
+      if (kms_sw_dt->handle == kms_handle) {
+         kms_sw_dt->ref_count++;
+
+         DEBUG_PRINT("KMS-DEBUG: imported buffer %u (size %u)\n",
+                     kms_sw_dt->handle, kms_sw_dt->size);
+
+         return kms_sw_dt;
+      }
+   }
+
+   return NULL;
+}
+
+static struct kms_sw_displaytarget *
+kms_sw_displaytarget_add_from_prime(struct kms_sw_winsys *kms_sw, int fd,
+                                    unsigned width, unsigned height,
+                                    unsigned stride)
 {
    uint32_t handle = -1;
    struct kms_sw_displaytarget * kms_sw_dt;
@@ -222,6 +244,10 @@ kms_sw_displaytarget_add_from_prime(struct kms_sw_winsys *kms_sw, int fd)
    if (ret)
       return NULL;
 
+   kms_sw_dt = kms_sw_displaytarget_find_and_ref(kms_sw, handle);
+   if (kms_sw_dt)
+      return kms_sw_dt;
+
    kms_sw_dt = CALLOC_STRUCT(kms_sw_displaytarget);
    if (!kms_sw_dt)
       return NULL;
@@ -229,6 +255,9 @@ kms_sw_displaytarget_add_from_prime(struct kms_sw_winsys *kms_sw, int fd)
    kms_sw_dt->ref_count = 1;
    kms_sw_dt->handle = handle;
    kms_sw_dt->size = lseek(fd, 0, SEEK_END);
+   kms_sw_dt->width = width;
+   kms_sw_dt->height = height;
+   kms_sw_dt->stride = stride;
 
    if (kms_sw_dt->size == (off_t)-1) {
       FREE(kms_sw_dt);
@@ -274,25 +303,18 @@ kms_sw_displaytarget_from_handle(struct sw_winsys *ws,
 
    switch(whandle->type) {
    case DRM_API_HANDLE_TYPE_FD:
-      kms_sw_dt = kms_sw_displaytarget_add_from_prime(kms_sw, whandle->handle);
-      if (kms_sw_dt) {
-         kms_sw_dt->ref_count++;
-         kms_sw_dt->width = templ->width0;
-         kms_sw_dt->height = templ->height0;
-         kms_sw_dt->stride = whandle->stride;
+      kms_sw_dt = kms_sw_displaytarget_add_from_prime(kms_sw, whandle->handle,
+                                                      templ->width0,
+                                                      templ->height0,
+                                                      whandle->stride);
+      if (kms_sw_dt)
          *stride = kms_sw_dt->stride;
-      }
       return (struct sw_displaytarget *)kms_sw_dt;
    case DRM_API_HANDLE_TYPE_KMS:
-      LIST_FOR_EACH_ENTRY(kms_sw_dt, &kms_sw->bo_list, link) {
-         if (kms_sw_dt->handle == whandle->handle) {
-            kms_sw_dt->ref_count++;
-
-            DEBUG_PRINT("KMS-DEBUG: imported buffer %u (size %u)\n", kms_sw_dt->handle, kms_sw_dt->size);
-
-            *stride = kms_sw_dt->stride;
-            return (struct sw_displaytarget *)kms_sw_dt;
-         }
+      kms_sw_dt = kms_sw_displaytarget_find_and_ref(kms_sw, whandle->handle);
+      if (kms_sw_dt) {
+         *stride = kms_sw_dt->stride;
+         return (struct sw_displaytarget *)kms_sw_dt;
       }
       /* fallthrough */
    default:
diff --git a/src/gallium/winsys/vc4/drm/vc4_drm_winsys.c b/src/gallium/winsys/vc4/drm/vc4_drm_winsys.c
index c5434ad..23fe8e7 100644
--- a/src/gallium/winsys/vc4/drm/vc4_drm_winsys.c
+++ b/src/gallium/winsys/vc4/drm/vc4_drm_winsys.c
@@ -22,6 +22,7 @@
  */
 
 #include <unistd.h>
+#include <fcntl.h>
 
 #include "vc4_drm_public.h"
 
@@ -30,5 +31,5 @@
 struct pipe_screen *
 vc4_drm_screen_create(int fd)
 {
-	return vc4_screen_create(dup(fd));
+	return vc4_screen_create(fcntl(fd, F_DUPFD_CLOEXEC, 3));
 }
diff --git a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
index dc203cd..f866b24 100644
--- a/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
+++ b/src/gallium/winsys/virgl/drm/virgl_drm_winsys.c
@@ -903,7 +903,7 @@ virgl_drm_screen_create(int fd)
       virgl_screen(pscreen)->refcnt++;
    } else {
       struct virgl_winsys *vws;
-      int dup_fd = dup(fd);
+      int dup_fd = fcntl(fd, F_DUPFD_CLOEXEC, 3);
 
       vws = virgl_drm_winsys_create(dup_fd);
 
diff --git a/src/gbm/backends/dri/gbm_dri.c b/src/gbm/backends/dri/gbm_dri.c
index c3626e3..2330f1b 100644
--- a/src/gbm/backends/dri/gbm_dri.c
+++ b/src/gbm/backends/dri/gbm_dri.c
@@ -589,7 +589,8 @@ gbm_dri_bo_get_fd(struct gbm_bo *_bo)
    if (bo->image == NULL)
       return -1;
 
-   dri->image->queryImage(bo->image, __DRI_IMAGE_ATTRIB_FD, &fd);
+   if (!dri->image->queryImage(bo->image, __DRI_IMAGE_ATTRIB_FD, &fd))
+      return -1;
 
    return fd;
 }
@@ -941,7 +942,7 @@ gbm_dri_bo_map(struct gbm_bo *_bo,
       return *map_data;
    }
 
-   if (!dri->image || dri->image->base.version < 12) {
+   if (!dri->image || dri->image->base.version < 12 || !dri->image->mapImage) {
       errno = ENOSYS;
       return NULL;
    }
@@ -972,7 +973,8 @@ gbm_dri_bo_unmap(struct gbm_bo *_bo, void *map_data)
       return;
    }
 
-   if (!dri->context || !dri->image || dri->image->base.version < 12)
+   if (!dri->context || !dri->image ||
+       dri->image->base.version < 12 || !dri->image->unmapImage)
       return;
 
    dri->image->unmapImage(dri->context, bo->image, map_data);
diff --git a/src/gbm/main/gbm.c b/src/gbm/main/gbm.c
index 0f4657a..c3a2ec33 100644
--- a/src/gbm/main/gbm.c
+++ b/src/gbm/main/gbm.c
@@ -31,7 +31,12 @@
 #include <string.h>
 #include <stdint.h>
 
-#include <sys/types.h>
+#ifdef MAJOR_IN_MKDEV
+#include <sys/mkdev.h>
+#endif
+#ifdef MAJOR_IN_SYSMACROS
+#include <sys/sysmacros.h>
+#endif
 #include <sys/stat.h>
 #include <unistd.h>
 #include <errno.h>
@@ -237,7 +242,8 @@ gbm_bo_get_handle(struct gbm_bo *bo)
  * descriptor.
 
  * \param bo The buffer object
- * \return Returns a file descriptor referring  to the underlying buffer
+ * \return Returns a file descriptor referring to the underlying buffer or -1
+ * if an error occurs.
  */
 GBM_EXPORT int
 gbm_bo_get_fd(struct gbm_bo *bo)
diff --git a/src/glx/dri3_glx.c b/src/glx/dri3_glx.c
index 90d7bba..51b6b1c 100644
--- a/src/glx/dri3_glx.c
+++ b/src/glx/dri3_glx.c
@@ -132,6 +132,16 @@ glx_dri3_get_dri_context(struct loader_dri3_drawable *draw)
    return (gc != &dummyContext) ? dri3Ctx->driContext : NULL;
 }
 
+static __DRIscreen *
+glx_dri3_get_dri_screen(struct loader_dri3_drawable *draw)
+{
+   struct glx_context *gc = __glXGetCurrentContext();
+   struct dri3_context *pcp = (struct dri3_context *) gc;
+   struct dri3_screen *psc = (struct dri3_screen *) pcp->base.psc;
+
+   return (gc != &dummyContext && psc) ? psc->driScreen : NULL;
+}
+
 static void
 glx_dri3_flush_drawable(struct loader_dri3_drawable *draw, unsigned flags)
 {
@@ -169,6 +179,7 @@ static struct loader_dri3_vtable glx_dri3_vtable = {
    .set_drawable_size = glx_dri3_set_drawable_size,
    .in_current_context = glx_dri3_in_current_context,
    .get_dri_context = glx_dri3_get_dri_context,
+   .get_dri_screen = glx_dri3_get_dri_screen,
    .flush_drawable = glx_dri3_flush_drawable,
    .show_fps = glx_dri3_show_fps,
 };
diff --git a/src/glx/glx_error.c b/src/glx/glx_error.c
index b3860db..653cbeb 100644
--- a/src/glx/glx_error.c
+++ b/src/glx/glx_error.c
@@ -39,11 +39,9 @@ __glXSendError(Display * dpy, int_fast8_t errorCode, uint_fast32_t resourceID,
                uint_fast16_t minorCode, bool coreX11error)
 {
    struct glx_display *glx_dpy = __glXInitialize(dpy);
-   struct glx_context *gc = __glXGetCurrentContext();
    xError error;
 
    assert(glx_dpy);
-   assert(gc);
 
    LockDisplay(dpy);
 
@@ -59,7 +57,7 @@ __glXSendError(Display * dpy, int_fast8_t errorCode, uint_fast32_t resourceID,
    error.sequenceNumber = dpy->request;
    error.resourceID = resourceID;
    error.minorCode = minorCode;
-   error.majorCode = gc ? gc->majorOpcode : 0;
+   error.majorCode = glx_dpy->majorOpcode;
 
    _XError(dpy, &error);
 
diff --git a/src/glx/glx_pbuffer.c b/src/glx/glx_pbuffer.c
index a0c1e3d..24c073c 100644
--- a/src/glx/glx_pbuffer.c
+++ b/src/glx/glx_pbuffer.c
@@ -328,7 +328,7 @@ GetDrawableAttribute(Display * dpy, GLXDrawable drawable,
        *   the calling thread's current context a GLXBadDrawable error is
        *   generated."
        */
-      if (pdraw == NULL || gc == NULL || gc->currentDpy != dpy ||
+      if (pdraw == NULL || gc == &dummyContext || gc->currentDpy != dpy ||
          (gc->currentDrawable != drawable &&
          gc->currentReadable != drawable)) {
          __glXSendError(dpy, GLXBadDrawable, drawable,
diff --git a/src/glx/glxcmds.c b/src/glx/glxcmds.c
index 3856032..5ff4dd3 100644
--- a/src/glx/glxcmds.c
+++ b/src/glx/glxcmds.c
@@ -524,7 +524,7 @@ glXWaitGL(void)
 {
    struct glx_context *gc = __glXGetCurrentContext();
 
-   if (gc && gc->vtable->wait_gl)
+   if (gc != &dummyContext && gc->vtable->wait_gl)
       gc->vtable->wait_gl(gc);
 }
 
@@ -537,7 +537,7 @@ glXWaitX(void)
 {
    struct glx_context *gc = __glXGetCurrentContext();
 
-   if (gc && gc->vtable->wait_x)
+   if (gc != &dummyContext && gc->vtable->wait_x)
       gc->vtable->wait_x(gc);
 }
 
@@ -546,7 +546,7 @@ glXUseXFont(Font font, int first, int count, int listBase)
 {
    struct glx_context *gc = __glXGetCurrentContext();
 
-   if (gc && gc->vtable->use_x_font)
+   if (gc != &dummyContext && gc->vtable->use_x_font)
       gc->vtable->use_x_font(gc, font, first, count, listBase);
 }
 
@@ -838,7 +838,7 @@ glXSwapBuffers(Display * dpy, GLXDrawable drawable)
       __GLXDRIdrawable *pdraw = GetGLXDRIDrawable(dpy, drawable);
 
       if (pdraw != NULL) {
-         Bool flush = gc && drawable == gc->currentDrawable;
+         Bool flush = gc != &dummyContext && drawable == gc->currentDrawable;
 
          (*pdraw->psc->driScreen->swapBuffers)(pdraw, 0, 0, 0, flush);
          return;
@@ -855,7 +855,7 @@ glXSwapBuffers(Display * dpy, GLXDrawable drawable)
     ** The calling thread may or may not have a current context.  If it
     ** does, send the context tag so the server can do a flush.
     */
-   if ((gc != NULL) && (dpy == gc->currentDpy) &&
+   if ((gc != &dummyContext) && (dpy == gc->currentDpy) &&
        ((drawable == gc->currentDrawable)
         || (drawable == gc->currentReadable))) {
       tag = gc->currentContextTag;
@@ -1388,7 +1388,7 @@ _GLX_PUBLIC Display *
 glXGetCurrentDisplay(void)
 {
    struct glx_context *gc = __glXGetCurrentContext();
-   if (NULL == gc)
+   if (gc == &dummyContext)
       return NULL;
    return gc->currentDpy;
 }
@@ -1630,7 +1630,6 @@ glXCreateNewContext(Display * dpy, GLXFBConfig fbconfig,
                     int renderType, GLXContext shareList, Bool allowDirect)
 {
    struct glx_config *config = (struct glx_config *) fbconfig;
-   int screen = DefaultScreen(dpy);
    struct glx_config **config_list;
    int list_size;
    unsigned i;
@@ -1641,7 +1640,7 @@ glXCreateNewContext(Display * dpy, GLXFBConfig fbconfig,
    }
 
    config_list = (struct glx_config **)
-      glXGetFBConfigs(dpy, screen, &list_size);
+      glXGetFBConfigs(dpy, config->screen, &list_size);
 
    for (i = 0; i < list_size; i++) {
        if (config_list[i] == config)
@@ -1751,7 +1750,7 @@ __glXSwapIntervalSGI(int interval)
    CARD32 *interval_ptr;
    CARD8 opcode;
 
-   if (gc == NULL) {
+   if (gc == &dummyContext) {
       return GLX_BAD_CONTEXT;
    }
 
@@ -1805,7 +1804,7 @@ __glXSwapIntervalMESA(unsigned int interval)
 #ifdef GLX_DIRECT_RENDERING
    struct glx_context *gc = __glXGetCurrentContext();
 
-   if (gc != NULL && gc->isDirect) {
+   if (gc != &dummyContext && gc->isDirect) {
       struct glx_screen *psc;
 
       psc = GetGLXScreenConfigs( gc->currentDpy, gc->screen);
@@ -1827,7 +1826,7 @@ __glXGetSwapIntervalMESA(void)
 #ifdef GLX_DIRECT_RENDERING
    struct glx_context *gc = __glXGetCurrentContext();
 
-   if (gc != NULL && gc->isDirect) {
+   if (gc != &dummyContext && gc->isDirect) {
       struct glx_screen *psc;
 
       psc = GetGLXScreenConfigs( gc->currentDpy, gc->screen);
@@ -1857,7 +1856,7 @@ __glXGetVideoSyncSGI(unsigned int *count)
    __GLXDRIdrawable *pdraw;
 #endif
 
-   if (!gc)
+   if (gc == &dummyContext)
       return GLX_BAD_CONTEXT;
 
 #ifdef GLX_DIRECT_RENDERING
@@ -1899,7 +1898,7 @@ __glXWaitVideoSyncSGI(int divisor, int remainder, unsigned int *count)
    if (divisor <= 0 || remainder < 0)
       return GLX_BAD_VALUE;
 
-   if (!gc)
+   if (gc == &dummyContext)
       return GLX_BAD_CONTEXT;
 
 #ifdef GLX_DIRECT_RENDERING
@@ -2212,7 +2211,7 @@ __glXSwapBuffersMscOML(Display * dpy, GLXDrawable drawable,
    struct glx_screen *psc = pdraw ? pdraw->psc : NULL;
 #endif
 
-   if (!gc) /* no GLX for this */
+   if (gc == &dummyContext) /* no GLX for this */
       return -1;
 
 #ifdef GLX_DIRECT_RENDERING
@@ -2392,7 +2391,7 @@ __glXCopySubBufferMESA(Display * dpy, GLXDrawable drawable,
     ** does, send the context tag so the server can do a flush.
     */
    gc = __glXGetCurrentContext();
-   if ((gc != NULL) && (dpy == gc->currentDpy) &&
+   if ((gc != &dummyContext) && (dpy == gc->currentDpy) &&
        ((drawable == gc->currentDrawable) ||
         (drawable == gc->currentReadable))) {
       tag = gc->currentContextTag;
@@ -2431,7 +2430,7 @@ __glXBindTexImageEXT(Display * dpy,
 {
    struct glx_context *gc = __glXGetCurrentContext();
 
-   if (gc == NULL || gc->vtable->bind_tex_image == NULL)
+   if (gc == &dummyContext || gc->vtable->bind_tex_image == NULL)
       return;
 
    gc->vtable->bind_tex_image(dpy, drawable, buffer, attrib_list);
@@ -2442,7 +2441,7 @@ __glXReleaseTexImageEXT(Display * dpy, GLXDrawable drawable, int buffer)
 {
    struct glx_context *gc = __glXGetCurrentContext();
 
-   if (gc == NULL || gc->vtable->release_tex_image == NULL)
+   if (gc == &dummyContext || gc->vtable->release_tex_image == NULL)
       return;
 
    gc->vtable->release_tex_image(dpy, drawable, buffer);
@@ -2718,7 +2717,7 @@ __glXGetUST(int64_t * ust)
 
 #if defined(GLX_DIRECT_RENDERING) && !defined(GLX_USE_APPLEGL)
 
-int
+PUBLIC int
 MesaGLInteropGLXQueryDeviceInfo(Display *dpy, GLXContext context,
                                 struct mesa_glinterop_device_info *out)
 {
@@ -2742,7 +2741,7 @@ MesaGLInteropGLXQueryDeviceInfo(Display *dpy, GLXContext context,
    return ret;
 }
 
-int
+PUBLIC int
 MesaGLInteropGLXExportObject(Display *dpy, GLXContext context,
                              struct mesa_glinterop_export_in *in,
                              struct mesa_glinterop_export_out *out)
diff --git a/src/glx/glxglvnd.c b/src/glx/glxglvnd.c
index b7252a7..098304d 100644
--- a/src/glx/glxglvnd.c
+++ b/src/glx/glxglvnd.c
@@ -19,13 +19,13 @@ static void *__glXGLVNDGetProcAddress(const GLubyte *procName)
 
 static unsigned FindGLXFunction(const GLubyte *name)
 {
-    unsigned first = 0;
-    unsigned last = DI_FUNCTION_COUNT - 1;
+    int first = 0;
+    int last = DI_FUNCTION_COUNT - 1;
 
     while (first <= last) {
-        unsigned middle = (first + last) / 2;
-        int comp = strcmp((const char *) name,
-                          __glXDispatchTableStrings[middle]);
+        int middle = (first + last) / 2;
+        int comp = strcmp(__glXDispatchTableStrings[middle],
+                          (const char *) name);
 
         if (comp < 0)
             first = middle + 1;
diff --git a/src/glx/query_renderer.c b/src/glx/query_renderer.c
index 9108ec2..4debf06 100644
--- a/src/glx/query_renderer.c
+++ b/src/glx/query_renderer.c
@@ -106,7 +106,7 @@ glXQueryCurrentRendererIntegerMESA(int attribute, unsigned int *value)
 {
    struct glx_context *gc = __glXGetCurrentContext();
 
-   if (gc == NULL)
+   if (gc == &dummyContext)
       return False;
 
    return __glXQueryRendererInteger(gc->psc, attribute, value);
@@ -166,7 +166,7 @@ glXQueryCurrentRendererStringMESA(int attribute)
 {
    struct glx_context *gc = __glXGetCurrentContext();
 
-   if (gc == NULL)
+   if (gc == &dummyContext)
       return False;
 
    return __glXQueryRendererString(gc->psc, attribute);
diff --git a/src/glx/tests/fake_glx_screen.cpp b/src/glx/tests/fake_glx_screen.cpp
index db20749..801f54a 100644
--- a/src/glx/tests/fake_glx_screen.cpp
+++ b/src/glx/tests/fake_glx_screen.cpp
@@ -75,7 +75,20 @@ indirect_create_context_attribs(struct glx_screen *base,
    return indirect_create_context(base, config_base, shareList, 0);
 }
 
-__thread void *__glX_tls_Context = NULL;
+/* This is necessary so that we don't have to link with glxcurrent.c
+ * which would require us to link with X libraries and what not.
+ */
+GLubyte dummyBuffer[__GLX_BUFFER_LIMIT_SIZE];
+struct glx_context_vtable dummyVtable;
+struct glx_context dummyContext = {
+   &dummyBuffer[0],
+   &dummyBuffer[0],
+   &dummyBuffer[0],
+   &dummyBuffer[__GLX_BUFFER_LIMIT_SIZE],
+   sizeof(dummyBuffer),
+   &dummyVtable
+};
+__thread void *__glX_tls_Context = &dummyContext;
 
 #if !defined(GLX_USE_TLS)
 extern "C" struct glx_context *
diff --git a/src/intel/genxml/Makefile.am b/src/intel/genxml/Makefile.am
index d6c1c5b..e8bd84c 100644
--- a/src/intel/genxml/Makefile.am
+++ b/src/intel/genxml/Makefile.am
@@ -35,6 +35,7 @@ $(BUILT_SOURCES): gen_pack_header.py
 CLEANFILES = $(BUILT_SOURCES)
 
 EXTRA_DIST = \
+	$(GENXML_GENERATED_FILES) \
 	gen6.xml \
 	gen7.xml \
 	gen75.xml \
diff --git a/src/intel/genxml/gen6.xml b/src/intel/genxml/gen6.xml
index 44e2804..8bc28a9 100644
--- a/src/intel/genxml/gen6.xml
+++ b/src/intel/genxml/gen6.xml
@@ -79,7 +79,7 @@
     </group>
   </struct>
 
-  <struct name="BLEND_STATE" length="2">
+  <struct name="BLEND_STATE_ENTRY" length="2">
     <field name="Color Buffer Blend Enable" start="31" end="31" type="bool"/>
     <field name="Independent Alpha Blend Enable" start="30" end="30" type="bool"/>
     <field name="Alpha Blend Function" start="26" end="28" type="uint">
@@ -169,6 +169,12 @@
     <field name="Post-Blend Color Clamp Enable" start="32" end="32" type="bool"/>
   </struct>
 
+  <struct name="BLEND_STATE" length="16">
+    <group count="8" start="0" size="64">
+      <field name="Entry" start="0" end="63" type="BLEND_STATE_ENTRY"/>
+    </group>
+  </struct>
+
   <struct name="CC_VIEWPORT" length="2">
     <field name="Minimum Depth" start="0" end="31" type="float"/>
     <field name="Maximum Depth" start="32" end="63" type="float"/>
@@ -781,6 +787,7 @@
     <field name="CLIP Enable" start="95" end="95" type="bool"/>
     <field name="API Mode" start="94" end="94" type="uint">
       <value name="APIMODE_OGL" value="0"/>
+      <value name="APIMODE_D3D" value="1"/>
     </field>
     <field name="Viewport XY ClipTest Enable" start="92" end="92" type="bool"/>
     <field name="Viewport Z ClipTest Enable" start="91" end="91" type="bool"/>
diff --git a/src/intel/genxml/gen7.xml b/src/intel/genxml/gen7.xml
index 2bbfcb7..cbeb2e1 100644
--- a/src/intel/genxml/gen7.xml
+++ b/src/intel/genxml/gen7.xml
@@ -102,7 +102,7 @@
     </group>
   </struct>
 
-  <struct name="BLEND_STATE" length="2">
+  <struct name="BLEND_STATE_ENTRY" length="2">
     <field name="Color Buffer Blend Enable" start="31" end="31" type="bool"/>
     <field name="Independent Alpha Blend Enable" start="30" end="30" type="bool"/>
     <field name="Alpha Blend Function" start="26" end="28" type="uint">
@@ -192,6 +192,12 @@
     <field name="Post-Blend Color Clamp Enable" start="32" end="32" type="bool"/>
   </struct>
 
+  <struct name="BLEND_STATE" length="16">
+    <group count="8" start="0" size="64">
+      <field name="Entry" start="0" end="63" type="BLEND_STATE_ENTRY"/>
+    </group>
+  </struct>
+
   <struct name="CC_VIEWPORT" length="2">
     <field name="Minimum Depth" start="0" end="31" type="float"/>
     <field name="Maximum Depth" start="32" end="63" type="float"/>
@@ -953,6 +959,7 @@
     <field name="Clip Enable" start="95" end="95" type="bool"/>
     <field name="API Mode" start="94" end="94" type="uint">
       <value name="APIMODE_OGL" value="0"/>
+      <value name="APIMODE_D3D" value="1"/>
     </field>
     <field name="Viewport XY ClipTest Enable" start="92" end="92" type="bool"/>
     <field name="Viewport Z ClipTest Enable" start="91" end="91" type="bool"/>
diff --git a/src/intel/genxml/gen75.xml b/src/intel/genxml/gen75.xml
index 9ab432c..40f3c31 100644
--- a/src/intel/genxml/gen75.xml
+++ b/src/intel/genxml/gen75.xml
@@ -112,7 +112,7 @@
     </group>
   </struct>
 
-  <struct name="BLEND_STATE" length="2">
+  <struct name="BLEND_STATE_ENTRY" length="2">
     <field name="Color Buffer Blend Enable" start="31" end="31" type="bool"/>
     <field name="Independent Alpha Blend Enable" start="30" end="30" type="bool"/>
     <field name="Alpha Blend Function" start="26" end="28" type="uint">
@@ -202,6 +202,12 @@
     <field name="Post-Blend Color Clamp Enable" start="32" end="32" type="bool"/>
   </struct>
 
+  <struct name="BLEND_STATE" length="16">
+    <group count="8" start="0" size="64">
+      <field name="Entry" start="0" end="63" type="BLEND_STATE_ENTRY"/>
+    </group>
+  </struct>
+
   <struct name="CC_VIEWPORT" length="2">
     <field name="Minimum Depth" start="0" end="31" type="float"/>
     <field name="Maximum Depth" start="32" end="63" type="float"/>
@@ -1062,6 +1068,7 @@
     <field name="Clip Enable" start="95" end="95" type="bool"/>
     <field name="API Mode" start="94" end="94" type="uint">
       <value name="APIMODE_OGL" value="0"/>
+      <value name="APIMODE_D3D" value="1"/>
     </field>
     <field name="Viewport XY ClipTest Enable" start="92" end="92" type="bool"/>
     <field name="Viewport Z ClipTest Enable" start="91" end="91" type="bool"/>
diff --git a/src/intel/genxml/gen8.xml b/src/intel/genxml/gen8.xml
index 80d40fb..e3b0cdb 100644
--- a/src/intel/genxml/gen8.xml
+++ b/src/intel/genxml/gen8.xml
@@ -1115,15 +1115,16 @@
     <field name="User Clip Distance Cull Test Enable Bitmask" start="32" end="39" type="uint"/>
     <field name="Clip Enable" start="95" end="95" type="bool"/>
     <field name="API Mode" start="94" end="94" type="uint">
-      <value name="OGL" value="0"/>
+      <value name="APIMODE_OGL" value="0"/>
+      <value name="APIMODE_D3D" value="1"/>
     </field>
     <field name="Viewport XY Clip Test Enable" start="92" end="92" type="bool"/>
     <field name="Guardband Clip Test Enable" start="90" end="90" type="bool"/>
     <field name="User Clip Distance Clip Test Enable Bitmask" start="80" end="87" type="uint"/>
     <field name="Clip Mode" start="77" end="79" type="uint">
-      <value name="NORMAL" value="0"/>
-      <value name="REJECT_ALL" value="3"/>
-      <value name="ACCEPT_ALL" value="4"/>
+      <value name="CLIPMODE_NORMAL" value="0"/>
+      <value name="CLIPMODE_REJECT_ALL" value="3"/>
+      <value name="CLIPMODE_ACCEPT_ALL" value="4"/>
     </field>
     <field name="Perspective Divide Disable" start="73" end="73" type="bool"/>
     <field name="Non-Perspective Barycentric Enable" start="72" end="72" type="bool"/>
@@ -2035,7 +2036,7 @@
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="30"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
     <field name="SO Function Enable" start="63" end="63" type="uint"/>
-    <field name="API Rendering Disable" start="62" end="62" type="uint"/>
+    <field name="Rendering Disable" start="62" end="62" type="uint"/>
     <field name="Render Stream Select" start="59" end="60" type="uint"/>
     <field name="Reorder Mode" start="58" end="58" type="uint">
       <value name="LEADING" value="0"/>
diff --git a/src/intel/genxml/gen9.xml b/src/intel/genxml/gen9.xml
index 94b7d28..4333b89 100644
--- a/src/intel/genxml/gen9.xml
+++ b/src/intel/genxml/gen9.xml
@@ -1167,15 +1167,16 @@
     <field name="User Clip Distance Cull Test Enable Bitmask" start="32" end="39" type="uint"/>
     <field name="Clip Enable" start="95" end="95" type="bool"/>
     <field name="API Mode" start="94" end="94" type="uint">
-      <value name="OGL" value="0"/>
+      <value name="APIMODE_OGL" value="0"/>
+      <value name="APIMODE_D3D" value="1"/>
     </field>
     <field name="Viewport XY Clip Test Enable" start="92" end="92" type="bool"/>
     <field name="Guardband Clip Test Enable" start="90" end="90" type="bool"/>
     <field name="User Clip Distance Clip Test Enable Bitmask" start="80" end="87" type="uint"/>
     <field name="Clip Mode" start="77" end="79" type="uint">
-      <value name="NORMAL" value="0"/>
-      <value name="REJECT_ALL" value="3"/>
-      <value name="ACCEPT_ALL" value="4"/>
+      <value name="CLIPMODE_NORMAL" value="0"/>
+      <value name="CLIPMODE_REJECT_ALL" value="3"/>
+      <value name="CLIPMODE_ACCEPT_ALL" value="4"/>
     </field>
     <field name="Perspective Divide Disable" start="73" end="73" type="bool"/>
     <field name="Non-Perspective Barycentric Enable" start="72" end="72" type="bool"/>
@@ -2238,7 +2239,7 @@
     <field name="3D Command Sub Opcode" start="16" end="23" type="uint" default="30"/>
     <field name="DWord Length" start="0" end="7" type="uint" default="3"/>
     <field name="SO Function Enable" start="63" end="63" type="uint"/>
-    <field name="API Rendering Disable" start="62" end="62" type="uint"/>
+    <field name="Rendering Disable" start="62" end="62" type="uint"/>
     <field name="Render Stream Select" start="59" end="60" type="uint"/>
     <field name="Reorder Mode" start="58" end="58" type="uint">
       <value name="LEADING" value="0"/>
diff --git a/src/intel/isl/Makefile.am b/src/intel/isl/Makefile.am
index 1fd6683..8e03ee6 100644
--- a/src/intel/isl/Makefile.am
+++ b/src/intel/isl/Makefile.am
@@ -46,23 +46,25 @@ AM_CPPFLAGS = \
 	-I$(top_srcdir)/src/gallium/auxiliary \
 	-I$(top_srcdir)/src/gallium/include
 
-libisl_la_CFLAGS = $(CFLAGS) -Wno-override-init
+AM_CFLAGS = \
+	$(VISIBILITY_CFLAGS) \
+	-Wno-override-init
 
 libisl_la_LIBADD = $(ISL_GEN_LIBS)
 
 libisl_la_SOURCES = $(ISL_FILES) $(ISL_GENERATED_FILES)
 
 libisl_gen7_la_SOURCES = $(ISL_GEN7_FILES)
-libisl_gen7_la_CFLAGS = $(libisl_la_CFLAGS) -DGEN_VERSIONx10=70
+libisl_gen7_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=70
 
 libisl_gen75_la_SOURCES = $(ISL_GEN75_FILES)
-libisl_gen75_la_CFLAGS = $(libisl_la_CFLAGS) -DGEN_VERSIONx10=75
+libisl_gen75_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=75
 
 libisl_gen8_la_SOURCES = $(ISL_GEN8_FILES)
-libisl_gen8_la_CFLAGS = $(libisl_la_CFLAGS) -DGEN_VERSIONx10=80
+libisl_gen8_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=80
 
 libisl_gen9_la_SOURCES = $(ISL_GEN9_FILES)
-libisl_gen9_la_CFLAGS = $(libisl_la_CFLAGS) -DGEN_VERSIONx10=90
+libisl_gen9_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=90
 
 BUILT_SOURCES = $(ISL_GENERATED_FILES)
 
diff --git a/src/intel/isl/isl.c b/src/intel/isl/isl.c
index 77b570d..0bdfa9d 100644
--- a/src/intel/isl/isl.c
+++ b/src/intel/isl/isl.c
@@ -490,27 +490,27 @@ isl_calc_phys_level0_extent_sa(const struct isl_device *dev,
 
       case ISL_MSAA_LAYOUT_ARRAY:
          assert(info->depth == 1);
-         assert(info->array_len == 1);
+         assert(info->levels == 1);
          assert(!isl_format_is_compressed(info->format));
 
          *phys_level0_sa = (struct isl_extent4d) {
             .w = info->width,
             .h = info->height,
             .d = 1,
-            .a = info->samples,
+            .a = info->array_len * info->samples,
          };
          break;
 
       case ISL_MSAA_LAYOUT_INTERLEAVED:
          assert(info->depth == 1);
-         assert(info->array_len == 1);
+         assert(info->levels == 1);
          assert(!isl_format_is_compressed(info->format));
 
          *phys_level0_sa = (struct isl_extent4d) {
             .w = info->width,
             .h = info->height,
             .d = 1,
-            .a = 1,
+            .a = info->array_len,
          };
 
          isl_msaa_interleaved_scale_px_to_sa(info->samples,
diff --git a/src/intel/isl/isl.h b/src/intel/isl/isl.h
index ef86228..64aced8 100644
--- a/src/intel/isl/isl.h
+++ b/src/intel/isl/isl.h
@@ -989,7 +989,7 @@ isl_has_matching_typed_storage_image_format(const struct brw_device_info *devinf
 static inline bool
 isl_tiling_is_any_y(enum isl_tiling tiling)
 {
-   return (1u << tiling) & ISL_TILING_ANY_MASK;
+   return (1u << tiling) & ISL_TILING_ANY_Y_MASK;
 }
 
 static inline bool
diff --git a/src/intel/isl/isl_gen6.c b/src/intel/isl/isl_gen6.c
index 24c3939..cc246f5 100644
--- a/src/intel/isl/isl_gen6.c
+++ b/src/intel/isl/isl_gen6.c
@@ -37,7 +37,7 @@ gen6_choose_msaa_layout(const struct isl_device *dev,
 
    if (info->samples == 1) {
       *msaa_layout = ISL_MSAA_LAYOUT_NONE;
-      return false;
+      return true;
    }
 
    /* From the Sandybridge PRM, Volume 4 Part 1 p72, SURFACE_STATE, Surface
diff --git a/src/intel/isl/isl_surface_state.c b/src/intel/isl/isl_surface_state.c
index b2317d8..1c985b6 100644
--- a/src/intel/isl/isl_surface_state.c
+++ b/src/intel/isl/isl_surface_state.c
@@ -430,8 +430,15 @@ isl_genX(buffer_fill_state_s)(void *state,
    uint32_t num_elements = info->size / info->stride;
 
    if (GEN_GEN >= 7) {
+      /* From the IVB PRM, SURFACE_STATE::Height,
+       *
+       *    For typed buffer and structured buffer surfaces, the number
+       *    of entries in the buffer ranges from 1 to 2^27. For raw buffer
+       *    surfaces, the number of entries in the buffer is the number of bytes
+       *    which can range from 1 to 2^30.
+       */
       if (info->format == ISL_FORMAT_RAW) {
-         assert(num_elements <= (1ull << 31));
+         assert(num_elements <= (1ull << 30));
          assert((num_elements & 3) == 0);
       } else {
          assert(num_elements <= (1ull << 27));
diff --git a/src/intel/vulkan/Makefile.am b/src/intel/vulkan/Makefile.am
index 0e521cf..6b1015a 100644
--- a/src/intel/vulkan/Makefile.am
+++ b/src/intel/vulkan/Makefile.am
@@ -61,7 +61,9 @@ AM_CPPFLAGS = \
 	-I$(top_builddir)/src/intel \
 	-I$(top_srcdir)/src/intel
 
-AM_CFLAGS = -Wno-override-init -msse2
+AM_CFLAGS = \
+	$(VISIBILITY_CFLAGS) \
+	-Wno-override-init -msse2
 
 libanv_gen7_la_CFLAGS = $(AM_CFLAGS) -DGEN_VERSIONx10=70
 libanv_gen7_la_SOURCES = $(GEN7_FILES)
@@ -159,6 +161,7 @@ libvulkan_intel_la_LDFLAGS = \
 	-module \
 	-no-undefined \
 	-avoid-version \
+	$(BSYMBOLIC) \
 	$(GC_SECTIONS) \
 	$(LD_NO_UNDEFINED)
 
diff --git a/src/intel/vulkan/anv_cmd_buffer.c b/src/intel/vulkan/anv_cmd_buffer.c
index 20d3af1..ca78c09 100644
--- a/src/intel/vulkan/anv_cmd_buffer.c
+++ b/src/intel/vulkan/anv_cmd_buffer.c
@@ -359,7 +359,7 @@ anv_cmd_buffer_emit_state_base_address(struct anv_cmd_buffer *cmd_buffer)
    switch (cmd_buffer->device->info.gen) {
    case 7:
       if (cmd_buffer->device->info.is_haswell)
-         return gen7_cmd_buffer_emit_state_base_address(cmd_buffer);
+         return gen75_cmd_buffer_emit_state_base_address(cmd_buffer);
       else
          return gen7_cmd_buffer_emit_state_base_address(cmd_buffer);
    case 8:
@@ -741,20 +741,26 @@ anv_cmd_buffer_emit_binding_table(struct anv_cmd_buffer *cmd_buffer,
 {
    struct anv_framebuffer *fb = cmd_buffer->state.framebuffer;
    struct anv_subpass *subpass = cmd_buffer->state.subpass;
-   struct anv_pipeline_bind_map *map;
+   struct anv_pipeline *pipeline;
    uint32_t bias, state_offset;
 
    switch (stage) {
    case  MESA_SHADER_COMPUTE:
-      map = &cmd_buffer->state.compute_pipeline->bindings[stage];
+      pipeline = cmd_buffer->state.compute_pipeline;
       bias = 1;
       break;
    default:
-      map = &cmd_buffer->state.pipeline->bindings[stage];
+      pipeline = cmd_buffer->state.pipeline;
       bias = 0;
       break;
    }
 
+   if (!anv_pipeline_has_stage(pipeline, stage)) {
+      *bt_state = (struct anv_state) { 0, };
+      return VK_SUCCESS;
+   }
+
+   struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map;
    if (bias + map->surface_count == 0) {
       *bt_state = (struct anv_state) { 0, };
       return VK_SUCCESS;
@@ -907,13 +913,19 @@ VkResult
 anv_cmd_buffer_emit_samplers(struct anv_cmd_buffer *cmd_buffer,
                              gl_shader_stage stage, struct anv_state *state)
 {
-   struct anv_pipeline_bind_map *map;
+   struct anv_pipeline *pipeline;
 
    if (stage == MESA_SHADER_COMPUTE)
-      map = &cmd_buffer->state.compute_pipeline->bindings[stage];
+      pipeline = cmd_buffer->state.compute_pipeline;
    else
-      map = &cmd_buffer->state.pipeline->bindings[stage];
+      pipeline = cmd_buffer->state.pipeline;
 
+   if (!anv_pipeline_has_stage(pipeline, stage)) {
+      *state = (struct anv_state) { 0, };
+      return VK_SUCCESS;
+   }
+
+   struct anv_pipeline_bind_map *map = &pipeline->shaders[stage]->bind_map;
    if (map->sampler_count == 0) {
       *state = (struct anv_state) { 0, };
       return VK_SUCCESS;
@@ -1080,10 +1092,14 @@ struct anv_state
 anv_cmd_buffer_push_constants(struct anv_cmd_buffer *cmd_buffer,
                               gl_shader_stage stage)
 {
+   /* If we don't have this stage, bail. */
+   if (!anv_pipeline_has_stage(cmd_buffer->state.pipeline, stage))
+      return (struct anv_state) { .offset = 0 };
+
    struct anv_push_constants *data =
       cmd_buffer->state.push_constants[stage];
    const struct brw_stage_prog_data *prog_data =
-      cmd_buffer->state.pipeline->prog_data[stage];
+      anv_shader_bin_get_prog_data(cmd_buffer->state.pipeline->shaders[stage]);
 
    /* If we don't actually have any push constants, bail. */
    if (data == NULL || prog_data == NULL || prog_data->nr_params == 0)
diff --git a/src/intel/vulkan/anv_descriptor_set.c b/src/intel/vulkan/anv_descriptor_set.c
index 448ae0e..4ab1802 100644
--- a/src/intel/vulkan/anv_descriptor_set.c
+++ b/src/intel/vulkan/anv_descriptor_set.c
@@ -27,6 +27,8 @@
 #include <unistd.h>
 #include <fcntl.h>
 
+#include "util/mesa-sha1.h"
+
 #include "anv_private.h"
 
 /*
@@ -65,9 +67,8 @@ VkResult anv_CreateDescriptorSetLayout(
    struct anv_sampler **samplers =
       (struct anv_sampler **)&set_layout->binding[max_binding + 1];
 
+   memset(set_layout, 0, sizeof(*set_layout));
    set_layout->binding_count = max_binding + 1;
-   set_layout->shader_stages = 0;
-   set_layout->size = 0;
 
    for (uint32_t b = 0; b <= max_binding; b++) {
       /* Initialize all binding_layout entries to -1 */
@@ -202,6 +203,15 @@ void anv_DestroyDescriptorSetLayout(
    anv_free2(&device->alloc, pAllocator, set_layout);
 }
 
+static void
+sha1_update_descriptor_set_layout(struct mesa_sha1 *ctx,
+                                  const struct anv_descriptor_set_layout *layout)
+{
+   size_t size = sizeof(*layout) +
+                 sizeof(layout->binding[0]) * layout->binding_count;
+   _mesa_sha1_update(ctx, layout, size);
+}
+
 /*
  * Pipeline layouts.  These have nothing to do with the pipeline.  They are
  * just muttiple descriptor set layouts pasted together
@@ -246,6 +256,19 @@ VkResult anv_CreatePipelineLayout(
       }
    }
 
+   struct mesa_sha1 *ctx = _mesa_sha1_init();
+   for (unsigned s = 0; s < layout->num_sets; s++) {
+      sha1_update_descriptor_set_layout(ctx, layout->set[s].layout);
+      _mesa_sha1_update(ctx, &layout->set[s].dynamic_offset_start,
+                        sizeof(layout->set[s].dynamic_offset_start));
+   }
+   _mesa_sha1_update(ctx, &layout->num_sets, sizeof(layout->num_sets));
+   for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) {
+      _mesa_sha1_update(ctx, &layout->stage[s].has_dynamic_offsets,
+                        sizeof(layout->stage[s].has_dynamic_offsets));
+   }
+   _mesa_sha1_final(ctx, layout->sha1);
+
    *pPipelineLayout = anv_pipeline_layout_to_handle(layout);
 
    return VK_SUCCESS;
@@ -409,6 +432,11 @@ anv_descriptor_set_create(struct anv_device *device,
       (struct anv_buffer_view *) &set->descriptors[layout->size];
    set->buffer_count = layout->buffer_count;
 
+   /* By defining the descriptors to be zero now, we can later verify that
+    * a descriptor has not been populated with user data.
+    */
+   memset(set->descriptors, 0, sizeof(struct anv_descriptor) * layout->size);
+
    /* Go through and fill out immutable samplers if we have any */
    struct anv_descriptor *desc = set->descriptors;
    for (uint32_t b = 0; b < layout->binding_count; b++) {
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index dd941b6..cd8fb3a 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -372,7 +372,7 @@ void anv_GetPhysicalDeviceFeatures(
       .robustBufferAccess                       = true,
       .fullDrawIndexUint32                      = true,
       .imageCubeArray                           = false,
-      .independentBlend                         = pdevice->info->gen >= 8,
+      .independentBlend                         = true,
       .geometryShader                           = true,
       .tessellationShader                       = false,
       .sampleRateShading                        = false,
@@ -438,6 +438,10 @@ void anv_GetPhysicalDeviceProperties(
 
    const float time_stamp_base = devinfo->gen >= 9 ? 83.333 : 80.0;
 
+   /* See assertions made when programming the buffer surface state. */
+   const uint32_t max_raw_buffer_sz = devinfo->gen >= 7 ?
+                                      (1ul << 30) : (1ul << 27);
+
    VkSampleCountFlags sample_counts =
       isl_device_get_sample_counts(&pdevice->isl_dev);
 
@@ -448,8 +452,8 @@ void anv_GetPhysicalDeviceProperties(
       .maxImageDimensionCube                    = (1 << 14),
       .maxImageArrayLayers                      = (1 << 11),
       .maxTexelBufferElements                   = 128 * 1024 * 1024,
-      .maxUniformBufferRange                    = UINT32_MAX,
-      .maxStorageBufferRange                    = UINT32_MAX,
+      .maxUniformBufferRange                    = (1ul << 27),
+      .maxStorageBufferRange                    = max_raw_buffer_sz,
       .maxPushConstantsSize                     = MAX_PUSH_CONSTANTS_SIZE,
       .maxMemoryAllocationCount                 = UINT32_MAX,
       .maxSamplerAllocationCount                = 64 * 1024,
@@ -649,13 +653,15 @@ PFN_vkVoidFunction anv_GetInstanceProcAddr(
    return anv_lookup_entrypoint(pName);
 }
 
-/* The loader wants us to expose a second GetInstanceProcAddr function
- * to work around certain LD_PRELOAD issues seen in apps.
+/* With version 1+ of the loader interface the ICD should expose
+ * vk_icdGetInstanceProcAddr to work around certain LD_PRELOAD issues seen in apps.
  */
+PUBLIC
 VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr(
     VkInstance                                  instance,
     const char*                                 pName);
 
+PUBLIC
 VKAPI_ATTR PFN_vkVoidFunction VKAPI_CALL vk_icdGetInstanceProcAddr(
     VkInstance                                  instance,
     const char*                                 pName)
@@ -869,7 +875,8 @@ VkResult anv_CreateDevice(
                        &device->dynamic_state_block_pool);
 
    anv_block_pool_init(&device->instruction_block_pool, device, 128 * 1024);
-   anv_pipeline_cache_init(&device->default_pipeline_cache, device);
+   anv_state_pool_init(&device->instruction_state_pool,
+                       &device->instruction_block_pool);
 
    anv_block_pool_init(&device->surface_state_block_pool, device, 4096);
 
@@ -944,6 +951,7 @@ void anv_DestroyDevice(
    anv_bo_pool_finish(&device->batch_bo_pool);
    anv_state_pool_finish(&device->dynamic_state_pool);
    anv_block_pool_finish(&device->dynamic_state_block_pool);
+   anv_state_pool_finish(&device->instruction_state_pool);
    anv_block_pool_finish(&device->instruction_block_pool);
    anv_state_pool_finish(&device->surface_state_pool);
    anv_block_pool_finish(&device->surface_state_block_pool);
@@ -1786,23 +1794,3 @@ void anv_DestroyFramebuffer(
 
    anv_free2(&device->alloc, pAllocator, fb);
 }
-
-void vkCmdDbgMarkerBegin(
-    VkCommandBuffer                              commandBuffer,
-    const char*                                 pMarker)
-   __attribute__ ((visibility ("default")));
-
-void vkCmdDbgMarkerEnd(
-   VkCommandBuffer                              commandBuffer)
-   __attribute__ ((visibility ("default")));
-
-void vkCmdDbgMarkerBegin(
-    VkCommandBuffer                              commandBuffer,
-    const char*                                 pMarker)
-{
-}
-
-void vkCmdDbgMarkerEnd(
-    VkCommandBuffer                              commandBuffer)
-{
-}
diff --git a/src/intel/vulkan/anv_entrypoints_gen.py b/src/intel/vulkan/anv_entrypoints_gen.py
index 2896174..dcf25ee 100644
--- a/src/intel/vulkan/anv_entrypoints_gen.py
+++ b/src/intel/vulkan/anv_entrypoints_gen.py
@@ -134,7 +134,6 @@ if opt_header:
         print "%s gen75_%s%s;" % (type, name, args)
         print "%s gen8_%s%s;" % (type, name, args)
         print "%s gen9_%s%s;" % (type, name, args)
-        print "%s anv_validate_%s%s;" % (type, name, args)
         print_guard_end(name)
     exit()
 
@@ -185,23 +184,24 @@ for type, name, args, num, h in entrypoints:
     print "   \"vk%s\\0\"" % name
     offsets.append(i)
     i += 2 + len(name) + 1
-print """   ;
+print "   ;"
 
-/* Weak aliases for all potential validate functions. These will resolve to
- * NULL if they're not defined, which lets the resolve_entrypoint() function
- * either pick a validate wrapper if available or just plug in the actual
- * entry point.
- */
-"""
-
-# Now generate the table of all entry points and their validation functions
+# Now generate the table of all entry points
 
 print "\nstatic const struct anv_entrypoint entrypoints[] = {"
 for type, name, args, num, h in entrypoints:
     print "   { %5d, 0x%08x }," % (offsets[num], h)
 print "};\n"
 
-for layer in [ "anv", "validate", "gen7", "gen75", "gen8", "gen9" ]:
+print """
+
+/* Weak aliases for all potential implementations. These will resolve to
+ * NULL if they're not defined, which lets the resolve_entrypoint() function
+ * either pick the correct entry point.
+ */
+"""
+
+for layer in [ "anv", "gen7", "gen75", "gen8", "gen9" ]:
     for type, name, args, num, h in entrypoints:
         print_guard_start(name)
         print "%s %s_%s%s __attribute__ ((weak));" % (type, layer, name, args)
@@ -214,27 +214,6 @@ for layer in [ "anv", "validate", "gen7", "gen75", "gen8", "gen9" ]:
     print "};\n"
 
 print """
-#ifdef DEBUG
-static bool enable_validate = true;
-#else
-static bool enable_validate = false;
-#endif
-
-/* We can't use symbols that need resolving (like, oh, getenv) in the resolve
- * function. This means that we have to determine whether or not to use the
- * validation layer sometime before that. The constructor function attribute asks
- * the dynamic linker to invoke determine_validate() at dlopen() time which
- * works.
- */
-static void __attribute__ ((constructor))
-determine_validate(void)
-{
-   const char *s = getenv("ANV_VALIDATE");
-
-   if (s)
-      enable_validate = atoi(s);
-}
-
 static const struct brw_device_info *dispatch_devinfo;
 
 void
@@ -246,9 +225,6 @@ anv_set_dispatch_devinfo(const struct brw_device_info *devinfo)
 void * __attribute__ ((noinline))
 anv_resolve_entrypoint(uint32_t index)
 {
-   if (enable_validate && validate_layer.entrypoints[index])
-      return validate_layer.entrypoints[index];
-
    if (dispatch_devinfo == NULL) {
       return anv_layer.entrypoints[index];
    }
@@ -277,17 +253,6 @@ anv_resolve_entrypoint(uint32_t index)
 }
 """
 
-# Now output ifuncs and their resolve helpers for all entry points. The
-# resolve helper calls resolve_entrypoint() with the entry point index, which
-# lets the resolver look it up in the table.
-
-for type, name, args, num, h in entrypoints:
-    print_guard_start(name)
-    print "static void *resolve_%s(void) { return anv_resolve_entrypoint(%d); }" % (name, num)
-    print "%s vk%s%s\n   __attribute__ ((ifunc (\"resolve_%s\"), visibility (\"default\")));\n" % (type, name, args, name)
-    print_guard_end(name)
-
-
 # Now generate the hash table used for entry point look up.  This is a
 # uint16_t table of entry point indices. We use 0xffff to indicate an entry
 # in the hash table is empty.
diff --git a/src/intel/vulkan/anv_image.c b/src/intel/vulkan/anv_image.c
index 77d9931..caf4a3e 100644
--- a/src/intel/vulkan/anv_image.c
+++ b/src/intel/vulkan/anv_image.c
@@ -322,81 +322,6 @@ void anv_GetImageSubresourceLayout(
    }
 }
 
-VkResult
-anv_validate_CreateImageView(VkDevice _device,
-                             const VkImageViewCreateInfo *pCreateInfo,
-                             const VkAllocationCallbacks *pAllocator,
-                             VkImageView *pView)
-{
-   ANV_FROM_HANDLE(anv_image, image, pCreateInfo->image);
-   const VkImageSubresourceRange *subresource;
-
-   /* Validate structure type before dereferencing it. */
-   assert(pCreateInfo);
-   assert(pCreateInfo->sType == VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO);
-   subresource = &pCreateInfo->subresourceRange;
-
-   /* Validate viewType is in range before using it. */
-   assert(pCreateInfo->viewType >= VK_IMAGE_VIEW_TYPE_BEGIN_RANGE);
-   assert(pCreateInfo->viewType <= VK_IMAGE_VIEW_TYPE_END_RANGE);
-
-   /* Validate format is in range before using it. */
-   assert(pCreateInfo->format >= VK_FORMAT_BEGIN_RANGE);
-   assert(pCreateInfo->format <= VK_FORMAT_END_RANGE);
-
-   /* Validate channel swizzles. */
-   assert(pCreateInfo->components.r >= VK_COMPONENT_SWIZZLE_BEGIN_RANGE);
-   assert(pCreateInfo->components.r <= VK_COMPONENT_SWIZZLE_END_RANGE);
-   assert(pCreateInfo->components.g >= VK_COMPONENT_SWIZZLE_BEGIN_RANGE);
-   assert(pCreateInfo->components.g <= VK_COMPONENT_SWIZZLE_END_RANGE);
-   assert(pCreateInfo->components.b >= VK_COMPONENT_SWIZZLE_BEGIN_RANGE);
-   assert(pCreateInfo->components.b <= VK_COMPONENT_SWIZZLE_END_RANGE);
-   assert(pCreateInfo->components.a >= VK_COMPONENT_SWIZZLE_BEGIN_RANGE);
-   assert(pCreateInfo->components.a <= VK_COMPONENT_SWIZZLE_END_RANGE);
-
-   /* Validate subresource. */
-   assert(subresource->aspectMask != 0);
-   assert(subresource->levelCount > 0);
-   assert(subresource->layerCount > 0);
-   assert(subresource->baseMipLevel < image->levels);
-   assert(subresource->baseMipLevel + anv_get_levelCount(image, subresource) <= image->levels);
-   assert(subresource->baseArrayLayer < image->array_size);
-   assert(subresource->baseArrayLayer + anv_get_layerCount(image, subresource) <= image->array_size);
-   assert(pView);
-
-   MAYBE_UNUSED const VkImageAspectFlags view_format_aspects =
-      vk_format_aspects(pCreateInfo->format);
-
-   const VkImageAspectFlags ds_flags = VK_IMAGE_ASPECT_DEPTH_BIT
-                                     | VK_IMAGE_ASPECT_STENCIL_BIT;
-
-   /* Validate format. */
-   if (subresource->aspectMask & VK_IMAGE_ASPECT_COLOR_BIT) {
-      assert(subresource->aspectMask == VK_IMAGE_ASPECT_COLOR_BIT);
-      assert(image->aspects == VK_IMAGE_ASPECT_COLOR_BIT);
-      assert(view_format_aspects == VK_IMAGE_ASPECT_COLOR_BIT);
-   } else if (subresource->aspectMask & ds_flags) {
-      assert((subresource->aspectMask & ~ds_flags) == 0);
-
-      assert(pCreateInfo->format == image->vk_format);
-
-      if (subresource->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
-         assert(image->aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
-         assert(view_format_aspects & VK_IMAGE_ASPECT_DEPTH_BIT);
-      }
-
-      if (subresource->aspectMask & VK_IMAGE_ASPECT_STENCIL_BIT) {
-         /* FINISHME: Is it legal to have an R8 view of S8? */
-         assert(image->aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
-         assert(view_format_aspects & VK_IMAGE_ASPECT_STENCIL_BIT);
-      }
-   } else {
-      assert(!"bad VkImageSubresourceRange::aspectFlags");
-   }
-
-   return anv_CreateImageView(_device, pCreateInfo, pAllocator, pView);
-}
-
 static struct anv_state
 alloc_surface_state(struct anv_device *device,
                     struct anv_cmd_buffer *cmd_buffer)
@@ -628,18 +553,19 @@ void anv_buffer_view_init(struct anv_buffer_view *view,
    view->format = anv_get_isl_format(&device->info, pCreateInfo->format,
                                      VK_IMAGE_ASPECT_COLOR_BIT,
                                      VK_IMAGE_TILING_LINEAR);
+   const uint32_t format_bs = isl_format_get_layout(view->format)->bs;
    view->bo = buffer->bo;
    view->offset = buffer->offset + pCreateInfo->offset;
    view->range = pCreateInfo->range == VK_WHOLE_SIZE ?
-                 buffer->size - view->offset : pCreateInfo->range;
+                 buffer->size - pCreateInfo->offset : pCreateInfo->range;
+   view->range = align_down_npot_u32(view->range, format_bs);
 
    if (buffer->usage & VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT) {
       view->surface_state = alloc_surface_state(device, cmd_buffer);
 
       anv_fill_buffer_surface_state(device, view->surface_state,
                                     view->format,
-                                    view->offset, view->range,
-                                    isl_format_get_layout(view->format)->bs);
+                                    view->offset, view->range, format_bs);
    } else {
       view->surface_state = (struct anv_state){ 0 };
    }
diff --git a/src/intel/vulkan/anv_meta_blit.c b/src/intel/vulkan/anv_meta_blit.c
index dc098ef..af15c2c 100644
--- a/src/intel/vulkan/anv_meta_blit.c
+++ b/src/intel/vulkan/anv_meta_blit.c
@@ -106,7 +106,7 @@ build_nir_copy_fragment_shader(enum glsl_sampler_dim tex_dim)
    nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out,
                                                  vec4, "f_color");
    color_out->data.location = FRAG_RESULT_DATA0;
-   nir_store_var(&b, color_out, &tex->dest.ssa, 4);
+   nir_store_var(&b, color_out, &tex->dest.ssa, 0xf);
 
    return b.shader;
 }
diff --git a/src/intel/vulkan/anv_meta_blit2d.c b/src/intel/vulkan/anv_meta_blit2d.c
index 06e1043..649c11f 100644
--- a/src/intel/vulkan/anv_meta_blit2d.c
+++ b/src/intel/vulkan/anv_meta_blit2d.c
@@ -92,6 +92,21 @@ vk_format_for_size(int bs)
    }
 }
 
+/* This function returns the format corresponding to a single component of the
+ * RGB format for the given size returned by vk_format_for_size().
+ */
+static VkFormat
+vk_single_component_format_for_rgb_size(int bs)
+{
+   switch (bs) {
+   case 3: return VK_FORMAT_R8_UNORM;
+   case 6: return VK_FORMAT_R16_UNORM;
+   case 12: return VK_FORMAT_R32_UINT;
+   default:
+      unreachable("Invalid format block size");
+   }
+}
+
 static void
 create_iview(struct anv_cmd_buffer *cmd_buffer,
              struct anv_meta_blit2d_surf *surf,
@@ -99,13 +114,14 @@ create_iview(struct anv_cmd_buffer *cmd_buffer,
              VkImageUsageFlags usage,
              uint32_t width,
              uint32_t height,
+             VkFormat format,
              VkImage *img,
              struct anv_image_view *iview)
 {
    const VkImageCreateInfo image_info = {
       .sType = VK_STRUCTURE_TYPE_IMAGE_CREATE_INFO,
       .imageType = VK_IMAGE_TYPE_2D,
-      .format = vk_format_for_size(surf->bs),
+      .format = format,
       .extent = {
          .width = width,
          .height = height,
@@ -179,6 +195,7 @@ blit2d_bind_src(struct anv_cmd_buffer *cmd_buffer,
 
       create_iview(cmd_buffer, src, offset, VK_IMAGE_USAGE_SAMPLED_BIT,
                    rect->src_x + rect->width, rect->src_y + rect->height,
+                   vk_format_for_size(src->bs),
                    &tmp->image, &tmp->iview);
 
       anv_CreateDescriptorPool(vk_device,
@@ -328,10 +345,11 @@ blit2d_bind_dst(struct anv_cmd_buffer *cmd_buffer,
                 uint64_t offset,
                 uint32_t width,
                 uint32_t height,
+                VkFormat format,
                 struct blit2d_dst_temps *tmp)
 {
    create_iview(cmd_buffer, dst, offset, VK_IMAGE_USAGE_COLOR_ATTACHMENT_BIT,
-                width, height, &tmp->image, &tmp->iview);
+                width, height, format, &tmp->image, &tmp->iview);
 
    anv_CreateFramebuffer(anv_device_to_handle(cmd_buffer->device),
       &(VkFramebufferCreateInfo) {
@@ -406,7 +424,8 @@ anv_meta_blit2d_normal_dst(struct anv_cmd_buffer *cmd_buffer,
 
       struct blit2d_dst_temps dst_temps;
       blit2d_bind_dst(cmd_buffer, dst, offset, rects[r].dst_x + rects[r].width,
-                      rects[r].dst_y + rects[r].height, &dst_temps);
+                      rects[r].dst_y + rects[r].height,
+                      vk_format_for_size(dst->bs), &dst_temps);
 
       struct blit_vb_data {
          float pos[2];
@@ -544,7 +563,8 @@ anv_meta_blit2d_w_tiled_dst(struct anv_cmd_buffer *cmd_buffer,
       };
 
       struct blit2d_dst_temps dst_temps;
-      blit2d_bind_dst(cmd_buffer, &dst_Y, offset, xmax_Y, ymax_Y, &dst_temps);
+      blit2d_bind_dst(cmd_buffer, &dst_Y, offset, xmax_Y, ymax_Y,
+                      VK_FORMAT_R8_UINT, &dst_temps);
 
       struct blit_vb_header {
          struct anv_vue_header vue;
@@ -647,6 +667,141 @@ anv_meta_blit2d_w_tiled_dst(struct anv_cmd_buffer *cmd_buffer,
    }
 }
 
+static void
+anv_meta_blit2d_rgb_dst(struct anv_cmd_buffer *cmd_buffer,
+                        struct anv_meta_blit2d_surf *src,
+                        enum blit2d_src_type src_type,
+                        struct anv_meta_blit2d_surf *dst,
+                        unsigned num_rects,
+                        struct anv_meta_blit2d_rect *rects)
+{
+   struct anv_device *device = cmd_buffer->device;
+
+   for (unsigned r = 0; r < num_rects; ++r) {
+      struct blit2d_src_temps src_temps;
+      blit2d_bind_src(cmd_buffer, src, src_type, &rects[r], &src_temps);
+
+      assert(dst->bs % 3 == 0);
+      assert(dst->tiling == ISL_TILING_LINEAR);
+
+      uint32_t offset;
+      isl_tiling_get_intratile_offset_el(&cmd_buffer->device->isl_dev,
+                                         dst->tiling, 1, dst->pitch,
+                                         rects[r].dst_x, rects[r].dst_y,
+                                         &offset,
+                                         &rects[r].dst_x, &rects[r].dst_y);
+
+      /* A red surface three times as wide as the actual RGB destination */
+      struct anv_meta_blit2d_surf dst_R = {
+         .bo = dst->bo,
+         .tiling = dst->tiling,
+         .base_offset = dst->base_offset,
+         .bs = dst->bs / 3,
+         .pitch = dst->pitch,
+      };
+
+      struct blit2d_dst_temps dst_temps;
+      blit2d_bind_dst(cmd_buffer, &dst_R, offset,
+                      (rects[r].dst_x + rects[r].width) * 3,
+                      rects[r].dst_y + rects[r].height,
+                      vk_single_component_format_for_rgb_size(dst->bs),
+                      &dst_temps);
+
+      struct blit_vb_data {
+         float pos[2];
+         float tex_coord[3];
+      } *vb_data;
+
+      unsigned vb_size = sizeof(struct anv_vue_header) + 3 * sizeof(*vb_data);
+
+      struct anv_state vb_state =
+         anv_cmd_buffer_alloc_dynamic_state(cmd_buffer, vb_size, 16);
+      memset(vb_state.map, 0, sizeof(struct anv_vue_header));
+      vb_data = vb_state.map + sizeof(struct anv_vue_header);
+
+      vb_data[0] = (struct blit_vb_data) {
+         .pos = {
+            (rects[r].dst_x + rects[r].width) * 3,
+            rects[r].dst_y + rects[r].height,
+         },
+         .tex_coord = {
+            rects[r].src_x + rects[r].width,
+            rects[r].src_y + rects[r].height,
+            src->pitch,
+         },
+      };
+
+      vb_data[1] = (struct blit_vb_data) {
+         .pos = {
+            rects[r].dst_x * 3,
+            rects[r].dst_y + rects[r].height,
+         },
+         .tex_coord = {
+            rects[r].src_x,
+            rects[r].src_y + rects[r].height,
+            src->pitch,
+         },
+      };
+
+      vb_data[2] = (struct blit_vb_data) {
+         .pos = {
+            rects[r].dst_x * 3,
+            rects[r].dst_y,
+         },
+         .tex_coord = {
+            rects[r].src_x,
+            rects[r].src_y,
+            src->pitch,
+         },
+      };
+
+      if (!device->info.has_llc)
+         anv_state_clflush(vb_state);
+
+      struct anv_buffer vertex_buffer = {
+         .device = device,
+         .size = vb_size,
+         .bo = &device->dynamic_state_block_pool.bo,
+         .offset = vb_state.offset,
+      };
+
+      anv_CmdBindVertexBuffers(anv_cmd_buffer_to_handle(cmd_buffer), 0, 2,
+         (VkBuffer[]) {
+            anv_buffer_to_handle(&vertex_buffer),
+            anv_buffer_to_handle(&vertex_buffer)
+         },
+         (VkDeviceSize[]) {
+            0,
+            sizeof(struct anv_vue_header),
+         });
+
+      ANV_CALL(CmdBeginRenderPass)(anv_cmd_buffer_to_handle(cmd_buffer),
+         &(VkRenderPassBeginInfo) {
+            .sType = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO,
+            .renderPass = device->meta_state.blit2d.render_pass,
+            .framebuffer = dst_temps.fb,
+            .renderArea = {
+               .offset = { rects[r].dst_x, rects[r].dst_y, },
+               .extent = { rects[r].width, rects[r].height },
+            },
+            .clearValueCount = 0,
+            .pClearValues = NULL,
+         }, VK_SUBPASS_CONTENTS_INLINE);
+
+      bind_pipeline(cmd_buffer, src_type, BLIT2D_DST_TYPE_RGB);
+
+      ANV_CALL(CmdDraw)(anv_cmd_buffer_to_handle(cmd_buffer), 3, 1, 0, 0);
+
+      ANV_CALL(CmdEndRenderPass)(anv_cmd_buffer_to_handle(cmd_buffer));
+
+      /* At the point where we emit the draw call, all data from the
+       * descriptor sets, etc. has been used.  We are free to delete it.
+       */
+      blit2d_unbind_src(cmd_buffer, src_type, &src_temps);
+      blit2d_unbind_dst(cmd_buffer, &dst_temps);
+   }
+}
+
 void
 anv_meta_blit2d(struct anv_cmd_buffer *cmd_buffer,
                 struct anv_meta_blit2d_surf *src,
@@ -666,7 +821,8 @@ anv_meta_blit2d(struct anv_cmd_buffer *cmd_buffer,
                                   num_rects, rects);
       return;
    } else if (dst->bs % 3 == 0) {
-      anv_finishme("Blitting to RGB destinations not yet supported");
+      anv_meta_blit2d_rgb_dst(cmd_buffer, src, src_type, dst,
+                              num_rects, rects);
       return;
    } else {
       assert(util_is_power_of_two(dst->bs));
@@ -892,6 +1048,61 @@ build_nir_copy_fragment_shader(struct anv_device *device,
    return b.shader;
 }
 
+/* RGB copies have the same interface as normal copies */
+#define rgb_vi_create_info normal_vi_create_info
+
+static nir_shader *
+build_nir_rgb_fragment_shader(struct anv_device *device,
+                              texel_fetch_build_func txf_func)
+{
+   const struct glsl_type *vec4 = glsl_vec4_type();
+   const struct glsl_type *vec3 = glsl_vector_type(GLSL_TYPE_FLOAT, 3);
+   nir_builder b;
+
+   nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
+   b.shader->info.name = ralloc_strdup(b.shader, "meta_blit2d_fs");
+
+   nir_variable *tex_pos_in = nir_variable_create(b.shader, nir_var_shader_in,
+                                                  vec3, "v_tex_pos");
+   tex_pos_in->data.location = VARYING_SLOT_VAR0;
+
+   nir_variable *color_out = nir_variable_create(b.shader, nir_var_shader_out,
+                                                 vec4, "f_color");
+   color_out->data.location = FRAG_RESULT_DATA0;
+
+   /* We need gl_FragCoord so we know our position */
+   nir_variable *frag_coord_in = nir_variable_create(b.shader,
+                                                     nir_var_shader_in,
+                                                     vec4, "gl_FragCoord");
+   frag_coord_in->data.location = VARYING_SLOT_POS;
+   frag_coord_in->data.origin_upper_left = true;
+
+   nir_ssa_def *pos_int = nir_f2i(&b, nir_load_var(&b, tex_pos_in));
+   unsigned swiz[4] = { 0, 1 };
+   nir_ssa_def *tex_pos = nir_swizzle(&b, pos_int, swiz, 2, false);
+   nir_ssa_def *tex_pitch = nir_channel(&b, pos_int, 2);
+
+   nir_ssa_def *color = txf_func(&b, device, tex_pos, tex_pitch);
+
+   /* We figure out which component we are by the x component of FragCoord */
+   nir_ssa_def *frag_coord_int = nir_f2i(&b, nir_load_var(&b, frag_coord_in));
+   nir_ssa_def *comp = nir_umod(&b, nir_channel(&b, frag_coord_int, 0),
+                                    nir_imm_int(&b, 3));
+
+   /* Select the given channel from the texelFetch result */
+   nir_ssa_def *color_channel =
+      nir_bcsel(&b, nir_ieq(&b, comp, nir_imm_int(&b, 0)),
+                    nir_channel(&b, color, 0),
+                    nir_bcsel(&b, nir_ieq(&b, comp, nir_imm_int(&b, 1)),
+                                  nir_channel(&b, color, 1),
+                                  nir_channel(&b, color, 2)));
+
+   nir_ssa_def *u = nir_ssa_undef(&b, 1, 32);
+   nir_store_var(&b, color_out, nir_vec4(&b, color_channel, u, u, u), 0x1);
+
+   return b.shader;
+}
+
 static const VkPipelineVertexInputStateCreateInfo w_tiled_vi_create_info = {
    .sType = VK_STRUCTURE_TYPE_PIPELINE_VERTEX_INPUT_STATE_CREATE_INFO,
    .vertexBindingDescriptionCount = 2,
@@ -1095,7 +1306,13 @@ blit2d_init_pipeline(struct anv_device *device,
       vi_create_info = &w_tiled_vi_create_info;
       break;
    case BLIT2D_DST_TYPE_RGB:
-      /* Not yet supported */
+      /* RGB destinations and W-detiling don't mix */
+      if (src_type != BLIT2D_SRC_TYPE_NORMAL)
+         return VK_SUCCESS;
+
+      fs.nir = build_nir_rgb_fragment_shader(device, src_func);
+      vi_create_info = &rgb_vi_create_info;
+      break;
    default:
       return VK_SUCCESS;
    }
diff --git a/src/intel/vulkan/anv_meta_clear.c b/src/intel/vulkan/anv_meta_clear.c
index 7ec0608..5d8dd3d 100644
--- a/src/intel/vulkan/anv_meta_clear.c
+++ b/src/intel/vulkan/anv_meta_clear.c
@@ -25,6 +25,8 @@
 #include "anv_private.h"
 #include "nir/nir_builder.h"
 
+#include "util/u_format_rgb9e5.h"
+
 /** Vertex attributes for color clears.  */
 struct color_clear_vattrs {
    struct anv_vue_header vue_header;
@@ -754,12 +756,22 @@ static void
 anv_cmd_clear_image(struct anv_cmd_buffer *cmd_buffer,
                     struct anv_image *image,
                     VkImageLayout image_layout,
-                    const VkClearValue *clear_value,
+                    VkClearValue clear_value,
                     uint32_t range_count,
                     const VkImageSubresourceRange *ranges)
 {
    VkDevice device_h = anv_device_to_handle(cmd_buffer->device);
 
+   VkFormat vk_format = image->vk_format;
+   if (vk_format == VK_FORMAT_E5B9G9R9_UFLOAT_PACK32) {
+      /* We can't actually render to this format so we have to work around it
+       * by manually unpacking and using R32_UINT.
+       */
+      clear_value.color.uint32[0] =
+         float3_to_rgb9e5(clear_value.color.float32);
+      vk_format = VK_FORMAT_R32_UINT;
+   }
+
    for (uint32_t r = 0; r < range_count; r++) {
       const VkImageSubresourceRange *range = &ranges[r];
       for (uint32_t l = 0; l < anv_get_levelCount(image, range); ++l) {
@@ -773,7 +785,7 @@ anv_cmd_clear_image(struct anv_cmd_buffer *cmd_buffer,
                   .sType = VK_STRUCTURE_TYPE_IMAGE_VIEW_CREATE_INFO,
                   .image = anv_image_to_handle(image),
                   .viewType = anv_meta_get_view_type(image),
-                  .format = image->vk_format,
+                  .format = vk_format,
                   .subresourceRange = {
                      .aspectMask = range->aspectMask,
                      .baseMipLevel = range->baseMipLevel + l,
@@ -800,7 +812,7 @@ anv_cmd_clear_image(struct anv_cmd_buffer *cmd_buffer,
                &fb);
 
             VkAttachmentDescription att_desc = {
-               .format = iview.vk_format,
+               .format = vk_format,
                .loadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
                .storeOp = VK_ATTACHMENT_STORE_OP_STORE,
                .stencilLoadOp = VK_ATTACHMENT_LOAD_OP_LOAD,
@@ -864,7 +876,7 @@ anv_cmd_clear_image(struct anv_cmd_buffer *cmd_buffer,
             VkClearAttachment clear_att = {
                .aspectMask = range->aspectMask,
                .colorAttachment = 0,
-               .clearValue = *clear_value,
+               .clearValue = clear_value,
             };
 
             VkClearRect clear_rect = {
@@ -903,7 +915,7 @@ void anv_CmdClearColorImage(
    meta_clear_begin(&saved_state, cmd_buffer);
 
    anv_cmd_clear_image(cmd_buffer, image, imageLayout,
-                       (const VkClearValue *) pColor,
+                       (VkClearValue) { .color = *pColor },
                        rangeCount, pRanges);
 
    meta_clear_end(&saved_state, cmd_buffer);
@@ -924,7 +936,7 @@ void anv_CmdClearDepthStencilImage(
    meta_clear_begin(&saved_state, cmd_buffer);
 
    anv_cmd_clear_image(cmd_buffer, image, imageLayout,
-                       (const VkClearValue *) pDepthStencil,
+                       (VkClearValue) { .depthStencil = *pDepthStencil },
                        rangeCount, pRanges);
 
    meta_clear_end(&saved_state, cmd_buffer);
@@ -1005,7 +1017,7 @@ do_buffer_fill(struct anv_cmd_buffer *cmd_buffer,
 
    anv_cmd_clear_image(cmd_buffer, anv_image_from_handle(dest_image),
                        VK_IMAGE_LAYOUT_COLOR_ATTACHMENT_OPTIMAL,
-                       &clear_value, 1, &range);
+                       clear_value, 1, &range);
 }
 
 void anv_CmdFillBuffer(
diff --git a/src/intel/vulkan/anv_pipeline.c b/src/intel/vulkan/anv_pipeline.c
index 5a09464..33c7fe4 100644
--- a/src/intel/vulkan/anv_pipeline.c
+++ b/src/intel/vulkan/anv_pipeline.c
@@ -204,6 +204,12 @@ void anv_DestroyPipeline(
                          pAllocator ? pAllocator : &device->alloc);
    if (pipeline->blend_state.map)
       anv_state_pool_free(&device->dynamic_state_pool, pipeline->blend_state);
+
+   for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) {
+      if (pipeline->shaders[s])
+         anv_shader_bin_unref(device, pipeline->shaders[s]);
+   }
+
    anv_free2(&device->alloc, pAllocator, pipeline);
 }
 
@@ -391,15 +397,34 @@ anv_fill_binding_table(struct brw_stage_prog_data *prog_data, unsigned bias)
    prog_data->binding_table.image_start = bias;
 }
 
+static struct anv_shader_bin *
+anv_pipeline_upload_kernel(struct anv_pipeline *pipeline,
+                           struct anv_pipeline_cache *cache,
+                           const void *key_data, uint32_t key_size,
+                           const void *kernel_data, uint32_t kernel_size,
+                           const void *prog_data, uint32_t prog_data_size,
+                           const struct anv_pipeline_bind_map *bind_map)
+{
+   if (cache) {
+      return anv_pipeline_cache_upload_kernel(cache, key_data, key_size,
+                                              kernel_data, kernel_size,
+                                              prog_data, prog_data_size,
+                                              bind_map);
+   } else {
+      return anv_shader_bin_create(pipeline->device, key_data, key_size,
+                                   kernel_data, kernel_size,
+                                   prog_data, prog_data_size, bind_map);
+   }
+}
+
+
 static void
 anv_pipeline_add_compiled_stage(struct anv_pipeline *pipeline,
                                 gl_shader_stage stage,
-                                const struct brw_stage_prog_data *prog_data,
-                                struct anv_pipeline_bind_map *map)
+                                struct anv_shader_bin *shader)
 {
-   pipeline->prog_data[stage] = prog_data;
+   pipeline->shaders[stage] = shader;
    pipeline->active_stages |= mesa_to_vk_shader_stage(stage);
-   pipeline->bindings[stage] = *map;
 }
 
 static VkResult
@@ -412,20 +437,20 @@ anv_pipeline_compile_vs(struct anv_pipeline *pipeline,
 {
    const struct brw_compiler *compiler =
       pipeline->device->instance->physicalDevice.compiler;
-   const struct brw_stage_prog_data *stage_prog_data;
    struct anv_pipeline_bind_map map;
    struct brw_vs_prog_key key;
-   uint32_t kernel = NO_KERNEL;
+   struct anv_shader_bin *bin = NULL;
    unsigned char sha1[20];
 
    populate_vs_prog_key(&pipeline->device->info, &key);
 
-   if (module->size > 0) {
-      anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint, spec_info);
-      kernel = anv_pipeline_cache_search(cache, sha1, &stage_prog_data, &map);
+   if (cache) {
+      anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint,
+                      pipeline->layout, spec_info);
+      bin = anv_pipeline_cache_search(cache, sha1, 20);
    }
 
-   if (kernel == NO_KERNEL) {
+   if (bin == NULL) {
       struct brw_vs_prog_data prog_data = { 0, };
       struct anv_pipeline_binding surface_to_descriptor[256];
       struct anv_pipeline_binding sampler_to_descriptor[256];
@@ -464,28 +489,29 @@ anv_pipeline_compile_vs(struct anv_pipeline *pipeline,
          return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
       }
 
-      stage_prog_data = &prog_data.base.base;
-      kernel = anv_pipeline_cache_upload_kernel(cache,
-                                                module->size > 0 ? sha1 : NULL,
-                                                shader_code, code_size,
-                                                &stage_prog_data, sizeof(prog_data),
-                                                &map);
+      bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20,
+                                       shader_code, code_size,
+                                       &prog_data, sizeof(prog_data), &map);
+      if (!bin) {
+         ralloc_free(mem_ctx);
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      }
+
       ralloc_free(mem_ctx);
    }
 
    const struct brw_vs_prog_data *vs_prog_data =
-      (const struct brw_vs_prog_data *) stage_prog_data;
+      (const struct brw_vs_prog_data *)anv_shader_bin_get_prog_data(bin);
 
    if (vs_prog_data->base.dispatch_mode == DISPATCH_MODE_SIMD8) {
-      pipeline->vs_simd8 = kernel;
+      pipeline->vs_simd8 = bin->kernel.offset;
       pipeline->vs_vec4 = NO_KERNEL;
    } else {
       pipeline->vs_simd8 = NO_KERNEL;
-      pipeline->vs_vec4 = kernel;
+      pipeline->vs_vec4 = bin->kernel.offset;
    }
 
-   anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_VERTEX,
-                                   stage_prog_data, &map);
+   anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_VERTEX, bin);
 
    return VK_SUCCESS;
 }
@@ -500,20 +526,20 @@ anv_pipeline_compile_gs(struct anv_pipeline *pipeline,
 {
    const struct brw_compiler *compiler =
       pipeline->device->instance->physicalDevice.compiler;
-   const struct brw_stage_prog_data *stage_prog_data;
    struct anv_pipeline_bind_map map;
    struct brw_gs_prog_key key;
-   uint32_t kernel = NO_KERNEL;
+   struct anv_shader_bin *bin = NULL;
    unsigned char sha1[20];
 
    populate_gs_prog_key(&pipeline->device->info, &key);
 
-   if (module->size > 0) {
-      anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint, spec_info);
-      kernel = anv_pipeline_cache_search(cache, sha1, &stage_prog_data, &map);
+   if (cache) {
+      anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint,
+                      pipeline->layout, spec_info);
+      bin = anv_pipeline_cache_search(cache, sha1, 20);
    }
 
-   if (kernel == NO_KERNEL) {
+   if (bin == NULL) {
       struct brw_gs_prog_data prog_data = { 0, };
       struct anv_pipeline_binding surface_to_descriptor[256];
       struct anv_pipeline_binding sampler_to_descriptor[256];
@@ -551,20 +577,20 @@ anv_pipeline_compile_gs(struct anv_pipeline *pipeline,
       }
 
       /* TODO: SIMD8 GS */
-      stage_prog_data = &prog_data.base.base;
-      kernel = anv_pipeline_cache_upload_kernel(cache,
-                                                module->size > 0 ? sha1 : NULL,
-                                                shader_code, code_size,
-                                                &stage_prog_data, sizeof(prog_data),
-                                                &map);
+      bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20,
+                                       shader_code, code_size,
+                                       &prog_data, sizeof(prog_data), &map);
+      if (!bin) {
+         ralloc_free(mem_ctx);
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      }
 
       ralloc_free(mem_ctx);
    }
 
-   pipeline->gs_kernel = kernel;
+   pipeline->gs_kernel = bin->kernel.offset;
 
-   anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_GEOMETRY,
-                                   stage_prog_data, &map);
+   anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_GEOMETRY, bin);
 
    return VK_SUCCESS;
 }
@@ -580,20 +606,20 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline,
 {
    const struct brw_compiler *compiler =
       pipeline->device->instance->physicalDevice.compiler;
-   const struct brw_stage_prog_data *stage_prog_data;
    struct anv_pipeline_bind_map map;
    struct brw_wm_prog_key key;
+   struct anv_shader_bin *bin = NULL;
    unsigned char sha1[20];
 
    populate_wm_prog_key(&pipeline->device->info, info, extra, &key);
 
-   if (module->size > 0) {
-      anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint, spec_info);
-      pipeline->ps_ksp0 =
-         anv_pipeline_cache_search(cache, sha1, &stage_prog_data, &map);
+   if (cache) {
+      anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint,
+                      pipeline->layout, spec_info);
+      bin = anv_pipeline_cache_search(cache, sha1, 20);
    }
 
-   if (pipeline->ps_ksp0 == NO_KERNEL) {
+   if (bin == NULL) {
       struct brw_wm_prog_data prog_data = { 0, };
       struct anv_pipeline_binding surface_to_descriptor[256];
       struct anv_pipeline_binding sampler_to_descriptor[256];
@@ -633,7 +659,7 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline,
          assert(num_rts + array_len <= 8);
 
          for (unsigned i = 0; i < array_len; i++) {
-            rt_bindings[num_rts] = (struct anv_pipeline_binding) {
+            rt_bindings[num_rts + i] = (struct anv_pipeline_binding) {
                .set = ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS,
                .binding = 0,
                .index = rt + i,
@@ -682,19 +708,20 @@ anv_pipeline_compile_fs(struct anv_pipeline *pipeline,
          return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
       }
 
-      stage_prog_data = &prog_data.base;
-      pipeline->ps_ksp0 =
-         anv_pipeline_cache_upload_kernel(cache,
-                                          module->size > 0 ? sha1 : NULL,
-                                          shader_code, code_size,
-                                                &stage_prog_data, sizeof(prog_data),
-                                                &map);
+      bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20,
+                                       shader_code, code_size,
+                                       &prog_data, sizeof(prog_data), &map);
+      if (!bin) {
+         ralloc_free(mem_ctx);
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      }
 
       ralloc_free(mem_ctx);
    }
 
-   anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_FRAGMENT,
-                                   stage_prog_data, &map);
+   pipeline->ps_ksp0 = bin->kernel.offset;
+
+   anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_FRAGMENT, bin);
 
    return VK_SUCCESS;
 }
@@ -709,20 +736,20 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline,
 {
    const struct brw_compiler *compiler =
       pipeline->device->instance->physicalDevice.compiler;
-   const struct brw_stage_prog_data *stage_prog_data;
    struct anv_pipeline_bind_map map;
    struct brw_cs_prog_key key;
-   uint32_t kernel = NO_KERNEL;
+   struct anv_shader_bin *bin = NULL;
    unsigned char sha1[20];
 
    populate_cs_prog_key(&pipeline->device->info, &key);
 
-   if (module->size > 0) {
-      anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint, spec_info);
-      kernel = anv_pipeline_cache_search(cache, sha1, &stage_prog_data, &map);
+   if (cache) {
+      anv_hash_shader(sha1, &key, sizeof(key), module, entrypoint,
+                      pipeline->layout, spec_info);
+      bin = anv_pipeline_cache_search(cache, sha1, 20);
    }
 
-   if (module->size == 0 || kernel == NO_KERNEL) {
+   if (bin == NULL) {
       struct brw_cs_prog_data prog_data = { 0, };
       struct anv_pipeline_binding surface_to_descriptor[256];
       struct anv_pipeline_binding sampler_to_descriptor[256];
@@ -754,20 +781,20 @@ anv_pipeline_compile_cs(struct anv_pipeline *pipeline,
          return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
       }
 
-      stage_prog_data = &prog_data.base;
-      kernel = anv_pipeline_cache_upload_kernel(cache,
-                                                module->size > 0 ? sha1 : NULL,
-                                                shader_code, code_size,
-                                                &stage_prog_data, sizeof(prog_data),
-                                                &map);
+      bin = anv_pipeline_upload_kernel(pipeline, cache, sha1, 20,
+                                       shader_code, code_size,
+                                       &prog_data, sizeof(prog_data), &map);
+      if (!bin) {
+         ralloc_free(mem_ctx);
+         return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
+      }
 
       ralloc_free(mem_ctx);
    }
 
-   pipeline->cs_simd = kernel;
+   pipeline->cs_simd = bin->kernel.offset;
 
-   anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_COMPUTE,
-                                   stage_prog_data, &map);
+   anv_pipeline_add_compiled_stage(pipeline, MESA_SHADER_COMPUTE, bin);
 
    return VK_SUCCESS;
 }
@@ -1161,8 +1188,7 @@ anv_pipeline_init(struct anv_pipeline *pipeline,
    /* When we free the pipeline, we detect stages based on the NULL status
     * of various prog_data pointers.  Make them NULL by default.
     */
-   memset(pipeline->prog_data, 0, sizeof(pipeline->prog_data));
-   memset(pipeline->bindings, 0, sizeof(pipeline->bindings));
+   memset(pipeline->shaders, 0, sizeof(pipeline->shaders));
 
    pipeline->vs_simd8 = NO_KERNEL;
    pipeline->vs_vec4 = NO_KERNEL;
@@ -1180,27 +1206,33 @@ anv_pipeline_init(struct anv_pipeline *pipeline,
    }
 
    if (modules[MESA_SHADER_VERTEX]) {
-      anv_pipeline_compile_vs(pipeline, cache, pCreateInfo,
-                              modules[MESA_SHADER_VERTEX],
-                              pStages[MESA_SHADER_VERTEX]->pName,
-                              pStages[MESA_SHADER_VERTEX]->pSpecializationInfo);
+      result = anv_pipeline_compile_vs(pipeline, cache, pCreateInfo,
+                                       modules[MESA_SHADER_VERTEX],
+                                       pStages[MESA_SHADER_VERTEX]->pName,
+                                       pStages[MESA_SHADER_VERTEX]->pSpecializationInfo);
+      if (result != VK_SUCCESS)
+         goto compile_fail;
    }
 
    if (modules[MESA_SHADER_TESS_CTRL] || modules[MESA_SHADER_TESS_EVAL])
       anv_finishme("no tessellation support");
 
    if (modules[MESA_SHADER_GEOMETRY]) {
-      anv_pipeline_compile_gs(pipeline, cache, pCreateInfo,
-                              modules[MESA_SHADER_GEOMETRY],
-                              pStages[MESA_SHADER_GEOMETRY]->pName,
-                              pStages[MESA_SHADER_GEOMETRY]->pSpecializationInfo);
+      result = anv_pipeline_compile_gs(pipeline, cache, pCreateInfo,
+                                       modules[MESA_SHADER_GEOMETRY],
+                                       pStages[MESA_SHADER_GEOMETRY]->pName,
+                                       pStages[MESA_SHADER_GEOMETRY]->pSpecializationInfo);
+      if (result != VK_SUCCESS)
+         goto compile_fail;
    }
 
    if (modules[MESA_SHADER_FRAGMENT]) {
-      anv_pipeline_compile_fs(pipeline, cache, pCreateInfo, extra,
-                              modules[MESA_SHADER_FRAGMENT],
-                              pStages[MESA_SHADER_FRAGMENT]->pName,
-                              pStages[MESA_SHADER_FRAGMENT]->pSpecializationInfo);
+      result = anv_pipeline_compile_fs(pipeline, cache, pCreateInfo, extra,
+                                       modules[MESA_SHADER_FRAGMENT],
+                                       pStages[MESA_SHADER_FRAGMENT]->pName,
+                                       pStages[MESA_SHADER_FRAGMENT]->pSpecializationInfo);
+      if (result != VK_SUCCESS)
+         goto compile_fail;
    }
 
    if (!(pipeline->active_stages & VK_SHADER_STAGE_VERTEX_BIT)) {
@@ -1263,6 +1295,16 @@ anv_pipeline_init(struct anv_pipeline *pipeline,
       pipeline->topology = _3DPRIM_RECTLIST;
 
    return VK_SUCCESS;
+
+compile_fail:
+   for (unsigned s = 0; s < MESA_SHADER_STAGES; s++) {
+      if (pipeline->shaders[s])
+         anv_shader_bin_unref(device, pipeline->shaders[s]);
+   }
+
+   anv_reloc_list_finish(&pipeline->batch_relocs, alloc);
+
+   return result;
 }
 
 VkResult
@@ -1277,9 +1319,6 @@ anv_graphics_pipeline_create(
    ANV_FROM_HANDLE(anv_device, device, _device);
    ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
 
-   if (cache == NULL)
-      cache = &device->default_pipeline_cache;
-
    switch (device->info.gen) {
    case 7:
       if (device->info.is_haswell)
@@ -1333,9 +1372,6 @@ static VkResult anv_compute_pipeline_create(
    ANV_FROM_HANDLE(anv_device, device, _device);
    ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
 
-   if (cache == NULL)
-      cache = &device->default_pipeline_cache;
-
    switch (device->info.gen) {
    case 7:
       if (device->info.is_haswell)
diff --git a/src/intel/vulkan/anv_pipeline_cache.c b/src/intel/vulkan/anv_pipeline_cache.c
index fbca311..2753c46 100644
--- a/src/intel/vulkan/anv_pipeline_cache.c
+++ b/src/intel/vulkan/anv_pipeline_cache.c
@@ -22,9 +22,120 @@
  */
 
 #include "util/mesa-sha1.h"
+#include "util/hash_table.h"
 #include "util/debug.h"
 #include "anv_private.h"
 
+struct shader_bin_key {
+   uint32_t size;
+   uint8_t data[0];
+};
+
+static size_t
+anv_shader_bin_size(uint32_t prog_data_size, uint32_t key_size,
+                    uint32_t surface_count, uint32_t sampler_count)
+{
+   const uint32_t binding_data_size =
+      (surface_count + sampler_count) * sizeof(struct anv_pipeline_binding);
+
+   return align_u32(sizeof(struct anv_shader_bin), 8) +
+          align_u32(prog_data_size, 8) +
+          align_u32(sizeof(uint32_t) + key_size, 8) +
+          align_u32(binding_data_size, 8);
+}
+
+static inline const struct shader_bin_key *
+anv_shader_bin_get_key(const struct anv_shader_bin *shader)
+{
+   const void *data = shader;
+   data += align_u32(sizeof(struct anv_shader_bin), 8);
+   data += align_u32(shader->prog_data_size, 8);
+   return data;
+}
+
+struct anv_shader_bin *
+anv_shader_bin_create(struct anv_device *device,
+                      const void *key_data, uint32_t key_size,
+                      const void *kernel_data, uint32_t kernel_size,
+                      const void *prog_data, uint32_t prog_data_size,
+                      const struct anv_pipeline_bind_map *bind_map)
+{
+   const size_t size =
+      anv_shader_bin_size(prog_data_size, key_size,
+                          bind_map->surface_count, bind_map->sampler_count);
+
+   struct anv_shader_bin *shader =
+      anv_alloc(&device->alloc, size, 8, VK_SYSTEM_ALLOCATION_SCOPE_DEVICE);
+   if (!shader)
+      return NULL;
+
+   shader->ref_cnt = 1;
+
+   shader->kernel =
+      anv_state_pool_alloc(&device->instruction_state_pool, kernel_size, 64);
+   memcpy(shader->kernel.map, kernel_data, kernel_size);
+   shader->kernel_size = kernel_size;
+   shader->bind_map = *bind_map;
+   shader->prog_data_size = prog_data_size;
+
+   /* Now we fill out the floating data at the end */
+   void *data = shader;
+   data += align_u32(sizeof(struct anv_shader_bin), 8);
+
+   memcpy(data, prog_data, prog_data_size);
+   data += align_u32(prog_data_size, 8);
+
+   struct shader_bin_key *key = data;
+   key->size = key_size;
+   memcpy(key->data, key_data, key_size);
+   data += align_u32(sizeof(*key) + key_size, 8);
+
+   shader->bind_map.surface_to_descriptor = data;
+   memcpy(data, bind_map->surface_to_descriptor,
+          bind_map->surface_count * sizeof(struct anv_pipeline_binding));
+   data += bind_map->surface_count * sizeof(struct anv_pipeline_binding);
+
+   shader->bind_map.sampler_to_descriptor = data;
+   memcpy(data, bind_map->sampler_to_descriptor,
+          bind_map->sampler_count * sizeof(struct anv_pipeline_binding));
+
+   return shader;
+}
+
+void
+anv_shader_bin_destroy(struct anv_device *device,
+                       struct anv_shader_bin *shader)
+{
+   assert(shader->ref_cnt == 0);
+   anv_state_pool_free(&device->instruction_state_pool, shader->kernel);
+   anv_free(&device->alloc, shader);
+}
+
+static size_t
+anv_shader_bin_data_size(const struct anv_shader_bin *shader)
+{
+   return anv_shader_bin_size(shader->prog_data_size,
+                              anv_shader_bin_get_key(shader)->size,
+                              shader->bind_map.surface_count,
+                              shader->bind_map.sampler_count) +
+          align_u32(shader->kernel_size, 8);
+}
+
+static void
+anv_shader_bin_write_data(const struct anv_shader_bin *shader, void *data)
+{
+   size_t struct_size =
+      anv_shader_bin_size(shader->prog_data_size,
+                          anv_shader_bin_get_key(shader)->size,
+                          shader->bind_map.surface_count,
+                          shader->bind_map.sampler_count);
+
+   memcpy(data, shader, struct_size);
+   data += struct_size;
+
+   memcpy(data, shader->kernel.map, shader->kernel_size);
+}
+
 /* Remaining work:
  *
  * - Compact binding table layout so it's tight and not dependent on
@@ -37,69 +148,62 @@
  *   dual_src_blend.
  */
 
+static uint32_t
+shader_bin_key_hash_func(const void *void_key)
+{
+   const struct shader_bin_key *key = void_key;
+   return _mesa_hash_data(key->data, key->size);
+}
+
+static bool
+shader_bin_key_compare_func(const void *void_a, const void *void_b)
+{
+   const struct shader_bin_key *a = void_a, *b = void_b;
+   if (a->size != b->size)
+      return false;
+
+   return memcmp(a->data, b->data, a->size) == 0;
+}
+
 void
 anv_pipeline_cache_init(struct anv_pipeline_cache *cache,
-                        struct anv_device *device)
+                        struct anv_device *device,
+                        bool cache_enabled)
 {
    cache->device = device;
-   anv_state_stream_init(&cache->program_stream,
-                         &device->instruction_block_pool);
    pthread_mutex_init(&cache->mutex, NULL);
 
-   cache->kernel_count = 0;
-   cache->total_size = 0;
-   cache->table_size = 1024;
-   const size_t byte_size = cache->table_size * sizeof(cache->hash_table[0]);
-   cache->hash_table = malloc(byte_size);
-
-   /* We don't consider allocation failure fatal, we just start with a 0-sized
-    * cache. */
-   if (cache->hash_table == NULL ||
-       !env_var_as_boolean("ANV_ENABLE_PIPELINE_CACHE", true))
-      cache->table_size = 0;
-   else
-      memset(cache->hash_table, 0xff, byte_size);
+   if (cache_enabled) {
+      cache->cache = _mesa_hash_table_create(NULL, shader_bin_key_hash_func,
+                                             shader_bin_key_compare_func);
+   } else {
+      cache->cache = NULL;
+   }
 }
 
 void
 anv_pipeline_cache_finish(struct anv_pipeline_cache *cache)
 {
-   anv_state_stream_finish(&cache->program_stream);
    pthread_mutex_destroy(&cache->mutex);
-   free(cache->hash_table);
-}
-
-struct cache_entry {
-   unsigned char sha1[20];
-   uint32_t prog_data_size;
-   uint32_t kernel_size;
-   uint32_t surface_count;
-   uint32_t sampler_count;
-   uint32_t image_count;
-
-   char prog_data[0];
-
-   /* kernel follows prog_data at next 64 byte aligned address */
-};
-
-static uint32_t
-entry_size(struct cache_entry *entry)
-{
-   /* This returns the number of bytes needed to serialize an entry, which
-    * doesn't include the alignment padding bytes.
-    */
 
-   const uint32_t map_size =
-      entry->surface_count * sizeof(struct anv_pipeline_binding) +
-      entry->sampler_count * sizeof(struct anv_pipeline_binding);
+   if (cache->cache) {
+      /* This is a bit unfortunate.  In order to keep things from randomly
+       * going away, the shader cache has to hold a reference to all shader
+       * binaries it contains.  We unref them when we destroy the cache.
+       */
+      struct hash_entry *entry;
+      hash_table_foreach(cache->cache, entry)
+         anv_shader_bin_unref(cache->device, entry->data);
 
-   return sizeof(*entry) + entry->prog_data_size + map_size;
+      _mesa_hash_table_destroy(cache->cache, NULL);
+   }
 }
 
 void
 anv_hash_shader(unsigned char *hash, const void *key, size_t key_size,
                 struct anv_shader_module *module,
                 const char *entrypoint,
+                const struct anv_pipeline_layout *pipeline_layout,
                 const VkSpecializationInfo *spec_info)
 {
    struct mesa_sha1 *ctx;
@@ -108,6 +212,10 @@ anv_hash_shader(unsigned char *hash, const void *key, size_t key_size,
    _mesa_sha1_update(ctx, key, key_size);
    _mesa_sha1_update(ctx, module->sha1, sizeof(module->sha1));
    _mesa_sha1_update(ctx, entrypoint, strlen(entrypoint));
+   if (pipeline_layout) {
+      _mesa_sha1_update(ctx, pipeline_layout->sha1,
+                        sizeof(pipeline_layout->sha1));
+   }
    /* hash in shader stage, pipeline layout? */
    if (spec_info) {
       _mesa_sha1_update(ctx, spec_info->pMapEntries,
@@ -117,210 +225,94 @@ anv_hash_shader(unsigned char *hash, const void *key, size_t key_size,
    _mesa_sha1_final(ctx, hash);
 }
 
-static uint32_t
-anv_pipeline_cache_search_unlocked(struct anv_pipeline_cache *cache,
-                                   const unsigned char *sha1,
-                                   const struct brw_stage_prog_data **prog_data,
-                                   struct anv_pipeline_bind_map *map)
+static struct anv_shader_bin *
+anv_pipeline_cache_search_locked(struct anv_pipeline_cache *cache,
+                                 const void *key_data, uint32_t key_size)
 {
-   const uint32_t mask = cache->table_size - 1;
-   const uint32_t start = (*(uint32_t *) sha1);
-
-   for (uint32_t i = 0; i < cache->table_size; i++) {
-      const uint32_t index = (start + i) & mask;
-      const uint32_t offset = cache->hash_table[index];
-
-      if (offset == ~0)
-         return NO_KERNEL;
-
-      struct cache_entry *entry =
-         cache->program_stream.block_pool->map + offset;
-      if (memcmp(entry->sha1, sha1, sizeof(entry->sha1)) == 0) {
-         if (prog_data) {
-            assert(map);
-            void *p = entry->prog_data;
-            *prog_data = p;
-            p += entry->prog_data_size;
-            map->surface_count = entry->surface_count;
-            map->sampler_count = entry->sampler_count;
-            map->image_count = entry->image_count;
-            map->surface_to_descriptor = p;
-            p += map->surface_count * sizeof(struct anv_pipeline_binding);
-            map->sampler_to_descriptor = p;
-         }
-
-         return offset + align_u32(entry_size(entry), 64);
-      }
-   }
-
-   /* This can happen if the pipeline cache is disabled via
-    * ANV_ENABLE_PIPELINE_CACHE=false
-    */
-   return NO_KERNEL;
+   uint32_t vla[1 + DIV_ROUND_UP(key_size, sizeof(uint32_t))];
+   struct shader_bin_key *key = (void *)vla;
+   key->size = key_size;
+   memcpy(key->data, key_data, key_size);
+
+   struct hash_entry *entry = _mesa_hash_table_search(cache->cache, key);
+   if (entry)
+      return entry->data;
+   else
+      return NULL;
 }
 
-uint32_t
+struct anv_shader_bin *
 anv_pipeline_cache_search(struct anv_pipeline_cache *cache,
-                          const unsigned char *sha1,
-                          const struct brw_stage_prog_data **prog_data,
-                          struct anv_pipeline_bind_map *map)
+                          const void *key_data, uint32_t key_size)
 {
-   uint32_t kernel;
+   if (!cache->cache)
+      return NULL;
 
    pthread_mutex_lock(&cache->mutex);
 
-   kernel = anv_pipeline_cache_search_unlocked(cache, sha1, prog_data, map);
+   struct anv_shader_bin *shader =
+      anv_pipeline_cache_search_locked(cache, key_data, key_size);
 
    pthread_mutex_unlock(&cache->mutex);
 
-   return kernel;
-}
-
-static void
-anv_pipeline_cache_set_entry(struct anv_pipeline_cache *cache,
-                             struct cache_entry *entry, uint32_t entry_offset)
-{
-   const uint32_t mask = cache->table_size - 1;
-   const uint32_t start = (*(uint32_t *) entry->sha1);
-
-   /* We'll always be able to insert when we get here. */
-   assert(cache->kernel_count < cache->table_size / 2);
+   /* We increment refcount before handing it to the caller */
+   if (shader)
+      anv_shader_bin_ref(shader);
 
-   for (uint32_t i = 0; i < cache->table_size; i++) {
-      const uint32_t index = (start + i) & mask;
-      if (cache->hash_table[index] == ~0) {
-         cache->hash_table[index] = entry_offset;
-         break;
-      }
-   }
-
-   cache->total_size += entry_size(entry) + entry->kernel_size;
-   cache->kernel_count++;
+   return shader;
 }
 
-static VkResult
-anv_pipeline_cache_grow(struct anv_pipeline_cache *cache)
+static struct anv_shader_bin *
+anv_pipeline_cache_add_shader(struct anv_pipeline_cache *cache,
+                              const void *key_data, uint32_t key_size,
+                              const void *kernel_data, uint32_t kernel_size,
+                              const void *prog_data, uint32_t prog_data_size,
+                              const struct anv_pipeline_bind_map *bind_map)
 {
-   const uint32_t table_size = cache->table_size * 2;
-   const uint32_t old_table_size = cache->table_size;
-   const size_t byte_size = table_size * sizeof(cache->hash_table[0]);
-   uint32_t *table;
-   uint32_t *old_table = cache->hash_table;
-
-   table = malloc(byte_size);
-   if (table == NULL)
-      return VK_ERROR_OUT_OF_HOST_MEMORY;
-
-   cache->hash_table = table;
-   cache->table_size = table_size;
-   cache->kernel_count = 0;
-   cache->total_size = 0;
-
-   memset(cache->hash_table, 0xff, byte_size);
-   for (uint32_t i = 0; i < old_table_size; i++) {
-      const uint32_t offset = old_table[i];
-      if (offset == ~0)
-         continue;
+   struct anv_shader_bin *shader =
+      anv_pipeline_cache_search_locked(cache, key_data, key_size);
+   if (shader)
+      return shader;
 
-      struct cache_entry *entry =
-         cache->program_stream.block_pool->map + offset;
-      anv_pipeline_cache_set_entry(cache, entry, offset);
-   }
+   struct anv_shader_bin *bin =
+      anv_shader_bin_create(cache->device, key_data, key_size,
+                            kernel_data, kernel_size,
+                            prog_data, prog_data_size, bind_map);
+   if (!bin)
+      return NULL;
 
-   free(old_table);
+   _mesa_hash_table_insert(cache->cache, anv_shader_bin_get_key(bin), bin);
 
-   return VK_SUCCESS;
+   return bin;
 }
 
-static void
-anv_pipeline_cache_add_entry(struct anv_pipeline_cache *cache,
-                             struct cache_entry *entry, uint32_t entry_offset)
-{
-   if (cache->kernel_count == cache->table_size / 2)
-      anv_pipeline_cache_grow(cache);
-
-   /* Failing to grow that hash table isn't fatal, but may mean we don't
-    * have enough space to add this new kernel. Only add it if there's room.
-    */
-   if (cache->kernel_count < cache->table_size / 2)
-      anv_pipeline_cache_set_entry(cache, entry, entry_offset);
-}
-
-uint32_t
+struct anv_shader_bin *
 anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
-                                 const unsigned char *sha1,
-                                 const void *kernel, size_t kernel_size,
-                                 const struct brw_stage_prog_data **prog_data,
-                                 size_t prog_data_size,
-                                 struct anv_pipeline_bind_map *map)
+                                 const void *key_data, uint32_t key_size,
+                                 const void *kernel_data, uint32_t kernel_size,
+                                 const void *prog_data, uint32_t prog_data_size,
+                                 const struct anv_pipeline_bind_map *bind_map)
 {
-   pthread_mutex_lock(&cache->mutex);
-
-   /* Before uploading, check again that another thread didn't upload this
-    * shader while we were compiling it.
-    */
-   if (sha1) {
-      uint32_t cached_kernel =
-         anv_pipeline_cache_search_unlocked(cache, sha1, prog_data, map);
-      if (cached_kernel != NO_KERNEL) {
-         pthread_mutex_unlock(&cache->mutex);
-         return cached_kernel;
-      }
-   }
+   if (cache->cache) {
+      pthread_mutex_lock(&cache->mutex);
 
-   struct cache_entry *entry;
+      struct anv_shader_bin *bin =
+         anv_pipeline_cache_add_shader(cache, key_data, key_size,
+                                       kernel_data, kernel_size,
+                                       prog_data, prog_data_size, bind_map);
 
-   const uint32_t map_size =
-      map->surface_count * sizeof(struct anv_pipeline_binding) +
-      map->sampler_count * sizeof(struct anv_pipeline_binding);
+      pthread_mutex_unlock(&cache->mutex);
 
-   const uint32_t preamble_size =
-      align_u32(sizeof(*entry) + prog_data_size + map_size, 64);
+      /* We increment refcount before handing it to the caller */
+      anv_shader_bin_ref(bin);
 
-   const uint32_t size = preamble_size + kernel_size;
-
-   assert(size < cache->program_stream.block_pool->block_size);
-   const struct anv_state state =
-      anv_state_stream_alloc(&cache->program_stream, size, 64);
-
-   entry = state.map;
-   entry->prog_data_size = prog_data_size;
-   entry->surface_count = map->surface_count;
-   entry->sampler_count = map->sampler_count;
-   entry->image_count = map->image_count;
-   entry->kernel_size = kernel_size;
-
-   void *p = entry->prog_data;
-   memcpy(p, *prog_data, prog_data_size);
-   p += prog_data_size;
-
-   memcpy(p, map->surface_to_descriptor,
-          map->surface_count * sizeof(struct anv_pipeline_binding));
-   map->surface_to_descriptor = p;
-   p += map->surface_count * sizeof(struct anv_pipeline_binding);
-
-   memcpy(p, map->sampler_to_descriptor,
-          map->sampler_count * sizeof(struct anv_pipeline_binding));
-   map->sampler_to_descriptor = p;
-
-   if (sha1) {
-      assert(anv_pipeline_cache_search_unlocked(cache, sha1,
-                                                NULL, NULL) == NO_KERNEL);
-
-      memcpy(entry->sha1, sha1, sizeof(entry->sha1));
-      anv_pipeline_cache_add_entry(cache, entry, state.offset);
+      return bin;
+   } else {
+      /* In this case, we're not caching it so the caller owns it entirely */
+      return anv_shader_bin_create(cache->device, key_data, key_size,
+                                   kernel_data, kernel_size,
+                                   prog_data, prog_data_size, bind_map);
    }
-
-   pthread_mutex_unlock(&cache->mutex);
-
-   memcpy(state.map + preamble_size, kernel, kernel_size);
-
-   if (!cache->device->info.has_llc)
-      anv_state_clflush(state);
-
-   *prog_data = (const struct brw_stage_prog_data *) entry->prog_data;
-
-   return state.offset + preamble_size;
 }
 
 struct cache_header {
@@ -339,6 +331,9 @@ anv_pipeline_cache_load(struct anv_pipeline_cache *cache,
    struct cache_header header;
    uint8_t uuid[VK_UUID_SIZE];
 
+   if (cache->cache == NULL)
+      return;
+
    if (size < sizeof(header))
       return;
    memcpy(&header, data, sizeof(header));
@@ -354,38 +349,62 @@ anv_pipeline_cache_load(struct anv_pipeline_cache *cache,
    if (memcmp(header.uuid, uuid, VK_UUID_SIZE) != 0)
       return;
 
-   void *end = (void *) data + size;
-   void *p = (void *) data + header.header_size;
-
-   while (p < end) {
-      struct cache_entry *entry = p;
-
-      void *data = entry->prog_data;
-      const struct brw_stage_prog_data *prog_data = data;
-      data += entry->prog_data_size;
-
-      struct anv_pipeline_binding *surface_to_descriptor = data;
-      data += entry->surface_count * sizeof(struct anv_pipeline_binding);
-      struct anv_pipeline_binding *sampler_to_descriptor = data;
-      data += entry->sampler_count * sizeof(struct anv_pipeline_binding);
-      void *kernel = data;
-
-      struct anv_pipeline_bind_map map = {
-         .surface_count = entry->surface_count,
-         .sampler_count = entry->sampler_count,
-         .image_count = entry->image_count,
-         .surface_to_descriptor = surface_to_descriptor,
-         .sampler_to_descriptor = sampler_to_descriptor
-      };
-
-      anv_pipeline_cache_upload_kernel(cache, entry->sha1,
-                                       kernel, entry->kernel_size,
-                                       &prog_data,
-                                       entry->prog_data_size, &map);
-      p = kernel + entry->kernel_size;
+   const void *end = data + size;
+   const void *p = data + header.header_size;
+
+   /* Count is the total number of valid entries */
+   uint32_t count;
+   if (p + sizeof(count) >= end)
+      return;
+   memcpy(&count, p, sizeof(count));
+   p += align_u32(sizeof(count), 8);
+
+   for (uint32_t i = 0; i < count; i++) {
+      struct anv_shader_bin bin;
+      if (p + sizeof(bin) > end)
+         break;
+      memcpy(&bin, p, sizeof(bin));
+      p += align_u32(sizeof(struct anv_shader_bin), 8);
+
+      const void *prog_data = p;
+      p += align_u32(bin.prog_data_size, 8);
+
+      struct shader_bin_key key;
+      if (p + sizeof(key) > end)
+         break;
+      memcpy(&key, p, sizeof(key));
+      const void *key_data = p + sizeof(key);
+      p += align_u32(sizeof(key) + key.size, 8);
+
+      /* We're going to memcpy this so getting rid of const is fine */
+      struct anv_pipeline_binding *bindings = (void *)p;
+      p += align_u32((bin.bind_map.surface_count + bin.bind_map.sampler_count) *
+                     sizeof(struct anv_pipeline_binding), 8);
+      bin.bind_map.surface_to_descriptor = bindings;
+      bin.bind_map.sampler_to_descriptor = bindings + bin.bind_map.surface_count;
+
+      const void *kernel_data = p;
+      p += align_u32(bin.kernel_size, 8);
+
+      if (p > end)
+         break;
+
+      anv_pipeline_cache_add_shader(cache, key_data, key.size,
+                                    kernel_data, bin.kernel_size,
+                                    prog_data, bin.prog_data_size,
+                                    &bin.bind_map);
    }
 }
 
+static bool
+pipeline_cache_enabled()
+{
+   static int enabled = -1;
+   if (enabled < 0)
+      enabled = env_var_as_boolean("ANV_ENABLE_PIPELINE_CACHE", true);
+   return enabled;
+}
+
 VkResult anv_CreatePipelineCache(
     VkDevice                                    _device,
     const VkPipelineCacheCreateInfo*            pCreateInfo,
@@ -404,7 +423,7 @@ VkResult anv_CreatePipelineCache(
    if (cache == NULL)
       return vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
 
-   anv_pipeline_cache_init(cache, device);
+   anv_pipeline_cache_init(cache, device, pipeline_cache_enabled());
 
    if (pCreateInfo->initialDataSize > 0)
       anv_pipeline_cache_load(cache,
@@ -439,9 +458,16 @@ VkResult anv_GetPipelineCacheData(
    ANV_FROM_HANDLE(anv_pipeline_cache, cache, _cache);
    struct cache_header *header;
 
-   const size_t size = sizeof(*header) + cache->total_size;
-
    if (pData == NULL) {
+      size_t size = align_u32(sizeof(*header), 8) +
+                    align_u32(sizeof(uint32_t), 8);
+
+      if (cache->cache) {
+         struct hash_entry *entry;
+         hash_table_foreach(cache->cache, entry)
+            size += anv_shader_bin_data_size(entry->data);
+      }
+
       *pDataSize = size;
       return VK_SUCCESS;
    }
@@ -458,25 +484,25 @@ VkResult anv_GetPipelineCacheData(
    header->vendor_id = 0x8086;
    header->device_id = device->chipset_id;
    anv_device_get_cache_uuid(header->uuid);
-   p += header->header_size;
+   p += align_u32(header->header_size, 8);
 
-   struct cache_entry *entry;
-   for (uint32_t i = 0; i < cache->table_size; i++) {
-      if (cache->hash_table[i] == ~0)
-         continue;
+   uint32_t *count = p;
+   p += align_u32(sizeof(*count), 8);
+   *count = 0;
 
-      entry = cache->program_stream.block_pool->map + cache->hash_table[i];
-      const uint32_t size = entry_size(entry);
-      if (end < p + size + entry->kernel_size)
-         break;
-
-      memcpy(p, entry, size);
-      p += size;
+   if (cache->cache) {
+      struct hash_entry *entry;
+      hash_table_foreach(cache->cache, entry) {
+         struct anv_shader_bin *shader = entry->data;
+         size_t data_size = anv_shader_bin_data_size(entry->data);
+         if (p + data_size > end)
+            break;
 
-      void *kernel = (void *) entry + align_u32(size, 64);
+         anv_shader_bin_write_data(shader, p);
+         p += data_size;
 
-      memcpy(p, kernel, entry->kernel_size);
-      p += entry->kernel_size;
+         (*count)++;
+      }
    }
 
    *pDataSize = p - pData;
@@ -484,25 +510,6 @@ VkResult anv_GetPipelineCacheData(
    return VK_SUCCESS;
 }
 
-static void
-anv_pipeline_cache_merge(struct anv_pipeline_cache *dst,
-                         struct anv_pipeline_cache *src)
-{
-   for (uint32_t i = 0; i < src->table_size; i++) {
-      const uint32_t offset = src->hash_table[i];
-      if (offset == ~0)
-         continue;
-
-      struct cache_entry *entry =
-         src->program_stream.block_pool->map + offset;
-
-      if (anv_pipeline_cache_search(dst, entry->sha1, NULL, NULL) != NO_KERNEL)
-         continue;
-
-      anv_pipeline_cache_add_entry(dst, entry, offset);
-   }
-}
-
 VkResult anv_MergePipelineCaches(
     VkDevice                                    _device,
     VkPipelineCache                             destCache,
@@ -511,10 +518,23 @@ VkResult anv_MergePipelineCaches(
 {
    ANV_FROM_HANDLE(anv_pipeline_cache, dst, destCache);
 
+   if (!dst->cache)
+      return VK_SUCCESS;
+
    for (uint32_t i = 0; i < srcCacheCount; i++) {
       ANV_FROM_HANDLE(anv_pipeline_cache, src, pSrcCaches[i]);
+      if (!src->cache)
+         continue;
+
+      struct hash_entry *entry;
+      hash_table_foreach(src->cache, entry) {
+         struct anv_shader_bin *bin = entry->data;
+         if (_mesa_hash_table_search(dst->cache, anv_shader_bin_get_key(bin)))
+            continue;
 
-      anv_pipeline_cache_merge(dst, src);
+         anv_shader_bin_ref(bin);
+         _mesa_hash_table_insert(dst->cache, anv_shader_bin_get_key(bin), bin);
+      }
    }
 
    return VK_SUCCESS;
diff --git a/src/intel/vulkan/anv_private.h b/src/intel/vulkan/anv_private.h
index 50b860c..8b57e1b 100644
--- a/src/intel/vulkan/anv_private.h
+++ b/src/intel/vulkan/anv_private.h
@@ -83,6 +83,12 @@ extern "C" {
 #define MAX(a, b) ((a) > (b) ? (a) : (b))
 
 static inline uint32_t
+align_down_npot_u32(uint32_t v, uint32_t a)
+{
+   return v - (v % a);
+}
+
+static inline uint32_t
 align_u32(uint32_t v, uint32_t a)
 {
    assert(a != 0 && a == (a & -a));
@@ -394,9 +400,9 @@ struct anv_fixed_size_state_pool {
 };
 
 #define ANV_MIN_STATE_SIZE_LOG2 6
-#define ANV_MAX_STATE_SIZE_LOG2 10
+#define ANV_MAX_STATE_SIZE_LOG2 17
 
-#define ANV_STATE_BUCKETS (ANV_MAX_STATE_SIZE_LOG2 - ANV_MIN_STATE_SIZE_LOG2)
+#define ANV_STATE_BUCKETS (ANV_MAX_STATE_SIZE_LOG2 - ANV_MIN_STATE_SIZE_LOG2 + 1)
 
 struct anv_state_pool {
    struct anv_block_pool *block_pool;
@@ -652,31 +658,27 @@ struct anv_queue {
 
 struct anv_pipeline_cache {
    struct anv_device *                          device;
-   struct anv_state_stream                      program_stream;
    pthread_mutex_t                              mutex;
 
-   uint32_t                                     total_size;
-   uint32_t                                     table_size;
-   uint32_t                                     kernel_count;
-   uint32_t *                                   hash_table;
+   struct hash_table *                          cache;
 };
 
 struct anv_pipeline_bind_map;
 
 void anv_pipeline_cache_init(struct anv_pipeline_cache *cache,
-                             struct anv_device *device);
+                             struct anv_device *device,
+                             bool cache_enabled);
 void anv_pipeline_cache_finish(struct anv_pipeline_cache *cache);
-uint32_t anv_pipeline_cache_search(struct anv_pipeline_cache *cache,
-                                   const unsigned char *sha1,
-                                   const struct brw_stage_prog_data **prog_data,
-                                   struct anv_pipeline_bind_map *map);
-uint32_t anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
-                                          const unsigned char *sha1,
-                                          const void *kernel,
-                                          size_t kernel_size,
-                                          const struct brw_stage_prog_data **prog_data,
-                                          size_t prog_data_size,
-                                          struct anv_pipeline_bind_map *map);
+
+struct anv_shader_bin *
+anv_pipeline_cache_search(struct anv_pipeline_cache *cache,
+                          const void *key, uint32_t key_size);
+struct anv_shader_bin *
+anv_pipeline_cache_upload_kernel(struct anv_pipeline_cache *cache,
+                                 const void *key_data, uint32_t key_size,
+                                 const void *kernel_data, uint32_t kernel_size,
+                                 const void *prog_data, uint32_t prog_data_size,
+                                 const struct anv_pipeline_bind_map *bind_map);
 
 struct anv_device {
     VK_LOADER_DATA                              _loader_data;
@@ -698,7 +700,7 @@ struct anv_device {
     struct anv_state_pool                       dynamic_state_pool;
 
     struct anv_block_pool                       instruction_block_pool;
-    struct anv_pipeline_cache                   default_pipeline_cache;
+    struct anv_state_pool                       instruction_state_pool;
 
     struct anv_block_pool                       surface_state_block_pool;
     struct anv_state_pool                       surface_state_pool;
@@ -1057,6 +1059,8 @@ struct anv_pipeline_layout {
    struct {
       bool has_dynamic_offsets;
    } stage[MESA_SHADER_STAGES];
+
+   unsigned char sha1[20];
 };
 
 struct anv_buffer {
@@ -1422,6 +1426,7 @@ struct anv_shader_module {
 void anv_hash_shader(unsigned char *hash, const void *key, size_t key_size,
                      struct anv_shader_module *module,
                      const char *entrypoint,
+                     const struct anv_pipeline_layout *pipeline_layout,
                      const VkSpecializationInfo *spec_info);
 
 static inline gl_shader_stage
@@ -1449,13 +1454,57 @@ struct anv_pipeline_bind_map {
    uint32_t surface_count;
    uint32_t sampler_count;
    uint32_t image_count;
-   uint32_t attachment_count;
 
    struct anv_pipeline_binding *                surface_to_descriptor;
    struct anv_pipeline_binding *                sampler_to_descriptor;
-   uint32_t *                                   surface_to_attachment;
 };
 
+struct anv_shader_bin {
+   uint32_t ref_cnt;
+
+   struct anv_state kernel;
+   uint32_t kernel_size;
+
+   struct anv_pipeline_bind_map bind_map;
+
+   uint32_t prog_data_size;
+
+   /* Prog data follows, then the key, both aligned to 8-bytes */
+};
+
+struct anv_shader_bin *
+anv_shader_bin_create(struct anv_device *device,
+                      const void *key, uint32_t key_size,
+                      const void *kernel, uint32_t kernel_size,
+                      const void *prog_data, uint32_t prog_data_size,
+                      const struct anv_pipeline_bind_map *bind_map);
+
+void
+anv_shader_bin_destroy(struct anv_device *device, struct anv_shader_bin *shader);
+
+static inline void
+anv_shader_bin_ref(struct anv_shader_bin *shader)
+{
+   assert(shader->ref_cnt >= 1);
+   __sync_fetch_and_add(&shader->ref_cnt, 1);
+}
+
+static inline void
+anv_shader_bin_unref(struct anv_device *device, struct anv_shader_bin *shader)
+{
+   assert(shader->ref_cnt >= 1);
+   if (__sync_fetch_and_add(&shader->ref_cnt, -1) == 1)
+      anv_shader_bin_destroy(device, shader);
+}
+
+static inline const struct brw_stage_prog_data *
+anv_shader_bin_get_prog_data(const struct anv_shader_bin *shader)
+{
+   const void *data = shader;
+   data += align_u32(sizeof(struct anv_shader_bin), 8);
+   return data;
+}
+
 struct anv_pipeline {
    struct anv_device *                          device;
    struct anv_batch                             batch;
@@ -1465,12 +1514,12 @@ struct anv_pipeline {
    struct anv_dynamic_state                     dynamic_state;
 
    struct anv_pipeline_layout *                 layout;
-   struct anv_pipeline_bind_map                 bindings[MESA_SHADER_STAGES];
 
    bool                                         use_repclear;
    bool                                         needs_data_cache;
 
-   const struct brw_stage_prog_data *           prog_data[MESA_SHADER_STAGES];
+   struct anv_shader_bin *                      shaders[MESA_SHADER_STAGES];
+
    struct {
       uint32_t                                  start[MESA_SHADER_GEOMETRY + 1];
       uint32_t                                  size[MESA_SHADER_GEOMETRY + 1];
@@ -1513,29 +1562,29 @@ struct anv_pipeline {
    } gen9;
 };
 
-static inline const struct brw_vs_prog_data *
-get_vs_prog_data(struct anv_pipeline *pipeline)
-{
-   return (const struct brw_vs_prog_data *) pipeline->prog_data[MESA_SHADER_VERTEX];
-}
-
-static inline const struct brw_gs_prog_data *
-get_gs_prog_data(struct anv_pipeline *pipeline)
+static inline bool
+anv_pipeline_has_stage(const struct anv_pipeline *pipeline,
+                       gl_shader_stage stage)
 {
-   return (const struct brw_gs_prog_data *) pipeline->prog_data[MESA_SHADER_GEOMETRY];
+   return (pipeline->active_stages & mesa_to_vk_shader_stage(stage)) != 0;
 }
 
-static inline const struct brw_wm_prog_data *
-get_wm_prog_data(struct anv_pipeline *pipeline)
-{
-   return (const struct brw_wm_prog_data *) pipeline->prog_data[MESA_SHADER_FRAGMENT];
+#define ANV_DECL_GET_PROG_DATA_FUNC(prefix, stage)                   \
+static inline const struct brw_##prefix##_prog_data *                \
+get_##prefix##_prog_data(struct anv_pipeline *pipeline)              \
+{                                                                    \
+   if (anv_pipeline_has_stage(pipeline, stage)) {                    \
+      return (const struct brw_##prefix##_prog_data *)               \
+             anv_shader_bin_get_prog_data(pipeline->shaders[stage]); \
+   } else {                                                          \
+      return NULL;                                                   \
+   }                                                                 \
 }
 
-static inline const struct brw_cs_prog_data *
-get_cs_prog_data(struct anv_pipeline *pipeline)
-{
-   return (const struct brw_cs_prog_data *) pipeline->prog_data[MESA_SHADER_COMPUTE];
-}
+ANV_DECL_GET_PROG_DATA_FUNC(vs, MESA_SHADER_VERTEX)
+ANV_DECL_GET_PROG_DATA_FUNC(gs, MESA_SHADER_GEOMETRY)
+ANV_DECL_GET_PROG_DATA_FUNC(wm, MESA_SHADER_FRAGMENT)
+ANV_DECL_GET_PROG_DATA_FUNC(cs, MESA_SHADER_COMPUTE)
 
 struct anv_graphics_pipeline_create_info {
    /**
diff --git a/src/intel/vulkan/anv_wsi_wayland.c b/src/intel/vulkan/anv_wsi_wayland.c
index a9e1617..18dae0a 100644
--- a/src/intel/vulkan/anv_wsi_wayland.c
+++ b/src/intel/vulkan/anv_wsi_wayland.c
@@ -519,6 +519,7 @@ wsi_wl_swapchain_acquire_next_image(struct anv_swapchain *anv_chain,
          if (!chain->images[i].busy) {
             /* We found a non-busy image */
             *image_index = i;
+            chain->images[i].busy = true;
             return VK_SUCCESS;
          }
       }
diff --git a/src/intel/vulkan/anv_wsi_x11.c b/src/intel/vulkan/anv_wsi_x11.c
index 2895d6b..81c524b 100644
--- a/src/intel/vulkan/anv_wsi_x11.c
+++ b/src/intel/vulkan/anv_wsi_x11.c
@@ -516,6 +516,7 @@ x11_acquire_next_image(struct anv_swapchain *anv_chain,
             /* We found a non-busy image */
             xshmfence_await(chain->images[i].shm_fence);
             *image_index = i;
+            chain->images[i].busy = true;
             return VK_SUCCESS;
          }
       }
@@ -553,6 +554,7 @@ x11_queue_present(struct anv_swapchain *anv_chain,
 
    xshmfence_reset(image->shm_fence);
 
+   ++chain->send_sbc;
    xcb_void_cookie_t cookie =
       xcb_present_pixmap(chain->conn,
                          chain->window,
@@ -786,6 +788,7 @@ x11_surface_create_swapchain(VkIcdSurfaceBase *icd_surface,
    chain->window = surface->window;
    chain->extent = pCreateInfo->imageExtent;
    chain->image_count = num_images;
+   chain->send_sbc = 0;
 
    chain->event_id = xcb_generate_id(chain->conn);
    xcb_present_select_input(chain->conn, chain->event_id, chain->window,
diff --git a/src/intel/vulkan/gen7_pipeline.c b/src/intel/vulkan/gen7_pipeline.c
index 89cb51f..d1b18e0 100644
--- a/src/intel/vulkan/gen7_pipeline.c
+++ b/src/intel/vulkan/gen7_pipeline.c
@@ -75,76 +75,6 @@ gen7_emit_rs_state(struct anv_pipeline *pipeline,
    GENX(3DSTATE_SF_pack)(NULL, &pipeline->gen7.sf, &sf);
 }
 
-static void
-gen7_emit_cb_state(struct anv_pipeline *pipeline,
-                   const VkPipelineColorBlendStateCreateInfo *info,
-                   const VkPipelineMultisampleStateCreateInfo *ms_info)
-{
-   struct anv_device *device = pipeline->device;
-
-   if (info == NULL || info->attachmentCount == 0) {
-      pipeline->blend_state =
-         anv_state_pool_emit(&device->dynamic_state_pool,
-            GENX(BLEND_STATE), 64,
-            .ColorBufferBlendEnable = false,
-            .WriteDisableAlpha = true,
-            .WriteDisableRed = true,
-            .WriteDisableGreen = true,
-            .WriteDisableBlue = true);
-   } else {
-      const VkPipelineColorBlendAttachmentState *a = &info->pAttachments[0];
-      struct GENX(BLEND_STATE) blend = {
-         .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
-         .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
-
-         .LogicOpEnable = info->logicOpEnable,
-         .LogicOpFunction = vk_to_gen_logic_op[info->logicOp],
-         .ColorBufferBlendEnable = a->blendEnable,
-         .ColorClampRange = COLORCLAMP_RTFORMAT,
-         .PreBlendColorClampEnable = true,
-         .PostBlendColorClampEnable = true,
-         .SourceBlendFactor = vk_to_gen_blend[a->srcColorBlendFactor],
-         .DestinationBlendFactor = vk_to_gen_blend[a->dstColorBlendFactor],
-         .ColorBlendFunction = vk_to_gen_blend_op[a->colorBlendOp],
-         .SourceAlphaBlendFactor = vk_to_gen_blend[a->srcAlphaBlendFactor],
-         .DestinationAlphaBlendFactor = vk_to_gen_blend[a->dstAlphaBlendFactor],
-         .AlphaBlendFunction = vk_to_gen_blend_op[a->alphaBlendOp],
-         .WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),
-         .WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),
-         .WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),
-         .WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),
-      };
-
-      /* Our hardware applies the blend factor prior to the blend function
-       * regardless of what function is used.  Technically, this means the
-       * hardware can do MORE than GL or Vulkan specify.  However, it also
-       * means that, for MIN and MAX, we have to stomp the blend factor to
-       * ONE to make it a no-op.
-       */
-      if (a->colorBlendOp == VK_BLEND_OP_MIN ||
-          a->colorBlendOp == VK_BLEND_OP_MAX) {
-         blend.SourceBlendFactor = BLENDFACTOR_ONE;
-         blend.DestinationBlendFactor = BLENDFACTOR_ONE;
-      }
-      if (a->alphaBlendOp == VK_BLEND_OP_MIN ||
-          a->alphaBlendOp == VK_BLEND_OP_MAX) {
-         blend.SourceAlphaBlendFactor = BLENDFACTOR_ONE;
-         blend.DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
-      }
-
-      pipeline->blend_state = anv_state_pool_alloc(&device->dynamic_state_pool,
-                                                   GENX(BLEND_STATE_length) * 4,
-                                                   64);
-      GENX(BLEND_STATE_pack)(NULL, pipeline->blend_state.map, &blend);
-      if (pipeline->device->info.has_llc)
-         anv_state_clflush(pipeline->blend_state);
-    }
-
-   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
-      bsp.BlendStatePointer = pipeline->blend_state.offset;
-   }
-}
-
 VkResult
 genX(graphics_pipeline_create)(
     VkDevice                                    _device,
@@ -182,31 +112,13 @@ genX(graphics_pipeline_create)(
 
    emit_ds_state(pipeline, pCreateInfo->pDepthStencilState, pass, subpass);
 
-   gen7_emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
-                                pCreateInfo->pMultisampleState);
+   emit_cb_state(pipeline, pCreateInfo->pColorBlendState,
+                           pCreateInfo->pMultisampleState);
 
    emit_urb_setup(pipeline);
 
-   const VkPipelineRasterizationStateCreateInfo *rs_info =
-      pCreateInfo->pRasterizationState;
-
-   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_CLIP), clip) {
-      clip.FrontWinding             = vk_to_gen_front_face[rs_info->frontFace],
-      clip.CullMode                 = vk_to_gen_cullmode[rs_info->cullMode],
-      clip.ClipEnable               = !(extra && extra->use_rectlist),
-      clip.APIMode                  = APIMODE_OGL,
-      clip.ViewportXYClipTestEnable = true,
-      clip.ViewportZClipTestEnable  = !pipeline->depth_clamp_enable,
-      clip.ClipMode                 = CLIPMODE_NORMAL,
-
-      clip.TriangleStripListProvokingVertexSelect   = 0,
-      clip.LineStripListProvokingVertexSelect       = 0,
-      clip.TriangleFanProvokingVertexSelect         = 1,
-
-      clip.MinimumPointWidth        = 0.125,
-      clip.MaximumPointWidth        = 255.875,
-      clip.MaximumVPIndex = pCreateInfo->pViewportState->viewportCount - 1;
-   }
+   emit_3dstate_clip(pipeline, pCreateInfo->pViewportState,
+                     pCreateInfo->pRasterizationState, extra);
 
    if (pCreateInfo->pMultisampleState &&
        pCreateInfo->pMultisampleState->rasterizationSamples > 1)
@@ -385,6 +297,7 @@ genX(graphics_pipeline_create)(
          wm.LineEndCapAntialiasingRegionWidth   = 0; /* 0.5 pixels */
          wm.LineAntialiasingRegionWidth         = 1; /* 1.0 pixels */
          wm.PointRasterizationRule              = RASTRULE_UPPER_RIGHT;
+         wm.PixelShaderKillPixel                = wm_prog_data->uses_kill;
          wm.PixelShaderComputedDepthMode        = wm_prog_data->computed_depth_mode;
          wm.PixelShaderUsesSourceDepth          = wm_prog_data->uses_src_depth;
          wm.PixelShaderUsesSourceW              = wm_prog_data->uses_src_w;
diff --git a/src/intel/vulkan/gen8_pipeline.c b/src/intel/vulkan/gen8_pipeline.c
index 6d70df6..cc10d3a 100644
--- a/src/intel/vulkan/gen8_pipeline.c
+++ b/src/intel/vulkan/gen8_pipeline.c
@@ -100,123 +100,6 @@ emit_rs_state(struct anv_pipeline *pipeline,
 }
 
 static void
-emit_cb_state(struct anv_pipeline *pipeline,
-              const VkPipelineColorBlendStateCreateInfo *info,
-              const VkPipelineMultisampleStateCreateInfo *ms_info)
-{
-   struct anv_device *device = pipeline->device;
-
-   uint32_t num_dwords = GENX(BLEND_STATE_length);
-   pipeline->blend_state =
-      anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);
-
-   struct GENX(BLEND_STATE) blend_state = {
-      .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
-      .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
-   };
-
-   /* Default everything to disabled */
-   for (uint32_t i = 0; i < 8; i++) {
-      blend_state.Entry[i].WriteDisableAlpha = true;
-      blend_state.Entry[i].WriteDisableRed = true;
-      blend_state.Entry[i].WriteDisableGreen = true;
-      blend_state.Entry[i].WriteDisableBlue = true;
-   }
-
-   struct anv_pipeline_bind_map *map =
-      &pipeline->bindings[MESA_SHADER_FRAGMENT];
-
-   bool has_writeable_rt = false;
-   for (unsigned i = 0; i < map->surface_count; i++) {
-      struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];
-
-      /* All color attachments are at the beginning of the binding table */
-      if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
-         break;
-
-      /* We can have at most 8 attachments */
-      assert(i < 8);
-
-      if (binding->index >= info->attachmentCount)
-         continue;
-
-      assert(binding->binding == 0);
-      const VkPipelineColorBlendAttachmentState *a =
-         &info->pAttachments[binding->index];
-
-      if (a->srcColorBlendFactor != a->srcAlphaBlendFactor ||
-          a->dstColorBlendFactor != a->dstAlphaBlendFactor ||
-          a->colorBlendOp != a->alphaBlendOp) {
-         blend_state.IndependentAlphaBlendEnable = true;
-      }
-
-      blend_state.Entry[i] = (struct GENX(BLEND_STATE_ENTRY)) {
-         .LogicOpEnable = info->logicOpEnable,
-         .LogicOpFunction = vk_to_gen_logic_op[info->logicOp],
-         .ColorBufferBlendEnable = a->blendEnable,
-         .PreBlendSourceOnlyClampEnable = false,
-         .ColorClampRange = COLORCLAMP_RTFORMAT,
-         .PreBlendColorClampEnable = true,
-         .PostBlendColorClampEnable = true,
-         .SourceBlendFactor = vk_to_gen_blend[a->srcColorBlendFactor],
-         .DestinationBlendFactor = vk_to_gen_blend[a->dstColorBlendFactor],
-         .ColorBlendFunction = vk_to_gen_blend_op[a->colorBlendOp],
-         .SourceAlphaBlendFactor = vk_to_gen_blend[a->srcAlphaBlendFactor],
-         .DestinationAlphaBlendFactor = vk_to_gen_blend[a->dstAlphaBlendFactor],
-         .AlphaBlendFunction = vk_to_gen_blend_op[a->alphaBlendOp],
-         .WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),
-         .WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),
-         .WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),
-         .WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),
-      };
-
-      if (a->colorWriteMask != 0)
-         has_writeable_rt = true;
-
-      /* Our hardware applies the blend factor prior to the blend function
-       * regardless of what function is used.  Technically, this means the
-       * hardware can do MORE than GL or Vulkan specify.  However, it also
-       * means that, for MIN and MAX, we have to stomp the blend factor to
-       * ONE to make it a no-op.
-       */
-      if (a->colorBlendOp == VK_BLEND_OP_MIN ||
-          a->colorBlendOp == VK_BLEND_OP_MAX) {
-         blend_state.Entry[i].SourceBlendFactor = BLENDFACTOR_ONE;
-         blend_state.Entry[i].DestinationBlendFactor = BLENDFACTOR_ONE;
-      }
-      if (a->alphaBlendOp == VK_BLEND_OP_MIN ||
-          a->alphaBlendOp == VK_BLEND_OP_MAX) {
-         blend_state.Entry[i].SourceAlphaBlendFactor = BLENDFACTOR_ONE;
-         blend_state.Entry[i].DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
-      }
-   }
-
-   struct GENX(BLEND_STATE_ENTRY) *bs0 = &blend_state.Entry[0];
-
-   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS_BLEND), blend) {
-      blend.AlphaToCoverageEnable         = blend_state.AlphaToCoverageEnable;
-      blend.HasWriteableRT                = has_writeable_rt;
-      blend.ColorBufferBlendEnable        = bs0->ColorBufferBlendEnable;
-      blend.SourceAlphaBlendFactor        = bs0->SourceAlphaBlendFactor;
-      blend.DestinationAlphaBlendFactor   = bs0->DestinationAlphaBlendFactor;
-      blend.SourceBlendFactor             = bs0->SourceBlendFactor;
-      blend.DestinationBlendFactor        = bs0->DestinationBlendFactor;
-      blend.AlphaTestEnable               = false;
-      blend.IndependentAlphaBlendEnable   =
-         blend_state.IndependentAlphaBlendEnable;
-   }
-
-   GENX(BLEND_STATE_pack)(NULL, pipeline->blend_state.map, &blend_state);
-   if (!device->info.has_llc)
-      anv_state_clflush(pipeline->blend_state);
-
-   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
-      bsp.BlendStatePointer      = pipeline->blend_state.offset;
-      bsp.BlendStatePointerValid = true;
-   }
-}
-
-static void
 emit_ms_state(struct anv_pipeline *pipeline,
               const VkPipelineMultisampleStateCreateInfo *info)
 {
@@ -303,29 +186,10 @@ genX(graphics_pipeline_create)(
 
    emit_urb_setup(pipeline);
 
-   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
-   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_CLIP), clip) {
-      clip.ClipEnable               = !(extra && extra->use_rectlist);
-      clip.EarlyCullEnable          = true;
-      clip.APIMode                  = 1; /* D3D */
-      clip.ViewportXYClipTestEnable = true;
-
-      clip.ClipMode =
-         pCreateInfo->pRasterizationState->rasterizerDiscardEnable ?
-         REJECT_ALL : NORMAL;
-
-      clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
-         (wm_prog_data->barycentric_interp_modes & 0x38) != 0 : 0;
-
-      clip.TriangleStripListProvokingVertexSelect  = 0;
-      clip.LineStripListProvokingVertexSelect      = 0;
-      clip.TriangleFanProvokingVertexSelect        = 1;
-
-      clip.MinimumPointWidth  = 0.125;
-      clip.MaximumPointWidth  = 255.875;
-      clip.MaximumVPIndex     = pCreateInfo->pViewportState->viewportCount - 1;
-   }
+   emit_3dstate_clip(pipeline, pCreateInfo->pViewportState,
+                     pCreateInfo->pRasterizationState, extra);
 
+   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
    anv_batch_emit(&pipeline->batch, GENX(3DSTATE_WM), wm) {
       wm.StatisticsEnable                    = true;
       wm.LineEndCapAntialiasingRegionWidth   = _05pixels;
diff --git a/src/intel/vulkan/genX_cmd_buffer.c b/src/intel/vulkan/genX_cmd_buffer.c
index 741d5bf..f92d856 100644
--- a/src/intel/vulkan/genX_cmd_buffer.c
+++ b/src/intel/vulkan/genX_cmd_buffer.c
@@ -1448,4 +1448,17 @@ void genX(CmdCopyQueryPoolResults)(
    }
 }
 
+#else
+void genX(CmdCopyQueryPoolResults)(
+    VkCommandBuffer                             commandBuffer,
+    VkQueryPool                                 queryPool,
+    uint32_t                                    firstQuery,
+    uint32_t                                    queryCount,
+    VkBuffer                                    destBuffer,
+    VkDeviceSize                                destOffset,
+    VkDeviceSize                                destStride,
+    VkQueryResultFlags                          flags)
+{
+   anv_finishme("Queries not yet supported on Ivy Bridge");
+}
 #endif
diff --git a/src/intel/vulkan/genX_l3.c b/src/intel/vulkan/genX_l3.c
index 0d36e3c..a74071c 100644
--- a/src/intel/vulkan/genX_l3.c
+++ b/src/intel/vulkan/genX_l3.c
@@ -315,10 +315,14 @@ get_pipeline_state_l3_weights(const struct anv_pipeline *pipeline)
    bool needs_dc = false, needs_slm = false;
 
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
-      const struct brw_stage_prog_data *prog_data = pipeline->prog_data[i];
+      if (!anv_pipeline_has_stage(pipeline, i))
+         continue;
+
+      const struct brw_stage_prog_data *prog_data =
+         anv_shader_bin_get_prog_data(pipeline->shaders[i]);
 
       needs_dc |= pipeline->needs_data_cache;
-      needs_slm |= prog_data && prog_data->total_shared;
+      needs_slm |= prog_data->total_shared;
    }
 
    return get_default_l3_weights(&pipeline->device->info,
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index 5cbcfd2..7d8129d 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -63,8 +63,7 @@ genX(compute_pipeline_create)(
    /* When we free the pipeline, we detect stages based on the NULL status
     * of various prog_data pointers.  Make them NULL by default.
     */
-   memset(pipeline->prog_data, 0, sizeof(pipeline->prog_data));
-   memset(pipeline->bindings, 0, sizeof(pipeline->bindings));
+   memset(pipeline->shaders, 0, sizeof(pipeline->shaders));
 
    pipeline->vs_simd8 = NO_KERNEL;
    pipeline->vs_vec4 = NO_KERNEL;
@@ -76,9 +75,13 @@ genX(compute_pipeline_create)(
 
    assert(pCreateInfo->stage.stage == VK_SHADER_STAGE_COMPUTE_BIT);
    ANV_FROM_HANDLE(anv_shader_module, module,  pCreateInfo->stage.module);
-   anv_pipeline_compile_cs(pipeline, cache, pCreateInfo, module,
-                           pCreateInfo->stage.pName,
-                           pCreateInfo->stage.pSpecializationInfo);
+   result = anv_pipeline_compile_cs(pipeline, cache, pCreateInfo, module,
+                                    pCreateInfo->stage.pName,
+                                    pCreateInfo->stage.pSpecializationInfo);
+   if (result != VK_SUCCESS) {
+      anv_free2(&device->alloc, pAllocator, pipeline);
+      return result;
+   }
 
    pipeline->use_repclear = false;
 
diff --git a/src/intel/vulkan/genX_pipeline_util.h b/src/intel/vulkan/genX_pipeline_util.h
index 669b456..94692e4 100644
--- a/src/intel/vulkan/genX_pipeline_util.h
+++ b/src/intel/vulkan/genX_pipeline_util.h
@@ -291,6 +291,11 @@ emit_3dstate_sbe(struct anv_pipeline *pipeline)
       if (input_index < 0)
          continue;
 
+      if (attr == VARYING_SLOT_PNTC) {
+         sbe.PointSpriteTextureCoordinateEnable = 1 << input_index;
+         continue;
+      }
+
       const int slot = fs_input_map->varying_to_slot[attr];
 
       if (input_index >= 16)
@@ -512,3 +517,177 @@ emit_ds_state(struct anv_pipeline *pipeline,
    GENX(3DSTATE_WM_DEPTH_STENCIL_pack)(NULL, depth_stencil_dw, &depth_stencil);
 #endif
 }
+
+static void
+emit_cb_state(struct anv_pipeline *pipeline,
+              const VkPipelineColorBlendStateCreateInfo *info,
+              const VkPipelineMultisampleStateCreateInfo *ms_info)
+{
+   struct anv_device *device = pipeline->device;
+
+   const uint32_t num_dwords = GENX(BLEND_STATE_length);
+   pipeline->blend_state =
+      anv_state_pool_alloc(&device->dynamic_state_pool, num_dwords * 4, 64);
+
+   struct GENX(BLEND_STATE) blend_state = {
+#if GEN_GEN >= 8
+      .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
+      .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
+#else
+      /* Make sure it gets zeroed */
+      .Entry = { { 0, }, },
+#endif
+   };
+
+   /* Default everything to disabled */
+   for (uint32_t i = 0; i < 8; i++) {
+      blend_state.Entry[i].WriteDisableAlpha = true;
+      blend_state.Entry[i].WriteDisableRed = true;
+      blend_state.Entry[i].WriteDisableGreen = true;
+      blend_state.Entry[i].WriteDisableBlue = true;
+   }
+
+   uint32_t surface_count = 0;
+   struct anv_pipeline_bind_map *map;
+   if (anv_pipeline_has_stage(pipeline, MESA_SHADER_FRAGMENT)) {
+      map = &pipeline->shaders[MESA_SHADER_FRAGMENT]->bind_map;
+      surface_count = map->surface_count;
+   }
+
+   bool has_writeable_rt = false;
+   for (unsigned i = 0; i < surface_count; i++) {
+      struct anv_pipeline_binding *binding = &map->surface_to_descriptor[i];
+
+      /* All color attachments are at the beginning of the binding table */
+      if (binding->set != ANV_DESCRIPTOR_SET_COLOR_ATTACHMENTS)
+         break;
+
+      /* We can have at most 8 attachments */
+      assert(i < 8);
+
+      if (binding->index >= info->attachmentCount)
+         continue;
+
+      assert(binding->binding == 0);
+      const VkPipelineColorBlendAttachmentState *a =
+         &info->pAttachments[binding->index];
+
+      blend_state.Entry[i] = (struct GENX(BLEND_STATE_ENTRY)) {
+#if GEN_GEN < 8
+         .AlphaToCoverageEnable = ms_info && ms_info->alphaToCoverageEnable,
+         .AlphaToOneEnable = ms_info && ms_info->alphaToOneEnable,
+#endif
+         .LogicOpEnable = info->logicOpEnable,
+         .LogicOpFunction = vk_to_gen_logic_op[info->logicOp],
+         .ColorBufferBlendEnable = a->blendEnable,
+         .ColorClampRange = COLORCLAMP_RTFORMAT,
+         .PreBlendColorClampEnable = true,
+         .PostBlendColorClampEnable = true,
+         .SourceBlendFactor = vk_to_gen_blend[a->srcColorBlendFactor],
+         .DestinationBlendFactor = vk_to_gen_blend[a->dstColorBlendFactor],
+         .ColorBlendFunction = vk_to_gen_blend_op[a->colorBlendOp],
+         .SourceAlphaBlendFactor = vk_to_gen_blend[a->srcAlphaBlendFactor],
+         .DestinationAlphaBlendFactor = vk_to_gen_blend[a->dstAlphaBlendFactor],
+         .AlphaBlendFunction = vk_to_gen_blend_op[a->alphaBlendOp],
+         .WriteDisableAlpha = !(a->colorWriteMask & VK_COLOR_COMPONENT_A_BIT),
+         .WriteDisableRed = !(a->colorWriteMask & VK_COLOR_COMPONENT_R_BIT),
+         .WriteDisableGreen = !(a->colorWriteMask & VK_COLOR_COMPONENT_G_BIT),
+         .WriteDisableBlue = !(a->colorWriteMask & VK_COLOR_COMPONENT_B_BIT),
+      };
+
+      if (a->srcColorBlendFactor != a->srcAlphaBlendFactor ||
+          a->dstColorBlendFactor != a->dstAlphaBlendFactor ||
+          a->colorBlendOp != a->alphaBlendOp) {
+#if GEN_GEN >= 8
+         blend_state.IndependentAlphaBlendEnable = true;
+#else
+         blend_state.Entry[i].IndependentAlphaBlendEnable = true;
+#endif
+      }
+
+      if (a->colorWriteMask != 0)
+         has_writeable_rt = true;
+
+      /* Our hardware applies the blend factor prior to the blend function
+       * regardless of what function is used.  Technically, this means the
+       * hardware can do MORE than GL or Vulkan specify.  However, it also
+       * means that, for MIN and MAX, we have to stomp the blend factor to
+       * ONE to make it a no-op.
+       */
+      if (a->colorBlendOp == VK_BLEND_OP_MIN ||
+          a->colorBlendOp == VK_BLEND_OP_MAX) {
+         blend_state.Entry[i].SourceBlendFactor = BLENDFACTOR_ONE;
+         blend_state.Entry[i].DestinationBlendFactor = BLENDFACTOR_ONE;
+      }
+      if (a->alphaBlendOp == VK_BLEND_OP_MIN ||
+          a->alphaBlendOp == VK_BLEND_OP_MAX) {
+         blend_state.Entry[i].SourceAlphaBlendFactor = BLENDFACTOR_ONE;
+         blend_state.Entry[i].DestinationAlphaBlendFactor = BLENDFACTOR_ONE;
+      }
+   }
+
+#if GEN_GEN >= 8
+   struct GENX(BLEND_STATE_ENTRY) *bs0 = &blend_state.Entry[0];
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_PS_BLEND), blend) {
+      blend.AlphaToCoverageEnable         = blend_state.AlphaToCoverageEnable;
+      blend.HasWriteableRT                = has_writeable_rt;
+      blend.ColorBufferBlendEnable        = bs0->ColorBufferBlendEnable;
+      blend.SourceAlphaBlendFactor        = bs0->SourceAlphaBlendFactor;
+      blend.DestinationAlphaBlendFactor   = bs0->DestinationAlphaBlendFactor;
+      blend.SourceBlendFactor             = bs0->SourceBlendFactor;
+      blend.DestinationBlendFactor        = bs0->DestinationBlendFactor;
+      blend.AlphaTestEnable               = false;
+      blend.IndependentAlphaBlendEnable   =
+         blend_state.IndependentAlphaBlendEnable;
+   }
+#else
+   (void)has_writeable_rt;
+#endif
+
+   GENX(BLEND_STATE_pack)(NULL, pipeline->blend_state.map, &blend_state);
+   if (!device->info.has_llc)
+      anv_state_clflush(pipeline->blend_state);
+
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_BLEND_STATE_POINTERS), bsp) {
+      bsp.BlendStatePointer      = pipeline->blend_state.offset;
+#if GEN_GEN >= 8
+      bsp.BlendStatePointerValid = true;
+#endif
+   }
+}
+
+static void
+emit_3dstate_clip(struct anv_pipeline *pipeline,
+                  const VkPipelineViewportStateCreateInfo *vp_info,
+                  const VkPipelineRasterizationStateCreateInfo *rs_info,
+                  const struct anv_graphics_pipeline_create_info *extra)
+{
+   const struct brw_wm_prog_data *wm_prog_data = get_wm_prog_data(pipeline);
+   (void) wm_prog_data;
+   anv_batch_emit(&pipeline->batch, GENX(3DSTATE_CLIP), clip) {
+      clip.ClipEnable               = !(extra && extra->use_rectlist);
+      clip.EarlyCullEnable          = true;
+      clip.APIMode                  = APIMODE_D3D,
+      clip.ViewportXYClipTestEnable = true;
+
+      clip.ClipMode = rs_info->rasterizerDiscardEnable ?
+         CLIPMODE_REJECT_ALL : CLIPMODE_NORMAL;
+
+      clip.TriangleStripListProvokingVertexSelect = 0;
+      clip.LineStripListProvokingVertexSelect     = 0;
+      clip.TriangleFanProvokingVertexSelect       = 1;
+
+      clip.MinimumPointWidth = 0.125;
+      clip.MaximumPointWidth = 255.875;
+      clip.MaximumVPIndex    = vp_info->viewportCount - 1;
+
+#if GEN_GEN == 7
+      clip.FrontWinding            = vk_to_gen_front_face[rs_info->frontFace];
+      clip.CullMode                = vk_to_gen_cullmode[rs_info->cullMode];
+      clip.ViewportZClipTestEnable = !pipeline->depth_clamp_enable;
+#else
+      clip.NonPerspectiveBarycentricEnable = wm_prog_data ?
+         (wm_prog_data->barycentric_interp_modes & 0x38) != 0 : 0;
+#endif
+   }
+}
diff --git a/src/loader/loader.c b/src/loader/loader.c
index 522fba3..56ffc5d 100644
--- a/src/loader/loader.c
+++ b/src/loader/loader.c
@@ -80,8 +80,11 @@
 #include "xmlpool.h"
 #endif
 #endif
-#ifdef HAVE_SYSFS
-#include <sys/types.h>
+#ifdef MAJOR_IN_MKDEV
+#include <sys/mkdev.h>
+#endif
+#ifdef MAJOR_IN_SYSMACROS
+#include <sys/sysmacros.h>
 #endif
 #include "loader.h"
 
diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c
index 896f225..4c2e849 100644
--- a/src/loader/loader_dri3_helper.c
+++ b/src/loader/loader_dri3_helper.c
@@ -68,15 +68,10 @@ dri3_fence_await(xcb_connection_t *c, struct loader_dri3_buffer *buffer)
 static void
 dri3_update_num_back(struct loader_dri3_drawable *draw)
 {
-   draw->num_back = 1;
-   if (draw->flipping) {
-      if (!draw->is_pixmap &&
-          !(draw->present_capabilities & XCB_PRESENT_CAPABILITY_ASYNC))
-         draw->num_back++;
-      draw->num_back++;
-   }
-   if (draw->vtable->get_swap_interval(draw) == 0)
-      draw->num_back++;
+   if (draw->flipping)
+      draw->num_back = 3;
+   else
+      draw->num_back = 2;
 }
 
 void
@@ -785,6 +780,7 @@ loader_dri3_open(xcb_connection_t *conn,
    }
 
    fd = xcb_dri3_open_reply_fds(conn, reply)[0];
+   free(reply);
    fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
 
    return fd;
@@ -1115,6 +1111,7 @@ dri3_get_pixmap_buffer(__DRIdrawable *driDrawable, unsigned int format,
    xcb_sync_fence_t                     sync_fence;
    struct xshmfence                     *shm_fence;
    int                                  fence_fd;
+   __DRIscreen                          *cur_screen;
 
    if (buffer)
       return buffer;
@@ -1145,8 +1142,17 @@ dri3_get_pixmap_buffer(__DRIdrawable *driDrawable, unsigned int format,
    if (!bp_reply)
       goto no_image;
 
+   /* Get the currently-bound screen or revert to using the drawable's screen if
+    * no contexts are currently bound. The latter case is at least necessary for
+    * obs-studio, when using Window Capture (Xcomposite) as a Source.
+    */
+   cur_screen = draw->vtable->get_dri_screen(draw);
+   if (!cur_screen) {
+       cur_screen = draw->dri_screen;
+   }
+
    buffer->image = loader_dri3_create_image(draw->conn, bp_reply, format,
-                                            draw->dri_screen, draw->ext->image,
+                                            cur_screen, draw->ext->image,
                                             buffer);
    if (!buffer->image)
       goto no_image;
diff --git a/src/loader/loader_dri3_helper.h b/src/loader/loader_dri3_helper.h
index 5b8fd1d..658e190 100644
--- a/src/loader/loader_dri3_helper.h
+++ b/src/loader/loader_dri3_helper.h
@@ -103,6 +103,7 @@ struct loader_dri3_vtable {
    void (*set_drawable_size)(struct loader_dri3_drawable *, int, int);
    bool (*in_current_context)(struct loader_dri3_drawable *);
    __DRIcontext *(*get_dri_context)(struct loader_dri3_drawable *);
+   __DRIscreen *(*get_dri_screen)(struct loader_dri3_drawable *);
    void (*flush_drawable)(struct loader_dri3_drawable *, unsigned);
    void (*show_fps)(struct loader_dri3_drawable *, uint64_t);
 };
diff --git a/src/mapi/Makefile.am b/src/mapi/Makefile.am
index 68a28a2..b44341d 100644
--- a/src/mapi/Makefile.am
+++ b/src/mapi/Makefile.am
@@ -64,6 +64,9 @@ BUILT_SOURCES += shared-glapi/glapi_mapi_tmp.h
 
 lib_LTLIBRARIES += shared-glapi/libglapi.la
 shared_glapi_libglapi_la_SOURCES = $(MAPI_GLAPI_FILES) shared-glapi/glapi_mapi_tmp.h
+shared_glapi_libglapi_la_CFLAGS = \
+	$(AM_CFLAGS) \
+	$(VISIBILITY_CFLAGS)
 shared_glapi_libglapi_la_CPPFLAGS = \
 	$(AM_CPPFLAGS) \
 	-DMAPI_MODE_GLAPI \
diff --git a/src/mapi/entry_x86-64_tls.h b/src/mapi/entry_x86-64_tls.h
index 38faccc..8f3fa91 100644
--- a/src/mapi/entry_x86-64_tls.h
+++ b/src/mapi/entry_x86-64_tls.h
@@ -25,6 +25,11 @@
  *    Chia-I Wu <olv@lunarg.com>
  */
 
+#ifdef HAVE_FUNC_ATTRIBUTE_VISIBILITY
+#define HIDDEN __attribute__((visibility("hidden")))
+#else
+#define HIDDEN
+#endif
 
 __asm__(".text\n"
         ".balign 32\n"
@@ -54,8 +59,8 @@ entry_patch_public(void)
 {
 }
 
-static char
-x86_64_entry_start[];
+extern char
+x86_64_entry_start[] HIDDEN;
 
 mapi_func
 entry_get_public(int slot)
diff --git a/src/mapi/entry_x86_tls.h b/src/mapi/entry_x86_tls.h
index 46d2ece..545b5a3 100644
--- a/src/mapi/entry_x86_tls.h
+++ b/src/mapi/entry_x86_tls.h
@@ -27,6 +27,12 @@
 
 #include <string.h>
 
+#ifdef HAVE_FUNC_ATTRIBUTE_VISIBILITY
+#define HIDDEN __attribute__((visibility("hidden")))
+#else
+#define HIDDEN
+#endif
+
 __asm__(".text");
 
 __asm__("x86_current_tls:\n\t"
@@ -71,8 +77,8 @@ __asm__(".text");
 extern unsigned long
 x86_current_tls();
 
-static char x86_entry_start[];
-static char x86_entry_end[];
+extern char x86_entry_start[] HIDDEN;
+extern char x86_entry_end[] HIDDEN;
 
 void
 entry_patch_public(void)
diff --git a/src/mapi/entry_x86_tsd.h b/src/mapi/entry_x86_tsd.h
index ea7bacb..0c28c8f 100644
--- a/src/mapi/entry_x86_tsd.h
+++ b/src/mapi/entry_x86_tsd.h
@@ -25,6 +25,11 @@
  *    Chia-I Wu <olv@lunarg.com>
  */
 
+#ifdef HAVE_FUNC_ATTRIBUTE_VISIBILITY
+#define HIDDEN __attribute__((visibility("hidden")))
+#else
+#define HIDDEN
+#endif
 
 #define X86_ENTRY_SIZE 32
 
@@ -58,8 +63,8 @@ __asm__(".balign 32\n"
 #include <string.h>
 #include "u_execmem.h"
 
-static const char x86_entry_start[];
-static const char x86_entry_end[];
+extern const char x86_entry_start[] HIDDEN;
+extern const char x86_entry_end[] HIDDEN;
 
 void
 entry_patch_public(void)
diff --git a/src/mesa/Android.gen.mk b/src/mesa/Android.gen.mk
index e04482b..aaa2de9 100644
--- a/src/mesa/Android.gen.mk
+++ b/src/mesa/Android.gen.mk
@@ -70,7 +70,7 @@ define es-gen
 	$(hide) $(PRIVATE_SCRIPT) $(1) $(PRIVATE_XML) > $@
 endef
 
-$(intermediates)/main/git_sha1.h: $(wildcard $(MESA_TOP)/.git/HEAD)
+$(intermediates)/main/git_sha1.h: $(wildcard $(MESA_TOP)/.git/ORIG_HEAD)
 	@mkdir -p $(dir $@)
 	@echo "GIT-SHA1: $(PRIVATE_MODULE) <= git"
 	$(hide) touch $@
diff --git a/src/mesa/Makefile.am b/src/mesa/Makefile.am
index 037384a..9710c7f 100644
--- a/src/mesa/Makefile.am
+++ b/src/mesa/Makefile.am
@@ -33,11 +33,6 @@ if HAVE_OSMESA
 SUBDIRS += drivers/osmesa
 endif
 
-if HAVE_GLX
-gldir = $(includedir)/GL
-gl_HEADERS = $(top_srcdir)/include/GL/*.h
-endif
-
 include Makefile.sources
 
 EXTRA_DIST = \
@@ -161,11 +156,6 @@ libmesa_sse41_la_SOURCES = \
 
 libmesa_sse41_la_CFLAGS = $(AM_CFLAGS) $(SSE41_CFLAGS)
 
-if HAVE_GLX
-pkgconfigdir = $(libdir)/pkgconfig
-pkgconfig_DATA = gl.pc
-endif
-
 MKDIR_GEN = $(AM_V_at)$(MKDIR_P) $(@D)
 YACC_GEN = $(AM_V_GEN)$(YACC) $(YFLAGS)
 LEX_GEN = $(AM_V_GEN)$(LEX) $(LFLAGS)
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index 84db5a8..bd5b3d3 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -645,11 +645,11 @@ INCLUDE_DIRS = \
 	-I$(top_builddir)/src \
 	-I$(top_srcdir)/src \
 	-I$(top_builddir)/src/compiler/nir \
-	-I$(top_srcdir)/src/mesa \
 	-I$(top_builddir)/src/mesa \
-	-I$(top_srcdir)/src/mesa/main \
+	-I$(top_srcdir)/src/mesa \
 	-I$(top_builddir)/src/mesa/main \
-	-I$(top_srcdir)/src/mapi \
+	-I$(top_srcdir)/src/mesa/main \
 	-I$(top_builddir)/src/mapi \
+	-I$(top_srcdir)/src/mapi \
 	-I$(top_srcdir)/src/gallium/include \
 	-I$(top_srcdir)/src/gallium/auxiliary
diff --git a/src/mesa/SConscript b/src/mesa/SConscript
index 434800e..8f41174 100644
--- a/src/mesa/SConscript
+++ b/src/mesa/SConscript
@@ -15,13 +15,13 @@ env.MSVC2013Compat()
 env.Append(CPPPATH = [
     '../compiler/nir',  # for generated nir_opcodes.h, etc
     '#/src',
+    Dir('../mapi'), # src/mapi build path
     '#/src/mapi',
     '#/src/glsl',
+    Dir('.'), # src/mesa build path
     '#/src/mesa',
     '#/src/gallium/include',
     '#/src/gallium/auxiliary',
-    Dir('../mapi'), # src/mapi build path
-    Dir('.'), # src/mesa build path
 ])
 
 if env['platform'] == 'windows':
@@ -116,7 +116,7 @@ if env['platform'] not in ('cygwin', 'darwin', 'windows', 'haiku'):
         )
         # Add the dir containing the generated header (somewhere inside  the
         # build dir) to the include path
-        env.Append(CPPPATH = [matypes[0].dir])
+        env.Prepend(CPPPATH = [matypes[0].dir])
 
 
 def write_git_sha1_h_file(filename):
diff --git a/src/mesa/drivers/dri/i915/intel_context.c b/src/mesa/drivers/dri/i915/intel_context.c
index e5a3f00..5607d5b 100644
--- a/src/mesa/drivers/dri/i915/intel_context.c
+++ b/src/mesa/drivers/dri/i915/intel_context.c
@@ -858,6 +858,7 @@ intel_update_image_buffers(struct intel_context *intel, __DRIdrawable *drawable)
    struct __DRIimageList images;
    unsigned int format;
    uint32_t buffer_mask = 0;
+   int ret;
 
    front_rb = intel_get_renderbuffer(fb, BUFFER_FRONT_LEFT);
    back_rb = intel_get_renderbuffer(fb, BUFFER_BACK_LEFT);
@@ -877,12 +878,14 @@ intel_update_image_buffers(struct intel_context *intel, __DRIdrawable *drawable)
    if (back_rb)
       buffer_mask |= __DRI_IMAGE_BUFFER_BACK;
 
-   (*screen->image.loader->getBuffers) (drawable,
-                                        driGLFormatToImageFormat(format),
-                                        &drawable->dri2.stamp,
-                                        drawable->loaderPrivate,
-                                        buffer_mask,
-                                        &images);
+   ret = screen->image.loader->getBuffers(drawable,
+                                          driGLFormatToImageFormat(format),
+                                          &drawable->dri2.stamp,
+                                          drawable->loaderPrivate,
+                                          buffer_mask,
+                                          &images);
+   if (!ret)
+      return;
 
    if (images.image_mask & __DRI_IMAGE_BUFFER_FRONT) {
       drawable->w = images.front->width;
diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources
index f448551..194b412 100644
--- a/src/mesa/drivers/dri/i965/Makefile.sources
+++ b/src/mesa/drivers/dri/i965/Makefile.sources
@@ -48,6 +48,7 @@ i965_compiler_FILES = \
 	brw_nir_attribute_workarounds.c \
 	brw_nir_intrinsics.c \
 	brw_nir_opt_peephole_ffma.c \
+	brw_nir_tcs_workarounds.c \
 	brw_packed_float.c \
 	brw_predicated_break.cpp \
 	brw_reg.h \
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.c b/src/mesa/drivers/dri/i965/brw_blorp.c
index 9590968..6be82c5 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.c
+++ b/src/mesa/drivers/dri/i965/brw_blorp.c
@@ -167,7 +167,8 @@ nir_uniform_type_size(const struct glsl_type *type)
 }
 
 const unsigned *
-brw_blorp_compile_nir_shader(struct brw_context *brw, struct nir_shader *nir,
+brw_blorp_compile_nir_shader(struct brw_context *brw, void *mem_ctx,
+                             struct nir_shader *nir,
                              const struct brw_wm_prog_key *wm_key,
                              bool use_repclear,
                              struct brw_blorp_prog_data *prog_data,
@@ -175,13 +176,6 @@ brw_blorp_compile_nir_shader(struct brw_context *brw, struct nir_shader *nir,
 {
    const struct brw_compiler *compiler = brw->intelScreen->compiler;
 
-   void *mem_ctx = ralloc_context(NULL);
-
-   /* Calling brw_preprocess_nir and friends is destructive and, if cloning is
-    * enabled, may end up completely replacing the nir_shader.  Therefore, we
-    * own it and might as well put it in our context for easy cleanup.
-    */
-   ralloc_steal(mem_ctx, nir);
    nir->options =
       compiler->glsl_compiler_options[MESA_SHADER_FRAGMENT].NirOptions;
 
diff --git a/src/mesa/drivers/dri/i965/brw_blorp.h b/src/mesa/drivers/dri/i965/brw_blorp.h
index 7ec5875..133a8ac 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp.h
+++ b/src/mesa/drivers/dri/i965/brw_blorp.h
@@ -366,7 +366,8 @@ struct brw_blorp_blit_prog_key
 void brw_blorp_init_wm_prog_key(struct brw_wm_prog_key *wm_key);
 
 const unsigned *
-brw_blorp_compile_nir_shader(struct brw_context *brw, struct nir_shader *nir,
+brw_blorp_compile_nir_shader(struct brw_context *brw, void *mem_ctx,
+                             struct nir_shader *nir,
                              const struct brw_wm_prog_key *wm_key,
                              bool use_repclear,
                              struct brw_blorp_prog_data *prog_data,
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
index 782d285..db94f33 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_blit.cpp
@@ -1296,7 +1296,7 @@ blorp_nir_manual_blend_bilinear(nir_builder *b, nir_ssa_def *pos,
  * of samples).
  */
 static nir_shader *
-brw_blorp_build_nir_shader(struct brw_context *brw,
+brw_blorp_build_nir_shader(struct brw_context *brw, void *mem_ctx,
                            const brw_blorp_blit_prog_key *key)
 {
    nir_ssa_def *src_pos, *dst_pos, *color;
@@ -1342,7 +1342,7 @@ brw_blorp_build_nir_shader(struct brw_context *brw,
           (key->dst_samples == 0));
 
    nir_builder b;
-   nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
+   nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, NULL);
 
    struct brw_blorp_blit_vars v;
    brw_blorp_blit_vars_init(&b, &v, key);
@@ -1505,6 +1505,8 @@ brw_blorp_get_blit_kernel(struct brw_context *brw,
                         &params->wm_prog_kernel, &params->wm_prog_data))
       return;
 
+   void *mem_ctx = ralloc_context(NULL);
+
    const unsigned *program;
    unsigned program_size;
    struct brw_blorp_prog_data prog_data;
@@ -1512,7 +1514,7 @@ brw_blorp_get_blit_kernel(struct brw_context *brw,
    /* Try and compile with NIR first.  If that fails, fall back to the old
     * method of building shaders manually.
     */
-   nir_shader *nir = brw_blorp_build_nir_shader(brw, prog_key);
+   nir_shader *nir = brw_blorp_build_nir_shader(brw, mem_ctx, prog_key);
    struct brw_wm_prog_key wm_key;
    brw_blorp_init_wm_prog_key(&wm_key);
    wm_key.tex.compressed_multisample_layout_mask =
@@ -1520,7 +1522,7 @@ brw_blorp_get_blit_kernel(struct brw_context *brw,
    wm_key.tex.msaa_16 = prog_key->tex_samples == 16;
    wm_key.multisample_fbo = prog_key->rt_samples > 1;
 
-   program = brw_blorp_compile_nir_shader(brw, nir, &wm_key, false,
+   program = brw_blorp_compile_nir_shader(brw, mem_ctx, nir, &wm_key, false,
                                           &prog_data, &program_size);
 
    brw_upload_cache(&brw->cache, BRW_CACHE_BLORP_PROG,
@@ -1528,6 +1530,8 @@ brw_blorp_get_blit_kernel(struct brw_context *brw,
                     program, program_size,
                     &prog_data, sizeof(prog_data),
                     &params->wm_prog_kernel, &params->wm_prog_data);
+
+   ralloc_free(mem_ctx);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/brw_blorp_clear.cpp b/src/mesa/drivers/dri/i965/brw_blorp_clear.cpp
index 2515a04..6400218 100644
--- a/src/mesa/drivers/dri/i965/brw_blorp_clear.cpp
+++ b/src/mesa/drivers/dri/i965/brw_blorp_clear.cpp
@@ -64,7 +64,7 @@ brw_blorp_params_get_clear_kernel(struct brw_context *brw,
    void *mem_ctx = ralloc_context(NULL);
 
    nir_builder b;
-   nir_builder_init_simple_shader(&b, NULL, MESA_SHADER_FRAGMENT, NULL);
+   nir_builder_init_simple_shader(&b, mem_ctx, MESA_SHADER_FRAGMENT, NULL);
    b.shader->info.name = ralloc_strdup(b.shader, "BLORP-clear");
 
    nir_variable *u_color = nir_variable_create(b.shader, nir_var_uniform,
@@ -84,7 +84,8 @@ brw_blorp_params_get_clear_kernel(struct brw_context *brw,
    struct brw_blorp_prog_data prog_data;
    unsigned program_size;
    const unsigned *program =
-      brw_blorp_compile_nir_shader(brw, b.shader, &wm_key, use_replicated_data,
+      brw_blorp_compile_nir_shader(brw, mem_ctx,
+                                   b.shader, &wm_key, use_replicated_data,
                                    &prog_data, &program_size);
 
    brw_upload_cache(&brw->cache, BRW_CACHE_BLORP_PROG,
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 10e9f47..7d15c28 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -220,6 +220,8 @@ struct brw_tcs_prog_key
    /** A bitfield of per-vertex outputs written. */
    uint64_t outputs_written;
 
+   bool quads_workaround;
+
    struct brw_sampler_prog_key_data tex;
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 1cb99da..2af42e0 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -5681,7 +5681,7 @@ fs_visitor::setup_gs_payload()
     * have to multiply by VerticesIn to obtain the total storage requirement.
     */
    if (8 * vue_prog_data->urb_read_length * nir->info.gs.vertices_in >
-       max_push_components) {
+       max_push_components || gs_prog_data->invocations > 1) {
       gs_prog_data->base.include_vue_handles = true;
 
       /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 11c078a..91763d3 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -2322,23 +2322,23 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
          break;
 
       fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
-      fs_reg m0_2 = byte_offset(m0, 2 * sizeof(uint32_t));
+      fs_reg m0_2 = component(m0, 2);
 
-      const fs_builder fwa_bld = bld.exec_all();
+      const fs_builder chanbld = bld.exec_all().group(1, 0);
 
       /* Zero the message header */
-      fwa_bld.MOV(m0, brw_imm_ud(0u));
+      bld.exec_all().MOV(m0, brw_imm_ud(0u));
 
       /* Copy "Barrier ID" from r0.2, bits 16:13 */
-      fwa_bld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
+      chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
                   brw_imm_ud(INTEL_MASK(16, 13)));
 
       /* Shift it up to bits 27:24. */
-      fwa_bld.SHL(m0_2, m0_2, brw_imm_ud(11));
+      chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
 
       /* Set the Barrier Count and the enable bit */
-      fwa_bld.OR(m0_2, m0_2,
-                 brw_imm_ud(tcs_prog_data->instances << 8 | (1 << 15)));
+      chanbld.OR(m0_2, m0_2,
+                 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
 
       bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
       break;
@@ -4060,12 +4060,23 @@ fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
       dest = get_nir_dest(instr->dest);
 
    fs_reg surface = brw_imm_ud(GEN7_BTI_SLM);
-   fs_reg offset = get_nir_src(instr->src[0]);
+   fs_reg offset;
    fs_reg data1 = get_nir_src(instr->src[1]);
    fs_reg data2;
    if (op == BRW_AOP_CMPWR)
       data2 = get_nir_src(instr->src[2]);
 
+   /* Get the offset */
+   nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+   if (const_offset) {
+      offset = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
+   } else {
+      offset = vgrf(glsl_type::uint_type);
+      bld.ADD(offset,
+	      retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
+	      brw_imm_ud(instr->const_index[0]));
+   }
+
    /* Emit the actual atomic operation operation */
 
    fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset,
diff --git a/src/mesa/drivers/dri/i965/brw_nir.h b/src/mesa/drivers/dri/i965/brw_nir.h
index 74c354f..6185310 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.h
+++ b/src/mesa/drivers/dri/i965/brw_nir.h
@@ -117,6 +117,8 @@ bool brw_nir_apply_attribute_workarounds(nir_shader *nir,
 
 bool brw_nir_apply_trig_workarounds(nir_shader *nir);
 
+void brw_nir_apply_tcs_quads_workaround(nir_shader *nir);
+
 nir_shader *brw_nir_apply_sampler_key(nir_shader *nir,
                                       const struct brw_device_info *devinfo,
                                       const struct brw_sampler_prog_key_data *key,
diff --git a/src/mesa/drivers/dri/i965/brw_nir_tcs_workarounds.c b/src/mesa/drivers/dri/i965/brw_nir_tcs_workarounds.c
new file mode 100644
index 0000000..0626981
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_nir_tcs_workarounds.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "brw_nir.h"
+
+/**
+ * Implements the WaPreventHSTessLevelsInterference workaround (for Gen7-8).
+ *
+ * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU), Page 494 (below the
+ * definition of the patch header layouts):
+ *
+ *    "HW Bug: The Tessellation stage will incorrectly add domain points
+ *     along patch edges under the following conditions, which may result
+ *     in conformance failures and/or cracking artifacts:
+ *
+ *       * QUAD domain
+ *       * INTEGER partitioning
+ *       * All three TessFactors in a given U or V direction (e.g., V
+ *         direction: UEQ0, InsideV, UEQ1) are all exactly 1.0
+ *       * All three TessFactors in the other direction are > 1.0 and all
+ *         round up to the same integer value (e.g, U direction:
+ *         VEQ0 = 3.1, InsideU = 3.7, VEQ1 = 3.4)
+ *
+ *     The suggested workaround (to be implemented as part of the postamble
+ *     to the HS shader in the HS kernel) is:
+ *
+ *     if (
+ *        (TF[UEQ0] > 1.0) ||
+ *        (TF[VEQ0] > 1.0) ||
+ *        (TF[UEQ1] > 1.0) ||
+ *        (TF[VEQ1] > 1.0) ||
+ *        (TF[INSIDE_U] > 1.0) ||
+ *        (TF[INSIDE_V] > 1.0) )
+ *     {
+ *        TF[INSIDE_U] = (TF[INSIDE_U] == 1.0) ? 2.0 : TF[INSIDE_U];
+ *        TF[INSIDE_V] = (TF[INSIDE_V] == 1.0) ? 2.0 : TF[INSIDE_V];
+ *     }"
+ *
+ * There's a subtlety here.  Intel internal HSD-ES bug 1208668495 notes
+ * that the above workaround fails to fix certain GL/ES CTS tests which
+ * have inside tessellation factors of -1.0.  This can be explained by
+ * a quote from the ARB_tessellation_shader specification:
+ *
+ *    "If "equal_spacing" is used, the floating-point tessellation level is
+ *     first clamped to the range [1,<max>], where <max> is implementation-
+ *     dependent maximum tessellation level (MAX_TESS_GEN_LEVEL)."
+ *
+ * In other words, the actual inner tessellation factor used is
+ * clamp(TF[INSIDE_*], 1.0, 64.0).  So we want to compare the clamped
+ * value against 1.0.  To accomplish this, we change the comparison from
+ * (TF[INSIDE_*] == 1.0) to (TF[INSIDE_*] <= 1.0).
+ */
+
+static inline nir_ssa_def *
+load_output(nir_builder *b, int num_components, int offset)
+{
+   nir_intrinsic_instr *load =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_output);
+   nir_ssa_dest_init(&load->instr, &load->dest, num_components, 32, NULL);
+   load->num_components = num_components;
+   load->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
+   nir_intrinsic_set_base(load, offset);
+
+   nir_builder_instr_insert(b, &load->instr);
+
+   return &load->dest.ssa;
+}
+
+static inline void
+store_output(nir_builder *b, nir_ssa_def *value, int offset, unsigned comps)
+{
+   nir_intrinsic_instr *store =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+   store->num_components = comps;
+   nir_intrinsic_set_write_mask(store, (1u << comps) - 1);
+   store->src[0] = nir_src_for_ssa(value);
+   store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
+   nir_builder_instr_insert(b, &store->instr);
+}
+
+static void
+emit_quads_workaround(nir_builder *b, nir_block *block)
+{
+   /* We're going to insert a new if-statement in a predecessor of the end
+    * block.  This would normally create a new block (after the if) which
+    * would then become the predecessor of the end block, causing our set
+    * walking to get screwed up.  To avoid this, just emit a constant at
+    * the end of our current block, and insert the if before that.
+    */
+   b->cursor = nir_after_block_before_jump(block);
+   b->cursor = nir_before_instr(nir_imm_int(b, 0)->parent_instr);
+
+   nir_ssa_def *inner = load_output(b, 2, 0);
+   nir_ssa_def *outer = load_output(b, 4, 1);
+
+   nir_ssa_def *any_greater_than_1 =
+       nir_ior(b, nir_bany(b, nir_flt(b, nir_imm_float(b, 1.0f), outer)),
+                  nir_bany(b, nir_flt(b, nir_imm_float(b, 1.0f), inner)));
+
+   nir_if *if_stmt = nir_if_create(b->shader);
+   if_stmt->condition = nir_src_for_ssa(any_greater_than_1);
+   nir_builder_cf_insert(b, &if_stmt->cf_node);
+
+   /* Fill out the new then-block */
+   b->cursor = nir_after_cf_list(&if_stmt->then_list);
+
+   store_output(b, nir_bcsel(b, nir_fge(b, nir_imm_float(b, 1.0f), inner),
+                                nir_imm_float(b, 2.0f), inner), 0, 2);
+}
+
+void
+brw_nir_apply_tcs_quads_workaround(nir_shader *nir)
+{
+   assert(nir->stage == MESA_SHADER_TESS_CTRL);
+
+   nir_foreach_function(func, nir) {
+      if (!func->impl)
+         continue;
+
+      nir_builder b;
+      nir_builder_init(&b, func->impl);
+
+      struct set_entry *entry;
+      set_foreach(func->impl->end_block->predecessors, entry) {
+         nir_block *pred = (nir_block *) entry->key;
+         emit_quads_workaround(&b, pred);
+      }
+
+      nir_metadata_preserve(func->impl, 0);
+   }
+}
diff --git a/src/mesa/drivers/dri/i965/brw_performance_monitor.c b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
index a91c6e2..a42a322 100644
--- a/src/mesa/drivers/dri/i965/brw_performance_monitor.c
+++ b/src/mesa/drivers/dri/i965/brw_performance_monitor.c
@@ -686,12 +686,12 @@ stop_oa_counters(struct brw_context *brw)
  * The amount of batch space it takes to emit an MI_REPORT_PERF_COUNT snapshot,
  * including the required PIPE_CONTROL flushes.
  *
- * Sandybridge is the worst case scenario: brw_emit_mi_flush
- * expands to three PIPE_CONTROLs which are 4 DWords each.  We have to flush
- * before and after MI_REPORT_PERF_COUNT, so multiply by two.  Finally, add
- * the 3 DWords for MI_REPORT_PERF_COUNT itself.
+ * Sandybridge is the worst case scenario: brw_emit_mi_flush expands to four
+ * PIPE_CONTROLs which are 5 DWords each.  We have to flush before and after
+ * MI_REPORT_PERF_COUNT, so multiply by two.  Finally, add the 3 DWords for
+ * MI_REPORT_PERF_COUNT itself.
  */
-#define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (3 * 4) + 3)
+#define MI_REPORT_PERF_COUNT_BATCH_DWORDS (2 * (4 * 5) + 3)
 
 /**
  * Emit an MI_REPORT_PERF_COUNT command packet.
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
index 4672efd..d51cf1b 100644
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -96,10 +96,38 @@ gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
 void
 brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
 {
+   if (brw->gen >= 6 &&
+       (flags & PIPE_CONTROL_CACHE_FLUSH_BITS) &&
+       (flags & PIPE_CONTROL_CACHE_INVALIDATE_BITS)) {
+      /* A pipe control command with flush and invalidate bits set
+       * simultaneously is an inherently racy operation on Gen6+ if the
+       * contents of the flushed caches were intended to become visible from
+       * any of the invalidated caches.  Split it in two PIPE_CONTROLs, the
+       * first one should stall the pipeline to make sure that the flushed R/W
+       * caches are coherent with memory once the specified R/O caches are
+       * invalidated.  On pre-Gen6 hardware the (implicit) R/O cache
+       * invalidation seems to happen at the bottom of the pipeline together
+       * with any write cache flush, so this shouldn't be a concern.
+       */
+      brw_emit_pipe_control_flush(brw, (flags & PIPE_CONTROL_CACHE_FLUSH_BITS) |
+                                       PIPE_CONTROL_CS_STALL);
+      flags &= ~(PIPE_CONTROL_CACHE_FLUSH_BITS | PIPE_CONTROL_CS_STALL);
+   }
+
    if (brw->gen >= 8) {
       if (brw->gen == 8)
          gen8_add_cs_stall_workaround_bits(&flags);
 
+      if (brw->gen == 9 &&
+          (flags & PIPE_CONTROL_VF_CACHE_INVALIDATE)) {
+         /* Hardware workaround: SKL
+          *
+          * Emit Pipe Control with all bits set to zero before emitting
+          * a Pipe Control with VF Cache Invalidate set.
+          */
+         brw_emit_pipe_control_flush(brw, 0);
+      }
+
       BEGIN_BATCH(6);
       OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
       OUT_BATCH(flags);
@@ -311,15 +339,6 @@ brw_emit_mi_flush(struct brw_context *brw)
    } else {
       int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
       if (brw->gen >= 6) {
-         if (brw->gen == 9) {
-            /* Hardware workaround: SKL
-             *
-             * Emit Pipe Control with all bits set to zero before emitting
-             * a Pipe Control with VF Cache Invalidate set.
-             */
-            brw_emit_pipe_control_flush(brw, 0);
-         }
-
          flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                   PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                   PIPE_CONTROL_VF_CACHE_INVALIDATE |
diff --git a/src/mesa/drivers/dri/i965/brw_tcs.c b/src/mesa/drivers/dri/i965/brw_tcs.c
index 8a5dd7e..6b7fde2 100644
--- a/src/mesa/drivers/dri/i965/brw_tcs.c
+++ b/src/mesa/drivers/dri/i965/brw_tcs.c
@@ -153,6 +153,8 @@ brw_tcs_debug_recompile(struct brw_context *brw,
                       key->patch_outputs_written);
    found |= key_debug(brw, "TES primitive mode", old_key->tes_primitive_mode,
                       key->tes_primitive_mode);
+   found |= key_debug(brw, "quads and equal_spacing workaround",
+                      old_key->quads_workaround, key->quads_workaround);
    found |= brw_debug_recompile_sampler_key(brw, &old_key->tex, &key->tex);
 
    if (!found) {
@@ -346,6 +348,9 @@ brw_upload_tcs_prog(struct brw_context *brw,
     * based on the domain the DS is expecting to tessellate.
     */
    key.tes_primitive_mode = tep->program.PrimitiveMode;
+   key.quads_workaround = brw->gen < 9 &&
+                          tep->program.PrimitiveMode == GL_QUADS &&
+                          tep->program.Spacing == GL_EQUAL;
 
    if (tcp) {
       key.program_string_id = tcp->id;
@@ -383,6 +388,8 @@ brw_tcs_precompile(struct gl_context *ctx,
 
    struct gl_tess_ctrl_program *tcp = (struct gl_tess_ctrl_program *)prog;
    struct brw_tess_ctrl_program *btcp = brw_tess_ctrl_program(tcp);
+   const struct gl_shader *tes =
+      shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL];
 
    memset(&key, 0, sizeof(key));
 
@@ -393,9 +400,14 @@ brw_tcs_precompile(struct gl_context *ctx,
    if (brw->gen < 8)
       key.input_vertices = shader_prog->TessCtrl.VerticesOut;
 
-   key.tes_primitive_mode =
-      shader_prog->_LinkedShaders[MESA_SHADER_TESS_EVAL] ?
-      shader_prog->TessEval.PrimitiveMode : GL_TRIANGLES;
+   if (tes) {
+      key.tes_primitive_mode = shader_prog->TessEval.PrimitiveMode;
+      key.quads_workaround = brw->gen < 9 &&
+                             shader_prog->TessEval.PrimitiveMode == GL_QUADS &&
+                             shader_prog->TessEval.Spacing == GL_EQUAL;
+   } else {
+      key.tes_primitive_mode = GL_TRIANGLES;
+   }
 
    key.outputs_written = prog->OutputsWritten;
    key.patch_outputs_written = prog->PatchOutputsWritten;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index 162b481..a7398a7 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -354,95 +354,97 @@ vec4_visitor::opt_vector_float()
 {
    bool progress = false;
 
-   int last_reg = -1, last_reg_offset = -1;
-   enum brw_reg_file last_reg_file = BAD_FILE;
+   foreach_block(block, cfg) {
+      int last_reg = -1, last_reg_offset = -1;
+      enum brw_reg_file last_reg_file = BAD_FILE;
+
+      uint8_t imm[4] = { 0 };
+      int inst_count = 0;
+      vec4_instruction *imm_inst[4];
+      unsigned writemask = 0;
+      enum brw_reg_type dest_type = BRW_REGISTER_TYPE_F;
+
+      foreach_inst_in_block_safe(vec4_instruction, inst, block) {
+         int vf = -1;
+         enum brw_reg_type need_type;
+
+         /* Look for unconditional MOVs from an immediate with a partial
+          * writemask.  Skip type-conversion MOVs other than integer 0,
+          * where the type doesn't matter.  See if the immediate can be
+          * represented as a VF.
+          */
+         if (inst->opcode == BRW_OPCODE_MOV &&
+             inst->src[0].file == IMM &&
+             inst->predicate == BRW_PREDICATE_NONE &&
+             inst->dst.writemask != WRITEMASK_XYZW &&
+             (inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) {
+
+            vf = brw_float_to_vf(inst->src[0].d);
+            need_type = BRW_REGISTER_TYPE_D;
+
+            if (vf == -1) {
+               vf = brw_float_to_vf(inst->src[0].f);
+               need_type = BRW_REGISTER_TYPE_F;
+            }
+         } else {
+            last_reg = -1;
+         }
 
-   uint8_t imm[4] = { 0 };
-   int inst_count = 0;
-   vec4_instruction *imm_inst[4];
-   unsigned writemask = 0;
-   enum brw_reg_type dest_type = BRW_REGISTER_TYPE_F;
+         /* If this wasn't a MOV, or the destination register doesn't match,
+          * or we have to switch destination types, then this breaks our
+          * sequence.  Combine anything we've accumulated so far.
+          */
+         if (last_reg != inst->dst.nr ||
+             last_reg_offset != inst->dst.reg_offset ||
+             last_reg_file != inst->dst.file ||
+             (vf > 0 && dest_type != need_type)) {
+
+            if (inst_count > 1) {
+               unsigned vf;
+               memcpy(&vf, imm, sizeof(vf));
+               vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf));
+               mov->dst.type = dest_type;
+               mov->dst.writemask = writemask;
+               inst->insert_before(block, mov);
+
+               for (int i = 0; i < inst_count; i++) {
+                  imm_inst[i]->remove(block);
+               }
 
-   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
-      int vf = -1;
-      enum brw_reg_type need_type;
+               progress = true;
+            }
 
-      /* Look for unconditional MOVs from an immediate with a partial
-       * writemask.  Skip type-conversion MOVs other than integer 0,
-       * where the type doesn't matter.  See if the immediate can be
-       * represented as a VF.
-       */
-      if (inst->opcode == BRW_OPCODE_MOV &&
-          inst->src[0].file == IMM &&
-          inst->predicate == BRW_PREDICATE_NONE &&
-          inst->dst.writemask != WRITEMASK_XYZW &&
-          (inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) {
-
-         vf = brw_float_to_vf(inst->src[0].d);
-         need_type = BRW_REGISTER_TYPE_D;
-
-         if (vf == -1) {
-            vf = brw_float_to_vf(inst->src[0].f);
-            need_type = BRW_REGISTER_TYPE_F;
-         }
-      } else {
-         last_reg = -1;
-      }
+            inst_count = 0;
+            last_reg = -1;
+            writemask = 0;
+            dest_type = BRW_REGISTER_TYPE_F;
 
-      /* If this wasn't a MOV, or the destination register doesn't match,
-       * or we have to switch destination types, then this breaks our
-       * sequence.  Combine anything we've accumulated so far.
-       */
-      if (last_reg != inst->dst.nr ||
-          last_reg_offset != inst->dst.reg_offset ||
-          last_reg_file != inst->dst.file ||
-          (vf > 0 && dest_type != need_type)) {
-
-         if (inst_count > 1) {
-            unsigned vf;
-            memcpy(&vf, imm, sizeof(vf));
-            vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf));
-            mov->dst.type = dest_type;
-            mov->dst.writemask = writemask;
-            inst->insert_before(block, mov);
-
-            for (int i = 0; i < inst_count; i++) {
-               imm_inst[i]->remove(block);
+            for (int i = 0; i < 4; i++) {
+               imm[i] = 0;
             }
-
-            progress = true;
          }
 
-         inst_count = 0;
-         last_reg = -1;
-         writemask = 0;
-         dest_type = BRW_REGISTER_TYPE_F;
-
-         for (int i = 0; i < 4; i++) {
-            imm[i] = 0;
+         /* Record this instruction's value (if it was representable). */
+         if (vf != -1) {
+            if ((inst->dst.writemask & WRITEMASK_X) != 0)
+               imm[0] = vf;
+            if ((inst->dst.writemask & WRITEMASK_Y) != 0)
+               imm[1] = vf;
+            if ((inst->dst.writemask & WRITEMASK_Z) != 0)
+               imm[2] = vf;
+            if ((inst->dst.writemask & WRITEMASK_W) != 0)
+               imm[3] = vf;
+
+            writemask |= inst->dst.writemask;
+            imm_inst[inst_count++] = inst;
+
+            last_reg = inst->dst.nr;
+            last_reg_offset = inst->dst.reg_offset;
+            last_reg_file = inst->dst.file;
+            if (vf > 0)
+               dest_type = need_type;
          }
       }
-
-      /* Record this instruction's value (if it was representable). */
-      if (vf != -1) {
-         if ((inst->dst.writemask & WRITEMASK_X) != 0)
-            imm[0] = vf;
-         if ((inst->dst.writemask & WRITEMASK_Y) != 0)
-            imm[1] = vf;
-         if ((inst->dst.writemask & WRITEMASK_Z) != 0)
-            imm[2] = vf;
-         if ((inst->dst.writemask & WRITEMASK_W) != 0)
-            imm[3] = vf;
-
-         writemask |= inst->dst.writemask;
-         imm_inst[inst_count++] = inst;
-
-         last_reg = inst->dst.nr;
-         last_reg_offset = inst->dst.reg_offset;
-         last_reg_file = inst->dst.file;
-         if (vf > 0)
-            dest_type = need_type;
-      }
    }
 
    if (progress)
@@ -1109,7 +1111,7 @@ vec4_visitor::opt_register_coalesce()
       /* Can't coalesce this GRF if someone else was going to
        * read it later.
        */
-      if (var_range_end(var_from_reg(alloc, inst->src[0]), 4) > ip)
+      if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 4) > ip)
 	 continue;
 
       /* We need to check interference with the final destination between this
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
index 0c1f0c3..10898a5 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_cse.cpp
@@ -246,7 +246,7 @@ vec4_visitor::opt_cse_local(bblock_t *block)
              * more -- a sure sign they'll fail operands_match().
              */
             if (src->file == VGRF) {
-               if (var_range_end(var_from_reg(alloc, *src), 4) < ip) {
+               if (var_range_end(var_from_reg(alloc, dst_reg(*src)), 4) < ip) {
                   entry->remove();
                   ralloc_free(entry);
                   break;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
index 927438f..26a910c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_gs_visitor.cpp
@@ -59,7 +59,10 @@ vec4_gs_visitor::make_reg_for_system_value(int location)
    switch (location) {
    case SYSTEM_VALUE_INVOCATION_ID:
       this->current_annotation = "initialize gl_InvocationID";
-      emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
+      if (gs_prog_data->invocations > 1)
+         emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
+      else
+         emit(MOV(*reg, brw_imm_ud(0)));
       break;
    default:
       unreachable("not reached");
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index f61c612..5440dba 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -451,6 +451,9 @@ brw_compile_tcs(const struct brw_compiler *compiler,
    nir = brw_nir_apply_sampler_key(nir, devinfo, &key->tex, is_scalar);
    brw_nir_lower_vue_inputs(nir, is_scalar, &input_vue_map);
    brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map);
+   if (key->quads_workaround)
+      brw_nir_apply_tcs_quads_workaround(nir);
+
    nir = brw_postprocess_nir(nir, compiler->devinfo, is_scalar);
 
    if (is_scalar)
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 609285e..61ada53 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -1443,10 +1443,12 @@ brw_upload_cs_work_groups_surface(struct brw_context *brw)
    /* _NEW_PROGRAM */
    struct gl_shader_program *prog =
       ctx->_Shader->CurrentProgram[MESA_SHADER_COMPUTE];
+   /* BRW_NEW_CS_PROG_DATA */
+   const struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data;
 
-   if (prog && brw->cs.prog_data->uses_num_work_groups) {
+   if (prog && cs_prog_data->uses_num_work_groups) {
       const unsigned surf_idx =
-         brw->cs.prog_data->binding_table.work_groups_start;
+         cs_prog_data->binding_table.work_groups_start;
       uint32_t *surf_offset = &brw->cs.base.surf_offset[surf_idx];
       drm_intel_bo *bo;
       uint32_t bo_offset;
@@ -1475,6 +1477,7 @@ brw_upload_cs_work_groups_surface(struct brw_context *brw)
 const struct brw_tracked_state brw_cs_work_groups_surface = {
    .dirty = {
       .brw = BRW_NEW_BLORP |
+             BRW_NEW_CS_PROG_DATA |
              BRW_NEW_CS_WORK_GROUPS
    },
    .emit = brw_upload_cs_work_groups_surface,
diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index 26de633..64ccdb6 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -50,6 +50,7 @@ upload_clip_state(struct brw_context *brw)
       dw2 |= GEN6_CLIP_NON_PERSPECTIVE_BARYCENTRIC_ENABLE;
    }
 
+   /* BRW_NEW_VS_PROG_DATA */
    dw1 |= brw->vs.prog_data->base.cull_distance_mask;
 
    if (brw->gen >= 7)
@@ -224,6 +225,7 @@ const struct brw_tracked_state gen7_clip_state = {
                BRW_NEW_CONTEXT |
                BRW_NEW_FS_PROG_DATA |
                BRW_NEW_GEOMETRY_PROGRAM |
+               BRW_NEW_VS_PROG_DATA |
                BRW_NEW_META_IN_PROGRESS |
                BRW_NEW_PRIMITIVE |
                BRW_NEW_RASTERIZER_DISCARD |
diff --git a/src/mesa/drivers/dri/i965/gen7_cs_state.c b/src/mesa/drivers/dri/i965/gen7_cs_state.c
index 5427fa5..b245226 100644
--- a/src/mesa/drivers/dri/i965/gen7_cs_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_cs_state.c
@@ -283,7 +283,7 @@ gen7_upload_cs_push_constants(struct brw_context *brw)
       (struct brw_compute_program *) brw->compute_program;
 
    if (cp) {
-      /* CACHE_NEW_CS_PROG */
+      /* BRW_NEW_CS_PROG_DATA */
       struct brw_cs_prog_data *cs_prog_data = brw->cs.prog_data;
 
       brw_upload_cs_push_constants(brw, &cp->program.Base, cs_prog_data,
@@ -297,6 +297,7 @@ const struct brw_tracked_state gen7_cs_push_constants = {
       .brw = BRW_NEW_BATCH |
              BRW_NEW_BLORP |
              BRW_NEW_COMPUTE_PROGRAM |
+             BRW_NEW_CS_PROG_DATA |
              BRW_NEW_PUSH_CONSTANT_ALLOCATION,
    },
    .emit = gen7_upload_cs_push_constants,
diff --git a/src/mesa/drivers/dri/i965/gen8_ds_state.c b/src/mesa/drivers/dri/i965/gen8_ds_state.c
index 6f01abb..3b79b55 100644
--- a/src/mesa/drivers/dri/i965/gen8_ds_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ds_state.c
@@ -69,6 +69,7 @@ gen8_upload_ds_state(struct brw_context *brw)
                  GEN7_DS_SIMD8_DISPATCH_ENABLE : 0) |
                 (tes_prog_data->domain == BRW_TESS_DOMAIN_TRI ?
                  GEN7_DS_COMPUTE_W_COORDINATE_ENABLE : 0));
+      /* _NEW_TRANSFORM */
       OUT_BATCH(SET_FIELD(ctx->Transform.ClipPlanesEnabled,
                           GEN8_DS_USER_CLIP_DISTANCE) |
                 SET_FIELD(vue_prog_data->cull_distance_mask,
@@ -106,7 +107,7 @@ gen8_upload_ds_state(struct brw_context *brw)
 
 const struct brw_tracked_state gen8_ds_state = {
    .dirty = {
-      .mesa  = 0,
+      .mesa  = _NEW_TRANSFORM,
       .brw   = BRW_NEW_BATCH |
                BRW_NEW_BLORP |
                BRW_NEW_TESS_PROGRAMS |
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index 8a904fe..f916d99 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -124,6 +124,7 @@ const struct brw_tracked_state gen8_ps_extra = {
       .mesa  = _NEW_BUFFERS | _NEW_COLOR,
       .brw   = BRW_NEW_BLORP |
                BRW_NEW_CONTEXT |
+               BRW_NEW_FRAGMENT_PROGRAM |
                BRW_NEW_FS_PROG_DATA,
    },
    .emit = upload_ps_extra,
@@ -283,7 +284,6 @@ const struct brw_tracked_state gen8_ps_state = {
       .mesa  = _NEW_MULTISAMPLE,
       .brw   = BRW_NEW_BATCH |
                BRW_NEW_BLORP |
-               BRW_NEW_FRAGMENT_PROGRAM |
                BRW_NEW_FS_PROG_DATA,
    },
    .emit = upload_ps_state,
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index aa1dc38..67e8e8f 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -21,13 +21,13 @@ extern "C" {
  *   - Gen4-5 record ending occlusion query values (4 * 4 = 16 bytes)
  *   - Disabling OA counters on Gen6+ (3 DWords = 12 bytes)
  *   - Ending MI_REPORT_PERF_COUNT on Gen5+, plus associated PIPE_CONTROLs:
- *     - Two sets of PIPE_CONTROLs, which become 3 PIPE_CONTROLs each on SNB,
- *       which are 5 DWords each ==> 2 * 3 * 5 * 4 = 120 bytes
+ *     - Two sets of PIPE_CONTROLs, which become 4 PIPE_CONTROLs each on SNB,
+ *       which are 5 DWords each ==> 2 * 4 * 5 * 4 = 160 bytes
  *     - 3 DWords for MI_REPORT_PERF_COUNT itself on Gen6+.  ==> 12 bytes.
  *       On Ironlake, it's 6 DWords, but we have some slack due to the lack of
  *       Sandybridge PIPE_CONTROL madness.
- *   - CC_STATE workaround on HSW (12 * 4 = 48 bytes)
- *     - 5 dwords for initial mi_flush
+ *   - CC_STATE workaround on HSW (17 * 4 = 68 bytes)
+ *     - 10 dwords for initial mi_flush
  *     - 2 dwords for CC state setup
  *     - 5 dwords for the required pipe control at the end
  *   - Restoring L3 configuration: (24 dwords = 96 bytes)
@@ -35,7 +35,7 @@ extern "C" {
  *     - 7 dwords for L3 configuration set-up.
  *     - 5 dwords for L3 atomic set-up (on HSW).
  */
-#define BATCH_RESERVED 248
+#define BATCH_RESERVED 308
 
 struct intel_batchbuffer;
 
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index 939f9a0..8a0d2ad 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -374,6 +374,19 @@ intel_image_target_renderbuffer_storage(struct gl_context *ctx,
    if (!irb->mt)
       return;
 
+   /* Adjust the miptree's upper-left coordinate.
+    *
+    * FIXME: Adjusting the miptree's layout outside of
+    * intel_miptree_create_layout() is fragile. Plumb the adjustment through
+    * intel_miptree_create_layout() and brw_tex_layout().
+    */
+   irb->mt->level[0].level_x = image->tile_x;
+   irb->mt->level[0].level_y = image->tile_y;
+   irb->mt->level[0].slice[0].x_offset = image->tile_x;
+   irb->mt->level[0].slice[0].y_offset = image->tile_y;
+   irb->mt->total_width += image->tile_x;
+   irb->mt->total_height += image->tile_y;
+
    rb->InternalFormat = image->internal_format;
    rb->Width = image->width;
    rb->Height = image->height;
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index b6265dc..e74a2dc 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -366,25 +366,8 @@ intel_miptree_create_layout(struct brw_context *brw,
        _mesa_get_format_name(format),
        first_level, last_level, depth0, mt);
 
-   if (target == GL_TEXTURE_1D_ARRAY) {
-      /* For a 1D Array texture the OpenGL API will treat the height0
-       * parameter as the number of array slices. For Intel hardware, we treat
-       * the 1D array as a 2D Array with a height of 1.
-       *
-       * So, when we first come through this path to create a 1D Array
-       * texture, height0 stores the number of slices, and depth0 is 1. In
-       * this case, we want to swap height0 and depth0.
-       *
-       * Since some miptrees will be created based on the base miptree, we may
-       * come through this path and see height0 as 1 and depth0 being the
-       * number of slices. In this case we don't need to do the swap.
-       */
-      assert(height0 == 1 || depth0 == 1);
-      if (height0 > 1) {
-         depth0 = height0;
-         height0 = 1;
-      }
-   }
+   if (target == GL_TEXTURE_1D_ARRAY)
+      assert(height0 == 1);
 
    mt->target = target;
    mt->format = format;
@@ -1050,6 +1033,7 @@ intel_get_image_dims(struct gl_texture_image *image,
        * as a 2D Array with a height of 1. So, here we want to swap image
        * height and depth.
        */
+      assert(image->Depth == 1);
       *width = image->Width;
       *height = 1;
       *depth = image->Height;
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
index a486d6e..cacd7e2 100644
--- a/src/mesa/drivers/dri/i965/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -110,22 +110,6 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
    if (ctx->_ImageTransferState)
       return false;
 
-   /* This renderbuffer can come from a texture.  In this case, we impose
-    * some of the same restrictions we have for textures and adjust for
-    * miplevels.
-    */
-   if (rb->TexImage) {
-      if (rb->TexImage->TexObject->Target != GL_TEXTURE_2D &&
-          rb->TexImage->TexObject->Target != GL_TEXTURE_RECTANGLE)
-         return false;
-
-      int level = rb->TexImage->Level + rb->TexImage->TexObject->MinLevel;
-
-      /* Adjust x and y offset based on miplevel */
-      xoffset += irb->mt->level[level].level_x;
-      yoffset += irb->mt->level[level].level_y;
-   }
-
    /* It is possible that the renderbuffer (or underlying texture) is
     * multisampled.  Since ReadPixels from a multisampled buffer requires a
     * multisample resolve, we can't handle this here
@@ -169,6 +153,9 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
       return false;
    }
 
+   xoffset += irb->mt->level[irb->mt_level].slice[irb->mt_layer].x_offset;
+   yoffset += irb->mt->level[irb->mt_level].slice[irb->mt_layer].y_offset;
+
    dst_pitch = _mesa_image_row_stride(pack, width, format, type);
 
    /* For a window-system renderbuffer, the buffer is actually flipped
@@ -201,7 +188,7 @@ intel_readpixels_tiled_memcpy(struct gl_context * ctx,
       xoffset * cpp, (xoffset + width) * cpp,
       yoffset, yoffset + height,
       pixels - (ptrdiff_t) yoffset * dst_pitch - (ptrdiff_t) xoffset * cpp,
-      bo->virtual,
+      bo->virtual + irb->mt->offset,
       dst_pitch, irb->mt->pitch,
       brw->has_swizzling,
       irb->mt->tiling,
diff --git a/src/mesa/drivers/dri/i965/intel_reg.h b/src/mesa/drivers/dri/i965/intel_reg.h
index 95365fe..7a82be4 100644
--- a/src/mesa/drivers/dri/i965/intel_reg.h
+++ b/src/mesa/drivers/dri/i965/intel_reg.h
@@ -134,6 +134,15 @@
 #define PIPE_CONTROL_PPGTT_WRITE	(0 << 2)
 #define PIPE_CONTROL_GLOBAL_GTT_WRITE	(1 << 2)
 
+#define PIPE_CONTROL_CACHE_FLUSH_BITS \
+   (PIPE_CONTROL_DEPTH_CACHE_FLUSH | PIPE_CONTROL_DATA_CACHE_FLUSH | \
+    PIPE_CONTROL_RENDER_TARGET_FLUSH)
+
+#define PIPE_CONTROL_CACHE_INVALIDATE_BITS \
+   (PIPE_CONTROL_STATE_CACHE_INVALIDATE | PIPE_CONTROL_CONST_CACHE_INVALIDATE | \
+    PIPE_CONTROL_VF_CACHE_INVALIDATE | PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE | \
+    PIPE_CONTROL_INSTRUCTION_INVALIDATE)
+
 /** @} */
 
 #define XY_SETUP_BLT_CMD		(CMD_2D | (0x01 << 22))
diff --git a/src/mesa/drivers/dri/i965/intel_syncobj.c b/src/mesa/drivers/dri/i965/intel_syncobj.c
index 3e359a5..39c9636 100644
--- a/src/mesa/drivers/dri/i965/intel_syncobj.c
+++ b/src/mesa/drivers/dri/i965/intel_syncobj.c
@@ -49,6 +49,7 @@ struct brw_fence {
    /** The fence waits for completion of this batch. */
    drm_intel_bo *batch_bo;
 
+   mtx_t mutex;
    bool signalled;
 };
 
@@ -58,10 +59,20 @@ struct intel_gl_sync_object {
 };
 
 static void
+brw_fence_init(struct brw_context *brw, struct brw_fence *fence)
+{
+   fence->brw = brw;
+   fence->batch_bo = NULL;
+   mtx_init(&fence->mutex, mtx_plain);
+}
+
+static void
 brw_fence_finish(struct brw_fence *fence)
 {
    if (fence->batch_bo)
       drm_intel_bo_unreference(fence->batch_bo);
+
+   mtx_destroy(&fence->mutex);
 }
 
 static void
@@ -77,7 +88,7 @@ brw_fence_insert(struct brw_context *brw, struct brw_fence *fence)
 }
 
 static bool
-brw_fence_has_completed(struct brw_fence *fence)
+brw_fence_has_completed_locked(struct brw_fence *fence)
 {
    if (fence->signalled)
       return true;
@@ -92,13 +103,21 @@ brw_fence_has_completed(struct brw_fence *fence)
    return false;
 }
 
-/**
- * Return true if the function successfully signals or has already signalled.
- * (This matches the behavior expected from __DRI2fence::client_wait_sync).
- */
 static bool
-brw_fence_client_wait(struct brw_context *brw, struct brw_fence *fence,
-                      uint64_t timeout)
+brw_fence_has_completed(struct brw_fence *fence)
+{
+   bool ret;
+
+   mtx_lock(&fence->mutex);
+   ret = brw_fence_has_completed_locked(fence);
+   mtx_unlock(&fence->mutex);
+
+   return ret;
+}
+
+static bool
+brw_fence_client_wait_locked(struct brw_context *brw, struct brw_fence *fence,
+                             uint64_t timeout)
 {
    if (fence->signalled)
       return true;
@@ -123,6 +142,23 @@ brw_fence_client_wait(struct brw_context *brw, struct brw_fence *fence,
    return true;
 }
 
+/**
+ * Return true if the function successfully signals or has already signalled.
+ * (This matches the behavior expected from __DRI2fence::client_wait_sync).
+ */
+static bool
+brw_fence_client_wait(struct brw_context *brw, struct brw_fence *fence,
+                      uint64_t timeout)
+{
+   bool ret;
+
+   mtx_lock(&fence->mutex);
+   ret = brw_fence_client_wait_locked(brw, fence, timeout);
+   mtx_unlock(&fence->mutex);
+
+   return ret;
+}
+
 static void
 brw_fence_server_wait(struct brw_context *brw, struct brw_fence *fence)
 {
@@ -161,6 +197,7 @@ intel_gl_fence_sync(struct gl_context *ctx, struct gl_sync_object *s,
    struct brw_context *brw = brw_context(ctx);
    struct intel_gl_sync_object *sync = (struct intel_gl_sync_object *)s;
 
+   brw_fence_init(brw, &sync->fence);
    brw_fence_insert(brw, &sync->fence);
 }
 
@@ -215,7 +252,7 @@ intel_dri_create_fence(__DRIcontext *ctx)
    if (!fence)
       return NULL;
 
-   fence->brw = brw;
+   brw_fence_init(brw, fence);
    brw_fence_insert(brw, fence);
 
    return fence;
@@ -244,6 +281,12 @@ intel_dri_server_wait_sync(__DRIcontext *ctx, void *driver_fence, unsigned flags
 {
    struct brw_fence *fence = driver_fence;
 
+   /* We might be called here with a NULL fence as a result of WaitSyncKHR
+    * on a EGL_KHR_reusable_sync fence. Nothing to do here in such case.
+    */
+   if (!fence)
+      return;
+
    brw_fence_server_wait(fence->brw, fence);
 }
 
diff --git a/src/mesa/drivers/dri/i965/intel_tex.c b/src/mesa/drivers/dri/i965/intel_tex.c
index cac33ac..a1364b9 100644
--- a/src/mesa/drivers/dri/i965/intel_tex.c
+++ b/src/mesa/drivers/dri/i965/intel_tex.c
@@ -140,6 +140,8 @@ intel_alloc_texture_storage(struct gl_context *ctx,
        !intel_miptree_match_image(intel_texobj->mt, first_image) ||
        intel_texobj->mt->last_level != levels - 1) {
       intel_miptree_release(&intel_texobj->mt);
+
+      intel_get_image_dims(first_image, &width, &height, &depth);
       intel_texobj->mt = intel_miptree_create(brw, texobj->Target,
                                               first_image->TexFormat,
                                               0, levels - 1,
diff --git a/src/mesa/drivers/dri/nouveau/nv20_state_frag.c b/src/mesa/drivers/dri/nouveau/nv20_state_frag.c
index 492ecdc..2c5c2db 100644
--- a/src/mesa/drivers/dri/nouveau/nv20_state_frag.c
+++ b/src/mesa/drivers/dri/nouveau/nv20_state_frag.c
@@ -67,5 +67,5 @@ nv20_emit_frag(struct gl_context *ctx, int emit)
 	PUSH_DATA (push, in >> 32);
 
 	BEGIN_NV04(push, NV20_3D(RC_ENABLE), 1);
-	PUSH_DATA (push, n);
+	PUSH_DATA (push, MAX2(1, n));
 }
diff --git a/src/mesa/drivers/dri/swrast/swrast.c b/src/mesa/drivers/dri/swrast/swrast.c
index 2d4bb70..6e006f8 100644
--- a/src/mesa/drivers/dri/swrast/swrast.c
+++ b/src/mesa/drivers/dri/swrast/swrast.c
@@ -484,14 +484,14 @@ swrast_map_renderbuffer(struct gl_context *ctx,
 
       xrb->map_mode = mode;
       xrb->map_x = x;
-      xrb->map_y = y;
+      xrb->map_y = rb->Height - y - h;
       xrb->map_w = w;
       xrb->map_h = h;
 
       stride = w * cpp;
       xrb->Base.Buffer = malloc(h * stride);
 
-      sPriv->swrast_loader->getImage(dPriv, x, rb->Height - y - h, w, h,
+      sPriv->swrast_loader->getImage(dPriv, x, xrb->map_y, w, h,
 				     (char *) xrb->Base.Buffer,
 				     dPriv->loaderPrivate);
 
diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c
index a28c583..d8815af 100644
--- a/src/mesa/main/buffers.c
+++ b/src/mesa/main/buffers.c
@@ -378,17 +378,48 @@ draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
 
    /* complicated error checking... */
    for (output = 0; output < n; output++) {
-      /* Section 4.2 (Whole Framebuffer Operations) of the OpenGL 3.0
+      destMask[output] = draw_buffer_enum_to_bitmask(ctx, buffers[output]);
+
+      /* From the OpenGL 3.0 specification, page 258:
+       * "Each buffer listed in bufs must be one of the values from tables
+       *  4.5 or 4.6.  Otherwise, an INVALID_ENUM error is generated.
+       */
+      if (destMask[output] == BAD_MASK) {
+         _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)",
+                     caller, _mesa_enum_to_string(buffers[output]));
+         return;
+      }
+
+      /* From the OpenGL 4.0 specification, page 256:
+       * "For both the default framebuffer and framebuffer objects, the
+       *  constants FRONT, BACK, LEFT, RIGHT, and FRONT_AND_BACK are not
+       *  valid in the bufs array passed to DrawBuffers, and will result in
+       *  the error INVALID_ENUM. This restriction is because these
+       *  constants may themselves refer to multiple buffers, as shown in
+       *  table 4.4."
+       *  Previous versions of the OpenGL specification say INVALID_OPERATION,
+       *  but the Khronos conformance tests expect INVALID_ENUM.
+       */
+      if (_mesa_bitcount(destMask[output]) > 1) {
+         _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)",
+                     caller, _mesa_enum_to_string(buffers[output]));
+         return;
+      }
+
+      /* Section 4.2 (Whole Framebuffer Operations) of the OpenGL ES 3.0
        * specification says:
        *
-       *     "Each buffer listed in bufs must be BACK, NONE, or one of the values
-       *      from table 4.3 (NONE, COLOR_ATTACHMENTi)"
+       *     "If the GL is bound to a draw framebuffer object, the ith buffer
+       *     listed in bufs must be COLOR_ATTACHMENTi or NONE . Specifying a
+       *     buffer out of order, BACK , or COLOR_ATTACHMENTm where m is greater
+       *     than or equal to the value of MAX_- COLOR_ATTACHMENTS , will
+       *     generate the error INVALID_OPERATION .
        */
-      if (_mesa_is_gles3(ctx) && buffers[output] != GL_NONE &&
-          buffers[output] != GL_BACK &&
+      if (_mesa_is_gles3(ctx) && _mesa_is_user_fbo(fb) &&
+          buffers[output] != GL_NONE &&
           (buffers[output] < GL_COLOR_ATTACHMENT0 ||
            buffers[output] >= GL_COLOR_ATTACHMENT0 + ctx->Const.MaxColorAttachments)) {
-         _mesa_error(ctx, GL_INVALID_ENUM, "glDrawBuffers(buffer)");
+         _mesa_error(ctx, GL_INVALID_OPERATION, "glDrawBuffers(buffer)");
          return;
       }
 
@@ -412,34 +443,6 @@ draw_buffers(struct gl_context *ctx, struct gl_framebuffer *fb,
             return;
          }
 
-         destMask[output] = draw_buffer_enum_to_bitmask(ctx, buffers[output]);
-
-         /* From the OpenGL 3.0 specification, page 258:
-          * "Each buffer listed in bufs must be one of the values from tables
-          *  4.5 or 4.6.  Otherwise, an INVALID_ENUM error is generated.
-          */
-         if (destMask[output] == BAD_MASK) {
-            _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)",
-                        caller, _mesa_enum_to_string(buffers[output]));
-            return;
-         }
-
-         /* From the OpenGL 4.0 specification, page 256:
-          * "For both the default framebuffer and framebuffer objects, the
-          *  constants FRONT, BACK, LEFT, RIGHT, and FRONT_AND_BACK are not
-          *  valid in the bufs array passed to DrawBuffers, and will result in
-          *  the error INVALID_ENUM. This restriction is because these
-          *  constants may themselves refer to multiple buffers, as shown in
-          *  table 4.4."
-          *  Previous versions of the OpenGL specification say INVALID_OPERATION,
-          *  but the Khronos conformance tests expect INVALID_ENUM.
-          */
-         if (_mesa_bitcount(destMask[output]) > 1) {
-            _mesa_error(ctx, GL_INVALID_ENUM, "%s(invalid buffer %s)",
-                        caller, _mesa_enum_to_string(buffers[output]));
-            return;
-         }
-
          /* From the OpenGL 3.0 specification, page 259:
           * "If the GL is bound to the default framebuffer and DrawBuffers is
           *  supplied with a constant (other than NONE) that does not indicate
diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c
index bf47c1c..68da639 100644
--- a/src/mesa/main/fbobject.c
+++ b/src/mesa/main/fbobject.c
@@ -303,9 +303,21 @@ _mesa_get_fb0_attachment(struct gl_context *ctx, struct gl_framebuffer *fb,
 
    switch (attachment) {
    case GL_FRONT_LEFT:
-      return &fb->Attachment[BUFFER_FRONT_LEFT];
+      /* Front buffers can be allocated on the first use, but
+       * glGetFramebufferAttachmentParameteriv must work even if that
+       * allocation hasn't happened yet. In such case, use the back buffer,
+       * which should be the same.
+       */
+      if (fb->Attachment[BUFFER_FRONT_LEFT].Type == GL_NONE)
+         return &fb->Attachment[BUFFER_BACK_LEFT];
+      else
+         return &fb->Attachment[BUFFER_FRONT_LEFT];
    case GL_FRONT_RIGHT:
-      return &fb->Attachment[BUFFER_FRONT_RIGHT];
+      /* Same as above. */
+      if (fb->Attachment[BUFFER_FRONT_RIGHT].Type == GL_NONE)
+         return &fb->Attachment[BUFFER_BACK_RIGHT];
+      else
+         return &fb->Attachment[BUFFER_FRONT_RIGHT];
    case GL_BACK_LEFT:
       return &fb->Attachment[BUFFER_BACK_LEFT];
    case GL_BACK_RIGHT:
diff --git a/src/mesa/main/ffvertex_prog.c b/src/mesa/main/ffvertex_prog.c
index d72bc71..18dffc3 100644
--- a/src/mesa/main/ffvertex_prog.c
+++ b/src/mesa/main/ffvertex_prog.c
@@ -293,10 +293,9 @@ struct ureg {
    GLuint file:4;
    GLint idx:9;      /* relative addressing may be negative */
                      /* sizeof(idx) should == sizeof(prog_src_reg::Index) */
-   GLuint abs:1;
    GLuint negate:1;
    GLuint swz:12;
-   GLuint pad:5;
+   GLuint pad:6;
 };
 
 
@@ -325,7 +324,6 @@ static const struct ureg undef = {
    0,
    0,
    0,
-   0,
    0
 };
 
@@ -344,7 +342,6 @@ static struct ureg make_ureg(GLuint file, GLint idx)
    struct ureg reg;
    reg.file = file;
    reg.idx = idx;
-   reg.abs = 0;
    reg.negate = 0;
    reg.swz = SWIZZLE_NOOP;
    reg.pad = 0;
@@ -352,15 +349,6 @@ static struct ureg make_ureg(GLuint file, GLint idx)
 }
 
 
-
-static struct ureg absolute( struct ureg reg )
-{
-   reg.abs = 1;
-   reg.negate = 0;
-   return reg;
-}
-
-
 static struct ureg negate( struct ureg reg )
 {
    reg.negate ^= 1;
@@ -961,7 +949,8 @@ static struct ureg calculate_light_attenuation( struct tnl_program *p,
 
       emit_op2(p, OPCODE_DP3, spot, 0, negate(VPpli), spot_dir_norm);
       emit_op2(p, OPCODE_SLT, slt, 0, swizzle1(spot_dir_norm,W), spot);
-      emit_op2(p, OPCODE_POW, spot, 0, absolute(spot), swizzle1(attenuation, W));
+      emit_op1(p, OPCODE_ABS, spot, 0, spot);
+      emit_op2(p, OPCODE_POW, spot, 0, spot, swizzle1(attenuation, W));
       emit_op2(p, OPCODE_MUL, att, 0, slt, spot);
 
       release_temp(p, spot);
diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c
index 215c14f..e9727ea 100644
--- a/src/mesa/main/formatquery.c
+++ b/src/mesa/main/formatquery.c
@@ -387,13 +387,13 @@ _is_target_supported(struct gl_context *ctx, GLenum target)
     *     "if a particular type of <target> is not supported by the
     *     implementation the "unsupported" answer should be given.
     *     This is not an error."
+    *
+    * For OpenGL ES, queries can only be used with GL_RENDERBUFFER or MS.
     */
    switch(target){
+   case GL_TEXTURE_1D:
    case GL_TEXTURE_2D:
    case GL_TEXTURE_3D:
-      break;
-
-   case GL_TEXTURE_1D:
       if (!_mesa_is_desktop_gl(ctx))
          return false;
       break;
@@ -404,12 +404,12 @@ _is_target_supported(struct gl_context *ctx, GLenum target)
       break;
 
    case GL_TEXTURE_2D_ARRAY:
-      if (!(_mesa_has_EXT_texture_array(ctx) || _mesa_is_gles3(ctx)))
+      if (!_mesa_has_EXT_texture_array(ctx))
          return false;
       break;
 
    case GL_TEXTURE_CUBE_MAP:
-      if (!_mesa_has_ARB_texture_cube_map(ctx))
+      if (ctx->API != API_OPENGL_CORE && !_mesa_has_ARB_texture_cube_map(ctx))
          return false;
       break;
 
@@ -419,7 +419,7 @@ _is_target_supported(struct gl_context *ctx, GLenum target)
       break;
 
    case GL_TEXTURE_RECTANGLE:
-      if (!_mesa_has_NV_texture_rectangle(ctx))
+      if (!_mesa_has_ARB_texture_rectangle(ctx))
           return false;
       break;
 
@@ -962,7 +962,8 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
 
       switch (pname) {
       case GL_INTERNALFORMAT_DEPTH_SIZE:
-         if (!_mesa_has_ARB_depth_texture(ctx) &&
+         if (ctx->API != API_OPENGL_CORE &&
+             !_mesa_has_ARB_depth_texture(ctx) &&
              target != GL_RENDERBUFFER &&
              target != GL_TEXTURE_BUFFER)
             goto end;
diff --git a/src/mesa/main/genmipmap.c b/src/mesa/main/genmipmap.c
index d917220..2afe7be 100644
--- a/src/mesa/main/genmipmap.c
+++ b/src/mesa/main/genmipmap.c
@@ -85,10 +85,15 @@ _mesa_is_valid_generate_texture_mipmap_internalformat(struct gl_context *ctx,
        *  not specified with an unsized internal format from table 8.3 or a
        *  sized internal format that is both color-renderable and
        *  texture-filterable according to table 8.10."
+       *
+       * GL_EXT_texture_format_BGRA8888 adds a GL_BGRA_EXT unsized internal
+       * format, and includes it in a very similar looking table.  So we
+       * include it here as well.
        */
       return internalformat == GL_RGBA || internalformat == GL_RGB ||
              internalformat == GL_LUMINANCE_ALPHA ||
              internalformat == GL_LUMINANCE || internalformat == GL_ALPHA ||
+             internalformat == GL_BGRA_EXT ||
              (_mesa_is_es3_color_renderable(internalformat) &&
               _mesa_is_es3_texture_filterable(internalformat));
    }
@@ -144,6 +149,11 @@ _mesa_generate_texture_mipmap(struct gl_context *ctx,
       return;
    }
 
+   if (srcImage->Width == 0 || srcImage->Height == 0) {
+      _mesa_unlock_texture(ctx, texObj);
+      return;
+   }
+
    if (target == GL_TEXTURE_CUBE_MAP) {
       GLuint face;
       for (face = 0; face < 6; face++) {
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index 9f70749..7623b93 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -411,6 +411,14 @@ static const int extra_ARB_gpu_shader5_or_oes_geometry_shader[] = {
 static const int extra_ARB_gpu_shader5_or_OES_sample_variables[] = {
    EXT(ARB_gpu_shader5),
    EXT(OES_sample_variables),
+   EXTRA_END
+};
+
+static const int extra_KHR_robustness_or_GL[] = {
+   EXT(KHR_robustness),
+   EXTRA_API_GL,
+   EXTRA_API_GL_CORE,
+   EXTRA_END
 };
 
 EXTRA_EXT(ARB_texture_cube_map);
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index bfcbfd6..ea3649a 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -338,6 +338,9 @@ descriptor=[
 
 # blend_func_extended
   [ "MAX_DUAL_SOURCE_DRAW_BUFFERS", "CONTEXT_INT(Const.MaxDualSourceDrawBuffers), extra_ARB_blend_func_extended" ],
+
+# GL_ARB_robustness / GL_KHR_robustness
+  [ "RESET_NOTIFICATION_STRATEGY_ARB", "CONTEXT_ENUM(Const.ResetStrategy), extra_KHR_robustness_or_GL" ],
 ]},
 
 # GLES3 is not a typo.
@@ -842,9 +845,6 @@ descriptor=[
 # GL 3.2
   [ "CONTEXT_PROFILE_MASK", "CONTEXT_INT(Const.ProfileMask), extra_version_32" ],
 
-# GL_ARB_robustness
-  [ "RESET_NOTIFICATION_STRATEGY_ARB", "CONTEXT_ENUM(Const.ResetStrategy), NO_EXTRA" ],
-
 # GL_ARB_timer_query
   [ "TIMESTAMP", "LOC_CUSTOM, TYPE_INT64, 0, extra_ARB_timer_query" ],
 
diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c
index 24ce7b0..6df09bb 100644
--- a/src/mesa/main/glformats.c
+++ b/src/mesa/main/glformats.c
@@ -907,6 +907,29 @@ _mesa_is_astc_format(GLenum internalFormat)
 }
 
 /**
+ * Test if the given format is an ETC2 format.
+ */
+GLboolean
+_mesa_is_etc2_format(GLenum internalFormat)
+{
+   switch (internalFormat) {
+   case GL_COMPRESSED_RGB8_ETC2:
+   case GL_COMPRESSED_SRGB8_ETC2:
+   case GL_COMPRESSED_RGBA8_ETC2_EAC:
+   case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC:
+   case GL_COMPRESSED_R11_EAC:
+   case GL_COMPRESSED_RG11_EAC:
+   case GL_COMPRESSED_SIGNED_R11_EAC:
+   case GL_COMPRESSED_SIGNED_RG11_EAC:
+   case GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+   case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/**
  * Test if the given format is an integer (non-normalized) format.
  */
 GLboolean
@@ -2495,7 +2518,6 @@ _mesa_base_tex_format(const struct gl_context *ctx, GLint internalFormat)
       case GL_RGBA8I_EXT:
       case GL_RGBA16I_EXT:
       case GL_RGBA32I_EXT:
-      case GL_RGB10_A2UI:
          return GL_RGBA;
       case GL_RGB8UI_EXT:
       case GL_RGB16UI_EXT:
@@ -2507,6 +2529,13 @@ _mesa_base_tex_format(const struct gl_context *ctx, GLint internalFormat)
       }
    }
 
+   if (ctx->Extensions.ARB_texture_rgb10_a2ui) {
+      switch (internalFormat) {
+      case GL_RGB10_A2UI:
+         return GL_RGBA;
+      }
+   }
+
    if (ctx->Extensions.EXT_texture_integer) {
       switch (internalFormat) {
       case GL_ALPHA8UI_EXT:
diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h
index c73f464..474ede2 100644
--- a/src/mesa/main/glformats.h
+++ b/src/mesa/main/glformats.h
@@ -61,6 +61,9 @@ extern GLboolean
 _mesa_is_astc_format(GLenum internalFormat);
 
 extern GLboolean
+_mesa_is_etc2_format(GLenum internalFormat);
+
+extern GLboolean
 _mesa_is_type_unsigned(GLenum type);
 
 extern GLboolean
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 5956ce4..35ce0f2 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -1385,13 +1385,24 @@ _mesa_get_program_resourceiv(struct gl_shader_program *shProg,
 
 static bool
 validate_io(struct gl_shader_program *producer,
-            struct gl_shader_program *consumer)
+            struct gl_shader_program *consumer,
+            gl_shader_stage producer_stage,
+            gl_shader_stage consumer_stage)
 {
    if (producer == consumer)
       return true;
 
+   const bool nonarray_stage_to_array_stage =
+      producer_stage != MESA_SHADER_TESS_CTRL &&
+      (consumer_stage == MESA_SHADER_GEOMETRY ||
+       consumer_stage == MESA_SHADER_TESS_CTRL ||
+       consumer_stage == MESA_SHADER_TESS_EVAL);
+
    bool valid = true;
 
+   void *name_buffer = NULL;
+   size_t name_buffer_size = 0;
+
    gl_shader_variable const **outputs =
       (gl_shader_variable const **) calloc(producer->NumProgramResourceList,
                                            sizeof(gl_shader_variable *));
@@ -1463,11 +1474,52 @@ validate_io(struct gl_shader_program *producer,
             }
          }
       } else {
+         char *consumer_name = consumer_var->name;
+
+         if (nonarray_stage_to_array_stage &&
+             consumer_var->interface_type != NULL &&
+             consumer_var->interface_type->is_array() &&
+             !is_gl_identifier(consumer_var->name)) {
+            const size_t name_len = strlen(consumer_var->name);
+
+            if (name_len >= name_buffer_size) {
+               free(name_buffer);
+
+               name_buffer_size = name_len + 1;
+               name_buffer = malloc(name_buffer_size);
+               if (name_buffer == NULL) {
+                  valid = false;
+                  goto out;
+               }
+            }
+
+            consumer_name = (char *) name_buffer;
+
+            char *s = strchr(consumer_var->name, '[');
+            if (s == NULL) {
+               valid = false;
+               goto out;
+            }
+
+            char *t = strchr(s, ']');
+            if (t == NULL) {
+               valid = false;
+               goto out;
+            }
+
+            assert(t[1] == '.' || t[1] == '[');
+
+            const ptrdiff_t base_name_len = s - consumer_var->name;
+
+            memcpy(consumer_name, consumer_var->name, base_name_len);
+            strcpy(consumer_name + base_name_len, t + 1);
+         }
+
          for (unsigned j = 0; j < num_outputs; j++) {
             const gl_shader_variable *const var = outputs[j];
 
             if (!var->explicit_location &&
-                strcmp(consumer_var->name, var->name) == 0) {
+                strcmp(consumer_name, var->name) == 0) {
                producer_var = var;
                match_index = j;
                break;
@@ -1529,25 +1581,53 @@ validate_io(struct gl_shader_program *producer,
        * Note that location mismatches are detected by the loops above that
        * find the producer variable that goes with the consumer variable.
        */
-      if (producer_var->type != consumer_var->type ||
-          producer_var->interpolation != consumer_var->interpolation ||
-          producer_var->precision != consumer_var->precision) {
+      if (nonarray_stage_to_array_stage) {
+         if (!consumer_var->type->is_array() ||
+             consumer_var->type->fields.array != producer_var->type) {
+            valid = false;
+            goto out;
+         }
+
+         if (consumer_var->interface_type != NULL) {
+            if (!consumer_var->interface_type->is_array() ||
+                consumer_var->interface_type->fields.array != producer_var->interface_type) {
+               valid = false;
+               goto out;
+            }
+         } else if (producer_var->interface_type != NULL) {
+            valid = false;
+            goto out;
+         }
+      } else {
+         if (producer_var->type != consumer_var->type) {
+            valid = false;
+            goto out;
+         }
+
+         if (producer_var->interface_type != consumer_var->interface_type) {
+            valid = false;
+            goto out;
+         }
+      }
+
+      if (producer_var->interpolation != consumer_var->interpolation) {
          valid = false;
          goto out;
       }
 
-      if (producer_var->outermost_struct_type != consumer_var->outermost_struct_type) {
+      if (producer_var->precision != consumer_var->precision) {
          valid = false;
          goto out;
       }
 
-      if (producer_var->interface_type != consumer_var->interface_type) {
+      if (producer_var->outermost_struct_type != consumer_var->outermost_struct_type) {
          valid = false;
          goto out;
       }
    }
 
  out:
+   free(name_buffer);
    free(outputs);
    return valid && num_outputs == 0;
 }
@@ -1579,7 +1659,9 @@ _mesa_validate_pipeline_io(struct gl_pipeline_object *pipeline)
          if (shProg[idx]->_LinkedShaders[idx]->Stage == MESA_SHADER_COMPUTE)
             break;
 
-         if (!validate_io(shProg[prev], shProg[idx]))
+         if (!validate_io(shProg[prev], shProg[idx],
+                          shProg[prev]->_LinkedShaders[prev]->Stage,
+                          shProg[idx]->_LinkedShaders[idx]->Stage))
             return false;
 
          prev = idx;
diff --git a/src/mesa/main/texgetimage.c b/src/mesa/main/texgetimage.c
index fc3cc6b..3dde03f 100644
--- a/src/mesa/main/texgetimage.c
+++ b/src/mesa/main/texgetimage.c
@@ -502,13 +502,15 @@ get_tex_rgba_uncompressed(struct gl_context *ctx, GLuint dimensions,
           */
          if (format == rgba_format) {
             rgba = dest;
-         } else if (rgba == NULL) { /* Allocate the RGBA buffer only once */
+         } else {
             need_convert = true;
-            rgba = malloc(height * rgba_stride);
-            if (!rgba) {
-               _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage()");
-               ctx->Driver.UnmapTextureImage(ctx, texImage, img);
-               return;
+            if (rgba == NULL) { /* Allocate the RGBA buffer only once */
+               rgba = malloc(height * rgba_stride);
+               if (!rgba) {
+                  _mesa_error(ctx, GL_OUT_OF_MEMORY, "glGetTexImage()");
+                  ctx->Driver.UnmapTextureImage(ctx, texImage, img);
+                  return;
+               }
             }
          }
 
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 58b7f27..7b13a28 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -1300,6 +1300,7 @@ bool
 _mesa_format_no_online_compression(const struct gl_context *ctx, GLenum format)
 {
    return _mesa_is_astc_format(format) ||
+          _mesa_is_etc2_format(format) ||
           compressedteximage_only_format(ctx, format);
 }
 
@@ -2587,10 +2588,16 @@ check_rtt_cb(GLuint key, void *data, void *userData)
              att->Texture == texObj &&
              att->TextureLevel == level &&
              att->CubeMapFace == face) {
-            _mesa_update_texture_renderbuffer(ctx, ctx->DrawBuffer, att);
+            _mesa_update_texture_renderbuffer(ctx, fb, att);
             assert(att->Renderbuffer->TexImage);
             /* Mark fb status as indeterminate to force re-validation */
             fb->_Status = 0;
+
+            /* Make sure that the revalidation actually happens if this is
+             * being done to currently-bound buffers.
+             */
+            if (fb == ctx->DrawBuffer || fb == ctx->ReadBuffer)
+               ctx->NewState |= _NEW_BUFFERS;
          }
       }
    }
diff --git a/src/mesa/main/texstorage.c b/src/mesa/main/texstorage.c
index f4a0760..72ed869 100644
--- a/src/mesa/main/texstorage.c
+++ b/src/mesa/main/texstorage.c
@@ -179,9 +179,7 @@ clear_texture_fields(struct gl_context *ctx,
             return;
 	 }
 
-         _mesa_init_teximage_fields(ctx, texImage,
-                                    0, 0, 0, 0, /* w, h, d, border */
-                                    GL_NONE, MESA_FORMAT_NONE);
+         _mesa_clear_texture_image(ctx, texImage);
       }
    }
 }
diff --git a/src/mesa/state_tracker/st_atom_array.c b/src/mesa/state_tracker/st_atom_array.c
index 0847184..758d8b4 100644
--- a/src/mesa/state_tracker/st_atom_array.c
+++ b/src/mesa/state_tracker/st_atom_array.c
@@ -386,6 +386,7 @@ static void init_velement(struct pipe_vertex_element *velement,
 }
 
 static void init_velement_lowered(struct st_context *st,
+                                  const struct st_vertex_program *vp,
                                   struct pipe_vertex_element *velements,
                                   int src_offset, int format,
                                   int instance_divisor, int vbo_index,
@@ -396,23 +397,33 @@ static void init_velement_lowered(struct st_context *st,
    if (doubles) {
       int lower_format;
 
-      if (nr_components == 1)
+      if (nr_components < 2)
          lower_format = PIPE_FORMAT_R32G32_UINT;
-      else if (nr_components >= 2)
+      else
          lower_format = PIPE_FORMAT_R32G32B32A32_UINT;
 
       init_velement(&velements[idx], src_offset,
                     lower_format, instance_divisor, vbo_index);
       idx++;
 
-      if (nr_components > 2) {
-         if (nr_components == 3)
-            lower_format = PIPE_FORMAT_R32G32_UINT;
-         else if (nr_components >= 4)
-            lower_format = PIPE_FORMAT_R32G32B32A32_UINT;
+      if (idx < vp->num_inputs &&
+          vp->index_to_input[idx] == ST_DOUBLE_ATTRIB_PLACEHOLDER) {
+         if (nr_components >= 3) {
+            if (nr_components == 3)
+               lower_format = PIPE_FORMAT_R32G32_UINT;
+            else
+               lower_format = PIPE_FORMAT_R32G32B32A32_UINT;
+
+            init_velement(&velements[idx], src_offset + 4 * sizeof(float),
+                        lower_format, instance_divisor, vbo_index);
+         } else {
+            /* The values here are undefined. Fill in some conservative
+             * dummy values.
+             */
+            init_velement(&velements[idx], src_offset, PIPE_FORMAT_R32G32_UINT,
+                          instance_divisor, vbo_index);
+         }
 
-         init_velement(&velements[idx], src_offset + 4 * sizeof(float),
-                       lower_format, instance_divisor, vbo_index);
          idx++;
       }
    } else {
@@ -435,10 +446,9 @@ setup_interleaved_attribs(struct st_context *st,
                           const struct st_vp_variant *vpv,
                           const struct gl_client_array **arrays,
                           struct pipe_vertex_buffer *vbuffer,
-                          struct pipe_vertex_element velements[],
-                          unsigned *num_velements)
+                          struct pipe_vertex_element velements[])
 {
-   GLuint attr, attr_idx;
+   GLuint attr;
    const GLubyte *low_addr = NULL;
    GLboolean usingVBO;      /* all arrays in a VBO? */
    struct gl_buffer_object *bufobj;
@@ -481,15 +491,13 @@ setup_interleaved_attribs(struct st_context *st,
    /* are the arrays in user space? */
    usingVBO = _mesa_is_bufferobj(bufobj);
 
-   attr_idx = 0;
-   for (attr = 0; attr < vpv->num_inputs; attr++) {
+   for (attr = 0; attr < vpv->num_inputs;) {
       const struct gl_client_array *array;
       unsigned src_offset;
       unsigned src_format;
 
       array = get_client_array(vp, arrays, attr);
-      if (!array)
-         continue;
+      assert(array);
 
       src_offset = (unsigned) (array->Ptr - low_addr);
       assert(array->_ElementSize ==
@@ -501,13 +509,11 @@ setup_interleaved_attribs(struct st_context *st,
                                          array->Normalized,
                                          array->Integer);
 
-      init_velement_lowered(st, velements, src_offset, src_format,
+      init_velement_lowered(st, vp, velements, src_offset, src_format,
                             array->InstanceDivisor, 0,
-                            array->Size, array->Doubles, &attr_idx);
+                            array->Size, array->Doubles, &attr);
    }
 
-   *num_velements = attr_idx;
-
    /*
     * Return the vbuffer info and setup user-space attrib info, if needed.
     */
@@ -554,25 +560,25 @@ setup_non_interleaved_attribs(struct st_context *st,
                               const struct gl_client_array **arrays,
                               struct pipe_vertex_buffer vbuffer[],
                               struct pipe_vertex_element velements[],
-                              unsigned *num_velements)
+                              unsigned *num_vbuffers)
 {
    struct gl_context *ctx = st->ctx;
-   GLuint attr, attr_idx = 0;
+   GLuint attr;
 
-   for (attr = 0; attr < vpv->num_inputs; attr++) {
+   *num_vbuffers = 0;
+
+   for (attr = 0; attr < vpv->num_inputs;) {
       const GLuint mesaAttr = vp->index_to_input[attr];
       const struct gl_client_array *array;
       struct gl_buffer_object *bufobj;
       GLsizei stride;
       unsigned src_format;
+      unsigned bufidx;
 
       array = get_client_array(vp, arrays, attr);
-      if (!array) {
-         vbuffer[attr].buffer = NULL;
-         vbuffer[attr].user_buffer = NULL;
-         vbuffer[attr].buffer_offset = 0;
-         continue;
-      }
+      assert(array);
+
+      bufidx = (*num_vbuffers)++;
 
       stride = array->StrideB;
       bufobj = array->BufferObj;
@@ -590,9 +596,9 @@ setup_non_interleaved_attribs(struct st_context *st,
             return FALSE; /* out-of-memory error probably */
          }
 
-         vbuffer[attr].buffer = stobj->buffer;
-         vbuffer[attr].user_buffer = NULL;
-         vbuffer[attr].buffer_offset = pointer_to_offset(array->Ptr);
+         vbuffer[bufidx].buffer = stobj->buffer;
+         vbuffer[bufidx].user_buffer = NULL;
+         vbuffer[bufidx].buffer_offset = pointer_to_offset(array->Ptr);
       }
       else {
          /* wrap user data */
@@ -609,13 +615,13 @@ setup_non_interleaved_attribs(struct st_context *st,
 
          assert(ptr);
 
-         vbuffer[attr].buffer = NULL;
-         vbuffer[attr].user_buffer = ptr;
-         vbuffer[attr].buffer_offset = 0;
+         vbuffer[bufidx].buffer = NULL;
+         vbuffer[bufidx].user_buffer = ptr;
+         vbuffer[bufidx].buffer_offset = 0;
       }
 
       /* common-case setup */
-      vbuffer[attr].stride = stride; /* in bytes */
+      vbuffer[bufidx].stride = stride; /* in bytes */
 
       src_format = st_pipe_vertex_format(array->Type,
                                          array->Size,
@@ -623,13 +629,11 @@ setup_non_interleaved_attribs(struct st_context *st,
                                          array->Normalized,
                                          array->Integer);
 
-      init_velement_lowered(st, velements, 0, src_format,
-                            array->InstanceDivisor, attr,
-                            array->Size, array->Doubles, &attr_idx);
-
+      init_velement_lowered(st, vp, velements, 0, src_format,
+                            array->InstanceDivisor, bufidx,
+                            array->Size, array->Doubles, &attr);
    }
 
-   *num_velements = attr_idx;
    return TRUE;
 }
 
@@ -641,7 +645,7 @@ static void update_array(struct st_context *st)
    const struct st_vp_variant *vpv;
    struct pipe_vertex_buffer vbuffer[PIPE_MAX_SHADER_INPUTS];
    struct pipe_vertex_element velements[PIPE_MAX_ATTRIBS];
-   unsigned num_vbuffers, num_velements;
+   unsigned num_vbuffers;
 
    st->vertex_array_out_of_memory = FALSE;
 
@@ -659,23 +663,21 @@ static void update_array(struct st_context *st)
     * Setup the vbuffer[] and velements[] arrays.
     */
    if (is_interleaved_arrays(vp, vpv, arrays)) {
-      if (!setup_interleaved_attribs(st, vp, vpv, arrays, vbuffer, velements, &num_velements)) {
+      if (!setup_interleaved_attribs(st, vp, vpv, arrays, vbuffer, velements)) {
          st->vertex_array_out_of_memory = TRUE;
          return;
       }
 
       num_vbuffers = 1;
-      if (num_velements == 0)
+      if (vpv->num_inputs == 0)
          num_vbuffers = 0;
    }
    else {
       if (!setup_non_interleaved_attribs(st, vp, vpv, arrays, vbuffer,
-                                         velements, &num_velements)) {
+                                         velements, &num_vbuffers)) {
          st->vertex_array_out_of_memory = TRUE;
          return;
       }
-
-      num_vbuffers = vpv->num_inputs;
    }
 
    cso_set_vertex_buffers(st->cso_context, 0, num_vbuffers, vbuffer);
@@ -685,7 +687,7 @@ static void update_array(struct st_context *st)
                              st->last_num_vbuffers - num_vbuffers, NULL);
    }
    st->last_num_vbuffers = num_vbuffers;
-   cso_set_vertex_elements(st->cso_context, num_velements, velements);
+   cso_set_vertex_elements(st->cso_context, vpv->num_inputs, velements);
 }
 
 
diff --git a/src/mesa/state_tracker/st_atom_texture.c b/src/mesa/state_tracker/st_atom_texture.c
index 4b7ad77..3d409a6 100644
--- a/src/mesa/state_tracker/st_atom_texture.c
+++ b/src/mesa/state_tracker/st_atom_texture.c
@@ -304,12 +304,10 @@ st_create_texture_sampler_view_from_stobj(struct st_context *st,
       templ.target = gl_target_to_pipe(stObj->base.Target);
    }
 
-   if (swizzle != SWIZZLE_NOOP) {
-      templ.swizzle_r = GET_SWZ(swizzle, 0);
-      templ.swizzle_g = GET_SWZ(swizzle, 1);
-      templ.swizzle_b = GET_SWZ(swizzle, 2);
-      templ.swizzle_a = GET_SWZ(swizzle, 3);
-   }
+   templ.swizzle_r = GET_SWZ(swizzle, 0);
+   templ.swizzle_g = GET_SWZ(swizzle, 1);
+   templ.swizzle_b = GET_SWZ(swizzle, 2);
+   templ.swizzle_a = GET_SWZ(swizzle, 3);
 
    return st->pipe->create_sampler_view(st->pipe, stObj->pt, &templ);
 }
diff --git a/src/mesa/state_tracker/st_cb_clear.c b/src/mesa/state_tracker/st_cb_clear.c
index 362cef4..1acec7c 100644
--- a/src/mesa/state_tracker/st_cb_clear.c
+++ b/src/mesa/state_tracker/st_cb_clear.c
@@ -313,11 +313,13 @@ clear_with_quad(struct gl_context *ctx, unsigned clear_buffers)
 static inline GLboolean
 is_scissor_enabled(struct gl_context *ctx, struct gl_renderbuffer *rb)
 {
+   const struct gl_scissor_rect *scissor = &ctx->Scissor.ScissorArray[0];
+
    return (ctx->Scissor.EnableFlags & 1) &&
-          (ctx->Scissor.ScissorArray[0].X > 0 ||
-           ctx->Scissor.ScissorArray[0].Y > 0 ||
-           (unsigned) ctx->Scissor.ScissorArray[0].Width < rb->Width ||
-           (unsigned) ctx->Scissor.ScissorArray[0].Height < rb->Height);
+          (scissor->X > 0 ||
+           scissor->Y > 0 ||
+           scissor->X + scissor->Width < rb->Width ||
+           scissor->Y + scissor->Height < rb->Height);
 }
 
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 3db5749..c013d3b 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -203,8 +203,19 @@ st_draw_vbo(struct gl_context *ctx,
       /* The VBO module handles restart for the non-indexed GLDrawArrays
        * so we only set these fields for indexed drawing:
        */
-      info.primitive_restart = ctx->Array._PrimitiveRestart;
-      info.restart_index = _mesa_primitive_restart_index(ctx, ib->type);
+      if (ctx->Array._PrimitiveRestart) {
+         info.restart_index = _mesa_primitive_restart_index(ctx, ib->type);
+
+         /* Enable primitive restart only when the restart index can have an
+          * effect. This is required for correctness in radeonsi VI support,
+          * though other hardware may also benefit from taking a faster,
+          * non-restart path when possible.
+          */
+         if ((ibuffer.index_size >= 4) ||
+             (ibuffer.index_size >= 2 && info.restart_index <= 0xffff) ||
+             (info.restart_index <= 0xff))
+            info.primitive_restart = true;
+      }
    }
    else {
       /* Transform feedback drawing is always non-indexed. */
diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c
index 9a280fc..5f76241 100644
--- a/src/mesa/state_tracker/st_format.c
+++ b/src/mesa/state_tracker/st_format.c
@@ -37,6 +37,7 @@
 #include "main/enums.h"
 #include "main/formats.h"
 #include "main/glformats.h"
+#include "main/texcompress.h"
 #include "main/texgetimage.h"
 #include "main/teximage.h"
 #include "main/texstore.h"
@@ -2282,6 +2283,12 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target,
    }
 
    if (pFormat == PIPE_FORMAT_NONE) {
+      /* lie about using etc1/etc2 natively if we do decoding tricks */
+      mFormat = _mesa_glenum_to_compressed_format(internalFormat);
+      if ((mFormat == MESA_FORMAT_ETC1_RGB8 && !st->has_etc1) ||
+          (_mesa_is_format_etc2(mFormat) && !st->has_etc2))
+          return mFormat;
+
       /* no luck at all */
       return MESA_FORMAT_NONE;
    }
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index aa443a5..ee117c9 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -255,6 +255,7 @@ public:
    ir_instruction *ir;
    GLboolean cond_update;
    bool saturate;
+   bool is_64bit_expanded;
    st_src_reg sampler; /**< sampler register */
    int sampler_base;
    int sampler_array_size; /**< 1-based size of sampler array, 1 if not array */
@@ -515,7 +516,8 @@ public:
                           unsigned *array_size,
                           unsigned *base,
                           unsigned *index,
-                          st_src_reg *reladdr);
+                          st_src_reg *reladdr,
+                          bool opaque);
   void calc_deref_offsets(ir_dereference *head,
                           ir_dereference *tail,
                           unsigned *array_elements,
@@ -523,6 +525,7 @@ public:
                           unsigned *index,
                           st_src_reg *indirect,
                           unsigned *location);
+   st_src_reg canonicalize_gather_offset(st_src_reg offset);
 
    bool try_emit_mad(ir_expression *ir,
               int mul_operand);
@@ -670,6 +673,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
    inst->src[1] = src1;
    inst->src[2] = src2;
    inst->src[3] = src3;
+   inst->is_64bit_expanded = false;
    inst->ir = ir;
    inst->dead_mask = 0;
    /* default to float, for paths where this is not initialized
@@ -792,6 +796,7 @@ glsl_to_tgsi_visitor::emit_asm(ir_instruction *ir, unsigned op,
             dinst->prev = NULL;
          }
          this->instructions.push_tail(dinst);
+         dinst->is_64bit_expanded = true;
 
          /* modify the destination if we are splitting */
          for (j = 0; j < 2; j++) {
@@ -1136,7 +1141,7 @@ glsl_to_tgsi_visitor::st_src_reg_for_double(double val)
    uval[0].u = *(uint32_t *)&val;
    uval[1].u = *(((uint32_t *)&val) + 1);
    src.index = add_constant(src.file, uval, 1, GL_DOUBLE, &src.swizzle);
-
+   src.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_X, SWIZZLE_Y);
    return src;
 }
 
@@ -1958,12 +1963,14 @@ glsl_to_tgsi_visitor::visit_expression(ir_expression* ir, st_src_reg *op)
          emit_asm(ir, TGSI_OPCODE_TRUNC, result_dst, op[0]);
       break;
    case ir_unop_bitcast_f2i:
-      result_src = op[0];
-      result_src.type = GLSL_TYPE_INT;
-      break;
    case ir_unop_bitcast_f2u:
-      result_src = op[0];
-      result_src.type = GLSL_TYPE_UINT;
+      /* Make sure we don't propagate the negate modifier to integer opcodes. */
+      if (op[0].negate)
+         emit_asm(ir, TGSI_OPCODE_MOV, result_dst, op[0]);
+      else
+         result_src = op[0];
+      result_src.type = ir->operation == ir_unop_bitcast_f2i ? GLSL_TYPE_INT :
+                                                               GLSL_TYPE_UINT;
       break;
    case ir_unop_bitcast_i2f:
    case ir_unop_bitcast_u2f:
@@ -2792,6 +2799,7 @@ glsl_to_tgsi_visitor::emit_block_mov(ir_assignment *ir, const struct glsl_type *
 
    assert(type->is_scalar() || type->is_vector());
 
+   l->type = type->base_type;
    r->type = type->base_type;
    if (cond) {
       st_src_reg l_src = st_src_reg(*l);
@@ -2903,6 +2911,7 @@ glsl_to_tgsi_visitor::visit(ir_assignment *ir)
    } else if (ir->rhs->as_expression() &&
               this->instructions.get_tail() &&
               ir->rhs == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->ir &&
+              !((glsl_to_tgsi_instruction *)this->instructions.get_tail())->is_64bit_expanded &&
               type_size(ir->lhs->type) == 1 &&
               l.writemask == ((glsl_to_tgsi_instruction *)this->instructions.get_tail())->dst[0].writemask) {
       /* To avoid emitting an extra MOV when assigning an expression to a
@@ -3144,7 +3153,7 @@ glsl_to_tgsi_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
    st_src_reg offset;
    unsigned array_size = 0, base = 0, index = 0;
 
-   get_deref_offsets(deref, &array_size, &base, &index, &offset);
+   get_deref_offsets(deref, &array_size, &base, &index, &offset, false);
 
    if (offset.file != PROGRAM_UNDEFINED) {
       emit_asm(ir, TGSI_OPCODE_MUL, st_dst_reg(offset),
@@ -3451,7 +3460,7 @@ glsl_to_tgsi_visitor::visit_image_intrinsic(ir_call *ir)
    st_src_reg image(PROGRAM_IMAGE, 0, GLSL_TYPE_UINT);
 
    get_deref_offsets(img, &sampler_array_size, &sampler_base,
-                     (unsigned int *)&image.index, &reladdr);
+                     (unsigned int *)&image.index, &reladdr, true);
    if (reladdr.file != PROGRAM_UNDEFINED) {
       emit_arl(ir, sampler_reladdr, reladdr);
       image.reladdr = ralloc(mem_ctx, st_src_reg);
@@ -3811,7 +3820,8 @@ glsl_to_tgsi_visitor::get_deref_offsets(ir_dereference *ir,
                                         unsigned *array_size,
                                         unsigned *base,
                                         unsigned *index,
-                                        st_src_reg *reladdr)
+                                        st_src_reg *reladdr,
+                                        bool opaque)
 {
    GLuint shader = _mesa_program_enum_to_shader_stage(this->prog->Target);
    unsigned location = 0;
@@ -3836,12 +3846,27 @@ glsl_to_tgsi_visitor::get_deref_offsets(ir_dereference *ir,
       *array_size = 1;
    }
 
-   if (location != 0xffffffff) {
+   if (opaque) {
+      assert(location != 0xffffffff);
       *base += this->shader_program->UniformStorage[location].opaque[shader].index;
       *index += this->shader_program->UniformStorage[location].opaque[shader].index;
    }
 }
 
+st_src_reg
+glsl_to_tgsi_visitor::canonicalize_gather_offset(st_src_reg offset)
+{
+   if (offset.reladdr || offset.reladdr2) {
+      st_src_reg tmp = get_temp(glsl_type::ivec2_type);
+      st_dst_reg tmp_dst = st_dst_reg(tmp);
+      tmp_dst.writemask = WRITEMASK_XY;
+      emit_asm(NULL, TGSI_OPCODE_MOV, tmp_dst, offset);
+      return tmp;
+   }
+
+   return offset;
+}
+
 void
 glsl_to_tgsi_visitor::visit(ir_texture *ir)
 {
@@ -3967,9 +3992,10 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
                offset[i].index += i * type_size(elt_type);
                offset[i].type = elt_type->base_type;
                offset[i].swizzle = swizzle_for_size(elt_type->vector_elements);
+               offset[i] = canonicalize_gather_offset(offset[i]);
             }
          } else {
-            offset[0] = this->result;
+            offset[0] = canonicalize_gather_offset(this->result);
          }
       }
       break;
@@ -4075,7 +4101,7 @@ glsl_to_tgsi_visitor::visit(ir_texture *ir)
    }
 
    get_deref_offsets(ir->sampler, &sampler_array_size, &sampler_base,
-                     &sampler_index, &reladdr);
+                     &sampler_index, &reladdr, true);
    if (reladdr.file != PROGRAM_UNDEFINED)
       emit_arl(ir, sampler_reladdr, reladdr);
 
@@ -5526,60 +5552,24 @@ translate_src(struct st_translate *t, const st_src_reg *src_reg)
 
 static struct tgsi_texture_offset
 translate_tex_offset(struct st_translate *t,
-                     const st_src_reg *in_offset, int idx)
+                     const st_src_reg *in_offset)
 {
    struct tgsi_texture_offset offset;
-   struct ureg_src imm_src;
-   struct ureg_dst dst;
-   int array;
+   struct ureg_src src = translate_src(t, in_offset);
 
-   switch (in_offset->file) {
-   case PROGRAM_IMMEDIATE:
-      assert(in_offset->index >= 0 && in_offset->index < t->num_immediates);
-      imm_src = t->immediates[in_offset->index];
+   offset.File = src.File;
+   offset.Index = src.Index;
+   offset.SwizzleX = src.SwizzleX;
+   offset.SwizzleY = src.SwizzleY;
+   offset.SwizzleZ = src.SwizzleZ;
+   offset.Padding = 0;
 
-      offset.File = imm_src.File;
-      offset.Index = imm_src.Index;
-      offset.SwizzleX = imm_src.SwizzleX;
-      offset.SwizzleY = imm_src.SwizzleY;
-      offset.SwizzleZ = imm_src.SwizzleZ;
-      offset.Padding = 0;
-      break;
-   case PROGRAM_INPUT:
-      imm_src = t->inputs[t->inputMapping[in_offset->index]];
-      offset.File = imm_src.File;
-      offset.Index = imm_src.Index;
-      offset.SwizzleX = GET_SWZ(in_offset->swizzle, 0);
-      offset.SwizzleY = GET_SWZ(in_offset->swizzle, 1);
-      offset.SwizzleZ = GET_SWZ(in_offset->swizzle, 2);
-      offset.Padding = 0;
-      break;
-   case PROGRAM_TEMPORARY:
-      imm_src = ureg_src(t->temps[in_offset->index]);
-      offset.File = imm_src.File;
-      offset.Index = imm_src.Index;
-      offset.SwizzleX = GET_SWZ(in_offset->swizzle, 0);
-      offset.SwizzleY = GET_SWZ(in_offset->swizzle, 1);
-      offset.SwizzleZ = GET_SWZ(in_offset->swizzle, 2);
-      offset.Padding = 0;
-      break;
-   case PROGRAM_ARRAY:
-      array = in_offset->index >> 16;
-
-      assert(array >= 0);
-      assert(array < (int)t->num_temp_arrays);
+   assert(!src.Indirect);
+   assert(!src.DimIndirect);
+   assert(!src.Dimension);
+   assert(!src.Absolute); /* those shouldn't be used with integers anyway */
+   assert(!src.Negate);
 
-      dst = t->arrays[array];
-      offset.File = dst.File;
-      offset.Index = dst.Index + (in_offset->index & 0xFFFF) - 0x8000;
-      offset.SwizzleX = GET_SWZ(in_offset->swizzle, 0);
-      offset.SwizzleY = GET_SWZ(in_offset->swizzle, 1);
-      offset.SwizzleZ = GET_SWZ(in_offset->swizzle, 2);
-      offset.Padding = 0;
-      break;
-   default:
-      break;
-   }
    return offset;
 }
 
@@ -5643,7 +5633,7 @@ compile_tgsi_instruction(struct st_translate *t,
             ureg_src_indirect(src[num_src], ureg_src(t->address[2]));
       num_src++;
       for (i = 0; i < (int)inst->tex_offset_num_offset; i++) {
-         texoffsets[i] = translate_tex_offset(t, &inst->tex_offsets[i], i);
+         texoffsets[i] = translate_tex_offset(t, &inst->tex_offsets[i]);
       }
       tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow);
 
@@ -6045,7 +6035,11 @@ st_translate_program(
                               inputSemanticName[i], inputSemanticIndex[i],
                               interpMode[i], 0, interpLocation[i],
                               array_id, array_size);
-            i += array_size - 1;
+
+            GLuint base_attr = inputSlotToAttr[i];
+            while (i + 1 < numInputs &&
+                   inputSlotToAttr[i + 1] < base_attr + array_size)
+               ++i;
          }
          else {
             t->inputs[i] = ureg_DECL_fs_input_cyl_centroid(ureg,
diff --git a/src/mesa/state_tracker/st_vdpau.c b/src/mesa/state_tracker/st_vdpau.c
index dffa52f..4f599dd 100644
--- a/src/mesa/state_tracker/st_vdpau.c
+++ b/src/mesa/state_tracker/st_vdpau.c
@@ -65,6 +65,7 @@ st_vdpau_video_surface_gallium(struct gl_context *ctx, const void *vdpSurface,
 
    struct pipe_video_buffer *buffer;
    struct pipe_sampler_view **samplers;
+   struct pipe_resource *res = NULL;
 
    getProcAddr = (void *)ctx->vdpGetProcAddress;
    if (getProcAddr(device, VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM, (void**)&f))
@@ -82,7 +83,8 @@ st_vdpau_video_surface_gallium(struct gl_context *ctx, const void *vdpSurface,
    if (!sv)
       return NULL;
 
-   return sv->texture;
+   pipe_resource_reference(&res, sv->texture);
+   return res;
 }
 
 static struct pipe_resource *
@@ -90,13 +92,15 @@ st_vdpau_output_surface_gallium(struct gl_context *ctx, const void *vdpSurface)
 {
    int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
    uint32_t device = (uintptr_t)ctx->vdpDevice;
+   struct pipe_resource *res = NULL;
    VdpOutputSurfaceGallium *f;
 
    getProcAddr = (void *)ctx->vdpGetProcAddress;
    if (getProcAddr(device, VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM, (void**)&f))
       return NULL;
 
-   return f((uintptr_t)vdpSurface);
+   pipe_resource_reference(&res, f((uintptr_t)vdpSurface));
+   return res;
 }
 
 static struct pipe_resource *
@@ -208,6 +212,7 @@ st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access,
    /* do we have different screen objects ? */
    if (res->screen != st->pipe->screen) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
+      pipe_resource_reference(&res, NULL);
       return;
    }
 
@@ -241,6 +246,7 @@ st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access,
    stObj->surface_format = res->format;
 
    _mesa_dirty_texobj(ctx, texObj);
+   pipe_resource_reference(&res, NULL);
 }
 
 static void
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 87ed7f7..96ed84f 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -814,6 +814,7 @@ vbo_validated_drawrangeelements(struct gl_context *ctx, GLenum mode,
    prim[0].basevertex = basevertex;
    prim[0].num_instances = numInstances;
    prim[0].base_instance = baseInstance;
+   prim[0].draw_id = 0;
 
    /* Need to give special consideration to rendering a range of
     * indices starting somewhere above zero.  Typically the
diff --git a/src/mesa/vbo/vbo_save_api.c b/src/mesa/vbo/vbo_save_api.c
index 97a1dfd..fafdf1d 100644
--- a/src/mesa/vbo/vbo_save_api.c
+++ b/src/mesa/vbo/vbo_save_api.c
@@ -1167,8 +1167,8 @@ _save_OBE_DrawArrays(GLenum mode, GLint start, GLsizei count)
  * then emitting an indexed prim at runtime.
  */
 static void GLAPIENTRY
-_save_OBE_DrawElements(GLenum mode, GLsizei count, GLenum type,
-                       const GLvoid * indices)
+_save_OBE_DrawElementsBaseVertex(GLenum mode, GLsizei count, GLenum type,
+                                 const GLvoid * indices, GLint basevertex)
 {
    GET_CURRENT_CONTEXT(ctx);
    struct vbo_save_context *save = &vbo_context(ctx)->save;
@@ -1205,15 +1205,15 @@ _save_OBE_DrawElements(GLenum mode, GLsizei count, GLenum type,
    switch (type) {
    case GL_UNSIGNED_BYTE:
       for (i = 0; i < count; i++)
-         CALL_ArrayElement(GET_DISPATCH(), (((GLubyte *) indices)[i]));
+         CALL_ArrayElement(GET_DISPATCH(), (basevertex + ((GLubyte *) indices)[i]));
       break;
    case GL_UNSIGNED_SHORT:
       for (i = 0; i < count; i++)
-         CALL_ArrayElement(GET_DISPATCH(), (((GLushort *) indices)[i]));
+         CALL_ArrayElement(GET_DISPATCH(), (basevertex + ((GLushort *) indices)[i]));
       break;
    case GL_UNSIGNED_INT:
       for (i = 0; i < count; i++)
-         CALL_ArrayElement(GET_DISPATCH(), (((GLuint *) indices)[i]));
+         CALL_ArrayElement(GET_DISPATCH(), (basevertex + ((GLuint *) indices)[i]));
       break;
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glDrawElements(type)");
@@ -1225,6 +1225,13 @@ _save_OBE_DrawElements(GLenum mode, GLsizei count, GLenum type,
    _ae_unmap_vbos(ctx);
 }
 
+static void GLAPIENTRY
+_save_OBE_DrawElements(GLenum mode, GLsizei count, GLenum type,
+                       const GLvoid * indices)
+{
+   _save_OBE_DrawElementsBaseVertex(mode, count, type, indices, 0);
+}
+
 
 static void GLAPIENTRY
 _save_OBE_DrawRangeElements(GLenum mode, GLuint start, GLuint end,
@@ -1462,6 +1469,7 @@ vbo_initialize_save_dispatch(const struct gl_context *ctx,
 {
    SET_DrawArrays(exec, _save_OBE_DrawArrays);
    SET_DrawElements(exec, _save_OBE_DrawElements);
+   SET_DrawElementsBaseVertex(exec, _save_OBE_DrawElementsBaseVertex);
    SET_DrawRangeElements(exec, _save_OBE_DrawRangeElements);
    SET_MultiDrawElementsEXT(exec, _save_OBE_MultiDrawElements);
    SET_MultiDrawElementsBaseVertex(exec, _save_OBE_MultiDrawElementsBaseVertex);
diff --git a/src/mesa/vbo/vbo_split_copy.c b/src/mesa/vbo/vbo_split_copy.c
index cb27ef9..1c3474c 100644
--- a/src/mesa/vbo/vbo_split_copy.c
+++ b/src/mesa/vbo/vbo_split_copy.c
@@ -243,7 +243,7 @@ begin( struct copy_context *copy, GLenum mode, GLboolean begin_flag )
 static GLuint
 elt(struct copy_context *copy, GLuint elt_idx)
 {
-   GLuint elt = copy->srcelt[elt_idx];
+   GLuint elt = copy->srcelt[elt_idx] + copy->prim->basevertex;
    GLuint slot = elt & (ELT_TABLE_SIZE-1);
 
 /*    printf("elt %d\n", elt); */
author	Wolfgang Wiedmeyer <wolfgit@wiedmeyer.de>	2016-12-17 03:40:28 +0100
committer	Wolfgang Wiedmeyer <wolfgit@wiedmeyer.de>	2016-12-17 03:40:28 +0100
commit	ef9a82038acd73936830671dbe43205c28a2151d (patch)
tree	90be2cdd9f48750c18b669ca2ab9553575d9f822
parent	f84f60446aebaeee8a1df741328cbd4a30dd24ea (diff)
parent	743c2327b167b95046e02af4c7b2f7a282a0943d (diff)
download	external_mesa3d-replicant-6.0-old.zip external_mesa3d-replicant-6.0-old.tar.gz external_mesa3d-replicant-6.0-old.tar.bz2