87 files changed, 4354 insertions, 2724 deletions
diff --git a/third_party/libwebp/README.chromium b/third_party/libwebp/README.chromium
index 460e9e0..a60ca93 100644
--- a/third_party/libwebp/README.chromium
+++ b/third_party/libwebp/README.chromium
@@ -1,14 +1,14 @@
 Name: WebP image encoder/decoder
 Short Name: libwebp
 URL: http://developers.google.com/speed/webp
-Version: v0.3.1
+Version: v0.4.0
 License: BSD
 License File: LICENSE
 Security Critical: Yes
 
 Description:
 Source archive:
-  http://code.google.com/p/webp/downloads/detail?name=libwebp-0.3.1.tar.gz
+  https://code.google.com/p/webp/downloads/detail?name=libwebp-0.4.0.tar.gz
 
 WebP is an image format that does both lossy and lossless compression of
 digital photographic images. WebP consists of a codec based on VP8, that Google
@@ -21,13 +21,4 @@ Local changes:
    the contents of src/ less mux/ which is unused.
  * Merged COPYING/PATENTS to LICENSE
 Cherry-picks:
-  f626fe2 Detect canvas and image size mismatch in decoder.
-  f5fbdee demux: stricter image bounds check
-  a03c351 Demux: WebPIterator now also denotes if the frame has alpha.
-  6284854b Support for "Do not blend" in mux and demux libraries
-  40ae352 Fix memleak in WebPIDelete()
-  92d47e4 improve VP8L signature detection by checking the version bits too
-  dde91fd Demux: Correct the extended format validation
-  96ad0e0 VPLBitReader: Catch error if bit_pos > LBITS.
-  e18e667 demux: strictly enforce the animation flag
-  61cb884 demux: (non-exp) fail if the fragmented flag is set
+  Revert patch f7fc4bc: dec/webp.c: don't wait for data before reporting w/h
diff --git a/third_party/libwebp/dec/alpha.c b/third_party/libwebp/dec/alpha.c
index b5e6891..93729a0 100644
--- a/third_party/libwebp/dec/alpha.c
+++ b/third_party/libwebp/dec/alpha.c
@@ -12,104 +12,150 @@
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include <stdlib.h>
+#include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
-#include "../utils/filters.h"
 #include "../utils/quant_levels_dec.h"
 #include "../webp/format_constants.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+//------------------------------------------------------------------------------
+// ALPHDecoder object.
+
+ALPHDecoder* ALPHNew(void) {  // returns NULL if the allocation fails
+  ALPHDecoder* const dec = (ALPHDecoder*)calloc(1, sizeof(*dec));
+  return dec;  // every field starts zero-filled (calloc)
+}
+
+void ALPHDelete(ALPHDecoder* const dec) {  // no-op when 'dec' is NULL
+  if (dec != NULL) {
+    VP8LDelete(dec->vp8l_dec_);  // hand the lossless sub-decoder back first
+    dec->vp8l_dec_ = NULL;
+    free(dec);  // 'dec' itself came from calloc() in ALPHNew()
+  }
+}
 
 //------------------------------------------------------------------------------
-// Decodes the compressed data 'data' of size 'data_size' into the 'output'.
-// The 'output' buffer should be pre-allocated and must be of the same
-// dimension 'height'x'width', as that of the image.
-//
-// Returns 1 on successfully decoding the compressed alpha and
-//         0 if either:
-//           error in bit-stream header (invalid compression mode or filter), or
-//           error returned by appropriate compression method.
-
-static int DecodeAlpha(const uint8_t* data, size_t data_size,
-                       int width, int height, uint8_t* output) {
-  WEBP_FILTER_TYPE filter;
-  int pre_processing;
-  int rsrv;
+// Decoding.
+
+// Initialize alpha decoding by parsing the alpha header and decoding the image
+// header for alpha data stored using lossless compression.
+// Returns false in case of error in alpha header (data too short, invalid
+// compression method or filter, error in lossless header data etc).
+static int ALPHInit(ALPHDecoder* const dec, const uint8_t* data,
+                    size_t data_size, int width, int height, uint8_t* output) {
   int ok = 0;
-  int method;
   const uint8_t* const alpha_data = data + ALPHA_HEADER_LEN;
   const size_t alpha_data_size = data_size - ALPHA_HEADER_LEN;
+  int rsrv;
 
   assert(width > 0 && height > 0);
   assert(data != NULL && output != NULL);
 
+  dec->width_ = width;
+  dec->height_ = height;
+
   if (data_size <= ALPHA_HEADER_LEN) {
     return 0;
   }
 
-  method = (data[0] >> 0) & 0x03;
-  filter = (data[0] >> 2) & 0x03;
-  pre_processing = (data[0] >> 4) & 0x03;
+  dec->method_ = (data[0] >> 0) & 0x03;
+  dec->filter_ = (data[0] >> 2) & 0x03;
+  dec->pre_processing_ = (data[0] >> 4) & 0x03;
   rsrv = (data[0] >> 6) & 0x03;
-  if (method < ALPHA_NO_COMPRESSION ||
-      method > ALPHA_LOSSLESS_COMPRESSION ||
-      filter >= WEBP_FILTER_LAST ||
-      pre_processing > ALPHA_PREPROCESSED_LEVELS ||
+  if (dec->method_ < ALPHA_NO_COMPRESSION ||
+      dec->method_ > ALPHA_LOSSLESS_COMPRESSION ||
+      dec->filter_ >= WEBP_FILTER_LAST ||
+      dec->pre_processing_ > ALPHA_PREPROCESSED_LEVELS ||
       rsrv != 0) {
     return 0;
   }
 
-  if (method == ALPHA_NO_COMPRESSION) {
-    const size_t alpha_decoded_size = height * width;
+  if (dec->method_ == ALPHA_NO_COMPRESSION) {
+    const size_t alpha_decoded_size = dec->width_ * dec->height_;
     ok = (alpha_data_size >= alpha_decoded_size);
-    if (ok) memcpy(output, alpha_data, alpha_decoded_size);
   } else {
-    ok = VP8LDecodeAlphaImageStream(width, height, alpha_data, alpha_data_size,
-                                    output);
+    assert(dec->method_ == ALPHA_LOSSLESS_COMPRESSION);
+    ok = VP8LDecodeAlphaHeader(dec, alpha_data, alpha_data_size, output);
   }
+  return ok;
+}
 
-  if (ok) {
-    WebPUnfilterFunc unfilter_func = WebPUnfilters[filter];
-    if (unfilter_func != NULL) {
-      // TODO(vikas): Implement on-the-fly decoding & filter mechanism to decode
-      // and apply filter per image-row.
-      unfilter_func(width, height, width, output);
+// Decodes, unfilters and dequantizes *at least* 'num_rows' rows of alpha
+// starting from row number 'row'. It assumes that rows up to (row - 1) have
+// already been decoded.
+// Returns false in case of bitstream error.
+static int ALPHDecode(VP8Decoder* const dec, int row, int num_rows) {
+  ALPHDecoder* const alph_dec = dec->alph_dec_;
+  const int width = alph_dec->width_;
+  const int height = alph_dec->height_;
+  WebPUnfilterFunc unfilter_func = WebPUnfilters[alph_dec->filter_];
+  uint8_t* const output = dec->alpha_plane_;
+  if (alph_dec->method_ == ALPHA_NO_COMPRESSION) {
+    const size_t offset = row * width;
+    const size_t num_pixels = num_rows * width;
+    assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN + offset + num_pixels);
+    memcpy(dec->alpha_plane_ + offset,
+           dec->alpha_data_ + ALPHA_HEADER_LEN + offset, num_pixels);
+  } else {  // alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION
+    assert(alph_dec->vp8l_dec_ != NULL);
+    if (!VP8LDecodeAlphaImageStream(alph_dec, row + num_rows)) {
+      return 0;
     }
-    if (pre_processing == ALPHA_PREPROCESSED_LEVELS) {
-      ok = DequantizeLevels(output, width, height);
+  }
+
+  if (unfilter_func != NULL) {
+    unfilter_func(width, height, width, row, num_rows, output);
+  }
+
+  if (alph_dec->pre_processing_ == ALPHA_PREPROCESSED_LEVELS) {
+    if (!DequantizeLevels(output, width, height, row, num_rows)) {
+      return 0;
     }
   }
 
-  return ok;
+  if (row + num_rows == dec->pic_hdr_.height_) {
+    dec->is_alpha_decoded_ = 1;
+  }
+  return 1;
 }
 
 //------------------------------------------------------------------------------
+// Main entry point.
 
 const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec,
                                       int row, int num_rows) {
   const int width = dec->pic_hdr_.width_;
   const int height = dec->pic_hdr_.height_;
 
-  if (row < 0 || num_rows < 0 || row + num_rows > height) {
+  if (row < 0 || num_rows <= 0 || row + num_rows > height) {
     return NULL;    // sanity check.
   }
 
   if (row == 0) {
-    // Decode everything during the first call.
-    assert(!dec->is_alpha_decoded_);
-    if (!DecodeAlpha(dec->alpha_data_, (size_t)dec->alpha_data_size_,
-                     width, height, dec->alpha_plane_)) {
-      return NULL;  // Error.
+    // Initialize decoding.
+    assert(dec->alpha_plane_ != NULL);
+    dec->alph_dec_ = ALPHNew();
+    if (dec->alph_dec_ == NULL) return NULL;
+    if (!ALPHInit(dec->alph_dec_, dec->alpha_data_, dec->alpha_data_size_,
+                  width, height, dec->alpha_plane_)) {
+      ALPHDelete(dec->alph_dec_);
+      dec->alph_dec_ = NULL;
+      return NULL;
     }
-    dec->is_alpha_decoded_ = 1;
+  }
+
+  if (!dec->is_alpha_decoded_) {
+    int ok = 0;
+    assert(dec->alph_dec_ != NULL);
+    ok = ALPHDecode(dec, row, num_rows);
+    if (!ok || dec->is_alpha_decoded_) {
+      ALPHDelete(dec->alph_dec_);
+      dec->alph_dec_ = NULL;
+    }
+    if (!ok) return NULL;  // Error.
   }
 
   // Return a pointer to the current decoded row.
   return dec->alpha_plane_ + row * width;
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dec/alphai.h b/third_party/libwebp/dec/alphai.h
new file mode 100644
index 0000000..5fa230c
--- /dev/null
+++ b/third_party/libwebp/dec/alphai.h
@@ -0,0 +1,55 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Alpha decoder: internal header.
+//
+// Author: Urvang (urvang@google.com)
+
+#ifndef WEBP_DEC_ALPHAI_H_
+#define WEBP_DEC_ALPHAI_H_
+
+#include "./webpi.h"
+#include "../utils/filters.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct VP8LDecoder;  // Defined in dec/vp8li.h.
+
+typedef struct ALPHDecoder ALPHDecoder;
+struct ALPHDecoder {
+  int width_;                // plane width, stored by ALPHInit()
+  int height_;               // plane height, stored by ALPHInit()
+  int method_;               // bits 0-1 of the first alpha header byte
+  WEBP_FILTER_TYPE filter_;  // bits 2-3 of the first alpha header byte
+  int pre_processing_;       // bits 4-5 of the first alpha header byte
+  struct VP8LDecoder* vp8l_dec_;  // passed to VP8LDelete() by ALPHDelete()
+  VP8Io io_;
+  int use_8b_decode;  // Although alpha channel requires only 1 byte per
+                      // pixel, sometimes VP8LDecoder may need to allocate
+                      // 4 bytes per pixel internally during decode.
+};
+
+//------------------------------------------------------------------------------
+// internal functions. Not public.
+
+// Allocates a new alpha decoder instance.
+ALPHDecoder* ALPHNew(void);
+
+// Clears and deallocates an alpha decoder instance.
+void ALPHDelete(ALPHDecoder* const dec);
+
+//------------------------------------------------------------------------------
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_DEC_ALPHAI_H_ */
diff --git a/third_party/libwebp/dec/buffer.c b/third_party/libwebp/dec/buffer.c
index 3855715..1e852ef 100644
--- a/third_party/libwebp/dec/buffer.c
+++ b/third_party/libwebp/dec/buffer.c
@@ -17,10 +17,6 @@
 #include "./webpi.h"
 #include "../utils/utils.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // WebPDecBuffer
 
@@ -212,6 +208,3 @@ void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dec/decode_vp8.h b/third_party/libwebp/dec/decode_vp8.h
index acdb15a..b9337bb 100644
--- a/third_party/libwebp/dec/decode_vp8.h
+++ b/third_party/libwebp/dec/decode_vp8.h
@@ -16,7 +16,7 @@
 
 #include "../webp/decode.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -132,7 +132,8 @@ static WEBP_INLINE int VP8InitIo(VP8Io* const io) {
   return VP8InitIoInternal(io, WEBP_DECODER_ABI_VERSION);
 }
 
-// Start decoding a new picture. Returns true if ok.
+// Decode the VP8 frame header. Returns true if ok.
+// Note: 'io->data' must be pointing to the start of the VP8 frame header.
 int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io);
 
 // Decode a picture. Will call VP8GetHeaders() if it wasn't done already.
@@ -177,7 +178,7 @@ WEBP_EXTERN(int) VP8LGetInfo(
     const uint8_t* data, size_t data_size,  // data available so far
     int* const width, int* const height, int* const has_alpha);
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/dec/frame.c b/third_party/libwebp/dec/frame.c
index 5f6a7d9..e1eea94 100644
--- a/third_party/libwebp/dec/frame.c
+++ b/third_party/libwebp/dec/frame.c
@@ -15,12 +15,11 @@
 #include "./vp8i.h"
 #include "../utils/utils.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #define ALIGN_MASK (32 - 1)
 
+static void ReconstructRow(const VP8Decoder* const dec,
+                           const VP8ThreadContext* ctx);  // TODO(skal): remove
+
 //------------------------------------------------------------------------------
 // Filtering
 
@@ -31,25 +30,18 @@ extern "C" {
 //                 U/V, so it's 8 samples total (because of the 2x upsampling).
 static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 };
 
-static WEBP_INLINE int hev_thresh_from_level(int level, int keyframe) {
-  if (keyframe) {
-    return (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
-  } else {
-    return (level >= 40) ? 3 : (level >= 20) ? 2 : (level >= 15) ? 1 : 0;
-  }
-}
-
 static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
   const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  const int cache_id = ctx->id_;
   const int y_bps = dec->cache_y_stride_;
-  VP8FInfo* const f_info = ctx->f_info_ + mb_x;
-  uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16;
-  const int level = f_info->f_level_;
+  const VP8FInfo* const f_info = ctx->f_info_ + mb_x;
+  uint8_t* const y_dst = dec->cache_y_ + cache_id * 16 * y_bps + mb_x * 16;
   const int ilevel = f_info->f_ilevel_;
-  const int limit = 2 * level + ilevel;
-  if (level == 0) {
+  const int limit = f_info->f_limit_;
+  if (limit == 0) {
     return;
   }
+  assert(limit >= 3);
   if (dec->filter_type_ == 1) {   // simple
     if (mb_x > 0) {
       VP8SimpleHFilter16(y_dst, y_bps, limit + 4);
@@ -65,10 +57,9 @@ static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) {
     }
   } else {    // complex
     const int uv_bps = dec->cache_uv_stride_;
-    uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
-    uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8;
-    const int hev_thresh =
-        hev_thresh_from_level(level, dec->frm_hdr_.key_frame_);
+    uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
+    uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
+    const int hev_thresh = f_info->hev_thresh_;
     if (mb_x > 0) {
       VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh);
       VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh);
@@ -128,26 +119,108 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
           }
         }
         level = (level < 0) ? 0 : (level > 63) ? 63 : level;
-        info->f_level_ = level;
-
-        if (hdr->sharpness_ > 0) {
-          if (hdr->sharpness_ > 4) {
-            level >>= 2;
-          } else {
-            level >>= 1;
-          }
-          if (level > 9 - hdr->sharpness_) {
-            level = 9 - hdr->sharpness_;
+        if (level > 0) {
+          int ilevel = level;
+          if (hdr->sharpness_ > 0) {
+            if (hdr->sharpness_ > 4) {
+              ilevel >>= 2;
+            } else {
+              ilevel >>= 1;
+            }
+            if (ilevel > 9 - hdr->sharpness_) {
+              ilevel = 9 - hdr->sharpness_;
+            }
           }
+          if (ilevel < 1) ilevel = 1;
+          info->f_ilevel_ = ilevel;
+          info->f_limit_ = 2 * level + ilevel;
+          info->hev_thresh_ = (level >= 40) ? 2 : (level >= 15) ? 1 : 0;
+        } else {
+          info->f_limit_ = 0;  // no filtering
         }
-        info->f_ilevel_ = (level < 1) ? 1 : level;
-        info->f_inner_ = 0;
+        info->f_inner_ = i4x4;
       }
     }
   }
 }
 
 //------------------------------------------------------------------------------
+// Dithering
+
+#define DITHER_AMP_TAB_SIZE 12
+static const int kQuantToDitherAmp[DITHER_AMP_TAB_SIZE] = {
+  // roughly, it's dqm->uv_mat_[1]
+  8, 7, 6, 4, 4, 2, 2, 2, 1, 1, 1, 1
+};
+
+void VP8InitDithering(const WebPDecoderOptions* const options,
+                      VP8Decoder* const dec) {
+  assert(dec != NULL);
+  if (options != NULL) {  // NULL options: dithering state is not touched
+    const int d = options->dithering_strength;  // clamped to [0, 100] below
+    const int max_amp = (1 << VP8_RANDOM_DITHER_FIX) - 1;
+    const int f = (d < 0) ? 0 : (d > 100) ? max_amp : (d * max_amp / 100);
+    if (f > 0) {
+      int s;
+      int all_amp = 0;  // bitwise OR of every segment's dither_ value
+      for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
+        VP8QuantMatrix* const dqm = &dec->dqm_[s];
+        if (dqm->uv_quant_ < DITHER_AMP_TAB_SIZE) {
+          // TODO(skal): should we specially dither more for uv_quant_ < 0?
+          const int idx = (dqm->uv_quant_ < 0) ? 0 : dqm->uv_quant_;
+          dqm->dither_ = (f * kQuantToDitherAmp[idx]) >> 3;
+        }
+        all_amp |= dqm->dither_;  // NOTE(review): unset if branch skipped?
+      }
+      if (all_amp != 0) {  // at least one segment has a non-zero amplitude
+        VP8InitRandom(&dec->dithering_rg_, 1.0f);
+        dec->dither_ = 1;  // FinishRow() tests this before DitherRow()
+      }
+    }
+  }
+}
+
+// minimal amp that will provide a non-zero dithering effect
+#define MIN_DITHER_AMP 4
+#define DITHER_DESCALE 4
+#define DITHER_DESCALE_ROUNDER (1 << (DITHER_DESCALE - 1))
+#define DITHER_AMP_BITS 8
+#define DITHER_AMP_CENTER (1 << DITHER_AMP_BITS)
+
+static void Dither8x8(VP8Random* const rg, uint8_t* dst, int bps, int amp) {
+  int i, j;
+  for (j = 0; j < 8; ++j) {  // 8 rows, consecutive rows are 'bps' bytes apart
+    for (i = 0; i < 8; ++i) {
+      // TODO: could be made faster with SSE2
+      const int bits =
+          VP8RandomBits2(rg, DITHER_AMP_BITS + 1, amp) - DITHER_AMP_CENTER;
+      // Convert to range: [-2,2] for dither=50, [-4,4] for dither=100
+      const int delta = (bits + DITHER_DESCALE_ROUNDER) >> DITHER_DESCALE;
+      const int v = (int)dst[i] + delta;
+      dst[i] = (v < 0) ? 0 : (v > 255) ? 255u : (uint8_t)v;  // clamp to 8 bits
+    }
+    dst += bps;
+  }
+}
+
+static void DitherRow(VP8Decoder* const dec) {
+  int mb_x;
+  assert(dec->dither_);  // FinishRow() only calls this when dither_ is set
+  for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) {
+    const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+    const VP8MBData* const data = ctx->mb_data_ + mb_x;
+    const int cache_id = ctx->id_;
+    const int uv_bps = dec->cache_uv_stride_;
+    if (data->dither_ >= MIN_DITHER_AMP) {  // smaller amplitudes are skipped
+      uint8_t* const u_dst = dec->cache_u_ + cache_id * 8 * uv_bps + mb_x * 8;
+      uint8_t* const v_dst = dec->cache_v_ + cache_id * 8 * uv_bps + mb_x * 8;
+      Dither8x8(&dec->dithering_rg_, u_dst, uv_bps, data->dither_);  // U block
+      Dither8x8(&dec->dithering_rg_, v_dst, uv_bps, data->dither_);  // V block
+    }
+  }
+}
+
+//------------------------------------------------------------------------------
 // This function is called after a row of macroblocks is finished decoding.
 // It also takes into account the following restrictions:
 //  * In case of in-loop filtering, we must hold off sending some of the bottom
@@ -164,25 +237,35 @@ static void PrecomputeFilterStrengths(VP8Decoder* const dec) {
 static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
   int ok = 1;
   const VP8ThreadContext* const ctx = &dec->thread_ctx_;
+  const int cache_id = ctx->id_;
   const int extra_y_rows = kFilterExtraRows[dec->filter_type_];
   const int ysize = extra_y_rows * dec->cache_y_stride_;
   const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_;
-  const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_;
-  const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_;
+  const int y_offset = cache_id * 16 * dec->cache_y_stride_;
+  const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
   uint8_t* const ydst = dec->cache_y_ - ysize + y_offset;
   uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset;
   uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset;
-  const int first_row = (ctx->mb_y_ == 0);
-  const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1);
-  int y_start = MACROBLOCK_VPOS(ctx->mb_y_);
-  int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1);
+  const int mb_y = ctx->mb_y_;
+  const int is_first_row = (mb_y == 0);
+  const int is_last_row = (mb_y >= dec->br_mb_y_ - 1);
+
+  if (dec->mt_method_ == 2) {
+    ReconstructRow(dec, ctx);
+  }
 
   if (ctx->filter_row_) {
     FilterRow(dec);
   }
 
-  if (io->put) {
-    if (!first_row) {
+  if (dec->dither_) {
+    DitherRow(dec);
+  }
+
+  if (io->put != NULL) {
+    int y_start = MACROBLOCK_VPOS(mb_y);
+    int y_end = MACROBLOCK_VPOS(mb_y + 1);
+    if (!is_first_row) {
       y_start -= extra_y_rows;
       io->y = ydst;
       io->u = udst;
@@ -193,7 +276,7 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
       io->v = dec->cache_v_ + uv_offset;
     }
 
-    if (!last_row) {
+    if (!is_last_row) {
       y_end -= extra_y_rows;
     }
     if (y_end > io->crop_bottom) {
@@ -201,11 +284,8 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
     }
     io->a = NULL;
     if (dec->alpha_data_ != NULL && y_start < y_end) {
-      // TODO(skal): several things to correct here:
-      // * testing presence of alpha with dec->alpha_data_ is not a good idea
-      // * we're actually decompressing the full plane only once. It should be
-      //   more obvious from signature.
-      // * we could free alpha_data_ right after this call, but we don't own.
+      // TODO(skal): testing presence of alpha with dec->alpha_data_ is not a
+      // good idea.
       io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start);
       if (io->a == NULL) {
         return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
@@ -237,8 +317,8 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
     }
   }
   // rotate top samples if needed
-  if (ctx->id_ + 1 == dec->num_caches_) {
-    if (!last_row) {
+  if (cache_id + 1 == dec->num_caches_) {
+    if (!is_last_row) {
       memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize);
       memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize);
       memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize);
@@ -255,10 +335,14 @@ static int FinishRow(VP8Decoder* const dec, VP8Io* const io) {
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
   int ok = 1;
   VP8ThreadContext* const ctx = &dec->thread_ctx_;
-  if (!dec->use_threads_) {
+  const int filter_row =
+      (dec->filter_type_ > 0) &&
+      (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
+  if (dec->mt_method_ == 0) {
     // ctx->id_ and ctx->f_info_ are already set
     ctx->mb_y_ = dec->mb_y_;
-    ctx->filter_row_ = dec->filter_row_;
+    ctx->filter_row_ = filter_row;
+    ReconstructRow(dec, ctx);
     ok = FinishRow(dec, io);
   } else {
     WebPWorker* const worker = &dec->worker_;
@@ -269,13 +353,21 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
       ctx->io_ = *io;
       ctx->id_ = dec->cache_id_;
       ctx->mb_y_ = dec->mb_y_;
-      ctx->filter_row_ = dec->filter_row_;
-      if (ctx->filter_row_) {    // just swap filter info
+      ctx->filter_row_ = filter_row;
+      if (dec->mt_method_ == 2) {  // swap macroblock data
+        VP8MBData* const tmp = ctx->mb_data_;
+        ctx->mb_data_ = dec->mb_data_;
+        dec->mb_data_ = tmp;
+      } else {
+        // perform reconstruction directly in main thread
+        ReconstructRow(dec, ctx);
+      }
+      if (filter_row) {            // swap filter info
         VP8FInfo* const tmp = ctx->f_info_;
         ctx->f_info_ = dec->f_info_;
         dec->f_info_ = tmp;
       }
-      WebPWorkerLaunch(worker);
+      WebPWorkerLaunch(worker);    // (reconstruct)+filter in parallel
       if (++dec->cache_id_ == dec->num_caches_) {
         dec->cache_id_ = 0;
       }
@@ -289,8 +381,8 @@ int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) {
 
 VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
   // Call setup() first. This may trigger additional decoding features on 'io'.
-  // Note: Afterward, we must call teardown() not matter what.
-  if (io->setup && !io->setup(io)) {
+  // Note: Afterward, we must call teardown() no matter what.
+  if (io->setup != NULL && !io->setup(io)) {
     VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed");
     return dec->status_;
   }
@@ -303,7 +395,7 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
 
   // Define the area where we can skip in-loop filtering, in case of cropping.
   //
-  // 'Simple' filter reads two luma samples outside of the macroblock and
+  // 'Simple' filter reads two luma samples outside of the macroblock
   // and filters one. It doesn't filter the chroma samples. Hence, we can
   // avoid doing the in-loop filtering before crop_top/crop_left position.
   // For the 'Complex' filter, 3 samples are read and up to 3 are filtered.
@@ -344,11 +436,11 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) {
 
 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
   int ok = 1;
-  if (dec->use_threads_) {
+  if (dec->mt_method_ > 0) {
     ok = WebPWorkerSync(&dec->worker_);
   }
 
-  if (io->teardown) {
+  if (io->teardown != NULL) {
     io->teardown(io);
   }
   return ok;
@@ -384,7 +476,7 @@ int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) {
 // Initialize multi/single-thread worker
 static int InitThreadContext(VP8Decoder* const dec) {
   dec->cache_id_ = 0;
-  if (dec->use_threads_) {
+  if (dec->mt_method_ > 0) {
     WebPWorker* const worker = &dec->worker_;
     if (!WebPWorkerReset(worker)) {
       return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY,
@@ -401,6 +493,28 @@ static int InitThreadContext(VP8Decoder* const dec) {
   return 1;
 }
 
+int VP8GetThreadMethod(const WebPDecoderOptions* const options,
+                       const WebPHeaderStructure* const headers,
+                       int width, int height) {
+  if (options == NULL || options->use_threads == 0) {
+    return 0;  // threading was not requested
+  }
+  (void)headers;  // otherwise referenced only inside the assert() below
+  (void)width;    // not referenced when WEBP_USE_THREAD is undefined
+  (void)height;   // only referenced inside the disabled '#if 0' block
+  assert(!headers->is_lossless);
+#if defined(WEBP_USE_THREAD)
+  if (width < MIN_WIDTH_FOR_THREADS) return 0;
+  // TODO(skal): tune the heuristic further
+#if 0
+  if (height < 2 * width) return 2;
+#endif
+  return 2;  // with method 2, FinishRow() also runs ReconstructRow()
+#else   // !WEBP_USE_THREAD
+  return 0;
+#endif
+}
+
 #undef MT_CACHE_LINES
 #undef ST_CACHE_LINES
 
@@ -412,14 +526,15 @@ static int AllocateMemory(VP8Decoder* const dec) {
   const int mb_w = dec->mb_w_;
   // Note: we use 'size_t' when there's no overflow risk, uint64_t otherwise.
   const size_t intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t);
-  const size_t top_size = (16 + 8 + 8) * mb_w;
+  const size_t top_size = sizeof(VP8TopSamples) * mb_w;
   const size_t mb_info_size = (mb_w + 1) * sizeof(VP8MB);
   const size_t f_info_size =
       (dec->filter_type_ > 0) ?
-          mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo)
+          mb_w * (dec->mt_method_ > 0 ? 2 : 1) * sizeof(VP8FInfo)
         : 0;
   const size_t yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_);
-  const size_t coeffs_size = 384 * sizeof(*dec->coeffs_);
+  const size_t mb_data_size =
+      (dec->mt_method_ == 2 ? 2 : 1) * mb_w * sizeof(*dec->mb_data_);
   const size_t cache_height = (16 * num_caches
                             + kFilterExtraRows[dec->filter_type_]) * 3 / 2;
   const size_t cache_size = top_size * cache_height;
@@ -428,7 +543,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
       (uint64_t)dec->pic_hdr_.width_ * dec->pic_hdr_.height_ : 0ULL;
   const uint64_t needed = (uint64_t)intra_pred_mode_size
                         + top_size + mb_info_size + f_info_size
-                        + yuv_size + coeffs_size
+                        + yuv_size + mb_data_size
                         + cache_size + alpha_size + ALIGN_MASK;
   uint8_t* mem;
 
@@ -449,12 +564,8 @@ static int AllocateMemory(VP8Decoder* const dec) {
   dec->intra_t_ = (uint8_t*)mem;
   mem += intra_pred_mode_size;
 
-  dec->y_t_ = (uint8_t*)mem;
-  mem += 16 * mb_w;
-  dec->u_t_ = (uint8_t*)mem;
-  mem += 8 * mb_w;
-  dec->v_t_ = (uint8_t*)mem;
-  mem += 8 * mb_w;
+  dec->yuv_t_ = (VP8TopSamples*)mem;
+  mem += top_size;
 
   dec->mb_info_ = ((VP8MB*)mem) + 1;
   mem += mb_info_size;
@@ -463,7 +574,7 @@ static int AllocateMemory(VP8Decoder* const dec) {
   mem += f_info_size;
   dec->thread_ctx_.id_ = 0;
   dec->thread_ctx_.f_info_ = dec->f_info_;
-  if (dec->use_threads_) {
+  if (dec->mt_method_ > 0) {
     // secondary cache line. The deblocking process need to make use of the
     // filtering strength from previous macroblock row, while the new ones
     // are being decoded in parallel. We'll just swap the pointers.
@@ -475,8 +586,12 @@ static int AllocateMemory(VP8Decoder* const dec) {
   dec->yuv_b_ = (uint8_t*)mem;
   mem += yuv_size;
 
-  dec->coeffs_ = (int16_t*)mem;
-  mem += coeffs_size;
+  dec->mb_data_ = (VP8MBData*)mem;
+  dec->thread_ctx_.mb_data_ = (VP8MBData*)mem;
+  if (dec->mt_method_ == 2) {
+    dec->thread_ctx_.mb_data_ += mb_w;
+  }
+  mem += mb_data_size;
 
   dec->cache_y_stride_ = 16 * mb_w;
   dec->cache_uv_stride_ = 8 * mb_w;
@@ -498,8 +613,9 @@ static int AllocateMemory(VP8Decoder* const dec) {
   mem += alpha_size;
   assert(mem <= (uint8_t*)dec->mem_ + dec->mem_size_);
 
-  // note: left-info is initialized once for all.
+  // note: left/top-info is initialized once for all.
   memset(dec->mb_info_ - 1, 0, mb_info_size);
+  VP8InitScanline(dec);   // initialize left too.
 
   // initialize top
   memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size);
@@ -536,159 +652,167 @@ static const int kScan[16] = {
   0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS
 };
 
-static WEBP_INLINE int CheckMode(VP8Decoder* const dec, int mode) {
+static int CheckMode(int mb_x, int mb_y, int mode) {
   if (mode == B_DC_PRED) {
-    if (dec->mb_x_ == 0) {
-      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
+    if (mb_x == 0) {
+      return (mb_y == 0) ? B_DC_PRED_NOTOPLEFT : B_DC_PRED_NOLEFT;
     } else {
-      return (dec->mb_y_ == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
+      return (mb_y == 0) ? B_DC_PRED_NOTOP : B_DC_PRED;
     }
   }
   return mode;
 }
 
-static WEBP_INLINE void Copy32b(uint8_t* dst, uint8_t* src) {
-  *(uint32_t*)dst = *(uint32_t*)src;
+static void Copy32b(uint8_t* dst, uint8_t* src) {
+  memcpy(dst, src, 4);
+}
+
+static WEBP_INLINE void DoTransform(uint32_t bits, const int16_t* const src,
+                                    uint8_t* const dst) {
+  switch (bits >> 30) {
+    case 3:
+      VP8Transform(src, dst, 0);
+      break;
+    case 2:
+      VP8TransformAC3(src, dst);
+      break;
+    case 1:
+      VP8TransformDC(src, dst);
+      break;
+    default:
+      break;
+  }
 }
 
-void VP8ReconstructBlock(VP8Decoder* const dec) {
+static void DoUVTransform(uint32_t bits, const int16_t* const src,
+                          uint8_t* const dst) {
+  if (bits & 0xff) {    // any non-zero coeff at all?
+    if (bits & 0xaa) {  // any non-zero AC coefficient?
+      VP8TransformUV(src, dst);   // note we don't use the AC3 variant for U/V
+    } else {
+      VP8TransformDCUV(src, dst);
+    }
+  }
+}
+
+static void ReconstructRow(const VP8Decoder* const dec,
+                           const VP8ThreadContext* ctx) {
   int j;
+  int mb_x;
+  const int mb_y = ctx->mb_y_;
+  const int cache_id = ctx->id_;
   uint8_t* const y_dst = dec->yuv_b_ + Y_OFF;
   uint8_t* const u_dst = dec->yuv_b_ + U_OFF;
   uint8_t* const v_dst = dec->yuv_b_ + V_OFF;
+  for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) {
+    const VP8MBData* const block = ctx->mb_data_ + mb_x;
 
-  // Rotate in the left samples from previously decoded block. We move four
-  // pixels at a time for alignment reason, and because of in-loop filter.
-  if (dec->mb_x_ > 0) {
-    for (j = -1; j < 16; ++j) {
-      Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
-    }
-    for (j = -1; j < 8; ++j) {
-      Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
-      Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
-    }
-  } else {
-    for (j = 0; j < 16; ++j) {
-      y_dst[j * BPS - 1] = 129;
-    }
-    for (j = 0; j < 8; ++j) {
-      u_dst[j * BPS - 1] = 129;
-      v_dst[j * BPS - 1] = 129;
-    }
-    // Init top-left sample on left column too
-    if (dec->mb_y_ > 0) {
-      y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
-    }
-  }
-  {
-    // bring top samples into the cache
-    uint8_t* const top_y = dec->y_t_ + dec->mb_x_ * 16;
-    uint8_t* const top_u = dec->u_t_ + dec->mb_x_ * 8;
-    uint8_t* const top_v = dec->v_t_ + dec->mb_x_ * 8;
-    const int16_t* coeffs = dec->coeffs_;
-    int n;
-
-    if (dec->mb_y_ > 0) {
-      memcpy(y_dst - BPS, top_y, 16);
-      memcpy(u_dst - BPS, top_u, 8);
-      memcpy(v_dst - BPS, top_v, 8);
-    } else if (dec->mb_x_ == 0) {
-      // we only need to do this init once at block (0,0).
-      // Afterward, it remains valid for the whole topmost row.
-      memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
-      memset(u_dst - BPS - 1, 127, 8 + 1);
-      memset(v_dst - BPS - 1, 127, 8 + 1);
+    // Rotate in the left samples from previously decoded block. We move four
+    // pixels at a time for alignment reason, and because of in-loop filter.
+    if (mb_x > 0) {
+      for (j = -1; j < 16; ++j) {
+        Copy32b(&y_dst[j * BPS - 4], &y_dst[j * BPS + 12]);
+      }
+      for (j = -1; j < 8; ++j) {
+        Copy32b(&u_dst[j * BPS - 4], &u_dst[j * BPS + 4]);
+        Copy32b(&v_dst[j * BPS - 4], &v_dst[j * BPS + 4]);
+      }
+    } else {
+      for (j = 0; j < 16; ++j) {
+        y_dst[j * BPS - 1] = 129;
+      }
+      for (j = 0; j < 8; ++j) {
+        u_dst[j * BPS - 1] = 129;
+        v_dst[j * BPS - 1] = 129;
+      }
+      // Init top-left sample on left column too
+      if (mb_y > 0) {
+        y_dst[-1 - BPS] = u_dst[-1 - BPS] = v_dst[-1 - BPS] = 129;
+      }
     }
+    {
+      // bring top samples into the cache
+      VP8TopSamples* const top_yuv = dec->yuv_t_ + mb_x;
+      const int16_t* const coeffs = block->coeffs_;
+      uint32_t bits = block->non_zero_y_;
+      int n;
+
+      if (mb_y > 0) {
+        memcpy(y_dst - BPS, top_yuv[0].y, 16);
+        memcpy(u_dst - BPS, top_yuv[0].u, 8);
+        memcpy(v_dst - BPS, top_yuv[0].v, 8);
+      } else if (mb_x == 0) {
+        // we only need to do this init once at block (0,0).
+        // Afterward, it remains valid for the whole topmost row.
+        memset(y_dst - BPS - 1, 127, 16 + 4 + 1);
+        memset(u_dst - BPS - 1, 127, 8 + 1);
+        memset(v_dst - BPS - 1, 127, 8 + 1);
+      }
 
-    // predict and add residuals
-
-    if (dec->is_i4x4_) {   // 4x4
-      uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
+      // predict and add residuals
+      if (block->is_i4x4_) {   // 4x4
+        uint32_t* const top_right = (uint32_t*)(y_dst - BPS + 16);
 
-      if (dec->mb_y_ > 0) {
-        if (dec->mb_x_ >= dec->mb_w_ - 1) {    // on rightmost border
-          top_right[0] = top_y[15] * 0x01010101u;
-        } else {
-          memcpy(top_right, top_y + 16, sizeof(*top_right));
-        }
-      }
-      // replicate the top-right pixels below
-      top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
-
-      // predict and add residues for all 4x4 blocks in turn.
-      for (n = 0; n < 16; n++) {
-        uint8_t* const dst = y_dst + kScan[n];
-        VP8PredLuma4[dec->imodes_[n]](dst);
-        if (dec->non_zero_ac_ & (1 << n)) {
-          VP8Transform(coeffs + n * 16, dst, 0);
-        } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
-          VP8TransformDC(coeffs + n * 16, dst);
+        if (mb_y > 0) {
+          if (mb_x >= dec->mb_w_ - 1) {    // on rightmost border
+            memset(top_right, top_yuv[0].y[15], sizeof(*top_right));
+          } else {
+            memcpy(top_right, top_yuv[1].y, sizeof(*top_right));
+          }
         }
-      }
-    } else {    // 16x16
-      const int pred_func = CheckMode(dec, dec->imodes_[0]);
-      VP8PredLuma16[pred_func](y_dst);
-      if (dec->non_zero_) {
-        for (n = 0; n < 16; n++) {
+        // replicate the top-right pixels below
+        top_right[BPS] = top_right[2 * BPS] = top_right[3 * BPS] = top_right[0];
+
+        // predict and add residuals for all 4x4 blocks in turn.
+        for (n = 0; n < 16; ++n, bits <<= 2) {
           uint8_t* const dst = y_dst + kScan[n];
-          if (dec->non_zero_ac_ & (1 << n)) {
-            VP8Transform(coeffs + n * 16, dst, 0);
-          } else if (dec->non_zero_ & (1 << n)) {  // only DC is present
-            VP8TransformDC(coeffs + n * 16, dst);
-          }
+          VP8PredLuma4[block->imodes_[n]](dst);
+          DoTransform(bits, coeffs + n * 16, dst);
         }
-      }
-    }
-    {
-      // Chroma
-      const int pred_func = CheckMode(dec, dec->uvmode_);
-      VP8PredChroma8[pred_func](u_dst);
-      VP8PredChroma8[pred_func](v_dst);
-
-      if (dec->non_zero_ & 0x0f0000) {   // chroma-U
-        const int16_t* const u_coeffs = dec->coeffs_ + 16 * 16;
-        if (dec->non_zero_ac_ & 0x0f0000) {
-          VP8TransformUV(u_coeffs, u_dst);
-        } else {
-          VP8TransformDCUV(u_coeffs, u_dst);
+      } else {    // 16x16
+        const int pred_func = CheckMode(mb_x, mb_y,
+                                        block->imodes_[0]);
+        VP8PredLuma16[pred_func](y_dst);
+        if (bits != 0) {
+          for (n = 0; n < 16; ++n, bits <<= 2) {
+            DoTransform(bits, coeffs + n * 16, y_dst + kScan[n]);
+          }
         }
       }
-      if (dec->non_zero_ & 0xf00000) {   // chroma-V
-        const int16_t* const v_coeffs = dec->coeffs_ + 20 * 16;
-        if (dec->non_zero_ac_ & 0xf00000) {
-          VP8TransformUV(v_coeffs, v_dst);
-        } else {
-          VP8TransformDCUV(v_coeffs, v_dst);
-        }
+      {
+        // Chroma
+        const uint32_t bits_uv = block->non_zero_uv_;
+        const int pred_func = CheckMode(mb_x, mb_y, block->uvmode_);
+        VP8PredChroma8[pred_func](u_dst);
+        VP8PredChroma8[pred_func](v_dst);
+        DoUVTransform(bits_uv >> 0, coeffs + 16 * 16, u_dst);
+        DoUVTransform(bits_uv >> 8, coeffs + 20 * 16, v_dst);
       }
 
       // stash away top samples for next block
-      if (dec->mb_y_ < dec->mb_h_ - 1) {
-        memcpy(top_y, y_dst + 15 * BPS, 16);
-        memcpy(top_u, u_dst +  7 * BPS,  8);
-        memcpy(top_v, v_dst +  7 * BPS,  8);
+      if (mb_y < dec->mb_h_ - 1) {
+        memcpy(top_yuv[0].y, y_dst + 15 * BPS, 16);
+        memcpy(top_yuv[0].u, u_dst +  7 * BPS,  8);
+        memcpy(top_yuv[0].v, v_dst +  7 * BPS,  8);
       }
     }
-  }
-  // Transfer reconstructed samples from yuv_b_ cache to final destination.
-  {
-    const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_;
-    const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_;
-    uint8_t* const y_out = dec->cache_y_ + dec->mb_x_ * 16 + y_offset;
-    uint8_t* const u_out = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset;
-    uint8_t* const v_out = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset;
-    for (j = 0; j < 16; ++j) {
-      memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
-    }
-    for (j = 0; j < 8; ++j) {
-      memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
-      memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
+    // Transfer reconstructed samples from yuv_b_ cache to final destination.
+    {
+      const int y_offset = cache_id * 16 * dec->cache_y_stride_;
+      const int uv_offset = cache_id * 8 * dec->cache_uv_stride_;
+      uint8_t* const y_out = dec->cache_y_ + mb_x * 16 + y_offset;
+      uint8_t* const u_out = dec->cache_u_ + mb_x * 8 + uv_offset;
+      uint8_t* const v_out = dec->cache_v_ + mb_x * 8 + uv_offset;
+      for (j = 0; j < 16; ++j) {
+        memcpy(y_out + j * dec->cache_y_stride_, y_dst + j * BPS, 16);
+      }
+      for (j = 0; j < 8; ++j) {
+        memcpy(u_out + j * dec->cache_uv_stride_, u_dst + j * BPS, 8);
+        memcpy(v_out + j * dec->cache_uv_stride_, v_dst + j * BPS, 8);
+      }
     }
   }
 }
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dec/idec.c b/third_party/libwebp/dec/idec.c
index 61635ec..40d5ff6 100644
--- a/third_party/libwebp/dec/idec.c
+++ b/third_party/libwebp/dec/idec.c
@@ -15,14 +15,11 @@
 #include <string.h>
 #include <stdlib.h>
 
+#include "./alphai.h"
 #include "./webpi.h"
 #include "./vp8i.h"
 #include "../utils/utils.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 // In append mode, buffer allocations increase as multiples of this value.
 // Needs to be a power of 2.
 #define CHUNK_SIZE 4096
@@ -31,11 +28,13 @@ extern "C" {
 //------------------------------------------------------------------------------
 // Data structures for memory and states
 
-// Decoding states. State normally flows like HEADER->PARTS0->DATA->DONE.
+// Decoding states. State normally flows as:
+// WEBP_HEADER->VP8_HEADER->VP8_PARTS0->VP8_DATA->DONE for a lossy image, and
+// WEBP_HEADER->VP8L_HEADER->VP8L_DATA->DONE for a lossless image.
 // If there is any error the decoder goes into state ERROR.
 typedef enum {
-  STATE_PRE_VP8,  // All data before that of the first VP8 chunk.
-  STATE_VP8_FRAME_HEADER,  // For VP8 Frame header (within VP8 chunk).
+  STATE_WEBP_HEADER,  // All the data before that of the VP8/VP8L chunk.
+  STATE_VP8_HEADER,   // The VP8 Frame header (within the VP8 chunk).
   STATE_VP8_PARTS0,
   STATE_VP8_DATA,
   STATE_VP8L_HEADER,
@@ -102,7 +101,7 @@ static WEBP_INLINE size_t MemDataSize(const MemBuffer* mem) {
 // Check if we need to preserve the compressed alpha data, as it may not have
 // been decoded yet.
 static int NeedCompressedAlpha(const WebPIDecoder* const idec) {
-  if (idec->state_ == STATE_PRE_VP8) {
+  if (idec->state_ == STATE_WEBP_HEADER) {
     // We haven't parsed the headers yet, so we don't know whether the image is
     // lossy or lossless. This also means that we haven't parsed the ALPH chunk.
     return 0;
@@ -111,7 +110,7 @@ static int NeedCompressedAlpha(const WebPIDecoder* const idec) {
     return 0;  // ALPH chunk is not present for lossless images.
   } else {
     const VP8Decoder* const dec = (VP8Decoder*)idec->dec_;
-    assert(dec != NULL);  // Must be true as idec->state_ != STATE_PRE_VP8.
+    assert(dec != NULL);  // Must be true as idec->state_ != STATE_WEBP_HEADER.
     return (dec->alpha_data_ != NULL) && !dec->is_alpha_decoded_;
   }
 }
@@ -141,7 +140,22 @@ static void DoRemap(WebPIDecoder* const idec, ptrdiff_t offset) {
       }
       assert(last_part >= 0);
       dec->parts_[last_part].buf_end_ = mem->buf_ + mem->end_;
-      if (NeedCompressedAlpha(idec)) dec->alpha_data_ += offset;
+      if (NeedCompressedAlpha(idec)) {
+        ALPHDecoder* const alph_dec = dec->alph_dec_;
+        dec->alpha_data_ += offset;
+        if (alph_dec != NULL) {
+          if (alph_dec->method_ == ALPHA_LOSSLESS_COMPRESSION) {
+            VP8LDecoder* const alph_vp8l_dec = alph_dec->vp8l_dec_;
+            assert(alph_vp8l_dec != NULL);
+            assert(dec->alpha_data_size_ >= ALPHA_HEADER_LEN);
+            VP8LBitReaderSetBuffer(&alph_vp8l_dec->br_,
+                                   dec->alpha_data_ + ALPHA_HEADER_LEN,
+                                   dec->alpha_data_size_ - ALPHA_HEADER_LEN);
+          } else {  // alph_dec->method_ == ALPHA_NO_COMPRESSION
+            // Nothing special to do in this case.
+          }
+        }
+      }
     } else {    // Resize lossless bitreader
       VP8LDecoder* const dec = (VP8LDecoder*)idec->dec_;
       VP8LBitReaderSetBuffer(&dec->br_, new_base, MemDataSize(mem));
@@ -268,7 +282,7 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec,
 static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) {
   if (idec->state_ == STATE_VP8_DATA) {
     VP8Io* const io = &idec->io_;
-    if (io->teardown) {
+    if (io->teardown != NULL) {
       io->teardown(io);
     }
   }
@@ -311,15 +325,9 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
       return VP8_STATUS_OUT_OF_MEMORY;
     }
     idec->dec_ = dec;
-#ifdef WEBP_USE_THREAD
-    dec->use_threads_ = (idec->params_.options != NULL) &&
-                        (idec->params_.options->use_threads > 0);
-#else
-    dec->use_threads_ = 0;
-#endif
     dec->alpha_data_ = headers.alpha_data;
     dec->alpha_data_size_ = headers.alpha_data_size;
-    ChangeState(idec, STATE_VP8_FRAME_HEADER, headers.offset);
+    ChangeState(idec, STATE_VP8_HEADER, headers.offset);
   } else {
     VP8LDecoder* const dec = VP8LNew();
     if (dec == NULL) {
@@ -334,13 +342,14 @@ static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) {
 static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) {
   const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_;
   const size_t curr_size = MemDataSize(&idec->mem_);
+  int width, height;
   uint32_t bits;
 
   if (curr_size < VP8_FRAME_HEADER_SIZE) {
     // Not enough data bytes to extract VP8 Frame Header.
     return VP8_STATUS_SUSPENDED;
   }
-  if (!VP8GetInfo(data, curr_size, idec->chunk_size_, NULL, NULL)) {
+  if (!VP8GetInfo(data, curr_size, idec->chunk_size_, &width, &height)) {
     return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR);
   }
 
@@ -407,7 +416,10 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) {
   if (dec->status_ != VP8_STATUS_OK) {
     return IDecError(idec, dec->status_);
   }
-
+  // This change must be done before calling VP8InitFrame()
+  dec->mt_method_ = VP8GetThreadMethod(params->options, NULL,
+                                       io->width, io->height);
+  VP8InitDithering(params->options, dec);
   if (!CopyParts0Data(idec)) {
     return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY);
   }
@@ -433,16 +445,11 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
   VP8Io* const io = &idec->io_;
 
   assert(dec->ready_);
-
   for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) {
     VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-    if (dec->mb_x_ == 0) {
-      VP8InitScanline(dec);
-    }
-    for (; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
+    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
       MBContext context;
       SaveContext(dec, token_br, &context);
-
       if (!VP8DecodeMB(dec, token_br)) {
         RestoreContext(&context, dec, token_br);
         // We shouldn't fail when MAX_MB data was available
@@ -451,19 +458,18 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
         }
         return VP8_STATUS_SUSPENDED;
       }
-      // Reconstruct and emit samples.
-      VP8ReconstructBlock(dec);
-
       // Release buffer only if there is only one partition
       if (dec->num_parts_ == 1) {
         idec->mem_.start_ = token_br->buf_ - idec->mem_.buf_;
         assert(idec->mem_.start_ <= idec->mem_.end_);
       }
     }
+    VP8InitScanline(dec);   // Prepare for next scanline
+
+    // Reconstruct, filter and emit the row.
     if (!VP8ProcessRow(dec, io)) {
       return IDecError(idec, VP8_STATUS_USER_ABORT);
     }
-    dec->mb_x_ = 0;
   }
   // Synchronize the thread and check for errors.
   if (!VP8ExitCritical(dec, io)) {
@@ -475,7 +481,8 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) {
   return VP8_STATUS_OK;
 }
 
-static int ErrorStatusLossless(WebPIDecoder* const idec, VP8StatusCode status) {
+static VP8StatusCode ErrorStatusLossless(WebPIDecoder* const idec,
+                                         VP8StatusCode status) {
   if (status == VP8_STATUS_SUSPENDED || status == VP8_STATUS_NOT_ENOUGH_DATA) {
     return VP8_STATUS_SUSPENDED;
   }
@@ -532,14 +539,14 @@ static VP8StatusCode DecodeVP8LData(WebPIDecoder* const idec) {
 static VP8StatusCode IDecode(WebPIDecoder* idec) {
   VP8StatusCode status = VP8_STATUS_SUSPENDED;
 
-  if (idec->state_ == STATE_PRE_VP8) {
+  if (idec->state_ == STATE_WEBP_HEADER) {
     status = DecodeWebPHeaders(idec);
   } else {
     if (idec->dec_ == NULL) {
       return VP8_STATUS_SUSPENDED;    // can't continue if we have no decoder.
     }
   }
-  if (idec->state_ == STATE_VP8_FRAME_HEADER) {
+  if (idec->state_ == STATE_VP8_HEADER) {
     status = DecodeVP8FrameHeader(idec);
   }
   if (idec->state_ == STATE_VP8_PARTS0) {
@@ -566,7 +573,7 @@ WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
     return NULL;
   }
 
-  idec->state_ = STATE_PRE_VP8;
+  idec->state_ = STATE_WEBP_HEADER;
   idec->chunk_size_ = 0;
 
   InitMemBuffer(&idec->mem_);
@@ -574,7 +581,8 @@ WebPIDecoder* WebPINewDecoder(WebPDecBuffer* output_buffer) {
   VP8InitIo(&idec->io_);
 
   WebPResetDecParams(&idec->params_);
-  idec->params_.output = output_buffer ? output_buffer : &idec->output_;
+  idec->params_.output = (output_buffer != NULL) ? output_buffer
+                                                 : &idec->output_;
   WebPInitCustomIo(&idec->params_, &idec->io_);  // Plug the I/O functions.
 
   return idec;
@@ -608,11 +616,11 @@ void WebPIDelete(WebPIDecoder* idec) {
     if (!idec->is_lossless_) {
       if (idec->state_ == STATE_VP8_DATA) {
         // Synchronize the thread, clean-up and check for errors.
-        VP8ExitCritical(idec->dec_, &idec->io_);
+        VP8ExitCritical((VP8Decoder*)idec->dec_, &idec->io_);
       }
-      VP8Delete(idec->dec_);
+      VP8Delete((VP8Decoder*)idec->dec_);
     } else {
-      VP8LDelete(idec->dec_);
+      VP8LDelete((VP8LDecoder*)idec->dec_);
     }
   }
   ClearMemBuffer(&idec->mem_);
@@ -827,7 +835,7 @@ int WebPISetIOHooks(WebPIDecoder* const idec,
                     VP8IoSetupHook setup,
                     VP8IoTeardownHook teardown,
                     void* user_data) {
-  if (idec == NULL || idec->state_ > STATE_PRE_VP8) {
+  if (idec == NULL || idec->state_ > STATE_WEBP_HEADER) {
     return 0;
   }
 
@@ -839,6 +847,3 @@ int WebPISetIOHooks(WebPIDecoder* const idec,
   return 1;
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dec/io.c b/third_party/libwebp/dec/io.c
index 63810b4..1ba376e 100644
--- a/third_party/libwebp/dec/io.c
+++ b/third_party/libwebp/dec/io.c
@@ -18,10 +18,6 @@
 #include "../dsp/dsp.h"
 #include "../dsp/yuv.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // Main YUV<->RGB conversion functions
 
@@ -119,7 +115,7 @@ static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) {
 
   if (y == 0) {
     // First line is special cased. We mirror the u/v samples at boundary.
-    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w);
+    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, mb_w);
   } else {
     // We can finish the left-over line from previous call.
     upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v,
@@ -603,7 +599,7 @@ static int CustomPut(const VP8Io* io) {
     return 0;
   }
   num_lines_out = p->emit(io, p);
-  if (p->emit_alpha) {
+  if (p->emit_alpha != NULL) {
     p->emit_alpha(io, p);
   }
   p->last_y += num_lines_out;
@@ -630,6 +626,3 @@ void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dec/layer.c b/third_party/libwebp/dec/layer.c
index 9a4b2d9..dacb9e2 100644
--- a/third_party/libwebp/dec/layer.c
+++ b/third_party/libwebp/dec/layer.c
@@ -16,10 +16,6 @@
 
 #include "./vp8i.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 
 int VP8DecodeLayer(VP8Decoder* const dec) {
@@ -32,6 +28,3 @@ int VP8DecodeLayer(VP8Decoder* const dec) {
   return 1;
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dec/quant.c b/third_party/libwebp/dec/quant.c
index a4cc693..5b648f9 100644
--- a/third_party/libwebp/dec/quant.c
+++ b/third_party/libwebp/dec/quant.c
@@ -13,10 +13,6 @@
 
 #include "./vp8i.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 static WEBP_INLINE int clip(int v, int M) {
   return v < 0 ? 0 : v > M ? M : v;
 }
@@ -104,12 +100,11 @@ void VP8ParseQuant(VP8Decoder* const dec) {
 
       m->uv_mat_[0] = kDcTable[clip(q + dquv_dc, 117)];
       m->uv_mat_[1] = kAcTable[clip(q + dquv_ac, 127)];
+
+      m->uv_quant_ = q + dquv_ac;   // for dithering strength evaluation
     }
   }
 }
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dec/tree.c b/third_party/libwebp/dec/tree.c
index 3f02efe..bf9b7c5 100644
--- a/third_party/libwebp/dec/tree.c
+++ b/third_party/libwebp/dec/tree.c
@@ -15,10 +15,6 @@
 
 #define USE_GENERIC_TREE
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #ifdef USE_GENERIC_TREE
 static const int8_t kYModesIntra4[18] = {
   -B_DC_PRED, 1,
@@ -33,61 +29,12 @@ static const int8_t kYModesIntra4[18] = {
 };
 #endif
 
-#ifndef ONLY_KEYFRAME_CODE
-
-// inter prediction modes
-enum {
-  LEFT4 = 0, ABOVE4 = 1, ZERO4 = 2, NEW4 = 3,
-  NEARESTMV, NEARMV, ZEROMV, NEWMV, SPLITMV };
-
-static const int8_t kYModesInter[8] = {
-  -DC_PRED, 1,
-    2, 3,
-      -V_PRED, -H_PRED,
-      -TM_PRED, -B_PRED
-};
-
-static const int8_t kMBSplit[6] = {
-  -3, 1,
-    -2, 2,
-      -0, -1
-};
-
-static const int8_t kMVRef[8] = {
-  -ZEROMV, 1,
-    -NEARESTMV, 2,
-      -NEARMV, 3,
-        -NEWMV, -SPLITMV
-};
-
-static const int8_t kMVRef4[6] = {
-  -LEFT4, 1,
-    -ABOVE4, 2,
-      -ZERO4, -NEW4
-};
-#endif
-
 //------------------------------------------------------------------------------
 // Default probabilities
 
-// Inter
-#ifndef ONLY_KEYFRAME_CODE
-static const uint8_t kYModeProbaInter0[4] = { 112, 86, 140, 37 };
-static const uint8_t kUVModeProbaInter0[3] = { 162, 101, 204 };
-static const uint8_t kMVProba0[2][NUM_MV_PROBAS] = {
-  { 162, 128, 225, 146, 172, 147, 214,  39,
-    156, 128, 129, 132,  75, 145, 178, 206,
-    239, 254, 254 },
-  { 164, 128, 204, 170, 119, 235, 140, 230,
-    228, 128, 130, 130,  74, 148, 180, 203,
-    236, 254, 254 }
-};
-#endif
-
 // Paragraph 13.5
 static const uint8_t
   CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
-  // genereated using vp8_default_coef_probs() in entropy.c:129
   { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
@@ -328,28 +275,25 @@ static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
 
 void VP8ResetProba(VP8Proba* const proba) {
   memset(proba->segments_, 255u, sizeof(proba->segments_));
-  memcpy(proba->coeffs_, CoeffsProba0, sizeof(CoeffsProba0));
-#ifndef ONLY_KEYFRAME_CODE
-  memcpy(proba->mv_, kMVProba0, sizeof(kMVProba0));
-  memcpy(proba->ymode_, kYModeProbaInter0, sizeof(kYModeProbaInter0));
-  memcpy(proba->uvmode_, kUVModeProbaInter0, sizeof(kUVModeProbaInter0));
-#endif
+  // proba->bands_[][] is initialized later
 }
 
-void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
+void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) {
   uint8_t* const top = dec->intra_t_ + 4 * dec->mb_x_;
   uint8_t* const left = dec->intra_l_;
-  // Hardcoded 16x16 intra-mode decision tree.
-  dec->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
-  if (!dec->is_i4x4_) {
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
+
+  block->is_i4x4_ = !VP8GetBit(br, 145);   // decide for B_PRED first
+  if (!block->is_i4x4_) {
+    // Hardcoded 16x16 intra-mode decision tree.
     const int ymode =
         VP8GetBit(br, 156) ? (VP8GetBit(br, 128) ? TM_PRED : H_PRED)
                            : (VP8GetBit(br, 163) ? V_PRED : DC_PRED);
-    dec->imodes_[0] = ymode;
-    memset(top, ymode, 4 * sizeof(top[0]));
-    memset(left, ymode, 4 * sizeof(left[0]));
+    block->imodes_[0] = ymode;
+    memset(top, ymode, 4 * sizeof(*top));
+    memset(left, ymode, 4 * sizeof(*left));
   } else {
-    uint8_t* modes = dec->imodes_;
+    uint8_t* modes = block->imodes_;
     int y;
     for (y = 0; y < 4; ++y) {
       int ymode = left[y];
@@ -358,10 +302,10 @@ void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
         const uint8_t* const prob = kBModesProba[top[x]][ymode];
 #ifdef USE_GENERIC_TREE
         // Generic tree-parsing
-        int i = 0;
-        do {
+        int i = kYModesIntra4[VP8GetBit(br, prob[0])];
+        while (i > 0) {
           i = kYModesIntra4[2 * i + VP8GetBit(br, prob[i])];
-        } while (i > 0);
+        }
         ymode = -i;
 #else
         // Hardcoded tree parsing
@@ -376,15 +320,16 @@ void VP8ParseIntraMode(VP8BitReader* const br,  VP8Decoder* const dec) {
                             (!VP8GetBit(br, prob[8]) ? B_HD_PRED : B_HU_PRED)));
 #endif    // USE_GENERIC_TREE
         top[x] = ymode;
-        *modes++ = ymode;
       }
+      memcpy(modes, top, 4 * sizeof(*top));
+      modes += 4;
       left[y] = ymode;
     }
   }
   // Hardcoded UVMode decision tree
-  dec->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
-               : !VP8GetBit(br, 114) ? V_PRED
-               : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
+  block->uvmode_ = !VP8GetBit(br, 142) ? DC_PRED
+                 : !VP8GetBit(br, 114) ? V_PRED
+                 : VP8GetBit(br, 183) ? TM_PRED : H_PRED;
 }
 
 //------------------------------------------------------------------------------
@@ -526,17 +471,6 @@ static const uint8_t
   }
 };
 
-#ifndef ONLY_KEYFRAME_CODE
-static const uint8_t MVUpdateProba[2][NUM_MV_PROBAS] = {
-  { 237, 246, 253, 253, 254, 254, 254, 254,
-    254, 254, 254, 254, 254, 254, 250, 250,
-    252, 254, 254 },
-  { 231, 243, 245, 253, 254, 254, 254, 254,
-    254, 254, 254, 254, 254, 254, 251, 251,
-    254, 254, 254 }
-};
-#endif
-
 // Paragraph 9.9
 void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
   VP8Proba* const proba = &dec->proba_;
@@ -545,9 +479,9 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
     for (b = 0; b < NUM_BANDS; ++b) {
       for (c = 0; c < NUM_CTX; ++c) {
         for (p = 0; p < NUM_PROBAS; ++p) {
-          if (VP8GetBit(br, CoeffsUpdateProba[t][b][c][p])) {
-            proba->coeffs_[t][b][c][p] = VP8GetValue(br, 8);
-          }
+          const int v = VP8GetBit(br, CoeffsUpdateProba[t][b][c][p]) ?
+                        VP8GetValue(br, 8) : CoeffsProba0[t][b][c][p];
+          proba->bands_[t][b].probas_[c][p] = v;
         }
       }
     }
@@ -556,36 +490,5 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
   if (dec->use_skip_proba_) {
     dec->skip_p_ = VP8GetValue(br, 8);
   }
-#ifndef ONLY_KEYFRAME_CODE
-  if (!dec->frm_hdr_.key_frame_) {
-    int i;
-    dec->intra_p_ = VP8GetValue(br, 8);
-    dec->last_p_ = VP8GetValue(br, 8);
-    dec->golden_p_ = VP8GetValue(br, 8);
-    if (VP8Get(br)) {   // update y-mode
-      for (i = 0; i < 4; ++i) {
-        proba->ymode_[i] = VP8GetValue(br, 8);
-      }
-    }
-    if (VP8Get(br)) {   // update uv-mode
-      for (i = 0; i < 3; ++i) {
-        proba->uvmode_[i] = VP8GetValue(br, 8);
-      }
-    }
-    // update MV
-    for (i = 0; i < 2; ++i) {
-      int k;
-      for (k = 0; k < NUM_MV_PROBAS; ++k) {
-        if (VP8GetBit(br, MVUpdateProba[i][k])) {
-          const int v = VP8GetValue(br, 7);
-          proba->mv_[i][k] = v ? v << 1 : 1;
-        }
-      }
-    }
-  }
-#endif
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dec/vp8.c b/third_party/libwebp/dec/vp8.c
index 8632e48..bfd0e8f 100644
--- a/third_party/libwebp/dec/vp8.c
+++ b/third_party/libwebp/dec/vp8.c
@@ -13,15 +13,12 @@
 
 #include <stdlib.h>
 
+#include "./alphai.h"
 #include "./vp8i.h"
 #include "./vp8li.h"
 #include "./webpi.h"
 #include "../utils/bit_reader.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 
 int WebPGetDecoderVersion(void) {
@@ -123,6 +120,9 @@ int VP8GetInfo(const uint8_t* data, size_t data_size, size_t chunk_size,
     if (((bits >> 5)) >= chunk_size) {  // partition_length
       return 0;         // inconsistent size information.
     }
+    if (w == 0 || h == 0) {
+      return 0;         // Neither width nor height may be zero.
+    }
 
     if (width) {
       *width = w;
@@ -249,7 +249,6 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
   VP8PictureHeader* pic_hdr;
   VP8BitReader* br;
   VP8StatusCode status;
-  WebPHeaderStructure headers;
 
   if (dec == NULL) {
     return 0;
@@ -259,33 +258,8 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
     return VP8SetError(dec, VP8_STATUS_INVALID_PARAM,
                        "null VP8Io passed to VP8GetHeaders()");
   }
-
-  // Process Pre-VP8 chunks.
-  headers.data = io->data;
-  headers.data_size = io->data_size;
-  status = WebPParseHeaders(&headers);
-  if (status != VP8_STATUS_OK) {
-    return VP8SetError(dec, status, "Incorrect/incomplete header.");
-  }
-  if (headers.is_lossless) {
-    return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR,
-                       "Unexpected lossless format encountered.");
-  }
-
-  if (dec->alpha_data_ == NULL) {
-    assert(dec->alpha_data_size_ == 0);
-    // We have NOT set alpha data yet. Set it now.
-    // (This is to ensure that dec->alpha_data_ is NOT reset to NULL if
-    // WebPParseHeaders() is called more than once, as in incremental decoding
-    // case.)
-    dec->alpha_data_ = headers.alpha_data;
-    dec->alpha_data_size_ = headers.alpha_data_size;
-  }
-
-  // Process the VP8 frame header.
-  buf = headers.data + headers.offset;
-  buf_size = headers.data_size - headers.offset;
-  assert(headers.data_size >= headers.offset);  // WebPParseHeaders' guarantee
+  buf = io->data;
+  buf_size = io->data_size;
   if (buf_size < 4) {
     return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                        "Truncated header.");
@@ -381,38 +355,11 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
 
   // Frame buffer marking
   if (!frm_hdr->key_frame_) {
-    // Paragraph 9.7
-#ifndef ONLY_KEYFRAME_CODE
-    dec->buffer_flags_ = VP8Get(br) << 0;   // update golden
-    dec->buffer_flags_ |= VP8Get(br) << 1;  // update alt ref
-    if (!(dec->buffer_flags_ & 1)) {
-      dec->buffer_flags_ |= VP8GetValue(br, 2) << 2;
-    }
-    if (!(dec->buffer_flags_ & 2)) {
-      dec->buffer_flags_ |= VP8GetValue(br, 2) << 4;
-    }
-    dec->buffer_flags_ |= VP8Get(br) << 6;    // sign bias golden
-    dec->buffer_flags_ |= VP8Get(br) << 7;    // sign bias alt ref
-#else
     return VP8SetError(dec, VP8_STATUS_UNSUPPORTED_FEATURE,
                        "Not a key frame.");
-#endif
-  } else {
-    dec->buffer_flags_ = 0x003 | 0x100;
   }
 
-  // Paragraph 9.8
-#ifndef ONLY_KEYFRAME_CODE
-  dec->update_proba_ = VP8Get(br);
-  if (!dec->update_proba_) {    // save for later restore
-    dec->proba_saved_ = dec->proba_;
-  }
-  dec->buffer_flags_ &= 1 << 8;
-  dec->buffer_flags_ |=
-      (frm_hdr->key_frame_ || VP8Get(br)) << 8;    // refresh last frame
-#else
-  VP8Get(br);   // just ignore the value of update_proba_
-#endif
+  VP8Get(br);   // ignore the value of update_proba_
 
   VP8ParseProba(br, dec);
 
@@ -461,9 +408,6 @@ static const uint8_t kZigzag[16] = {
   0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };
 
-typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];  // for const-casting
-typedef const uint8_t (*ProbaCtxArray)[NUM_PROBAS];
-
 // See section 13-2: http://tools.ietf.org/html/rfc6386#section-13.2
 static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
   int v;
@@ -497,19 +441,20 @@ static int GetLargeValue(VP8BitReader* const br, const uint8_t* const p) {
 }
 
 // Returns the position of the last non-zero coeff plus one
-// (and 0 if there's no coeff at all)
-static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
+static int GetCoeffs(VP8BitReader* const br, const VP8BandProbas* const prob,
                      int ctx, const quant_t dq, int n, int16_t* out) {
   // n is either 0 or 1 here. kBands[n] is not necessary for extracting '*p'.
-  const uint8_t* p = prob[n][ctx];
-  if (!VP8GetBit(br, p[0])) {   // first EOB is more a 'CBP' bit.
-    return 0;
-  }
+  const uint8_t* p = prob[n].probas_[ctx];
   for (; n < 16; ++n) {
-    const ProbaCtxArray p_ctx = prob[kBands[n + 1]];
-    if (!VP8GetBit(br, p[1])) {
-      p = p_ctx[0];
-    } else {  // non zero coeff
+    if (!VP8GetBit(br, p[0])) {
+      return n;  // previous coeff was last non-zero coeff
+    }
+    while (!VP8GetBit(br, p[1])) {       // sequence of zero coeffs
+      p = prob[kBands[++n]].probas_[0];
+      if (n == 16) return 16;
+    }
+    {        // non zero coeff
+      const VP8ProbaArray* const p_ctx = &prob[kBands[n + 1]].probas_[0];
       int v;
       if (!VP8GetBit(br, p[2])) {
         v = 1;
@@ -519,115 +464,107 @@ static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
         p = p_ctx[2];
       }
       out[kZigzag[n]] = VP8GetSigned(br, v) * dq[n > 0];
-      if (n < 15 && !VP8GetBit(br, p[0])) {   // EOB
-        return n + 1;
-      }
     }
   }
   return 16;
 }
 
-// Alias-safe way of converting 4bytes to 32bits.
-typedef union {
-  uint8_t  i8[4];
-  uint32_t i32;
-} PackedNz;
-
-// Table to unpack four bits into four bytes
-static const PackedNz kUnpackTab[16] = {
-  {{0, 0, 0, 0}},  {{1, 0, 0, 0}},  {{0, 1, 0, 0}},  {{1, 1, 0, 0}},
-  {{0, 0, 1, 0}},  {{1, 0, 1, 0}},  {{0, 1, 1, 0}},  {{1, 1, 1, 0}},
-  {{0, 0, 0, 1}},  {{1, 0, 0, 1}},  {{0, 1, 0, 1}},  {{1, 1, 0, 1}},
-  {{0, 0, 1, 1}},  {{1, 0, 1, 1}},  {{0, 1, 1, 1}},  {{1, 1, 1, 1}} };
-
-// Macro to pack four LSB of four bytes into four bits.
-#if defined(__PPC__) || defined(_M_PPC) || defined(_ARCH_PPC) || \
-    defined(__BIG_ENDIAN__)
-#define PACK_CST 0x08040201U
-#else
-#define PACK_CST 0x01020408U
-#endif
-#define PACK(X, S) ((((X).i32 * PACK_CST) & 0xff000000) >> (S))
-
-static void ParseResiduals(VP8Decoder* const dec,
-                           VP8MB* const mb, VP8BitReader* const token_br) {
-  int out_t_nz, out_l_nz, first;
-  ProbaArray ac_prob;
-  const VP8QuantMatrix* q = &dec->dqm_[dec->segment_];
-  int16_t* dst = dec->coeffs_;
+static WEBP_INLINE uint32_t NzCodeBits(uint32_t nz_coeffs, int nz, int dc_nz) {
+  nz_coeffs <<= 2;  // make room for one more 2-bit code in the low bits
+  nz_coeffs |= (nz > 3) ? 3 : (nz > 1) ? 2 : dc_nz;  // 3, 2, or caller's dc_nz
+  return nz_coeffs;
+}
+
+static int ParseResiduals(VP8Decoder* const dec,
+                          VP8MB* const mb, VP8BitReader* const token_br) {
+  VP8BandProbas (* const bands)[NUM_BANDS] = dec->proba_.bands_;
+  const VP8BandProbas* ac_proba;
+  const VP8QuantMatrix* const q = &dec->dqm_[dec->segment_];
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
+  int16_t* dst = block->coeffs_;
   VP8MB* const left_mb = dec->mb_info_ - 1;
-  PackedNz nz_ac, nz_dc;
-  PackedNz tnz, lnz;
-  uint32_t non_zero_ac = 0;
-  uint32_t non_zero_dc = 0;
+  uint8_t tnz, lnz;
+  uint32_t non_zero_y = 0;
+  uint32_t non_zero_uv = 0;
   int x, y, ch;
+  uint32_t out_t_nz, out_l_nz;
+  int first;
 
-  nz_dc.i32 = nz_ac.i32 = 0;
   memset(dst, 0, 384 * sizeof(*dst));
-  if (!dec->is_i4x4_) {    // parse DC
+  if (!block->is_i4x4_) {    // parse DC
     int16_t dc[16] = { 0 };
-    const int ctx = mb->dc_nz_ + left_mb->dc_nz_;
-    mb->dc_nz_ = left_mb->dc_nz_ =
-        (GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[1],
-                   ctx, q->y2_mat_, 0, dc) > 0);
+    const int ctx = mb->nz_dc_ + left_mb->nz_dc_;
+    const int nz = GetCoeffs(token_br, bands[1], ctx, q->y2_mat_, 0, dc);
+    mb->nz_dc_ = left_mb->nz_dc_ = (nz > 0);
+    if (nz > 1) {   // more than just the DC -> perform the full transform
+      VP8TransformWHT(dc, dst);
+    } else {        // only DC is non-zero -> inlined simplified transform
+      int i;
+      const int dc0 = (dc[0] + 3) >> 3;
+      for (i = 0; i < 16 * 16; i += 16) dst[i] = dc0;
+    }
     first = 1;
-    ac_prob = (ProbaArray)dec->proba_.coeffs_[0];
-    VP8TransformWHT(dc, dst);
+    ac_proba = bands[0];
   } else {
     first = 0;
-    ac_prob = (ProbaArray)dec->proba_.coeffs_[3];
+    ac_proba = bands[3];
   }
 
-  tnz = kUnpackTab[mb->nz_ & 0xf];
-  lnz = kUnpackTab[left_mb->nz_ & 0xf];
+  tnz = mb->nz_ & 0x0f;
+  lnz = left_mb->nz_ & 0x0f;
   for (y = 0; y < 4; ++y) {
-    int l = lnz.i8[y];
+    int l = lnz & 1;
+    uint32_t nz_coeffs = 0;
     for (x = 0; x < 4; ++x) {
-      const int ctx = l + tnz.i8[x];
-      const int nz = GetCoeffs(token_br, ac_prob, ctx,
-                               q->y1_mat_, first, dst);
-      tnz.i8[x] = l = (nz > 0);
-      nz_dc.i8[x] = (dst[0] != 0);
-      nz_ac.i8[x] = (nz > 1);
+      const int ctx = l + (tnz & 1);
+      const int nz = GetCoeffs(token_br, ac_proba, ctx, q->y1_mat_, first, dst);
+      l = (nz > first);
+      tnz = (tnz >> 1) | (l << 7);
+      nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
       dst += 16;
     }
-    lnz.i8[y] = l;
-    non_zero_dc |= PACK(nz_dc, 24 - y * 4);
-    non_zero_ac |= PACK(nz_ac, 24 - y * 4);
+    tnz >>= 4;
+    lnz = (lnz >> 1) | (l << 7);
+    non_zero_y = (non_zero_y << 8) | nz_coeffs;
   }
-  out_t_nz = PACK(tnz, 24);
-  out_l_nz = PACK(lnz, 24);
+  out_t_nz = tnz;
+  out_l_nz = lnz >> 4;
 
-  tnz = kUnpackTab[mb->nz_ >> 4];
-  lnz = kUnpackTab[left_mb->nz_ >> 4];
   for (ch = 0; ch < 4; ch += 2) {
+    uint32_t nz_coeffs = 0;
+    tnz = mb->nz_ >> (4 + ch);
+    lnz = left_mb->nz_ >> (4 + ch);
     for (y = 0; y < 2; ++y) {
-      int l = lnz.i8[ch + y];
+      int l = lnz & 1;
       for (x = 0; x < 2; ++x) {
-        const int ctx = l + tnz.i8[ch + x];
-        const int nz =
-            GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[2],
-                      ctx, q->uv_mat_, 0, dst);
-        tnz.i8[ch + x] = l = (nz > 0);
-        nz_dc.i8[y * 2 + x] = (dst[0] != 0);
-        nz_ac.i8[y * 2 + x] = (nz > 1);
+        const int ctx = l + (tnz & 1);
+        const int nz = GetCoeffs(token_br, bands[2], ctx, q->uv_mat_, 0, dst);
+        l = (nz > 0);
+        tnz = (tnz >> 1) | (l << 3);
+        nz_coeffs = NzCodeBits(nz_coeffs, nz, dst[0] != 0);
         dst += 16;
       }
-      lnz.i8[ch + y] = l;
+      tnz >>= 2;
+      lnz = (lnz >> 1) | (l << 5);
     }
-    non_zero_dc |= PACK(nz_dc, 8 - ch * 2);
-    non_zero_ac |= PACK(nz_ac, 8 - ch * 2);
+    // Note: we don't really need the per-4x4 details for U/V blocks.
+    non_zero_uv |= nz_coeffs << (4 * ch);
+    out_t_nz |= (tnz << 4) << ch;
+    out_l_nz |= (lnz & 0xf0) << ch;
   }
-  out_t_nz |= PACK(tnz, 20);
-  out_l_nz |= PACK(lnz, 20);
   mb->nz_ = out_t_nz;
   left_mb->nz_ = out_l_nz;
 
-  dec->non_zero_ac_ = non_zero_ac;
-  dec->non_zero_ = non_zero_ac | non_zero_dc;
-  mb->skip_ = !dec->non_zero_;
+  block->non_zero_y_ = non_zero_y;
+  block->non_zero_uv_ = non_zero_uv;
+
+  // We look at the mode-code of each block and check if some blocks have less
+  // than three non-zero coeffs (code < 2). This is to avoid dithering flat and
+  // empty blocks.
+  block->dither_ = (non_zero_uv & 0xaaaa) ? 0 : q->dither_;
+
+  return !(non_zero_y | non_zero_uv);  // will be used for further optimization
 }
-#undef PACK
 
 //------------------------------------------------------------------------------
 // Main loop
@@ -635,7 +572,9 @@ static void ParseResiduals(VP8Decoder* const dec,
 int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
   VP8BitReader* const br = &dec->br_;
   VP8MB* const left = dec->mb_info_ - 1;
-  VP8MB* const info = dec->mb_info_ + dec->mb_x_;
+  VP8MB* const mb = dec->mb_info_ + dec->mb_x_;
+  VP8MBData* const block = dec->mb_data_ + dec->mb_x_;
+  int skip;
 
   // Note: we don't save segment map (yet), as we don't expect
   // to decode more than 1 keyframe.
@@ -645,71 +584,64 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) {
         VP8GetBit(br, dec->proba_.segments_[1]) :
         2 + VP8GetBit(br, dec->proba_.segments_[2]);
   }
-  info->skip_ = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0;
+  skip = dec->use_skip_proba_ ? VP8GetBit(br, dec->skip_p_) : 0;
 
   VP8ParseIntraMode(br, dec);
   if (br->eof_) {
     return 0;
   }
 
-  if (!info->skip_) {
-    ParseResiduals(dec, info, token_br);
+  if (!skip) {
+    skip = ParseResiduals(dec, mb, token_br);
   } else {
-    left->nz_ = info->nz_ = 0;
-    if (!dec->is_i4x4_) {
-      left->dc_nz_ = info->dc_nz_ = 0;
+    left->nz_ = mb->nz_ = 0;
+    if (!block->is_i4x4_) {
+      left->nz_dc_ = mb->nz_dc_ = 0;
     }
-    dec->non_zero_ = 0;
-    dec->non_zero_ac_ = 0;
+    block->non_zero_y_ = 0;
+    block->non_zero_uv_ = 0;
   }
 
   if (dec->filter_type_ > 0) {  // store filter info
     VP8FInfo* const finfo = dec->f_info_ + dec->mb_x_;
-    *finfo = dec->fstrengths_[dec->segment_][dec->is_i4x4_];
-    finfo->f_inner_ = (!info->skip_ || dec->is_i4x4_);
+    *finfo = dec->fstrengths_[dec->segment_][block->is_i4x4_];
+    finfo->f_inner_ |= !skip;
   }
 
-  return (!token_br->eof_);
+  return !token_br->eof_;
 }
 
 void VP8InitScanline(VP8Decoder* const dec) {
   VP8MB* const left = dec->mb_info_ - 1;
   left->nz_ = 0;
-  left->dc_nz_ = 0;
+  left->nz_dc_ = 0;
   memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_));
-  dec->filter_row_ =
-    (dec->filter_type_ > 0) &&
-    (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_);
+  dec->mb_x_ = 0;
 }
 
 static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
   for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) {
+    // Parse bitstream for this row.
     VP8BitReader* const token_br =
         &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)];
-    VP8InitScanline(dec);
-    for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_;  dec->mb_x_++) {
+    for (; dec->mb_x_ < dec->mb_w_; ++dec->mb_x_) {
       if (!VP8DecodeMB(dec, token_br)) {
         return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA,
                            "Premature end-of-file encountered.");
       }
-      // Reconstruct and emit samples.
-      VP8ReconstructBlock(dec);
     }
+    VP8InitScanline(dec);   // Prepare for next scanline
+
+    // Reconstruct, filter and emit the row.
     if (!VP8ProcessRow(dec, io)) {
       return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted.");
     }
   }
-  if (dec->use_threads_ && !WebPWorkerSync(&dec->worker_)) {
-    return 0;
+  if (dec->mt_method_ > 0) {
+    if (!WebPWorkerSync(&dec->worker_)) return 0;
   }
 
   // Finish
-#ifndef ONLY_KEYFRAME_CODE
-  if (!dec->update_proba_) {
-    dec->proba_ = dec->proba_saved_;
-  }
-#endif
-
 #ifdef WEBP_EXPERIMENTAL_FEATURES
   if (dec->layer_data_size_ > 0) {
     if (!VP8DecodeLayer(dec)) {
@@ -765,12 +697,12 @@ void VP8Clear(VP8Decoder* const dec) {
   if (dec == NULL) {
     return;
   }
-  if (dec->use_threads_) {
+  if (dec->mt_method_ > 0) {
     WebPWorkerEnd(&dec->worker_);
   }
-  if (dec->mem_) {
-    free(dec->mem_);
-  }
+  ALPHDelete(dec->alph_dec_);
+  dec->alph_dec_ = NULL;
+  free(dec->mem_);
   dec->mem_ = NULL;
   dec->mem_size_ = 0;
   memset(&dec->br_, 0, sizeof(dec->br_));
@@ -779,6 +711,3 @@ void VP8Clear(VP8Decoder* const dec) {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dec/vp8i.h b/third_party/libwebp/dec/vp8i.h
index 1d0d407..3f4cf29 100644
--- a/third_party/libwebp/dec/vp8i.h
+++ b/third_party/libwebp/dec/vp8i.h
@@ -17,10 +17,11 @@
 #include <string.h>     // for memcpy()
 #include "./vp8li.h"
 #include "../utils/bit_reader.h"
+#include "../utils/random.h"
 #include "../utils/thread.h"
 #include "../dsp/dsp.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -29,10 +30,8 @@ extern "C" {
 
 // version numbers
 #define DEC_MAJ_VERSION 0
-#define DEC_MIN_VERSION 3
-#define DEC_REV_VERSION 1
-
-#define ONLY_KEYFRAME_CODE      // to remove any code related to P-Frames
+#define DEC_MIN_VERSION 4
+#define DEC_REV_VERSION 0
 
 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@@ -100,6 +99,9 @@ enum { MB_FEATURE_TREE_PROBS = 3,
 #define U_OFF    (Y_OFF + BPS * 16 + BPS)
 #define V_OFF    (U_OFF + 16)
 
+// minimum width below which lossy multi-threading is always disabled
+#define MIN_WIDTH_FOR_THREADS 512
+
 //------------------------------------------------------------------------------
 // Headers
 
@@ -128,15 +130,19 @@ typedef struct {
   int8_t filter_strength_[NUM_MB_SEGMENTS];  // filter strength for segments
 } VP8SegmentHeader;
 
+
+// probas associated to one of the contexts
+typedef uint8_t VP8ProbaArray[NUM_PROBAS];
+
+typedef struct {   // all the probas associated to one band
+  VP8ProbaArray probas_[NUM_CTX];
+} VP8BandProbas;
+
 // Struct collecting all frame-persistent probabilities.
 typedef struct {
   uint8_t segments_[MB_FEATURE_TREE_PROBS];
   // Type: 0:Intra16-AC  1:Intra16-DC   2:Chroma   3:Intra4
-  uint8_t coeffs_[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS];
-#ifndef ONLY_KEYFRAME_CODE
-  uint8_t ymode_[4], uvmode_[3];
-  uint8_t mv_[2][NUM_MV_PROBAS];
-#endif
+  VP8BandProbas bands_[NUM_TYPES][NUM_BANDS];
 } VP8Proba;
 
 // Filter parameters
@@ -153,32 +159,59 @@ typedef struct {
 // Informations about the macroblocks.
 
 typedef struct {  // filter specs
-  unsigned int f_level_:6;      // filter strength: 0..63
-  unsigned int f_ilevel_:6;     // inner limit: 1..63
-  unsigned int f_inner_:1;      // do inner filtering?
+  uint8_t f_limit_;      // filter limit in [3..189], or 0 if no filtering
+  uint8_t f_ilevel_;     // inner limit in [1..63]
+  uint8_t f_inner_;      // do inner filtering?
+  uint8_t hev_thresh_;   // high edge variance threshold in [0..2]
 } VP8FInfo;
 
-typedef struct {  // used for syntax-parsing
-  unsigned int nz_:24;       // non-zero AC/DC coeffs (24bit)
-  unsigned int dc_nz_:1;     // non-zero DC coeffs
-  unsigned int skip_:1;      // block type
+typedef struct {  // Top/Left Contexts used for syntax-parsing
+  uint8_t nz_;        // non-zero AC/DC coeffs (4bit for luma + 4bit for chroma)
+  uint8_t nz_dc_;     // non-zero DC coeff (1bit)
 } VP8MB;
 
 // Dequantization matrices
 typedef int quant_t[2];      // [DC / AC].  Can be 'uint16_t[2]' too (~slower).
 typedef struct {
   quant_t y1_mat_, y2_mat_, uv_mat_;
+
+  int uv_quant_;   // U/V quantizer value
+  int dither_;     // dithering amplitude (0 = off, max=255)
 } VP8QuantMatrix;
 
+// Data needed to reconstruct a macroblock
+typedef struct {
+  int16_t coeffs_[384];   // 384 coeffs = (16+4+4) * 4*4
+  uint8_t is_i4x4_;       // true if intra4x4
+  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
+  uint8_t uvmode_;        // chroma prediction mode
+  // bit-wise info about the content of each 4x4 sub-block (in decoding order).
+  // Each of the 4x4 blocks for y/u/v is associated with a 2b code according to:
+  //   code=0 -> no coefficient
+  //   code=1 -> only DC
+  //   code=2 -> last non-zero coefficient is within the first three
+  //   code=3 -> some non-zero coefficient lies beyond the first three
+  // This allows calling specialized transform functions.
+  uint32_t non_zero_y_;
+  uint32_t non_zero_uv_;
+  uint8_t dither_;      // local dithering strength (deduced from non_zero_*)
+} VP8MBData;
+
 // Persistent information needed by the parallel processing
 typedef struct {
-  int id_;            // cache row to process (in [0..2])
-  int mb_y_;          // macroblock position of the row
-  int filter_row_;    // true if row-filtering is needed
-  VP8FInfo* f_info_;  // filter strengths
-  VP8Io io_;          // copy of the VP8Io to pass to put()
+  int id_;              // cache row to process (in [0..2])
+  int mb_y_;            // macroblock position of the row
+  int filter_row_;      // true if row-filtering is needed
+  VP8FInfo* f_info_;    // filter strengths (swapped with dec->f_info_)
+  VP8MBData* mb_data_;  // reconstruction data (swapped with dec->mb_data_)
+  VP8Io io_;            // copy of the VP8Io to pass to put()
 } VP8ThreadContext;
 
+// Saved top samples, per macroblock. Fits into a cache-line.
+typedef struct {
+  uint8_t y[16], u[8], v[8];
+} VP8TopSamples;
+
 //------------------------------------------------------------------------------
 // VP8Decoder: the main opaque structure handed over to user
 
@@ -198,7 +231,8 @@ struct VP8Decoder {
 
   // Worker
   WebPWorker worker_;
-  int use_threads_;    // use multi-thread
+  int mt_method_;      // multi-thread method: 0=off, 1=[parse+recon][filter]
+                       // 2=[parse][recon+filter]
   int cache_id_;       // current cache row
   int num_caches_;     // number of cached rows of 16 pixels (1, 2 or 3)
   VP8ThreadContext thread_ctx_;  // Thread context
@@ -215,12 +249,9 @@ struct VP8Decoder {
   // per-partition boolean decoders.
   VP8BitReader parts_[MAX_NUM_PARTITIONS];
 
-  // buffer refresh flags
-  //   bit 0: refresh Gold, bit 1: refresh Alt
-  //   bit 2-3: copy to Gold, bit 4-5: copy to Alt
-  //   bit 6: Gold sign bias, bit 7: Alt sign bias
-  //   bit 8: refresh last frame
-  uint32_t buffer_flags_;
+  // Dithering strength, deduced from decoding options
+  int dither_;                // whether to use dithering or not
+  VP8Random dithering_rg_;    // random generator for dithering
 
   // dequantization (one set of DC/AC dequant factor per segment)
   VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];
@@ -229,24 +260,19 @@ struct VP8Decoder {
   VP8Proba proba_;
   int use_skip_proba_;
   uint8_t skip_p_;
-#ifndef ONLY_KEYFRAME_CODE
-  uint8_t intra_p_, last_p_, golden_p_;
-  VP8Proba proba_saved_;
-  int update_proba_;
-#endif
 
   // Boundary data cache and persistent buffers.
-  uint8_t* intra_t_;     // top intra modes values: 4 * mb_w_
-  uint8_t  intra_l_[4];  // left intra modes values
-  uint8_t* y_t_;         // top luma samples: 16 * mb_w_
-  uint8_t* u_t_, *v_t_;  // top u/v samples: 8 * mb_w_ each
+  uint8_t* intra_t_;      // top intra modes values: 4 * mb_w_
+  uint8_t  intra_l_[4];   // left intra modes values
+
+  uint8_t segment_;       // segment of the currently parsed block
+  VP8TopSamples* yuv_t_;  // top y/u/v samples
 
-  VP8MB* mb_info_;       // contextual macroblock info (mb_w_ + 1)
-  VP8FInfo* f_info_;     // filter strength info
-  uint8_t* yuv_b_;       // main block for Y/U/V (size = YUV_SIZE)
-  int16_t* coeffs_;      // 384 coeffs = (16+8+8) * 4*4
+  VP8MB* mb_info_;        // contextual macroblock info (mb_w_ + 1)
+  VP8FInfo* f_info_;      // filter strength info
+  uint8_t* yuv_b_;        // main block for Y/U/V (size = YUV_SIZE)
 
-  uint8_t* cache_y_;     // macroblock row for storing unfiltered samples
+  uint8_t* cache_y_;      // macroblock row for storing unfiltered samples
   uint8_t* cache_u_;
   uint8_t* cache_v_;
   int cache_y_stride_;
@@ -258,29 +284,20 @@ struct VP8Decoder {
 
   // Per macroblock non-persistent infos.
   int mb_x_, mb_y_;       // current position, in macroblock units
-  uint8_t is_i4x4_;       // true if intra4x4
-  uint8_t imodes_[16];    // one 16x16 mode (#0) or sixteen 4x4 modes
-  uint8_t uvmode_;        // chroma prediction mode
-  uint8_t segment_;       // block's segment
-
-  // bit-wise info about the content of each sub-4x4 blocks: there are 16 bits
-  // for luma (bits #0->#15), then 4 bits for chroma-u (#16->#19) and 4 bits for
-  // chroma-v (#20->#23), each corresponding to one 4x4 block in decoding order.
-  // If the bit is set, the 4x4 block contains some non-zero coefficients.
-  uint32_t non_zero_;
-  uint32_t non_zero_ac_;
+  VP8MBData* mb_data_;    // parsed reconstruction data
 
   // Filtering side-info
   int filter_type_;                          // 0=off, 1=simple, 2=complex
-  int filter_row_;                           // per-row flag
   VP8FInfo fstrengths_[NUM_MB_SEGMENTS][2];  // precalculated per-segment/type
 
-  // extensions
-  const uint8_t* alpha_data_;   // compressed alpha data (if present)
+  // Alpha
+  struct ALPHDecoder* alph_dec_;  // alpha-plane decoder object
+  const uint8_t* alpha_data_;     // compressed alpha data (if present)
   size_t alpha_data_size_;
   int is_alpha_decoded_;  // true if alpha_data_ is decoded in alpha_plane_
   uint8_t* alpha_plane_;        // output. Persistent, contains the whole data.
 
+  // extensions
   int layer_colorspace_;
   const uint8_t* layer_data_;   // compressed layer data (if present)
   size_t layer_data_size_;
@@ -303,8 +320,6 @@ void VP8ParseQuant(VP8Decoder* const dec);
 
 // in frame.c
 int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
-// Predict a block and add residual
-void VP8ReconstructBlock(VP8Decoder* const dec);
 // Call io->setup() and finish setting up scan parameters.
 // After this call returns, one must always call VP8ExitCritical() with the
 // same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK
@@ -313,7 +328,15 @@ VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io);
 // Must always be called in pair with VP8EnterCritical().
 // Returns false in case of error.
 int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io);
-// Process the last decoded row (filtering + output)
+// Return the multi-threading method to use (0=off), depending
+// on options and bitstream size. Only for lossy decoding.
+int VP8GetThreadMethod(const WebPDecoderOptions* const options,
+                       const WebPHeaderStructure* const headers,
+                       int width, int height);
+// Initialize dithering post-process if needed.
+void VP8InitDithering(const WebPDecoderOptions* const options,
+                      VP8Decoder* const dec);
+// Process the last decoded row (filtering + output).
 int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io);
 // To be called at the start of a new scanline, to initialize predictors.
 void VP8InitScanline(VP8Decoder* const dec);
@@ -329,7 +352,7 @@ int VP8DecodeLayer(VP8Decoder* const dec);
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/dec/vp8l.c b/third_party/libwebp/dec/vp8l.c
index 7c394af..ea0254d 100644
--- a/third_party/libwebp/dec/vp8l.c
+++ b/third_party/libwebp/dec/vp8l.c
@@ -14,16 +14,14 @@
 
 #include <stdio.h>
 #include <stdlib.h>
+#include "./alphai.h"
 #include "./vp8li.h"
 #include "../dsp/lossless.h"
 #include "../dsp/yuv.h"
+#include "../utils/alpha_processing.h"
 #include "../utils/huffman.h"
 #include "../utils/utils.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #define NUM_ARGB_CACHE_ROWS          16
 
 static const int kCodeLengthLiterals = 16;
@@ -59,7 +57,7 @@ static const uint8_t kCodeLengthCodeOrder[NUM_CODE_LENGTH_CODES] = {
 };
 
 #define CODE_TO_PLANE_CODES        120
-static const uint8_t code_to_plane_lut[CODE_TO_PLANE_CODES] = {
+static const uint8_t kCodeToPlane[CODE_TO_PLANE_CODES] = {
   0x18, 0x07, 0x17, 0x19, 0x28, 0x06, 0x27, 0x29, 0x16, 0x1a,
   0x26, 0x2a, 0x38, 0x05, 0x37, 0x39, 0x15, 0x1b, 0x36, 0x3a,
   0x25, 0x2b, 0x48, 0x04, 0x47, 0x49, 0x14, 0x1c, 0x35, 0x3b,
@@ -141,11 +139,11 @@ static WEBP_INLINE int PlaneCodeToDistance(int xsize, int plane_code) {
   if (plane_code > CODE_TO_PLANE_CODES) {
     return plane_code - CODE_TO_PLANE_CODES;
   } else {
-    const int dist_code = code_to_plane_lut[plane_code - 1];
+    const int dist_code = kCodeToPlane[plane_code - 1];
     const int yoffset = dist_code >> 4;
     const int xoffset = 8 - (dist_code & 0xf);
     const int dist = yoffset * xsize + xoffset;
-    return (dist >= 1) ? dist : 1;
+    return (dist >= 1) ? dist : 1;  // dist<1 can happen if xsize is very small
   }
 }
 
@@ -156,15 +154,27 @@ static WEBP_INLINE int PlaneCodeToDistance(int xsize, int plane_code) {
 static WEBP_INLINE int ReadSymbol(const HuffmanTree* tree,
                                   VP8LBitReader* const br) {
   const HuffmanTreeNode* node = tree->root_;
-  int num_bits = 0;
   uint32_t bits = VP8LPrefetchBits(br);
+  int bitpos = br->bit_pos_;
+  // Check if we find the bit combination from the Huffman lookup table.
+  const int lut_ix = bits & (HUFF_LUT - 1);
+  const int lut_bits = tree->lut_bits_[lut_ix];
+  if (lut_bits <= HUFF_LUT_BITS) {
+    VP8LSetBitPos(br, bitpos + lut_bits);
+    return tree->lut_symbol_[lut_ix];
+  }
+  node += tree->lut_jump_[lut_ix];
+  bitpos += HUFF_LUT_BITS;
+  bits >>= HUFF_LUT_BITS;
+
+  // Decode the value from a binary tree.
   assert(node != NULL);
-  while (!HuffmanTreeNodeIsLeaf(node)) {
+  do {
     node = HuffmanTreeNextNode(node, bits & 1);
     bits >>= 1;
-    ++num_bits;
-  }
-  VP8LDiscardBits(br, num_bits);
+    ++bitpos;
+  } while (HuffmanTreeNodeIsNotLeaf(node));
+  VP8LSetBitPos(br, bitpos);
   return node->symbol_;
 }
 
@@ -405,12 +415,13 @@ static int AllocateAndInitRescaler(VP8LDecoder* const dec, VP8Io* const io) {
 // We have special "export" function since we need to convert from BGRA
 static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
                   int rgba_stride, uint8_t* const rgba) {
-  const uint32_t* const src = (const uint32_t*)rescaler->dst;
+  uint32_t* const src = (uint32_t*)rescaler->dst;
   const int dst_width = rescaler->dst_width;
   int num_lines_out = 0;
   while (WebPRescalerHasPendingOutput(rescaler)) {
     uint8_t* const dst = rgba + num_lines_out * rgba_stride;
     WebPRescalerExportRow(rescaler);
+    WebPMultARGBRow(src, dst_width, 1);
     VP8LConvertFromBGRA(src, dst_width, colorspace, dst);
     ++num_lines_out;
   }
@@ -418,18 +429,22 @@ static int Export(WebPRescaler* const rescaler, WEBP_CSP_MODE colorspace,
 }
 
 // Emit scaled rows.
-static int EmitRescaledRows(const VP8LDecoder* const dec,
-                            const uint32_t* const data, int in_stride, int mb_h,
-                            uint8_t* const out, int out_stride) {
+static int EmitRescaledRowsRGBA(const VP8LDecoder* const dec,
+                                uint8_t* in, int in_stride, int mb_h,
+                                uint8_t* const out, int out_stride) {
   const WEBP_CSP_MODE colorspace = dec->output_->colorspace;
-  const uint8_t* const in = (const uint8_t*)data;
   int num_lines_in = 0;
   int num_lines_out = 0;
   while (num_lines_in < mb_h) {
-    const uint8_t* const row_in = in + num_lines_in * in_stride;
+    uint8_t* const row_in = in + num_lines_in * in_stride;
     uint8_t* const row_out = out + num_lines_out * out_stride;
-    num_lines_in += WebPRescalerImport(dec->rescaler, mb_h - num_lines_in,
-                                       row_in, in_stride);
+    const int lines_left = mb_h - num_lines_in;
+    const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
+    assert(needed_lines > 0 && needed_lines <= lines_left);
+    WebPMultARGBRows(row_in, in_stride,
+                     dec->rescaler->src_width, needed_lines, 0);
+    WebPRescalerImport(dec->rescaler, lines_left, row_in, in_stride);
+    num_lines_in += needed_lines;
     num_lines_out += Export(dec->rescaler, colorspace, out_stride, row_out);
   }
   return num_lines_out;
@@ -437,11 +452,10 @@ static int EmitRescaledRows(const VP8LDecoder* const dec,
 
 // Emit rows without any scaling.
 static int EmitRows(WEBP_CSP_MODE colorspace,
-                    const uint32_t* const data, int in_stride,
+                    const uint8_t* row_in, int in_stride,
                     int mb_w, int mb_h,
                     uint8_t* const out, int out_stride) {
   int lines = mb_h;
-  const uint8_t* row_in = (const uint8_t*)data;
   uint8_t* row_out = out;
   while (lines-- > 0) {
     VP8LConvertFromBGRA((const uint32_t*)row_in, mb_w, colorspace, row_out);
@@ -463,7 +477,8 @@ static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
     uint8_t* const y = buf->y + y_pos * buf->y_stride;
     for (i = 0; i < width; ++i) {
       const uint32_t p = src[i];
-      y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff);
+      y[i] = VP8RGBToY((p >> 16) & 0xff, (p >> 8) & 0xff, (p >> 0) & 0xff,
+                       YUV_HALF);
     }
   }
 
@@ -482,11 +497,11 @@ static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
       const int g = ((v0 >>  7) & 0x1fe) + ((v1 >>  7) & 0x1fe);
       const int b = ((v0 <<  1) & 0x1fe) + ((v1 <<  1) & 0x1fe);
       if (!(y_pos & 1)) {  // even lines: store values
-        u[i] = VP8RGBToU(r, g, b);
-        v[i] = VP8RGBToV(r, g, b);
+        u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
+        v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
       } else {             // odd lines: average with previous values
-        const int tmp_u = VP8RGBToU(r, g, b);
-        const int tmp_v = VP8RGBToV(r, g, b);
+        const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
+        const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
         // Approximated average-of-four. But it's an acceptable diff.
         u[i] = (u[i] + tmp_u + 1) >> 1;
         v[i] = (v[i] + tmp_v + 1) >> 1;
@@ -498,11 +513,11 @@ static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
       const int g = (v0 >>  6) & 0x3fc;
       const int b = (v0 <<  2) & 0x3fc;
       if (!(y_pos & 1)) {  // even lines
-        u[i] = VP8RGBToU(r, g, b);
-        v[i] = VP8RGBToV(r, g, b);
+        u[i] = VP8RGBToU(r, g, b, YUV_HALF << 2);
+        v[i] = VP8RGBToV(r, g, b, YUV_HALF << 2);
       } else {             // odd lines (note: we could just skip this)
-        const int tmp_u = VP8RGBToU(r, g, b);
-        const int tmp_v = VP8RGBToV(r, g, b);
+        const int tmp_u = VP8RGBToU(r, g, b, YUV_HALF << 2);
+        const int tmp_v = VP8RGBToV(r, g, b, YUV_HALF << 2);
         u[i] = (u[i] + tmp_u + 1) >> 1;
         v[i] = (v[i] + tmp_v + 1) >> 1;
       }
@@ -518,11 +533,12 @@ static void ConvertToYUVA(const uint32_t* const src, int width, int y_pos,
 
 static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) {
   WebPRescaler* const rescaler = dec->rescaler;
-  const uint32_t* const src = (const uint32_t*)rescaler->dst;
+  uint32_t* const src = (uint32_t*)rescaler->dst;
   const int dst_width = rescaler->dst_width;
   int num_lines_out = 0;
   while (WebPRescalerHasPendingOutput(rescaler)) {
     WebPRescalerExportRow(rescaler);
+    WebPMultARGBRow(src, dst_width, 1);
     ConvertToYUVA(src, dst_width, y_pos, dec->output_);
     ++y_pos;
     ++num_lines_out;
@@ -531,28 +547,28 @@ static int ExportYUVA(const VP8LDecoder* const dec, int y_pos) {
 }
 
 static int EmitRescaledRowsYUVA(const VP8LDecoder* const dec,
-                                const uint32_t* const data,
-                                int in_stride, int mb_h) {
-  const uint8_t* const in = (const uint8_t*)data;
+                                uint8_t* in, int in_stride, int mb_h) {
   int num_lines_in = 0;
   int y_pos = dec->last_out_row_;
   while (num_lines_in < mb_h) {
-    const uint8_t* const row_in = in + num_lines_in * in_stride;
-    num_lines_in += WebPRescalerImport(dec->rescaler, mb_h - num_lines_in,
-                                       row_in, in_stride);
+    const int lines_left = mb_h - num_lines_in;
+    const int needed_lines = WebPRescaleNeededLines(dec->rescaler, lines_left);
+    WebPMultARGBRows(in, in_stride, dec->rescaler->src_width, needed_lines, 0);
+    WebPRescalerImport(dec->rescaler, lines_left, in, in_stride);
+    num_lines_in += needed_lines;
+    in += needed_lines * in_stride;
     y_pos += ExportYUVA(dec, y_pos);
   }
   return y_pos;
 }
 
 static int EmitRowsYUVA(const VP8LDecoder* const dec,
-                        const uint32_t* const data, int in_stride,
+                        const uint8_t* in, int in_stride,
                         int mb_w, int num_rows) {
   int y_pos = dec->last_out_row_;
-  const uint8_t* row_in = (const uint8_t*)data;
   while (num_rows-- > 0) {
-    ConvertToYUVA((const uint32_t*)row_in, mb_w, y_pos, dec->output_);
-    row_in += in_stride;
+    ConvertToYUVA((const uint32_t*)in, mb_w, y_pos, dec->output_);
+    in += in_stride;
     ++y_pos;
   }
   return y_pos;
@@ -563,11 +579,11 @@ static int EmitRowsYUVA(const VP8LDecoder* const dec,
 
 // Sets io->mb_y, io->mb_h & io->mb_w according to start row, end row and
 // crop options. Also updates the input data pointer, so that it points to the
-// start of the cropped window.
-// Note that 'pixel_stride' is in units of 'uint32_t' (and not 'bytes).
+// start of the cropped window. Pixels are 4-byte ARGB although 'in_data' is
+// uint8_t*, hence 'pixel_stride' is a row stride expressed in bytes.
 // Returns true if the crop window is not empty.
 static int SetCropWindow(VP8Io* const io, int y_start, int y_end,
-                         const uint32_t** const in_data, int pixel_stride) {
+                         uint8_t** const in_data, int pixel_stride) {
   assert(y_start < y_end);
   assert(io->crop_left < io->crop_right);
   if (y_end > io->crop_bottom) {
@@ -576,11 +592,11 @@ static int SetCropWindow(VP8Io* const io, int y_start, int y_end,
   if (y_start < io->crop_top) {
     const int delta = io->crop_top - y_start;
     y_start = io->crop_top;
-    *in_data += pixel_stride * delta;
+    *in_data += delta * pixel_stride;
   }
   if (y_start >= y_end) return 0;  // Crop window is empty.
 
-  *in_data += io->crop_left;
+  *in_data += io->crop_left * sizeof(uint32_t);
 
   io->mb_y = y_start - io->crop_top;
   io->mb_w = io->crop_right - io->crop_left;
@@ -654,18 +670,18 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
   // Emit output.
   {
     VP8Io* const io = dec->io_;
-    const uint32_t* rows_data = dec->argb_cache_;
-    if (!SetCropWindow(io, dec->last_row_, row, &rows_data, io->width)) {
+    uint8_t* rows_data = (uint8_t*)dec->argb_cache_;
+    const int in_stride = io->width * sizeof(uint32_t);  // bytes per row
+    if (!SetCropWindow(io, dec->last_row_, row, &rows_data, in_stride)) {
       // Nothing to output (this time).
     } else {
       const WebPDecBuffer* const output = dec->output_;
-      const int in_stride = io->width * sizeof(*rows_data);
       if (output->colorspace < MODE_YUV) {  // convert to RGBA
         const WebPRGBABuffer* const buf = &output->u.RGBA;
         uint8_t* const rgba = buf->rgba + dec->last_out_row_ * buf->stride;
         const int num_rows_out = io->use_scaling ?
-            EmitRescaledRows(dec, rows_data, in_stride, io->mb_h,
-                             rgba, buf->stride) :
+            EmitRescaledRowsRGBA(dec, rows_data, in_stride, io->mb_h,
+                                 rgba, buf->stride) :
             EmitRows(output->colorspace, rows_data, in_stride,
                      io->mb_w, io->mb_h, rgba, buf->stride);
         // Update 'last_out_row_'.
@@ -684,134 +700,232 @@ static void ProcessRows(VP8LDecoder* const dec, int row) {
   assert(dec->last_row_ <= dec->height_);
 }
 
-#define DECODE_DATA_FUNC(FUNC_NAME, TYPE, STORE_PIXEL)                         \
-static int FUNC_NAME(VP8LDecoder* const dec, TYPE* const data, int width,      \
-                     int height, ProcessRowsFunc process_func) {               \
-  int ok = 1;                                                                  \
-  int col = 0, row = 0;                                                        \
-  VP8LBitReader* const br = &dec->br_;                                         \
-  VP8LMetadata* const hdr = &dec->hdr_;                                        \
-  HTreeGroup* htree_group = hdr->htree_groups_;                                \
-  TYPE* src = data;                                                            \
-  TYPE* last_cached = data;                                                    \
-  TYPE* const src_end = data + width * height;                                 \
-  const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES;             \
-  const int color_cache_limit = len_code_limit + hdr->color_cache_size_;       \
-  VP8LColorCache* const color_cache =                                          \
-      (hdr->color_cache_size_ > 0) ? &hdr->color_cache_ : NULL;                \
-  const int mask = hdr->huffman_mask_;                                         \
-  assert(htree_group != NULL);                                                 \
-  while (!br->eos_ && src < src_end) {                                         \
-    int code;                                                                  \
-    /* Only update when changing tile. Note we could use this test:        */  \
-    /* if "((((prev_col ^ col) | prev_row ^ row)) > mask)" -> tile changed */  \
-    /* but that's actually slower and needs storing the previous col/row.  */  \
-    if ((col & mask) == 0) {                                                   \
-      htree_group = GetHtreeGroupForPos(hdr, col, row);                        \
-    }                                                                          \
-    VP8LFillBitWindow(br);                                                     \
-    code = ReadSymbol(&htree_group->htrees_[GREEN], br);                       \
-    if (code < NUM_LITERAL_CODES) {  /* Literal*/                              \
-      int red, green, blue, alpha;                                             \
-      red = ReadSymbol(&htree_group->htrees_[RED], br);                        \
-      green = code;                                                            \
-      VP8LFillBitWindow(br);                                                   \
-      blue = ReadSymbol(&htree_group->htrees_[BLUE], br);                      \
-      alpha = ReadSymbol(&htree_group->htrees_[ALPHA], br);                    \
-      *src = STORE_PIXEL(alpha, red, green, blue);                             \
-    AdvanceByOne:                                                              \
-      ++src;                                                                   \
-      ++col;                                                                   \
-      if (col >= width) {                                                      \
-        col = 0;                                                               \
-        ++row;                                                                 \
-        if ((process_func != NULL) && (row % NUM_ARGB_CACHE_ROWS == 0)) {      \
-          process_func(dec, row);                                              \
-        }                                                                      \
-        if (color_cache != NULL) {                                             \
-          while (last_cached < src) {                                          \
-            VP8LColorCacheInsert(color_cache, *last_cached++);                 \
-          }                                                                    \
-        }                                                                      \
-      }                                                                        \
-    } else if (code < len_code_limit) {  /* Backward reference */              \
-      int dist_code, dist;                                                     \
-      const int length_sym = code - NUM_LITERAL_CODES;                         \
-      const int length = GetCopyLength(length_sym, br);                        \
-      const int dist_symbol = ReadSymbol(&htree_group->htrees_[DIST], br);     \
-      VP8LFillBitWindow(br);                                                   \
-      dist_code = GetCopyDistance(dist_symbol, br);                            \
-      dist = PlaneCodeToDistance(width, dist_code);                            \
-      if (src - data < dist || src_end - src < length) {                       \
-        ok = 0;                                                                \
-        goto End;                                                              \
-      }                                                                        \
-      {                                                                        \
-        int i;                                                                 \
-        for (i = 0; i < length; ++i) src[i] = src[i - dist];                   \
-        src += length;                                                         \
-      }                                                                        \
-      col += length;                                                           \
-      while (col >= width) {                                                   \
-        col -= width;                                                          \
-        ++row;                                                                 \
-        if ((process_func != NULL) && (row % NUM_ARGB_CACHE_ROWS == 0)) {      \
-          process_func(dec, row);                                              \
-        }                                                                      \
-      }                                                                        \
-      if (src < src_end) {                                                     \
-        htree_group = GetHtreeGroupForPos(hdr, col, row);                      \
-        if (color_cache != NULL) {                                             \
-          while (last_cached < src) {                                          \
-            VP8LColorCacheInsert(color_cache, *last_cached++);                 \
-          }                                                                    \
-        }                                                                      \
-      }                                                                        \
-    } else if (code < color_cache_limit) {  /* Color cache */                  \
-      const int key = code - len_code_limit;                                   \
-      assert(color_cache != NULL);                                             \
-      while (last_cached < src) {                                              \
-        VP8LColorCacheInsert(color_cache, *last_cached++);                     \
-      }                                                                        \
-      *src = VP8LColorCacheLookup(color_cache, key);                           \
-      goto AdvanceByOne;                                                       \
-    } else {  /* Not reached */                                                \
-      ok = 0;                                                                  \
-      goto End;                                                                \
-    }                                                                          \
-    ok = !br->error_;                                                          \
-    if (!ok) goto End;                                                         \
-  }                                                                            \
-  /* Process the remaining rows corresponding to last row-block. */            \
-  if (process_func != NULL) process_func(dec, row);                            \
-End:                                                                           \
-  if (br->error_ || !ok || (br->eos_ && src < src_end)) {                      \
-    ok = 0;                                                                    \
-    dec->status_ =                                                             \
-        (!br->eos_) ? VP8_STATUS_BITSTREAM_ERROR : VP8_STATUS_SUSPENDED;       \
-  } else if (src == src_end) {                                                 \
-    dec->state_ = READ_DATA;                                                   \
-  }                                                                            \
-  return ok;                                                                   \
+// Returns 1 if 'hdr' uses no color cache and every Huffman tree group has
+// trivial (at most one node) red, blue and alpha trees; returns 0 otherwise.
+static int Is8bOptimizable(const VP8LMetadata* const hdr) {
+  int i;
+  if (hdr->color_cache_size_ > 0) return 0;
+  // When the Huffman tree contains only one symbol, we can skip the
+  // call to ReadSymbol() for red/blue/alpha channels.
+  for (i = 0; i < hdr->num_htree_groups_; ++i) {
+    const HuffmanTree* const htrees = hdr->htree_groups_[i].htrees_;
+    if (htrees[RED].num_nodes_ > 1) return 0;
+    if (htrees[BLUE].num_nodes_ > 1) return 0;
+    if (htrees[ALPHA].num_nodes_ > 1) return 0;
+  }
+  return 1;
 }
 
-static WEBP_INLINE uint32_t GetARGBPixel(int alpha, int red, int green,
-                                         int blue) {
-  return (alpha << 24) | (red << 16) | (green << 8) | blue;
+static void ExtractPalettedAlphaRows(VP8LDecoder* const dec, int row) {
+  const int num_rows = row - dec->last_row_;  // rows since the last call
+  const uint8_t* const in =
+      (uint8_t*)dec->pixels_ + dec->width_ * dec->last_row_;  // 1 byte/pixel
+  if (num_rows > 0) {
+    ApplyInverseTransformsAlpha(dec, num_rows, in);
+  }
+  dec->last_row_ = dec->last_out_row_ = row;  // set even if num_rows <= 0
 }
 
-static WEBP_INLINE uint8_t GetAlphaPixel(int alpha, int red, int green,
-                                         int blue) {
-  (void)alpha;
-  (void)red;
-  (void)blue;
-  return green;  // Alpha value is stored in green channel.
+static int DecodeAlphaData(VP8LDecoder* const dec, uint8_t* const data,
+                           int width, int height, int last_row) {
+  int ok = 1;
+  int row = dec->last_pixel_ / width;
+  int col = dec->last_pixel_ % width;
+  VP8LBitReader* const br = &dec->br_;
+  VP8LMetadata* const hdr = &dec->hdr_;
+  const HTreeGroup* htree_group = GetHtreeGroupForPos(hdr, col, row);
+  int pos = dec->last_pixel_;         // current position (resume point)
+  const int end = width * height;     // End of data
+  const int last = width * last_row;  // Last pixel to decode (exclusive)
+  const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES;
+  const int mask = hdr->huffman_mask_;
+  assert(htree_group != NULL);
+  assert(last_row <= height);
+  assert(Is8bOptimizable(hdr));
+
+  while (!br->eos_ && pos < last) {
+    int code;
+    // Only update when changing tile.
+    if ((col & mask) == 0) {
+      htree_group = GetHtreeGroupForPos(hdr, col, row);
+    }
+    VP8LFillBitWindow(br);
+    code = ReadSymbol(&htree_group->htrees_[GREEN], br);
+    if (code < NUM_LITERAL_CODES) {  // Literal
+      data[pos] = code;  // store the green symbol as the 8b sample
+      ++pos;
+      ++col;
+      if (col >= width) {
+        col = 0;
+        ++row;
+        if (row % NUM_ARGB_CACHE_ROWS == 0) {
+          ExtractPalettedAlphaRows(dec, row);
+        }
+      }
+    } else if (code < len_code_limit) {  // Backward reference
+      int dist_code, dist;
+      const int length_sym = code - NUM_LITERAL_CODES;
+      const int length = GetCopyLength(length_sym, br);
+      const int dist_symbol = ReadSymbol(&htree_group->htrees_[DIST], br);
+      VP8LFillBitWindow(br);
+      dist_code = GetCopyDistance(dist_symbol, br);
+      dist = PlaneCodeToDistance(width, dist_code);
+      if (pos >= dist && end - pos >= length) {
+        int i;
+        for (i = 0; i < length; ++i) data[pos + i] = data[pos + i - dist];
+      } else {
+        ok = 0;
+        goto End;
+      }
+      pos += length;
+      col += length;
+      while (col >= width) {
+        col -= width;
+        ++row;
+        if (row % NUM_ARGB_CACHE_ROWS == 0) {
+          ExtractPalettedAlphaRows(dec, row);
+        }
+      }
+      if (pos < last && (col & mask)) {  // col & mask == 0: done at loop top
+        htree_group = GetHtreeGroupForPos(hdr, col, row);
+      }
+    } else {  // Not reached (Is8bOptimizable() implies no color cache)
+      ok = 0;
+      goto End;
+    }
+    ok = !br->error_;
+    if (!ok) goto End;
+  }
+  // Process the remaining rows corresponding to last row-block.
+  ExtractPalettedAlphaRows(dec, row);
+
+ End:
+  if (br->error_ || !ok || (br->eos_ && pos < end)) {
+    ok = 0;
+    dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
+                            : VP8_STATUS_BITSTREAM_ERROR;
+  } else {
+    dec->last_pixel_ = (int)pos;  // next call resumes from here
+    if (pos == end) dec->state_ = READ_DATA;
+  }
+  return ok;
 }
 
-DECODE_DATA_FUNC(DecodeImageData, uint32_t, GetARGBPixel)
-DECODE_DATA_FUNC(DecodeAlphaData, uint8_t, GetAlphaPixel)
+static int DecodeImageData(VP8LDecoder* const dec, uint32_t* const data,
+                           int width, int height, int last_row,
+                           ProcessRowsFunc process_func) {  // may be NULL
+  int ok = 1;
+  int row = dec->last_pixel_ / width;
+  int col = dec->last_pixel_ % width;
+  VP8LBitReader* const br = &dec->br_;
+  VP8LMetadata* const hdr = &dec->hdr_;
+  HTreeGroup* htree_group = GetHtreeGroupForPos(hdr, col, row);
+  uint32_t* src = data + dec->last_pixel_;
+  uint32_t* last_cached = src;  // next pixel to insert in the color cache
+  uint32_t* const src_end = data + width * height;     // End of data
+  uint32_t* const src_last = data + width * last_row;  // Last pixel to decode
+  const int len_code_limit = NUM_LITERAL_CODES + NUM_LENGTH_CODES;
+  const int color_cache_limit = len_code_limit + hdr->color_cache_size_;
+  VP8LColorCache* const color_cache =
+      (hdr->color_cache_size_ > 0) ? &hdr->color_cache_ : NULL;
+  const int mask = hdr->huffman_mask_;
+  assert(htree_group != NULL);
+  assert(src_last <= src_end);
+
+  while (!br->eos_ && src < src_last) {
+    int code;
+    // Only update when changing tile. Note we could use this test:
+    // if "((((prev_col ^ col) | prev_row ^ row)) > mask)" -> tile changed
+    // but that's actually slower and needs storing the previous col/row.
+    if ((col & mask) == 0) {
+      htree_group = GetHtreeGroupForPos(hdr, col, row);
+    }
+    VP8LFillBitWindow(br);
+    code = ReadSymbol(&htree_group->htrees_[GREEN], br);
+    if (code < NUM_LITERAL_CODES) {  // Literal
+      int red, green, blue, alpha;
+      red = ReadSymbol(&htree_group->htrees_[RED], br);
+      green = code;
+      VP8LFillBitWindow(br);
+      blue = ReadSymbol(&htree_group->htrees_[BLUE], br);
+      alpha = ReadSymbol(&htree_group->htrees_[ALPHA], br);
+      *src = ((uint32_t)alpha << 24) | (red << 16) | (green << 8) | blue;
+    AdvanceByOne:
+      ++src;
+      ++col;
+      if (col >= width) {
+        col = 0;
+        ++row;
+        if ((row % NUM_ARGB_CACHE_ROWS == 0) && (process_func != NULL)) {
+          process_func(dec, row);
+        }
+        if (color_cache != NULL) {
+          while (last_cached < src) {
+            VP8LColorCacheInsert(color_cache, *last_cached++);
+          }
+        }
+      }
+    } else if (code < len_code_limit) {  // Backward reference
+      int dist_code, dist;
+      const int length_sym = code - NUM_LITERAL_CODES;
+      const int length = GetCopyLength(length_sym, br);
+      const int dist_symbol = ReadSymbol(&htree_group->htrees_[DIST], br);
+      VP8LFillBitWindow(br);
+      dist_code = GetCopyDistance(dist_symbol, br);
+      dist = PlaneCodeToDistance(width, dist_code);
+      if (src - data < (ptrdiff_t)dist || src_end - src < (ptrdiff_t)length) {
+        ok = 0;
+        goto End;
+      } else {
+        int i;
+        for (i = 0; i < length; ++i) src[i] = src[i - dist];
+        src += length;
+      }
+      col += length;
+      while (col >= width) {
+        col -= width;
+        ++row;
+        if ((row % NUM_ARGB_CACHE_ROWS == 0) && (process_func != NULL)) {
+          process_func(dec, row);
+        }
+      }
+      if (src < src_end) {  // sync cache even when src_last is crossed
+        if (col & mask) htree_group = GetHtreeGroupForPos(hdr, col, row);
+        if (color_cache != NULL) {
+          while (last_cached < src) {
+            VP8LColorCacheInsert(color_cache, *last_cached++);
+          }
+        }
+      }
+    } else if (code < color_cache_limit) {  // Color cache
+      const int key = code - len_code_limit;
+      assert(color_cache != NULL);
+      while (last_cached < src) {
+        VP8LColorCacheInsert(color_cache, *last_cached++);
+      }
+      *src = VP8LColorCacheLookup(color_cache, key);
+      goto AdvanceByOne;
+    } else {  // Not reached
+      ok = 0;
+      goto End;
+    }
+    ok = !br->error_;
+    if (!ok) goto End;
+  }
+  // Process the remaining rows corresponding to last row-block.
+  if (process_func != NULL) process_func(dec, row);
 
-#undef DECODE_DATA_FUNC
+ End:
+  if (br->error_ || !ok || (br->eos_ && src < src_end)) {
+    ok = 0;
+    dec->status_ = br->eos_ ? VP8_STATUS_SUSPENDED
+                            : VP8_STATUS_BITSTREAM_ERROR;
+  } else {
+    dec->last_pixel_ = (int)(src - data);  // next call resumes from here
+    if (src == src_end) dec->state_ = READ_DATA;
+  }
+  return ok;
+}
 
 // -----------------------------------------------------------------------------
 // VP8LTransform
@@ -926,6 +1040,9 @@ VP8LDecoder* VP8LNew(void) {
   dec->status_ = VP8_STATUS_OK;
   dec->action_ = READ_DIM;
   dec->state_ = READ_DIM;
+
+  VP8LDspInit();  // Init critical function pointers.
+
   return dec;
 }
 
@@ -1031,7 +1148,8 @@ static int DecodeImageStream(int xsize, int ysize,
   }
 
   // Use the Huffman trees to decode the LZ77 encoded data.
-  ok = DecodeImageData(dec, data, transform_xsize, transform_ysize, NULL);
+  ok = DecodeImageData(dec, data, transform_xsize, transform_ysize,
+                       transform_ysize, NULL);
   ok = ok && !br->error_;
 
  End:
@@ -1053,6 +1171,7 @@ static int DecodeImageStream(int xsize, int ysize,
       assert(data == NULL);
       assert(is_level0);
     }
+    dec->last_pixel_ = 0;  // Reset for the next Decode*Data() call.
     if (!is_level0) ClearMetadata(hdr);  // Clean up temporary data behind.
   }
   return ok;
@@ -1060,29 +1179,35 @@ static int DecodeImageStream(int xsize, int ysize,
 
 //------------------------------------------------------------------------------
 // Allocate internal buffers dec->pixels_ and dec->argb_cache_.
-static int AllocateInternalBuffers(VP8LDecoder* const dec, int final_width,
-                                   size_t bytes_per_pixel) {
-  const int argb_cache_needed = (bytes_per_pixel == sizeof(uint32_t));
+static int AllocateInternalBuffers32b(VP8LDecoder* const dec, int final_width) {
   const uint64_t num_pixels = (uint64_t)dec->width_ * dec->height_;
   // Scratch buffer corresponding to top-prediction row for transforming the
   // first row in the row-blocks. Not needed for paletted alpha.
-  const uint64_t cache_top_pixels =
-      argb_cache_needed ? (uint16_t)final_width : 0ULL;
+  const uint64_t cache_top_pixels = (uint16_t)final_width;
   // Scratch buffer for temporary BGRA storage. Not needed for paletted alpha.
-  const uint64_t cache_pixels =
-      argb_cache_needed ? (uint64_t)final_width * NUM_ARGB_CACHE_ROWS : 0ULL;
+  const uint64_t cache_pixels = (uint64_t)final_width * NUM_ARGB_CACHE_ROWS;
   const uint64_t total_num_pixels =
       num_pixels + cache_top_pixels + cache_pixels;
 
   assert(dec->width_ <= final_width);
-  dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, bytes_per_pixel);
+  dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint32_t));
   if (dec->pixels_ == NULL) {
     dec->argb_cache_ = NULL;    // for sanity check
     dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
     return 0;
   }
-  dec->argb_cache_ =
-      argb_cache_needed ? dec->pixels_ + num_pixels + cache_top_pixels : NULL;
+  dec->argb_cache_ = dec->pixels_ + num_pixels + cache_top_pixels;
+  return 1;
+}
+
+static int AllocateInternalBuffers8b(VP8LDecoder* const dec) {
+  const uint64_t total_num_pixels = (uint64_t)dec->width_ * dec->height_;
+  dec->argb_cache_ = NULL;    // sanity: no cache rows are allocated here
+  dec->pixels_ = (uint32_t*)WebPSafeMalloc(total_num_pixels, sizeof(uint8_t));
+  if (dec->pixels_ == NULL) {
+    dec->status_ = VP8_STATUS_OUT_OF_MEMORY;
+    return 0;
+  }
   return 1;
 }
 
@@ -1108,64 +1233,73 @@ static void ExtractAlphaRows(VP8LDecoder* const dec, int row) {
   dec->last_row_ = dec->last_out_row_ = row;
 }
 
-// Row-processing for the special case when alpha data contains only one
-// transform: color indexing.
-static void ExtractPalettedAlphaRows(VP8LDecoder* const dec, int row) {
-  const int num_rows = row - dec->last_row_;
-  const uint8_t* const in =
-      (uint8_t*)dec->pixels_ + dec->width_ * dec->last_row_;
-  if (num_rows <= 0) return;  // Nothing to be done.
-  ApplyInverseTransformsAlpha(dec, num_rows, in);
-  dec->last_row_ = dec->last_out_row_ = row;
-}
-
-int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data,
-                               size_t data_size, uint8_t* const output) {
-  VP8Io io;
+int VP8LDecodeAlphaHeader(ALPHDecoder* const alph_dec,
+                          const uint8_t* const data, size_t data_size,
+                          uint8_t* const output) {
   int ok = 0;
-  VP8LDecoder* const dec = VP8LNew();
-  size_t bytes_per_pixel = sizeof(uint32_t);  // Default: BGRA mode.
-  if (dec == NULL) return 0;
-
-  dec->width_ = width;
-  dec->height_ = height;
-  dec->io_ = &io;
+  VP8LDecoder* dec;
+  VP8Io* io;
+  assert(alph_dec != NULL);
+  alph_dec->vp8l_dec_ = VP8LNew();  // kept in 'alph_dec' on success
+  if (alph_dec->vp8l_dec_ == NULL) return 0;
+  dec = alph_dec->vp8l_dec_;
+
+  dec->width_ = alph_dec->width_;
+  dec->height_ = alph_dec->height_;
+  dec->io_ = &alph_dec->io_;
+  io = dec->io_;
 
-  VP8InitIo(&io);
-  WebPInitCustomIo(NULL, &io);    // Just a sanity Init. io won't be used.
-  io.opaque = output;
-  io.width = width;
-  io.height = height;
+  VP8InitIo(io);
+  WebPInitCustomIo(NULL, io);  // Just a sanity Init. io won't be used.
+  io->opaque = output;
+  io->width = alph_dec->width_;
+  io->height = alph_dec->height_;
 
   dec->status_ = VP8_STATUS_OK;
   VP8LInitBitReader(&dec->br_, data, data_size);
 
   dec->action_ = READ_HDR;
-  if (!DecodeImageStream(width, height, 1, dec, NULL)) goto Err;
+  if (!DecodeImageStream(alph_dec->width_, alph_dec->height_, 1, dec, NULL)) {
+    goto Err;
+  }
 
   // Special case: if alpha data uses only the color indexing transform and
   // doesn't use color cache (a frequent case), we will use DecodeAlphaData()
   // method that only needs allocation of 1 byte per pixel (alpha channel).
   if (dec->next_transform_ == 1 &&
       dec->transforms_[0].type_ == COLOR_INDEXING_TRANSFORM &&
-      dec->hdr_.color_cache_size_ == 0) {
-    bytes_per_pixel = sizeof(uint8_t);
+      Is8bOptimizable(&dec->hdr_)) {
+    alph_dec->use_8b_decode = 1;
+    ok = AllocateInternalBuffers8b(dec);
+  } else {
+    // Allocate internal buffers (note that dec->width_ may have changed here).
+    alph_dec->use_8b_decode = 0;
+    ok = AllocateInternalBuffers32b(dec, alph_dec->width_);
   }
 
-  // Allocate internal buffers (note that dec->width_ may have changed here).
-  if (!AllocateInternalBuffers(dec, width, bytes_per_pixel)) goto Err;
+  if (!ok) goto Err;
 
-  // Decode (with special row processing).
   dec->action_ = READ_DATA;
-  ok = (bytes_per_pixel == sizeof(uint8_t)) ?
-      DecodeAlphaData(dec, (uint8_t*)dec->pixels_, dec->width_, dec->height_,
-                      ExtractPalettedAlphaRows) :
-      DecodeImageData(dec, dec->pixels_, dec->width_, dec->height_,
-                      ExtractAlphaRows);
+  return 1;  // success: alph_dec->vp8l_dec_ stays allocated
 
  Err:
-  VP8LDelete(dec);
-  return ok;
+  VP8LDelete(alph_dec->vp8l_dec_);
+  alph_dec->vp8l_dec_ = NULL;  // do not leave a dangling pointer behind
+  return 0;
+}
+
+int VP8LDecodeAlphaImageStream(ALPHDecoder* const alph_dec, int last_row) {
+  VP8LDecoder* const dec = alph_dec->vp8l_dec_;
+  assert(dec != NULL);  // requires a successful VP8LDecodeAlphaHeader()
+  assert(dec->action_ == READ_DATA);
+  assert(last_row <= dec->height_);
+
+  // Decode rows up to 'last_row' (with special row processing).
+  return alph_dec->use_8b_decode ?
+      DecodeAlphaData(dec, (uint8_t*)dec->pixels_, dec->width_, dec->height_,
+                      last_row) :
+      DecodeImageData(dec, dec->pixels_, dec->width_, dec->height_,
+                      last_row, ExtractAlphaRows);
 }
 
 //------------------------------------------------------------------------------
@@ -1201,7 +1335,6 @@ int VP8LDecodeHeader(VP8LDecoder* const dec, VP8Io* const io) {
 }
 
 int VP8LDecodeImage(VP8LDecoder* const dec) {
-  const size_t bytes_per_pixel = sizeof(uint32_t);
   VP8Io* io = NULL;
   WebPDecParams* params = NULL;
 
@@ -1221,14 +1354,14 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
     goto Err;
   }
 
-  if (!AllocateInternalBuffers(dec, io->width, bytes_per_pixel)) goto Err;
+  if (!AllocateInternalBuffers32b(dec, io->width)) goto Err;
 
   if (io->use_scaling && !AllocateAndInitRescaler(dec, io)) goto Err;
 
   // Decode.
   dec->action_ = READ_DATA;
   if (!DecodeImageData(dec, dec->pixels_, dec->width_, dec->height_,
-                       ProcessRows)) {
+                       dec->height_, ProcessRows)) {
     goto Err;
   }
 
@@ -1245,6 +1378,3 @@ int VP8LDecodeImage(VP8LDecoder* const dec) {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dec/vp8li.h b/third_party/libwebp/dec/vp8li.h
index 543a767..afa294d 100644
--- a/third_party/libwebp/dec/vp8li.h
+++ b/third_party/libwebp/dec/vp8li.h
@@ -22,7 +22,7 @@
 #include "../utils/huffman.h"
 #include "../webp/format_constants.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -57,7 +57,8 @@ typedef struct {
   HTreeGroup     *htree_groups_;
 } VP8LMetadata;
 
-typedef struct {
+typedef struct VP8LDecoder VP8LDecoder;
+struct VP8LDecoder {
   VP8StatusCode    status_;
   VP8LDecodeState  action_;
   VP8LDecodeState  state_;
@@ -74,6 +75,9 @@ typedef struct {
   int              width_;
   int              height_;
   int              last_row_;      // last input row decoded so far.
+  int              last_pixel_;    // last pixel decoded so far. However, it may
+                                   // not be transformed, scaled or
+                                   // color-converted yet.
   int              last_out_row_;  // last row output so far.
 
   VP8LMetadata     hdr_;
@@ -85,18 +89,27 @@ typedef struct {
 
   uint8_t         *rescaler_memory;  // Working memory for rescaling work.
   WebPRescaler    *rescaler;         // Common rescaler for all channels.
-} VP8LDecoder;
+};
 
 //------------------------------------------------------------------------------
 // internal functions. Not public.
 
+struct ALPHDecoder;  // Defined in dec/alphai.h.
+
 // in vp8l.c
 
-// Decodes a raw image stream (without header) and store the alpha data
-// into *output, which must be of size width x height. Returns false in case
-// of error.
-int VP8LDecodeAlphaImageStream(int width, int height, const uint8_t* const data,
-                               size_t data_size, uint8_t* const output);
+// Decodes image header for alpha data stored using lossless compression.
+// Returns false in case of error.
+int VP8LDecodeAlphaHeader(struct ALPHDecoder* const alph_dec,
+                          const uint8_t* const data, size_t data_size,
+                          uint8_t* const output);
+
+// Decodes *at least* 'last_row' rows of alpha. If some of the initial rows are
+// already decoded in previous call(s), it will resume decoding from where it
+// was paused.
+// Returns false in case of bitstream error.
+int VP8LDecodeAlphaImageStream(struct ALPHDecoder* const alph_dec,
+                               int last_row);
 
 // Allocates and initialize a new lossless decoder instance.
 VP8LDecoder* VP8LNew(void);
@@ -117,7 +130,7 @@ void VP8LDelete(VP8LDecoder* const dec);
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/dec/webp.c b/third_party/libwebp/dec/webp.c
index e4fe73d..302220f 100644
--- a/third_party/libwebp/dec/webp.c
+++ b/third_party/libwebp/dec/webp.c
@@ -18,10 +18,6 @@
 #include "./webpi.h"
 #include "../webp/mux_types.h"  // ALPHA_FLAG
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // RIFF layout is:
 //   Offset  tag
@@ -285,6 +281,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
                                           int* const height,
                                           int* const has_alpha,
                                           int* const has_animation,
+                                          int* const format,
                                           WebPHeaderStructure* const headers) {
   int canvas_width = 0;
   int canvas_height = 0;
@@ -292,6 +289,9 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
   int image_height = 0;
   int found_riff = 0;
   int found_vp8x = 0;
+  int animation_present = 0;
+  int fragments_present = 0;
+
   VP8StatusCode status;
   WebPHeaderStructure hdrs;
 
@@ -312,13 +312,13 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
   // Skip over VP8X.
   {
     uint32_t flags = 0;
-    int animation_present;
     status = ParseVP8X(&data, &data_size, &found_vp8x,
                        &canvas_width, &canvas_height, &flags);
     if (status != VP8_STATUS_OK) {
       return status;  // Wrong VP8X / insufficient data.
     }
     animation_present = !!(flags & ANIMATION_FLAG);
+    fragments_present = !!(flags & FRAGMENTS_FLAG);
     if (!found_riff && found_vp8x) {
       // Note: This restriction may be removed in the future, if it becomes
       // necessary to send VP8X chunk to the decoder.
@@ -326,8 +326,10 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
     }
     if (has_alpha != NULL) *has_alpha = !!(flags & ALPHA_FLAG);
     if (has_animation != NULL) *has_animation = animation_present;
+    if (format != NULL) *format = 0;   // default = undefined
 
-    if (found_vp8x && animation_present && headers == NULL) {
+    if (found_vp8x && (animation_present || fragments_present) &&
+        headers == NULL) {
       if (width != NULL) *width = canvas_width;
       if (height != NULL) *height = canvas_height;
       return VP8_STATUS_OK;  // Just return features from VP8X header.
@@ -356,6 +358,10 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
     return VP8_STATUS_BITSTREAM_ERROR;
   }
 
+  if (format != NULL && !(animation_present || fragments_present)) {
+    *format = hdrs.is_lossless ? 2 : 1;
+  }
+
   if (!hdrs.is_lossless) {
     if (data_size < VP8_FRAME_HEADER_SIZE) {
       return VP8_STATUS_NOT_ENOUGH_DATA;
@@ -374,7 +380,7 @@ static VP8StatusCode ParseHeadersInternal(const uint8_t* data,
       return VP8_STATUS_BITSTREAM_ERROR;
     }
   }
-  // Validates image size coherency. TODO(urvang): what about FRGM?
+  // Validates image size coherency.
   if (found_vp8x) {
     if (canvas_width != image_width || canvas_height != image_height) {
       return VP8_STATUS_BITSTREAM_ERROR;
@@ -402,7 +408,8 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
   assert(headers != NULL);
   // fill out headers, ignore width/height/has_alpha.
   status = ParseHeadersInternal(headers->data, headers->data_size,
-                                NULL, NULL, NULL, &has_animation, headers);
+                                NULL, NULL, NULL, &has_animation,
+                                NULL, headers);
   if (status == VP8_STATUS_OK || status == VP8_STATUS_NOT_ENOUGH_DATA) {
     // TODO(jzern): full support of animation frames will require API additions.
     if (has_animation) {
@@ -416,7 +423,7 @@ VP8StatusCode WebPParseHeaders(WebPHeaderStructure* const headers) {
 // WebPDecParams
 
 void WebPResetDecParams(WebPDecParams* const params) {
-  if (params) {
+  if (params != NULL) {
     memset(params, 0, sizeof(*params));
   }
 }
@@ -449,11 +456,6 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
     if (dec == NULL) {
       return VP8_STATUS_OUT_OF_MEMORY;
     }
-#ifdef WEBP_USE_THREAD
-    dec->use_threads_ = params->options && (params->options->use_threads > 0);
-#else
-    dec->use_threads_ = 0;
-#endif
     dec->alpha_data_ = headers.alpha_data;
     dec->alpha_data_size_ = headers.alpha_data_size;
 
@@ -465,6 +467,10 @@ static VP8StatusCode DecodeInto(const uint8_t* const data, size_t data_size,
       status = WebPAllocateDecBuffer(io.width, io.height, params->options,
                                      params->output);
       if (status == VP8_STATUS_OK) {  // Decode
+        // Thread method and dithering must be set up before VP8Decode().
+        dec->mt_method_ = VP8GetThreadMethod(params->options, &headers,
+                                             io.width, io.height);
+        VP8InitDithering(params->options, dec);
         if (!VP8Decode(dec, &io)) {
           status = dec->status_;
         }
@@ -651,7 +657,6 @@ uint8_t* WebPDecodeYUV(const uint8_t* data, size_t data_size,
 static void DefaultFeatures(WebPBitstreamFeatures* const features) {
   assert(features != NULL);
   memset(features, 0, sizeof(*features));
-  features->bitstream_version = 0;
 }
 
 static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
@@ -665,7 +670,7 @@ static VP8StatusCode GetFeatures(const uint8_t* const data, size_t data_size,
   return ParseHeadersInternal(data, data_size,
                               &features->width, &features->height,
                               &features->has_alpha, &features->has_animation,
-                              NULL);
+                              &features->format, NULL);
 }
 
 //------------------------------------------------------------------------------
@@ -803,6 +808,3 @@ int WebPIoInitFromOptions(const WebPDecoderOptions* const options,
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dec/webpi.h b/third_party/libwebp/dec/webpi.h
index 4ae0bfc..d915f5e 100644
--- a/third_party/libwebp/dec/webpi.h
+++ b/third_party/libwebp/dec/webpi.h
@@ -14,7 +14,7 @@
 #ifndef WEBP_DEC_WEBPI_H_
 #define WEBP_DEC_WEBPI_H_
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -109,7 +109,7 @@ void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst);
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/demux/demux.c b/third_party/libwebp/demux/demux.c
index c6e4acd..f66ac6d 100644
--- a/third_party/libwebp/demux/demux.c
+++ b/third_party/libwebp/demux/demux.c
@@ -23,13 +23,9 @@
 #include "../webp/demux.h"
 #include "../webp/format_constants.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #define DMUX_MAJ_VERSION 0
-#define DMUX_MIN_VERSION 1
-#define DMUX_REV_VERSION 1
+#define DMUX_MIN_VERSION 2
+#define DMUX_REV_VERSION 0
 
 typedef struct {
   size_t start_;        // start location of the data
@@ -75,6 +71,7 @@ struct WebPDemuxer {
   Frame* frames_;
   Frame** frames_tail_;
   Chunk* chunks_;  // non-image chunks
+  Chunk** chunks_tail_;
 };
 
 typedef enum {
@@ -179,10 +176,9 @@ static WEBP_INLINE uint32_t ReadLE32(MemBuffer* const mem) {
 // Secondary chunk parsing
 
 static void AddChunk(WebPDemuxer* const dmux, Chunk* const chunk) {
-  Chunk** c = &dmux->chunks_;
-  while (*c != NULL) c = &(*c)->next_;
-  *c = chunk;
+  *dmux->chunks_tail_ = chunk;
   chunk->next_ = NULL;
+  dmux->chunks_tail_ = &chunk->next_;
 }
 
 // Add a frame to the end of the list, ensuring the last frame is complete.
@@ -301,7 +297,7 @@ static ParseStatus NewFrame(const MemBuffer* const mem,
 // 'frame_chunk_size' is the previously validated, padded chunk size.
 static ParseStatus ParseAnimationFrame(
     WebPDemuxer* const dmux, uint32_t frame_chunk_size) {
-  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
   const uint32_t anmf_payload_size = frame_chunk_size - ANMF_CHUNK_SIZE;
   int added_frame = 0;
   int bits;
@@ -328,7 +324,7 @@ static ParseStatus ParseAnimationFrame(
   // Store a frame only if the animation flag is set there is some data for
   // this frame is available.
   status = StoreFrame(dmux->num_frames_ + 1, anmf_payload_size, mem, frame);
-  if (status != PARSE_ERROR && has_frames && frame->frame_num_ > 0) {
+  if (status != PARSE_ERROR && is_animation && frame->frame_num_ > 0) {
     added_frame = AddFrame(dmux, frame);
     if (added_frame) {
       ++dmux->num_frames_;
@@ -347,7 +343,7 @@ static ParseStatus ParseAnimationFrame(
 static ParseStatus ParseFragment(WebPDemuxer* const dmux,
                                  uint32_t fragment_chunk_size) {
   const int frame_num = 1;  // All fragments belong to the 1st (and only) frame.
-  const int has_fragments = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
+  const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
   const uint32_t frgm_payload_size = fragment_chunk_size - FRGM_CHUNK_SIZE;
   int added_fragment = 0;
   MemBuffer* const mem = &dmux->mem_;
@@ -360,10 +356,10 @@ static ParseStatus ParseFragment(WebPDemuxer* const dmux,
   frame->x_offset_ = 2 * ReadLE24s(mem);
   frame->y_offset_ = 2 * ReadLE24s(mem);
 
-  // Store a fragment only if the fragments flag is set there is some data for
-  // this fragment is available.
+  // Store a fragment only if the 'fragments' flag is set and there is some
+  // data available.
   status = StoreFrame(frame_num, frgm_payload_size, mem, frame);
-  if (status != PARSE_ERROR && has_fragments && frame->frame_num_ > 0) {
+  if (status != PARSE_ERROR && is_fragmented && frame->frame_num_ > 0) {
     added_fragment = AddFrame(dmux, frame);
     if (!added_fragment) {
       status = PARSE_ERROR;
@@ -395,20 +391,20 @@ static int StoreChunk(WebPDemuxer* const dmux,
 // -----------------------------------------------------------------------------
 // Primary chunk parsing
 
-static int ReadHeader(MemBuffer* const mem) {
+static ParseStatus ReadHeader(MemBuffer* const mem) {
   const size_t min_size = RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE;
   uint32_t riff_size;
 
   // Basic file level validation.
-  if (MemDataSize(mem) < min_size) return 0;
+  if (MemDataSize(mem) < min_size) return PARSE_NEED_MORE_DATA;
   if (memcmp(GetBuffer(mem), "RIFF", CHUNK_SIZE_BYTES) ||
       memcmp(GetBuffer(mem) + CHUNK_HEADER_SIZE, "WEBP", CHUNK_SIZE_BYTES)) {
-    return 0;
+    return PARSE_ERROR;
   }
 
   riff_size = GetLE32(GetBuffer(mem) + TAG_SIZE);
-  if (riff_size < CHUNK_HEADER_SIZE) return 0;
-  if (riff_size > MAX_CHUNK_PAYLOAD) return 0;
+  if (riff_size < CHUNK_HEADER_SIZE) return PARSE_ERROR;
+  if (riff_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
 
   // There's no point in reading past the end of the RIFF chunk
   mem->riff_end_ = riff_size + CHUNK_HEADER_SIZE;
@@ -417,7 +413,7 @@ static int ReadHeader(MemBuffer* const mem) {
   }
 
   Skip(mem, RIFF_HEADER_SIZE);
-  return 1;
+  return PARSE_OK;
 }
 
 static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
@@ -425,6 +421,7 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
   MemBuffer* const mem = &dmux->mem_;
   Frame* frame;
   ParseStatus status;
+  int image_added = 0;
 
   if (dmux->frames_ != NULL) return PARSE_ERROR;
   if (SizeIsInvalid(mem, min_size)) return PARSE_ERROR;
@@ -453,45 +450,24 @@ static ParseStatus ParseSingleImage(WebPDemuxer* const dmux) {
       dmux->canvas_height_ = frame->height_;
       dmux->feature_flags_ |= frame->has_alpha_ ? ALPHA_FLAG : 0;
     }
-    AddFrame(dmux, frame);
-    dmux->num_frames_ = 1;
-  } else {
-    free(frame);
+    if (!AddFrame(dmux, frame)) {
+      status = PARSE_ERROR;  // last frame was left incomplete
+    } else {
+      image_added = 1;
+      dmux->num_frames_ = 1;
+    }
   }
 
+  if (!image_added) free(frame);
   return status;
 }
 
-static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
+static ParseStatus ParseVP8XChunks(WebPDemuxer* const dmux) {
+  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
   MemBuffer* const mem = &dmux->mem_;
   int anim_chunks = 0;
-  uint32_t vp8x_size;
   ParseStatus status = PARSE_OK;
 
-  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
-
-  dmux->is_ext_format_ = 1;
-  Skip(mem, TAG_SIZE);  // VP8X
-  vp8x_size = ReadLE32(mem);
-  if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
-  if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
-  vp8x_size += vp8x_size & 1;
-  if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
-  if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
-
-  dmux->feature_flags_ = ReadByte(mem);
-  Skip(mem, 3);  // Reserved.
-  dmux->canvas_width_  = 1 + ReadLE24s(mem);
-  dmux->canvas_height_ = 1 + ReadLE24s(mem);
-  if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
-    return PARSE_ERROR;  // image final dimension is too large
-  }
-  Skip(mem, vp8x_size - VP8X_CHUNK_SIZE);  // skip any trailing data.
-  dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
-
-  if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
-  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
-
   do {
     int store_chunk = 1;
     const size_t chunk_start_offset = mem->start_;
@@ -509,9 +485,8 @@ static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
       case MKFOURCC('A', 'L', 'P', 'H'):
       case MKFOURCC('V', 'P', '8', ' '):
       case MKFOURCC('V', 'P', '8', 'L'): {
-        const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
         // check that this isn't an animation (all frames should be in an ANMF).
-        if (anim_chunks > 0 || has_frames) return PARSE_ERROR;
+        if (anim_chunks > 0 || is_animation) return PARSE_ERROR;
 
         Rewind(mem, CHUNK_HEADER_SIZE);
         status = ParseSingleImage(dmux);
@@ -548,14 +523,14 @@ static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
         store_chunk = !!(dmux->feature_flags_ & ICCP_FLAG);
         goto Skip;
       }
-      case MKFOURCC('X', 'M', 'P', ' '): {
-        store_chunk = !!(dmux->feature_flags_ & XMP_FLAG);
-        goto Skip;
-      }
       case MKFOURCC('E', 'X', 'I', 'F'): {
         store_chunk = !!(dmux->feature_flags_ & EXIF_FLAG);
         goto Skip;
       }
+      case MKFOURCC('X', 'M', 'P', ' '): {
+        store_chunk = !!(dmux->feature_flags_ & XMP_FLAG);
+        goto Skip;
+      }
  Skip:
       default: {
         if (chunk_size_padded <= MemDataSize(mem)) {
@@ -584,6 +559,37 @@ static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
   return status;
 }
 
+static ParseStatus ParseVP8X(WebPDemuxer* const dmux) {
+  MemBuffer* const mem = &dmux->mem_;
+  uint32_t vp8x_size;
+
+  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
+
+  dmux->is_ext_format_ = 1;
+  Skip(mem, TAG_SIZE);  // VP8X
+  vp8x_size = ReadLE32(mem);
+  if (vp8x_size > MAX_CHUNK_PAYLOAD) return PARSE_ERROR;
+  if (vp8x_size < VP8X_CHUNK_SIZE) return PARSE_ERROR;
+  vp8x_size += vp8x_size & 1;
+  if (SizeIsInvalid(mem, vp8x_size)) return PARSE_ERROR;
+  if (MemDataSize(mem) < vp8x_size) return PARSE_NEED_MORE_DATA;
+
+  dmux->feature_flags_ = ReadByte(mem);
+  Skip(mem, 3);  // Reserved.
+  dmux->canvas_width_  = 1 + ReadLE24s(mem);
+  dmux->canvas_height_ = 1 + ReadLE24s(mem);
+  if (dmux->canvas_width_ * (uint64_t)dmux->canvas_height_ >= MAX_IMAGE_AREA) {
+    return PARSE_ERROR;  // image final dimension is too large
+  }
+  Skip(mem, vp8x_size - VP8X_CHUNK_SIZE);  // skip any trailing data.
+  dmux->state_ = WEBP_DEMUX_PARSED_HEADER;
+
+  if (SizeIsInvalid(mem, CHUNK_HEADER_SIZE)) return PARSE_ERROR;
+  if (MemDataSize(mem) < CHUNK_HEADER_SIZE) return PARSE_NEED_MORE_DATA;
+
+  return ParseVP8XChunks(dmux);
+}
+
 // -----------------------------------------------------------------------------
 // Format validation
 
@@ -620,8 +626,8 @@ static int CheckFrameBounds(const Frame* const frame, int exact,
 }
 
 static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
-  const int has_fragments = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
-  const int has_frames = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const int is_animation = !!(dmux->feature_flags_ & ANIMATION_FLAG);
+  const int is_fragmented = !!(dmux->feature_flags_ & FRAGMENTS_FLAG);
   const Frame* f = dmux->frames_;
 
   if (dmux->state_ == WEBP_DEMUX_PARSING_HEADER) return 1;
@@ -630,7 +636,7 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
   if (dmux->loop_count_ < 0) return 0;
   if (dmux->state_ == WEBP_DEMUX_DONE && dmux->frames_ == NULL) return 0;
 #ifndef WEBP_EXPERIMENTAL_FEATURES
-  if (has_fragments) return 0;
+  if (is_fragmented) return 0;
 #endif
 
   while (f != NULL) {
@@ -643,9 +649,9 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
       const ChunkData* const image = f->img_components_;
       const ChunkData* const alpha = f->img_components_ + 1;
 
-      if (has_fragments && !f->is_fragment_) return 0;
-      if (!has_fragments && f->is_fragment_) return 0;
-      if (!has_frames && f->frame_num_ > 1) return 0;
+      if (is_fragmented && !f->is_fragment_) return 0;
+      if (!is_fragmented && f->is_fragment_) return 0;
+      if (!is_animation && f->frame_num_ > 1) return 0;
 
       if (f->complete_) {
         if (alpha->size_ == 0 && image->size_ == 0) return 0;
@@ -669,7 +675,7 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
       }
 
       if (f->width_ > 0 && f->height_ > 0 &&
-          !CheckFrameBounds(f, !(has_frames || has_fragments),
+          !CheckFrameBounds(f, !(is_animation || is_fragmented),
                             dmux->canvas_width_, dmux->canvas_height_)) {
         return 0;
       }
@@ -677,9 +683,8 @@ static int IsValidExtendedFormat(const WebPDemuxer* const dmux) {
       fragment_count += f->is_fragment_;
       ++frame_count;
     }
-    if (!has_fragments && frame_count > 1) return 0;
+    if (!is_fragmented && frame_count > 1) return 0;
     if (fragment_count > 0 && frame_count != fragment_count) return 0;
-    if (f == NULL) break;
   }
   return 1;
 }
@@ -694,6 +699,7 @@ static void InitDemux(WebPDemuxer* const dmux, const MemBuffer* const mem) {
   dmux->canvas_width_ = -1;
   dmux->canvas_height_ = -1;
   dmux->frames_tail_ = &dmux->frames_;
+  dmux->chunks_tail_ = &dmux->chunks_;
   dmux->mem_ = *mem;
 }
 
@@ -705,11 +711,20 @@ WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
   MemBuffer mem;
   WebPDemuxer* dmux;
 
+  if (state != NULL) *state = WEBP_DEMUX_PARSE_ERROR;
+
   if (WEBP_ABI_IS_INCOMPATIBLE(version, WEBP_DEMUX_ABI_VERSION)) return NULL;
   if (data == NULL || data->bytes == NULL || data->size == 0) return NULL;
 
   if (!InitMemBuffer(&mem, data->bytes, data->size)) return NULL;
-  if (!ReadHeader(&mem)) return NULL;
+  status = ReadHeader(&mem);
+  if (status != PARSE_OK) {
+    if (state != NULL) {
+      *state = (status == PARSE_NEED_MORE_DATA) ? WEBP_DEMUX_PARSING_HEADER
+                                                : WEBP_DEMUX_PARSE_ERROR;
+    }
+    return NULL;
+  }
 
   partial = (mem.buf_size_ < mem.riff_end_);
   if (!allow_partial && partial) return NULL;
@@ -718,16 +733,18 @@ WebPDemuxer* WebPDemuxInternal(const WebPData* data, int allow_partial,
   if (dmux == NULL) return NULL;
   InitDemux(dmux, &mem);
 
+  status = PARSE_ERROR;
   for (parser = kMasterChunks; parser->parse != NULL; ++parser) {
     if (!memcmp(parser->id, GetBuffer(&dmux->mem_), TAG_SIZE)) {
       status = parser->parse(dmux);
       if (status == PARSE_OK) dmux->state_ = WEBP_DEMUX_DONE;
       if (status == PARSE_NEED_MORE_DATA && !partial) status = PARSE_ERROR;
       if (status != PARSE_ERROR && !parser->valid(dmux)) status = PARSE_ERROR;
+      if (status == PARSE_ERROR) dmux->state_ = WEBP_DEMUX_PARSE_ERROR;
       break;
     }
   }
-  if (state) *state = dmux->state_;
+  if (state != NULL) *state = dmux->state_;
 
   if (status == PARSE_ERROR) {
     WebPDemuxDelete(dmux);
@@ -983,6 +1000,3 @@ void WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter) {
   (void)iter;
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}  // extern "C"
-#endif
diff --git a/third_party/libwebp/dsp/cpu.c b/third_party/libwebp/dsp/cpu.c
index 179901e..7a1f417 100644
--- a/third_party/libwebp/dsp/cpu.c
+++ b/third_party/libwebp/dsp/cpu.c
@@ -17,10 +17,6 @@
 #include <cpu-features.h>
 #endif
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // SSE2 detection.
 //
@@ -82,6 +78,3 @@ VP8CPUInfo VP8GetCPUInfo = armCPUInfo;
 VP8CPUInfo VP8GetCPUInfo = NULL;
 #endif
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dsp/dec.c b/third_party/libwebp/dsp/dec.c
index a6d8519..8b246fa 100644
--- a/third_party/libwebp/dsp/dec.c
+++ b/third_party/libwebp/dsp/dec.c
@@ -14,10 +14,6 @@
 #include "./dsp.h"
 #include "../dec/vp8i.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // run-time tables (~4k)
 
@@ -61,6 +57,14 @@ static WEBP_INLINE uint8_t clip_8b(int v) {
 #define STORE(x, y, v) \
   dst[x + y * BPS] = clip_8b(dst[x + y * BPS] + ((v) >> 3))
 
+#define STORE2(y, dc, d, c) do {    \
+  const int DC = (dc);              \
+  STORE(0, y, DC + (d));            \
+  STORE(1, y, DC + (c));            \
+  STORE(2, y, DC - (c));            \
+  STORE(3, y, DC - (d));            \
+} while (0)
+
 static const int kC1 = 20091 + (1 << 16);
 static const int kC2 = 35468;
 #define MUL(a, b) (((a) * (b)) >> 16)
@@ -103,7 +107,21 @@ static void TransformOne(const int16_t* in, uint8_t* dst) {
     dst += BPS;
   }
 }
+
+// Simplified transform when only in[0], in[1] and in[4] are non-zero
+static void TransformAC3(const int16_t* in, uint8_t* dst) {
+  const int a = in[0] + 4;
+  const int c4 = MUL(in[4], kC2);
+  const int d4 = MUL(in[4], kC1);
+  const int c1 = MUL(in[1], kC2);
+  const int d1 = MUL(in[1], kC1);
+  STORE2(0, a + d4, d1, c1);
+  STORE2(1, a + c4, d1, c1);
+  STORE2(2, a - c4, d1, c1);
+  STORE2(3, a - d4, d1, c1);
+}
 #undef MUL
+#undef STORE2
 
 static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
   TransformOne(in, dst);
@@ -679,6 +697,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride,
 //------------------------------------------------------------------------------
 
 VP8DecIdct2 VP8Transform;
+VP8DecIdct VP8TransformAC3;
 VP8DecIdct VP8TransformUV;
 VP8DecIdct VP8TransformDC;
 VP8DecIdct VP8TransformDCUV;
@@ -697,9 +716,7 @@ VP8SimpleFilterFunc VP8SimpleVFilter16i;
 VP8SimpleFilterFunc VP8SimpleHFilter16i;
 
 extern void VP8DspInitSSE2(void);
-#if defined(WEBP_USE_NEON)
 extern void VP8DspInitNEON(void);
-#endif
 
 void VP8DspInit(void) {
   DspInitTables();
@@ -708,6 +725,7 @@ void VP8DspInit(void) {
   VP8TransformUV = TransformUV;
   VP8TransformDC = TransformDC;
   VP8TransformDCUV = TransformDCUV;
+  VP8TransformAC3 = TransformAC3;
 
   VP8VFilter16 = VFilter16;
   VP8HFilter16 = HFilter16;
@@ -736,6 +754,3 @@ void VP8DspInit(void) {
   }
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dsp/dec_neon.c b/third_party/libwebp/dsp/dec_neon.c
index 5dcd3b7..9c3d8cc 100644
--- a/third_party/libwebp/dsp/dec_neon.c
+++ b/third_party/libwebp/dsp/dec_neon.c
@@ -14,15 +14,11 @@
 
 #include "./dsp.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #if defined(WEBP_USE_NEON)
 
 #include "../dec/vp8i.h"
 
-#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",                  \
+#define QRegs "q0", "q1", "q2", "q3",                                          \
               "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15"
 
 #define FLIP_SIGN_BIT2(a, b, s)                                                \
@@ -101,9 +97,9 @@ static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) {
     "vld1.u8    {q1}, [%[p]], %[stride]        \n"  // p1
     "vld1.u8    {q2}, [%[p]], %[stride]        \n"  // p0
     "vld1.u8    {q3}, [%[p]], %[stride]        \n"  // q0
-    "vld1.u8    {q4}, [%[p]]                   \n"  // q1
+    "vld1.u8    {q12}, [%[p]]                  \n"  // q1
 
-    DO_FILTER2(q1, q2, q3, q4, %[thresh])
+    DO_FILTER2(q1, q2, q3, q12, %[thresh])
 
     "sub        %[p], %[p], %[stride], lsl #1  \n"  // p -= 2 * stride
 
@@ -122,18 +118,18 @@ static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) {
     "add        r5, r4, %[stride]              \n"  // base2 = base1 + stride
 
     LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6)
-    LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6)
-    "vswp       d3, d6                         \n"  // p1:q1 p0:q3
-    "vswp       d5, d8                         \n"  // q0:q2 q1:q4
-    "vswp       q2, q3                         \n"  // p1:q1 p0:q2 q0:q3 q1:q4
+    LOAD8x4(d24, d25, d26, d27, [r4], [r5], r6)
+    "vswp       d3, d24                        \n"  // p1:q1 p0:q3
+    "vswp       d5, d26                        \n"  // q0:q2 q1:q4
+    "vswp       q2, q12                        \n"  // p1:q1 p0:q2 q0:q3 q1:q4
 
-    DO_FILTER2(q1, q2, q3, q4, %[thresh])
+    DO_FILTER2(q1, q2, q12, q13, %[thresh])
 
     "sub        %[p], %[p], #1                 \n"  // p - 1
 
-    "vswp        d5, d6                        \n"
+    "vswp        d5, d24                       \n"
     STORE8x2(d4, d5, [%[p]], %[stride])
-    STORE8x2(d6, d7, [%[p]], %[stride])
+    STORE8x2(d24, d25, [%[p]], %[stride])
 
     : [p] "+r"(p)
     : [stride] "r"(stride), [thresh] "r"(thresh)
@@ -160,7 +156,7 @@ static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) {
 //-----------------------------------------------------------------------------
 // Inverse transforms (Paragraph 14.4)
 
-static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
+static void TransformOne(const int16_t* in, uint8_t* dst) {
   const int kBPS = BPS;
   const int16_t constants[] = {20091, 17734, 0, 0};
   /* kC1, kC2. Padded because vld1.16 loads 8 bytes
@@ -309,13 +305,44 @@ static void TransformOneNEON(const int16_t *in, uint8_t *dst) {
   );
 }
 
-static void TransformTwoNEON(const int16_t* in, uint8_t* dst, int do_two) {
-  TransformOneNEON(in, dst);
+static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) {
+  TransformOne(in, dst);
   if (do_two) {
-    TransformOneNEON(in + 16, dst + 4);
+    TransformOne(in + 16, dst + 4);
   }
 }
 
+static void TransformDC(const int16_t* in, uint8_t* dst) {
+  const int DC = (in[0] + 4) >> 3;
+  const int kBPS = BPS;
+  __asm__ volatile (
+    "vdup.16         q1, %[DC]        \n"
+
+    "vld1.32         d0[0], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d1[0], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d0[1], [%[dst]], %[kBPS]    \n"
+    "vld1.32         d1[1], [%[dst]], %[kBPS]    \n"
+
+    "sub         %[dst], %[dst], %[kBPS], lsl #2 \n"
+
+    // add DC and convert to s16.
+    "vaddw.u8        q2, q1, d0                  \n"
+    "vaddw.u8        q3, q1, d1                  \n"
+    // convert back to u8 with saturation
+    "vqmovun.s16     d0,  q2                     \n"
+    "vqmovun.s16     d1,  q3                     \n"
+
+    "vst1.32         d0[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[0], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d0[1], [%[dst]], %[kBPS]    \n"
+    "vst1.32         d1[1], [%[dst]]             \n"
+    : [in] "+r"(in), [dst] "+r"(dst)  /* modified registers */
+    : [kBPS] "r"(kBPS),   /* constants */
+      [DC] "r"(DC)
+    : "memory", "q0", "q1", "q2", "q3"  /* clobbered */
+  );
+}
+
 static void TransformWHT(const int16_t* in, int16_t* out) {
   const int kStep = 32;  // The store is only incrementing the pointer as if we
                          // had stored a single byte.
@@ -324,39 +351,39 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
     // load data into q0, q1
     "vld1.16         {q0, q1}, [%[in]]           \n"
 
-    "vaddl.s16       q2, d0, d3                  \n" // a0 = in[0] + in[12]
-    "vaddl.s16       q3, d1, d2                  \n" // a1 = in[4] + in[8]
-    "vsubl.s16       q4, d1, d2                  \n" // a2 = in[4] - in[8]
-    "vsubl.s16       q5, d0, d3                  \n" // a3 = in[0] - in[12]
+    "vaddl.s16       q2, d0, d3                  \n"  // a0 = in[0] + in[12]
+    "vaddl.s16       q3, d1, d2                  \n"  // a1 = in[4] + in[8]
+    "vsubl.s16       q10, d1, d2                 \n"  // a2 = in[4] - in[8]
+    "vsubl.s16       q11, d0, d3                 \n"  // a3 = in[0] - in[12]
 
-    "vadd.s32        q0, q2, q3                  \n" // tmp[0] = a0 + a1
-    "vsub.s32        q2, q2, q3                  \n" // tmp[8] = a0 - a1
-    "vadd.s32        q1, q5, q4                  \n" // tmp[4] = a3 + a2
-    "vsub.s32        q3, q5, q4                  \n" // tmp[12] = a3 - a2
+    "vadd.s32        q0, q2, q3                  \n"  // tmp[0] = a0 + a1
+    "vsub.s32        q2, q2, q3                  \n"  // tmp[8] = a0 - a1
+    "vadd.s32        q1, q11, q10                \n"  // tmp[4] = a3 + a2
+    "vsub.s32        q3, q11, q10                \n"  // tmp[12] = a3 - a2
 
     // Transpose
     // q0 = tmp[0, 4, 8, 12], q1 = tmp[2, 6, 10, 14]
     // q2 = tmp[1, 5, 9, 13], q3 = tmp[3, 7, 11, 15]
-    "vswp            d1, d4                      \n" // vtrn.64 q0, q2
-    "vswp            d3, d6                      \n" // vtrn.64 q1, q3
+    "vswp            d1, d4                      \n"  // vtrn.64 q0, q2
+    "vswp            d3, d6                      \n"  // vtrn.64 q1, q3
     "vtrn.32         q0, q1                      \n"
     "vtrn.32         q2, q3                      \n"
 
-    "vmov.s32        q4, #3                      \n" // dc = 3
-    "vadd.s32        q0, q0, q4                  \n" // dc = tmp[0] + 3
-    "vadd.s32        q6, q0, q3                  \n" // a0 = dc + tmp[3]
-    "vadd.s32        q7, q1, q2                  \n" // a1 = tmp[1] + tmp[2]
-    "vsub.s32        q8, q1, q2                  \n" // a2 = tmp[1] - tmp[2]
-    "vsub.s32        q9, q0, q3                  \n" // a3 = dc - tmp[3]
+    "vmov.s32        q10, #3                     \n"  // dc = 3
+    "vadd.s32        q0, q0, q10                 \n"  // dc = tmp[0] + 3
+    "vadd.s32        q12, q0, q3                 \n"  // a0 = dc + tmp[3]
+    "vadd.s32        q13, q1, q2                 \n"  // a1 = tmp[1] + tmp[2]
+    "vsub.s32        q8, q1, q2                  \n"  // a2 = tmp[1] - tmp[2]
+    "vsub.s32        q9, q0, q3                  \n"  // a3 = dc - tmp[3]
 
-    "vadd.s32        q0, q6, q7                  \n"
-    "vshrn.s32       d0, q0, #3                  \n" // (a0 + a1) >> 3
+    "vadd.s32        q0, q12, q13                \n"
+    "vshrn.s32       d0, q0, #3                  \n"  // (a0 + a1) >> 3
     "vadd.s32        q1, q9, q8                  \n"
-    "vshrn.s32       d1, q1, #3                  \n" // (a3 + a2) >> 3
-    "vsub.s32        q2, q6, q7                  \n"
-    "vshrn.s32       d2, q2, #3                  \n" // (a0 - a1) >> 3
+    "vshrn.s32       d1, q1, #3                  \n"  // (a3 + a2) >> 3
+    "vsub.s32        q2, q12, q13                \n"
+    "vshrn.s32       d2, q2, #3                  \n"  // (a0 - a1) >> 3
     "vsub.s32        q3, q9, q8                  \n"
-    "vshrn.s32       d3, q3, #3                  \n" // (a3 - a2) >> 3
+    "vshrn.s32       d3, q3, #3                  \n"  // (a3 - a2) >> 3
 
     // set the results to output
     "vst1.16         d0[0], [%[out]], %[kStep]   \n"
@@ -378,8 +405,8 @@ static void TransformWHT(const int16_t* in, int16_t* out) {
 
     : [out] "+r"(out)  // modified registers
     : [in] "r"(in), [kStep] "r"(kStep)  // constants
-    : "memory", "q0", "q1", "q2", "q3", "q4",
-      "q5", "q6", "q7", "q8", "q9"  // clobbered
+    : "memory", "q0", "q1", "q2", "q3",
+      "q8", "q9", "q10", "q11", "q12", "q13"  // clobbered
   );
 }
 
@@ -392,7 +419,9 @@ extern void VP8DspInitNEON(void);
 
 void VP8DspInitNEON(void) {
 #if defined(WEBP_USE_NEON)
-  VP8Transform = TransformTwoNEON;
+  VP8Transform = TransformTwo;
+  VP8TransformAC3 = TransformOne;  // no special code here
+  VP8TransformDC = TransformDC;
   VP8TransformWHT = TransformWHT;
 
   VP8SimpleVFilter16 = SimpleVFilter16NEON;
@@ -402,6 +431,3 @@ void VP8DspInitNEON(void) {
 #endif   // WEBP_USE_NEON
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dsp/dec_sse2.c b/third_party/libwebp/dsp/dec_sse2.c
index 6be9467..150c559 100644
--- a/third_party/libwebp/dsp/dec_sse2.c
+++ b/third_party/libwebp/dsp/dec_sse2.c
@@ -14,12 +14,12 @@
 
 #include "./dsp.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #if defined(WEBP_USE_SSE2)
 
+// The 3-coeff sparse transform in SSE2 does not seem faster than the plain-C
+// one, so it is disabled by default. Uncomment the following to enable it:
+// #define USE_TRANSFORM_AC3
+
 #include <emmintrin.h>
 #include "../dec/vp8i.h"
 
@@ -201,16 +201,16 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
     __m128i dst0, dst1, dst2, dst3;
     if (do_two) {
       // Load eight bytes/pixels per line.
-      dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]);
-      dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]);
-      dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]);
-      dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]);
+      dst0 = _mm_loadl_epi64((__m128i*)(dst + 0 * BPS));
+      dst1 = _mm_loadl_epi64((__m128i*)(dst + 1 * BPS));
+      dst2 = _mm_loadl_epi64((__m128i*)(dst + 2 * BPS));
+      dst3 = _mm_loadl_epi64((__m128i*)(dst + 3 * BPS));
     } else {
       // Load four bytes/pixels per line.
-      dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]);
-      dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]);
-      dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]);
-      dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]);
+      dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
+      dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
+      dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
+      dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
     }
     // Convert to 16b.
     dst0 = _mm_unpacklo_epi8(dst0, zero);
@@ -230,20 +230,66 @@ static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) {
     // Store the results.
     if (do_two) {
       // Store eight bytes/pixels per line.
-      _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0);
-      _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1);
-      _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2);
-      _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3);
+      _mm_storel_epi64((__m128i*)(dst + 0 * BPS), dst0);
+      _mm_storel_epi64((__m128i*)(dst + 1 * BPS), dst1);
+      _mm_storel_epi64((__m128i*)(dst + 2 * BPS), dst2);
+      _mm_storel_epi64((__m128i*)(dst + 3 * BPS), dst3);
     } else {
       // Store four bytes/pixels per line.
-      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0);
-      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1);
-      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2);
-      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3);
+      *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
+      *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
+      *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
+      *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
     }
   }
 }
 
+#if defined(USE_TRANSFORM_AC3)
+#define MUL(a, b) (((a) * (b)) >> 16)
+static void TransformAC3SSE2(const int16_t* in, uint8_t* dst) {
+  static const int kC1 = 20091 + (1 << 16);
+  static const int kC2 = 35468;
+  const __m128i A = _mm_set1_epi16(in[0] + 4);
+  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
+  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
+  const int c1 = MUL(in[1], kC2);
+  const int d1 = MUL(in[1], kC1);
+  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
+  const __m128i B = _mm_adds_epi16(A, CD);
+  const __m128i m0 = _mm_adds_epi16(B, d4);
+  const __m128i m1 = _mm_adds_epi16(B, c4);
+  const __m128i m2 = _mm_subs_epi16(B, c4);
+  const __m128i m3 = _mm_subs_epi16(B, d4);
+  const __m128i zero = _mm_setzero_si128();
+  // Load the source pixels.
+  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
+  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
+  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
+  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
+  // Convert to 16b.
+  dst0 = _mm_unpacklo_epi8(dst0, zero);
+  dst1 = _mm_unpacklo_epi8(dst1, zero);
+  dst2 = _mm_unpacklo_epi8(dst2, zero);
+  dst3 = _mm_unpacklo_epi8(dst3, zero);
+  // Add the inverse transform.
+  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
+  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
+  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
+  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
+  // Unsigned saturate to 8b.
+  dst0 = _mm_packus_epi16(dst0, dst0);
+  dst1 = _mm_packus_epi16(dst1, dst1);
+  dst2 = _mm_packus_epi16(dst2, dst2);
+  dst3 = _mm_packus_epi16(dst3, dst3);
+  // Store the results.
+  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
+  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
+  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
+  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
+}
+#undef MUL
+#endif   // USE_TRANSFORM_AC3
+
 //------------------------------------------------------------------------------
 // Loop Filter (Paragraph 15)
 
@@ -888,6 +934,9 @@ extern void VP8DspInitSSE2(void);
 void VP8DspInitSSE2(void) {
 #if defined(WEBP_USE_SSE2)
   VP8Transform = TransformSSE2;
+#if defined(USE_TRANSFORM_AC3)
+  VP8TransformAC3 = TransformAC3SSE2;
+#endif
 
   VP8VFilter16 = VFilter16SSE2;
   VP8HFilter16 = HFilter16SSE2;
@@ -905,6 +954,3 @@ void VP8DspInitSSE2(void) {
 #endif   // WEBP_USE_SSE2
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dsp/dsp.h b/third_party/libwebp/dsp/dsp.h
index 31bda89..3be783a 100644
--- a/third_party/libwebp/dsp/dsp.h
+++ b/third_party/libwebp/dsp/dsp.h
@@ -16,14 +16,15 @@
 
 #include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
 //------------------------------------------------------------------------------
 // CPU detection
 
-#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
+#if defined(_MSC_VER) && _MSC_VER > 1310 && \
+    (defined(_M_X64) || defined(_M_IX86))
 #define WEBP_MSC_SSE2  // Visual C++ SSE2 targets
 #endif
 
@@ -85,6 +86,11 @@ typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16],
                                 int n, const struct VP8Matrix* const mtx);
 extern VP8QuantizeBlock VP8EncQuantizeBlock;
 
+// Quantization function specific to the 2nd (WHT) transform:
+typedef int (*VP8QuantizeBlockWHT)(int16_t in[16], int16_t out[16],
+                                   const struct VP8Matrix* const mtx);
+extern VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
+
 // Collect histogram for susceptibility calculation and accumulate in histo[].
 struct VP8Histogram;
 typedef void (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred,
@@ -102,6 +108,7 @@ typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst);
 // when doing two transforms, coeffs is actually int16_t[2][16].
 typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two);
 extern VP8DecIdct2 VP8Transform;
+extern VP8DecIdct VP8TransformAC3;
 extern VP8DecIdct VP8TransformUV;
 extern VP8DecIdct VP8TransformDC;
 extern VP8DecIdct VP8TransformDCUV;
@@ -146,6 +153,8 @@ void VP8DspInit(void);
 
 #define FANCY_UPSAMPLING   // undefined to remove fancy upsampling support
 
+// Convert a pair of y/u/v lines together to the output rgb/a colorspace.
+// bottom_y can be NULL if only one line of output is needed (at top/bottom).
 typedef void (*WebPUpsampleLinePairFunc)(
     const uint8_t* top_y, const uint8_t* bottom_y,
     const uint8_t* top_u, const uint8_t* top_v,
@@ -160,10 +169,8 @@ extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 // Initializes SSE2 version of the fancy upsamplers.
 void WebPInitUpsamplersSSE2(void);
 
-#if defined(WEBP_USE_NEON)
 // NEON version
 void WebPInitUpsamplersNEON(void);
-#endif
 
 #endif    // FANCY_UPSAMPLING
 
@@ -206,14 +213,11 @@ extern void (*WebPApplyAlphaMultiply4444)(
 void WebPInitPremultiply(void);
 
 void WebPInitPremultiplySSE2(void);   // should not be called directly.
-
-#if defined(WEBP_USE_NEON)
 void WebPInitPremultiplyNEON(void);
-#endif
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/dsp/enc.c b/third_party/libwebp/dsp/enc.c
index 068ff141..fcc6ec8 100644
--- a/third_party/libwebp/dsp/enc.c
+++ b/third_party/libwebp/dsp/enc.c
@@ -11,14 +11,12 @@
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
+#include <assert.h>
 #include <stdlib.h>  // for abs()
+
 #include "./dsp.h"
 #include "../enc/vp8enci.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 static WEBP_INLINE uint8_t clip_8b(int v) {
   return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }
@@ -190,7 +188,7 @@ static void ITransformWHT(const int16_t* in, int16_t* out) {
 
 static void FTransformWHT(const int16_t* in, int16_t* out) {
   // input is 12b signed
-  int16_t tmp[16];
+  int32_t tmp[16];
   int i;
   for (i = 0; i < 4; ++i, in += 64) {
     const int a0 = (in[0 * 16] + in[2 * 16]);  // 13b
@@ -652,6 +650,31 @@ static int QuantizeBlock(int16_t in[16], int16_t out[16],
   return (last >= 0);
 }
 
+static int QuantizeBlockWHT(int16_t in[16], int16_t out[16],
+                            const VP8Matrix* const mtx) {
+  int n, last = -1;
+  for (n = 0; n < 16; ++n) {
+    const int j = kZigzag[n];
+    const int sign = (in[j] < 0);
+    const int coeff = sign ? -in[j] : in[j];
+    assert(mtx->sharpen_[j] == 0);
+    if (coeff > mtx->zthresh_[j]) {
+      const int Q = mtx->q_[j];
+      const int iQ = mtx->iq_[j];
+      const int B = mtx->bias_[j];
+      out[n] = QUANTDIV(coeff, iQ, B);
+      if (out[n] > MAX_LEVEL) out[n] = MAX_LEVEL;
+      if (sign) out[n] = -out[n];
+      in[j] = out[n] * Q;
+      if (out[n]) last = n;
+    } else {
+      out[n] = 0;
+      in[j] = 0;
+    }
+  }
+  return (last >= 0);
+}
+
 //------------------------------------------------------------------------------
 // Block copy
 
@@ -686,12 +709,11 @@ VP8Metric VP8SSE4x4;
 VP8WMetric VP8TDisto4x4;
 VP8WMetric VP8TDisto16x16;
 VP8QuantizeBlock VP8EncQuantizeBlock;
+VP8QuantizeBlockWHT VP8EncQuantizeBlockWHT;
 VP8BlockCopy VP8Copy4x4;
 
 extern void VP8EncDspInitSSE2(void);
-#if defined(WEBP_USE_NEON)
 extern void VP8EncDspInitNEON(void);
-#endif
 
 void VP8EncDspInit(void) {
   InitTables();
@@ -712,6 +734,7 @@ void VP8EncDspInit(void) {
   VP8TDisto4x4 = Disto4x4;
   VP8TDisto16x16 = Disto16x16;
   VP8EncQuantizeBlock = QuantizeBlock;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHT;
   VP8Copy4x4 = Copy4x4;
 
   // If defined, use CPUInfo() to overwrite some pointers with faster versions.
@@ -728,6 +751,3 @@ void VP8EncDspInit(void) {
   }
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dsp/enc_neon.c b/third_party/libwebp/dsp/enc_neon.c
index eb256e6..52cca18 100644
--- a/third_party/libwebp/dsp/enc_neon.c
+++ b/third_party/libwebp/dsp/enc_neon.c
@@ -13,10 +13,6 @@
 
 #include "./dsp.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #if defined(WEBP_USE_NEON)
 
 #include "../enc/vp8enci.h"
@@ -493,7 +489,7 @@ static int Disto4x4(const uint8_t* const a, const uint8_t* const b,
     // q12/14 tmp[12-15]
 
     // These are still in 01 45 23 67 order. We fix it easily in the addition
-    // case but the subtraction propegates them.
+    // case but the subtraction propagates them.
     "vswp            d3, d27                  \n"
     "vswp            d19, d31                 \n"
 
@@ -634,6 +630,3 @@ void VP8EncDspInitNEON(void) {
 #endif   // WEBP_USE_NEON
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dsp/enc_sse2.c b/third_party/libwebp/dsp/enc_sse2.c
index 032e990..540a3cb 100644
--- a/third_party/libwebp/dsp/enc_sse2.c
+++ b/third_party/libwebp/dsp/enc_sse2.c
@@ -13,10 +13,6 @@
 
 #include "./dsp.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #if defined(WEBP_USE_SSE2)
 #include <stdlib.h>  // for abs()
 #include <emmintrin.h>
@@ -456,7 +452,7 @@ static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
 }
 
 static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
-  int16_t tmp[16];
+  int32_t tmp[16];
   int i;
   for (i = 0; i < 4; ++i, in += 64) {
     const int a0 = (in[0 * 16] + in[2 * 16]);
@@ -469,22 +465,22 @@ static void FTransformWHTSSE2(const int16_t* in, int16_t* out) {
     tmp[3 + i * 4] = a0 - a1;
   }
   {
-    const __m128i src0 = _mm_loadl_epi64((__m128i*)&tmp[0]);
-    const __m128i src1 = _mm_loadl_epi64((__m128i*)&tmp[4]);
-    const __m128i src2 = _mm_loadl_epi64((__m128i*)&tmp[8]);
-    const __m128i src3 = _mm_loadl_epi64((__m128i*)&tmp[12]);
-    const __m128i a0 = _mm_add_epi16(src0, src2);
-    const __m128i a1 = _mm_add_epi16(src1, src3);
-    const __m128i a2 = _mm_sub_epi16(src1, src3);
-    const __m128i a3 = _mm_sub_epi16(src0, src2);
-    const __m128i b0 = _mm_srai_epi16(_mm_adds_epi16(a0, a1), 1);
-    const __m128i b1 = _mm_srai_epi16(_mm_adds_epi16(a3, a2), 1);
-    const __m128i b2 = _mm_srai_epi16(_mm_subs_epi16(a3, a2), 1);
-    const __m128i b3 = _mm_srai_epi16(_mm_subs_epi16(a0, a1), 1);
-    _mm_storel_epi64((__m128i*)&out[ 0], b0);
-    _mm_storel_epi64((__m128i*)&out[ 4], b1);
-    _mm_storel_epi64((__m128i*)&out[ 8], b2);
-    _mm_storel_epi64((__m128i*)&out[12], b3);
+    const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]);
+    const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]);
+    const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]);
+    const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]);
+    const __m128i a0 = _mm_add_epi32(src0, src2);
+    const __m128i a1 = _mm_add_epi32(src1, src3);
+    const __m128i a2 = _mm_sub_epi32(src1, src3);
+    const __m128i a3 = _mm_sub_epi32(src0, src2);
+    const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);
+    const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);
+    const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);
+    const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1);
+    const __m128i out0 = _mm_packs_epi32(b0, b1);
+    const __m128i out1 = _mm_packs_epi32(b2, b3);
+    _mm_storeu_si128((__m128i*)&out[0], out0);
+    _mm_storeu_si128((__m128i*)&out[8], out1);
   }
 }
 
@@ -644,7 +640,7 @@ static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
   __m128i tmp_0, tmp_1, tmp_2, tmp_3;
   const __m128i zero = _mm_setzero_si128();
 
-  // Load, combine and tranpose inputs.
+  // Load, combine and transpose inputs.
   {
     const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
     const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
@@ -830,8 +826,6 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
   const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
   const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
   const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
-  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
-  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);
 
   // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
   const __m128i sign0 = _mm_srai_epi16(in0, 15);
@@ -894,17 +888,8 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
   in0 = _mm_mullo_epi16(out0, q0);
   in8 = _mm_mullo_epi16(out8, q8);
 
-  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
-  {
-    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
-    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
-    in0 = _mm_and_si128(in0, cmp0);
-    in8 = _mm_and_si128(in8, cmp8);
-    _mm_storeu_si128((__m128i*)&in[0], in0);
-    _mm_storeu_si128((__m128i*)&in[8], in8);
-    out0 = _mm_and_si128(out0, cmp0);
-    out8 = _mm_and_si128(out8, cmp8);
-  }
+  _mm_storeu_si128((__m128i*)&in[0], in0);
+  _mm_storeu_si128((__m128i*)&in[8], in8);
 
   // zigzag the output before storing it.
   //
@@ -941,6 +926,11 @@ static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
   }
 }
 
+static int QuantizeBlockWHTSSE2(int16_t in[16], int16_t out[16],
+                                const VP8Matrix* const mtx) {
+  return QuantizeBlockSSE2(in, out, 0, mtx);
+}
+
 #endif   // WEBP_USE_SSE2
 
 //------------------------------------------------------------------------------
@@ -952,6 +942,7 @@ void VP8EncDspInitSSE2(void) {
 #if defined(WEBP_USE_SSE2)
   VP8CollectHistogram = CollectHistogramSSE2;
   VP8EncQuantizeBlock = QuantizeBlockSSE2;
+  VP8EncQuantizeBlockWHT = QuantizeBlockWHTSSE2;
   VP8ITransform = ITransformSSE2;
   VP8FTransform = FTransformSSE2;
   VP8FTransformWHT = FTransformWHTSSE2;
@@ -964,6 +955,3 @@ void VP8EncDspInitSSE2(void) {
 #endif   // WEBP_USE_SSE2
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dsp/lossless.c b/third_party/libwebp/dsp/lossless.c
index e445924..bab76d2 100644
--- a/third_party/libwebp/dsp/lossless.c
+++ b/third_party/libwebp/dsp/lossless.c
@@ -15,14 +15,7 @@
 
 #include "./dsp.h"
 
-// Define the following if target arch is sure to have SSE2
-// #define WEBP_TARGET_HAS_SSE2
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-#if defined(WEBP_TARGET_HAS_SSE2)
+#if defined(WEBP_USE_SSE2)
 #include <emmintrin.h>
 #endif
 
@@ -235,6 +228,109 @@ const float kSLog2Table[LOG_LOOKUP_IDX_MAX] = {
   2010.27454072f, 2019.69737440f, 2029.12591044f, 2038.56012640f
 };
 
+const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX] = {
+  { 0, 0}, { 0, 0}, { 1, 0}, { 2, 0}, { 3, 0}, { 4, 1}, { 4, 1}, { 5, 1},
+  { 5, 1}, { 6, 2}, { 6, 2}, { 6, 2}, { 6, 2}, { 7, 2}, { 7, 2}, { 7, 2},
+  { 7, 2}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3}, { 8, 3},
+  { 8, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3}, { 9, 3},
+  { 9, 3}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
+  {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4}, {10, 4},
+  {10, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
+  {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4}, {11, 4},
+  {11, 4}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+  {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5}, {12, 5},
+  {12, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+  {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5}, {13, 5},
+  {13, 5}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6}, {14, 6},
+  {14, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6}, {15, 6},
+  {15, 6}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7}, {16, 7},
+  {16, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+  {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7}, {17, 7},
+};
+
+const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX] = {
+   0,  0,  0,  0,  0,  0,  1,  0,  1,  0,  1,  2,  3,  0,  1,  2,  3,
+   0,  1,  2,  3,  4,  5,  6,  7,  0,  1,  2,  3,  4,  5,  6,  7,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
+  127,
+   0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+  16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+  32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+  48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+  64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+  80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+  96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+  112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126
+};
+
 float VP8LFastSLog2Slow(int v) {
   assert(v >= LOG_LOOKUP_IDX_MAX);
   if (v < APPROX_LOG_MAX) {
@@ -287,61 +383,6 @@ static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
   return Average2(Average2(a0, a1), Average2(a2, a3));
 }
 
-#if defined(WEBP_TARGET_HAS_SSE2)
-static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
-                                                   uint32_t c2) {
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
-  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
-  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
-  const __m128i V1 = _mm_add_epi16(C0, C1);
-  const __m128i V2 = _mm_sub_epi16(V1, C2);
-  const __m128i b = _mm_packus_epi16(V2, V2);
-  const uint32_t output = _mm_cvtsi128_si32(b);
-  return output;
-}
-
-static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
-                                                   uint32_t c2) {
-  const uint32_t ave = Average2(c0, c1);
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ave), zero);
-  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
-  const __m128i A1 = _mm_sub_epi16(A0, B0);
-  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
-  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
-  const __m128i A3 = _mm_srai_epi16(A2, 1);
-  const __m128i A4 = _mm_add_epi16(A0, A3);
-  const __m128i A5 = _mm_packus_epi16(A4, A4);
-  const uint32_t output = _mm_cvtsi128_si32(A5);
-  return output;
-}
-
-static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
-  int pa_minus_pb;
-  const __m128i zero = _mm_setzero_si128();
-  const __m128i A0 = _mm_cvtsi32_si128(a);
-  const __m128i B0 = _mm_cvtsi32_si128(b);
-  const __m128i C0 = _mm_cvtsi32_si128(c);
-  const __m128i AC0 = _mm_subs_epu8(A0, C0);
-  const __m128i CA0 = _mm_subs_epu8(C0, A0);
-  const __m128i BC0 = _mm_subs_epu8(B0, C0);
-  const __m128i CB0 = _mm_subs_epu8(C0, B0);
-  const __m128i AC = _mm_or_si128(AC0, CA0);
-  const __m128i BC = _mm_or_si128(BC0, CB0);
-  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
-  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
-  const __m128i diff = _mm_sub_epi16(pb, pa);
-  {
-    int16_t out[8];
-    _mm_storeu_si128((__m128i*)out, diff);
-    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
-  }
-  return (pa_minus_pb <= 0) ? a : b;
-}
-
-#else
-
 static WEBP_INLINE uint32_t Clip255(uint32_t a) {
   if (a < 256) {
     return a;
@@ -396,7 +437,6 @@ static WEBP_INLINE uint32_t Select(uint32_t a, uint32_t b, uint32_t c) {
       Sub3((a      ) & 0xff, (b      ) & 0xff, (c      ) & 0xff);
   return (pa_minus_pb <= 0) ? a : b;
 }
-#endif
 
 //------------------------------------------------------------------------------
 // Predictors
@@ -449,18 +489,19 @@ static uint32_t Predictor10(uint32_t left, const uint32_t* const top) {
   return pred;
 }
 static uint32_t Predictor11(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = Select(top[0], left, top[-1]);
+  const uint32_t pred = VP8LSelect(top[0], left, top[-1]);
   return pred;
 }
 static uint32_t Predictor12(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractFull(left, top[0], top[-1]);
+  const uint32_t pred = VP8LClampedAddSubtractFull(left, top[0], top[-1]);
   return pred;
 }
 static uint32_t Predictor13(uint32_t left, const uint32_t* const top) {
-  const uint32_t pred = ClampedAddSubtractHalf(left, top[0], top[-1]);
+  const uint32_t pred = VP8LClampedAddSubtractHalf(left, top[0], top[-1]);
   return pred;
 }
 
+// TODO(vikasa): Export the predictor array, to allow SSE2 variants.
 typedef uint32_t (*PredictorFunc)(uint32_t left, const uint32_t* const top);
 static const PredictorFunc kPredictors[16] = {
   Predictor0, Predictor1, Predictor2, Predictor3,
@@ -716,21 +757,8 @@ static void PredictorInverseTransform(const VP8LTransform* const transform,
   }
 }
 
-void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
+static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
   int i = 0;
-#if defined(WEBP_TARGET_HAS_SSE2)
-  const __m128i mask = _mm_set1_epi32(0x0000ff00);
-  for (; i + 4 < num_pixs; i += 4) {
-    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
-    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
-    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
-    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
-    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
-    const __m128i out = _mm_sub_epi8(in, in_0g0g);
-    _mm_storeu_si128((__m128i*)&argb_data[i], out);
-  }
-  // fallthrough and finish off with plain-C
-#endif
   for (; i < num_pixs; ++i) {
     const uint32_t argb = argb_data[i];
     const uint32_t green = (argb >> 8) & 0xff;
@@ -742,23 +770,7 @@ void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs) {
 
 // Add green to blue and red channels (i.e. perform the inverse transform of
 // 'subtract green').
-static void AddGreenToBlueAndRed(const VP8LTransform* const transform,
-                                 int y_start, int y_end, uint32_t* data) {
-  const int width = transform->xsize_;
-  const uint32_t* const data_end = data + (y_end - y_start) * width;
-#if defined(WEBP_TARGET_HAS_SSE2)
-  const __m128i mask = _mm_set1_epi32(0x0000ff00);
-  for (; data + 4 < data_end; data += 4) {
-    const __m128i in = _mm_loadu_si128((__m128i*)data);
-    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
-    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
-    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
-    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
-    const __m128i out = _mm_add_epi8(in, in_0g0g);
-    _mm_storeu_si128((__m128i*)data, out);
-  }
-  // fallthrough and finish off with plain-C
-#endif
+static void AddGreenToBlueAndRed(uint32_t* data, const uint32_t* data_end) {
   while (data < data_end) {
     const uint32_t argb = *data;
     const uint32_t green = ((argb >> 8) & 0xff);
@@ -1156,18 +1168,18 @@ COLOR_INDEX_INVERSE(VP8LColorIndexInverseTransformAlpha, uint8_t, GetAlphaIndex,
 void VP8LInverseTransform(const VP8LTransform* const transform,
                           int row_start, int row_end,
                           const uint32_t* const in, uint32_t* const out) {
+  const int width = transform->xsize_;
   assert(row_start < row_end);
   assert(row_end <= transform->ysize_);
   switch (transform->type_) {
     case SUBTRACT_GREEN:
-      AddGreenToBlueAndRed(transform, row_start, row_end, out);
+      VP8LAddGreenToBlueAndRed(out, out + (row_end - row_start) * width);
       break;
     case PREDICTOR_TRANSFORM:
       PredictorInverseTransform(transform, row_start, row_end, out);
       if (row_end != transform->ysize_) {
         // The last predicted row in this iteration will be the top-pred row
         // for the first row in next iteration.
-        const int width = transform->xsize_;
         memcpy(out - width, out + (row_end - row_start - 1) * width,
                width * sizeof(*out));
       }
@@ -1182,7 +1194,7 @@ void VP8LInverseTransform(const VP8LTransform* const transform,
         // Also, note that this is the only transform that applies on
         // the effective width of VP8LSubSampleSize(xsize_, bits_). All other
         // transforms work on effective width of xsize_.
-        const int out_stride = (row_end - row_start) * transform->xsize_;
+        const int out_stride = (row_end - row_start) * width;
         const int in_stride = (row_end - row_start) *
             VP8LSubSampleSize(transform->xsize_, transform->bits_);
         uint32_t* const src = out + out_stride - in_stride;
@@ -1382,6 +1394,139 @@ void VP8LBundleColorMap(const uint8_t* const row, int width,
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
+// TODO(vikasa): Move the SSE2 functions to lossless_dsp.c (new file), once
+// color-space conversion methods (ConvertFromBGRA) are also updated for SSE2.
+#if defined(WEBP_USE_SSE2)
+static WEBP_INLINE uint32_t ClampedAddSubtractFullSSE2(uint32_t c0, uint32_t c1,
+                                                       uint32_t c2) {
+  // Widen each 8-bit channel to 16 bits so that c0 + c1 - c2 cannot wrap.
+  const __m128i kZero = _mm_setzero_si128();
+  const __m128i wide0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), kZero);
+  const __m128i wide1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), kZero);
+  const __m128i wide2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), kZero);
+  const __m128i sum = _mm_sub_epi16(_mm_add_epi16(wide0, wide1), wide2);
+  // The unsigned-saturating pack clamps every channel back into [0, 255].
+  const __m128i packed = _mm_packus_epi16(sum, sum);
+  return _mm_cvtsi128_si32(packed);
+}
+
+static WEBP_INLINE uint32_t ClampedAddSubtractHalfSSE2(uint32_t c0, uint32_t c1,
+                                                       uint32_t c2) {
+  const __m128i kZero = _mm_setzero_si128();
+  const uint32_t mean = Average2(c0, c1);
+  const __m128i avg = _mm_unpacklo_epi8(_mm_cvtsi32_si128(mean), kZero);
+  const __m128i ref = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), kZero);
+  // The compare mask is -1 in lanes where ref > avg; subtracting it adds 1 to
+  // the negative differences so the arithmetic shift rounds toward zero.
+  const __m128i ref_gt_avg = _mm_cmpgt_epi16(ref, avg);
+  const __m128i diff = _mm_sub_epi16(_mm_sub_epi16(avg, ref), ref_gt_avg);
+  const __m128i half_diff = _mm_srai_epi16(diff, 1);
+  const __m128i sum = _mm_add_epi16(avg, half_diff);
+  const __m128i packed = _mm_packus_epi16(sum, sum);  // clamp to [0, 255]
+  return _mm_cvtsi128_si32(packed);
+}
+
+static WEBP_INLINE uint32_t SelectSSE2(uint32_t a, uint32_t b, uint32_t c) {
+  // Returns 'a' when the summed per-channel |b - c| does not exceed the
+  // summed |a - c|, and 'b' otherwise.
+  const __m128i kZero = _mm_setzero_si128();
+  const __m128i va = _mm_cvtsi32_si128(a);
+  const __m128i vb = _mm_cvtsi32_si128(b);
+  const __m128i vc = _mm_cvtsi32_si128(c);
+  // One of the two saturating differences is always zero, so OR-ing them
+  // yields the per-byte absolute difference.
+  const __m128i abs_ac =
+      _mm_or_si128(_mm_subs_epu8(va, vc), _mm_subs_epu8(vc, va));
+  const __m128i abs_bc =
+      _mm_or_si128(_mm_subs_epu8(vb, vc), _mm_subs_epu8(vc, vb));
+  const __m128i pa = _mm_unpacklo_epi8(abs_ac, kZero);  // |a - c|
+  const __m128i pb = _mm_unpacklo_epi8(abs_bc, kZero);  // |b - c|
+  int16_t lanes[8];
+  int pa_minus_pb;
+  _mm_storeu_si128((__m128i*)lanes, _mm_sub_epi16(pb, pa));
+  // Only the four low 16-bit lanes carry the pixel's channels.
+  pa_minus_pb = lanes[0] + lanes[1] + lanes[2] + lanes[3];
+  return (pa_minus_pb <= 0) ? a : b;
+}
+
+static void SubtractGreenFromBlueAndRedSSE2(uint32_t* argb_data, int num_pixs) {
+  // In-place: subtracts green from red and blue (mod 256) for num_pixs pixels.
+  int i = 0;
+  const __m128i mask = _mm_set1_epi32(0x0000ff00);
+  for (; i + 4 <= num_pixs; i += 4) {  // '<=' keeps a final group of 4 in SIMD
+    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
+    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
+    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
+    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
+    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
+    _mm_storeu_si128((__m128i*)&argb_data[i], _mm_sub_epi8(in, in_0g0g));
+  }
+  // Finish the remaining (at most three) pixels with plain-C.
+  for (; i < num_pixs; ++i) {
+    const uint32_t argb = argb_data[i];
+    const uint32_t green = (argb >> 8) & 0xff;
+    const uint32_t new_r = (((argb >> 16) & 0xff) - green) & 0xff;
+    const uint32_t new_b = ((argb & 0xff) - green) & 0xff;
+    argb_data[i] = (argb & 0xff00ff00) | (new_r << 16) | new_b;
+  }
+}
+
+static void AddGreenToBlueAndRedSSE2(uint32_t* data, const uint32_t* data_end) {
+  // In-place: adds green to red and blue (mod 256) for pixels [data, data_end).
+  const __m128i mask = _mm_set1_epi32(0x0000ff00);
+  for (; data + 4 <= data_end; data += 4) {  // '<=' keeps a last group in SIMD
+    const __m128i in = _mm_loadu_si128((__m128i*)data);
+    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
+    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
+    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
+    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
+    _mm_storeu_si128((__m128i*)data, _mm_add_epi8(in, in_0g0g));
+  }
+  // Finish the remaining (at most three) pixels with plain-C.
+  while (data < data_end) {
+    const uint32_t argb = *data;
+    const uint32_t green = ((argb >> 8) & 0xff);
+    uint32_t red_blue = (argb & 0x00ff00ffu);
+    red_blue += (green << 16) | green;
+    red_blue &= 0x00ff00ffu;
+    *data++ = (argb & 0xff00ff00u) | red_blue;
+  }
+}
+
+extern void VP8LDspInitSSE2(void);
+// Redirects the VP8L function pointers to the SSE2 implementations above.
+void VP8LDspInitSSE2(void) {
+  VP8LClampedAddSubtractFull = ClampedAddSubtractFullSSE2;
+  VP8LClampedAddSubtractHalf = ClampedAddSubtractHalfSSE2;
+  VP8LSelect = SelectSSE2;
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRedSSE2;
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRedSSE2;
+}
+#endif
+//------------------------------------------------------------------------------
+
+VP8LPredClampedAddSubFunc VP8LClampedAddSubtractFull;
+VP8LPredClampedAddSubFunc VP8LClampedAddSubtractHalf;
+VP8LPredSelectFunc VP8LSelect;
+VP8LSubtractGreenFromBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+VP8LAddGreenToBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+
+void VP8LDspInit(void) {
+  VP8LClampedAddSubtractFull = ClampedAddSubtractFull;
+  VP8LClampedAddSubtractHalf = ClampedAddSubtractHalf;
+  VP8LSelect = Select;
+  VP8LSubtractGreenFromBlueAndRed = SubtractGreenFromBlueAndRed;
+  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed;
+  // The assignments above install this file's plain-C versions as defaults.
+  // If defined, use CPUInfo() to overwrite some pointers with faster versions.
+  if (VP8GetCPUInfo != NULL) {
+#if defined(WEBP_USE_SSE2)
+    if (VP8GetCPUInfo(kSSE2)) {
+      VP8LDspInitSSE2();
+    }
 #endif
+  }
+}
+
+//------------------------------------------------------------------------------
+
diff --git a/third_party/libwebp/dsp/lossless.h b/third_party/libwebp/dsp/lossless.h
index 7490ec8..0f1d442 100644
--- a/third_party/libwebp/dsp/lossless.h
+++ b/third_party/libwebp/dsp/lossless.h
@@ -18,11 +18,31 @@
 #include "../webp/types.h"
 #include "../webp/decode.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
 //------------------------------------------------------------------------------
+// Signatures and generic function-pointers
+
+typedef uint32_t (*VP8LPredClampedAddSubFunc)(uint32_t c0, uint32_t c1,
+                                              uint32_t c2);
+typedef uint32_t (*VP8LPredSelectFunc)(uint32_t c0, uint32_t c1, uint32_t c2);
+typedef void (*VP8LSubtractGreenFromBlueAndRedFunc)(uint32_t* argb_data,
+                                                    int num_pixs);
+typedef void (*VP8LAddGreenToBlueAndRedFunc)(uint32_t* data_start,
+                                             const uint32_t* data_end);
+
+extern VP8LPredClampedAddSubFunc VP8LClampedAddSubtractFull;
+extern VP8LPredClampedAddSubFunc VP8LClampedAddSubtractHalf;
+extern VP8LPredSelectFunc VP8LSelect;
+extern VP8LSubtractGreenFromBlueAndRedFunc VP8LSubtractGreenFromBlueAndRed;
+extern VP8LAddGreenToBlueAndRedFunc VP8LAddGreenToBlueAndRed;
+
+// Must be called before calling any of the above methods.
+void VP8LDspInit(void);
+
+//------------------------------------------------------------------------------
 // Image transforms.
 
 struct VP8LTransform;  // Defined in dec/vp8li.h.
@@ -42,9 +62,6 @@ void VP8LColorIndexInverseTransformAlpha(
     const struct VP8LTransform* const transform, int y_start, int y_end,
     const uint8_t* src, uint8_t* dst);
 
-// Subtracts green from blue and red channels.
-void VP8LSubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixs);
-
 void VP8LResidualImage(int width, int height, int bits,
                        uint32_t* const argb, uint32_t* const argb_scratch,
                        uint32_t* const image);
@@ -72,8 +89,8 @@ static WEBP_INLINE uint32_t VP8LSubSampleSize(uint32_t size,
 #define LOG_LOOKUP_IDX_MAX 256
 extern const float kLog2Table[LOG_LOOKUP_IDX_MAX];
 extern const float kSLog2Table[LOG_LOOKUP_IDX_MAX];
-extern float VP8LFastLog2Slow(int v);
-extern float VP8LFastSLog2Slow(int v);
+float VP8LFastLog2Slow(int v);
+float VP8LFastSLog2Slow(int v);
 static WEBP_INLINE float VP8LFastLog2(int v) {
   return (v < LOG_LOOKUP_IDX_MAX) ? kLog2Table[v] : VP8LFastLog2Slow(v);
 }
@@ -82,6 +99,105 @@ static WEBP_INLINE float VP8LFastSLog2(int v) {
   return (v < LOG_LOOKUP_IDX_MAX) ? kSLog2Table[v] : VP8LFastSLog2Slow(v);
 }
 
+// -----------------------------------------------------------------------------
+// PrefixEncode()
+
+// use GNU builtins where available.
+#if defined(__GNUC__) && \
+    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  return 31 ^ __builtin_clz(n);
+}
+#elif defined(_MSC_VER) && _MSC_VER > 1310 && \
+      (defined(_M_X64) || defined(_M_IX86))
+#include <intrin.h>
+#pragma intrinsic(_BitScanReverse)
+
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  unsigned long first_set_bit;
+  _BitScanReverse(&first_set_bit, n);
+  return first_set_bit;
+}
+#else
+// Returns (int)floor(log2(n)). n must be > 0.
+// Portable fallback: binary search for the highest set bit, halving the
+// candidate shift (16, 8, 4, 2, 1) on each step.
+static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
+  int result = 0;
+  int shift;
+
+  for (shift = 16; shift > 0; shift >>= 1) {
+    const uint32_t upper = n >> shift;
+    if (upper != 0) {
+      n = upper;
+      result += shift;
+    }
+  }
+  return result;
+}
+#endif
+
+static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
+  // Isolating the lowest set bit gives back 'n' itself only when 'n' is
+  // zero or a power of two; every other value needs one extra bit.
+  const int is_pow2_or_zero = (n == (n & ~(n - 1)));
+  const int log_floor = BitsLog2Floor(n);
+  return is_pow2_or_zero ? log_floor : log_floor + 1;
+}
+
+// Splitting of distance and length codes into prefixes and
+// extra bits. The prefixes are encoded with an entropy code
+// while the extra bits are stored just as normal bits.
+static WEBP_INLINE void VP8LPrefixEncodeBitsNoLUT(int distance, int* const code,
+                                                  int* const extra_bits) {
+  const int d = distance - 1;  // the prefix is computed on (distance - 1)
+  const int top = BitsLog2Floor(d);
+  *extra_bits = top - 1;
+  *code = 2 * top + ((d >> (top - 1)) & 1);  // 2 * msb + next-highest bit
+}
+
+static WEBP_INLINE void VP8LPrefixEncodeNoLUT(int distance, int* const code,
+                                              int* const extra_bits,
+                                              int* const extra_bits_value) {
+  const int d = distance - 1;  // the prefix is computed on (distance - 1)
+  const int top = BitsLog2Floor(d);
+  *extra_bits = top - 1;
+  *extra_bits_value = d & ((1 << (top - 1)) - 1);  // low (top - 1) bits of d
+  *code = 2 * top + ((d >> (top - 1)) & 1);        // 2 * msb + next bit
+}
+
+#define PREFIX_LOOKUP_IDX_MAX   512
+typedef struct {
+  int8_t code_;
+  int8_t extra_bits_;
+} VP8LPrefixCode;
+
+// These tables are derived using VP8LPrefixEncodeNoLUT.
+extern const VP8LPrefixCode kPrefixEncodeCode[PREFIX_LOOKUP_IDX_MAX];
+extern const uint8_t kPrefixEncodeExtraBitsValue[PREFIX_LOOKUP_IDX_MAX];
+static WEBP_INLINE void VP8LPrefixEncodeBits(int distance, int* const code,
+                                             int* const extra_bits) {
+  if (distance >= PREFIX_LOOKUP_IDX_MAX) {
+    // Beyond the precomputed table: derive the prefix arithmetically.
+    VP8LPrefixEncodeBitsNoLUT(distance, code, extra_bits);
+    return;
+  }
+  *code = kPrefixEncodeCode[distance].code_;
+  *extra_bits = kPrefixEncodeCode[distance].extra_bits_;
+}
+
+static WEBP_INLINE void VP8LPrefixEncode(int distance, int* const code,
+                                         int* const extra_bits,
+                                         int* const extra_bits_value) {
+  if (distance >= PREFIX_LOOKUP_IDX_MAX) {
+    // Beyond the precomputed tables: derive all three values arithmetically.
+    VP8LPrefixEncodeNoLUT(distance, code, extra_bits, extra_bits_value);
+    return;
+  }
+  *code = kPrefixEncodeCode[distance].code_;
+  *extra_bits = kPrefixEncodeCode[distance].extra_bits_;
+  *extra_bits_value = kPrefixEncodeExtraBitsValue[distance];
+}
 
 // In-place difference of each component with mod 256.
 static WEBP_INLINE uint32_t VP8LSubPixels(uint32_t a, uint32_t b) {
@@ -97,7 +213,7 @@ void VP8LBundleColorMap(const uint8_t* const row, int width,
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/dsp/upsampling.c b/third_party/libwebp/dsp/upsampling.c
index 80ba4f8..978e3ce 100644
--- a/third_party/libwebp/dsp/upsampling.c
+++ b/third_party/libwebp/dsp/upsampling.c
@@ -14,9 +14,7 @@
 #include "./dsp.h"
 #include "./yuv.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include <assert.h>
 
 //------------------------------------------------------------------------------
 // Fancy upsampler
@@ -45,11 +43,12 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
   const int last_pixel_pair = (len - 1) >> 1;                                  \
   uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
   uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
-  if (top_y) {                                                                 \
+  assert(top_y != NULL);                                                       \
+  {                                                                            \
     const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
     FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
   }                                                                            \
-  if (bottom_y) {                                                              \
+  if (bottom_y != NULL) {                                                      \
     const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
     FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
   }                                                                            \
@@ -60,7 +59,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
     const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
     const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
     const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
-    if (top_y) {                                                               \
+    {                                                                          \
       const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
       const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
       FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
@@ -68,7 +67,7 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
       FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
            top_dst + (2 * x - 0) * XSTEP);                                     \
     }                                                                          \
-    if (bottom_y) {                                                            \
+    if (bottom_y != NULL) {                                                    \
       const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
       const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
       FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
@@ -80,12 +79,12 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
     l_uv = uv;                                                                 \
   }                                                                            \
   if (!(len & 1)) {                                                            \
-    if (top_y) {                                                               \
+    {                                                                          \
       const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
       FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
            top_dst + (len - 1) * XSTEP);                                       \
     }                                                                          \
-    if (bottom_y) {                                                            \
+    if (bottom_y != NULL) {                                                    \
       const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
       FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
            bottom_dst + (len - 1) * XSTEP);                                    \
@@ -168,7 +167,8 @@ static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bot_y,              \
                       uint8_t* top_dst, uint8_t* bot_dst, int len) {           \
   const int half_len = len >> 1;                                               \
   int x;                                                                       \
-  if (top_dst != NULL) {                                                       \
+  assert(top_dst != NULL);                                                     \
+  {                                                                            \
     for (x = 0; x < half_len; ++x) {                                           \
       FUNC(top_y[2 * x + 0], top_u[x], top_v[x], top_dst + 8 * x + 0);         \
       FUNC(top_y[2 * x + 1], top_u[x], top_v[x], top_dst + 8 * x + 4);         \
@@ -364,6 +364,3 @@ void WebPInitPremultiply(void) {
 #endif  // FANCY_UPSAMPLING
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/dsp/upsampling_neon.c b/third_party/libwebp/dsp/upsampling_neon.c
index d118895..791222f 100644
--- a/third_party/libwebp/dsp/upsampling_neon.c
+++ b/third_party/libwebp/dsp/upsampling_neon.c
@@ -14,10 +14,6 @@
 
 #include "./dsp.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #if defined(WEBP_USE_NEON)
 
 #include <assert.h>
@@ -27,6 +23,9 @@ extern "C" {
 
 #ifdef FANCY_UPSAMPLING
 
+//-----------------------------------------------------------------------------
+// U/V upsampling
+
 // Loads 9 pixels each from rows r1 and r2 and generates 16 pixels.
 #define UPSAMPLE_16PIXELS(r1, r2, out) {                                \
   uint8x8_t a = vld1_u8(r1);                                            \
@@ -85,125 +84,90 @@ static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
   Upsample16Pixels(r1, r2, out);                                        \
 }
 
-#define CY  76283
-#define CVR 89858
-#define CUG 22014
-#define CVG 45773
-#define CUB 113618
-
-static const int16_t coef[4] = { CVR / 4, CUG, CVG / 2, CUB / 4 };
+//-----------------------------------------------------------------------------
+// YUV->RGB conversion
 
-#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) {            \
-  int i;                                                                \
-  for (i = 0; i < N; i += 8) {                                          \
-    int off = ((cur_x) + i) * XSTEP;                                    \
-    uint8x8_t y  = vld1_u8(src_y + (cur_x)  + i);                       \
-    uint8x8_t u  = vld1_u8((src_uv) + i);                               \
-    uint8x8_t v  = vld1_u8((src_uv) + i + 16);                          \
-    int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));             \
-    int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));            \
-    int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));            \
-                                                                        \
-    int16x8_t ud = vshlq_n_s16(uu, 1);                                  \
-    int16x8_t vd = vshlq_n_s16(vv, 1);                                  \
-                                                                        \
-    int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1),  \
-                                     vget_low_s16(vd),  cf16, 0);       \
-    int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1), \
-                                     vget_high_s16(vd), cf16, 0);       \
-    int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16),                  \
-                                vrshrn_n_s32(vrh, 16));                 \
-                                                                        \
-    int32x4_t vl = vmovl_s16(vget_low_s16(vv));                         \
-    int32x4_t vh = vmovl_s16(vget_high_s16(vv));                        \
-    int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu),  cf16, 1);     \
-    int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);     \
-    int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv),  cf16, 2);  \
-    int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);  \
-    int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16),                  \
-                                vrshrn_n_s32(gch, 16));                 \
-                                                                        \
-    int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1),  \
-                                     vget_low_s16(ud),  cf16, 3);       \
-    int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1), \
-                                     vget_high_s16(ud), cf16, 3);       \
-    int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16),                  \
-                                vrshrn_n_s32(ubh, 16));                 \
-                                                                        \
-    int32x4_t rl = vaddl_s16(vget_low_s16(yy),  vget_low_s16(vr));      \
-    int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));     \
-    int32x4_t gl = vsubl_s16(vget_low_s16(yy),  vget_low_s16(gc));      \
-    int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));     \
-    int32x4_t bl = vaddl_s16(vget_low_s16(yy),  vget_low_s16(ub));      \
-    int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));     \
-                                                                        \
-    rl = vmulq_lane_s32(rl, cf32, 0);                                   \
-    rh = vmulq_lane_s32(rh, cf32, 0);                                   \
-    gl = vmulq_lane_s32(gl, cf32, 0);                                   \
-    gh = vmulq_lane_s32(gh, cf32, 0);                                   \
-    bl = vmulq_lane_s32(bl, cf32, 0);                                   \
-    bh = vmulq_lane_s32(bh, cf32, 0);                                   \
-                                                                        \
-    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16),                  \
-                                 vrshrn_n_s32(rh, 16)));                \
-    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16),                  \
-                                 vrshrn_n_s32(gh, 16)));                \
-    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16),                  \
-                                 vrshrn_n_s32(bh, 16)));                \
-    STR_ ## FMT(out + off, y, u, v);                                    \
-  }                                                                     \
-}
+static const int16_t kCoeffs[4] = { kYScale, kVToR, kUToG, kVToG };
 
 #define v255 vmov_n_u8(255)
 
-#define STR_Rgb(out, r, g, b) do {                                      \
+#define STORE_Rgb(out, r, g, b) do {                                    \
   const uint8x8x3_t r_g_b = {{ r, g, b }};                              \
   vst3_u8(out, r_g_b);                                                  \
 } while (0)
 
-#define STR_Bgr(out, r, g, b) do {                                      \
+#define STORE_Bgr(out, r, g, b) do {                                    \
   const uint8x8x3_t b_g_r = {{ b, g, r }};                              \
   vst3_u8(out, b_g_r);                                                  \
 } while (0)
 
-#define STR_Rgba(out, r, g, b) do {                                     \
+#define STORE_Rgba(out, r, g, b) do {                                   \
   const uint8x8x4_t r_g_b_v255 = {{ r, g, b, v255 }};                   \
   vst4_u8(out, r_g_b_v255);                                             \
 } while (0)
 
-#define STR_Bgra(out, r, g, b) do {                                     \
+#define STORE_Bgra(out, r, g, b) do {                                   \
   const uint8x8x4_t b_g_r_v255 = {{ b, g, r, v255 }};                   \
   vst4_u8(out, b_g_r_v255);                                             \
 } while (0)
 
-#define CONVERT1(FMT, XSTEP, N, src_y, src_uv, rgb, cur_x) {            \
+#define CONVERT8(FMT, XSTEP, N, src_y, src_uv, out, cur_x) {            \
+  int i;                                                                \
+  for (i = 0; i < N; i += 8) {                                          \
+    const int off = ((cur_x) + i) * XSTEP;                              \
+    uint8x8_t y  = vld1_u8((src_y) + (cur_x)  + i);                     \
+    uint8x8_t u  = vld1_u8((src_uv) + i);                               \
+    uint8x8_t v  = vld1_u8((src_uv) + i + 16);                          \
+    const int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));       \
+    const int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));      \
+    const int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));      \
+    int32x4_t yl = vmull_lane_s16(vget_low_s16(yy),  cf16, 0);          \
+    int32x4_t yh = vmull_lane_s16(vget_high_s16(yy), cf16, 0);          \
+    const int32x4_t rl = vmlal_lane_s16(yl, vget_low_s16(vv),  cf16, 1);\
+    const int32x4_t rh = vmlal_lane_s16(yh, vget_high_s16(vv), cf16, 1);\
+    int32x4_t gl = vmlsl_lane_s16(yl, vget_low_s16(uu),  cf16, 2);      \
+    int32x4_t gh = vmlsl_lane_s16(yh, vget_high_s16(uu), cf16, 2);      \
+    const int32x4_t bl = vmovl_s16(vget_low_s16(uu));                   \
+    const int32x4_t bh = vmovl_s16(vget_high_s16(uu));                  \
+    gl = vmlsl_lane_s16(gl, vget_low_s16(vv),  cf16, 3);                \
+    gh = vmlsl_lane_s16(gh, vget_high_s16(vv), cf16, 3);                \
+    yl = vmlaq_lane_s32(yl, bl, cf32, 0);                               \
+    yh = vmlaq_lane_s32(yh, bh, cf32, 0);                               \
+    /* vrshrn_n_s32() already incorporates the rounding constant */     \
+    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, YUV_FIX2),            \
+                                 vrshrn_n_s32(rh, YUV_FIX2)));          \
+    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, YUV_FIX2),            \
+                                 vrshrn_n_s32(gh, YUV_FIX2)));          \
+    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(yl, YUV_FIX2),            \
+                                 vrshrn_n_s32(yh, YUV_FIX2)));          \
+    STORE_ ## FMT(out + off, y, u, v);                                  \
+  }                                                                     \
+}
+
+#define CONVERT1(FUNC, XSTEP, N, src_y, src_uv, rgb, cur_x) {           \
   int i;                                                                \
   for (i = 0; i < N; i++) {                                             \
-    int off = ((cur_x) + i) * XSTEP;                                    \
-    int y = src_y[(cur_x) + i];                                         \
-    int u = (src_uv)[i];                                                \
-    int v = (src_uv)[i + 16];                                           \
-    VP8YuvTo ## FMT(y, u, v, rgb + off);                                \
+    const int off = ((cur_x) + i) * XSTEP;                              \
+    const int y = src_y[(cur_x) + i];                                   \
+    const int u = (src_uv)[i];                                          \
+    const int v = (src_uv)[i + 16];                                     \
+    FUNC(y, u, v, rgb + off);                                           \
   }                                                                     \
 }
 
 #define CONVERT2RGB_8(FMT, XSTEP, top_y, bottom_y, uv,                  \
                       top_dst, bottom_dst, cur_x, len) {                \
-  if (top_y) {                                                          \
-    CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x)                \
-  }                                                                     \
-  if (bottom_y) {                                                       \
+  CONVERT8(FMT, XSTEP, len, top_y, uv, top_dst, cur_x)                  \
+  if (bottom_y != NULL) {                                               \
     CONVERT8(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x)   \
   }                                                                     \
 }
 
-#define CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, uv,                  \
+#define CONVERT2RGB_1(FUNC, XSTEP, top_y, bottom_y, uv,                 \
                       top_dst, bottom_dst, cur_x, len) {                \
-  if (top_y) {                                                          \
-    CONVERT1(FMT, XSTEP, len, top_y, uv, top_dst, cur_x);               \
-  }                                                                     \
-  if (bottom_y) {                                                       \
-    CONVERT1(FMT, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x);  \
+  CONVERT1(FUNC, XSTEP, len, top_y, uv, top_dst, cur_x);                \
+  if (bottom_y != NULL) {                                               \
+    CONVERT1(FUNC, XSTEP, len, bottom_y, (uv) + 32, bottom_dst, cur_x); \
   }                                                                     \
 }
 
@@ -225,18 +189,19 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
   const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                  \
   const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                  \
                                                                         \
-  const int16x4_t cf16 = vld1_s16(coef);                                \
-  const int32x2_t cf32 = vmov_n_s32(CY);                                \
+  const int16x4_t cf16 = vld1_s16(kCoeffs);                             \
+  const int32x2_t cf32 = vmov_n_s32(kUToB);                             \
   const uint8x8_t u16  = vmov_n_u8(16);                                 \
   const uint8x8_t u128 = vmov_n_u8(128);                                \
                                                                         \
   /* Treat the first pixel in regular way */                            \
-  if (top_y) {                                                          \
+  assert(top_y != NULL);                                                \
+  {                                                                     \
     const int u0 = (top_u[0] + u_diag) >> 1;                            \
     const int v0 = (top_v[0] + v_diag) >> 1;                            \
     VP8YuvTo ## FMT(top_y[0], u0, v0, top_dst);                         \
   }                                                                     \
-  if (bottom_y) {                                                       \
+  if (bottom_y != NULL) {                                               \
     const int u0 = (cur_u[0] + u_diag) >> 1;                            \
     const int v0 = (cur_v[0] + v_diag) >> 1;                            \
     VP8YuvTo ## FMT(bottom_y[0], u0, v0, bottom_dst);                   \
@@ -255,7 +220,7 @@ static void FUNC_NAME(const uint8_t *top_y, const uint8_t *bottom_y,    \
                                                                         \
   UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv);                    \
   UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 16);               \
-  CONVERT2RGB_1(FMT, XSTEP, top_y, bottom_y, r_uv,                      \
+  CONVERT2RGB_1(VP8YuvTo ## FMT, XSTEP, top_y, bottom_y, r_uv,          \
                 top_dst, bottom_dst, last_pos, len - last_pos);         \
 }
 
@@ -271,6 +236,8 @@ NEON_UPSAMPLE_FUNC(UpsampleBgraLinePairNEON, Bgra, 4)
 
 //------------------------------------------------------------------------------
 
+#ifdef FANCY_UPSAMPLING
+
 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 
 void WebPInitUpsamplersNEON(void) {
@@ -289,6 +256,10 @@ void WebPInitPremultiplyNEON(void) {
 #endif   // WEBP_USE_NEON
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
+#else
+
+// This empty function only exists so that the object file is not empty.
+void WebPInitPremultiplyNEON(void) {}
+
+#endif  // FANCY_UPSAMPLING
+
diff --git a/third_party/libwebp/dsp/upsampling_sse2.c b/third_party/libwebp/dsp/upsampling_sse2.c
index f31d048..0db0798 100644
--- a/third_party/libwebp/dsp/upsampling_sse2.c
+++ b/third_party/libwebp/dsp/upsampling_sse2.c
@@ -13,10 +13,6 @@
 
 #include "./dsp.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #if defined(WEBP_USE_SSE2)
 
 #include <assert.h>
@@ -51,7 +47,7 @@ extern "C" {
   (out) = _mm_sub_epi8(tmp0, tmp4);    /* (k + in + 1) / 2 - lsb_correction */ \
 } while (0)
 
-// pack and store two alterning pixel rows
+// pack and store two alternating pixel rows
 #define PACK_AND_STORE(a, b, da, db, out) do {                                 \
   const __m128i t_a = _mm_avg_epu8(a, da);  /* (9a + 3b + 3c +  d + 8) / 16 */ \
   const __m128i t_b = _mm_avg_epu8(b, db);  /* (3a + 9b +  c + 3d + 8) / 16 */ \
@@ -87,8 +83,8 @@ extern "C" {
   GET_M(ad, s, diag2);                  /* diag2 = (3a + b + c + 3d) / 8 */    \
                                                                                \
   /* pack the alternate pixels */                                              \
-  PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]);                          \
-  PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]);                          \
+  PACK_AND_STORE(a, b, diag1, diag2, out +      0);  /* store top */           \
+  PACK_AND_STORE(c, d, diag2, diag1, out + 2 * 32);  /* store bottom */        \
 }
 
 // Turn the macro into a function for reducing code-size when non-critical
@@ -108,69 +104,68 @@ static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[],
   Upsample32Pixels(r1, r2, out);                                               \
 }
 
-#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, uv,                          \
+#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y,                              \
                     top_dst, bottom_dst, cur_x, num_pixels) {                  \
   int n;                                                                       \
-  if (top_y) {                                                                 \
-    for (n = 0; n < (num_pixels); ++n) {                                       \
-      FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n],                          \
-           top_dst + ((cur_x) + n) * XSTEP);                                   \
-    }                                                                          \
+  for (n = 0; n < (num_pixels); ++n) {                                         \
+    FUNC(top_y[(cur_x) + n], r_u[n], r_v[n],                                   \
+         top_dst + ((cur_x) + n) * XSTEP);                                     \
   }                                                                            \
-  if (bottom_y) {                                                              \
+  if (bottom_y != NULL) {                                                      \
     for (n = 0; n < (num_pixels); ++n) {                                       \
-      FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n],             \
+      FUNC(bottom_y[(cur_x) + n], r_u[64 + n], r_v[64 + n],                    \
            bottom_dst + ((cur_x) + n) * XSTEP);                                \
     }                                                                          \
   }                                                                            \
 }
 
+#define CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y,                           \
+                       top_dst, bottom_dst, cur_x) do {                        \
+  FUNC##32(top_y + (cur_x), r_u, r_v, top_dst + (cur_x) * XSTEP);              \
+  if (bottom_y != NULL) {                                                      \
+    FUNC##32(bottom_y + (cur_x), r_u + 64, r_v + 64,                           \
+             bottom_dst + (cur_x) * XSTEP);                                    \
+  }                                                                            \
+} while (0)
+
 #define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP)                             \
 static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,           \
                       const uint8_t* top_u, const uint8_t* top_v,              \
                       const uint8_t* cur_u, const uint8_t* cur_v,              \
                       uint8_t* top_dst, uint8_t* bottom_dst, int len) {        \
-  int block;                                                                   \
-  /* 16 byte aligned array to cache reconstructed u and v */                   \
+  int uv_pos, pos;                                                             \
+  /* 16byte-aligned array to cache reconstructed u and v */                    \
   uint8_t uv_buf[4 * 32 + 15];                                                 \
-  uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);            \
-  const int uv_len = (len + 1) >> 1;                                           \
-  /* 17 pixels must be read-able for each block */                             \
-  const int num_blocks = (uv_len - 1) >> 4;                                    \
-  const int leftover = uv_len - num_blocks * 16;                               \
-  const int last_pos = 1 + 32 * num_blocks;                                    \
-                                                                               \
-  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                         \
-  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                         \
+  uint8_t* const r_u = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);             \
+  uint8_t* const r_v = r_u + 32;                                               \
                                                                                \
-  assert(len > 0);                                                             \
-  /* Treat the first pixel in regular way */                                   \
-  if (top_y) {                                                                 \
-    const int u0 = (top_u[0] + u_diag) >> 1;                                   \
-    const int v0 = (top_v[0] + v_diag) >> 1;                                   \
-    FUNC(top_y[0], u0, v0, top_dst);                                           \
+  assert(top_y != NULL);                                                       \
+  {   /* Treat the first pixel in regular way */                               \
+    const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;                       \
+    const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;                       \
+    const int u0_t = (top_u[0] + u_diag) >> 1;                                 \
+    const int v0_t = (top_v[0] + v_diag) >> 1;                                 \
+    FUNC(top_y[0], u0_t, v0_t, top_dst);                                       \
+    if (bottom_y != NULL) {                                                    \
+      const int u0_b = (cur_u[0] + u_diag) >> 1;                               \
+      const int v0_b = (cur_v[0] + v_diag) >> 1;                               \
+      FUNC(bottom_y[0], u0_b, v0_b, bottom_dst);                               \
+    }                                                                          \
   }                                                                            \
-  if (bottom_y) {                                                              \
-    const int u0 = (cur_u[0] + u_diag) >> 1;                                   \
-    const int v0 = (cur_v[0] + v_diag) >> 1;                                   \
-    FUNC(bottom_y[0], u0, v0, bottom_dst);                                     \
+  /* For UPSAMPLE_32PIXELS, 17 u/v values must be readable for each block */   \
+  for (pos = 1, uv_pos = 0; pos + 32 + 1 <= len; pos += 32, uv_pos += 16) {    \
+    UPSAMPLE_32PIXELS(top_u + uv_pos, cur_u + uv_pos, r_u);                    \
+    UPSAMPLE_32PIXELS(top_v + uv_pos, cur_v + uv_pos, r_v);                    \
+    CONVERT2RGB_32(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst, pos);    \
   }                                                                            \
-                                                                               \
-  for (block = 0; block < num_blocks; ++block) {                               \
-    UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32);                            \
-    UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32);                            \
-    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst,       \
-                32 * block + 1, 32)                                            \
-    top_u += 16;                                                               \
-    cur_u += 16;                                                               \
-    top_v += 16;                                                               \
-    cur_v += 16;                                                               \
+  if (len > 1) {                                                               \
+    const int left_over = ((len + 1) >> 1) - (pos >> 1);                       \
+    assert(left_over > 0);                                                     \
+    UPSAMPLE_LAST_BLOCK(top_u + uv_pos, cur_u + uv_pos, left_over, r_u);       \
+    UPSAMPLE_LAST_BLOCK(top_v + uv_pos, cur_v + uv_pos, left_over, r_v);       \
+    CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, top_dst, bottom_dst,             \
+                pos, len - pos);                                               \
   }                                                                            \
-                                                                               \
-  UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32);                  \
-  UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32);                  \
-  CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst,         \
-              last_pos, len - last_pos);                                       \
 }
 
 // SSE2 variants of the fancy upsampler.
@@ -184,6 +179,7 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
 #undef UPSAMPLE_32PIXELS
 #undef UPSAMPLE_LAST_BLOCK
 #undef CONVERT2RGB
+#undef CONVERT2RGB_32
 #undef SSE2_UPSAMPLE_FUNC
 
 #endif  // FANCY_UPSAMPLING
@@ -192,10 +188,13 @@ SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4)
 
 //------------------------------------------------------------------------------
 
+#ifdef FANCY_UPSAMPLING
+
 extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */];
 
 void WebPInitUpsamplersSSE2(void) {
 #if defined(WEBP_USE_SSE2)
+  VP8YUVInitSSE2();
   WebPUpsamplers[MODE_RGB]  = UpsampleRgbLinePairSSE2;
   WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2;
   WebPUpsamplers[MODE_BGR]  = UpsampleBgrLinePairSSE2;
@@ -210,8 +209,10 @@ void WebPInitPremultiplySSE2(void) {
 #endif   // WEBP_USE_SSE2
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
+#else
+
+// This empty function only exists so that the object file is not empty.
+void WebPInitPremultiplySSE2(void) {}
 
+#endif  // FANCY_UPSAMPLING
 
diff --git a/third_party/libwebp/dsp/yuv.c b/third_party/libwebp/dsp/yuv.c
index 1a59f74..4f9cafc 100644
--- a/third_party/libwebp/dsp/yuv.c
+++ b/third_party/libwebp/dsp/yuv.c
@@ -13,16 +13,8 @@
 
 #include "./yuv.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-#ifdef WEBP_YUV_USE_TABLE
 
-int16_t VP8kVToR[256], VP8kUToB[256];
-int32_t VP8kVToG[256], VP8kUToG[256];
-uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+#if defined(WEBP_YUV_USE_TABLE)
 
 static int done = 0;
 
@@ -30,6 +22,11 @@ static WEBP_INLINE uint8_t clip(int v, int max_value) {
   return v < 0 ? 0 : v > max_value ? max_value : v;
 }
 
+int16_t VP8kVToR[256], VP8kUToB[256];
+int32_t VP8kVToG[256], VP8kUToG[256];
+uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+
 void VP8YUVInit(void) {
   int i;
   if (done) {
@@ -70,6 +67,141 @@ void VP8YUVInit(void) {}
 
 #endif  // WEBP_YUV_USE_TABLE
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
+//-----------------------------------------------------------------------------
+// SSE2 extras
+
+#if defined(WEBP_USE_SSE2)
+
+#ifdef FANCY_UPSAMPLING
+
+#include <emmintrin.h>
+#include <string.h>   // for memcpy
+
+typedef union {   // handy union to access SSE2 register lanes
+  int32_t i32[4];
+  uint8_t u8[16];
+  __m128i m;
+} VP8kCstSSE2;
+
+static int done_sse2 = 0;
+static VP8kCstSSE2 VP8kUtoRGBA[256], VP8kVtoRGBA[256], VP8kYtoRGBA[256];
+
+void VP8YUVInitSSE2(void) {
+  if (!done_sse2) {
+    int i;
+    for (i = 0; i < 256; ++i) {
+      VP8kYtoRGBA[i].i32[0] =
+        VP8kYtoRGBA[i].i32[1] =
+        VP8kYtoRGBA[i].i32[2] = (i - 16) * kYScale + YUV_HALF2;
+      VP8kYtoRGBA[i].i32[3] = 0xff << YUV_FIX2;
+
+      VP8kUtoRGBA[i].i32[0] = 0;
+      VP8kUtoRGBA[i].i32[1] = -kUToG * (i - 128);
+      VP8kUtoRGBA[i].i32[2] =  kUToB * (i - 128);
+      VP8kUtoRGBA[i].i32[3] = 0;
+
+      VP8kVtoRGBA[i].i32[0] =  kVToR * (i - 128);
+      VP8kVtoRGBA[i].i32[1] = -kVToG * (i - 128);
+      VP8kVtoRGBA[i].i32[2] = 0;
+      VP8kVtoRGBA[i].i32[3] = 0;
+    }
+    done_sse2 = 1;
+  }
+}
+
+static WEBP_INLINE __m128i VP8GetRGBA32b(int y, int u, int v) {
+  const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
+  const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
+  const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
+  const __m128i uv_part = _mm_add_epi32(u_part, v_part);
+  const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
+  const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
+  return rgba2;
+}
+
+static WEBP_INLINE void VP8YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
+                                        uint8_t* const rgb) {
+  const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
+  const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);
+  const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);
+  // Note: this stores 8 bytes, not 3: the 5 bytes after the pixel get stomped!
+  _mm_storel_epi64((__m128i*)rgb, tmp2);
+}
+
+static WEBP_INLINE void VP8YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v,
+                                        uint8_t* const bgr) {
+  const __m128i tmp0 = VP8GetRGBA32b(y, u, v);
+  const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2));
+  const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1);
+  const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2);
+  // Note: this stores 8 bytes, not 3: the 5 bytes after the pixel get stomped!
+  _mm_storel_epi64((__m128i*)bgr, tmp3);
+}
+
+void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst) {
+  int n;
+  for (n = 0; n < 32; n += 4) {
+    const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
+    const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
+    const __m128i tmp0_3 = VP8GetRGBA32b(y[n + 2], u[n + 2], v[n + 2]);
+    const __m128i tmp0_4 = VP8GetRGBA32b(y[n + 3], u[n + 3], v[n + 3]);
+    const __m128i tmp1_1 = _mm_packs_epi32(tmp0_1, tmp0_2);
+    const __m128i tmp1_2 = _mm_packs_epi32(tmp0_3, tmp0_4);
+    const __m128i tmp2 = _mm_packus_epi16(tmp1_1, tmp1_2);
+    _mm_storeu_si128((__m128i*)dst, tmp2);
+    dst += 4 * 4;
+  }
+}
+
+void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst) {
+  int n;
+  for (n = 0; n < 32; n += 2) {
+    const __m128i tmp0_1 = VP8GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]);
+    const __m128i tmp0_2 = VP8GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]);
+    const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2));
+    const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2));
+    const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2);
+    const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1);
+    _mm_storel_epi64((__m128i*)dst, tmp3);
+    dst += 4 * 2;
+  }
+}
+
+void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst) {
+  int n;
+  uint8_t tmp0[2 * 3 + 5 + 15];
+  uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15);  // align
+  for (n = 0; n < 30; ++n) {   // we directly stomp the *dst memory
+    VP8YuvToRgbSSE2(y[n], u[n], v[n], dst + n * 3);
+  }
+  // Last two pixels are special: we write in a tmp buffer before sending
+  // to dst.
+  VP8YuvToRgbSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
+  VP8YuvToRgbSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
+  memcpy(dst + n * 3, tmp, 2 * 3);
+}
+
+void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst) {
+  int n;
+  uint8_t tmp0[2 * 3 + 5 + 15];
+  uint8_t* const tmp = (uint8_t*)((uintptr_t)(tmp0 + 15) & ~15);  // align
+  for (n = 0; n < 30; ++n) {
+    VP8YuvToBgrSSE2(y[n], u[n], v[n], dst + n * 3);
+  }
+  VP8YuvToBgrSSE2(y[n + 0], u[n + 0], v[n + 0], tmp + 0);
+  VP8YuvToBgrSSE2(y[n + 1], u[n + 1], v[n + 1], tmp + 3);
+  memcpy(dst + n * 3, tmp, 2 * 3);
+}
+
+#else
+
+void VP8YUVInitSSE2(void) {}
+
+#endif  // FANCY_UPSAMPLING
+
+#endif  // WEBP_USE_SSE2
+
diff --git a/third_party/libwebp/dsp/yuv.h b/third_party/libwebp/dsp/yuv.h
index 3844d8c..dd778f9 100644
--- a/third_party/libwebp/dsp/yuv.h
+++ b/third_party/libwebp/dsp/yuv.h
@@ -14,7 +14,7 @@
 // Y = 0.2569 * R + 0.5044 * G + 0.0979 * B + 16
 // U = -0.1483 * R - 0.2911 * G + 0.4394 * B + 128
 // V = 0.4394 * R - 0.3679 * G - 0.0715 * B + 128
-// We use 16bit fixed point operations for RGB->YUV conversion.
+// We use 16bit fixed point operations for RGB->YUV conversion (YUV_FIX).
 //
 // For the Y'CbCr to RGB conversion, the BT.601 specification reads:
 //   R = 1.164 * (Y-16) + 1.596 * (V-128)
@@ -23,21 +23,24 @@
 // where Y is in the [16,235] range, and U/V in the [16,240] range.
 // In the table-lookup version (WEBP_YUV_USE_TABLE), the common factor
 // "1.164 * (Y-16)" can be handled as an offset in the VP8kClip[] table.
-// So in this case the formulae should be read as:
+// So in this case the formulae should read:
 //   R = 1.164 * [Y + 1.371 * (V-128)                  ] - 18.624
 //   G = 1.164 * [Y - 0.698 * (V-128) - 0.336 * (U-128)] - 18.624
 //   B = 1.164 * [Y                   + 1.733 * (U-128)] - 18.624
-// once factorized. Here too, 16bit fixed precision is used.
+// once factorized.
+// YUV->RGB conversion only uses 14-bit fixed-point precision (YUV_FIX2).
+// That's the maximum possible for a convenient ARM implementation.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
 #ifndef WEBP_DSP_YUV_H_
 #define WEBP_DSP_YUV_H_
 
+#include "./dsp.h"
 #include "../dec/decode_vp8.h"
 
 // Define the following to use the LUT-based code:
-#define WEBP_YUV_USE_TABLE
+// #define WEBP_YUV_USE_TABLE
 
 #if defined(WEBP_EXPERIMENTAL_FEATURES)
 // Do NOT activate this feature for real compression. This is only experimental!
@@ -52,53 +55,75 @@
 //------------------------------------------------------------------------------
 // YUV -> RGB conversion
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
-enum { YUV_FIX = 16,                // fixed-point precision
-       YUV_HALF = 1 << (YUV_FIX - 1),
-       YUV_MASK = (256 << YUV_FIX) - 1,
-       YUV_RANGE_MIN = -227,        // min value of r/g/b output
-       YUV_RANGE_MAX = 256 + 226    // max value of r/g/b output
+enum {
+  YUV_FIX = 16,                    // fixed-point precision for RGB->YUV
+  YUV_HALF = 1 << (YUV_FIX - 1),
+  YUV_MASK = (256 << YUV_FIX) - 1,
+  YUV_RANGE_MIN = -227,            // min value of r/g/b output
+  YUV_RANGE_MAX = 256 + 226,       // max value of r/g/b output
+
+  YUV_FIX2 = 14,                   // fixed-point precision for YUV->RGB
+  YUV_HALF2 = 1 << (YUV_FIX2 - 1),
+  YUV_MASK2 = (256 << YUV_FIX2) - 1
 };
 
-#ifdef WEBP_YUV_USE_TABLE
+// These are the 14-bit fixed-point versions of the ITU-R BT.601 constants.
+#define kYScale 19077    // 1.164 = 255 / 219
+#define kVToR   26149    // 1.596 = 255 / 112 * 0.701
+#define kUToG   6419     // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
+#define kVToG   13320    // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
+#define kUToB   33050    // 2.018 = 255 / 112 * 0.886
+#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF2)
+#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF2)
+#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF2)
 
-extern int16_t VP8kVToR[256], VP8kUToB[256];
-extern int32_t VP8kVToG[256], VP8kUToG[256];
-extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
-extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
+//------------------------------------------------------------------------------
+
+#if !defined(WEBP_YUV_USE_TABLE)
+
+// Table-free version: slower on x86 by ~7-8%, but bit-exact with the SSE2 one.
+
+static WEBP_INLINE int VP8Clip8(int v) {
+  return ((v & ~YUV_MASK2) == 0) ? (v >> YUV_FIX2) : (v < 0) ? 0 : 255;
+}
+
+static WEBP_INLINE int VP8YUVToR(int y, int v) {
+  return VP8Clip8(kYScale * y + kVToR * v + kRCst);
+}
+
+static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
+  return VP8Clip8(kYScale * y - kUToG * u - kVToG * v + kGCst);
+}
+
+static WEBP_INLINE int VP8YUVToB(int y, int u) {
+  return VP8Clip8(kYScale * y + kUToB * u + kBCst);
+}
 
-static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
                                     uint8_t* const rgb) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN];
-  rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
-  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
+  rgb[0] = VP8YUVToR(y, v);
+  rgb[1] = VP8YUVToG(y, u, v);
+  rgb[2] = VP8YUVToB(y, u);
 }
 
-static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
                                     uint8_t* const bgr) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
-  bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
-  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
+  bgr[0] = VP8YUVToB(y, u);
+  bgr[1] = VP8YUVToG(y, u, v);
+  bgr[2] = VP8YUVToR(y, v);
 }
 
-static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
                                        uint8_t* const rgb) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  const uint8_t rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
-                      (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
-  const uint8_t gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
-                      (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
+  const int r = VP8YUVToR(y, v);      // 5 usable bits
+  const int g = VP8YUVToG(y, u, v);   // 6 usable bits
+  const int b = VP8YUVToB(y, u);      // 5 usable bits
+  const int rg = (r & 0xf8) | (g >> 5);
+  const int gb = ((g << 3) & 0xe0) | (b >> 3);
 #ifdef WEBP_SWAP_16BIT_CSP
   rgb[0] = gb;
   rgb[1] = rg;
@@ -108,14 +133,13 @@ static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
 #endif
 }
 
-static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
                                          uint8_t* const argb) {
-  const int r_off = VP8kVToR[v];
-  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
-  const int b_off = VP8kUToB[u];
-  const uint8_t rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
-                      VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
-  const uint8_t ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f;
+  const int r = VP8YUVToR(y, v);        // 4 usable bits
+  const int g = VP8YUVToG(y, u, v);     // 4 usable bits
+  const int b = VP8YUVToB(y, u);        // 4 usable bits
+  const int rg = (r & 0xf0) | (g >> 4);
+  const int ba = (b & 0xf0) | 0x0f;     // overwrite the lower 4 bits
 #ifdef WEBP_SWAP_16BIT_CSP
   argb[0] = ba;
   argb[1] = rg;
@@ -125,61 +149,45 @@ static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
 #endif
 }
 
-#else   // Table-free version (slower on x86)
-
-// These constants are 16b fixed-point version of ITU-R BT.601 constants
-#define kYScale 76309      // 1.164 = 255 / 219
-#define kVToR   104597     // 1.596 = 255 / 112 * 0.701
-#define kUToG   25674      // 0.391 = 255 / 112 * 0.886 * 0.114 / 0.587
-#define kVToG   53278      // 0.813 = 255 / 112 * 0.701 * 0.299 / 0.587
-#define kUToB   132201     // 2.018 = 255 / 112 * 0.886
-#define kRCst (-kYScale * 16 - kVToR * 128 + YUV_HALF)
-#define kGCst (-kYScale * 16 + kUToG * 128 + kVToG * 128 + YUV_HALF)
-#define kBCst (-kYScale * 16 - kUToB * 128 + YUV_HALF)
-
-static WEBP_INLINE uint8_t VP8Clip8(int v) {
-  return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> YUV_FIX)
-                                : (v < 0) ? 0u : 255u;
-}
-
-static WEBP_INLINE uint8_t VP8ClipN(int v, int N) {  // clip to N bits
-  return ((v & ~YUV_MASK) == 0) ? (uint8_t)(v >> (YUV_FIX + (8 - N)))
-                                : (v < 0) ? 0u : (255u >> (8 - N));
-}
-
-static WEBP_INLINE int VP8YUVToR(int y, int v) {
-  return kYScale * y + kVToR * v + kRCst;
-}
+#else
 
-static WEBP_INLINE int VP8YUVToG(int y, int u, int v) {
-  return kYScale * y - kUToG * u - kVToG * v + kGCst;
-}
+// Table-based version, not totally equivalent to the SSE2 version.
+// Rounding diff is only +/-1 though.
 
-static WEBP_INLINE int VP8YUVToB(int y, int u) {
-  return kYScale * y  + kUToB * u + kBCst;
-}
+extern int16_t VP8kVToR[256], VP8kUToB[256];
+extern int32_t VP8kVToG[256], VP8kUToG[256];
+extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN];
+extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN];
 
-static WEBP_INLINE void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToRgb(int y, int u, int v,
                                     uint8_t* const rgb) {
-  rgb[0] = VP8Clip8(VP8YUVToR(y, v));
-  rgb[1] = VP8Clip8(VP8YUVToG(y, u, v));
-  rgb[2] = VP8Clip8(VP8YUVToB(y, u));
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN];
+  rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
+  rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN];
 }
 
-static WEBP_INLINE void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToBgr(int y, int u, int v,
                                     uint8_t* const bgr) {
-  bgr[0] = VP8Clip8(VP8YUVToB(y, u));
-  bgr[1] = VP8Clip8(VP8YUVToG(y, u, v));
-  bgr[2] = VP8Clip8(VP8YUVToR(y, v));
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN];
+  bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN];
+  bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN];
 }
 
-static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToRgb565(int y, int u, int v,
                                        uint8_t* const rgb) {
-  const int r = VP8Clip8(VP8YUVToR(y, u));
-  const int g = VP8ClipN(VP8YUVToG(y, u, v), 6);
-  const int b = VP8ClipN(VP8YUVToB(y, v), 5);
-  const uint8_t rg = (r & 0xf8) | (g >> 3);
-  const uint8_t gb = (g << 5) | b;
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  const int rg = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) |
+                  (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5));
+  const int gb = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) |
+                   (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3));
 #ifdef WEBP_SWAP_16BIT_CSP
   rgb[0] = gb;
   rgb[1] = rg;
@@ -189,13 +197,14 @@ static WEBP_INLINE void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v,
 #endif
 }
 
-static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
+static WEBP_INLINE void VP8YuvToRgba4444(int y, int u, int v,
                                          uint8_t* const argb) {
-  const int r = VP8Clip8(VP8YUVToR(y, u));
-  const int g = VP8ClipN(VP8YUVToG(y, u, v), 4);
-  const int b = VP8Clip8(VP8YUVToB(y, v));
-  const uint8_t rg = (r & 0xf0) | g;
-  const uint8_t ba = b | 0x0f;   // overwrite the lower 4 bits
+  const int r_off = VP8kVToR[v];
+  const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX;
+  const int b_off = VP8kUToB[u];
+  const int rg = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) |
+                   VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]);
+  const int ba = (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4) | 0x0f;
 #ifdef WEBP_SWAP_16BIT_CSP
   argb[0] = ba;
   argb[1] = rg;
@@ -207,6 +216,9 @@ static WEBP_INLINE void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v,
 
 #endif  // WEBP_YUV_USE_TABLE
 
+//-----------------------------------------------------------------------------
+// Alpha handling variants
+
 static WEBP_INLINE void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v,
                                      uint8_t* const argb) {
   argb[0] = 0xff;
@@ -228,56 +240,77 @@ static WEBP_INLINE void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v,
 // Must be called before everything, to initialize the tables.
 void VP8YUVInit(void);
 
+//-----------------------------------------------------------------------------
+// SSE2 extra functions (mostly for upsampling_sse2.c)
+
+#if defined(WEBP_USE_SSE2)
+
+#if defined(FANCY_UPSAMPLING)
+// Process 32 pixels and store the result (24b or 32b per pixel) in *dst.
+void VP8YuvToRgba32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst);
+void VP8YuvToRgb32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst);
+void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                    uint8_t* dst);
+void VP8YuvToBgr32(const uint8_t* y, const uint8_t* u, const uint8_t* v,
+                   uint8_t* dst);
+#endif  // FANCY_UPSAMPLING
+
+// Must be called to initialize tables before using the functions.
+void VP8YUVInitSSE2(void);
+
+#endif    // WEBP_USE_SSE2
+
 //------------------------------------------------------------------------------
 // RGB -> YUV conversion
 
-static WEBP_INLINE int VP8ClipUV(int v) {
-  v = (v + (257 << (YUV_FIX + 2 - 1))) >> (YUV_FIX + 2);
-  return ((v & ~0xff) == 0) ? v : (v < 0) ? 0 : 255;
+// Helpers that take an explicit 'rounding' term, added before the final shift:
+static WEBP_INLINE int VP8ClipUV(int uv, int rounding) {
+  uv = (uv + rounding + (128 << (YUV_FIX + 2))) >> (YUV_FIX + 2);
+  return ((uv & ~0xff) == 0) ? uv : (uv < 0) ? 0 : 255;  // clip to [0, 255]
 }
 
 #ifndef USE_YUVj
 
-static WEBP_INLINE int VP8RGBToY(int r, int g, int b) {
-  const int kRound = (1 << (YUV_FIX - 1)) + (16 << YUV_FIX);
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
   const int luma = 16839 * r + 33059 * g + 6420 * b;
-  return (luma + kRound) >> YUV_FIX;  // no need to clip
+  return (luma + rounding + (16 << YUV_FIX)) >> YUV_FIX;  // no need to clip
 }
 
-static WEBP_INLINE int VP8RGBToU(int r, int g, int b) {
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
   const int u = -9719 * r - 19081 * g + 28800 * b;
-  return VP8ClipUV(u);
+  return VP8ClipUV(u, rounding);
 }
 
-static WEBP_INLINE int VP8RGBToV(int r, int g, int b) {
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
   const int v = +28800 * r - 24116 * g - 4684 * b;
-  return VP8ClipUV(v);
+  return VP8ClipUV(v, rounding);
 }
 
 #else
 
 // This JPEG-YUV colorspace, only for comparison!
-// These are also 16-bit precision coefficients from Rec.601, but with full
+// These are also 16bit precision coefficients from Rec.601, but with full
 // [0..255] output range.
-static WEBP_INLINE int VP8RGBToY(int r, int g, int b) {
-  const int kRound = (1 << (YUV_FIX - 1));
+static WEBP_INLINE int VP8RGBToY(int r, int g, int b, int rounding) {
   const int luma = 19595 * r + 38470 * g + 7471 * b;
-  return (luma + kRound) >> YUV_FIX;  // no need to clip
+  return (luma + rounding) >> YUV_FIX;  // no need to clip
 }
 
-static WEBP_INLINE int VP8RGBToU(int r, int g, int b) {
+static WEBP_INLINE int VP8RGBToU(int r, int g, int b, int rounding) {
   const int u = -11058 * r - 21710 * g + 32768 * b;
-  return VP8ClipUV(u);
+  return VP8ClipUV(u, rounding);
 }
 
-static WEBP_INLINE int VP8RGBToV(int r, int g, int b) {
+static WEBP_INLINE int VP8RGBToV(int r, int g, int b, int rounding) {
   const int v = 32768 * r - 27439 * g - 5329 * b;
-  return VP8ClipUV(v);
+  return VP8ClipUV(v, rounding);
 }
 
 #endif    // USE_YUVj
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/enc/alpha.c b/third_party/libwebp/enc/alpha.c
index e636c96..21d4b5cb 100644
--- a/third_party/libwebp/enc/alpha.c
+++ b/third_party/libwebp/enc/alpha.c
@@ -19,10 +19,6 @@
 #include "../utils/quant_levels.h"
 #include "../webp/format_constants.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 // -----------------------------------------------------------------------------
 // Encodes the given alpha data via specified compression method 'method'.
 // The pre-processing (quantization) is performed if 'quality' is less than 100.
@@ -71,7 +67,7 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
     const uint8_t* src = data;
     for (j = 0; j < picture.height; ++j) {
       for (i = 0; i < picture.width; ++i) {
-        dst[i] = (src[i] << 8) | 0xff000000u;
+        dst[i] = src[i] << 8;  // we leave A/R/B channels zero'd.
       }
       src += width;
       dst += picture.argb_stride;
@@ -81,8 +77,10 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
   WebPConfigInit(&config);
   config.lossless = 1;
   config.method = effort_level;  // impact is very small
-  // Set a moderate default quality setting for alpha.
-  config.quality = 10.f * effort_level;
+  // Set a low default quality for encoding alpha. Ensure that Alpha quality at
+  // lower methods (3 and below) is less than the threshold for triggering
+  // costly 'BackwardReferencesTraceBackwards'.
+  config.quality = 8.f * effort_level;
   assert(config.quality >= 0 && config.quality <= 100.f);
 
   ok = VP8LBitWriterInit(&tmp_bw, (width * height) >> 3);
@@ -99,12 +97,19 @@ static int EncodeLossless(const uint8_t* const data, int width, int height,
 
 // -----------------------------------------------------------------------------
 
+// Small struct to hold the result of a filter mode compression attempt.
+typedef struct {
+  size_t score;        // VP8BitWriterSize() of 'bw'; a smaller score wins
+  VP8BitWriter bw;     // receives the header and the encoded alpha data
+  WebPAuxStats stats;  // passed to EncodeLossless() (compressed method only)
+} FilterTrial;
+
+// This function always returns an initialized 'bw' object, even upon error.
 static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
                                int method, int filter, int reduce_levels,
                                int effort_level,  // in [0..6] range
                                uint8_t* const tmp_alpha,
-                               VP8BitWriter* const bw,
-                               WebPAuxStats* const stats) {
+                               FilterTrial* result) {
   int ok = 0;
   const uint8_t* alpha_src;
   WebPFilterFunc filter_func;
@@ -125,8 +130,8 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
   header = method | (filter << 2);
   if (reduce_levels) header |= ALPHA_PREPROCESSED_LEVELS << 4;
 
-  VP8BitWriterInit(bw, expected_size);
-  VP8BitWriterAppend(bw, &header, ALPHA_HEADER_LEN);
+  VP8BitWriterInit(&result->bw, expected_size);
+  VP8BitWriterAppend(&result->bw, &header, ALPHA_HEADER_LEN);
 
   filter_func = WebPFilters[filter];
   if (filter_func != NULL) {
@@ -137,12 +142,14 @@ static int EncodeAlphaInternal(const uint8_t* const data, int width, int height,
   }
 
   if (method == ALPHA_NO_COMPRESSION) {
-    ok = VP8BitWriterAppend(bw, alpha_src, width * height);
-    ok = ok && !bw->error_;
+    ok = VP8BitWriterAppend(&result->bw, alpha_src, width * height);
+    ok = ok && !result->bw.error_;
   } else {
-    ok = EncodeLossless(alpha_src, width, height, effort_level, bw, stats);
-    VP8BitWriterFinish(bw);
+    ok = EncodeLossless(alpha_src, width, height, effort_level,
+                        &result->bw, &result->stats);
+    VP8BitWriterFinish(&result->bw);
   }
+  result->score = VP8BitWriterSize(&result->bw);
   return ok;
 }
 
@@ -177,6 +184,85 @@ static int GetNumColors(const uint8_t* data, int width, int height,
   return colors;
 }
 
+#define FILTER_TRY_NONE (1 << WEBP_FILTER_NONE)
+#define FILTER_TRY_ALL ((1 << WEBP_FILTER_LAST) - 1)
+
+// Maps 'filter' to an OR'd bit-set of filters to try (bit f <=> filter f).
+static uint32_t GetFilterMap(const uint8_t* alpha, int width, int height,
+                             int filter, int effort_level) {
+  uint32_t bit_map = 0U;
+  if (filter == WEBP_FILTER_FAST) {
+    // Quick estimate of the best candidate.
+    int try_filter_none = (effort_level > 3);
+    const int kMinColorsForFilterNone = 16;
+    const int kMaxColorsForFilterNone = 192;
+    const int num_colors = GetNumColors(alpha, width, height, width);
+    // For a low number of colors, NONE yields better compression.
+    filter = (num_colors <= kMinColorsForFilterNone) ? WEBP_FILTER_NONE :
+             EstimateBestFilter(alpha, width, height, width);
+    bit_map |= 1 << filter;
+    // For high effort levels (> 3) or a large number of colors, also try
+    // FILTER_NONE in addition to the estimated best filter.
+    if (try_filter_none || num_colors > kMaxColorsForFilterNone) {
+      bit_map |= FILTER_TRY_NONE;
+    }
+  } else if (filter == WEBP_FILTER_NONE) {
+    bit_map = FILTER_TRY_NONE;
+  } else {  // WEBP_FILTER_BEST (or any other value) -> try all
+    bit_map = FILTER_TRY_ALL;
+  }
+  return bit_map;
+}
+
+static void InitFilterTrial(FilterTrial* const score) {  // 'stats' is not set
+  score->score = (size_t)~0U;  // sentinel: a real trial is expected to beat it
+  VP8BitWriterInit(&score->bw, 0);  // NOTE(review): presumably empty; confirm
+}
+
+static int ApplyFiltersAndEncode(const uint8_t* alpha, int width, int height,
+                                 size_t data_size, int method, int filter,
+                                 int reduce_levels, int effort_level,
+                                 uint8_t** const output,
+                                 size_t* const output_size,
+                                 WebPAuxStats* const stats) {  // may be NULL
+  int ok = 1;
+  FilterTrial best;  // smallest successful trial so far
+  uint32_t try_map =
+      GetFilterMap(alpha, width, height, filter, effort_level);
+  InitFilterTrial(&best);
+  if (try_map != FILTER_TRY_NONE) {
+    uint8_t* filtered_alpha =  (uint8_t*)malloc(data_size);
+    if (filtered_alpha == NULL) return 0;  // output/output_size left untouched
+    // Encode with each selected filter in turn, keeping the smallest result.
+    for (filter = WEBP_FILTER_NONE; ok && try_map; ++filter, try_map >>= 1) {
+      if (try_map & 1) {  // the lowest bit of try_map corresponds to 'filter'
+        FilterTrial trial;
+        ok = EncodeAlphaInternal(alpha, width, height, method, filter,
+                                 reduce_levels, effort_level, filtered_alpha,
+                                 &trial);
+        if (ok && trial.score < best.score) {
+          VP8BitWriterWipeOut(&best.bw);  // drop the previous best
+          best = trial;  // struct copy: 'best' now refers to trial's writer
+        } else {
+          VP8BitWriterWipeOut(&trial.bw);  // failed or not smaller: discard
+        }
+      }
+    }
+    free(filtered_alpha);
+  } else {
+    ok = EncodeAlphaInternal(alpha, width, height, method, WEBP_FILTER_NONE,
+                             reduce_levels, effort_level, NULL, &best);
+  }
+  if (ok) {
+    if (stats != NULL) *stats = best.stats;
+    *output_size = VP8BitWriterSize(&best.bw);
+    *output = VP8BitWriterBuf(&best.bw);  // 'best.bw' is not wiped on success
+  } else {
+    VP8BitWriterWipeOut(&best.bw);
+  }
+  return ok;
+}
+
 static int EncodeAlpha(VP8Encoder* const enc,
                        int quality, int method, int filter,
                        int effort_level,
@@ -207,6 +293,11 @@ static int EncodeAlpha(VP8Encoder* const enc,
     return 0;
   }
 
+  if (method == ALPHA_NO_COMPRESSION) {
+    // Don't filter, as filtering will make no impact on compressed size.
+    filter = WEBP_FILTER_NONE;
+  }
+
   quant_alpha = (uint8_t*)malloc(data_size);
   if (quant_alpha == NULL) {
     return 0;
@@ -225,105 +316,19 @@ static int EncodeAlpha(VP8Encoder* const enc,
   }
 
   if (ok) {
-    VP8BitWriter bw;
-    int test_filter;
-    uint8_t* filtered_alpha = NULL;
-    int try_filter_none = (effort_level > 3);
-
-    if (filter == WEBP_FILTER_FAST) {  // Quick estimate of the best candidate.
-      const int kMinColorsForFilterNone = 16;
-      const int kMaxColorsForFilterNone = 192;
-      const int num_colors = GetNumColors(quant_alpha, width, height, width);
-      // For low number of colors, NONE yeilds better compression.
-      filter = (num_colors <= kMinColorsForFilterNone) ? WEBP_FILTER_NONE :
-               EstimateBestFilter(quant_alpha, width, height, width);
-      // For large number of colors, try FILTER_NONE in addition to the best
-      // filter as well.
-      if (num_colors > kMaxColorsForFilterNone) {
-        try_filter_none = 1;
-      }
-    }
-
-    // Test for WEBP_FILTER_NONE for higher effort levels.
-    if (try_filter_none || filter == WEBP_FILTER_NONE) {
-      ok = EncodeAlphaInternal(quant_alpha, width, height,
-                               method, WEBP_FILTER_NONE, reduce_levels,
-                               effort_level, NULL, &bw, pic->stats);
-
-      if (!ok) {
-        VP8BitWriterWipeOut(&bw);
-        goto End;
-      }
-    }
-    // Stop?
-    if (filter == WEBP_FILTER_NONE) {
-      goto Ok;
-    }
-
-    filtered_alpha = (uint8_t*)malloc(data_size);
-    ok = (filtered_alpha != NULL);
-    if (!ok) {
-      goto End;
+    ok = ApplyFiltersAndEncode(quant_alpha, width, height, data_size, method,
+                               filter, reduce_levels, effort_level, output,
+                               output_size, pic->stats);
+    if (pic->stats != NULL) {  // need stats?
+      pic->stats->coded_size += (int)(*output_size);
+      enc->sse_[3] = sse;
     }
-
-    // Try the other mode(s).
-    {
-      WebPAuxStats best_stats;
-      size_t best_score = try_filter_none ?
-                          VP8BitWriterSize(&bw) : (size_t)~0U;
-      int wipe_tmp_bw = try_filter_none;
-
-      memset(&best_stats, 0, sizeof(best_stats));  // prevent spurious warning
-      if (pic->stats != NULL) best_stats = *pic->stats;
-      for (test_filter =
-           try_filter_none ? WEBP_FILTER_HORIZONTAL : WEBP_FILTER_NONE;
-           ok && (test_filter <= WEBP_FILTER_GRADIENT);
-           ++test_filter) {
-        VP8BitWriter tmp_bw;
-        if (filter != WEBP_FILTER_BEST && test_filter != filter) {
-          continue;
-        }
-        ok = EncodeAlphaInternal(quant_alpha, width, height,
-                                 method, test_filter, reduce_levels,
-                                 effort_level, filtered_alpha, &tmp_bw,
-                                 pic->stats);
-        if (ok) {
-          const size_t score = VP8BitWriterSize(&tmp_bw);
-          if (score < best_score) {
-            // swap bitwriter objects.
-            VP8BitWriter tmp = tmp_bw;
-            tmp_bw = bw;
-            bw = tmp;
-            best_score = score;
-            if (pic->stats != NULL) best_stats = *pic->stats;
-          }
-        } else {
-          VP8BitWriterWipeOut(&bw);
-        }
-        if (wipe_tmp_bw) {
-          VP8BitWriterWipeOut(&tmp_bw);
-        }
-        wipe_tmp_bw = 1;  // For next filter trial for WEBP_FILTER_BEST.
-      }
-      if (pic->stats != NULL) *pic->stats = best_stats;
-    }
- Ok:
-    if (ok) {
-      *output_size = VP8BitWriterSize(&bw);
-      *output = VP8BitWriterBuf(&bw);
-      if (pic->stats != NULL) {         // need stats?
-        pic->stats->coded_size += (int)(*output_size);
-        enc->sse_[3] = sse;
-      }
-    }
-    free(filtered_alpha);
   }
- End:
+
   free(quant_alpha);
   return ok;
 }
 
-
 //------------------------------------------------------------------------------
 // Main calls
 
@@ -403,6 +408,3 @@ int VP8EncDeleteAlpha(VP8Encoder* const enc) {
   return ok;
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/enc/analysis.c b/third_party/libwebp/enc/analysis.c
index 4ff3edd..7d4cfdc 100644
--- a/third_party/libwebp/enc/analysis.c
+++ b/third_party/libwebp/enc/analysis.c
@@ -19,10 +19,6 @@
 #include "./cost.h"
 #include "../utils/utils.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #define MAX_ITERS_K_MEANS  6
 
 //------------------------------------------------------------------------------
@@ -55,6 +51,7 @@ static void SmoothSegmentMap(VP8Encoder* const enc) {
       for (n = 0; n < NUM_MB_SEGMENTS; ++n) {
         if (cnt[n] >= majority_cnt_3_x_3_grid) {
           majority_seg = n;
+          break;
         }
       }
       tmp[x + y * w] = majority_seg;
@@ -153,6 +150,8 @@ static void AssignSegments(VP8Encoder* const enc,
   // 'int' type is ok for histo, and won't overflow
   int accum[NUM_MB_SEGMENTS], dist_accum[NUM_MB_SEGMENTS];
 
+  assert(nb >= 1);
+
   // bracket the input
   for (n = 0; n <= MAX_ALPHA && alphas[n] == 0; ++n) {}
   min_a = n;
@@ -161,8 +160,9 @@ static void AssignSegments(VP8Encoder* const enc,
   range_a = max_a - min_a;
 
   // Spread initial centers evenly
-  for (n = 1, k = 0; n < 2 * nb; n += 2) {
-    centers[k++] = min_a + (n * range_a) / (2 * nb);
+  for (k = 0, n = 1; k < nb; ++k, n += 2) {
+    assert(n < 2 * nb);
+    centers[k] = min_a + (n * range_a) / (2 * nb);
   }
 
   for (k = 0; k < MAX_ITERS_K_MEANS; ++k) {     // few iters are enough
@@ -177,7 +177,7 @@ static void AssignSegments(VP8Encoder* const enc,
     n = 0;    // track the nearest center for current 'a'
     for (a = min_a; a <= max_a; ++a) {
       if (alphas[a]) {
-        while (n < nb - 1 && abs(a - centers[n + 1]) < abs(a - centers[n])) {
+        while (n + 1 < nb && abs(a - centers[n + 1]) < abs(a - centers[n])) {
           n++;
         }
         map[a] = n;
@@ -384,38 +384,114 @@ static void ResetAllMBInfo(VP8Encoder* const enc) {
   // Default susceptibilities.
   enc->dqm_[0].alpha_ = 0;
   enc->dqm_[0].beta_ = 0;
-  // Note: we can't compute this alpha_ / uv_alpha_.
+  // Note: we can't compute this alpha_ / uv_alpha_ -> set to default value.
+  enc->alpha_ = 0;
+  enc->uv_alpha_ = 0;
   WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
 }
 
+// Per-job state and results of the analysis of a range of rows.
+typedef struct {
+  WebPWorker worker;          // its hook is set to DoSegmentsJob()
+  int alphas[MAX_ALPHA + 1];  // passed to MBAnalyze(); summed by MergeJobs()
+  int alpha, uv_alpha;        // running sums, divided by total_mb at the end
+  VP8EncIterator it;          // iterator over the rows assigned to this job
+  int delta_progress;         // value passed to VP8IteratorProgress()
+} SegmentJob;
+
+// Worker hook: runs MBAnalyze() over 'it'; stops early and returns 0 as soon
+static int DoSegmentsJob(SegmentJob* const job, VP8EncIterator* const it) {
+  int ok = 1;  // ... as VP8IteratorProgress() returns 0 (see header comment)
+  if (!VP8IteratorIsDone(it)) {
+    uint8_t tmp[32 + ALIGN_CST];
+    uint8_t* const scratch = (uint8_t*)DO_ALIGN(tmp);  // aligned view of tmp[]
+    do {
+      // Let's pretend we have perfect lossless reconstruction.
+      VP8IteratorImport(it, scratch);
+      MBAnalyze(it, job->alphas, &job->alpha, &job->uv_alpha);
+      ok = VP8IteratorProgress(it, job->delta_progress);
+    } while (ok && VP8IteratorNext(it));
+  }
+  return ok;
+}
+
+static void MergeJobs(const SegmentJob* const src, SegmentJob* const dst) {
+  int i;  // adds the alphas[] entries and the two sums of 'src' into 'dst'
+  for (i = 0; i <= MAX_ALPHA; ++i) dst->alphas[i] += src->alphas[i];
+  dst->alpha += src->alpha;
+  dst->uv_alpha += src->uv_alpha;
+}
+
+// Sets up 'job' to process rows [start_row, end_row) and zeroes its results.
+static void InitSegmentJob(VP8Encoder* const enc, SegmentJob* const job,
+                           int start_row, int end_row) {
+  WebPWorkerInit(&job->worker);
+  job->worker.data1 = job;       // presumably the hook's 1st argument; confirm
+  job->worker.data2 = &job->it;  // presumably the hook's 2nd argument; confirm
+  job->worker.hook = (WebPWorkerHook)DoSegmentsJob;
+  VP8IteratorInit(enc, &job->it);
+  VP8IteratorSetRow(&job->it, start_row);
+  VP8IteratorSetCountDown(&job->it, (end_row - start_row) * enc->mb_w_);
+  memset(job->alphas, 0, sizeof(job->alphas));
+  job->alpha = 0;
+  job->uv_alpha = 0;
+  // Only one of the two jobs (the one starting at row 0) records progress,
+  // since we don't expect the user's hook to be multi-thread safe.
+  job->delta_progress = (start_row == 0) ? 20 : 0;
+}
+
+// Main entry point of the analysis; returns 0 if a worker call returned 0.
 int VP8EncAnalyze(VP8Encoder* const enc) {
   int ok = 1;
   const int do_segments =
       enc->config_->emulate_jpeg_size ||   // We need the complexity evaluation.
       (enc->segment_hdr_.num_segments_ > 1) ||
       (enc->method_ == 0);  // for method 0, we need preds_[] to be filled.
-  enc->alpha_ = 0;
-  enc->uv_alpha_ = 0;
   if (do_segments) {
-    int alphas[MAX_ALPHA + 1] = { 0 };
-    VP8EncIterator it;
-
-    VP8IteratorInit(enc, &it);
-    do {
-      VP8IteratorImport(&it);
-      MBAnalyze(&it, alphas, &enc->alpha_, &enc->uv_alpha_);
-      ok = VP8IteratorProgress(&it, 20);
-      // Let's pretend we have perfect lossless reconstruction.
-    } while (ok && VP8IteratorNext(&it, it.yuv_in_));
-    enc->alpha_ /= enc->mb_w_ * enc->mb_h_;
-    enc->uv_alpha_ /= enc->mb_w_ * enc->mb_h_;
-    if (ok) AssignSegments(enc, alphas);
+    const int last_row = enc->mb_h_;
+    // We give a little more than half of the work to the main thread.
+    const int split_row = (9 * last_row + 15) >> 4;
+    const int total_mb = last_row * enc->mb_w_;
+#ifdef WEBP_USE_THREAD
+    const int kMinSplitRow = 2;  // minimal rows needed for mt to be worth it
+    const int do_mt = (enc->thread_level_ > 0) && (split_row >= kMinSplitRow);
+#else
+    const int do_mt = 0;
+#endif
+    SegmentJob main_job;
+    if (do_mt) {
+      SegmentJob side_job;
+      // Note the use of '&=' below rather than '&&': the functions must be
+      // called no matter what.
+      InitSegmentJob(enc, &main_job, 0, split_row);
+      InitSegmentJob(enc, &side_job, split_row, last_row);
+      // we don't need to call Reset() on main_job.worker, since we're calling
+      // WebPWorkerExecute() on it
+      ok &= WebPWorkerReset(&side_job.worker);
+      // launch the two jobs in parallel
+      if (ok) {
+        WebPWorkerLaunch(&side_job.worker);
+        WebPWorkerExecute(&main_job.worker);
+        ok &= WebPWorkerSync(&side_job.worker);
+        ok &= WebPWorkerSync(&main_job.worker);
+      }
+      WebPWorkerEnd(&side_job.worker);
+      if (ok) MergeJobs(&side_job, &main_job);  // merge results together
+    } else {
+      // Even for single-thread case, we use the generic Worker tools.
+      InitSegmentJob(enc, &main_job, 0, last_row);
+      WebPWorkerExecute(&main_job.worker);
+      ok &= WebPWorkerSync(&main_job.worker);
+    }
+    WebPWorkerEnd(&main_job.worker);
+    if (ok) {
+      enc->alpha_ = main_job.alpha / total_mb;  // sum -> integer average
+      enc->uv_alpha_ = main_job.uv_alpha / total_mb;
+      AssignSegments(enc, main_job.alphas);
+    }
   } else {   // Use only one default segment.
     ResetAllMBInfo(enc);
   }
   return ok;
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/enc/backward_references.c b/third_party/libwebp/enc/backward_references.c
index db4f430..77b4be7 100644
--- a/third_party/libwebp/enc/backward_references.c
+++ b/third_party/libwebp/enc/backward_references.c
@@ -156,14 +156,14 @@ static void GetParamsForHashChainFindCopy(int quality, int xsize,
   *window_size = (max_window_size > WINDOW_SIZE) ? WINDOW_SIZE
                : max_window_size;
   *iter_pos = 8 + (quality >> 3);
-  // For lower entropy images, the rigourous search loop in HashChainFindCopy
+  // For lower entropy images, the rigorous search loop in HashChainFindCopy
   // can be relaxed.
   *iter_limit = (cache_bits > 0) ? iter_neg : iter_neg / 2;
 }
 
 static int HashChainFindCopy(const HashChain* const p,
                              int base_position, int xsize_signed,
-                             const uint32_t* const argb, int maxlen,
+                             const uint32_t* const argb, int max_len,
                              int window_size, int iter_pos, int iter_limit,
                              int* const distance_ptr,
                              int* const length_ptr) {
@@ -176,25 +176,34 @@ static int HashChainFindCopy(const HashChain* const p,
       (base_position > window_size) ? base_position - window_size : 0;
   int pos;
   assert(xsize > 0);
+  if (max_len > MAX_LENGTH) {
+    max_len = MAX_LENGTH;
+  }
   for (pos = p->hash_to_first_index_[GetPixPairHash64(argb_start)];
        pos >= min_pos;
        pos = p->chain_[pos]) {
     uint64_t val;
     uint32_t curr_length;
     uint32_t distance;
+    const uint64_t* const ptr1 =
+        (const uint64_t*)(argb + pos + best_length - 1);
+    const uint64_t* const ptr2 =
+        (const uint64_t*)(argb_start + best_length - 1);
+
     if (iter_pos < 0) {
       if (iter_pos < iter_limit || best_val >= 0xff0000) {
         break;
       }
     }
     --iter_pos;
-    if (argb[pos + best_length - 1] != argb_start[best_length - 1]) {
-      continue;
-    }
-    curr_length = FindMatchLength(argb + pos, argb_start, maxlen);
-    if (curr_length < best_length) {
-      continue;
-    }
+
+    // Before 'expensive' linear match, check if the two arrays match at the
+    // current best length index and also for the succeeding elements.
+    if (*ptr1 != *ptr2) continue;
+
+    curr_length = FindMatchLength(argb + pos, argb_start, max_len);
+    if (curr_length < best_length) continue;
+
     distance = (uint32_t)(base_position - pos);
     val = curr_length << 16;
     // Favoring 2d locality here gives savings for certain images.
@@ -213,7 +222,7 @@ static int HashChainFindCopy(const HashChain* const p,
       best_val = val;
       best_length = curr_length;
       best_distance = distance;
-      if (curr_length >= MAX_LENGTH) {
+      if (curr_length >= (uint32_t)max_len) {
         break;
       }
       if ((best_distance == 1 || distance == xsize) &&
@@ -291,11 +300,8 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
     int offset = 0;
     int len = 0;
     if (i < pix_count - 1) {  // FindCopy(i,..) reads pixels at [i] and [i + 1].
-      int maxlen = pix_count - i;
-      if (maxlen > MAX_LENGTH) {
-        maxlen = MAX_LENGTH;
-      }
-      HashChainFindCopy(hash_chain, i, xsize, argb, maxlen,
+      int max_len = pix_count - i;
+      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
                         window_size, iter_pos, iter_limit,
                         &offset, &len);
     }
@@ -307,11 +313,8 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
       int k;
       HashChainInsert(hash_chain, &argb[i], i);
       if (i < pix_count - 2) {  // FindCopy(i+1,..) reads [i + 1] and [i + 2].
-        int maxlen = pix_count - (i + 1);
-        if (maxlen > MAX_LENGTH) {
-          maxlen = MAX_LENGTH;
-        }
-        HashChainFindCopy(hash_chain, i + 1, xsize, argb, maxlen,
+        int max_len = pix_count - (i + 1);
+        HashChainFindCopy(hash_chain, i + 1, xsize, argb, max_len,
                           window_size, iter_pos, iter_limit,
                           &offset2, &len2);
         if (len2 > len + 1) {
@@ -321,10 +324,10 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
             const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
             refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
           } else {
+            if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
             refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
           }
           ++refs->size;
-          if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
           i++;  // Backward reference to be done for next pixel.
           len = len2;
           offset = offset2;
@@ -354,10 +357,10 @@ static int BackwardReferencesHashChain(int xsize, int ysize,
         const int ix = VP8LColorCacheGetIndex(&hashers, pixel);
         refs->refs[refs->size] = PixOrCopyCreateCacheIdx(ix);
       } else {
+        if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
         refs->refs[refs->size] = PixOrCopyCreateLiteral(pixel);
       }
       ++refs->size;
-      if (use_color_cache) VP8LColorCacheInsert(&hashers, pixel);
       if (i + 1 < pix_count) {
         HashChainInsert(hash_chain, &argb[i], i);
       }
@@ -459,16 +462,16 @@ static WEBP_INLINE double GetCacheCost(const CostModel* const m, uint32_t idx) {
 
 static WEBP_INLINE double GetLengthCost(const CostModel* const m,
                                         uint32_t length) {
-  int code, extra_bits_count, extra_bits_value;
-  PrefixEncode(length, &code, &extra_bits_count, &extra_bits_value);
-  return m->literal_[VALUES_IN_BYTE + code] + extra_bits_count;
+  int code, extra_bits;
+  VP8LPrefixEncodeBits(length, &code, &extra_bits);
+  return m->literal_[VALUES_IN_BYTE + code] + extra_bits;
 }
 
 static WEBP_INLINE double GetDistanceCost(const CostModel* const m,
                                           uint32_t distance) {
-  int code, extra_bits_count, extra_bits_value;
-  PrefixEncode(distance, &code, &extra_bits_count, &extra_bits_value);
-  return m->distance_[code] + extra_bits_count;
+  int code, extra_bits;
+  VP8LPrefixEncodeBits(distance, &code, &extra_bits);
+  return m->distance_[code] + extra_bits;
 }
 
 static int BackwardReferencesHashChainDistanceOnly(
@@ -522,11 +525,8 @@ static int BackwardReferencesHashChainDistanceOnly(
       int offset = 0;
       int len = 0;
       if (i < pix_count - 1) {  // FindCopy reads pixels at [i] and [i + 1].
-        int maxlen = shortmax ? 2 : MAX_LENGTH;
-        if (maxlen > pix_count - i) {
-          maxlen = pix_count - i;
-        }
-        HashChainFindCopy(hash_chain, i, xsize, argb, maxlen,
+        int max_len = shortmax ? 2 : pix_count - i;
+        HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
                           window_size, iter_pos, iter_limit,
                           &offset, &len);
       }
@@ -577,13 +577,13 @@ static int BackwardReferencesHashChainDistanceOnly(
         const int ix = VP8LColorCacheGetIndex(&hashers, argb[i]);
         cost_val += GetCacheCost(cost_model, ix) * mul0;
       } else {
+        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
         cost_val += GetLiteralCost(cost_model, argb[i]) * mul1;
       }
       if (cost[i] > cost_val) {
         cost[i] = (float)cost_val;
         dist_array[i] = 1;  // only one is inserted.
       }
-      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
     }
  next_symbol: ;
   }
@@ -650,12 +650,12 @@ static int BackwardReferencesHashChainFollowChosenPath(
   for (ix = 0; ix < chosen_path_size; ++ix, ++size) {
     int offset = 0;
     int len = 0;
-    int maxlen = chosen_path[ix];
-    if (maxlen != 1) {
-      HashChainFindCopy(hash_chain, i, xsize, argb, maxlen,
+    int max_len = chosen_path[ix];
+    if (max_len != 1) {
+      HashChainFindCopy(hash_chain, i, xsize, argb, max_len,
                         window_size, iter_pos, iter_limit,
                         &offset, &len);
-      assert(len == maxlen);
+      assert(len == max_len);
       refs->refs[size] = PixOrCopyCreateCopy(offset, len);
       if (use_color_cache) {
         for (k = 0; k < len; ++k) {
@@ -675,9 +675,9 @@ static int BackwardReferencesHashChainFollowChosenPath(
         const int idx = VP8LColorCacheGetIndex(&hashers, argb[i]);
         refs->refs[size] = PixOrCopyCreateCacheIdx(idx);
       } else {
+        if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
         refs->refs[size] = PixOrCopyCreateLiteral(argb[i]);
       }
-      if (use_color_cache) VP8LColorCacheInsert(&hashers, argb[i]);
       if (i + 1 < pix_count) {
         HashChainInsert(hash_chain, &argb[i], i);
       }
@@ -780,8 +780,8 @@ int VP8LGetBackwardReferences(int width, int height,
 
   // Choose appropriate backward reference.
   if (lz77_is_useful) {
-    // TraceBackwards is costly. Don't execute it at lower quality (q <= 10).
-    const int try_lz77_trace_backwards = (quality > 10);
+    // TraceBackwards is costly. Don't execute it at lower quality.
+    const int try_lz77_trace_backwards = (quality >= 25);
     *best = refs_lz77;   // default guess: lz77 is better
     VP8LClearBackwardRefs(&refs_rle);
     if (try_lz77_trace_backwards) {
diff --git a/third_party/libwebp/enc/backward_references.h b/third_party/libwebp/enc/backward_references.h
index b0d1813..e1c75f0 100644
--- a/third_party/libwebp/enc/backward_references.h
+++ b/third_party/libwebp/enc/backward_references.h
@@ -18,7 +18,7 @@
 #include "../webp/types.h"
 #include "../webp/format_constants.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -31,73 +31,6 @@ extern "C" {
     (NUM_LITERAL_CODES + NUM_LENGTH_CODES + (1 << MAX_COLOR_CACHE_BITS))
 
 // -----------------------------------------------------------------------------
-// PrefixEncode()
-
-// use GNU builtins where available.
-#if defined(__GNUC__) && \
-    ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  assert(n != 0);
-  return 31 ^ __builtin_clz(n);
-}
-#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86))
-#include <intrin.h>
-#pragma intrinsic(_BitScanReverse)
-
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  unsigned long first_set_bit;
-  assert(n != 0);
-  _BitScanReverse(&first_set_bit, n);
-  return first_set_bit;
-}
-#else
-// Returns (int)floor(log2(n)). n must be > 0.
-static WEBP_INLINE int BitsLog2Floor(uint32_t n) {
-  int log = 0;
-  uint32_t value = n;
-  int i;
-
-  assert(n != 0);
-  for (i = 4; i >= 0; --i) {
-    const int shift = (1 << i);
-    const uint32_t x = value >> shift;
-    if (x != 0) {
-      value = x;
-      log += shift;
-    }
-  }
-  return log;
-}
-#endif
-
-static WEBP_INLINE int VP8LBitsLog2Ceiling(uint32_t n) {
-  const int log_floor = BitsLog2Floor(n);
-  if (n == (n & ~(n - 1)))  // zero or a power of two.
-    return log_floor;
-  else
-    return log_floor + 1;
-}
-
-// Splitting of distance and length codes into prefixes and
-// extra bits. The prefixes are encoded with an entropy code
-// while the extra bits are stored just as normal bits.
-static WEBP_INLINE void PrefixEncode(int distance, int* const code,
-                                     int* const extra_bits_count,
-                                     int* const extra_bits_value) {
-  if (distance > 2) {  // Collect the two most significant bits.
-    const int highest_bit = BitsLog2Floor(--distance);
-    const int second_highest_bit = (distance >> (highest_bit - 1)) & 1;
-    *extra_bits_count = highest_bit - 1;
-    *extra_bits_value = distance & ((1 << *extra_bits_count) - 1);
-    *code = 2 * highest_bit + second_highest_bit;
-  } else {
-    *extra_bits_count = 0;
-    *extra_bits_value = 0;
-    *code = (distance == 2) ? 1 : 0;
-  }
-}
-
-// -----------------------------------------------------------------------------
 // PixOrCopy
 
 enum Mode {
@@ -212,7 +145,7 @@ int VP8LCalculateEstimateForCacheSize(const uint32_t* const argb,
                                       int xsize, int ysize,
                                       int* const best_cache_bits);
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }
 #endif
 
diff --git a/third_party/libwebp/enc/config.c b/third_party/libwebp/enc/config.c
index acf96b0..af7f0b0 100644
--- a/third_party/libwebp/enc/config.c
+++ b/third_party/libwebp/enc/config.c
@@ -13,10 +13,6 @@
 
 #include "../webp/encode.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // WebPConfig
 //------------------------------------------------------------------------------
@@ -33,7 +29,7 @@ int WebPConfigInitInternal(WebPConfig* config,
   config->target_PSNR = 0.;
   config->method = 4;
   config->sns_strength = 50;
-  config->filter_strength = 60;   // rather high filtering, helps w/ gradients.
+  config->filter_strength = 60;   // mid-filtering
   config->filter_sharpness = 0;
   config->filter_type = 1;        // default: strong (so U/V is filtered too)
   config->partitions = 0;
@@ -58,11 +54,13 @@ int WebPConfigInitInternal(WebPConfig* config,
       config->sns_strength = 80;
       config->filter_sharpness = 4;
       config->filter_strength = 35;
+      config->preprocessing &= ~2;   // no dithering
       break;
     case WEBP_PRESET_PHOTO:
       config->sns_strength = 80;
       config->filter_sharpness = 3;
       config->filter_strength = 30;
+      config->preprocessing |= 2;
       break;
     case WEBP_PRESET_DRAWING:
       config->sns_strength = 25;
@@ -72,10 +70,12 @@ int WebPConfigInitInternal(WebPConfig* config,
     case WEBP_PRESET_ICON:
       config->sns_strength = 0;
       config->filter_strength = 0;   // disable filtering to retain sharpness
+      config->preprocessing &= ~2;   // no dithering
       break;
     case WEBP_PRESET_TEXT:
       config->sns_strength = 0;
       config->filter_strength = 0;   // disable filtering to retain sharpness
+      config->preprocessing &= ~2;   // no dithering
       config->segments = 2;
       break;
     case WEBP_PRESET_DEFAULT:
@@ -111,7 +111,7 @@ int WebPValidateConfig(const WebPConfig* config) {
     return 0;
   if (config->show_compressed < 0 || config->show_compressed > 1)
     return 0;
-  if (config->preprocessing < 0 || config->preprocessing > 1)
+  if (config->preprocessing < 0 || config->preprocessing > 3)
     return 0;
   if (config->partitions < 0 || config->partitions > 3)
     return 0;
@@ -138,6 +138,3 @@ int WebPValidateConfig(const WebPConfig* config) {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/enc/cost.c b/third_party/libwebp/enc/cost.c
index d4916d7..09699f8 100644
--- a/third_party/libwebp/enc/cost.c
+++ b/third_party/libwebp/enc/cost.c
@@ -13,10 +13,6 @@
 
 #include "./cost.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // Boolean-cost cost table
 
@@ -387,110 +383,107 @@ const uint16_t VP8FixedCostsUV[4] = { 302, 984, 439, 642 };
 // note: these values include the fixed VP8BitCost(1, 145) mode selection cost.
 const uint16_t VP8FixedCostsI16[4] = { 663, 919, 872, 919 };
 const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = {
-  { {  251, 1362, 1934, 2085, 2314, 2230, 1839, 1988, 2437, 2348 },
-    {  403,  680, 1507, 1519, 2060, 2005, 1992, 1914, 1924, 1733 },
-    {  353, 1121,  973, 1895, 2060, 1787, 1671, 1516, 2012, 1868 },
-    {  770,  852, 1581,  632, 1393, 1780, 1823, 1936, 1074, 1218 },
-    {  510, 1270, 1467, 1319,  847, 1279, 1792, 2094, 1080, 1353 },
-    {  488, 1322,  918, 1573, 1300,  883, 1814, 1752, 1756, 1502 },
-    {  425,  992, 1820, 1514, 1843, 2440,  937, 1771, 1924, 1129 },
-    {  363, 1248, 1257, 1970, 2194, 2385, 1569,  953, 1951, 1601 },
-    {  723, 1257, 1631,  964,  963, 1508, 1697, 1824,  671, 1418 },
-    {  635, 1038, 1573,  930, 1673, 1413, 1410, 1687, 1410,  749 } },
-  { {  451,  613, 1345, 1702, 1870, 1716, 1728, 1766, 2190, 2310 },
-    {  678,  453, 1171, 1443, 1925, 1831, 2045, 1781, 1887, 1602 },
-    {  711,  666,  674, 1718, 1910, 1493, 1775, 1193, 2325, 2325 },
-    {  883,  854, 1583,  542, 1800, 1878, 1664, 2149, 1207, 1087 },
-    {  669,  994, 1248, 1122,  949, 1179, 1376, 1729, 1070, 1244 },
-    {  715, 1026,  715, 1350, 1430,  930, 1717, 1296, 1479, 1479 },
-    {  544,  841, 1656, 1450, 2094, 3883, 1010, 1759, 2076,  809 },
-    {  610,  855,  957, 1553, 2067, 1561, 1704,  824, 2066, 1226 },
-    {  833,  960, 1416,  819, 1277, 1619, 1501, 1617,  757, 1182 },
-    {  711,  964, 1252,  879, 1441, 1828, 1508, 1636, 1594,  734 } },
-  { {  605,  764,  734, 1713, 1747, 1192, 1819, 1353, 1877, 2392 },
-    {  866,  641,  586, 1622, 2072, 1431, 1888, 1346, 2189, 1764 },
-    {  901,  851,  456, 2165, 2281, 1405, 1739, 1193, 2183, 2443 },
-    {  770, 1045,  952, 1078, 1342, 1191, 1436, 1063, 1303,  995 },
-    {  901, 1086,  727, 1170,  884, 1105, 1267, 1401, 1739, 1337 },
-    {  951, 1162,  595, 1488, 1388,  703, 1790, 1366, 2057, 1724 },
-    {  534,  986, 1273, 1987, 3273, 1485, 1024, 1399, 1583,  866 },
-    {  699, 1182,  695, 1978, 1726, 1986, 1326,  714, 1750, 1672 },
-    {  951, 1217, 1209,  920, 1062, 1441, 1548,  999,  952,  932 },
-    {  733, 1284,  784, 1256, 1557, 1098, 1257, 1357, 1414,  908 } },
-  { {  316, 1075, 1653, 1220, 2145, 2051, 1730, 2131, 1884, 1790 },
-    {  745,  516, 1404,  894, 1599, 2375, 2013, 2105, 1475, 1381 },
-    {  516,  729, 1088, 1319, 1637, 3426, 1636, 1275, 1531, 1453 },
-    {  894,  943, 2138,  468, 1704, 2259, 2069, 1763, 1266, 1158 },
-    {  605, 1025, 1235,  871, 1170, 1767, 1493, 1500, 1104, 1258 },
-    {  739,  826, 1207, 1151, 1412,  846, 1305, 2726, 1014, 1569 },
-    {  558,  825, 1820, 1398, 3344, 1556, 1218, 1550, 1228,  878 },
-    {  429,  951, 1089, 1816, 3861, 3861, 1556,  969, 1568, 1828 },
-    {  883,  961, 1752,  769, 1468, 1810, 2081, 2346,  613, 1298 },
-    {  803,  895, 1372,  641, 1303, 1708, 1686, 1700, 1306, 1033 } },
-  { {  439, 1267, 1270, 1579,  963, 1193, 1723, 1729, 1198, 1993 },
-    {  705,  725, 1029, 1153, 1176, 1103, 1821, 1567, 1259, 1574 },
-    {  723,  859,  802, 1253,  972, 1202, 1407, 1665, 1520, 1674 },
-    {  894,  960, 1254,  887, 1052, 1607, 1344, 1349,  865, 1150 },
-    {  833, 1312, 1337, 1205,  572, 1288, 1414, 1529, 1088, 1430 },
-    {  842, 1279, 1068, 1861,  862,  688, 1861, 1630, 1039, 1381 },
-    {  766,  938, 1279, 1546, 3338, 1550, 1031, 1542, 1288,  640 },
-    {  715, 1090,  835, 1609, 1100, 1100, 1603, 1019, 1102, 1617 },
-    {  894, 1813, 1500, 1188,  789, 1194, 1491, 1919,  617, 1333 },
-    {  610, 1076, 1644, 1281, 1283,  975, 1179, 1688, 1434,  889 } },
-  { {  544,  971, 1146, 1849, 1221,  740, 1857, 1621, 1683, 2430 },
-    {  723,  705,  961, 1371, 1426,  821, 2081, 2079, 1839, 1380 },
-    {  783,  857,  703, 2145, 1419,  814, 1791, 1310, 1609, 2206 },
-    {  997, 1000, 1153,  792, 1229, 1162, 1810, 1418,  942,  979 },
-    {  901, 1226,  883, 1289,  793,  715, 1904, 1649, 1319, 3108 },
-    {  979, 1478,  782, 2216, 1454,  455, 3092, 1591, 1997, 1664 },
-    {  663, 1110, 1504, 1114, 1522, 3311,  676, 1522, 1530, 1024 },
-    {  605, 1138, 1153, 1314, 1569, 1315, 1157,  804, 1574, 1320 },
-    {  770, 1216, 1218, 1227,  869, 1384, 1232, 1375,  834, 1239 },
-    {  775, 1007,  843, 1216, 1225, 1074, 2527, 1479, 1149,  975 } },
-  { {  477,  817, 1309, 1439, 1708, 1454, 1159, 1241, 1945, 1672 },
-    {  577,  796, 1112, 1271, 1618, 1458, 1087, 1345, 1831, 1265 },
-    {  663,  776,  753, 1940, 1690, 1690, 1227, 1097, 3149, 1361 },
-    {  766, 1299, 1744, 1161, 1565, 1106, 1045, 1230, 1232,  707 },
-    {  915, 1026, 1404, 1182, 1184,  851, 1428, 2425, 1043,  789 },
-    {  883, 1456,  790, 1082, 1086,  985, 1083, 1484, 1238, 1160 },
-    {  507, 1345, 2261, 1995, 1847, 3636,  653, 1761, 2287,  933 },
-    {  553, 1193, 1470, 2057, 2059, 2059,  833,  779, 2058, 1263 },
-    {  766, 1275, 1515, 1039,  957, 1554, 1286, 1540, 1289,  705 },
-    {  499, 1378, 1496, 1385, 1850, 1850, 1044, 2465, 1515,  720 } },
-  { {  553,  930,  978, 2077, 1968, 1481, 1457,  761, 1957, 2362 },
-    {  694,  864,  905, 1720, 1670, 1621, 1429,  718, 2125, 1477 },
-    {  699,  968,  658, 3190, 2024, 1479, 1865,  750, 2060, 2320 },
-    {  733, 1308, 1296, 1062, 1576, 1322, 1062, 1112, 1172,  816 },
-    {  920,  927, 1052,  939,  947, 1156, 1152, 1073, 3056, 1268 },
-    {  723, 1534,  711, 1547, 1294,  892, 1553,  928, 1815, 1561 },
-    {  663, 1366, 1583, 2111, 1712, 3501,  522, 1155, 2130, 1133 },
-    {  614, 1731, 1188, 2343, 1944, 3733, 1287,  487, 3546, 1758 },
-    {  770, 1585, 1312,  826,  884, 2673, 1185, 1006, 1195, 1195 },
-    {  758, 1333, 1273, 1023, 1621, 1162, 1351,  833, 1479,  862 } },
-  { {  376, 1193, 1446, 1149, 1545, 1577, 1870, 1789, 1175, 1823 },
-    {  803,  633, 1136, 1058, 1350, 1323, 1598, 2247, 1072, 1252 },
-    {  614, 1048,  943,  981, 1152, 1869, 1461, 1020, 1618, 1618 },
-    { 1107, 1085, 1282,  592, 1779, 1933, 1648, 2403,  691, 1246 },
-    {  851, 1309, 1223, 1243,  895, 1593, 1792, 2317,  627, 1076 },
-    {  770, 1216, 1030, 1125,  921,  981, 1629, 1131, 1049, 1646 },
-    {  626, 1469, 1456, 1081, 1489, 3278,  981, 1232, 1498,  733 },
-    {  617, 1201,  812, 1220, 1476, 1476, 1478,  970, 1228, 1488 },
-    { 1179, 1393, 1540,  999, 1243, 1503, 1916, 1925,  414, 1614 },
-    {  943, 1088, 1490,  682, 1112, 1372, 1756, 1505,  966,  966 } },
-  { {  322, 1142, 1589, 1396, 2144, 1859, 1359, 1925, 2084, 1518 },
-    {  617,  625, 1241, 1234, 2121, 1615, 1524, 1858, 1720, 1004 },
-    {  553,  851,  786, 1299, 1452, 1560, 1372, 1561, 1967, 1713 },
-    {  770,  977, 1396,  568, 1893, 1639, 1540, 2108, 1430, 1013 },
-    {  684, 1120, 1375,  982,  930, 2719, 1638, 1643,  933,  993 },
-    {  553, 1103,  996, 1356, 1361, 1005, 1507, 1761, 1184, 1268 },
-    {  419, 1247, 1537, 1554, 1817, 3606, 1026, 1666, 1829,  923 },
-    {  439, 1139, 1101, 1257, 3710, 1922, 1205, 1040, 1931, 1529 },
-    {  979,  935, 1269,  847, 1202, 1286, 1530, 1535,  827, 1036 },
-    {  516, 1378, 1569, 1110, 1798, 1798, 1198, 2199, 1543,  712 } },
+  { {   40, 1151, 1723, 1874, 2103, 2019, 1628, 1777, 2226, 2137 },
+    {  192,  469, 1296, 1308, 1849, 1794, 1781, 1703, 1713, 1522 },
+    {  142,  910,  762, 1684, 1849, 1576, 1460, 1305, 1801, 1657 },
+    {  559,  641, 1370,  421, 1182, 1569, 1612, 1725,  863, 1007 },
+    {  299, 1059, 1256, 1108,  636, 1068, 1581, 1883,  869, 1142 },
+    {  277, 1111,  707, 1362, 1089,  672, 1603, 1541, 1545, 1291 },
+    {  214,  781, 1609, 1303, 1632, 2229,  726, 1560, 1713,  918 },
+    {  152, 1037, 1046, 1759, 1983, 2174, 1358,  742, 1740, 1390 },
+    {  512, 1046, 1420,  753,  752, 1297, 1486, 1613,  460, 1207 },
+    {  424,  827, 1362,  719, 1462, 1202, 1199, 1476, 1199,  538 } },
+  { {  240,  402, 1134, 1491, 1659, 1505, 1517, 1555, 1979, 2099 },
+    {  467,  242,  960, 1232, 1714, 1620, 1834, 1570, 1676, 1391 },
+    {  500,  455,  463, 1507, 1699, 1282, 1564,  982, 2114, 2114 },
+    {  672,  643, 1372,  331, 1589, 1667, 1453, 1938,  996,  876 },
+    {  458,  783, 1037,  911,  738,  968, 1165, 1518,  859, 1033 },
+    {  504,  815,  504, 1139, 1219,  719, 1506, 1085, 1268, 1268 },
+    {  333,  630, 1445, 1239, 1883, 3672,  799, 1548, 1865,  598 },
+    {  399,  644,  746, 1342, 1856, 1350, 1493,  613, 1855, 1015 },
+    {  622,  749, 1205,  608, 1066, 1408, 1290, 1406,  546,  971 },
+    {  500,  753, 1041,  668, 1230, 1617, 1297, 1425, 1383,  523 } },
+  { {  394,  553,  523, 1502, 1536,  981, 1608, 1142, 1666, 2181 },
+    {  655,  430,  375, 1411, 1861, 1220, 1677, 1135, 1978, 1553 },
+    {  690,  640,  245, 1954, 2070, 1194, 1528,  982, 1972, 2232 },
+    {  559,  834,  741,  867, 1131,  980, 1225,  852, 1092,  784 },
+    {  690,  875,  516,  959,  673,  894, 1056, 1190, 1528, 1126 },
+    {  740,  951,  384, 1277, 1177,  492, 1579, 1155, 1846, 1513 },
+    {  323,  775, 1062, 1776, 3062, 1274,  813, 1188, 1372,  655 },
+    {  488,  971,  484, 1767, 1515, 1775, 1115,  503, 1539, 1461 },
+    {  740, 1006,  998,  709,  851, 1230, 1337,  788,  741,  721 },
+    {  522, 1073,  573, 1045, 1346,  887, 1046, 1146, 1203,  697 } },
+  { {  105,  864, 1442, 1009, 1934, 1840, 1519, 1920, 1673, 1579 },
+    {  534,  305, 1193,  683, 1388, 2164, 1802, 1894, 1264, 1170 },
+    {  305,  518,  877, 1108, 1426, 3215, 1425, 1064, 1320, 1242 },
+    {  683,  732, 1927,  257, 1493, 2048, 1858, 1552, 1055,  947 },
+    {  394,  814, 1024,  660,  959, 1556, 1282, 1289,  893, 1047 },
+    {  528,  615,  996,  940, 1201,  635, 1094, 2515,  803, 1358 },
+    {  347,  614, 1609, 1187, 3133, 1345, 1007, 1339, 1017,  667 },
+    {  218,  740,  878, 1605, 3650, 3650, 1345,  758, 1357, 1617 },
+    {  672,  750, 1541,  558, 1257, 1599, 1870, 2135,  402, 1087 },
+    {  592,  684, 1161,  430, 1092, 1497, 1475, 1489, 1095,  822 } },
+  { {  228, 1056, 1059, 1368,  752,  982, 1512, 1518,  987, 1782 },
+    {  494,  514,  818,  942,  965,  892, 1610, 1356, 1048, 1363 },
+    {  512,  648,  591, 1042,  761,  991, 1196, 1454, 1309, 1463 },
+    {  683,  749, 1043,  676,  841, 1396, 1133, 1138,  654,  939 },
+    {  622, 1101, 1126,  994,  361, 1077, 1203, 1318,  877, 1219 },
+    {  631, 1068,  857, 1650,  651,  477, 1650, 1419,  828, 1170 },
+    {  555,  727, 1068, 1335, 3127, 1339,  820, 1331, 1077,  429 },
+    {  504,  879,  624, 1398,  889,  889, 1392,  808,  891, 1406 },
+    {  683, 1602, 1289,  977,  578,  983, 1280, 1708,  406, 1122 },
+    {  399,  865, 1433, 1070, 1072,  764,  968, 1477, 1223,  678 } },
+  { {  333,  760,  935, 1638, 1010,  529, 1646, 1410, 1472, 2219 },
+    {  512,  494,  750, 1160, 1215,  610, 1870, 1868, 1628, 1169 },
+    {  572,  646,  492, 1934, 1208,  603, 1580, 1099, 1398, 1995 },
+    {  786,  789,  942,  581, 1018,  951, 1599, 1207,  731,  768 },
+    {  690, 1015,  672, 1078,  582,  504, 1693, 1438, 1108, 2897 },
+    {  768, 1267,  571, 2005, 1243,  244, 2881, 1380, 1786, 1453 },
+    {  452,  899, 1293,  903, 1311, 3100,  465, 1311, 1319,  813 },
+    {  394,  927,  942, 1103, 1358, 1104,  946,  593, 1363, 1109 },
+    {  559, 1005, 1007, 1016,  658, 1173, 1021, 1164,  623, 1028 },
+    {  564,  796,  632, 1005, 1014,  863, 2316, 1268,  938,  764 } },
+  { {  266,  606, 1098, 1228, 1497, 1243,  948, 1030, 1734, 1461 },
+    {  366,  585,  901, 1060, 1407, 1247,  876, 1134, 1620, 1054 },
+    {  452,  565,  542, 1729, 1479, 1479, 1016,  886, 2938, 1150 },
+    {  555, 1088, 1533,  950, 1354,  895,  834, 1019, 1021,  496 },
+    {  704,  815, 1193,  971,  973,  640, 1217, 2214,  832,  578 },
+    {  672, 1245,  579,  871,  875,  774,  872, 1273, 1027,  949 },
+    {  296, 1134, 2050, 1784, 1636, 3425,  442, 1550, 2076,  722 },
+    {  342,  982, 1259, 1846, 1848, 1848,  622,  568, 1847, 1052 },
+    {  555, 1064, 1304,  828,  746, 1343, 1075, 1329, 1078,  494 },
+    {  288, 1167, 1285, 1174, 1639, 1639,  833, 2254, 1304,  509 } },
+  { {  342,  719,  767, 1866, 1757, 1270, 1246,  550, 1746, 2151 },
+    {  483,  653,  694, 1509, 1459, 1410, 1218,  507, 1914, 1266 },
+    {  488,  757,  447, 2979, 1813, 1268, 1654,  539, 1849, 2109 },
+    {  522, 1097, 1085,  851, 1365, 1111,  851,  901,  961,  605 },
+    {  709,  716,  841,  728,  736,  945,  941,  862, 2845, 1057 },
+    {  512, 1323,  500, 1336, 1083,  681, 1342,  717, 1604, 1350 },
+    {  452, 1155, 1372, 1900, 1501, 3290,  311,  944, 1919,  922 },
+    {  403, 1520,  977, 2132, 1733, 3522, 1076,  276, 3335, 1547 },
+    {  559, 1374, 1101,  615,  673, 2462,  974,  795,  984,  984 },
+    {  547, 1122, 1062,  812, 1410,  951, 1140,  622, 1268,  651 } },
+  { {  165,  982, 1235,  938, 1334, 1366, 1659, 1578,  964, 1612 },
+    {  592,  422,  925,  847, 1139, 1112, 1387, 2036,  861, 1041 },
+    {  403,  837,  732,  770,  941, 1658, 1250,  809, 1407, 1407 },
+    {  896,  874, 1071,  381, 1568, 1722, 1437, 2192,  480, 1035 },
+    {  640, 1098, 1012, 1032,  684, 1382, 1581, 2106,  416,  865 },
+    {  559, 1005,  819,  914,  710,  770, 1418,  920,  838, 1435 },
+    {  415, 1258, 1245,  870, 1278, 3067,  770, 1021, 1287,  522 },
+    {  406,  990,  601, 1009, 1265, 1265, 1267,  759, 1017, 1277 },
+    {  968, 1182, 1329,  788, 1032, 1292, 1705, 1714,  203, 1403 },
+    {  732,  877, 1279,  471,  901, 1161, 1545, 1294,  755,  755 } },
+  { {  111,  931, 1378, 1185, 1933, 1648, 1148, 1714, 1873, 1307 },
+    {  406,  414, 1030, 1023, 1910, 1404, 1313, 1647, 1509,  793 },
+    {  342,  640,  575, 1088, 1241, 1349, 1161, 1350, 1756, 1502 },
+    {  559,  766, 1185,  357, 1682, 1428, 1329, 1897, 1219,  802 },
+    {  473,  909, 1164,  771,  719, 2508, 1427, 1432,  722,  782 },
+    {  342,  892,  785, 1145, 1150,  794, 1296, 1550,  973, 1057 },
+    {  208, 1036, 1326, 1343, 1606, 3395,  815, 1455, 1618,  712 },
+    {  228,  928,  890, 1046, 3499, 1711,  994,  829, 1720, 1318 },
+    {  768,  724, 1058,  636,  991, 1075, 1319, 1324,  616,  825 },
+    {  305, 1167, 1358,  899, 1587, 1587,  987, 1988, 1332,  501 } }
 };
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/enc/cost.h b/third_party/libwebp/enc/cost.h
index 7d7c2c7..3cbad1a 100644
--- a/third_party/libwebp/enc/cost.h
+++ b/third_party/libwebp/enc/cost.h
@@ -16,7 +16,7 @@
 
 #include "./vp8enci.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -44,7 +44,7 @@ extern const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES];
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/enc/filter.c b/third_party/libwebp/enc/filter.c
index aae2723d..dd27804 100644
--- a/third_party/libwebp/enc/filter.c
+++ b/third_party/libwebp/enc/filter.c
@@ -11,12 +11,57 @@
 //
 // Author: somnath@google.com (Somnath Banerjee)
 
+#include <assert.h>
 #include "./vp8enci.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+// This table gives, for a given sharpness, the minimum filtering strength
+// needed in order to filter a given edge step delta.
+// It was constructed by brute-force inspection: for each delta, we iterate
+// over all possible filtering strengths / thresholds until needs_filter()
+// returns true.
+#define MAX_DELTA_SIZE 64
+static const uint8_t kLevelsFromDelta[8][MAX_DELTA_SIZE] = {
+  { 0,   1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
+    16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+    32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+    48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 },
+  { 0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 14, 15, 17, 18,
+    20, 21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42,
+    44, 45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 14, 16, 17, 19,
+    20, 22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43,
+    44, 46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 13, 15, 16, 18, 19,
+    21, 22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43,
+    45, 46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  3,  5,  6,  7,  8,  9, 11, 12, 14, 15, 17, 18, 20,
+    21, 23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44,
+    45, 47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 13, 15, 16, 17, 19, 20,
+    22, 23, 25, 26, 28, 29, 31, 32, 34, 35, 37, 38, 40, 41, 43, 44,
+    46, 47, 49, 50, 52, 53, 55, 56, 58, 59, 61, 62, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 13, 15, 16, 18, 19, 21,
+    22, 24, 25, 27, 28, 30, 31, 33, 34, 36, 37, 39, 40, 42, 43, 45,
+    46, 48, 49, 51, 52, 54, 55, 57, 58, 60, 61, 63, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 },
+  {  0,  1,  2,  4,  5,  7,  8,  9, 11, 12, 14, 15, 17, 18, 20, 21,
+    23, 24, 26, 27, 29, 30, 32, 33, 35, 36, 38, 39, 41, 42, 44, 45,
+    47, 48, 50, 51, 53, 54, 56, 57, 59, 60, 62, 63, 63, 63, 63, 63,
+    63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63, 63 }
+};
+
+int VP8FilterStrengthFromDelta(int sharpness, int delta) {
+  const int pos = (delta < MAX_DELTA_SIZE) ? delta : MAX_DELTA_SIZE - 1;
+  assert(sharpness >= 0 && sharpness <= 7);
+  return kLevelsFromDelta[sharpness][pos];
+}
 
+// -----------------------------------------------------------------------------
 // NOTE: clip1, tables and InitTables are repeated entries of dsp.c
 static uint8_t abs0[255 + 255 + 1];     // abs(i)
 static uint8_t abs1[255 + 255 + 1];     // abs(i)>>1
@@ -340,28 +385,29 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) {
 // loop filter strength
 
 void VP8InitFilter(VP8EncIterator* const it) {
-  int s, i;
-  if (!it->lf_stats_) return;
-
-  InitTables();
-  for (s = 0; s < NUM_MB_SEGMENTS; s++) {
-    for (i = 0; i < MAX_LF_LEVELS; i++) {
-      (*it->lf_stats_)[s][i] = 0;
+  if (it->lf_stats_ != NULL) {
+    int s, i;
+    InitTables();
+    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+      for (i = 0; i < MAX_LF_LEVELS; i++) {
+        (*it->lf_stats_)[s][i] = 0;
+      }
     }
   }
 }
 
 void VP8StoreFilterStats(VP8EncIterator* const it) {
   int d;
+  VP8Encoder* const enc = it->enc_;
   const int s = it->mb_->segment_;
-  const int level0 = it->enc_->dqm_[s].fstrength_;  // TODO: ref_lf_delta[]
+  const int level0 = enc->dqm_[s].fstrength_;  // TODO: ref_lf_delta[]
 
   // explore +/-quant range of values around level0
-  const int delta_min = -it->enc_->dqm_[s].quant_;
-  const int delta_max = it->enc_->dqm_[s].quant_;
+  const int delta_min = -enc->dqm_[s].quant_;
+  const int delta_max = enc->dqm_[s].quant_;
   const int step_size = (delta_max - delta_min >= 4) ? 4 : 1;
 
-  if (!it->lf_stats_) return;
+  if (it->lf_stats_ == NULL) return;
 
   // NOTE: Currently we are applying filter only across the sublock edges
   // There are two reasons for that.
@@ -385,27 +431,41 @@ void VP8StoreFilterStats(VP8EncIterator* const it) {
 }
 
 void VP8AdjustFilterStrength(VP8EncIterator* const it) {
-  int s;
   VP8Encoder* const enc = it->enc_;
-
-  if (!it->lf_stats_) {
-    return;
-  }
-  for (s = 0; s < NUM_MB_SEGMENTS; s++) {
-    int i, best_level = 0;
-    // Improvement over filter level 0 should be at least 1e-5 (relatively)
-    double best_v = 1.00001 * (*it->lf_stats_)[s][0];
-    for (i = 1; i < MAX_LF_LEVELS; i++) {
-      const double v = (*it->lf_stats_)[s][i];
-      if (v > best_v) {
-        best_v = v;
-        best_level = i;
+  if (it->lf_stats_ != NULL) {
+    int s;
+    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+      int i, best_level = 0;
+      // Improvement over filter level 0 should be at least 1e-5 (relatively)
+      double best_v = 1.00001 * (*it->lf_stats_)[s][0];
+      for (i = 1; i < MAX_LF_LEVELS; i++) {
+        const double v = (*it->lf_stats_)[s][i];
+        if (v > best_v) {
+          best_v = v;
+          best_level = i;
+        }
+      }
+      enc->dqm_[s].fstrength_ = best_level;
+    }
+  } else if (enc->config_->filter_strength > 0) {
+    int max_level = 0;
+    int s;
+    for (s = 0; s < NUM_MB_SEGMENTS; s++) {
+      VP8SegmentInfo* const dqm = &enc->dqm_[s];
+      // this '>> 3' accounts for some inverse WHT scaling
+      const int delta = (dqm->max_edge_ * dqm->y2_.q_[1]) >> 3;
+      const int level =
+          VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, delta);
+      if (level > dqm->fstrength_) {
+        dqm->fstrength_ = level;
+      }
+      if (max_level < dqm->fstrength_) {
+        max_level = dqm->fstrength_;
       }
     }
-    enc->dqm_[s].fstrength_ = best_level;
+    enc->filter_hdr_.level_ = max_level;
   }
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
+// -----------------------------------------------------------------------------
+
diff --git a/third_party/libwebp/enc/frame.c b/third_party/libwebp/enc/frame.c
index c56abed..2582244 100644
--- a/third_party/libwebp/enc/frame.c
+++ b/third_party/libwebp/enc/frame.c
@@ -18,10 +18,7 @@
 
 #include "./vp8enci.h"
 #include "./cost.h"
-
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+#include "../webp/format_constants.h"  // RIFF constants
 
 #define SEGMENT_VISU 0
 #define DEBUG_SEARCH 0    // useful to track search convergence
@@ -40,6 +37,63 @@ typedef struct {
 } VP8Residual;
 
 //------------------------------------------------------------------------------
+// multi-pass convergence
+
+#define HEADER_SIZE_ESTIMATE (RIFF_HEADER_SIZE + CHUNK_HEADER_SIZE +  \
+                              VP8_FRAME_HEADER_SIZE)
+#define DQ_LIMIT 0.4  // convergence is considered reached if dq < DQ_LIMIT
+// we allow 2k of extra head-room in PARTITION0 limit.
+#define PARTITION0_SIZE_LIMIT ((VP8_MAX_PARTITION0_SIZE - 2048ULL) << 11)
+
+typedef struct {  // struct for organizing convergence in either size or PSNR
+  int is_first;
+  float dq;
+  float q, last_q;
+  double value, last_value;   // PSNR or size
+  double target;
+  int do_size_search;
+} PassStats;
+
+static int InitPassStats(const VP8Encoder* const enc, PassStats* const s) {
+  const uint64_t target_size = (uint64_t)enc->config_->target_size;
+  const int do_size_search = (target_size != 0);
+  const float target_PSNR = enc->config_->target_PSNR;
+
+  s->is_first = 1;
+  s->dq = 10.f;
+  s->q = s->last_q = enc->config_->quality;
+  s->target = do_size_search ? (double)target_size
+            : (target_PSNR > 0.) ? target_PSNR
+            : 40.;   // default, just in case
+  s->value = s->last_value = 0.;
+  s->do_size_search = do_size_search;
+  return do_size_search;
+}
+
+static float Clamp(float v, float min, float max) {
+  return (v < min) ? min : (v > max) ? max : v;
+}
+
+static float ComputeNextQ(PassStats* const s) {
+  float dq;
+  if (s->is_first) {
+    dq = (s->value > s->target) ? -s->dq : s->dq;
+    s->is_first = 0;
+  } else if (s->value != s->last_value) {
+    const double slope = (s->target - s->value) / (s->last_value - s->value);
+    dq = (float)(slope * (s->last_q - s->q));
+  } else {
+    dq = 0.;  // we're done?!
+  }
+  // Limit variable to avoid large swings.
+  s->dq = Clamp(dq, -30.f, 30.f);
+  s->last_q = s->q;
+  s->last_value = s->value;
+  s->q = Clamp(s->q + s->dq, 0.f, 100.f);
+  return s->q;
+}
+
+//------------------------------------------------------------------------------
 // Tables for level coding
 
 const uint8_t VP8EncBands[16 + 1] = {
@@ -292,31 +346,20 @@ static int GetResidualCost(int ctx0, const VP8Residual* const res) {
   if (res->last < 0) {
     return VP8BitCost(0, p0);
   }
-  cost = 0;
-  while (n < res->last) {
-    int v = res->coeffs[n];
+  cost = VP8BitCost(1, p0);
+  for (; n < res->last; ++n) {
+    const int v = abs(res->coeffs[n]);
     const int b = VP8EncBands[n + 1];
-    ++n;
-    if (v == 0) {
-      // short-case for VP8LevelCost(t, 0) (note: VP8LevelFixedCosts[0] == 0):
-      cost += t[0];
-      t = res->cost[b][0];
-      continue;
-    }
-    v = abs(v);
-    cost += VP8BitCost(1, p0);
+    const int ctx = (v >= 2) ? 2 : v;
     cost += VP8LevelCost(t, v);
-    {
-      const int ctx = (v == 1) ? 1 : 2;
-      p0 = res->prob[b][ctx][0];
-      t = res->cost[b][ctx];
-    }
+    t = res->cost[b][ctx];
+    // the masking trick is faster than "if (v) cost += ..." with clang
+    cost += (v ? ~0U : 0) & VP8BitCost(1, res->prob[b][ctx][0]);
   }
   // Last coefficient is always non-zero
   {
     const int v = abs(res->coeffs[n]);
     assert(v != 0);
-    cost += VP8BitCost(1, p0);
     cost += VP8LevelCost(t, v);
     if (n < 15) {
       const int b = VP8EncBands[n + 1];
@@ -685,81 +728,83 @@ static void StoreSideInfo(const VP8EncIterator* const it) {
 #endif
 }
 
+static double GetPSNR(uint64_t mse, uint64_t size) {
+  return (mse > 0 && size > 0) ? 10. * log10(255. * 255. * size / mse) : 99;
+}
+
 //------------------------------------------------------------------------------
 //  StatLoop(): only collect statistics (number of skips, token usage, ...).
 //  This is used for deciding optimal probabilities. It also modifies the
-//  quantizer value if some target (size, PNSR) was specified.
-
-#define kHeaderSizeEstimate (15 + 20 + 10)      // TODO: fix better
+//  quantizer value if some target (size, PSNR) was specified.
 
 static void SetLoopParams(VP8Encoder* const enc, float q) {
   // Make sure the quality parameter is inside valid bounds
-  if (q < 0.) {
-    q = 0;
-  } else if (q > 100.) {
-    q = 100;
-  }
+  q = Clamp(q, 0.f, 100.f);
 
   VP8SetSegmentParams(enc, q);      // setup segment quantizations and filters
   SetSegmentProbas(enc);            // compute segment probabilities
 
   ResetStats(enc);
-  ResetTokenStats(enc);
-
   ResetSSE(enc);
 }
 
-static int OneStatPass(VP8Encoder* const enc, float q, VP8RDLevel rd_opt,
-                       int nb_mbs, float* const PSNR, int percent_delta) {
+static uint64_t OneStatPass(VP8Encoder* const enc, VP8RDLevel rd_opt,
+                            int nb_mbs, int percent_delta,
+                            PassStats* const s) {
   VP8EncIterator it;
   uint64_t size = 0;
+  uint64_t size_p0 = 0;
   uint64_t distortion = 0;
   const uint64_t pixel_count = nb_mbs * 384;
 
-  SetLoopParams(enc, q);
-
   VP8IteratorInit(enc, &it);
+  SetLoopParams(enc, s->q);
   do {
     VP8ModeScore info;
-    VP8IteratorImport(&it);
+    VP8IteratorImport(&it, NULL);
     if (VP8Decimate(&it, &info, rd_opt)) {
       // Just record the number of skips and act like skip_proba is not used.
       enc->proba_.nb_skip_++;
     }
     RecordResiduals(&it, &info);
-    size += info.R;
+    size += info.R + info.H;
+    size_p0 += info.H;
     distortion += info.D;
     if (percent_delta && !VP8IteratorProgress(&it, percent_delta))
       return 0;
-  } while (VP8IteratorNext(&it, it.yuv_out_) && --nb_mbs > 0);
-  size += FinalizeSkipProba(enc);
-  size += FinalizeTokenProbas(&enc->proba_);
-  size += enc->segment_hdr_.size_;
-  size = ((size + 1024) >> 11) + kHeaderSizeEstimate;
-
-  if (PSNR) {
-    *PSNR = (float)(10.* log10(255. * 255. * pixel_count / distortion));
+    VP8IteratorSaveBoundary(&it);
+  } while (VP8IteratorNext(&it) && --nb_mbs > 0);
+
+  size_p0 += enc->segment_hdr_.size_;
+  if (s->do_size_search) {
+    size += FinalizeSkipProba(enc);
+    size += FinalizeTokenProbas(&enc->proba_);
+    size = ((size + size_p0 + 1024) >> 11) + HEADER_SIZE_ESTIMATE;
+    s->value = (double)size;
+  } else {
+    s->value = GetPSNR(distortion, pixel_count);
   }
-  return (int)size;
+  return size_p0;
 }
 
-// successive refinement increments.
-static const int dqs[] = { 20, 15, 10, 8, 6, 4, 2, 1, 0 };
-
 static int StatLoop(VP8Encoder* const enc) {
   const int method = enc->method_;
   const int do_search = enc->do_search_;
   const int fast_probe = ((method == 0 || method == 3) && !do_search);
-  float q = enc->config_->quality;
-  const int max_passes = enc->config_->pass;
+  int num_pass_left = enc->config_->pass;
   const int task_percent = 20;
-  const int percent_per_pass = (task_percent + max_passes / 2) / max_passes;
+  const int percent_per_pass =
+      (task_percent + num_pass_left / 2) / num_pass_left;
   const int final_percent = enc->percent_ + task_percent;
-  int pass;
-  int nb_mbs;
+  const VP8RDLevel rd_opt =
+      (method >= 3 || do_search) ? RD_OPT_BASIC : RD_OPT_NONE;
+  int nb_mbs = enc->mb_w_ * enc->mb_h_;
+  PassStats stats;
+
+  InitPassStats(enc, &stats);
+  ResetTokenStats(enc);
 
   // Fast mode: quick analysis pass over few mbs. Better than nothing.
-  nb_mbs = enc->mb_w_ * enc->mb_h_;
   if (fast_probe) {
     if (method == 3) {  // we need more stats for method 3 to be reliable.
       nb_mbs = (nb_mbs > 200) ? nb_mbs >> 1 : 100;
@@ -768,37 +813,35 @@ static int StatLoop(VP8Encoder* const enc) {
     }
   }
 
-  // No target size: just do several pass without changing 'q'
-  if (!do_search) {
-    for (pass = 0; pass < max_passes; ++pass) {
-      const VP8RDLevel rd_opt = (method >= 3) ? RD_OPT_BASIC : RD_OPT_NONE;
-      if (!OneStatPass(enc, q, rd_opt, nb_mbs, NULL, percent_per_pass)) {
-        return 0;
-      }
-    }
-  } else {
-    // binary search for a size close to target
-    for (pass = 0; pass < max_passes && (dqs[pass] > 0); ++pass) {
-      float PSNR;
-      int criterion;
-      const int size = OneStatPass(enc, q, RD_OPT_BASIC, nb_mbs, &PSNR,
-                                   percent_per_pass);
-#if DEBUG_SEARCH
-      printf("#%d size=%d PSNR=%.2f q=%.2f\n", pass, size, PSNR, q);
+  while (num_pass_left-- > 0) {
+    const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
+                             (num_pass_left == 0) ||
+                             (enc->max_i4_header_bits_ == 0);
+    const uint64_t size_p0 =
+        OneStatPass(enc, rd_opt, nb_mbs, percent_per_pass, &stats);
+    if (size_p0 == 0) return 0;
+#if (DEBUG_SEARCH > 0)
+    printf("#%d value:%.1lf -> %.1lf   q:%.2f -> %.2f\n",
+           num_pass_left, stats.last_value, stats.value, stats.last_q, stats.q);
 #endif
-      if (size == 0) return 0;
-      if (enc->config_->target_PSNR > 0) {
-        criterion = (PSNR < enc->config_->target_PSNR);
-      } else {
-        criterion = (size < enc->config_->target_size);
-      }
-      // dichotomize
-      if (criterion) {
-        q += dqs[pass];
-      } else {
-        q -= dqs[pass];
-      }
+    if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
+      ++num_pass_left;
+      enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
+      continue;                        // ...and start over
     }
+    if (is_last_pass) {
+      break;
+    }
+    // If no target size: just do several passes without changing 'q'
+    if (do_search) {
+      ComputeNextQ(&stats);
+      if (fabs(stats.dq) <= DQ_LIMIT) break;
+    }
+  }
+  if (!do_search || !stats.do_size_search) {
+    // Need to finalize probas now, since it wasn't done during the search.
+    FinalizeSkipProba(enc);
+    FinalizeTokenProbas(&enc->proba_);
   }
   VP8CalculateLevelCosts(&enc->proba_);  // finalize costs
   return WebPReportProgress(enc->pic_, final_percent, &enc->percent_);
@@ -835,7 +878,7 @@ static int PostLoopFinalize(VP8EncIterator* const it, int ok) {
   }
 
   if (ok) {      // All good. Finish up.
-    if (enc->pic_->stats) {           // finalize byte counters...
+    if (enc->pic_->stats != NULL) {  // finalize byte counters...
       int i, s;
       for (i = 0; i <= 2; ++i) {
         for (s = 0; s < NUM_MB_SEGMENTS; ++s) {
@@ -877,7 +920,7 @@ int VP8EncLoop(VP8Encoder* const enc) {
     const int dont_use_skip = !enc->proba_.use_skip_proba_;
     const VP8RDLevel rd_opt = enc->rd_opt_level_;
 
-    VP8IteratorImport(&it);
+    VP8IteratorImport(&it, NULL);
     // Warning! order is important: first call VP8Decimate() and
     // *then* decide how to code the skip decision if there's one.
     if (!VP8Decimate(&it, &info, rd_opt) || dont_use_skip) {
@@ -894,7 +937,8 @@ int VP8EncLoop(VP8Encoder* const enc) {
     VP8StoreFilterStats(&it);
     VP8IteratorExport(&it);
     ok = VP8IteratorProgress(&it, 20);
-  } while (ok && VP8IteratorNext(&it, it.yuv_out_));
+    VP8IteratorSaveBoundary(&it);
+  } while (ok && VP8IteratorNext(&it));
 
   return PostLoopFinalize(&it, ok);
 }
@@ -904,62 +948,110 @@ int VP8EncLoop(VP8Encoder* const enc) {
 
 #if !defined(DISABLE_TOKEN_BUFFER)
 
-#define MIN_COUNT 96   // minimum number of macroblocks before updating stats
+#define MIN_COUNT 96  // minimum number of macroblocks before updating stats
 
 int VP8EncTokenLoop(VP8Encoder* const enc) {
-  int ok;
-  // Roughly refresh the proba height times per pass
+  // Roughly refresh the proba eight times per pass
   int max_count = (enc->mb_w_ * enc->mb_h_) >> 3;
-  int cnt;
+  int num_pass_left = enc->config_->pass;
+  const int do_search = enc->do_search_;
   VP8EncIterator it;
   VP8Proba* const proba = &enc->proba_;
   const VP8RDLevel rd_opt = enc->rd_opt_level_;
+  const uint64_t pixel_count = enc->mb_w_ * enc->mb_h_ * 384;
+  PassStats stats;
+  int ok;
+
+  InitPassStats(enc, &stats);
+  ok = PreLoopInitialize(enc);
+  if (!ok) return 0;
 
   if (max_count < MIN_COUNT) max_count = MIN_COUNT;
-  cnt = max_count;
 
   assert(enc->num_parts_ == 1);
   assert(enc->use_tokens_);
   assert(proba->use_skip_proba_ == 0);
   assert(rd_opt >= RD_OPT_BASIC);   // otherwise, token-buffer won't be useful
-  assert(!enc->do_search_);         // TODO(skal): handle pass and dichotomy
-
-  SetLoopParams(enc, enc->config_->quality);
-
-  ok = PreLoopInitialize(enc);
-  if (!ok) return 0;
-
-  VP8IteratorInit(enc, &it);
-  VP8InitFilter(&it);
-  do {
-    VP8ModeScore info;
-    VP8IteratorImport(&it);
-    if (--cnt < 0) {
-      FinalizeTokenProbas(proba);
-      VP8CalculateLevelCosts(proba);  // refresh cost tables for rd-opt
-      cnt = max_count;
-    }
-    VP8Decimate(&it, &info, rd_opt);
-    RecordTokens(&it, &info, &enc->tokens_);
-#ifdef WEBP_EXPERIMENTAL_FEATURES
-    if (enc->use_layer_) {
-      VP8EncCodeLayerBlock(&it);
+  assert(num_pass_left > 0);
+
+  while (ok && num_pass_left-- > 0) {
+    const int is_last_pass = (fabs(stats.dq) <= DQ_LIMIT) ||
+                             (num_pass_left == 0) ||
+                             (enc->max_i4_header_bits_ == 0);
+    uint64_t size_p0 = 0;
+    uint64_t distortion = 0;
+    int cnt = max_count;
+    VP8IteratorInit(enc, &it);
+    SetLoopParams(enc, stats.q);
+    if (is_last_pass) {
+      ResetTokenStats(enc);
+      VP8InitFilter(&it);  // don't collect stats until last pass (too costly)
     }
+    VP8TBufferClear(&enc->tokens_);
+    do {
+      VP8ModeScore info;
+      VP8IteratorImport(&it, NULL);
+      if (--cnt < 0) {
+        FinalizeTokenProbas(proba);
+        VP8CalculateLevelCosts(proba);  // refresh cost tables for rd-opt
+        cnt = max_count;
+      }
+      VP8Decimate(&it, &info, rd_opt);
+      RecordTokens(&it, &info, &enc->tokens_);
+      size_p0 += info.H;
+      distortion += info.D;
+#ifdef WEBP_EXPERIMENTAL_FEATURES
+      if (enc->use_layer_) {
+        VP8EncCodeLayerBlock(&it);
+      }
 #endif
-    StoreSideInfo(&it);
-    VP8StoreFilterStats(&it);
-    VP8IteratorExport(&it);
-    ok = VP8IteratorProgress(&it, 20);
-  } while (ok && VP8IteratorNext(&it, it.yuv_out_));
-
-  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
+      if (is_last_pass) {
+        StoreSideInfo(&it);
+        VP8StoreFilterStats(&it);
+        VP8IteratorExport(&it);
+        ok = VP8IteratorProgress(&it, 20);
+      }
+      VP8IteratorSaveBoundary(&it);
+    } while (ok && VP8IteratorNext(&it));
+    if (!ok) break;
+
+    size_p0 += enc->segment_hdr_.size_;
+    if (stats.do_size_search) {
+      uint64_t size = FinalizeTokenProbas(&enc->proba_);
+      size += VP8EstimateTokenSize(&enc->tokens_,
+                                   (const uint8_t*)proba->coeffs_);
+      size = (size + size_p0 + 1024) >> 11;  // -> size in bytes
+      size += HEADER_SIZE_ESTIMATE;
+      stats.value = (double)size;
+    } else {  // compute and store PSNR
+      stats.value = GetPSNR(distortion, pixel_count);
+    }
 
+#if (DEBUG_SEARCH > 0)
+    printf("#%2d metric:%.1lf -> %.1lf   last_q=%.2lf q=%.2lf dq=%.2lf\n",
+           num_pass_left, stats.last_value, stats.value,
+           stats.last_q, stats.q, stats.dq);
+#endif
+    if (enc->max_i4_header_bits_ > 0 && size_p0 > PARTITION0_SIZE_LIMIT) {
+      ++num_pass_left;
+      enc->max_i4_header_bits_ >>= 1;  // strengthen header bit limitation...
+      continue;                        // ...and start over
+    }
+    if (is_last_pass) {
+      break;   // done
+    }
+    if (do_search) {
+      ComputeNextQ(&stats);  // Adjust q
+    }
+  }
   if (ok) {
-    FinalizeTokenProbas(proba);
+    if (!stats.do_size_search) {
+      FinalizeTokenProbas(&enc->proba_);
+    }
     ok = VP8EmitTokens(&enc->tokens_, enc->parts_ + 0,
                        (const uint8_t*)proba->coeffs_, 1);
   }
-
+  ok = ok && WebPReportProgress(enc->pic_, enc->percent_ + 20, &enc->percent_);
   return PostLoopFinalize(&it, ok);
 }
 
@@ -974,6 +1066,3 @@ int VP8EncTokenLoop(VP8Encoder* const enc) {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/enc/histogram.c b/third_party/libwebp/enc/histogram.c
index 787ea5d..abd253b 100644
--- a/third_party/libwebp/enc/histogram.c
+++ b/third_party/libwebp/enc/histogram.c
@@ -90,12 +90,10 @@ void VP8LHistogramAddSinglePixOrCopy(VP8LHistogram* const histo,
     int literal_ix = 256 + NUM_LENGTH_CODES + PixOrCopyCacheIdx(v);
     ++histo->literal_[literal_ix];
   } else {
-    int code, extra_bits_count, extra_bits_value;
-    PrefixEncode(PixOrCopyLength(v),
-                 &code, &extra_bits_count, &extra_bits_value);
+    int code, extra_bits;
+    VP8LPrefixEncodeBits(PixOrCopyLength(v), &code, &extra_bits);
     ++histo->literal_[256 + code];
-    PrefixEncode(PixOrCopyDistance(v),
-                 &code, &extra_bits_count, &extra_bits_value);
+    VP8LPrefixEncodeBits(PixOrCopyDistance(v), &code, &extra_bits);
     ++histo->distance_[code];
   }
 }
diff --git a/third_party/libwebp/enc/histogram.h b/third_party/libwebp/enc/histogram.h
index 583b5a4..4d346a8 100644
--- a/third_party/libwebp/enc/histogram.h
+++ b/third_party/libwebp/enc/histogram.h
@@ -24,7 +24,7 @@
 #include "../webp/format_constants.h"
 #include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -94,7 +94,7 @@ int VP8LGetHistoImageSymbols(int xsize, int ysize,
                              VP8LHistogramSet* const image_in,
                              uint16_t* const histogram_symbols);
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }
 #endif
 
diff --git a/third_party/libwebp/enc/iterator.c b/third_party/libwebp/enc/iterator.c
index 0746659..e42ad00 100644
--- a/third_party/libwebp/enc/iterator.c
+++ b/third_party/libwebp/enc/iterator.c
@@ -15,21 +15,16 @@
 
 #include "./vp8enci.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // VP8Iterator
 //------------------------------------------------------------------------------
 
 static void InitLeft(VP8EncIterator* const it) {
-  const VP8Encoder* const enc = it->enc_;
-  enc->y_left_[-1] = enc->u_left_[-1] = enc->v_left_[-1] =
+  it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] =
       (it->y_ > 0) ? 129 : 127;
-  memset(enc->y_left_, 129, 16);
-  memset(enc->u_left_, 129, 8);
-  memset(enc->v_left_, 129, 8);
+  memset(it->y_left_, 129, 16);
+  memset(it->u_left_, 129, 8);
+  memset(it->v_left_, 129, 8);
   it->left_nz_[8] = 0;
 }
 
@@ -40,43 +35,60 @@ static void InitTop(VP8EncIterator* const it) {
   memset(enc->nz_, 0, enc->mb_w_ * sizeof(*enc->nz_));
 }
 
-void VP8IteratorReset(VP8EncIterator* const it) {
+void VP8IteratorSetRow(VP8EncIterator* const it, int y) {
   VP8Encoder* const enc = it->enc_;
   it->x_ = 0;
-  it->y_ = 0;
-  it->y_offset_ = 0;
-  it->uv_offset_ = 0;
-  it->mb_ = enc->mb_info_;
-  it->preds_ = enc->preds_;
+  it->y_ = y;
+  it->bw_ = &enc->parts_[y & (enc->num_parts_ - 1)];
+  it->preds_ = enc->preds_ + y * 4 * enc->preds_w_;
   it->nz_ = enc->nz_;
-  it->bw_ = &enc->parts_[0];
-  it->done_ = enc->mb_w_* enc->mb_h_;
+  it->mb_ = enc->mb_info_ + y * enc->mb_w_;
+  it->y_top_ = enc->y_top_;
+  it->uv_top_ = enc->uv_top_;
+  InitLeft(it);
+}
+
+void VP8IteratorReset(VP8EncIterator* const it) {
+  VP8Encoder* const enc = it->enc_;
+  VP8IteratorSetRow(it, 0);
+  VP8IteratorSetCountDown(it, enc->mb_w_ * enc->mb_h_);  // default
   InitTop(it);
   InitLeft(it);
   memset(it->bit_count_, 0, sizeof(it->bit_count_));
   it->do_trellis_ = 0;
 }
 
+void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down) {
+  it->count_down_ = it->count_down0_ = count_down;
+}
+
+int VP8IteratorIsDone(const VP8EncIterator* const it) {
+  return (it->count_down_ <= 0);
+}
+
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) {
   it->enc_ = enc;
   it->y_stride_  = enc->pic_->y_stride;
   it->uv_stride_ = enc->pic_->uv_stride;
-  // TODO(later): for multithreading, these should be owned by 'it'.
-  it->yuv_in_   = enc->yuv_in_;
-  it->yuv_out_  = enc->yuv_out_;
-  it->yuv_out2_ = enc->yuv_out2_;
-  it->yuv_p_    = enc->yuv_p_;
+  it->yuv_in_   = (uint8_t*)DO_ALIGN(it->yuv_mem_);
+  it->yuv_out_  = it->yuv_in_ + YUV_SIZE;
+  it->yuv_out2_ = it->yuv_out_ + YUV_SIZE;
+  it->yuv_p_    = it->yuv_out2_ + YUV_SIZE;
   it->lf_stats_ = enc->lf_stats_;
   it->percent0_ = enc->percent_;
+  it->y_left_ = (uint8_t*)DO_ALIGN(it->yuv_left_mem_ + 1);
+  it->u_left_ = it->y_left_ + 16 + 16;
+  it->v_left_ = it->u_left_ + 16;
   VP8IteratorReset(it);
 }
 
 int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
   VP8Encoder* const enc = it->enc_;
-  if (delta && enc->pic_->progress_hook) {
-    const int percent = (enc->mb_h_ <= 1)
+  if (delta && enc->pic_->progress_hook != NULL) {
+    const int done = it->count_down0_ - it->count_down_;
+    const int percent = (it->count_down0_ <= 0)
                       ? it->percent0_
-                      : it->percent0_ + delta * it->y_ / (enc->mb_h_ - 1);
+                      : it->percent0_ + delta * done / it->count_down0_;
     return WebPReportProgress(enc->pic_, percent, &enc->percent_);
   }
   return 1;
@@ -86,6 +98,8 @@ int VP8IteratorProgress(const VP8EncIterator* const it, int delta) {
 // Import the source samples into the cache. Takes care of replicating
 // boundary pixels if necessary.
 
+static WEBP_INLINE int MinSize(int a, int b) { return (a < b) ? a : b; }
+
 static void ImportBlock(const uint8_t* src, int src_stride,
                         uint8_t* dst, int w, int h, int size) {
   int i;
@@ -103,30 +117,55 @@ static void ImportBlock(const uint8_t* src, int src_stride,
   }
 }
 
-void VP8IteratorImport(const VP8EncIterator* const it) {
+static void ImportLine(const uint8_t* src, int src_stride,
+                       uint8_t* dst, int len, int total_len) {
+  int i;
+  for (i = 0; i < len; ++i, src += src_stride) dst[i] = *src;
+  for (; i < total_len; ++i) dst[i] = dst[len - 1];
+}
+
+void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32) {
   const VP8Encoder* const enc = it->enc_;
   const int x = it->x_, y = it->y_;
   const WebPPicture* const pic = enc->pic_;
-  const uint8_t* const ysrc = pic->y + (y * pic->y_stride + x) * 16;
+  const uint8_t* const ysrc = pic->y + (y * pic->y_stride  + x) * 16;
   const uint8_t* const usrc = pic->u + (y * pic->uv_stride + x) * 8;
   const uint8_t* const vsrc = pic->v + (y * pic->uv_stride + x) * 8;
-  uint8_t* const ydst = it->yuv_in_ + Y_OFF;
-  uint8_t* const udst = it->yuv_in_ + U_OFF;
-  uint8_t* const vdst = it->yuv_in_ + V_OFF;
-  int w = (pic->width - x * 16);
-  int h = (pic->height - y * 16);
-
-  if (w > 16) w = 16;
-  if (h > 16) h = 16;
-
-  // Luma plane
-  ImportBlock(ysrc, pic->y_stride, ydst, w, h, 16);
-
-  {   // U/V planes
-    const int uv_w = (w + 1) >> 1;
-    const int uv_h = (h + 1) >> 1;
-    ImportBlock(usrc, pic->uv_stride, udst, uv_w, uv_h, 8);
-    ImportBlock(vsrc, pic->uv_stride, vdst, uv_w, uv_h, 8);
+  const int w = MinSize(pic->width - x * 16, 16);
+  const int h = MinSize(pic->height - y * 16, 16);
+  const int uv_w = (w + 1) >> 1;
+  const int uv_h = (h + 1) >> 1;
+
+  ImportBlock(ysrc, pic->y_stride,  it->yuv_in_ + Y_OFF, w, h, 16);
+  ImportBlock(usrc, pic->uv_stride, it->yuv_in_ + U_OFF, uv_w, uv_h, 8);
+  ImportBlock(vsrc, pic->uv_stride, it->yuv_in_ + V_OFF, uv_w, uv_h, 8);
+
+  if (tmp_32 == NULL) return;
+
+  // Import source (uncompressed) samples into boundary.
+  if (x == 0) {
+    InitLeft(it);
+  } else {
+    if (y == 0) {
+      it->y_left_[-1] = it->u_left_[-1] = it->v_left_[-1] = 127;
+    } else {
+      it->y_left_[-1] = ysrc[- 1 - pic->y_stride];
+      it->u_left_[-1] = usrc[- 1 - pic->uv_stride];
+      it->v_left_[-1] = vsrc[- 1 - pic->uv_stride];
+    }
+    ImportLine(ysrc - 1, pic->y_stride,  it->y_left_, h,   16);
+    ImportLine(usrc - 1, pic->uv_stride, it->u_left_, uv_h, 8);
+    ImportLine(vsrc - 1, pic->uv_stride, it->v_left_, uv_h, 8);
+  }
+
+  it->y_top_  = tmp_32 + 0;
+  it->uv_top_ = tmp_32 + 16;
+  if (y == 0) {
+    memset(tmp_32, 127, 32 * sizeof(*tmp_32));
+  } else {
+    ImportLine(ysrc - pic->y_stride,  1, tmp_32,          w,   16);
+    ImportLine(usrc - pic->uv_stride, 1, tmp_32 + 16,     uv_w, 8);
+    ImportLine(vsrc - pic->uv_stride, 1, tmp_32 + 16 + 8, uv_w, 8);
   }
 }
 
@@ -242,48 +281,44 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) {
 #undef BIT
 
 //------------------------------------------------------------------------------
-// Advance to the next position, doing the bookeeping.
+// Advance to the next position, doing the bookkeeping.
 
-int VP8IteratorNext(VP8EncIterator* const it,
-                    const uint8_t* const block_to_save) {
+void VP8IteratorSaveBoundary(VP8EncIterator* const it) {
   VP8Encoder* const enc = it->enc_;
-  if (block_to_save) {
-    const int x = it->x_, y = it->y_;
-    const uint8_t* const ysrc = block_to_save + Y_OFF;
-    const uint8_t* const usrc = block_to_save + U_OFF;
-    if (x < enc->mb_w_ - 1) {   // left
-      int i;
-      for (i = 0; i < 16; ++i) {
-        enc->y_left_[i] = ysrc[15 + i * BPS];
-      }
-      for (i = 0; i < 8; ++i) {
-        enc->u_left_[i] = usrc[7 + i * BPS];
-        enc->v_left_[i] = usrc[15 + i * BPS];
-      }
-      // top-left (before 'top'!)
-      enc->y_left_[-1] = enc->y_top_[x * 16 + 15];
-      enc->u_left_[-1] = enc->uv_top_[x * 16 + 0 + 7];
-      enc->v_left_[-1] = enc->uv_top_[x * 16 + 8 + 7];
+  const int x = it->x_, y = it->y_;
+  const uint8_t* const ysrc = it->yuv_out_ + Y_OFF;
+  const uint8_t* const uvsrc = it->yuv_out_ + U_OFF;
+  if (x < enc->mb_w_ - 1) {   // left
+    int i;
+    for (i = 0; i < 16; ++i) {
+      it->y_left_[i] = ysrc[15 + i * BPS];
     }
-    if (y < enc->mb_h_ - 1) {  // top
-      memcpy(enc->y_top_ + x * 16, ysrc + 15 * BPS, 16);
-      memcpy(enc->uv_top_ + x * 16, usrc + 7 * BPS, 8 + 8);
+    for (i = 0; i < 8; ++i) {
+      it->u_left_[i] = uvsrc[7 + i * BPS];
+      it->v_left_[i] = uvsrc[15 + i * BPS];
     }
+    // top-left (before 'top'!)
+    it->y_left_[-1] = it->y_top_[15];
+    it->u_left_[-1] = it->uv_top_[0 + 7];
+    it->v_left_[-1] = it->uv_top_[8 + 7];
   }
+  if (y < enc->mb_h_ - 1) {  // top
+    memcpy(it->y_top_, ysrc + 15 * BPS, 16);
+    memcpy(it->uv_top_, uvsrc + 7 * BPS, 8 + 8);
+  }
+}
 
-  it->mb_++;
+int VP8IteratorNext(VP8EncIterator* const it) {
   it->preds_ += 4;
-  it->nz_++;
-  it->x_++;
-  if (it->x_ == enc->mb_w_) {
-    it->x_ = 0;
-    it->y_++;
-    it->bw_ = &enc->parts_[it->y_ & (enc->num_parts_ - 1)];
-    it->preds_ = enc->preds_ + it->y_ * 4 * enc->preds_w_;
-    it->nz_ = enc->nz_;
-    InitLeft(it);
+  it->mb_ += 1;
+  it->nz_ += 1;
+  it->y_top_ += 16;
+  it->uv_top_ += 16;
+  it->x_ += 1;
+  if (it->x_ == it->enc_->mb_w_) {
+    VP8IteratorSetRow(it, ++it->y_);
   }
-  return (0 < --it->done_);
+  return (0 < --it->count_down_);
 }
 
 //------------------------------------------------------------------------------
@@ -370,15 +405,15 @@ void VP8IteratorStartI4(VP8EncIterator* const it) {
 
   // Import the boundary samples
   for (i = 0; i < 17; ++i) {    // left
-    it->i4_boundary_[i] = enc->y_left_[15 - i];
+    it->i4_boundary_[i] = it->y_left_[15 - i];
   }
   for (i = 0; i < 16; ++i) {    // top
-    it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i];
+    it->i4_boundary_[17 + i] = it->y_top_[i];
   }
   // top-right samples have a special case on the far right of the picture
   if (it->x_ < enc->mb_w_ - 1) {
     for (i = 16; i < 16 + 4; ++i) {
-      it->i4_boundary_[17 + i] = enc->y_top_[it->x_ * 16 + i];
+      it->i4_boundary_[17 + i] = it->y_top_[i];
     }
   } else {    // else, replicate the last valid pixel four times
     for (i = 16; i < 16 + 4; ++i) {
@@ -419,6 +454,3 @@ int VP8IteratorRotateI4(VP8EncIterator* const it,
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/enc/layer.c b/third_party/libwebp/enc/layer.c
index fa89660..2402362 100644
--- a/third_party/libwebp/enc/layer.c
+++ b/third_party/libwebp/enc/layer.c
@@ -15,10 +15,6 @@
 
 #include "./vp8enci.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 
 void VP8EncInitLayer(VP8Encoder* const enc) {
@@ -46,6 +42,3 @@ void VP8EncDeleteLayer(VP8Encoder* enc) {
   free(enc->layer_data_);
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/enc/picture.c b/third_party/libwebp/enc/picture.c
index 5aaa385..011690d 100644
--- a/third_party/libwebp/enc/picture.c
+++ b/third_party/libwebp/enc/picture.c
@@ -16,14 +16,15 @@
 #include <math.h>
 
 #include "./vp8enci.h"
+#include "../utils/alpha_processing.h"
+#include "../utils/random.h"
 #include "../utils/rescaler.h"
 #include "../utils/utils.h"
 #include "../dsp/dsp.h"
 #include "../dsp/yuv.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+// Comment out to disable gamma-compression during RGB->U/V averaging
+#define USE_GAMMA_COMPRESSION
 
 #define HALVE(x) (((x) + 1) >> 1)
 #define IS_YUV_CSP(csp, YUV_CSP) (((csp) & WEBP_CSP_UV_MASK) == (YUV_CSP))
@@ -34,6 +35,10 @@ static const union {
 } test_endian = { 0xff000000u };
 #define ALPHA_IS_LAST (test_endian.bytes[3] == 0xff)
 
+static WEBP_INLINE uint32_t MakeARGB32(int r, int g, int b) {
+  return (0xff000000u | (r << 16) | (g << 8) | b);
+}
+
 //------------------------------------------------------------------------------
 // WebPPicture
 //------------------------------------------------------------------------------
@@ -118,6 +123,7 @@ int WebPPictureAlloc(WebPPicture* picture) {
         picture->v0 = mem;
         mem += uv0_size;
       }
+      (void)mem;  // makes the static analyzer happy
     } else {
       void* memory;
       const uint64_t argb_size = (uint64_t)width * height;
@@ -395,6 +401,28 @@ static void RescalePlane(const uint8_t* src,
   }
 }
 
+static void AlphaMultiplyARGB(WebPPicture* const pic, int inverse) {
+  uint32_t* ptr = pic->argb;
+  int y;
+  for (y = 0; y < pic->height; ++y) {
+    WebPMultARGBRow(ptr, pic->width, inverse);
+    ptr += pic->argb_stride;
+  }
+}
+
+static void AlphaMultiplyY(WebPPicture* const pic, int inverse) {
+  const uint8_t* ptr_a = pic->a;
+  if (ptr_a != NULL) {
+    uint8_t* ptr_y = pic->y;
+    int y;
+    for (y = 0; y < pic->height; ++y) {
+      WebPMultRow(ptr_y, ptr_a, pic->width, inverse);
+      ptr_y += pic->y_stride;
+      ptr_a += pic->a_stride;
+    }
+  }
+}
+
 int WebPPictureRescale(WebPPicture* pic, int width, int height) {
   WebPPicture tmp;
   int prev_width, prev_height;
@@ -425,9 +453,19 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
       WebPPictureFree(&tmp);
       return 0;
     }
+    // If present, we need to rescale alpha first (for AlphaMultiplyY).
+    if (pic->a != NULL) {
+      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
+                   tmp.a, width, height, tmp.a_stride, work, 1);
+    }
 
+    // We take transparency into account on the luma plane only. That's not
+    // totally exact blending, but still is a good approximation.
+    AlphaMultiplyY(pic, 0);
     RescalePlane(pic->y, prev_width, prev_height, pic->y_stride,
                  tmp.y, width, height, tmp.y_stride, work, 1);
+    AlphaMultiplyY(&tmp, 1);
+
     RescalePlane(pic->u,
                  HALVE(prev_width), HALVE(prev_height), pic->uv_stride,
                  tmp.u,
@@ -437,10 +475,6 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
                  tmp.v,
                  HALVE(width), HALVE(height), tmp.uv_stride, work, 1);
 
-    if (tmp.a != NULL) {
-      RescalePlane(pic->a, prev_width, prev_height, pic->a_stride,
-                   tmp.a, width, height, tmp.a_stride, work, 1);
-    }
 #ifdef WEBP_EXPERIMENTAL_FEATURES
     if (tmp.u0 != NULL) {
       const int s = IS_YUV_CSP(tmp.colorspace, WEBP_YUV422) ? 2 : 1;
@@ -458,12 +492,16 @@ int WebPPictureRescale(WebPPicture* pic, int width, int height) {
       WebPPictureFree(&tmp);
       return 0;
     }
-
+    // In order to correctly interpolate colors, we need to apply the alpha
+    // weighting first (black-matting), scale the RGB values, and remove
+    // the premultiplication afterward (while preserving the alpha channel).
+    AlphaMultiplyARGB(pic, 0);
     RescalePlane((const uint8_t*)pic->argb, prev_width, prev_height,
                  pic->argb_stride * 4,
                  (uint8_t*)tmp.argb, width, height,
                  tmp.argb_stride * 4,
                  work, 4);
+    AlphaMultiplyARGB(&tmp, 1);
   }
   WebPPictureFree(pic);
   free(work);
@@ -552,20 +590,101 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
 //------------------------------------------------------------------------------
 // RGB -> YUV conversion
 
-// TODO: we can do better than simply 2x2 averaging on U/V samples.
-#define SUM4(ptr) ((ptr)[0] + (ptr)[step] + \
-                   (ptr)[rgb_stride] + (ptr)[rgb_stride + step])
-#define SUM2H(ptr) (2 * (ptr)[0] + 2 * (ptr)[step])
-#define SUM2V(ptr) (2 * (ptr)[0] + 2 * (ptr)[rgb_stride])
-#define SUM1(ptr)  (4 * (ptr)[0])
+static int RGBToY(int r, int g, int b, VP8Random* const rg) {
+  return VP8RGBToY(r, g, b, VP8RandomBits(rg, YUV_FIX));
+}
+
+static int RGBToU(int r, int g, int b, VP8Random* const rg) {
+  return VP8RGBToU(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
+
+static int RGBToV(int r, int g, int b, VP8Random* const rg) {
+  return VP8RGBToV(r, g, b, VP8RandomBits(rg, YUV_FIX + 2));
+}
+
+//------------------------------------------------------------------------------
+
+#if defined(USE_GAMMA_COMPRESSION)
+
+// gamma-compensates loss of resolution during chroma subsampling
+#define kGamma 0.80
+#define kGammaFix 12     // fixed-point precision for linear values
+#define kGammaScale ((1 << kGammaFix) - 1)
+#define kGammaTabFix 7   // fixed-point fractional bits precision
+#define kGammaTabScale (1 << kGammaTabFix)
+#define kGammaTabRounder (kGammaTabScale >> 1)
+#define kGammaTabSize (1 << (kGammaFix - kGammaTabFix))
+
+static int kLinearToGammaTab[kGammaTabSize + 1];
+static uint16_t kGammaToLinearTab[256];
+static int kGammaTablesOk = 0;
+
+static void InitGammaTables(void) {
+  if (!kGammaTablesOk) {
+    int v;
+    const double scale = 1. / kGammaScale;
+    for (v = 0; v <= 255; ++v) {
+      kGammaToLinearTab[v] =
+          (uint16_t)(pow(v / 255., kGamma) * kGammaScale + .5);
+    }
+    for (v = 0; v <= kGammaTabSize; ++v) {
+      const double x = scale * (v << kGammaTabFix);
+      kLinearToGammaTab[v] = (int)(pow(x, 1. / kGamma) * 255. + .5);
+    }
+    kGammaTablesOk = 1;
+  }
+}
+
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) {
+  return kGammaToLinearTab[v];
+}
+
+// Convert a linear value 'v' to YUV_FIX+2 fixed-point precision
+// U/V value, suitable for RGBToU/V calls.
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+  const int v = base_value << shift;              // final uplifted value
+  const int tab_pos = v >> (kGammaTabFix + 2);    // integer part
+  const int x = v & ((kGammaTabScale << 2) - 1);  // fractional part
+  const int v0 = kLinearToGammaTab[tab_pos];
+  const int v1 = kLinearToGammaTab[tab_pos + 1];
+  const int y = v1 * x + v0 * ((kGammaTabScale << 2) - x);   // interpolate
+  return (y + kGammaTabRounder) >> kGammaTabFix;             // descale
+}
+
+#else
+
+static void InitGammaTables(void) {}
+static WEBP_INLINE uint32_t GammaToLinear(uint8_t v) { return v; }
+static WEBP_INLINE int LinearToGamma(uint32_t base_value, int shift) {
+  // Gamma disabled: only apply the uplift, giving the 4x-scaled plain sum.
+  return (int)(base_value << shift);
+}
+
+#endif    // USE_GAMMA_COMPRESSION
+
+//------------------------------------------------------------------------------
+
+#define SUM4(ptr) LinearToGamma(                         \
+    GammaToLinear((ptr)[0]) +                            \
+    GammaToLinear((ptr)[step]) +                         \
+    GammaToLinear((ptr)[rgb_stride]) +                   \
+    GammaToLinear((ptr)[rgb_stride + step]), 0)          \
+
+#define SUM2H(ptr) \
+    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[step]), 1)
+#define SUM2V(ptr) \
+    LinearToGamma(GammaToLinear((ptr)[0]) + GammaToLinear((ptr)[rgb_stride]), 1)
+#define SUM1(ptr)  \
+    LinearToGamma(GammaToLinear((ptr)[0]), 2)
+
 #define RGB_TO_UV(x, y, SUM) {                           \
   const int src = (2 * (step * (x) + (y) * rgb_stride)); \
   const int dst = (x) + (y) * picture->uv_stride;        \
   const int r = SUM(r_ptr + src);                        \
   const int g = SUM(g_ptr + src);                        \
   const int b = SUM(b_ptr + src);                        \
-  picture->u[dst] = VP8RGBToU(r, g, b);                  \
-  picture->v[dst] = VP8RGBToV(r, g, b);                  \
+  picture->u[dst] = RGBToU(r, g, b, &rg);                \
+  picture->v[dst] = RGBToV(r, g, b, &rg);                \
 }
 
 #define RGB_TO_UV0(x_in, x_out, y, SUM) {                \
@@ -574,8 +693,8 @@ int WebPPictureHasTransparency(const WebPPicture* picture) {
   const int r = SUM(r_ptr + src);                        \
   const int g = SUM(g_ptr + src);                        \
   const int b = SUM(b_ptr + src);                        \
-  picture->u0[dst] = VP8RGBToU(r, g, b);                 \
-  picture->v0[dst] = VP8RGBToV(r, g, b);                 \
+  picture->u0[dst] = RGBToU(r, g, b, &rg);               \
+  picture->v0[dst] = RGBToV(r, g, b, &rg);               \
 }
 
 static void MakeGray(WebPPicture* const picture) {
@@ -594,12 +713,14 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
                               const uint8_t* const a_ptr,
                               int step,         // bytes per pixel
                               int rgb_stride,   // bytes per scanline
+                              float dithering,
                               WebPPicture* const picture) {
   const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK;
   int x, y;
   const int width = picture->width;
   const int height = picture->height;
   const int has_alpha = CheckNonOpaque(a_ptr, width, height, step, rgb_stride);
+  VP8Random rg;
 
   picture->colorspace = uv_csp;
   picture->use_argb = 0;
@@ -608,12 +729,15 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
   }
   if (!WebPPictureAlloc(picture)) return 0;
 
+  VP8InitRandom(&rg, dithering);
+  InitGammaTables();
+
   // Import luma plane
   for (y = 0; y < height; ++y) {
     for (x = 0; x < width; ++x) {
       const int offset = step * x + y * rgb_stride;
       picture->y[x + y * picture->y_stride] =
-          VP8RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
+          RGBToY(r_ptr[offset], g_ptr[offset], b_ptr[offset], &rg);
     }
   }
 
@@ -661,6 +785,7 @@ static int ImportYUVAFromRGBA(const uint8_t* const r_ptr,
 
   if (has_alpha) {
     assert(step >= 4);
+    assert(picture->a != NULL);
     for (y = 0; y < height; ++y) {
       for (x = 0; x < width; ++x) {
         picture->a[x + y * picture->a_stride] =
@@ -683,7 +808,7 @@ static int Import(WebPPicture* const picture,
 
   if (!picture->use_argb) {
     return ImportYUVAFromRGBA(r_ptr, g_ptr, b_ptr, a_ptr, step, rgb_stride,
-                              picture);
+                              0.f /* no dithering */, picture);
   }
   if (import_alpha) {
     picture->colorspace |= WEBP_CSP_ALPHA_BIT;
@@ -698,10 +823,7 @@ static int Import(WebPPicture* const picture,
       for (x = 0; x < width; ++x) {
         const int offset = step * x + y * rgb_stride;
         const uint32_t argb =
-            0xff000000u |
-            (r_ptr[offset] << 16) |
-            (g_ptr[offset] <<  8) |
-            (b_ptr[offset]);
+            MakeARGB32(r_ptr[offset], g_ptr[offset], b_ptr[offset]);
         picture->argb[x + y * picture->argb_stride] = argb;
       }
     }
@@ -762,8 +884,7 @@ int WebPPictureImportBGRX(WebPPicture* picture,
 
 int WebPPictureYUVAToARGB(WebPPicture* picture) {
   if (picture == NULL) return 0;
-  if (picture->memory_ == NULL || picture->y == NULL ||
-      picture->u == NULL || picture->v == NULL) {
+  if (picture->y == NULL || picture->u == NULL || picture->v == NULL) {
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
   }
   if ((picture->colorspace & WEBP_CSP_ALPHA_BIT) && picture->a == NULL) {
@@ -786,7 +907,7 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
     WebPUpsampleLinePairFunc upsample = WebPGetLinePairConverter(ALPHA_IS_LAST);
 
     // First row, with replicated top samples.
-    upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, width);
+    upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, dst, NULL, width);
     cur_y += picture->y_stride;
     dst += argb_stride;
     // Center rows.
@@ -819,7 +940,8 @@ int WebPPictureYUVAToARGB(WebPPicture* picture) {
   return 1;
 }
 
-int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
+int WebPPictureARGBToYUVADithered(WebPPicture* picture, WebPEncCSP colorspace,
+                                  float dithering) {
   if (picture == NULL) return 0;
   if (picture->argb == NULL) {
     return WebPEncodingSetError(picture, VP8_ENC_ERROR_NULL_PARAMETER);
@@ -835,7 +957,8 @@ int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
     PictureResetARGB(&tmp);  // reset ARGB buffer so that it's not free()'d.
     tmp.use_argb = 0;
     tmp.colorspace = colorspace & WEBP_CSP_UV_MASK;
-    if (!ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, &tmp)) {
+    if (!ImportYUVAFromRGBA(r, g, b, a, 4, 4 * picture->argb_stride, dithering,
+                            &tmp)) {
       return WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY);
     }
     // Copy back the YUV specs into 'picture'.
@@ -847,6 +970,10 @@ int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
   return 1;
 }
 
+int WebPPictureARGBToYUVA(WebPPicture* picture, WebPEncCSP colorspace) {
+  return WebPPictureARGBToYUVADithered(picture, colorspace, 0.f);
+}
+
 //------------------------------------------------------------------------------
 // Helper: clean up fully transparent area to help compressibility.
 
@@ -913,6 +1040,91 @@ void WebPCleanupTransparentArea(WebPPicture* pic) {
 #undef SIZE2
 
 //------------------------------------------------------------------------------
+// Blend color and remove transparency info
+
+#define BLEND(V0, V1, ALPHA) \
+    ((((V0) * (255 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 16)
+#define BLEND_10BIT(V0, V1, ALPHA) \
+    ((((V0) * (1020 - (ALPHA)) + (V1) * (ALPHA)) * 0x101) >> 18)
+
+void WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb) {
+  const int red = (background_rgb >> 16) & 0xff;
+  const int green = (background_rgb >> 8) & 0xff;
+  const int blue = (background_rgb >> 0) & 0xff;
+  VP8Random rg;
+  int x, y;
+  if (pic == NULL) return;
+  VP8InitRandom(&rg, 0.f);
+  if (!pic->use_argb) {
+    const int uv_width = (pic->width >> 1);  // omit last pixel during u/v loop
+    const int Y0 = RGBToY(red, green, blue, &rg);
+    // VP8RGBToU/V expects the u/v values summed over four pixels
+    const int U0 = RGBToU(4 * red, 4 * green, 4 * blue, &rg);
+    const int V0 = RGBToV(4 * red, 4 * green, 4 * blue, &rg);
+    const int has_alpha = pic->colorspace & WEBP_CSP_ALPHA_BIT;
+    if (!has_alpha || pic->a == NULL) return;    // nothing to do
+    for (y = 0; y < pic->height; ++y) {
+      // Luma blending
+      uint8_t* const y_ptr = pic->y + y * pic->y_stride;
+      uint8_t* const a_ptr = pic->a + y * pic->a_stride;
+      for (x = 0; x < pic->width; ++x) {
+        const int alpha = a_ptr[x];
+        if (alpha < 0xff) {
+          y_ptr[x] = BLEND(Y0, y_ptr[x], a_ptr[x]);
+        }
+      }
+      // Chroma blending every even line
+      if ((y & 1) == 0) {
+        uint8_t* const u = pic->u + (y >> 1) * pic->uv_stride;
+        uint8_t* const v = pic->v + (y >> 1) * pic->uv_stride;
+        uint8_t* const a_ptr2 =
+            (y + 1 == pic->height) ? a_ptr : a_ptr + pic->a_stride;
+        for (x = 0; x < uv_width; ++x) {
+          // Average four alpha values into a single blending weight.
+          // TODO(skal): might lead to visible contouring. Can we do better?
+          const int alpha =
+              a_ptr[2 * x + 0] + a_ptr[2 * x + 1] +
+              a_ptr2[2 * x + 0] + a_ptr2[2 * x + 1];
+          u[x] = BLEND_10BIT(U0, u[x], alpha);
+          v[x] = BLEND_10BIT(V0, v[x], alpha);
+        }
+        if (pic->width & 1) {   // rightmost pixel
+          const int alpha = 2 * (a_ptr[2 * x + 0] + a_ptr2[2 * x + 0]);
+          u[x] = BLEND_10BIT(U0, u[x], alpha);
+          v[x] = BLEND_10BIT(V0, v[x], alpha);
+        }
+      }
+      memset(a_ptr, 0xff, pic->width);
+    }
+  } else {
+    uint32_t* argb = pic->argb;
+    const uint32_t background = MakeARGB32(red, green, blue);
+    for (y = 0; y < pic->height; ++y) {
+      for (x = 0; x < pic->width; ++x) {
+        const int alpha = (argb[x] >> 24) & 0xff;
+        if (alpha != 0xff) {
+          if (alpha > 0) {
+            int r = (argb[x] >> 16) & 0xff;
+            int g = (argb[x] >>  8) & 0xff;
+            int b = (argb[x] >>  0) & 0xff;
+            r = BLEND(red, r, alpha);
+            g = BLEND(green, g, alpha);
+            b = BLEND(blue, b, alpha);
+            argb[x] = MakeARGB32(r, g, b);
+          } else {
+            argb[x] = background;
+          }
+        }
+      }
+      argb += pic->argb_stride;
+    }
+  }
+}
+
+#undef BLEND
+#undef BLEND_10BIT
+
+//------------------------------------------------------------------------------
 // local-min distortion
 //
 // For every pixel in the *reference* picture, we search for the local best
@@ -1088,10 +1300,10 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, float q,          \
   return Encode(in, w, h, bps, IMPORTER, q, 0, out);                    \
 }
 
-ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB);
-ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR);
-ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA);
-ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA);
+ENCODE_FUNC(WebPEncodeRGB, WebPPictureImportRGB)
+ENCODE_FUNC(WebPEncodeBGR, WebPPictureImportBGR)
+ENCODE_FUNC(WebPEncodeRGBA, WebPPictureImportRGBA)
+ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA)
 
 #undef ENCODE_FUNC
 
@@ -1101,15 +1313,12 @@ size_t NAME(const uint8_t* in, int w, int h, int bps, uint8_t** out) {       \
   return Encode(in, w, h, bps, IMPORTER, LOSSLESS_DEFAULT_QUALITY, 1, out);  \
 }
 
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB);
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR);
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA);
-LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA);
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGB, WebPPictureImportRGB)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGR, WebPPictureImportBGR)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessRGBA, WebPPictureImportRGBA)
+LOSSLESS_ENCODE_FUNC(WebPEncodeLosslessBGRA, WebPPictureImportBGRA)
 
 #undef LOSSLESS_ENCODE_FUNC
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/enc/quant.c b/third_party/libwebp/enc/quant.c
index 462d4e9..e1d202b 100644
--- a/third_party/libwebp/enc/quant.c
+++ b/third_party/libwebp/enc/quant.c
@@ -13,6 +13,7 @@
 
 #include <assert.h>
 #include <math.h>
+#include <stdlib.h>  // for abs()
 
 #include "./vp8enci.h"
 #include "./cost.h"
@@ -24,18 +25,78 @@
 
 #define MID_ALPHA 64      // neutral value for susceptibility
 #define MIN_ALPHA 30      // lowest usable value for susceptibility
-#define MAX_ALPHA 100     // higher meaninful value for susceptibility
+#define MAX_ALPHA 100     // higher meaningful value for susceptibility
 
 #define SNS_TO_DQ 0.9     // Scaling constant between the sns value and the QP
                           // power-law modulation. Must be strictly less than 1.
 
 #define I4_PENALTY 4000   // Rate-penalty for quick i4/i16 decision
 
+// number of non-zero coeffs below which we consider the block very flat
+// (and apply a penalty to complex predictions)
+#define FLATNESS_LIMIT_I16 10      // I16 mode
+#define FLATNESS_LIMIT_I4  3       // I4 mode
+#define FLATNESS_LIMIT_UV  2       // UV mode
+#define FLATNESS_PENALTY   140     // roughly ~1bit per block
+
 #define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+// #define DEBUG_BLOCK
+
+//------------------------------------------------------------------------------
+
+#if defined(DEBUG_BLOCK)
+
+#include <stdio.h>
+#include <stdlib.h>
+
+static void PrintBlockInfo(const VP8EncIterator* const it,
+                           const VP8ModeScore* const rd) {
+  int i, j;
+  const int is_i16 = (it->mb_->type_ == 1);
+  printf("SOURCE / OUTPUT / ABS DELTA\n");
+  for (j = 0; j < 24; ++j) {
+    if (j == 16) printf("\n");   // newline before the U/V block
+    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_in_[i + j * BPS]);
+    printf("     ");
+    for (i = 0; i < 16; ++i) printf("%3d ", it->yuv_out_[i + j * BPS]);
+    printf("     ");
+    for (i = 0; i < 16; ++i) {
+      printf("%1d ", abs(it->yuv_out_[i + j * BPS] - it->yuv_in_[i + j * BPS]));
+    }
+    printf("\n");
+  }
+  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
+    (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
+    (int)rd->score);
+  if (is_i16) {
+    printf("Mode: %d\n", rd->mode_i16);
+    printf("y_dc_levels:");
+    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
+    printf("\n");
+  } else {
+    printf("Modes[16]: ");
+    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
+    printf("\n");
+  }
+  printf("y_ac_levels:\n");
+  for (j = 0; j < 16; ++j) {
+    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
+      printf("%4d ", rd->y_ac_levels[j][i]);
+    }
+    printf("\n");
+  }
+  printf("\n");
+  printf("uv_levels (mode=%d):\n", rd->mode_uv);
+  for (j = 0; j < 8; ++j) {
+    for (i = 0; i < 16; ++i) {
+      printf("%4d ", rd->uv_levels[j][i]);
+    }
+    printf("\n");
+  }
+}
+
+#endif   // DEBUG_BLOCK
 
 //------------------------------------------------------------------------------
 
@@ -104,31 +165,13 @@ static const uint16_t kAcTable2[128] = {
   385, 393, 401, 409, 416, 424, 432, 440
 };
 
-static const uint16_t kCoeffThresh[16] = {
-  0,  10, 20, 30,
-  10, 20, 30, 30,
-  20, 30, 30, 30,
-  30, 30, 30, 30
-};
-
-// TODO(skal): tune more. Coeff thresholding?
-static const uint8_t kBiasMatrices[3][16] = {  // [3] = [luma-ac,luma-dc,chroma]
-  { 96, 96, 96, 96,
-    96, 96, 96, 96,
-    96, 96, 96, 96,
-    96, 96, 96, 96 },
-  { 96, 96, 96, 96,
-    96, 96, 96, 96,
-    96, 96, 96, 96,
-    96, 96, 96, 96 },
-  { 96, 96, 96, 96,
-    96, 96, 96, 96,
-    96, 96, 96, 96,
-    96, 96, 96, 96 }
+static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
+  { 96, 110 }, { 96, 108 }, { 110, 115 }
 };
 
-// Sharpening by (slightly) raising the hi-frequency coeffs (only for trellis).
+// Sharpening by (slightly) raising the hi-frequency coeffs.
 // Hack-ish but helpful for mid-bitrate range. Use with care.
+#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
 static const uint8_t kFreqSharpening[16] = {
   0,  30, 60, 90,
   30, 60, 90, 90,
@@ -141,20 +184,30 @@ static const uint8_t kFreqSharpening[16] = {
 
 // Returns the average quantizer
 static int ExpandMatrix(VP8Matrix* const m, int type) {
-  int i;
-  int sum = 0;
+  int i, sum;
+  for (i = 0; i < 2; ++i) {
+    const int is_ac_coeff = (i > 0);
+    const int bias = kBiasMatrices[type][is_ac_coeff];
+    m->iq_[i] = (1 << QFIX) / m->q_[i];
+    m->bias_[i] = BIAS(bias);
+    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
+    //   * zero if coeff <= zthresh
+    //   * non-zero if coeff > zthresh
+    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
+  }
   for (i = 2; i < 16; ++i) {
     m->q_[i] = m->q_[1];
+    m->iq_[i] = m->iq_[1];
+    m->bias_[i] = m->bias_[1];
+    m->zthresh_[i] = m->zthresh_[1];
   }
-  for (i = 0; i < 16; ++i) {
-    const int j = kZigzag[i];
-    const int bias = kBiasMatrices[type][j];
-    m->iq_[j] = (1 << QFIX) / m->q_[j];
-    m->bias_[j] = BIAS(bias);
-    // TODO(skal): tune kCoeffThresh[]
-    m->zthresh_[j] = ((256 /*+ kCoeffThresh[j]*/ - bias) * m->q_[j] + 127) >> 8;
-    m->sharpen_[j] = (kFreqSharpening[j] * m->q_[j]) >> 11;
-    sum += m->q_[j];
+  for (sum = 0, i = 0; i < 16; ++i) {
+    if (type == 0) {  // we only use sharpening for AC luma coeffs
+      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
+    } else {
+      m->sharpen_[i] = 0;
+    }
+    sum += m->q_[i];
   }
   return (sum + 8) >> 4;
 }
@@ -182,17 +235,17 @@ static void SetupMatrices(VP8Encoder* enc) {
     q16 = ExpandMatrix(&m->y2_, 1);
     quv = ExpandMatrix(&m->uv_, 2);
 
-    // TODO: Switch to kLambda*[] tables?
-    {
-      m->lambda_i4_  = (3 * q4 * q4) >> 7;
-      m->lambda_i16_ = (3 * q16 * q16);
-      m->lambda_uv_  = (3 * quv * quv) >> 6;
-      m->lambda_mode_    = (1 * q4 * q4) >> 7;
-      m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
-      m->lambda_trellis_i16_ = (q16 * q16) >> 2;
-      m->lambda_trellis_uv_  = (quv *quv) << 1;
-      m->tlambda_            = (tlambda_scale * q4) >> 5;
-    }
+    m->lambda_i4_          = (3 * q4 * q4) >> 7;
+    m->lambda_i16_         = (3 * q16 * q16);
+    m->lambda_uv_          = (3 * quv * quv) >> 6;
+    m->lambda_mode_        = (1 * q4 * q4) >> 7;
+    m->lambda_trellis_i4_  = (7 * q4 * q4) >> 3;
+    m->lambda_trellis_i16_ = (q16 * q16) >> 2;
+    m->lambda_trellis_uv_  = (quv *quv) << 1;
+    m->tlambda_            = (tlambda_scale * q4) >> 5;
+
+    m->min_disto_ = 10 * m->y1_.q_[0];   // quantization-aware min disto
+    m->max_edge_  = 0;
   }
 }
 
@@ -201,16 +254,21 @@ static void SetupMatrices(VP8Encoder* enc) {
 
 // Very small filter-strength values have close to no visual effect. So we can
 // save a little decoding-CPU by turning filtering off for these.
-#define FSTRENGTH_CUTOFF 3
+#define FSTRENGTH_CUTOFF 2
 
 static void SetupFilterStrength(VP8Encoder* const enc) {
   int i;
-  const int level0 = enc->config_->filter_strength;
+  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
+  const int level0 = 5 * enc->config_->filter_strength;
   for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
-    // Segments with lower quantizer will be less filtered. TODO: tune (wrt SNS)
-    const int level = level0 * 256 * enc->dqm_[i].quant_ / 128;
-    const int f = level / (256 + enc->dqm_[i].beta_);
-    enc->dqm_[i].fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
+    VP8SegmentInfo* const m = &enc->dqm_[i];
+    // We focus on the quantization of AC coeffs.
+    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
+    const int base_strength =
+        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
+    // Segments with lower complexity ('beta') will be less filtered.
+    const int f = base_strength * level0 / (256 + m->beta_);
+    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
   }
   // We record the initial strength (mainly for the case of 1-segment only).
   enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
@@ -234,7 +292,7 @@ static double QualityToCompression(double c) {
   // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
   // in the mid-quant range. So we scale the compressibility inversely to
   // this power-law: quant ~= compression ^ 1/3. This law holds well for
-  // low quant. Finer modelling for high-quant would make use of kAcTable[]
+  // low quant. Finer modeling for high-quant would make use of kAcTable[]
   // more explicitly.
   const double v = pow(linear_c, 1 / 3.);
   return v;
@@ -367,16 +425,14 @@ const int VP8I4ModeOffsets[NUM_BMODES] = {
 };
 
 void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
-  const VP8Encoder* const enc = it->enc_;
-  const uint8_t* const left = it->x_ ? enc->y_left_ : NULL;
-  const uint8_t* const top = it->y_ ? enc->y_top_ + it->x_ * 16 : NULL;
+  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
+  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
   VP8EncPredLuma16(it->yuv_p_, left, top);
 }
 
 void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
-  const VP8Encoder* const enc = it->enc_;
-  const uint8_t* const left = it->x_ ? enc->u_left_ : NULL;
-  const uint8_t* const top = it->y_ ? enc->uv_top_ + it->x_ * 16 : NULL;
+  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
+  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
   VP8EncPredChroma8(it->yuv_p_, left, top);
 }
 
@@ -432,6 +488,7 @@ static void InitScore(VP8ModeScore* const rd) {
   rd->D  = 0;
   rd->SD = 0;
   rd->R  = 0;
+  rd->H  = 0;
   rd->nz = 0;
   rd->score = MAX_COST;
 }
@@ -440,6 +497,7 @@ static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
   dst->D  = src->D;
   dst->SD = src->SD;
   dst->R  = src->R;
+  dst->H  = src->H;
   dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
   dst->score = src->score;
 }
@@ -448,6 +506,7 @@ static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
   dst->D  += src->D;
   dst->SD += src->SD;
   dst->R  += src->R;
+  dst->H  += src->H;
   dst->nz |= src->nz;     // here, new nz bits are accumulated.
   dst->score += src->score;
 }
@@ -476,7 +535,7 @@ typedef struct {
 
 static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
   // TODO: incorporate the "* 256" in the tables?
-  rd->score = rd->R * lambda + 256 * (rd->D + rd->SD);
+  rd->score = (rd->R + rd->H) * lambda + 256 * (rd->D + rd->SD);
 }
 
 static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
@@ -539,11 +598,10 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
     // note: it's important to take sign of the _original_ coeff,
     // so we don't have to consider level < 0 afterward.
     const int sign = (in[j] < 0);
-    int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
-    int level0;
-    if (coeff0 > 2047) coeff0 = 2047;
+    const int coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
+    int level0 = QUANTDIV(coeff0, iQ, B);
+    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
 
-    level0 = QUANTDIV(coeff0, iQ, B);
     // test all alternate level values around level0.
     for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
       Node* const cur = &NODE(n, m);
@@ -555,7 +613,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it,
       cur->sign = sign;
       cur->level = level;
       cur->ctx = (level == 0) ? 0 : (level == 1) ? 1 : 2;
-      if (level >= 2048 || level < 0) {   // node is dead?
+      if (level > MAX_LEVEL || level < 0) {   // node is dead?
         cur->cost = MAX_COST;
         continue;
       }
@@ -648,10 +706,10 @@ static int ReconstructIntra16(VP8EncIterator* const it,
                               VP8ModeScore* const rd,
                               uint8_t* const yuv_out,
                               int mode) {
-  const VP8Encoder* const enc = it->enc_;
+  VP8Encoder* const enc = it->enc_;
   const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
   const uint8_t* const src = it->yuv_in_ + Y_OFF;
-  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   int nz = 0;
   int n;
   int16_t tmp[16][16], dc_tmp[16];
@@ -660,7 +718,7 @@ static int ReconstructIntra16(VP8EncIterator* const it,
     VP8FTransform(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
   }
   VP8FTransformWHT(tmp[0], dc_tmp);
-  nz |= VP8EncQuantizeBlock(dc_tmp, rd->y_dc_levels, 0, &dqm->y2_) << 24;
+  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;
 
   if (DO_TRELLIS_I16 && it->do_trellis_) {
     int x, y;
@@ -755,7 +813,18 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
 
 //------------------------------------------------------------------------------
 // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
-// Pick the mode is lower RD-cost = Rate + lamba * Distortion.
+// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.
+
+static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
+  // Keep in dqm->max_edge_ the largest magnitude seen among DCs[1], DCs[4] and
+  // DCs[5] (used as a measure of the delta between sub-4x4 blocks).
+  const int v0 = abs(DCs[1]);
+  const int v1 = abs(DCs[4]);
+  const int v2 = abs(DCs[5]);
+  int max_v = (v1 > v0) ? v1 : v0;
+  max_v = (v2 > max_v) ? v2 : max_v;
+  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
+}
 
 static void SwapPtr(uint8_t** a, uint8_t** b) {
   uint8_t* const tmp = *a;
@@ -767,9 +836,23 @@ static void SwapOut(VP8EncIterator* const it) {
   SwapPtr(&it->yuv_out_, &it->yuv_out2_);
 }
 
+static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
+  score_t score = 0;
+  while (num_blocks-- > 0) {      // TODO(skal): refine positional scoring?
+    int i;
+    for (i = 1; i < 16; ++i) {    // omit DC, we're only interested in AC
+      score += (levels[i] != 0);
+      if (score > thresh) return 0;
+    }
+    levels += 16;
+  }
+  return 1;
+}
+
 static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
-  const VP8Encoder* const enc = it->enc_;
-  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
+  const int kNumBlocks = 16;
+  VP8Encoder* const enc = it->enc_;
+  VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_i16_;
   const int tlambda = dqm->tlambda_;
   const uint8_t* const src = it->yuv_in_ + Y_OFF;
@@ -788,8 +871,13 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
     rd16.D = VP8SSE16x16(src, tmp_dst);
     rd16.SD = tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY))
             : 0;
+    rd16.H = VP8FixedCostsI16[mode];
     rd16.R = VP8GetCostLuma16(it, &rd16);
-    rd16.R += VP8FixedCostsI16[mode];
+    if (mode > 0 &&
+        IsFlat(rd16.y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16)) {
+      // penalty to avoid flat area to be mispredicted by complex mode
+      rd16.R += FLATNESS_PENALTY * kNumBlocks;
+    }
 
     // Since we always examine Intra16 first, we can overwrite *rd directly.
     SetRDScore(lambda, &rd16);
@@ -804,6 +892,13 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) {
   }
   SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
   VP8SetIntra16Mode(it, rd->mode_i16);
+
+  // we have a blocky macroblock (only DCs are non-zero) with fairly high
+  // distortion, record max delta so we can later adjust the minimal filtering
+  // strength needed to smooth these blocks out.
+  if ((rd->nz & 0xffff) == 0 && rd->D > dqm->min_disto_) {
+    StoreMaxDelta(dqm, rd->y_dc_levels);
+  }
 }
 
 //------------------------------------------------------------------------------
@@ -833,9 +928,11 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
   }
 
   InitScore(&rd_best);
-  rd_best.score = 211;  // '211' is the value of VP8BitCost(0, 145)
+  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
+  SetRDScore(dqm->lambda_mode_, &rd_best);
   VP8IteratorStartI4(it);
   do {
+    const int kNumBlocks = 1;
     VP8ModeScore rd_i4;
     int mode;
     int best_mode = -1;
@@ -859,8 +956,11 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
       rd_tmp.SD =
           tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                   : 0;
+      rd_tmp.H = mode_costs[mode];
       rd_tmp.R = VP8GetCostLuma4(it, tmp_levels);
-      rd_tmp.R += mode_costs[mode];
+      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
+        rd_tmp.R += FLATNESS_PENALTY * kNumBlocks;
+      }
 
       SetRDScore(lambda, &rd_tmp);
       if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
@@ -872,14 +972,17 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
     }
     SetRDScore(dqm->lambda_mode_, &rd_i4);
     AddScore(&rd_best, &rd_i4);
-    total_header_bits += mode_costs[best_mode];
-    if (rd_best.score >= rd->score ||
-        total_header_bits > enc->max_i4_header_bits_) {
+    if (rd_best.score >= rd->score) {
+      return 0;
+    }
+    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
+    if (total_header_bits > enc->max_i4_header_bits_) {
       return 0;
     }
     // Copy selected samples if not in the right place already.
-    if (best_block != best_blocks + VP8Scan[it->i4_])
+    if (best_block != best_blocks + VP8Scan[it->i4_]) {
       VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
+    }
     rd->modes_i4[it->i4_] = best_mode;
     it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
   } while (VP8IteratorRotateI4(it, best_blocks));
@@ -895,6 +998,7 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
 //------------------------------------------------------------------------------
 
 static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
+  const int kNumBlocks = 8;
   const VP8Encoder* const enc = it->enc_;
   const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
   const int lambda = dqm->lambda_uv_;
@@ -915,8 +1019,11 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
     // Compute RD-score
     rd_uv.D  = VP8SSE16x8(src, tmp_dst);
     rd_uv.SD = 0;    // TODO: should we call TDisto? it tends to flatten areas.
+    rd_uv.H  = VP8FixedCostsUV[mode];
     rd_uv.R  = VP8GetCostUV(it, &rd_uv);
-    rd_uv.R += VP8FixedCostsUV[mode];
+    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
+      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
+    }
 
     SetRDScore(lambda, &rd_uv);
     if (mode == 0 || rd_uv.score < rd_best.score) {
@@ -1047,6 +1154,3 @@ int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
   return is_skipped;
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/enc/syntax.c b/third_party/libwebp/enc/syntax.c
index b0f7676..08cfe79 100644
--- a/third_party/libwebp/enc/syntax.c
+++ b/third_party/libwebp/enc/syntax.c
@@ -18,10 +18,6 @@
 #include "../webp/mux_types.h"         // ALPHA_FLAG
 #include "./vp8enci.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // Helper functions
 
@@ -425,6 +421,3 @@ int VP8EncWrite(VP8Encoder* const enc) {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/enc/token.c b/third_party/libwebp/enc/token.c
index 6a63371..e696642 100644
--- a/third_party/libwebp/enc/token.c
+++ b/third_party/libwebp/enc/token.c
@@ -20,12 +20,9 @@
 #include <stdlib.h>
 #include <string.h>
 
+#include "./cost.h"
 #include "./vp8enci.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #if !defined(DISABLE_TOKEN_BUFFER)
 
 // we use pages to reduce the number of memcpy()
@@ -238,6 +235,29 @@ int VP8EmitTokens(VP8TBuffer* const b, VP8BitWriter* const bw,
   return 1;
 }
 
+// Size estimation
+size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas) {
+  size_t size = 0;
+  const VP8Tokens* p = b->pages_;
+  if (b->error_) return 0;
+  while (p != NULL) {
+    const VP8Tokens* const next = p->next_;
+    const int N = (next == NULL) ? b->left_ : 0;
+    int n = MAX_NUM_TOKEN;
+    while (n-- > N) {
+      const uint16_t token = p->tokens_[n];
+      const int bit = token & (1 << 15);
+      if (token & FIXED_PROBA_BIT) {
+        size += VP8BitCost(bit, token & 0xffu);
+      } else {
+        size += VP8BitCost(bit, probas[token & 0x3fffu]);
+      }
+    }
+    p = next;
+  }
+  return size;
+}
+
 //------------------------------------------------------------------------------
 
 #else     // DISABLE_TOKEN_BUFFER
@@ -251,6 +271,3 @@ void VP8TBufferClear(VP8TBuffer* const b) {
 
 #endif    // !DISABLE_TOKEN_BUFFER
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/enc/tree.c b/third_party/libwebp/enc/tree.c
index ecd8fb9..e5d05e5 100644
--- a/third_party/libwebp/enc/tree.c
+++ b/third_party/libwebp/enc/tree.c
@@ -7,23 +7,18 @@
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-// Token probabilities
+// Coding of token probabilities, intra modes and segments.
 //
 // Author: Skal (pascal.massimino@gmail.com)
 
 #include "./vp8enci.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // Default probabilities
 
 // Paragraph 13.5
 const uint8_t
   VP8CoeffsProba0[NUM_TYPES][NUM_BANDS][NUM_CTX][NUM_PROBAS] = {
-  // genereated using vp8_default_coef_probs() in entropy.c:129
   { { { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 },
       { 128, 128, 128, 128, 128, 128, 128, 128, 128, 128, 128 }
@@ -320,7 +315,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) {
   VP8EncIterator it;
   VP8IteratorInit(enc, &it);
   do {
-    const VP8MBInfo* mb = it.mb_;
+    const VP8MBInfo* const mb = it.mb_;
     const uint8_t* preds = it.preds_;
     if (enc->segment_hdr_.update_map_) {
       PutSegment(bw, mb->segment_, enc->proba_.segments_);
@@ -345,7 +340,7 @@ void VP8CodeIntraModes(VP8Encoder* const enc) {
       }
     }
     PutUVMode(bw, mb->uv_mode_);
-  } while (VP8IteratorNext(&it, 0));
+  } while (VP8IteratorNext(&it));
 }
 
 //------------------------------------------------------------------------------
@@ -507,6 +502,3 @@ void VP8WriteProbas(VP8BitWriter* const bw, const VP8Proba* const probas) {
   }
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/enc/vp8enci.h b/third_party/libwebp/enc/vp8enci.h
index 61d56be..71adf6c 100644
--- a/third_party/libwebp/enc/vp8enci.h
+++ b/third_party/libwebp/enc/vp8enci.h
@@ -20,7 +20,7 @@
 #include "../utils/bit_writer.h"
 #include "../utils/thread.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -29,8 +29,8 @@ extern "C" {
 
 // version numbers
 #define ENC_MAJ_VERSION 0
-#define ENC_MIN_VERSION 3
-#define ENC_REV_VERSION 1
+#define ENC_MIN_VERSION 4
+#define ENC_REV_VERSION 0
 
 // intra prediction modes
 enum { B_DC_PRED = 0,   // 4x4 modes
@@ -74,7 +74,7 @@ typedef enum {   // Rate-distortion optimization levels
 // The predicted blocks can be accessed using offsets to yuv_p_ and
 // the arrays VP8*ModeOffsets[];
 //         +----+      YUV Samples area. See VP8Scan[] for accessing the blocks.
-//  Y_OFF  |YYYY| <- original samples  (enc->yuv_in_)
+//  Y_OFF  |YYYY| <- original samples  ('yuv_in_')
 //         |YYYY|
 //         |YYYY|
 //         |YYYY|
@@ -248,16 +248,19 @@ typedef struct {
   int beta_;       // filter-susceptibility, range [0,255].
   int quant_;      // final segment quantizer.
   int fstrength_;  // final in-loop filtering strength
+  int max_edge_;   // max edge delta (for filtering strength)
+  int min_disto_;  // minimum distortion required to trigger filtering record
   // reactivities
   int lambda_i16_, lambda_i4_, lambda_uv_;
   int lambda_mode_, lambda_trellis_, tlambda_;
   int lambda_trellis_i16_, lambda_trellis_i4_, lambda_trellis_uv_;
 } VP8SegmentInfo;
 
-// Handy transcient struct to accumulate score and info during RD-optimization
+// Handy transient struct to accumulate score and info during RD-optimization
 // and mode evaluation.
 typedef struct {
-  score_t D, SD, R, score;    // Distortion, spectral distortion, rate, score.
+  score_t D, SD;              // Distortion, spectral distortion
+  score_t H, R, score;        // header bits, rate, score.
   int16_t y_dc_levels[16];    // Quantized levels for luma-DC, luma-AC, chroma.
   int16_t y_ac_levels[16][16];
   int16_t uv_levels[4 + 4][16];
@@ -271,12 +274,11 @@ typedef struct {
 // right neighbouring data (samples, predictions, contexts, ...)
 typedef struct {
   int x_, y_;                      // current macroblock
-  int y_offset_, uv_offset_;       // offset to the luma / chroma planes
   int y_stride_, uv_stride_;       // respective strides
-  uint8_t*      yuv_in_;           // borrowed from enc_ (for now)
-  uint8_t*      yuv_out_;          // ''
-  uint8_t*      yuv_out2_;         // ''
-  uint8_t*      yuv_p_;            // ''
+  uint8_t*      yuv_in_;           // input samples
+  uint8_t*      yuv_out_;          // output samples
+  uint8_t*      yuv_out2_;         // secondary buffer swapped with yuv_out_.
+  uint8_t*      yuv_p_;            // scratch buffer for prediction
   VP8Encoder*   enc_;              // back-pointer
   VP8MBInfo*    mb_;               // current macroblock
   VP8BitWriter* bw_;               // current bit-writer
@@ -292,24 +294,43 @@ typedef struct {
   uint64_t      uv_bits_;          // macroblock bit-cost for chroma
   LFStats*      lf_stats_;         // filter stats (borrowed from enc_)
   int           do_trellis_;       // if true, perform extra level optimisation
-  int           done_;             // true when scan is finished
+  int           count_down_;       // number of mb still to be processed
+  int           count_down0_;      // starting counter value (for progress)
   int           percent0_;         // saved initial progress percent
+
+  uint8_t* y_left_;    // left luma samples (addressable from index -1 to 15).
+  uint8_t* u_left_;    // left u samples (addressable from index -1 to 7)
+  uint8_t* v_left_;    // left v samples (addressable from index -1 to 7)
+
+  uint8_t* y_top_;     // top luma samples at position 'x_'
+  uint8_t* uv_top_;    // top u/v samples at position 'x_', packed as 16 bytes
+
+  // memory for storing y/u/v_left_ and yuv_in_/out_*
+  uint8_t yuv_left_mem_[17 + 16 + 16 + 8 + ALIGN_CST];     // memory for *_left_
+  uint8_t yuv_mem_[3 * YUV_SIZE + PRED_SIZE + ALIGN_CST];  // memory for yuv_*
 } VP8EncIterator;
 
   // in iterator.c
-// must be called first.
+// must be called first
 void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it);
-// restart a scan.
+// restart a scan
 void VP8IteratorReset(VP8EncIterator* const it);
-// import samples from source
-void VP8IteratorImport(const VP8EncIterator* const it);
+// reset iterator position to row 'y'
+void VP8IteratorSetRow(VP8EncIterator* const it, int y);
+// set count down (=number of iterations to go)
+void VP8IteratorSetCountDown(VP8EncIterator* const it, int count_down);
+// return true if iteration is finished
+int VP8IteratorIsDone(const VP8EncIterator* const it);
+// Import uncompressed samples from source.
+// If tmp_32 is not NULL, import boundary samples too.
+// tmp_32 is a 32-byte scratch buffer that must be aligned in memory.
+void VP8IteratorImport(VP8EncIterator* const it, uint8_t* tmp_32);
 // export decimated samples
 void VP8IteratorExport(const VP8EncIterator* const it);
-// go to next macroblock. Returns !done_. If *block_to_save is non-null, will
-// save the boundary values to top_/left_ arrays. block_to_save can be
-// it->yuv_out_ or it->yuv_in_.
-int VP8IteratorNext(VP8EncIterator* const it,
-                    const uint8_t* const block_to_save);
+// go to next macroblock. Returns false when the iteration is finished.
+int VP8IteratorNext(VP8EncIterator* const it);
+// save the yuv_out_ boundary values to top_/left_ arrays for next iterations.
+void VP8IteratorSaveBoundary(VP8EncIterator* const it);
 // Report progression based on macroblock rows. Return 0 for user-abort request.
 int VP8IteratorProgress(const VP8EncIterator* const it,
                         int final_delta_percent);
@@ -360,6 +381,9 @@ int VP8RecordCoeffTokens(int ctx, int coeff_type, int first, int last,
                          const int16_t* const coeffs,
                          VP8TBuffer* const tokens);
 
+// Estimate the final coded size given a set of 'probas'.
+size_t VP8EstimateTokenSize(VP8TBuffer* const b, const uint8_t* const probas);
+
 // unused for now
 void VP8TokenToStats(const VP8TBuffer* const b, proba_t* const stats);
 
@@ -435,17 +459,9 @@ struct VP8Encoder {
   VP8MBInfo* mb_info_;   // contextual macroblock infos (mb_w_ + 1)
   uint8_t*   preds_;     // predictions modes: (4*mb_w+1) * (4*mb_h+1)
   uint32_t*  nz_;        // non-zero bit context: mb_w+1
-  uint8_t*   yuv_in_;    // input samples
-  uint8_t*   yuv_out_;   // output samples
-  uint8_t*   yuv_out2_;  // secondary scratch out-buffer. swapped with yuv_out_.
-  uint8_t*   yuv_p_;     // scratch buffer for prediction
   uint8_t   *y_top_;     // top luma samples.
   uint8_t   *uv_top_;    // top u/v samples.
-                         // U and V are packed into 16 pixels (8 U + 8 V)
-  uint8_t   *y_left_;    // left luma samples (adressable from index -1 to 15).
-  uint8_t   *u_left_;    // left u samples (adressable from index -1 to 7)
-  uint8_t   *v_left_;    // left v samples (adressable from index -1 to 7)
-
+                         // U and V are packed into 16 bytes (8 U + 8 V)
   LFStats   *lf_stats_;  // autofilter stats (if NULL, autofilter is off)
 };
 
@@ -541,9 +557,13 @@ void VP8InitFilter(VP8EncIterator* const it);
 void VP8StoreFilterStats(VP8EncIterator* const it);
 void VP8AdjustFilterStrength(VP8EncIterator* const it);
 
+// returns the approximate filtering strength needed to smooth an edge
+// step of 'delta', given a sharpness parameter 'sharpness'.
+int VP8FilterStrengthFromDelta(int sharpness, int delta);
+
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/enc/vp8l.c b/third_party/libwebp/enc/vp8l.c
index 945870c..1572631 100644
--- a/third_party/libwebp/enc/vp8l.c
+++ b/third_party/libwebp/enc/vp8l.c
@@ -25,10 +25,6 @@
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #define PALETTE_KEY_RIGHT_SHIFT   22  // Key for 1K buffer.
 #define MAX_HUFF_IMAGE_SIZE       (16 * 1024 * 1024)
 #define MAX_COLORS_FOR_GRAPH      64
@@ -168,9 +164,6 @@ static int VP8LEncAnalyze(VP8LEncoder* const enc, WebPImageHint image_hint) {
       }
       if (pred_entropy < 0.95 * non_pred_entropy) {
         enc->use_predict_ = 1;
-        // TODO(vikasa): Observed some correlation of cross_color transform with
-        // predict. Need to investigate this further and add separate heuristic
-        // for setting use_cross_color flag.
         enc->use_cross_color_ = 1;
       }
     }
@@ -451,12 +444,12 @@ static void StoreImageToBitMask(
       int bits, n_bits;
       int code, distance;
 
-      PrefixEncode(v->len, &code, &n_bits, &bits);
+      VP8LPrefixEncode(v->len, &code, &n_bits, &bits);
       WriteHuffmanCode(bw, codes, 256 + code);
       VP8LWriteBits(bw, n_bits, bits);
 
       distance = PixOrCopyDistance(v);
-      PrefixEncode(distance, &code, &n_bits, &bits);
+      VP8LPrefixEncode(distance, &code, &n_bits, &bits);
       WriteHuffmanCode(bw, codes + 4, code);
       VP8LWriteBits(bw, n_bits, bits);
     }
@@ -702,7 +695,7 @@ static int ApplyCrossColorFilter(const VP8LEncoder* const enc,
   const int ccolor_transform_bits = enc->transform_bits_;
   const int transform_width = VP8LSubSampleSize(width, ccolor_transform_bits);
   const int transform_height = VP8LSubSampleSize(height, ccolor_transform_bits);
-  const int step = (quality == 0) ? 32 : 8;
+  const int step = (quality < 25) ? 32 : (quality > 50) ? 8 : 16;
 
   VP8LColorSpaceTransform(width, height, ccolor_transform_bits, step,
                           enc->argb_, enc->transform_data_);
@@ -827,7 +820,7 @@ static void ApplyPalette(uint32_t* src, uint32_t* dst,
   }
 
   if (use_LUT) {
-    int inv_palette[MAX_PALETTE_SIZE] = { 0 };
+    uint8_t inv_palette[MAX_PALETTE_SIZE] = { 0 };
     for (i = 0; i < palette_size; ++i) {
       const int color = (palette[i] >> 8) & 0xff;
       inv_palette[color] = i;
@@ -895,7 +888,7 @@ static WebPEncodingError EncodePalette(VP8LBitWriter* const bw,
   if (err != VP8_ENC_OK) goto Error;
   dst = enc->argb_;
 
-  row = WebPSafeMalloc((uint64_t)width, sizeof(*row));
+  row = (uint8_t*)WebPSafeMalloc((uint64_t)width, sizeof(*row));
   if (row == NULL) return VP8_ENC_ERROR_OUT_OF_MEMORY;
 
   ApplyPalette(src, dst, pic->argb_stride, enc->current_width_,
@@ -959,6 +952,9 @@ static VP8LEncoder* VP8LEncoderNew(const WebPConfig* const config,
   }
   enc->config_ = config;
   enc->pic_ = picture;
+
+  VP8LDspInit();
+
   return enc;
 }
 
@@ -1170,6 +1166,3 @@ int VP8LEncodeImage(const WebPConfig* const config,
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/enc/vp8li.h b/third_party/libwebp/enc/vp8li.h
index 01f01f5..96d6fae 100644
--- a/third_party/libwebp/enc/vp8li.h
+++ b/third_party/libwebp/enc/vp8li.h
@@ -19,7 +19,7 @@
 #include "../webp/encode.h"
 #include "../webp/format_constants.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -63,7 +63,7 @@ WebPEncodingError VP8LEncodeStream(const WebPConfig* const config,
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/enc/webpenc.c b/third_party/libwebp/enc/webpenc.c
index d420d06..207cce6 100644
--- a/third_party/libwebp/enc/webpenc.c
+++ b/third_party/libwebp/enc/webpenc.c
@@ -22,10 +22,6 @@
 
 // #define PRINT_MEMORY_INFO
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #ifdef PRINT_MEMORY_INFO
 #include <stdio.h>
 #endif
@@ -136,7 +132,7 @@ static void MapConfigToTools(VP8Encoder* const enc) {
   enc->do_search_ = (config->target_size > 0 || config->target_PSNR > 0);
   if (!config->low_memory) {
 #if !defined(DISABLE_TOKEN_BUFFER)
-    enc->use_tokens_ = (method >= 3) && !enc->do_search_;
+    enc->use_tokens_ = (enc->rd_opt_level_ >= RD_OPT_BASIC);  // need rd stats
 #endif
     if (enc->use_tokens_) {
       enc->num_parts_ = 1;   // doesn't work with multi-partition
@@ -157,7 +153,7 @@ static void MapConfigToTools(VP8Encoder* const enc) {
 //             non-zero: 196
 //             lf-stats: 2048
 //                total: 68635
-// Transcient object sizes:
+// Transient object sizes:
 //       VP8EncIterator: 352
 //         VP8ModeScore: 912
 //       VP8SegmentInfo: 532
@@ -175,20 +171,16 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   const int preds_h = 4 * mb_h + 1;
   const size_t preds_size = preds_w * preds_h * sizeof(uint8_t);
   const int top_stride = mb_w * 16;
-  const size_t nz_size = (mb_w + 1) * sizeof(uint32_t);
-  const size_t cache_size = (3 * YUV_SIZE + PRED_SIZE) * sizeof(uint8_t);
+  const size_t nz_size = (mb_w + 1) * sizeof(uint32_t) + ALIGN_CST;
   const size_t info_size = mb_w * mb_h * sizeof(VP8MBInfo);
-  const size_t samples_size = (2 * top_stride +         // top-luma/u/v
-                               16 + 16 + 16 + 8 + 1 +   // left y/u/v
-                               2 * ALIGN_CST)           // align all
-                               * sizeof(uint8_t);
+  const size_t samples_size = 2 * top_stride * sizeof(uint8_t)  // top-luma/u/v
+                            + ALIGN_CST;                        // align all
   const size_t lf_stats_size =
       config->autofilter ? sizeof(LFStats) + ALIGN_CST : 0;
   VP8Encoder* enc;
   uint8_t* mem;
   const uint64_t size = (uint64_t)sizeof(VP8Encoder)   // main struct
                       + ALIGN_CST                      // cache alignment
-                      + cache_size                     // working caches
                       + info_size                      // modes info
                       + preds_size                     // prediction modes
                       + samples_size                   // top/left samples
@@ -199,16 +191,15 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   printf("===================================\n");
   printf("Memory used:\n"
          "             encoder: %ld\n"
-         "         block cache: %ld\n"
          "                info: %ld\n"
          "               preds: %ld\n"
          "         top samples: %ld\n"
          "            non-zero: %ld\n"
          "            lf-stats: %ld\n"
          "               total: %ld\n",
-         sizeof(VP8Encoder) + ALIGN_CST, cache_size, info_size,
+         sizeof(VP8Encoder) + ALIGN_CST, info_size,
          preds_size, samples_size, nz_size, lf_stats_size, size);
-  printf("Transcient object sizes:\n"
+  printf("Transient object sizes:\n"
          "      VP8EncIterator: %ld\n"
          "        VP8ModeScore: %ld\n"
          "      VP8SegmentInfo: %ld\n"
@@ -233,19 +224,11 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   enc->mb_w_ = mb_w;
   enc->mb_h_ = mb_h;
   enc->preds_w_ = preds_w;
-  enc->yuv_in_ = (uint8_t*)mem;
-  mem += YUV_SIZE;
-  enc->yuv_out_ = (uint8_t*)mem;
-  mem += YUV_SIZE;
-  enc->yuv_out2_ = (uint8_t*)mem;
-  mem += YUV_SIZE;
-  enc->yuv_p_ = (uint8_t*)mem;
-  mem += PRED_SIZE;
   enc->mb_info_ = (VP8MBInfo*)mem;
   mem += info_size;
   enc->preds_ = ((uint8_t*)mem) + 1 + enc->preds_w_;
   mem += preds_w * preds_h * sizeof(uint8_t);
-  enc->nz_ = 1 + (uint32_t*)mem;
+  enc->nz_ = 1 + (uint32_t*)DO_ALIGN(mem);
   mem += nz_size;
   enc->lf_stats_ = lf_stats_size ? (LFStats*)DO_ALIGN(mem) : NULL;
   mem += lf_stats_size;
@@ -255,13 +238,7 @@ static VP8Encoder* InitVP8Encoder(const WebPConfig* const config,
   enc->y_top_ = (uint8_t*)mem;
   enc->uv_top_ = enc->y_top_ + top_stride;
   mem += 2 * top_stride;
-  mem = (uint8_t*)DO_ALIGN(mem + 1);
-  enc->y_left_ = (uint8_t*)mem;
-  mem += 16 + 16;
-  enc->u_left_ = (uint8_t*)mem;
-  mem += 16;
-  enc->v_left_ = (uint8_t*)mem;
-  mem += 8;
+  assert(mem <= (uint8_t*)enc + size);
 
   enc->config_ = config;
   enc->profile_ = use_filter ? ((config->filter_type == 1) ? 0 : 1) : 2;
@@ -300,7 +277,7 @@ static int DeleteVP8Encoder(VP8Encoder* enc) {
 //------------------------------------------------------------------------------
 
 static double GetPSNR(uint64_t err, uint64_t size) {
-  return err ? 10. * log10(255. * 255. * size / err) : 99.;
+  return (err > 0 && size > 0) ? 10. * log10(255. * 255. * size / err) : 99.;
 }
 
 static void FinalizePSNR(const VP8Encoder* const enc) {
@@ -377,7 +354,17 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
     VP8Encoder* enc = NULL;
     if (pic->y == NULL || pic->u == NULL || pic->v == NULL) {
       // Make sure we have YUVA samples.
-      if (!WebPPictureARGBToYUVA(pic, WEBP_YUV420)) return 0;
+      float dithering = 0.f;
+      if (config->preprocessing & 2) {
+        const float x = config->quality / 100.f;
+        const float x2 = x * x;
+        // slowly decreasing from max dithering at low quality (q->0)
+        // to 0.5 dithering amplitude at high quality (q->100)
+        dithering = 1.0f + (0.5f - 1.0f) * x2 * x2;
+      }
+      if (!WebPPictureARGBToYUVADithered(pic, WEBP_YUV420, dithering)) {
+        return 0;
+      }
     }
 
     enc = InitVP8Encoder(config, pic);
@@ -415,6 +402,3 @@ int WebPEncode(const WebPConfig* config, WebPPicture* pic) {
   return ok;
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/libwebp.gyp b/third_party/libwebp/libwebp.gyp
index 997515f..49526df 100644
--- a/third_party/libwebp/libwebp.gyp
+++ b/third_party/libwebp/libwebp.gyp
@@ -118,6 +118,7 @@
       'type': 'static_library',
       'include_dirs': ['.'],
       'sources': [
+        'utils/alpha_processing.c',
         'utils/bit_reader.c',
         'utils/bit_writer.c',
         'utils/color_cache.c',
@@ -126,6 +127,7 @@
         'utils/huffman_encode.c',
         'utils/quant_levels.c',
         'utils/quant_levels_dec.c',
+        'utils/random.c',
         'utils/rescaler.c',
         'utils/thread.c',
         'utils/utils.c',
diff --git a/third_party/libwebp/utils/alpha_processing.c b/third_party/libwebp/utils/alpha_processing.c
new file mode 100644
index 0000000..7362ff9
--- /dev/null
+++ b/third_party/libwebp/utils/alpha_processing.c
@@ -0,0 +1,196 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <assert.h>
+#include "./alpha_processing.h"
+
+// Tables can be faster on some platforms but incur some extra binary size (~2k).
+// #define USE_TABLES_FOR_ALPHA_MULT
+
+// -----------------------------------------------------------------------------
+
+#define MFIX 24    // 24bit fixed-point arithmetic
+#define HALF ((1u << MFIX) >> 1)
+#define KINV_255 ((1u << MFIX) / 255u)
+
+static uint32_t Mult(uint8_t x, uint32_t mult) {
+  const uint32_t v = (x * mult + HALF) >> MFIX;
+  assert(v <= 255);  // <- 24bit precision is enough to ensure that.
+  return v;
+}
+
+#ifdef USE_TABLES_FOR_ALPHA_MULT
+
+static const uint32_t kMultTables[2][256] = {
+  {    // (255u << MFIX) / alpha
+    0x00000000, 0xff000000, 0x7f800000, 0x55000000, 0x3fc00000, 0x33000000,
+    0x2a800000, 0x246db6db, 0x1fe00000, 0x1c555555, 0x19800000, 0x172e8ba2,
+    0x15400000, 0x139d89d8, 0x1236db6d, 0x11000000, 0x0ff00000, 0x0f000000,
+    0x0e2aaaaa, 0x0d6bca1a, 0x0cc00000, 0x0c249249, 0x0b9745d1, 0x0b1642c8,
+    0x0aa00000, 0x0a333333, 0x09cec4ec, 0x0971c71c, 0x091b6db6, 0x08cb08d3,
+    0x08800000, 0x0839ce73, 0x07f80000, 0x07ba2e8b, 0x07800000, 0x07492492,
+    0x07155555, 0x06e45306, 0x06b5e50d, 0x0689d89d, 0x06600000, 0x063831f3,
+    0x06124924, 0x05ee23b8, 0x05cba2e8, 0x05aaaaaa, 0x058b2164, 0x056cefa8,
+    0x05500000, 0x05343eb1, 0x05199999, 0x05000000, 0x04e76276, 0x04cfb2b7,
+    0x04b8e38e, 0x04a2e8ba, 0x048db6db, 0x0479435e, 0x04658469, 0x045270d0,
+    0x04400000, 0x042e29f7, 0x041ce739, 0x040c30c3, 0x03fc0000, 0x03ec4ec4,
+    0x03dd1745, 0x03ce540f, 0x03c00000, 0x03b21642, 0x03a49249, 0x03976fc6,
+    0x038aaaaa, 0x037e3f1f, 0x03722983, 0x03666666, 0x035af286, 0x034fcace,
+    0x0344ec4e, 0x033a5440, 0x03300000, 0x0325ed09, 0x031c18f9, 0x0312818a,
+    0x03092492, 0x03000000, 0x02f711dc, 0x02ee5846, 0x02e5d174, 0x02dd7baf,
+    0x02d55555, 0x02cd5cd5, 0x02c590b2, 0x02bdef7b, 0x02b677d4, 0x02af286b,
+    0x02a80000, 0x02a0fd5c, 0x029a1f58, 0x029364d9, 0x028ccccc, 0x0286562d,
+    0x02800000, 0x0279c952, 0x0273b13b, 0x026db6db, 0x0267d95b, 0x026217ec,
+    0x025c71c7, 0x0256e62a, 0x0251745d, 0x024c1bac, 0x0246db6d, 0x0241b2f9,
+    0x023ca1af, 0x0237a6f4, 0x0232c234, 0x022df2df, 0x02293868, 0x02249249,
+    0x02200000, 0x021b810e, 0x021714fb, 0x0212bb51, 0x020e739c, 0x020a3d70,
+    0x02061861, 0x02020408, 0x01fe0000, 0x01fa0be8, 0x01f62762, 0x01f25213,
+    0x01ee8ba2, 0x01ead3ba, 0x01e72a07, 0x01e38e38, 0x01e00000, 0x01dc7f10,
+    0x01d90b21, 0x01d5a3e9, 0x01d24924, 0x01cefa8d, 0x01cbb7e3, 0x01c880e5,
+    0x01c55555, 0x01c234f7, 0x01bf1f8f, 0x01bc14e5, 0x01b914c1, 0x01b61eed,
+    0x01b33333, 0x01b05160, 0x01ad7943, 0x01aaaaaa, 0x01a7e567, 0x01a5294a,
+    0x01a27627, 0x019fcbd2, 0x019d2a20, 0x019a90e7, 0x01980000, 0x01957741,
+    0x0192f684, 0x01907da4, 0x018e0c7c, 0x018ba2e8, 0x018940c5, 0x0186e5f0,
+    0x01849249, 0x018245ae, 0x01800000, 0x017dc11f, 0x017b88ee, 0x0179574e,
+    0x01772c23, 0x01750750, 0x0172e8ba, 0x0170d045, 0x016ebdd7, 0x016cb157,
+    0x016aaaaa, 0x0168a9b9, 0x0166ae6a, 0x0164b8a7, 0x0162c859, 0x0160dd67,
+    0x015ef7bd, 0x015d1745, 0x015b3bea, 0x01596596, 0x01579435, 0x0155c7b4,
+    0x01540000, 0x01523d03, 0x01507eae, 0x014ec4ec, 0x014d0fac, 0x014b5edc,
+    0x0149b26c, 0x01480a4a, 0x01466666, 0x0144c6af, 0x01432b16, 0x0141938b,
+    0x01400000, 0x013e7063, 0x013ce4a9, 0x013b5cc0, 0x0139d89d, 0x01385830,
+    0x0136db6d, 0x01356246, 0x0133ecad, 0x01327a97, 0x01310bf6, 0x012fa0be,
+    0x012e38e3, 0x012cd459, 0x012b7315, 0x012a150a, 0x0128ba2e, 0x01276276,
+    0x01260dd6, 0x0124bc44, 0x01236db6, 0x01222222, 0x0120d97c, 0x011f93bc,
+    0x011e50d7, 0x011d10c4, 0x011bd37a, 0x011a98ef, 0x0119611a, 0x01182bf2,
+    0x0116f96f, 0x0115c988, 0x01149c34, 0x0113716a, 0x01124924, 0x01112358,
+    0x01100000, 0x010edf12, 0x010dc087, 0x010ca458, 0x010b8a7d, 0x010a72f0,
+    0x01095da8, 0x01084a9f, 0x010739ce, 0x01062b2e, 0x01051eb8, 0x01041465,
+    0x01030c30, 0x01020612, 0x01010204, 0x01000000 },
+  {   // alpha * KINV_255
+    0x00000000, 0x00010101, 0x00020202, 0x00030303, 0x00040404, 0x00050505,
+    0x00060606, 0x00070707, 0x00080808, 0x00090909, 0x000a0a0a, 0x000b0b0b,
+    0x000c0c0c, 0x000d0d0d, 0x000e0e0e, 0x000f0f0f, 0x00101010, 0x00111111,
+    0x00121212, 0x00131313, 0x00141414, 0x00151515, 0x00161616, 0x00171717,
+    0x00181818, 0x00191919, 0x001a1a1a, 0x001b1b1b, 0x001c1c1c, 0x001d1d1d,
+    0x001e1e1e, 0x001f1f1f, 0x00202020, 0x00212121, 0x00222222, 0x00232323,
+    0x00242424, 0x00252525, 0x00262626, 0x00272727, 0x00282828, 0x00292929,
+    0x002a2a2a, 0x002b2b2b, 0x002c2c2c, 0x002d2d2d, 0x002e2e2e, 0x002f2f2f,
+    0x00303030, 0x00313131, 0x00323232, 0x00333333, 0x00343434, 0x00353535,
+    0x00363636, 0x00373737, 0x00383838, 0x00393939, 0x003a3a3a, 0x003b3b3b,
+    0x003c3c3c, 0x003d3d3d, 0x003e3e3e, 0x003f3f3f, 0x00404040, 0x00414141,
+    0x00424242, 0x00434343, 0x00444444, 0x00454545, 0x00464646, 0x00474747,
+    0x00484848, 0x00494949, 0x004a4a4a, 0x004b4b4b, 0x004c4c4c, 0x004d4d4d,
+    0x004e4e4e, 0x004f4f4f, 0x00505050, 0x00515151, 0x00525252, 0x00535353,
+    0x00545454, 0x00555555, 0x00565656, 0x00575757, 0x00585858, 0x00595959,
+    0x005a5a5a, 0x005b5b5b, 0x005c5c5c, 0x005d5d5d, 0x005e5e5e, 0x005f5f5f,
+    0x00606060, 0x00616161, 0x00626262, 0x00636363, 0x00646464, 0x00656565,
+    0x00666666, 0x00676767, 0x00686868, 0x00696969, 0x006a6a6a, 0x006b6b6b,
+    0x006c6c6c, 0x006d6d6d, 0x006e6e6e, 0x006f6f6f, 0x00707070, 0x00717171,
+    0x00727272, 0x00737373, 0x00747474, 0x00757575, 0x00767676, 0x00777777,
+    0x00787878, 0x00797979, 0x007a7a7a, 0x007b7b7b, 0x007c7c7c, 0x007d7d7d,
+    0x007e7e7e, 0x007f7f7f, 0x00808080, 0x00818181, 0x00828282, 0x00838383,
+    0x00848484, 0x00858585, 0x00868686, 0x00878787, 0x00888888, 0x00898989,
+    0x008a8a8a, 0x008b8b8b, 0x008c8c8c, 0x008d8d8d, 0x008e8e8e, 0x008f8f8f,
+    0x00909090, 0x00919191, 0x00929292, 0x00939393, 0x00949494, 0x00959595,
+    0x00969696, 0x00979797, 0x00989898, 0x00999999, 0x009a9a9a, 0x009b9b9b,
+    0x009c9c9c, 0x009d9d9d, 0x009e9e9e, 0x009f9f9f, 0x00a0a0a0, 0x00a1a1a1,
+    0x00a2a2a2, 0x00a3a3a3, 0x00a4a4a4, 0x00a5a5a5, 0x00a6a6a6, 0x00a7a7a7,
+    0x00a8a8a8, 0x00a9a9a9, 0x00aaaaaa, 0x00ababab, 0x00acacac, 0x00adadad,
+    0x00aeaeae, 0x00afafaf, 0x00b0b0b0, 0x00b1b1b1, 0x00b2b2b2, 0x00b3b3b3,
+    0x00b4b4b4, 0x00b5b5b5, 0x00b6b6b6, 0x00b7b7b7, 0x00b8b8b8, 0x00b9b9b9,
+    0x00bababa, 0x00bbbbbb, 0x00bcbcbc, 0x00bdbdbd, 0x00bebebe, 0x00bfbfbf,
+    0x00c0c0c0, 0x00c1c1c1, 0x00c2c2c2, 0x00c3c3c3, 0x00c4c4c4, 0x00c5c5c5,
+    0x00c6c6c6, 0x00c7c7c7, 0x00c8c8c8, 0x00c9c9c9, 0x00cacaca, 0x00cbcbcb,
+    0x00cccccc, 0x00cdcdcd, 0x00cecece, 0x00cfcfcf, 0x00d0d0d0, 0x00d1d1d1,
+    0x00d2d2d2, 0x00d3d3d3, 0x00d4d4d4, 0x00d5d5d5, 0x00d6d6d6, 0x00d7d7d7,
+    0x00d8d8d8, 0x00d9d9d9, 0x00dadada, 0x00dbdbdb, 0x00dcdcdc, 0x00dddddd,
+    0x00dedede, 0x00dfdfdf, 0x00e0e0e0, 0x00e1e1e1, 0x00e2e2e2, 0x00e3e3e3,
+    0x00e4e4e4, 0x00e5e5e5, 0x00e6e6e6, 0x00e7e7e7, 0x00e8e8e8, 0x00e9e9e9,
+    0x00eaeaea, 0x00ebebeb, 0x00ececec, 0x00ededed, 0x00eeeeee, 0x00efefef,
+    0x00f0f0f0, 0x00f1f1f1, 0x00f2f2f2, 0x00f3f3f3, 0x00f4f4f4, 0x00f5f5f5,
+    0x00f6f6f6, 0x00f7f7f7, 0x00f8f8f8, 0x00f9f9f9, 0x00fafafa, 0x00fbfbfb,
+    0x00fcfcfc, 0x00fdfdfd, 0x00fefefe, 0x00ffffff }
+};
+
+static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
+  return kMultTables[!inverse][a];
+}
+
+#else
+
+static WEBP_INLINE uint32_t GetScale(uint32_t a, int inverse) {
+  return inverse ? (255u << MFIX) / a : a * KINV_255;
+}
+
+#endif    // USE_TABLES_FOR_ALPHA_MULT
+
+void WebPMultARGBRow(uint32_t* const ptr, int width, int inverse) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const uint32_t argb = ptr[x];
+    if (argb < 0xff000000u) {      // alpha < 255
+      if (argb <= 0x00ffffffu) {   // alpha == 0
+        ptr[x] = 0;
+      } else {
+        const uint32_t alpha = (argb >> 24) & 0xff;
+        const uint32_t scale = GetScale(alpha, inverse);
+        uint32_t out = argb & 0xff000000u;
+        out |= Mult(argb >>  0, scale) <<  0;
+        out |= Mult(argb >>  8, scale) <<  8;
+        out |= Mult(argb >> 16, scale) << 16;
+        ptr[x] = out;
+      }
+    }
+  }
+}
+
+void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
+                      int inverse) {
+  int n;
+  for (n = 0; n < num_rows; ++n) {
+    WebPMultARGBRow((uint32_t*)ptr, width, inverse);
+    ptr += stride;
+  }
+}
+
+void WebPMultRow(uint8_t* const ptr, const uint8_t* const alpha,
+                 int width, int inverse) {
+  int x;
+  for (x = 0; x < width; ++x) {
+    const uint32_t a = alpha[x];
+    if (a != 255) {
+      if (a == 0) {
+        ptr[x] = 0;
+      } else {
+        const uint32_t scale = GetScale(a, inverse);
+        ptr[x] = Mult(ptr[x], scale);
+      }
+    }
+  }
+}
+
+void WebPMultRows(uint8_t* ptr, int stride,
+                  const uint8_t* alpha, int alpha_stride,
+                  int width, int num_rows, int inverse) {
+  int n;
+  for (n = 0; n < num_rows; ++n) {
+    WebPMultRow(ptr, alpha, width, inverse);
+    ptr += stride;
+    alpha += alpha_stride;
+  }
+}
+
+#undef KINV_255
+#undef HALF
+#undef MFIX
+
diff --git a/third_party/libwebp/utils/alpha_processing.h b/third_party/libwebp/utils/alpha_processing.h
new file mode 100644
index 0000000..80e1ae4
--- /dev/null
+++ b/third_party/libwebp/utils/alpha_processing.h
@@ -0,0 +1,46 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Utilities for processing transparent channel.
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_UTILS_ALPHA_PROCESSING_H_
+#define WEBP_UTILS_ALPHA_PROCESSING_H_
+
+#include "../webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Pre-Multiply operation transforms x into x * A / 255  (where x=Y,R,G or B).
+// Un-Multiply operation transforms x into x * 255 / A.
+
+// Pre-Multiply or Un-Multiply (if 'inverse' is true) argb values in a row.
+void WebPMultARGBRow(uint32_t* const ptr, int width, int inverse);
+
+// Same as WebPMultARGBRow(), but for several rows.
+void WebPMultARGBRows(uint8_t* ptr, int stride, int width, int num_rows,
+                      int inverse);
+
+// Same for a row of single values, with side alpha values.
+void WebPMultRow(uint8_t* const ptr, const uint8_t* const alpha,
+                 int width, int inverse);
+
+// Same as WebPMultRow(), but for several 'num_rows' rows.
+void WebPMultRows(uint8_t* ptr, int stride,
+                  const uint8_t* alpha, int alpha_stride,
+                  int width, int num_rows, int inverse);
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif    // WEBP_UTILS_ALPHA_PROCESSING_H_
diff --git a/third_party/libwebp/utils/bit_reader.c b/third_party/libwebp/utils/bit_reader.c
index 677fa01..bfa4d7d 100644
--- a/third_party/libwebp/utils/bit_reader.c
+++ b/third_party/libwebp/utils/bit_reader.c
@@ -13,10 +13,6 @@
 
 #include "./bit_reader.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #ifndef USE_RIGHT_JUSTIFY
 #define MK(X) (((range_t)(X) << (BITS)) | (MASK))
 #else
@@ -209,6 +205,3 @@ uint32_t VP8LReadBits(VP8LBitReader* const br, int n_bits) {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/utils/bit_reader.h b/third_party/libwebp/utils/bit_reader.h
index 588222b..98df98a 100644
--- a/third_party/libwebp/utils/bit_reader.h
+++ b/third_party/libwebp/utils/bit_reader.h
@@ -19,10 +19,9 @@
 #ifdef _MSC_VER
 #include <stdlib.h>  // _byteswap_ulong
 #endif
-#include <string.h>  // For memcpy
 #include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -55,7 +54,7 @@ extern "C" {
 // And just after calling VP8LoadNewBytes():
 //  [........vvvvvvvvBBBBBBBBBBBBBBBB]LSB || [........vvvvvvvvBBBBBBBBBBBBBBBB]
 //
-// -> we're back to height active 'value_' bits (marked 'v') and BITS cached
+// -> we're back to eight active 'value_' bits (marked 'v') and BITS cached
 // bits (marked 'B')
 //
 // The right-justify strategy tends to use less shifts and is often faster.
@@ -155,7 +154,7 @@ static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
   if (br->buf_ + sizeof(lbit_t) <= br->buf_end_) {
     // convert memory type to register type (with some zero'ing!)
     bit_t bits;
-    lbit_t in_bits = *(lbit_t*)br->buf_;
+    const lbit_t in_bits = *(const lbit_t*)br->buf_;
     br->buf_ += (BITS) >> 3;
 #if !defined(__BIG_ENDIAN__)
 #if (BITS > 32)
@@ -179,8 +178,11 @@ static WEBP_INLINE void VP8LoadNewBytes(VP8BitReader* const br) {
     bits >>= 64 - BITS;
 #elif (BITS >= 24)
 #if defined(__i386__) || defined(__x86_64__)
-    __asm__ volatile("bswap %k0" : "=r"(in_bits) : "0"(in_bits));
-    bits = (bit_t)in_bits;   // 24b/32b -> 32b/64b zero-extension
+    {
+      lbit_t swapped_in_bits;
+      __asm__ volatile("bswap %k0" : "=r"(swapped_in_bits) : "0"(in_bits));
+      bits = (bit_t)swapped_in_bits;   // 24b/32b -> 32b/64b zero-extension
+    }
 #elif defined(_MSC_VER)
     bits = (bit_t)_byteswap_ulong(in_bits);
 #else
@@ -254,6 +256,7 @@ static WEBP_INLINE void VP8Shift(VP8BitReader* const br) {
   br->bits_ -= shift;
 #endif
 }
+
 static WEBP_INLINE int VP8GetBit(VP8BitReader* const br, int prob) {
 #ifndef USE_RIGHT_JUSTIFY
   // It's important to avoid generating a 64bit x 64bit multiply here.
@@ -282,7 +285,6 @@ static WEBP_INLINE int VP8GetSigned(VP8BitReader* const br, int v) {
   return bit ? -v : v;
 }
 
-
 // -----------------------------------------------------------------------------
 // Bitreader for lossless format
 
@@ -316,15 +318,16 @@ static WEBP_INLINE uint32_t VP8LPrefetchBits(VP8LBitReader* const br) {
   return (uint32_t)(br->val_ >> br->bit_pos_);
 }
 
-// Discard 'num_bits' bits from the cache.
-static WEBP_INLINE void VP8LDiscardBits(VP8LBitReader* const br, int num_bits) {
-  br->bit_pos_ += num_bits;
+// For jumping over a number of bits in the bit stream when accessed with
+// VP8LPrefetchBits and VP8LFillBitWindow.
+static WEBP_INLINE void VP8LSetBitPos(VP8LBitReader* const br, int val) {
+  br->bit_pos_ = val;
 }
 
-// Advances the Read buffer by 4 bytes to make room for reading next 32 bits.
+// Advances the read buffer by 4 bytes to make room for reading next 32 bits.
 void VP8LFillBitWindow(VP8LBitReader* const br);
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/utils/bit_writer.c b/third_party/libwebp/utils/bit_writer.c
index 3827a13..29810a1 100644
--- a/third_party/libwebp/utils/bit_writer.c
+++ b/third_party/libwebp/utils/bit_writer.c
@@ -17,10 +17,6 @@
 #include <stdlib.h>
 #include "./bit_writer.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // VP8BitWriter
 
@@ -43,7 +39,10 @@ static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) {
     bw->error_ = 1;
     return 0;
   }
-  memcpy(new_buf, bw->buf_, bw->pos_);
+  if (bw->pos_ > 0) {
+    assert(bw->buf_ != NULL);
+    memcpy(new_buf, bw->buf_, bw->pos_);
+  }
   free(bw->buf_);
   bw->buf_ = new_buf;
   bw->max_pos_ = new_size;
@@ -253,7 +252,7 @@ void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits) {
     uint8_t* p = &bw->buf_[bw->bit_pos_ >> 3];
     const int bits_reserved_in_first_byte = bw->bit_pos_ & 7;
     const int bits_left_to_write = n_bits - 8 + bits_reserved_in_first_byte;
-    // implicit & 0xff is assumed for uint8_t arithmetics
+    // implicit & 0xff is assumed for uint8_t arithmetic
     *p++ |= bits << bits_reserved_in_first_byte;
     bits >>= 8 - bits_reserved_in_first_byte;
     if (bits_left_to_write >= 1) {
@@ -281,6 +280,3 @@ void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits) {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/utils/bit_writer.h b/third_party/libwebp/utils/bit_writer.h
index cbb095c..89a9ead 100644
--- a/third_party/libwebp/utils/bit_writer.h
+++ b/third_party/libwebp/utils/bit_writer.h
@@ -16,7 +16,7 @@
 
 #include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -118,7 +118,7 @@ void VP8LWriteBits(VP8LBitWriter* const bw, int n_bits, uint32_t bits);
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/utils/color_cache.c b/third_party/libwebp/utils/color_cache.c
index 749db61..66a4464 100644
--- a/third_party/libwebp/utils/color_cache.c
+++ b/third_party/libwebp/utils/color_cache.c
@@ -16,10 +16,6 @@
 #include "./color_cache.h"
 #include "../utils/utils.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // VP8LColorCache.
 
@@ -41,6 +37,3 @@ void VP8LColorCacheClear(VP8LColorCache* const cc) {
   }
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}
-#endif
diff --git a/third_party/libwebp/utils/color_cache.h b/third_party/libwebp/utils/color_cache.h
index e5a0bd6f..0f824ed 100644
--- a/third_party/libwebp/utils/color_cache.h
+++ b/third_party/libwebp/utils/color_cache.h
@@ -17,7 +17,7 @@
 
 #include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -63,7 +63,7 @@ void VP8LColorCacheClear(VP8LColorCache* const color_cache);
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }
 #endif
 
diff --git a/third_party/libwebp/utils/filters.c b/third_party/libwebp/utils/filters.c
index eb5bb34..2d15bd0 100644
--- a/third_party/libwebp/utils/filters.c
+++ b/third_party/libwebp/utils/filters.c
@@ -16,19 +16,17 @@
 #include <stdlib.h>
 #include <string.h>
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // Helpful macro.
 
-# define SANITY_CHECK(in, out)                              \
-  assert(in != NULL);                                       \
-  assert(out != NULL);                                      \
-  assert(width > 0);                                        \
-  assert(height > 0);                                       \
-  assert(stride >= width);
+# define SANITY_CHECK(in, out)                                                 \
+  assert(in != NULL);                                                          \
+  assert(out != NULL);                                                         \
+  assert(width > 0);                                                           \
+  assert(height > 0);                                                          \
+  assert(stride >= width);                                                     \
+  assert(row >= 0 && num_rows > 0 && row + num_rows <= height);                \
+  (void)height;  // Silence unused warning.
 
 static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
                                     uint8_t* dst, int length, int inverse) {
@@ -45,20 +43,32 @@ static WEBP_INLINE void PredictLine(const uint8_t* src, const uint8_t* pred,
 
 static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
                                            int width, int height, int stride,
+                                           int row, int num_rows,
                                            int inverse, uint8_t* out) {
-  int h;
-  const uint8_t* preds = (inverse ? out : in);
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
   SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
+
+  if (row == 0) {
+    // Leftmost pixel is the same as input for topmost scanline.
+    out[0] = in[0];
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    row = 1;
+    preds += stride;
+    in += stride;
+    out += stride;
+  }
 
   // Filter line-by-line.
-  for (h = 0; h < height; ++h) {
-    // Leftmost pixel is predicted from above (except for topmost scanline).
-    if (h == 0) {
-      out[0] = in[0];
-    } else {
-      PredictLine(in, preds - stride, out, 1, inverse);
-    }
+  while (row < last_row) {
+    // Leftmost pixel is predicted from above.
+    PredictLine(in, preds - stride, out, 1, inverse);
     PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    ++row;
     preds += stride;
     in += stride;
     out += stride;
@@ -67,12 +77,12 @@ static WEBP_INLINE void DoHorizontalFilter(const uint8_t* in,
 
 static void HorizontalFilter(const uint8_t* data, int width, int height,
                              int stride, uint8_t* filtered_data) {
-  DoHorizontalFilter(data, width, height, stride, 0, filtered_data);
+  DoHorizontalFilter(data, width, height, stride, 0, height, 0, filtered_data);
 }
 
-static void HorizontalUnfilter(int width, int height, int stride,
-                               uint8_t* data) {
-  DoHorizontalFilter(data, width, height, stride, 1, data);
+static void HorizontalUnfilter(int width, int height, int stride, int row,
+                               int num_rows, uint8_t* data) {
+  DoHorizontalFilter(data, width, height, stride, row, num_rows, 1, data);
 }
 
 //------------------------------------------------------------------------------
@@ -80,32 +90,47 @@ static void HorizontalUnfilter(int width, int height, int stride,
 
 static WEBP_INLINE void DoVerticalFilter(const uint8_t* in,
                                          int width, int height, int stride,
+                                         int row, int num_rows,
                                          int inverse, uint8_t* out) {
-  int h;
-  const uint8_t* preds = (inverse ? out : in);
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
   SANITY_CHECK(in, out);
-
-  // Very first top-left pixel is copied.
-  out[0] = in[0];
-  // Rest of top scan-line is left-predicted.
-  PredictLine(in + 1, preds, out + 1, width - 1, inverse);
-
-  // Filter line-by-line.
-  for (h = 1; h < height; ++h) {
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
+
+  if (row == 0) {
+    // Very first top-left pixel is copied.
+    out[0] = in[0];
+    // Rest of top scan-line is left-predicted.
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    row = 1;
     in += stride;
     out += stride;
+  } else {
+    // We are starting from in-between. Make sure 'preds' points to prev row.
+    preds -= stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
     PredictLine(in, preds, out, width, inverse);
+    ++row;
     preds += stride;
+    in += stride;
+    out += stride;
   }
 }
 
 static void VerticalFilter(const uint8_t* data, int width, int height,
                            int stride, uint8_t* filtered_data) {
-  DoVerticalFilter(data, width, height, stride, 0, filtered_data);
+  DoVerticalFilter(data, width, height, stride, 0, height, 0, filtered_data);
 }
 
-static void VerticalUnfilter(int width, int height, int stride, uint8_t* data) {
-  DoVerticalFilter(data, width, height, stride, 1, data);
+static void VerticalUnfilter(int width, int height, int stride, int row,
+                             int num_rows, uint8_t* data) {
+  DoVerticalFilter(data, width, height, stride, row, num_rows, 1, data);
 }
 
 //------------------------------------------------------------------------------
@@ -116,23 +141,31 @@ static WEBP_INLINE int GradientPredictor(uint8_t a, uint8_t b, uint8_t c) {
   return ((g & ~0xff) == 0) ? g : (g < 0) ? 0 : 255;  // clip to 8bit
 }
 
-static WEBP_INLINE
-void DoGradientFilter(const uint8_t* in, int width, int height,
-                      int stride, int inverse, uint8_t* out) {
-  const uint8_t* preds = (inverse ? out : in);
-  int h;
+static WEBP_INLINE void DoGradientFilter(const uint8_t* in,
+                                         int width, int height, int stride,
+                                         int row, int num_rows,
+                                         int inverse, uint8_t* out) {
+  const uint8_t* preds;
+  const size_t start_offset = row * stride;
+  const int last_row = row + num_rows;
   SANITY_CHECK(in, out);
+  in += start_offset;
+  out += start_offset;
+  preds = inverse ? out : in;
 
   // left prediction for top scan-line
-  out[0] = in[0];
-  PredictLine(in + 1, preds, out + 1, width - 1, inverse);
-
-  // Filter line-by-line.
-  for (h = 1; h < height; ++h) {
-    int w;
+  if (row == 0) {
+    out[0] = in[0];
+    PredictLine(in + 1, preds, out + 1, width - 1, inverse);
+    row = 1;
     preds += stride;
     in += stride;
     out += stride;
+  }
+
+  // Filter line-by-line.
+  while (row < last_row) {
+    int w;
     // leftmost pixel: predict from above.
     PredictLine(in, preds - stride, out, 1, inverse);
     for (w = 1; w < width; ++w) {
@@ -141,16 +174,21 @@ void DoGradientFilter(const uint8_t* in, int width, int height,
                                          preds[w - stride - 1]);
       out[w] = in[w] + (inverse ? pred : -pred);
     }
+    ++row;
+    preds += stride;
+    in += stride;
+    out += stride;
   }
 }
 
 static void GradientFilter(const uint8_t* data, int width, int height,
                            int stride, uint8_t* filtered_data) {
-  DoGradientFilter(data, width, height, stride, 0, filtered_data);
+  DoGradientFilter(data, width, height, stride, 0, height, 0, filtered_data);
 }
 
-static void GradientUnfilter(int width, int height, int stride, uint8_t* data) {
-  DoGradientFilter(data, width, height, stride, 1, data);
+static void GradientUnfilter(int width, int height, int stride, int row,
+                             int num_rows, uint8_t* data) {
+  DoGradientFilter(data, width, height, stride, row, num_rows, 1, data);
 }
 
 #undef SANITY_CHECK
@@ -186,7 +224,8 @@ WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
     }
   }
   {
-    WEBP_FILTER_TYPE filter, best_filter = WEBP_FILTER_NONE;
+    int filter;
+    WEBP_FILTER_TYPE best_filter = WEBP_FILTER_NONE;
     int best_score = 0x7fffffff;
     for (filter = WEBP_FILTER_NONE; filter < WEBP_FILTER_LAST; ++filter) {
       int score = 0;
@@ -197,7 +236,7 @@ WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
       }
       if (score < best_score) {
         best_score = score;
-        best_filter = filter;
+        best_filter = (WEBP_FILTER_TYPE)filter;
       }
     }
     return best_filter;
@@ -225,6 +264,3 @@ const WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST] = {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/utils/filters.h b/third_party/libwebp/utils/filters.h
index 1f5fa16..dde39cb 100644
--- a/third_party/libwebp/utils/filters.h
+++ b/third_party/libwebp/utils/filters.h
@@ -16,7 +16,7 @@
 
 #include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -34,7 +34,7 @@ typedef enum {
 typedef void (*WebPFilterFunc)(const uint8_t* in, int width, int height,
                                int stride, uint8_t* out);
 typedef void (*WebPUnfilterFunc)(int width, int height, int stride,
-                                 uint8_t* data);
+                                 int row, int num_rows, uint8_t* data);
 
 // Filter the given data using the given predictor.
 // 'in' corresponds to a 2-dimensional pixel array of size (stride * height)
@@ -44,13 +44,15 @@ typedef void (*WebPUnfilterFunc)(int width, int height, int stride,
 extern const WebPFilterFunc WebPFilters[WEBP_FILTER_LAST];
 
 // In-place reconstruct the original data from the given filtered data.
+// The reconstruction will be done for 'num_rows' rows starting from 'row'
+// (assuming rows up to 'row - 1' are already reconstructed).
 extern const WebPUnfilterFunc WebPUnfilters[WEBP_FILTER_LAST];
 
 // Fast estimate of a potentially good filter.
-extern WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
-                                           int width, int height, int stride);
+WEBP_FILTER_TYPE EstimateBestFilter(const uint8_t* data,
+                                    int width, int height, int stride);
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/utils/huffman.c b/third_party/libwebp/utils/huffman.c
index 0ba9d05..8c5739f 100644
--- a/third_party/libwebp/utils/huffman.c
+++ b/third_party/libwebp/utils/huffman.c
@@ -13,13 +13,14 @@
 
 #include <assert.h>
 #include <stdlib.h>
+#include <string.h>
 #include "./huffman.h"
 #include "../utils/utils.h"
 #include "../webp/format_constants.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
+// Uncomment the following to use a look-up table for ReverseBitsShort()
+// (might be faster on some platforms).
+// #define USE_LUT_REVERSE_BITS
 
 #define NON_EXISTENT_SYMBOL (-1)
 
@@ -52,11 +53,14 @@ static int TreeInit(HuffmanTree* const tree, int num_leaves) {
   // Note that a Huffman tree is a full binary tree; and in a full binary tree
   // with L leaves, the total number of nodes N = 2 * L - 1.
   tree->max_nodes_ = 2 * num_leaves - 1;
+  assert(tree->max_nodes_ < (1 << 16));   // limit for the lut_jump_ table
   tree->root_ = (HuffmanTreeNode*)WebPSafeMalloc((uint64_t)tree->max_nodes_,
                                                  sizeof(*tree->root_));
   if (tree->root_ == NULL) return 0;
   TreeNodeInit(tree->root_);  // Initialize root.
   tree->num_nodes_ = 1;
+  memset(tree->lut_bits_, 255, sizeof(tree->lut_bits_));
+  memset(tree->lut_jump_, 0, sizeof(tree->lut_jump_));
   return 1;
 }
 
@@ -117,10 +121,54 @@ int HuffmanCodeLengthsToCodes(const int* const code_lengths,
   return 1;
 }
 
+#ifndef USE_LUT_REVERSE_BITS
+
+static int ReverseBitsShort(int bits, int num_bits) {
+  int retval = 0;
+  int i;
+  assert(num_bits <= 8);   // Not a hard requirement, just for coherency.
+  for (i = 0; i < num_bits; ++i) {
+    retval <<= 1;
+    retval |= bits & 1;
+    bits >>= 1;
+  }
+  return retval;
+}
+
+#else
+
+static const uint8_t kReversedBits[16] = {  // Pre-reversed 4-bit values.
+  0x0, 0x8, 0x4, 0xc, 0x2, 0xa, 0x6, 0xe,
+  0x1, 0x9, 0x5, 0xd, 0x3, 0xb, 0x7, 0xf
+};
+
+static int ReverseBitsShort(int bits, int num_bits) {
+  const uint8_t v = (kReversedBits[bits & 0xf] << 4) | kReversedBits[bits >> 4];
+  assert(num_bits <= 8);
+  return v >> (8 - num_bits);
+}
+
+#endif
+
 static int TreeAddSymbol(HuffmanTree* const tree,
                          int symbol, int code, int code_length) {
+  int step = HUFF_LUT_BITS;
+  int base_code;
   HuffmanTreeNode* node = tree->root_;
   const HuffmanTreeNode* const max_node = tree->root_ + tree->max_nodes_;
+  assert(symbol == (int16_t)symbol);
+  if (code_length <= HUFF_LUT_BITS) {
+    int i;
+    base_code = ReverseBitsShort(code, code_length);
+    for (i = 0; i < (1 << (HUFF_LUT_BITS - code_length)); ++i) {
+      const int idx = base_code | (i << code_length);
+      tree->lut_symbol_[idx] = (int16_t)symbol;
+      tree->lut_bits_[idx] = code_length;
+    }
+  } else {
+    base_code = ReverseBitsShort((code >> (code_length - HUFF_LUT_BITS)),
+                                 HUFF_LUT_BITS);
+  }
   while (code_length-- > 0) {
     if (node >= max_node) {
       return 0;
@@ -128,14 +176,17 @@ static int TreeAddSymbol(HuffmanTree* const tree,
     if (NodeIsEmpty(node)) {
       if (IsFull(tree)) return 0;    // error: too many symbols.
       AssignChildren(tree, node);
-    } else if (HuffmanTreeNodeIsLeaf(node)) {
+    } else if (!HuffmanTreeNodeIsNotLeaf(node)) {
       return 0;  // leaf is already occupied.
     }
     node += node->children_ + ((code >> code_length) & 1);
+    if (--step == 0) {
+      tree->lut_jump_[base_code] = (int16_t)(node - tree->root_);
+    }
   }
   if (NodeIsEmpty(node)) {
     node->children_ = 0;      // turn newly created node into a leaf.
-  } else if (!HuffmanTreeNodeIsLeaf(node)) {
+  } else if (HuffmanTreeNodeIsNotLeaf(node)) {
     return 0;   // trying to assign a symbol to already used code.
   }
   node->symbol_ = symbol;  // Add symbol in this node.
@@ -235,6 +286,3 @@ int HuffmanTreeBuildExplicit(HuffmanTree* const tree,
   return ok;
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/utils/huffman.h b/third_party/libwebp/utils/huffman.h
index 83a517e..e8afd27 100644
--- a/third_party/libwebp/utils/huffman.h
+++ b/third_party/libwebp/utils/huffman.h
@@ -17,7 +17,7 @@
 #include <assert.h>
 #include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -28,17 +28,24 @@ typedef struct {
 } HuffmanTreeNode;
 
 // Huffman Tree.
+#define HUFF_LUT_BITS 7
+#define HUFF_LUT (1U << HUFF_LUT_BITS)
 typedef struct HuffmanTree HuffmanTree;
 struct HuffmanTree {
+  // Fast lookup for short bit lengths.
+  uint8_t lut_bits_[HUFF_LUT];
+  int16_t lut_symbol_[HUFF_LUT];
+  int16_t lut_jump_[HUFF_LUT];
+  // Complete tree for lookups.
   HuffmanTreeNode* root_;   // all the nodes, starting at root.
   int max_nodes_;           // max number of nodes
   int num_nodes_;           // number of currently occupied nodes
 };
 
-// Returns true if the given node is a leaf of the Huffman tree.
-static WEBP_INLINE int HuffmanTreeNodeIsLeaf(
+// Returns true if the given node is not a leaf of the Huffman tree.
+static WEBP_INLINE int HuffmanTreeNodeIsNotLeaf(
     const HuffmanTreeNode* const node) {
-  return (node->children_ == 0);
+  return node->children_;
 }
 
 // Go down one level. Most critical function. 'right_child' must be 0 or 1.
@@ -73,7 +80,7 @@ int HuffmanCodeLengthsToCodes(const int* const code_lengths,
                               int code_lengths_size, int* const huff_codes);
 
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/utils/huffman_encode.c b/third_party/libwebp/utils/huffman_encode.c
index 9608666..9c59867 100644
--- a/third_party/libwebp/utils/huffman_encode.c
+++ b/third_party/libwebp/utils/huffman_encode.c
@@ -27,7 +27,7 @@ static int ValuesShouldBeCollapsedToStrideAverage(int a, int b) {
 }
 
 // Change the population counts in a way that the consequent
-// Hufmann tree compression, especially its RLE-part, give smaller output.
+// Huffman tree compression, especially its RLE-part, gives smaller output.
 static int OptimizeHuffmanForRle(int length, int* const counts) {
   uint8_t* good_for_rle;
   // 1) Let's make the Huffman code more compatible with rle encoding.
diff --git a/third_party/libwebp/utils/huffman_encode.h b/third_party/libwebp/utils/huffman_encode.h
index 0b81f47..ee51c68 100644
--- a/third_party/libwebp/utils/huffman_encode.h
+++ b/third_party/libwebp/utils/huffman_encode.h
@@ -16,7 +16,7 @@
 
 #include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -42,7 +42,7 @@ int VP8LCreateCompressedHuffmanTree(const HuffmanTreeCode* const tree,
 int VP8LCreateHuffmanTree(int* const histogram, int tree_depth_limit,
                           HuffmanTreeCode* const tree);
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }
 #endif
 
diff --git a/third_party/libwebp/utils/quant_levels.c b/third_party/libwebp/utils/quant_levels.c
index 42c7245..d7c8aab 100644
--- a/third_party/libwebp/utils/quant_levels.c
+++ b/third_party/libwebp/utils/quant_levels.c
@@ -16,10 +16,6 @@
 
 #include "./quant_levels.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #define NUM_SYMBOLS     256
 
 #define MAX_ITER  6             // Maximum number of convergence steps.
@@ -142,6 +138,3 @@ int QuantizeLevels(uint8_t* const data, int width, int height,
   return 1;
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/utils/quant_levels.h b/third_party/libwebp/utils/quant_levels.h
index 2d90828..1cb5a32 100644
--- a/third_party/libwebp/utils/quant_levels.h
+++ b/third_party/libwebp/utils/quant_levels.h
@@ -18,7 +18,7 @@
 
 #include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -29,7 +29,7 @@ extern "C" {
 int QuantizeLevels(uint8_t* const data, int width, int height, int num_levels,
                    uint64_t* const sse);
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/utils/quant_levels_dec.c b/third_party/libwebp/utils/quant_levels_dec.c
index d93594b..8489705 100644
--- a/third_party/libwebp/utils/quant_levels_dec.c
+++ b/third_party/libwebp/utils/quant_levels_dec.c
@@ -13,18 +13,12 @@
 
 #include "./quant_levels_dec.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
-int DequantizeLevels(uint8_t* const data, int width, int height) {
-  if (data == NULL || width <= 0 || height <= 0) return 0;
-  (void)data;
-  (void)width;
-  (void)height;
+int DequantizeLevels(uint8_t* const data, int width, int height,
+                     int row, int num_rows) {
+  if (data == NULL || width <= 0 || height <= 0 || row < 0 || num_rows < 0 ||
+      row + num_rows > height) {
+    return 0;
+  }
   return 1;
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/utils/quant_levels_dec.h b/third_party/libwebp/utils/quant_levels_dec.h
index 5891067..0288383 100644
--- a/third_party/libwebp/utils/quant_levels_dec.h
+++ b/third_party/libwebp/utils/quant_levels_dec.h
@@ -16,16 +16,18 @@
 
 #include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
-// Apply post-processing to input 'data' of size 'width'x'height' assuming
-// that the source was quantized to a reduced number of levels.
+// Apply post-processing to input 'data' of size 'width'x'height' assuming that
+// the source was quantized to a reduced number of levels. The post-processing
+// will be applied to 'num_rows' rows of 'data' starting from 'row'.
 // Returns false in case of error (data is NULL, invalid parameters, ...).
-int DequantizeLevels(uint8_t* const data, int width, int height);
+int DequantizeLevels(uint8_t* const data, int width, int height,
+                     int row, int num_rows);
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/utils/random.c b/third_party/libwebp/utils/random.c
new file mode 100644
index 0000000..24e96ad
--- /dev/null
+++ b/third_party/libwebp/utils/random.c
@@ -0,0 +1,43 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Pseudo-random utilities
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#include <string.h>
+#include "./random.h"
+
+//------------------------------------------------------------------------------
+
+// 31b-range values
+static const uint32_t kRandomTable[VP8_RANDOM_TABLE_SIZE] = {
+  0x0de15230, 0x03b31886, 0x775faccb, 0x1c88626a, 0x68385c55, 0x14b3b828,
+  0x4a85fef8, 0x49ddb84b, 0x64fcf397, 0x5c550289, 0x4a290000, 0x0d7ec1da,
+  0x5940b7ab, 0x5492577d, 0x4e19ca72, 0x38d38c69, 0x0c01ee65, 0x32a1755f,
+  0x5437f652, 0x5abb2c32, 0x0faa57b1, 0x73f533e7, 0x685feeda, 0x7563cce2,
+  0x6e990e83, 0x4730a7ed, 0x4fc0d9c6, 0x496b153c, 0x4f1403fa, 0x541afb0c,
+  0x73990b32, 0x26d7cb1c, 0x6fcc3706, 0x2cbb77d8, 0x75762f2a, 0x6425ccdd,
+  0x24b35461, 0x0a7d8715, 0x220414a8, 0x141ebf67, 0x56b41583, 0x73e502e3,
+  0x44cab16f, 0x28264d42, 0x73baaefb, 0x0a50ebed, 0x1d6ab6fb, 0x0d3ad40b,
+  0x35db3b68, 0x2b081e83, 0x77ce6b95, 0x5181e5f0, 0x78853bbc, 0x009f9494,
+  0x27e5ed3c
+};
+
+void VP8InitRandom(VP8Random* const rg, float dithering) {
+  memcpy(rg->tab_, kRandomTable, sizeof(rg->tab_));
+  rg->index1_ = 0;
+  rg->index2_ = 31;
+  rg->amp_ = (dithering < 0.0) ? 0
+           : (dithering > 1.0) ? (1 << VP8_RANDOM_DITHER_FIX)
+           : (uint32_t)((1 << VP8_RANDOM_DITHER_FIX) * dithering);
+}
+
+//------------------------------------------------------------------------------
+
diff --git a/third_party/libwebp/utils/random.h b/third_party/libwebp/utils/random.h
new file mode 100644
index 0000000..08a83e9
--- /dev/null
+++ b/third_party/libwebp/utils/random.h
@@ -0,0 +1,62 @@
+// Copyright 2013 Google Inc. All Rights Reserved.
+//
+// Use of this source code is governed by a BSD-style license
+// that can be found in the COPYING file in the root of the source
+// tree. An additional intellectual property rights grant can be found
+// in the file PATENTS. All contributing project authors may
+// be found in the AUTHORS file in the root of the source tree.
+// -----------------------------------------------------------------------------
+//
+// Pseudo-random utilities
+//
+// Author: Skal (pascal.massimino@gmail.com)
+
+#ifndef WEBP_UTILS_RANDOM_H_
+#define WEBP_UTILS_RANDOM_H_
+
+#include <assert.h>
+#include "../webp/types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define VP8_RANDOM_DITHER_FIX 8   // fixed-point precision for dithering
+#define VP8_RANDOM_TABLE_SIZE 55
+
+typedef struct {
+  int index1_, index2_;
+  uint32_t tab_[VP8_RANDOM_TABLE_SIZE];
+  int amp_;
+} VP8Random;
+
+// Initializes random generator with an amplitude 'dithering' in range [0..1].
+void VP8InitRandom(VP8Random* const rg, float dithering);
+
+// Returns a centered pseudo-random number with 'num_bits' amplitude.
+// (uses D.Knuth's Difference-based random generator).
+// 'amp' is in VP8_RANDOM_DITHER_FIX fixed-point precision.
+static WEBP_INLINE int VP8RandomBits2(VP8Random* const rg, int num_bits,
+                                      int amp) {
+  int diff;
+  assert(num_bits + VP8_RANDOM_DITHER_FIX <= 31);
+  diff = rg->tab_[rg->index1_] - rg->tab_[rg->index2_];
+  if (diff < 0) diff += (1u << 31);              // wrap back to 31b range
+  rg->tab_[rg->index1_] = diff;
+  if (++rg->index1_ == VP8_RANDOM_TABLE_SIZE) rg->index1_ = 0;
+  if (++rg->index2_ == VP8_RANDOM_TABLE_SIZE) rg->index2_ = 0;
+  diff = (int)((uint32_t)diff << 1) >> (32 - num_bits);  // sign-ext, 0-center
+  diff = (diff * amp) >> VP8_RANDOM_DITHER_FIX;  // restrict range
+  diff += 1 << (num_bits - 1);                   // shift back to 0.5-center
+  return diff;
+}
+
+static WEBP_INLINE int VP8RandomBits(VP8Random* const rg, int num_bits) {
+  return VP8RandomBits2(rg, num_bits, rg->amp_);
+}
+
+#ifdef __cplusplus
+}    // extern "C"
+#endif
+
+#endif  /* WEBP_UTILS_RANDOM_H_ */
diff --git a/third_party/libwebp/utils/rescaler.c b/third_party/libwebp/utils/rescaler.c
index e5ddc296..7061246 100644
--- a/third_party/libwebp/utils/rescaler.c
+++ b/third_party/libwebp/utils/rescaler.c
@@ -17,10 +17,6 @@
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #define RFIX 30
 #define MULT_FIX(x, y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX)
 
@@ -123,6 +119,11 @@ uint8_t* WebPRescalerExportRow(WebPRescaler* const wrk) {
 //------------------------------------------------------------------------------
 // all-in-one calls
 
+int WebPRescaleNeededLines(const WebPRescaler* const wrk, int max_num_lines) {
+  const int wanted = (wrk->y_accum + wrk->y_sub - 1) / wrk->y_sub;
+  return (wanted < max_num_lines) ? wanted : max_num_lines;  // cap at max
+}
+
 int WebPRescalerImport(WebPRescaler* const wrk, int num_lines,
                        const uint8_t* src, int src_stride) {
   int total_imported = 0;
@@ -149,6 +150,3 @@ int WebPRescalerExport(WebPRescaler* const rescaler) {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/utils/rescaler.h b/third_party/libwebp/utils/rescaler.h
index aedce46..68e49ce 100644
--- a/third_party/libwebp/utils/rescaler.h
+++ b/third_party/libwebp/utils/rescaler.h
@@ -14,7 +14,7 @@
 #ifndef WEBP_UTILS_RESCALER_H_
 #define WEBP_UTILS_RESCALER_H_
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -38,7 +38,8 @@ typedef struct {
 } WebPRescaler;
 
 // Initialize a rescaler given scratch area 'work' and dimensions of src & dst.
-void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
+void WebPRescalerInit(WebPRescaler* const rescaler,
+                      int src_width, int src_height,
                       uint8_t* const dst,
                       int dst_width, int dst_height, int dst_stride,
                       int num_channels,
@@ -46,6 +47,11 @@ void WebPRescalerInit(WebPRescaler* const wrk, int src_width, int src_height,
                       int y_add, int y_sub,
                       int32_t* const work);
 
+// Returns the number of input lines needed next to produce one output line,
+// capped at 'max_num_lines' (the maximum number of input lines available).
+int WebPRescaleNeededLines(const WebPRescaler* const rescaler,
+                           int max_num_lines);
+
 // Import a row of data and save its contribution in the rescaler.
 // 'channel' denotes the channel number to be imported.
 void WebPRescalerImportRow(WebPRescaler* const rescaler,
@@ -64,14 +70,14 @@ int WebPRescalerHasPendingOutput(const WebPRescaler* const rescaler) {
 
 // Export one row from rescaler. Returns the pointer where output was written,
 // or NULL if no row was pending.
-uint8_t* WebPRescalerExportRow(WebPRescaler* const wrk);
+uint8_t* WebPRescalerExportRow(WebPRescaler* const rescaler);
 
 // Export as many rows as possible. Return the numbers of rows written.
-int WebPRescalerExport(WebPRescaler* const wrk);
+int WebPRescalerExport(WebPRescaler* const rescaler);
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/utils/thread.c b/third_party/libwebp/utils/thread.c
index b1615d0..a9e3fae 100644
--- a/third_party/libwebp/utils/thread.c
+++ b/third_party/libwebp/utils/thread.c
@@ -15,10 +15,6 @@
 #include <string.h>   // for memset()
 #include "./thread.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 #ifdef WEBP_USE_THREAD
 
 #if defined(_WIN32)
@@ -126,14 +122,14 @@ static int pthread_cond_wait(pthread_cond_t* const condition,
   return !ok;
 }
 
-#else  // _WIN32
+#else  // !_WIN32
 # define THREADFN void*
 # define THREAD_RETURN(val) val
-#endif
+#endif  // _WIN32
 
 //------------------------------------------------------------------------------
 
-static THREADFN WebPWorkerThreadLoop(void *ptr) {    // thread loop
+static THREADFN ThreadLoop(void* ptr) {
   WebPWorker* const worker = (WebPWorker*)ptr;
   int done = 0;
   while (!done) {
@@ -142,9 +138,7 @@ static THREADFN WebPWorkerThreadLoop(void *ptr) {    // thread loop
       pthread_cond_wait(&worker->condition_, &worker->mutex_);
     }
     if (worker->status_ == WORK) {
-      if (worker->hook) {
-        worker->had_error |= !worker->hook(worker->data1, worker->data2);
-      }
+      WebPWorkerExecute(worker);
       worker->status_ = OK;
     } else if (worker->status_ == NOT_OK) {   // finish the worker
       done = 1;
@@ -157,8 +151,8 @@ static THREADFN WebPWorkerThreadLoop(void *ptr) {    // thread loop
 }
 
 // main thread state control
-static void WebPWorkerChangeState(WebPWorker* const worker,
-                                  WebPWorkerStatus new_status) {
+static void ChangeState(WebPWorker* const worker,
+                        WebPWorkerStatus new_status) {
   // no-op when attempting to change state on a thread that didn't come up
   if (worker->status_ < OK) return;
 
@@ -175,7 +169,7 @@ static void WebPWorkerChangeState(WebPWorker* const worker,
   pthread_mutex_unlock(&worker->mutex_);
 }
 
-#endif
+#endif  // WEBP_USE_THREAD
 
 //------------------------------------------------------------------------------
 
@@ -186,7 +180,7 @@ void WebPWorkerInit(WebPWorker* const worker) {
 
 int WebPWorkerSync(WebPWorker* const worker) {
 #ifdef WEBP_USE_THREAD
-  WebPWorkerChangeState(worker, OK);
+  ChangeState(worker, OK);
 #endif
   assert(worker->status_ <= OK);
   return !worker->had_error;
@@ -202,7 +196,7 @@ int WebPWorkerReset(WebPWorker* const worker) {
       return 0;
     }
     pthread_mutex_lock(&worker->mutex_);
-    ok = !pthread_create(&worker->thread_, NULL, WebPWorkerThreadLoop, worker);
+    ok = !pthread_create(&worker->thread_, NULL, ThreadLoop, worker);
     if (ok) worker->status_ = OK;
     pthread_mutex_unlock(&worker->mutex_);
 #else
@@ -215,19 +209,24 @@ int WebPWorkerReset(WebPWorker* const worker) {
   return ok;
 }
 
+void WebPWorkerExecute(WebPWorker* const worker) {
+  if (worker->hook == NULL) return;  // no hook installed: nothing to run
+  // A zero return from the hook is OR-ed into had_error as a failure.
+  worker->had_error |= !worker->hook(worker->data1, worker->data2);
+}
+
 void WebPWorkerLaunch(WebPWorker* const worker) {
 #ifdef WEBP_USE_THREAD
-  WebPWorkerChangeState(worker, WORK);
+  ChangeState(worker, WORK);
 #else
-  if (worker->hook)
-    worker->had_error |= !worker->hook(worker->data1, worker->data2);
+  WebPWorkerExecute(worker);
 #endif
 }
 
 void WebPWorkerEnd(WebPWorker* const worker) {
   if (worker->status_ >= OK) {
 #ifdef WEBP_USE_THREAD
-    WebPWorkerChangeState(worker, NOT_OK);
+    ChangeState(worker, NOT_OK);
     pthread_join(worker->thread_, NULL);
     pthread_mutex_destroy(&worker->mutex_);
     pthread_cond_destroy(&worker->condition_);
@@ -240,6 +239,3 @@ void WebPWorkerEnd(WebPWorker* const worker) {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/utils/thread.h b/third_party/libwebp/utils/thread.h
index 13a61a4..aef33bd 100644
--- a/third_party/libwebp/utils/thread.h
+++ b/third_party/libwebp/utils/thread.h
@@ -18,11 +18,11 @@
 #include "config.h"
 #endif
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
-#if WEBP_USE_THREAD
+#ifdef WEBP_USE_THREAD
 
 #if defined(_WIN32)
 
@@ -55,7 +55,7 @@ typedef int (*WebPWorkerHook)(void*, void*);
 
 // Synchronize object used to launch job in the worker thread
 typedef struct {
-#if WEBP_USE_THREAD
+#ifdef WEBP_USE_THREAD
   pthread_mutex_t mutex_;
   pthread_cond_t  condition_;
   pthread_t       thread_;
@@ -79,13 +79,18 @@ int WebPWorkerSync(WebPWorker* const worker);
 // hook/data1/data2 can be changed at any time before calling this function,
 // but not be changed afterward until the next call to WebPWorkerSync().
 void WebPWorkerLaunch(WebPWorker* const worker);
+// This function is similar to WebPWorkerLaunch() except that it calls the
+// hook directly instead of using a thread. Convenient to bypass the thread
+// mechanism while still using the WebPWorker structs. WebPWorkerSync() must
+// still be called afterward (for error reporting).
+void WebPWorkerExecute(WebPWorker* const worker);
 // Kill the thread and terminate the object. To use the object again, one
 // must call WebPWorkerReset() again.
 void WebPWorkerEnd(WebPWorker* const worker);
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/utils/utils.c b/third_party/libwebp/utils/utils.c
index 7eb0610..5592538 100644
--- a/third_party/libwebp/utils/utils.c
+++ b/third_party/libwebp/utils/utils.c
@@ -14,10 +14,6 @@
 #include <stdlib.h>
 #include "./utils.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
-extern "C" {
-#endif
-
 //------------------------------------------------------------------------------
 // Checked memory allocation
 
@@ -44,6 +40,3 @@ void* WebPSafeCalloc(uint64_t nmemb, size_t size) {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
-}    // extern "C"
-#endif
diff --git a/third_party/libwebp/utils/utils.h b/third_party/libwebp/utils/utils.h
index e10aeeb..8bdf0f0 100644
--- a/third_party/libwebp/utils/utils.h
+++ b/third_party/libwebp/utils/utils.h
@@ -19,7 +19,7 @@
 
 #include "../webp/types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -76,7 +76,7 @@ static WEBP_INLINE void PutLE32(uint8_t* const data, uint32_t val) {
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/webp/decode.h b/third_party/libwebp/webp/decode.h
index 141f861..0c3b62e 100644
--- a/third_party/libwebp/webp/decode.h
+++ b/third_party/libwebp/webp/decode.h
@@ -16,11 +16,11 @@
 
 #include "./types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
-#define WEBP_DECODER_ABI_VERSION 0x0201    // MAJOR(8b) + MINOR(8b)
+#define WEBP_DECODER_ABI_VERSION 0x0203    // MAJOR(8b) + MINOR(8b)
 
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // the types are left here for reference.
@@ -404,9 +404,9 @@ struct WebPBitstreamFeatures {
   int height;         // Height in pixels, as read from the bitstream.
   int has_alpha;      // True if the bitstream contains an alpha channel.
   int has_animation;  // True if the bitstream is an animation.
+  int format;         // 0 = undefined (/mixed), 1 = lossy, 2 = lossless
 
   // Unused for now:
-  int bitstream_version;        // should be 0 for now. TODO(later)
   int no_incremental_decoding;  // if true, using incremental decoding is not
                                 // recommended.
   int rotate;                   // TODO(later)
@@ -441,11 +441,12 @@ struct WebPDecoderOptions {
   int use_scaling;                    // if true, scaling is applied _afterward_
   int scaled_width, scaled_height;    // final resolution
   int use_threads;                    // if true, use multi-threaded decoding
+  int dithering_strength;             // dithering strength (0=Off, 100=full)
 
   // Unused for now:
   int force_rotation;                 // forced rotation (to be applied _last_)
   int no_enhancement;                 // if true, discard enhancement layer
-  uint32_t pad[6];                    // padding for later use
+  uint32_t pad[5];                    // padding for later use
 };
 
 // Main object storing the configuration for advanced decoding.
@@ -483,7 +484,7 @@ WEBP_EXTERN(WebPIDecoder*) WebPIDecode(const uint8_t* data, size_t data_size,
 WEBP_EXTERN(VP8StatusCode) WebPDecode(const uint8_t* data, size_t data_size,
                                       WebPDecoderConfig* config);
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/webp/demux.h b/third_party/libwebp/webp/demux.h
index 894ff4e..2da3239 100644
--- a/third_party/libwebp/webp/demux.h
+++ b/third_party/libwebp/webp/demux.h
@@ -12,44 +12,45 @@
 
 // Code Example: Demuxing WebP data to extract all the frames, ICC profile
 // and EXIF/XMP metadata.
-//
-//   WebPDemuxer* demux = WebPDemux(&webp_data);
-//
-//   uint32_t width = WebPDemuxGetI(demux, WEBP_FF_CANVAS_WIDTH);
-//   uint32_t height = WebPDemuxGetI(demux, WEBP_FF_CANVAS_HEIGHT);
-//   // ... (Get information about the features present in the WebP file).
-//   uint32_t flags = WebPDemuxGetI(demux, WEBP_FF_FORMAT_FLAGS);
-//
-//   // ... (Iterate over all frames).
-//   WebPIterator iter;
-//   if (WebPDemuxGetFrame(demux, 1, &iter)) {
-//     do {
-//       // ... (Consume 'iter'; e.g. Decode 'iter.fragment' with WebPDecode(),
-//       // ... and get other frame properties like width, height, offsets etc.
-//       // ... see 'struct WebPIterator' below for more info).
-//     } while (WebPDemuxNextFrame(&iter));
-//     WebPDemuxReleaseIterator(&iter);
-//   }
-//
-//   // ... (Extract metadata).
-//   WebPChunkIterator chunk_iter;
-//   if (flags & ICCP_FLAG) WebPDemuxGetChunk(demux, "ICCP", 1, &chunk_iter);
-//   // ... (Consume the ICC profile in 'chunk_iter.chunk').
-//   WebPDemuxReleaseChunkIterator(&chunk_iter);
-//   if (flags & EXIF_FLAG) WebPDemuxGetChunk(demux, "EXIF", 1, &chunk_iter);
-//   // ... (Consume the EXIF metadata in 'chunk_iter.chunk').
-//   WebPDemuxReleaseChunkIterator(&chunk_iter);
-//   if (flags & XMP_FLAG) WebPDemuxGetChunk(demux, "XMP ", 1, &chunk_iter);
-//   // ... (Consume the XMP metadata in 'chunk_iter.chunk').
-//   WebPDemuxReleaseChunkIterator(&chunk_iter);
-//   WebPDemuxDelete(demux);
+/*
+  WebPDemuxer* demux = WebPDemux(&webp_data);
+
+  uint32_t width = WebPDemuxGetI(demux, WEBP_FF_CANVAS_WIDTH);
+  uint32_t height = WebPDemuxGetI(demux, WEBP_FF_CANVAS_HEIGHT);
+  // ... (Get information about the features present in the WebP file).
+  uint32_t flags = WebPDemuxGetI(demux, WEBP_FF_FORMAT_FLAGS);
+
+  // ... (Iterate over all frames).
+  WebPIterator iter;
+  if (WebPDemuxGetFrame(demux, 1, &iter)) {
+    do {
+      // ... (Consume 'iter'; e.g. Decode 'iter.fragment' with WebPDecode(),
+      // ... and get other frame properties like width, height, offsets etc.
+      // ... see 'struct WebPIterator' below for more info).
+    } while (WebPDemuxNextFrame(&iter));
+    WebPDemuxReleaseIterator(&iter);
+  }
+
+  // ... (Extract metadata).
+  WebPChunkIterator chunk_iter;
+  if (flags & ICCP_FLAG) WebPDemuxGetChunk(demux, "ICCP", 1, &chunk_iter);
+  // ... (Consume the ICC profile in 'chunk_iter.chunk').
+  WebPDemuxReleaseChunkIterator(&chunk_iter);
+  if (flags & EXIF_FLAG) WebPDemuxGetChunk(demux, "EXIF", 1, &chunk_iter);
+  // ... (Consume the EXIF metadata in 'chunk_iter.chunk').
+  WebPDemuxReleaseChunkIterator(&chunk_iter);
+  if (flags & XMP_FLAG) WebPDemuxGetChunk(demux, "XMP ", 1, &chunk_iter);
+  // ... (Consume the XMP metadata in 'chunk_iter.chunk').
+  WebPDemuxReleaseChunkIterator(&chunk_iter);
+  WebPDemuxDelete(demux);
+*/
 
 #ifndef WEBP_WEBP_DEMUX_H_
 #define WEBP_WEBP_DEMUX_H_
 
 #include "./mux_types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -73,9 +74,11 @@ WEBP_EXTERN(int) WebPGetDemuxVersion(void);
 // Life of a Demux object
 
 typedef enum WebPDemuxState {
-  WEBP_DEMUX_PARSING_HEADER,  // Not enough data to parse full header.
-  WEBP_DEMUX_PARSED_HEADER,   // Header parsing complete, data may be available.
-  WEBP_DEMUX_DONE             // Entire file has been parsed.
+  WEBP_DEMUX_PARSE_ERROR    = -1,  // An error occurred while parsing.
+  WEBP_DEMUX_PARSING_HEADER =  0,  // Not enough data to parse full header.
+  WEBP_DEMUX_PARSED_HEADER  =  1,  // Header parsing complete,
+                                   // data may be available.
+  WEBP_DEMUX_DONE           =  2   // Entire file has been parsed.
 } WebPDemuxState;
 
 // Internal, version-checked, entry point
@@ -90,7 +93,12 @@ static WEBP_INLINE WebPDemuxer* WebPDemux(const WebPData* data) {
 
 // Parses the possibly incomplete WebP file given by 'data'.
 // If 'state' is non-NULL it will be set to indicate the status of the demuxer.
-// Returns a WebPDemuxer object on successful parse, NULL otherwise.
+// Returns NULL in case of error or if there isn't enough data to start parsing;
+// and a WebPDemuxer object on successful parse.
+// Note that WebPDemuxer keeps internal pointers to 'data' memory segment.
+// If this data is volatile, the demuxer object should be deleted (by calling
+// WebPDemuxDelete()) and WebPDemuxPartial() called again on the new data.
+// This is usually an inexpensive operation.
 static WEBP_INLINE WebPDemuxer* WebPDemuxPartial(
     const WebPData* data, WebPDemuxState* state) {
   return WebPDemuxInternal(data, 1, state, WEBP_DEMUX_ABI_VERSION);
@@ -145,7 +153,7 @@ struct WebPIterator {
 
 // Retrieves frame 'frame_number' from 'dmux'.
 // 'iter->fragment' points to the first fragment on return from this function.
-// Individual fragments may be extracted using WebPDemuxSetFragment().
+// Individual fragments may be extracted using WebPDemuxSelectFragment().
 // Setting 'frame_number' equal to 0 will return the last frame of the image.
 // Returns false if 'dmux' is NULL or frame 'frame_number' is not present.
 // Call WebPDemuxReleaseIterator() when use of the iterator is complete.
@@ -209,7 +217,7 @@ WEBP_EXTERN(void) WebPDemuxReleaseChunkIterator(WebPChunkIterator* iter);
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/webp/encode.h b/third_party/libwebp/webp/encode.h
index 726992f..7a428b4 100644
--- a/third_party/libwebp/webp/encode.h
+++ b/third_party/libwebp/webp/encode.h
@@ -16,11 +16,11 @@
 
 #include "./types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
-#define WEBP_ENCODER_ABI_VERSION 0x0201    // MAJOR(8b) + MINOR(8b)
+#define WEBP_ENCODER_ABI_VERSION 0x0202    // MAJOR(8b) + MINOR(8b)
 
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // the types are left here for reference.
@@ -117,7 +117,8 @@ struct WebPConfig {
 
   int show_compressed;    // if true, export the compressed picture back.
                           // In-loop filtering is not applied.
-  int preprocessing;      // preprocessing filter (0=none, 1=segment-smooth)
+  int preprocessing;      // preprocessing filter:
+                          // 0=none, 1=segment-smooth, 2=pseudo-random dithering
   int partitions;         // log2(number of token partitions) in [0..3]. Default
                           // is set to 0 for easier progressive decoding.
   int partition_limit;    // quality degradation allowed to fit the 512k limit
@@ -443,6 +444,13 @@ WEBP_EXTERN(int) WebPPictureImportBGRX(
 WEBP_EXTERN(int) WebPPictureARGBToYUVA(WebPPicture* picture,
                                        WebPEncCSP colorspace);
 
+// Same as WebPPictureARGBToYUVA(), but the conversion is done using
+// pseudo-random dithering with a strength 'dithering' between
+// 0.0 (no dithering) and 1.0 (maximum dithering). This is useful
+// for photographic pictures.
+WEBP_EXTERN(int) WebPPictureARGBToYUVADithered(
+    WebPPicture* picture, WebPEncCSP colorspace, float dithering);
+
 // Converts picture->yuv to picture->argb and sets picture->use_argb to true.
 // The input format must be YUV_420 or YUV_420A.
 // Note that the use of this method is discouraged if one has access to the
@@ -461,6 +469,11 @@ WEBP_EXTERN(void) WebPCleanupTransparentArea(WebPPicture* picture);
 // alpha plane can be ignored altogether e.g.).
 WEBP_EXTERN(int) WebPPictureHasTransparency(const WebPPicture* picture);
 
+// Remove the transparency information (if present) by blending the color with
+// the background color 'background_rgb' (specified as 24bit RGB triplet).
+// After this call, all alpha values are reset to 0xff.
+WEBP_EXTERN(void) WebPBlendAlpha(WebPPicture* pic, uint32_t background_rgb);
+
 //------------------------------------------------------------------------------
 // Main call
 
@@ -478,7 +491,7 @@ WEBP_EXTERN(int) WebPEncode(const WebPConfig* config, WebPPicture* picture);
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/webp/mux.h b/third_party/libwebp/webp/mux.h
index cd56650..eb57f51 100644
--- a/third_party/libwebp/webp/mux.h
+++ b/third_party/libwebp/webp/mux.h
@@ -7,7 +7,7 @@
 // be found in the AUTHORS file in the root of the source tree.
 // -----------------------------------------------------------------------------
 //
-//  RIFF container manipulation for WEBP images.
+//  RIFF container manipulation for WebP images.
 //
 // Authors: Urvang (urvang@google.com)
 //          Vikas (vikasa@google.com)
@@ -15,45 +15,47 @@
 // This API allows manipulation of WebP container images containing features
 // like color profile, metadata, animation and fragmented images.
 //
-// Code Example#1: Creating a MUX with image data, color profile and XMP
-// metadata.
-//
-//   int copy_data = 0;
-//   WebPMux* mux = WebPMuxNew();
-//   // ... (Prepare image data).
-//   WebPMuxSetImage(mux, &image, copy_data);
-//   // ... (Prepare ICCP color profile data).
-//   WebPMuxSetChunk(mux, "ICCP", &icc_profile, copy_data);
-//   // ... (Prepare XMP metadata).
-//   WebPMuxSetChunk(mux, "XMP ", &xmp, copy_data);
-//   // Get data from mux in WebP RIFF format.
-//   WebPMuxAssemble(mux, &output_data);
-//   WebPMuxDelete(mux);
-//   // ... (Consume output_data; e.g. write output_data.bytes to file).
-//   WebPDataClear(&output_data);
-//
+// Code Example#1: Create a WebPMux object with image data, color profile and
+// XMP metadata.
+/*
+  int copy_data = 0;
+  WebPMux* mux = WebPMuxNew();
+  // ... (Prepare image data).
+  WebPMuxSetImage(mux, &image, copy_data);
+  // ... (Prepare ICCP color profile data).
+  WebPMuxSetChunk(mux, "ICCP", &icc_profile, copy_data);
+  // ... (Prepare XMP metadata).
+  WebPMuxSetChunk(mux, "XMP ", &xmp, copy_data);
+  // Get data from mux in WebP RIFF format.
+  WebPMuxAssemble(mux, &output_data);
+  WebPMuxDelete(mux);
+  // ... (Consume output_data; e.g. write output_data.bytes to file).
+  WebPDataClear(&output_data);
+*/
+
 // Code Example#2: Get image and color profile data from a WebP file.
-//
-//   int copy_data = 0;
-//   // ... (Read data from file).
-//   WebPMux* mux = WebPMuxCreate(&data, copy_data);
-//   WebPMuxGetFrame(mux, 1, &image);
-//   // ... (Consume image; e.g. call WebPDecode() to decode the data).
-//   WebPMuxGetChunk(mux, "ICCP", &icc_profile);
-//   // ... (Consume icc_data).
-//   WebPMuxDelete(mux);
-//   free(data);
+/*
+  int copy_data = 0;
+  // ... (Read data from file).
+  WebPMux* mux = WebPMuxCreate(&data, copy_data);
+  WebPMuxGetFrame(mux, 1, &image);
+  // ... (Consume image; e.g. call WebPDecode() to decode the data).
+  WebPMuxGetChunk(mux, "ICCP", &icc_profile);
+  // ... (Consume icc_data).
+  WebPMuxDelete(mux);
+  free(data);
+*/
 
 #ifndef WEBP_WEBP_MUX_H_
 #define WEBP_WEBP_MUX_H_
 
 #include "./mux_types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
-#define WEBP_MUX_ABI_VERSION 0x0100        // MAJOR(8b) + MINOR(8b)
+#define WEBP_MUX_ABI_VERSION 0x0101        // MAJOR(8b) + MINOR(8b)
 
 // Note: forward declaring enumerations is not allowed in (strict) C and C++,
 // the types are left here for reference.
@@ -91,7 +93,7 @@ typedef enum WebPChunkId {
 //------------------------------------------------------------------------------
 
 // Returns the version number of the mux library, packed in hexadecimal using
-// 8bits or each of major/minor/revision. E.g: v2.5.7 is 0x020507.
+// 8bits for each of major/minor/revision. E.g: v2.5.7 is 0x020507.
 WEBP_EXTERN(int) WebPGetMuxVersion(void);
 
 //------------------------------------------------------------------------------
@@ -122,7 +124,7 @@ WEBP_EXTERN(WebPMux*) WebPMuxCreateInternal(const WebPData*, int, int);
 // Parameters:
 //   bitstream - (in) the bitstream data in WebP RIFF format
 //   copy_data - (in) value 1 indicates given data WILL be copied to the mux
-//               and value 0 indicates data will NOT be copied.
+//               object and value 0 indicates data will NOT be copied.
 // Returns:
 //   A pointer to the mux object created from given data - on success.
 //   NULL - In case of invalid data or memory error.
@@ -136,8 +138,8 @@ static WEBP_INLINE WebPMux* WebPMuxCreate(const WebPData* bitstream,
 
 // Note: Only non-image related chunks should be managed through chunk APIs.
 // (Image related chunks are: "ANMF", "FRGM", "VP8 ", "VP8L" and "ALPH").
-// To add, get and delete images, use APIs WebPMuxSetImage(),
-// WebPMuxPushFrame(), WebPMuxGetFrame() and WebPMuxDeleteFrame().
+// To add, get and delete images, use WebPMuxSetImage(), WebPMuxPushFrame(),
+// WebPMuxGetFrame() and WebPMuxDeleteFrame().
 
 // Adds a chunk with id 'fourcc' and data 'chunk_data' in the mux object.
 // Any existing chunk(s) with the same id will be removed.
@@ -147,7 +149,7 @@ static WEBP_INLINE WebPMux* WebPMuxCreate(const WebPData* bitstream,
 //                 e.g., "ICCP", "XMP ", "EXIF" etc.
 //   chunk_data - (in) the chunk data to be added
 //   copy_data - (in) value 1 indicates given data WILL be copied to the mux
-//               and value 0 indicates data will NOT be copied.
+//               object and value 0 indicates data will NOT be copied.
 // Returns:
 //   WEBP_MUX_INVALID_ARGUMENT - if mux, fourcc or chunk_data is NULL
 //                               or if fourcc corresponds to an image chunk.
@@ -165,7 +167,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetChunk(
 //                 e.g., "ICCP", "XMP ", "EXIF" etc.
 //   chunk_data - (out) returned chunk data
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if either mux, fourcc or chunk_data is NULL
+//   WEBP_MUX_INVALID_ARGUMENT - if mux, fourcc or chunk_data is NULL
 //                               or if fourcc corresponds to an image chunk.
 //   WEBP_MUX_NOT_FOUND - If mux does not contain a chunk with the given id.
 //   WEBP_MUX_OK - on success.
@@ -190,7 +192,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxDeleteChunk(
 
 // Encapsulates data about a single frame/fragment.
 struct WebPMuxFrameInfo {
-  WebPData    bitstream;  // image data: can either be a raw VP8/VP8L bitstream
+  WebPData    bitstream;  // image data: can be a raw VP8/VP8L bitstream
                           // or a single-image WebP file.
   int         x_offset;   // x-offset of the frame.
   int         y_offset;   // y-offset of the frame.
@@ -207,10 +209,10 @@ struct WebPMuxFrameInfo {
 // Note: Any existing images (including frames/fragments) will be removed.
 // Parameters:
 //   mux - (in/out) object in which the image is to be set
-//   bitstream - (in) can either be a raw VP8/VP8L bitstream or a single-image
+//   bitstream - (in) can be a raw VP8/VP8L bitstream or a single-image
 //               WebP file (non-animated and non-fragmented)
 //   copy_data - (in) value 1 indicates given data WILL be copied to the mux
-//               and value 0 indicates data will NOT be copied.
+//               object and value 0 indicates data will NOT be copied.
 // Returns:
 //   WEBP_MUX_INVALID_ARGUMENT - if mux is NULL or bitstream is NULL.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
@@ -229,7 +231,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetImage(
 //   mux - (in/out) object to which the frame is to be added
 //   frame - (in) frame data.
 //   copy_data - (in) value 1 indicates given data WILL be copied to the mux
-//               and value 0 indicates data will NOT be copied.
+//               object and value 0 indicates data will NOT be copied.
 // Returns:
 //   WEBP_MUX_INVALID_ARGUMENT - if mux or frame is NULL
 //                               or if content of 'frame' is invalid.
@@ -251,6 +253,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxPushFrame(
 //   WEBP_MUX_INVALID_ARGUMENT - if mux or frame is NULL.
 //   WEBP_MUX_NOT_FOUND - if there are less than nth frames in the mux object.
 //   WEBP_MUX_BAD_DATA - if nth frame chunk in mux is invalid.
+//   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
 WEBP_EXTERN(WebPMuxError) WebPMuxGetFrame(
     const WebPMux* mux, uint32_t nth, WebPMuxFrameInfo* frame);
@@ -286,7 +289,7 @@ struct WebPMuxAnimParams {
 //   mux - (in/out) object in which ANIM chunk is to be set/added
 //   params - (in) animation parameters.
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if either mux or params is NULL
+//   WEBP_MUX_INVALID_ARGUMENT - if mux or params is NULL.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
 //   WEBP_MUX_OK - on success.
 WEBP_EXTERN(WebPMuxError) WebPMuxSetAnimationParams(
@@ -297,7 +300,7 @@ WEBP_EXTERN(WebPMuxError) WebPMuxSetAnimationParams(
 //   mux - (in) object from which the animation parameters to be fetched
 //   params - (out) animation parameters extracted from the ANIM chunk
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if either of mux or params is NULL
+//   WEBP_MUX_INVALID_ARGUMENT - if mux or params is NULL.
 //   WEBP_MUX_NOT_FOUND - if ANIM chunk is not present in mux object.
 //   WEBP_MUX_OK - on success.
 WEBP_EXTERN(WebPMuxError) WebPMuxGetAnimationParams(
@@ -306,27 +309,44 @@ WEBP_EXTERN(WebPMuxError) WebPMuxGetAnimationParams(
 //------------------------------------------------------------------------------
 // Misc Utilities.
 
+// Gets the canvas size from the mux object.
+// Note: This method assumes that the VP8X chunk, if present, is up-to-date.
+// That is, the mux object hasn't been modified since the last call to
+// WebPMuxAssemble() or WebPMuxCreate().
+// Parameters:
+//   mux - (in) object from which the canvas size is to be fetched
+//   width - (out) canvas width
+//   height - (out) canvas height
+// Returns:
+//   WEBP_MUX_INVALID_ARGUMENT - if mux, width or height is NULL.
+//   WEBP_MUX_BAD_DATA - if VP8X/VP8/VP8L chunk or canvas size is invalid.
+//   WEBP_MUX_OK - on success.
+WEBP_EXTERN(WebPMuxError) WebPMuxGetCanvasSize(const WebPMux* mux,
+                                               int* width, int* height);
+
 // Gets the feature flags from the mux object.
+// Note: This method assumes that the VP8X chunk, if present, is up-to-date.
+// That is, the mux object hasn't been modified since the last call to
+// WebPMuxAssemble() or WebPMuxCreate().
 // Parameters:
 //   mux - (in) object from which the features are to be fetched
 //   flags - (out) the flags specifying which features are present in the
 //           mux object. This will be an OR of various flag values.
 //           Enum 'WebPFeatureFlags' can be used to test individual flag values.
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if mux or flags is NULL
-//   WEBP_MUX_NOT_FOUND - if VP8X chunk is not present in mux object.
-//   WEBP_MUX_BAD_DATA - if VP8X chunk in mux is invalid.
+//   WEBP_MUX_INVALID_ARGUMENT - if mux or flags is NULL.
+//   WEBP_MUX_BAD_DATA - if VP8X/VP8/VP8L chunk or canvas size is invalid.
 //   WEBP_MUX_OK - on success.
 WEBP_EXTERN(WebPMuxError) WebPMuxGetFeatures(const WebPMux* mux,
                                              uint32_t* flags);
 
-// Gets number of chunks having tag value tag in the mux object.
+// Gets number of chunks with the given 'id' in the mux object.
 // Parameters:
 //   mux - (in) object from which the info is to be fetched
 //   id - (in) chunk id specifying the type of chunk
 //   num_elements - (out) number of chunks with the given chunk id
 // Returns:
-//   WEBP_MUX_INVALID_ARGUMENT - if either mux, or num_elements is NULL
+//   WEBP_MUX_INVALID_ARGUMENT - if mux or num_elements is NULL.
 //   WEBP_MUX_OK - on success.
 WEBP_EXTERN(WebPMuxError) WebPMuxNumChunks(const WebPMux* mux,
                                            WebPChunkId id, int* num_elements);
@@ -342,16 +362,15 @@ WEBP_EXTERN(WebPMuxError) WebPMuxNumChunks(const WebPMux* mux,
 //   assembled_data - (out) assembled WebP data
 // Returns:
 //   WEBP_MUX_BAD_DATA - if mux object is invalid.
-//   WEBP_MUX_INVALID_ARGUMENT - if either mux, output_data or output_size is
-//                               NULL.
+//   WEBP_MUX_INVALID_ARGUMENT - if mux or assembled_data is NULL.
 //   WEBP_MUX_MEMORY_ERROR - on memory allocation error.
-//   WEBP_MUX_OK - on success
+//   WEBP_MUX_OK - on success.
 WEBP_EXTERN(WebPMuxError) WebPMuxAssemble(WebPMux* mux,
                                           WebPData* assembled_data);
 
 //------------------------------------------------------------------------------
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif
 
diff --git a/third_party/libwebp/webp/mux_types.h b/third_party/libwebp/webp/mux_types.h
index 94d8806..c94043a 100644
--- a/third_party/libwebp/webp/mux_types.h
+++ b/third_party/libwebp/webp/mux_types.h
@@ -18,7 +18,7 @@
 #include <string.h>  // memset()
 #include "./types.h"
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 extern "C" {
 #endif
 
@@ -90,7 +90,7 @@ static WEBP_INLINE int WebPDataCopy(const WebPData* src, WebPData* dst) {
   return 1;
 }
 
-#if defined(__cplusplus) || defined(c_plusplus)
+#ifdef __cplusplus
 }    // extern "C"
 #endif