9 files changed, 263 insertions, 244 deletions
diff --git a/third_party/libwebp/README.chromium b/third_party/libwebp/README.chromium
index 91cf9d2..7ef5f1d 100644
--- a/third_party/libwebp/README.chromium
+++ b/third_party/libwebp/README.chromium
@@ -1,7 +1,12 @@
 Name: libwebpdecode
 URL: http://code.google.com/speed/webp
 
-This contains a copy of libwebp-decode-0.1
+Here is a copy of libwebp-decode, from the repository:
+  git://review.webmproject.org/libwebp.git
+This code was pulled from MASTER on Dec 20th 2010, with the most recent
+Change-Id: I943a335b92b5ee6c2980c2ba9d4092f0b79f9a6b, "handle corner case of
+zero-dimensions".
+
 
 The project files do not include from the distribution:
   examples/
diff --git a/third_party/libwebp/bits.c b/third_party/libwebp/bits.c
index bdc89e0..a51775d 100644
--- a/third_party/libwebp/bits.c
+++ b/third_party/libwebp/bits.c
@@ -18,16 +18,16 @@ extern "C" {
 //-----------------------------------------------------------------------------
 // VP8BitReader
 
-int VP8Init(VP8BitReader* const br, const uint8_t* buf, uint32_t size) {
-  if (!br || !buf || size < 2) {
-    return 0;
-  }
-  br->buf_ = buf + 2;
+void VP8Init(VP8BitReader* const br, const uint8_t* buf, uint32_t size) {
+  assert(br);
+  assert(buf);
+  br->range_ = 255 - 1;
+  br->eof_ = 0;
+  br->buf_ = buf;
   br->buf_end_ = buf + size;
+  br->value_ = VP8GetByte(br) << 8;   // Need two initial bytes; read them in
+  br->value_ |= VP8GetByte(br);       // two statements to fix the byte order.
   br->left_ = -8;
-  br->value_ = (buf[0] << 8) | buf[1];
-  br->range_ = 255 - 1;
-  return 1;
 }
 
 const uint8_t kVP8Log2Range[128] = {
@@ -67,7 +67,7 @@ uint32_t VP8GetValue(VP8BitReader* const br, int bits) {
 }
 
 int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) {
-  const int value = (bits > 0) ? VP8GetValue(br, bits) : 0;
+  const int value = VP8GetValue(br, bits);
   return VP8Get(br) ? -value : value;
 }
 
diff --git a/third_party/libwebp/bits.h b/third_party/libwebp/bits.h
index e8a24e1..f19530f 100644
--- a/third_party/libwebp/bits.h
+++ b/third_party/libwebp/bits.h
@@ -33,8 +33,8 @@ typedef struct {
   int left_;                  // how many unused bits (negated)
 } VP8BitReader;
 
-// Initialize the bit reader and the boolean decoder. Return true if ok.
-int VP8Init(VP8BitReader* const br, const uint8_t* buf, uint32_t size);
+// Initialize the bit reader and the boolean decoder.
+void VP8Init(VP8BitReader* const br, const uint8_t* buf, uint32_t size);
 
 // return the next value made of 'num_bits' bits
 uint32_t VP8GetValue(VP8BitReader* const br, int num_bits);
diff --git a/third_party/libwebp/dsp.c b/third_party/libwebp/dsp.c
index 420b3b0..a64a9be 100644
--- a/third_party/libwebp/dsp.c
+++ b/third_party/libwebp/dsp.c
@@ -51,8 +51,7 @@ void VP8DspInitTables() {
 }
 
 static inline uint8_t clip_8b(int v) {
-  assert(v >= -255 && v <= 255 + 255);
-  return clip1[255 + v];
+  return (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255;
 }
 
 //-----------------------------------------------------------------------------
@@ -70,18 +69,24 @@ static void Transform(const int16_t* in, uint8_t* dst) {
   int i;
   tmp = C;
   for (i = 0; i < 4; ++i) {    // vertical pass
-    const int a = in[0] + in[8];
-    const int b = in[0] - in[8];
-    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);
-    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);
-    tmp[0] = a + d;
-    tmp[1] = b + c;
-    tmp[2] = b - c;
-    tmp[3] = a - d;
+    const int a = in[0] + in[8];    // [-4096, 4094]
+    const int b = in[0] - in[8];    // [-4095, 4095]
+    const int c = MUL(in[4], kC2) - MUL(in[12], kC1);   // [-3783, 3783]
+    const int d = MUL(in[4], kC1) + MUL(in[12], kC2);   // [-3785, 3781]
+    tmp[0] = a + d;   // [-7881, 7875]
+    tmp[1] = b + c;   // [-7878, 7878]
+    tmp[2] = b - c;   // [-7878, 7878]
+    tmp[3] = a - d;   // [-7877, 7879]
     tmp += 4;
     in++;
   }
-
+  // Each pass is expanding the dynamic range by ~3.85 (upper bound).
+  // The exact value is (2. + (kC1 + kC2) / 65536).
+  // After the second pass, maximum interval is [-3794, 3794], assuming
+  // an input in [-2048, 2047] interval. We then need to add a dst value
+  // in the [0, 255] range.
+  // In the worst case scenario, the input to clip_8b() can be as large as
+  // [-60713, 60968].
   tmp = C;
   for (i = 0; i < 4; ++i) {    // horizontal pass
     const int dc = tmp[0] + 4;
@@ -170,32 +175,32 @@ void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT;
 
 static inline void TrueMotion(uint8_t *dst, int size) {
   const uint8_t* top = dst - BPS;
-  const int tl = top[-1];
-  int x, y;
-
+  const uint8_t* const clip0 = clip1 + 255 - top[-1];
+  int y;
   for (y = 0; y < size; ++y) {
-    const uint8_t* const clip = clip1 + 255 + dst[-1] - tl;
+    const uint8_t* const clip = clip0 + dst[-1];
+    int x;
     for (x = 0; x < size; ++x) {
       dst[x] = clip[top[x]];
     }
     dst += BPS;
   }
 }
-static void TM4(uint8_t *dst) { TrueMotion(dst, 4); }
+static void TM4(uint8_t *dst)   { TrueMotion(dst, 4); }
 static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); }
-static void TM16(uint8_t *dst) { TrueMotion(dst, 16); }
+static void TM16(uint8_t *dst)  { TrueMotion(dst, 16); }
 
 //-----------------------------------------------------------------------------
 // 16x16
 
-static void V16(uint8_t *dst) {     // vertical
+static void VE16(uint8_t *dst) {     // vertical
   int j;
   for (j = 0; j < 16; ++j) {
     memcpy(dst + j * BPS, dst - BPS, 16);
   }
 }
 
-static void H16(uint8_t *dst) {     // horizontal
+static void HE16(uint8_t *dst) {     // horizontal
   int j;
   for (j = 16; j > 0; --j) {
     memset(dst, dst[-1], 16);
@@ -244,30 +249,24 @@ static void DC16NoTopLeft(uint8_t *dst) {  // DC with no top and left samples
 //-----------------------------------------------------------------------------
 // 4x4
 
-static inline void Put4(uint32_t v, uint8_t* dst) {
-  int i;
-  for (i = 4; i > 0; --i) {
-    *(uint32_t*)dst = v;
-    dst += BPS;
-  }
-}
-
 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2)
 #define AVG2(a, b) (((a) + (b) + 1) >> 1)
 
-static void V4(uint8_t *dst) {    // vertical
+static void VE4(uint8_t *dst) {    // vertical
   const uint8_t* top = dst - BPS;
   const uint8_t vals[4] = {
     AVG3(top[-1], top[0], top[1]),
-    AVG3(top[0], top[1], top[2]),
-    AVG3(top[1], top[2], top[3]),
-    AVG3(top[2], top[3], top[4])
+    AVG3(top[ 0], top[1], top[2]),
+    AVG3(top[ 1], top[2], top[3]),
+    AVG3(top[ 2], top[3], top[4])
   };
-  const uint32_t v = *(uint32_t*)vals;
-  Put4(v, dst);
+  int i;
+  for (i = 0; i < 4; ++i) {
+    memcpy(dst + i * BPS, vals, sizeof(vals));
+  }
 }
 
-static void H4(uint8_t *dst) {    // horizontal
+static void HE4(uint8_t *dst) {    // horizontal
   const int A = dst[-1 - BPS];
   const int B = dst[-1];
   const int C = dst[-1 + BPS];
@@ -282,10 +281,9 @@ static void H4(uint8_t *dst) {    // horizontal
 static void DC4(uint8_t *dst) {   // DC
   uint32_t dc = 4;
   int i;
-  for (i = 0; i < 4; ++i) {
-    dc += dst[i - BPS] + dst[-1 + i * BPS];
-  }
-  Put4((dc >> 3) * 0x01010101U, dst);
+  for (i = 0; i < 4; ++i) dc += dst[i - BPS] + dst[-1 + i * BPS];
+  dc >>= 3;
+  for (i = 0; i < 4; ++i) memset(dst + i * BPS, dc, 4);
 }
 
 static void RD4(uint8_t *dst) {   // Down-right
@@ -413,14 +411,14 @@ static void HD4(uint8_t *dst) {  // Horizontal-Down
 //-----------------------------------------------------------------------------
 // Chroma
 
-static void V8uv(uint8_t *dst) {    // vertical
+static void VE8uv(uint8_t *dst) {    // vertical
   int j;
   for (j = 0; j < 8; ++j) {
     memcpy(dst + j * BPS, dst - BPS, 8);
   }
 }
 
-static void H8uv(uint8_t *dst) {    // horizontal
+static void HE8uv(uint8_t *dst) {    // horizontal
   int j;
   for (j = 0; j < 8; ++j) {
     memset(dst, dst[-1], 8);
@@ -471,16 +469,16 @@ static void DC8uvNoTopLeft(uint8_t *dst) {    // DC with nothing
 // default C implementations
 
 VP8PredFunc VP8PredLuma4[11] = {
-  DC4, TM4, V4, H4, LD4, RD4, VR4, VL4, HD4, HU4
+  DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4
 };
 
 VP8PredFunc VP8PredLuma16[7] = {
-  DC16, TM16, V16, H16,
+  DC16, TM16, VE16, HE16,
   DC16NoTop, DC16NoLeft, DC16NoTopLeft
 };
 
 VP8PredFunc VP8PredChroma8[7] = {
-  DC8uv, TM8uv, V8uv, H8uv,
+  DC8uv, TM8uv, VE8uv, HE8uv,
   DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft
 };
 
diff --git a/third_party/libwebp/frame.c b/third_party/libwebp/frame.c
index 33cffd7..d934412 100644
--- a/third_party/libwebp/frame.c
+++ b/third_party/libwebp/frame.c
@@ -163,7 +163,7 @@ static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) {
   }
 }
 
-void VP8StoreBlock(VP8Decoder* const dec, VP8Io* const io) {
+void VP8StoreBlock(VP8Decoder* const dec) {
   if (dec->filter_type_ > 0) {
     VP8MB* const info = dec->mb_info_ + dec->mb_x_;
     int level = dec->filter_levels_[dec->segment_];
diff --git a/third_party/libwebp/tree.c b/third_party/libwebp/tree.c
index 1d5c422..03cb745 100644
--- a/third_party/libwebp/tree.c
+++ b/third_party/libwebp/tree.c
@@ -11,6 +11,7 @@
 
 #include <stdio.h>
 #include "vp8i.h"
+
 #define USE_GENERIC_TREE
 
 #if defined(__cplusplus) || defined(c_plusplus)
@@ -32,6 +33,12 @@ static const int8_t kYModesIntra4[18] = {
 #endif
 
 #ifndef ONLY_KEYFRAME_CODE
+
+// inter prediction modes
+enum {
+  LEFT4 = 0, ABOVE4 = 1, ZERO4 = 2, NEW4 = 3,
+  NEARESTMV, NEARMV, ZEROMV, NEWMV, SPLITMV };
+
 static const int8_t kYModesInter[8] = {
   -DC_PRED, 1,
     2, 3,
@@ -216,14 +223,13 @@ static const uint8_t
 
 // Paragraph 11.5
 static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
-  // genereated using vp8_kf_default_bmode_probs()
   { { 231, 120, 48, 89, 115, 113, 120, 152, 112 },
     { 152, 179, 64, 126, 170, 118, 46, 70, 95 },
     { 175, 69, 143, 80, 85, 82, 72, 155, 103 },
     { 56, 58, 10, 171, 218, 189, 17, 13, 152 },
-    { 144, 71, 10, 38, 171, 213, 144, 34, 26 },
     { 114, 26, 17, 163, 44, 195, 21, 10, 173 },
     { 121, 24, 80, 195, 26, 62, 44, 64, 85 },
+    { 144, 71, 10, 38, 171, 213, 144, 34, 26 },
     { 170, 46, 55, 19, 136, 160, 33, 206, 71 },
     { 63, 20, 8, 114, 114, 208, 12, 9, 226 },
     { 81, 40, 11, 96, 182, 84, 29, 16, 36 } },
@@ -231,9 +237,9 @@ static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
     { 72, 187, 100, 130, 157, 111, 32, 75, 80 },
     { 66, 102, 167, 99, 74, 62, 40, 234, 128 },
     { 41, 53, 9, 178, 241, 141, 26, 8, 107 },
-    { 104, 79, 12, 27, 217, 255, 87, 17, 7 },
     { 74, 43, 26, 146, 73, 166, 49, 23, 157 },
     { 65, 38, 105, 160, 51, 52, 31, 115, 128 },
+    { 104, 79, 12, 27, 217, 255, 87, 17, 7 },
     { 87, 68, 71, 44, 114, 51, 15, 186, 23 },
     { 47, 41, 14, 110, 182, 183, 21, 17, 194 },
     { 66, 45, 25, 102, 197, 189, 23, 18, 22 } },
@@ -241,9 +247,9 @@ static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
     { 43, 97, 183, 117, 85, 38, 35, 179, 61 },
     { 39, 53, 200, 87, 26, 21, 43, 232, 171 },
     { 56, 34, 51, 104, 114, 102, 29, 93, 77 },
-    { 107, 54, 32, 26, 51, 1, 81, 43, 31 },
     { 39, 28, 85, 171, 58, 165, 90, 98, 64 },
     { 34, 22, 116, 206, 23, 34, 43, 166, 73 },
+    { 107, 54, 32, 26, 51, 1, 81, 43, 31 },
     { 68, 25, 106, 22, 64, 171, 36, 225, 114 },
     { 34, 19, 21, 102, 132, 188, 16, 76, 124 },
     { 62, 18, 78, 95, 85, 57, 50, 48, 51 } },
@@ -251,29 +257,19 @@ static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
     { 60, 148, 31, 172, 219, 228, 21, 18, 111 },
     { 112, 113, 77, 85, 179, 255, 38, 120, 114 },
     { 40, 42, 1, 196, 245, 209, 10, 25, 109 },
-    { 100, 80, 8, 43, 154, 1, 51, 26, 71 },
     { 88, 43, 29, 140, 166, 213, 37, 43, 154 },
     { 61, 63, 30, 155, 67, 45, 68, 1, 209 },
+    { 100, 80, 8, 43, 154, 1, 51, 26, 71 },
     { 142, 78, 78, 16, 255, 128, 34, 197, 171 },
     { 41, 40, 5, 102, 211, 183, 4, 1, 221 },
     { 51, 50, 17, 168, 209, 192, 23, 25, 82 } },
-  { { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
-    { 95, 84, 53, 89, 128, 100, 113, 101, 45 },
-    { 75, 79, 123, 47, 51, 128, 81, 171, 1 },
-    { 57, 17, 5, 71, 102, 57, 53, 41, 49 },
-    { 115, 21, 2, 10, 102, 255, 166, 23, 6 },
-    { 38, 33, 13, 121, 57, 73, 26, 1, 85 },
-    { 41, 10, 67, 138, 77, 110, 90, 47, 114 },
-    { 101, 29, 16, 10, 85, 128, 101, 196, 26 },
-    { 57, 18, 10, 102, 102, 213, 34, 20, 43 },
-    { 117, 20, 15, 36, 163, 128, 68, 1, 26 } },
   { { 138, 31, 36, 171, 27, 166, 38, 44, 229 },
     { 67, 87, 58, 169, 82, 115, 26, 59, 179 },
     { 63, 59, 90, 180, 59, 166, 93, 73, 154 },
     { 40, 40, 21, 116, 143, 209, 34, 39, 175 },
-    { 57, 46, 22, 24, 128, 1, 54, 17, 37 },
     { 47, 15, 16, 183, 34, 223, 49, 45, 183 },
     { 46, 17, 33, 183, 6, 98, 15, 32, 183 },
+    { 57, 46, 22, 24, 128, 1, 54, 17, 37 },
     { 65, 32, 73, 115, 28, 128, 23, 128, 205 },
     { 40, 3, 9, 115, 51, 192, 18, 6, 223 },
     { 87, 37, 9, 115, 59, 77, 64, 21, 47 } },
@@ -281,19 +277,29 @@ static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
     { 64, 90, 70, 205, 40, 41, 23, 26, 57 },
     { 54, 57, 112, 184, 5, 41, 38, 166, 213 },
     { 30, 34, 26, 133, 152, 116, 10, 32, 134 },
-    { 75, 32, 12, 51, 192, 255, 160, 43, 51 },
     { 39, 19, 53, 221, 26, 114, 32, 73, 255 },
     { 31, 9, 65, 234, 2, 15, 1, 118, 73 },
+    { 75, 32, 12, 51, 192, 255, 160, 43, 51 },
     { 88, 31, 35, 67, 102, 85, 55, 186, 85 },
     { 56, 21, 23, 111, 59, 205, 45, 37, 192 },
     { 55, 38, 70, 124, 73, 102, 1, 34, 98 } },
+  { { 125, 98, 42, 88, 104, 85, 117, 175, 82 },
+    { 95, 84, 53, 89, 128, 100, 113, 101, 45 },
+    { 75, 79, 123, 47, 51, 128, 81, 171, 1 },
+    { 57, 17, 5, 71, 102, 57, 53, 41, 49 },
+    { 38, 33, 13, 121, 57, 73, 26, 1, 85 },
+    { 41, 10, 67, 138, 77, 110, 90, 47, 114 },
+    { 115, 21, 2, 10, 102, 255, 166, 23, 6 },
+    { 101, 29, 16, 10, 85, 128, 101, 196, 26 },
+    { 57, 18, 10, 102, 102, 213, 34, 20, 43 },
+    { 117, 20, 15, 36, 163, 128, 68, 1, 26 } },
   { { 102, 61, 71, 37, 34, 53, 31, 243, 192 },
     { 69, 60, 71, 38, 73, 119, 28, 222, 37 },
     { 68, 45, 128, 34, 1, 47, 11, 245, 171 },
     { 62, 17, 19, 70, 146, 85, 55, 62, 70 },
-    { 75, 15, 9, 9, 64, 255, 184, 119, 16 },
     { 37, 43, 37, 154, 100, 163, 85, 160, 1 },
     { 63, 9, 92, 136, 28, 64, 32, 201, 85 },
+    { 75, 15, 9, 9, 64, 255, 184, 119, 16 },
     { 86, 6, 28, 5, 64, 255, 25, 248, 1 },
     { 56, 8, 17, 132, 137, 255, 55, 116, 128 },
     { 58, 15, 20, 82, 135, 57, 26, 121, 40 } },
@@ -301,9 +307,9 @@ static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
     { 51, 103, 44, 131, 131, 123, 31, 6, 158 },
     { 86, 40, 64, 135, 148, 224, 45, 183, 128 },
     { 22, 26, 17, 131, 240, 154, 14, 1, 209 },
-    { 83, 12, 13, 54, 192, 255, 68, 47, 28 },
     { 45, 16, 21, 91, 64, 222, 7, 1, 197 },
     { 56, 21, 39, 155, 60, 138, 23, 102, 213 },
+    { 83, 12, 13, 54, 192, 255, 68, 47, 28 },
     { 85, 26, 85, 85, 128, 128, 32, 146, 171 },
     { 18, 11, 7, 63, 144, 171, 4, 4, 246 },
     { 35, 27, 10, 146, 174, 171, 12, 26, 128 } },
@@ -311,9 +317,9 @@ static const uint8_t kBModesProba[NUM_BMODES][NUM_BMODES][NUM_BMODES - 1] = {
     { 85, 126, 47, 87, 176, 51, 41, 20, 32 },
     { 101, 75, 128, 139, 118, 146, 116, 128, 85 },
     { 56, 41, 15, 176, 236, 85, 37, 9, 62 },
-    { 146, 36, 19, 30, 171, 255, 97, 27, 20 },
     { 71, 30, 17, 119, 118, 255, 17, 18, 138 },
     { 101, 38, 60, 138, 55, 70, 43, 26, 142 },
+    { 146, 36, 19, 30, 171, 255, 97, 27, 20 },
     { 138, 45, 61, 62, 219, 1, 81, 188, 64 },
     { 32, 41, 20, 117, 151, 142, 20, 21, 163 },
     { 112, 19, 12, 61, 195, 128, 48, 4, 24 } }
@@ -551,25 +557,27 @@ void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec) {
   }
 #ifndef ONLY_KEYFRAME_CODE
   if (!dec->frm_hdr_.key_frame_) {
+    int i;
     dec->intra_p_ = VP8GetValue(br, 8);
     dec->last_p_ = VP8GetValue(br, 8);
     dec->golden_p_ = VP8GetValue(br, 8);
     if (VP8Get(br)) {   // update y-mode
-      for (int i = 0; i < 4; ++i) {
+      for (i = 0; i < 4; ++i) {
         proba->ymode_[i] = VP8GetValue(br, 8);
       }
     }
     if (VP8Get(br)) {   // update uv-mode
-      for (int i = 0; i < 3; ++i) {
+      for (i = 0; i < 3; ++i) {
         proba->uvmode_[i] = VP8GetValue(br, 8);
       }
     }
     // update MV
-    for (int d = 0; d < 2; ++d) {
-      for (int k = 0; k < NUM_MV_PROBAS; ++k) {
-        if (VP8GetBit(br, MVUpdateProba[d][k])) {
+    for (i = 0; i < 2; ++i) {
+      int k;
+      for (k = 0; k < NUM_MV_PROBAS; ++k) {
+        if (VP8GetBit(br, MVUpdateProba[i][k])) {
           const int v = VP8GetValue(br, 7);
-          proba->mv_[d][k] = v ? v << 1 : 1;
+          proba->mv_[i][k] = v ? v << 1 : 1;
         }
       }
     }
diff --git a/third_party/libwebp/vp8.c b/third_party/libwebp/vp8.c
index 38018f2..caae7d9 100644
--- a/third_party/libwebp/vp8.c
+++ b/third_party/libwebp/vp8.c
@@ -188,13 +188,13 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
     return 0;
   }
   SetOk(dec);
-  if (io == NULL || io->data == NULL || io->data_size <= 4) {
+  if (io == NULL) {
     return VP8SetError(dec, 2, "null VP8Io passed to VP8GetHeaders()");
   }
 
   buf = (uint8_t *)io->data;
   buf_size = io->data_size;
-  if (buf_size < 4) {
+  if (buf == NULL || buf_size <= 4) {
     return VP8SetError(dec, 2, "Not enough data to parse frame header");
   }
 
@@ -260,6 +260,9 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
 
   br = &dec->br_;
   VP8Init(br, buf, buf_size);
+  if (frm_hdr->partition_length_ > buf_size) {
+    return VP8SetError(dec, 2, "bad partition length");
+  }
   buf += frm_hdr->partition_length_;
   buf_size -= frm_hdr->partition_length_;
   if (frm_hdr->key_frame_) {
@@ -302,15 +305,16 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) {
   }
 
   // Paragraph 9.8
+#ifndef ONLY_KEYFRAME_CODE
   dec->update_proba_ = VP8Get(br);
   if (!dec->update_proba_) {    // save for later restore
     dec->proba_saved_ = dec->proba_;
   }
-
-#ifndef ONLY_KEYFRAME_CODE
   dec->buffer_flags_ &= 1 << 8;
   dec->buffer_flags_ |=
       (frm_hdr->key_frame_ || VP8Get(br)) << 8;    // refresh last frame
+#else
+  VP8Get(br);   // just ignore the value of update_proba_
 #endif
 
   VP8ParseProba(br, dec);
@@ -328,24 +332,24 @@ static const uint8_t kBands[16 + 1] = {
   0  // extra entry as sentinel
 };
 
-static const uint8_t kCat3[] = {173, 148, 140, 0};
-static const uint8_t kCat4[] = {176, 155, 140, 135, 0};
-static const uint8_t kCat5[] = {180, 157, 141, 134, 130, 0};
+static const uint8_t kCat3[] = { 173, 148, 140, 0 };
+static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 };
+static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 };
 static const uint8_t kCat6[] =
-  {254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0};
+  { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 };
 static const uint8_t * const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 };
 static const uint8_t kZigzag[16] = {
   0, 1, 4, 8,  5, 2, 3, 6,  9, 12, 13, 10,  7, 11, 14, 15
 };
 
-typedef const uint8_t PROBA_ARRAY[NUM_CTX][NUM_PROBAS];
+typedef const uint8_t (*ProbaArray)[NUM_CTX][NUM_PROBAS];  // for const-casting
 
-static int GetCoeffs(VP8BitReader* const br,
-                     const uint8_t (*prob)[NUM_CTX][NUM_PROBAS],
+// Returns 1 if there are non-zero coeffs, 0 otherwise.
+static int GetCoeffs(VP8BitReader* const br, ProbaArray prob,
                      int ctx, const uint16_t dq[2], int n, int16_t* out) {
   const uint8_t* p = prob[kBands[n]][ctx];
   if (!VP8GetBit(br, p[0])) {   // first EOB is more a 'CBP' bit.
-    return -1;
+    return 0;
   }
   while (1) {
     ++n;
@@ -371,12 +375,12 @@ static int GetCoeffs(VP8BitReader* const br,
               v = 7 + 2 * VP8GetBit(br, 165) + VP8GetBit(br, 145);
             }
           } else {
-            uint8_t* tab;
+            const uint8_t* tab;
             const int bit1 = VP8GetBit(br, p[8]);
             const int bit0 = VP8GetBit(br, p[9 + bit1]);
             const int cat = 2 * bit1 + bit0;
             v = 0;
-            for (tab = (uint8_t*)kCat3456[cat]; *tab; ++tab) {
+            for (tab = kCat3456[cat]; *tab; ++tab) {
               v += v + VP8GetBit(br, *tab);
             }
             v += 3 + (8 << cat);
@@ -386,13 +390,15 @@ static int GetCoeffs(VP8BitReader* const br,
       }
       j = kZigzag[n - 1];
       out[j] = VP8GetSigned(br, v) * dq[j > 0];
-      if (n == 16) break;
-      if (!VP8GetBit(br, p[0])) {   // EOB
-        return n;
+      if (n == 16 || !VP8GetBit(br, p[0])) {   // EOB
+        return 1;
       }
     }
+    if (n == 16) {
+      return 1;
+    }
   }
-  return 15;
+  return 0;
 }
 
 // Table to unpack four bits into four bytes
@@ -403,14 +409,18 @@ static const uint8_t kUnpackTab[16][4] = {
   {0, 0, 1, 1},  {1, 0, 1, 1},  {0, 1, 1, 1},  {1, 1, 1, 1} };
 
 // Macro to pack four LSB of four bytes into four bits.
-#define PACK(X, S) \
-  ((((*(uint32_t*)(X)) * 0x01020408U) & 0xff000000) >> (S))
+#if defined(__PPC__) || defined(_M_PPC) || defined(_ARCH_PPC) || \
+    defined(__BIG_ENDIAN__)
+#define PACK_CST 0x08040201U
+#else
+#define PACK_CST 0x01020408U
+#endif
+#define PACK(X, S) ((((*(uint32_t*)(X)) * PACK_CST) & 0xff000000) >> (S))
 
-typedef const uint8_t (*Proba_t)[NUM_CTX][NUM_PROBAS];  // for const-casting
 static int ParseResiduals(VP8Decoder* const dec,
                           VP8MB* const mb, VP8BitReader* const token_br) {
   int out_t_nz, out_l_nz, first;
-  Proba_t ac_prob;
+  ProbaArray ac_prob;
   const VP8QuantMatrix* q = &dec->dqm_[dec->segment_];
   int16_t* dst = dec->coeffs_;
   VP8MB* const left_mb = dec->mb_info_ - 1;
@@ -424,15 +434,15 @@ static int ParseResiduals(VP8Decoder* const dec,
   if (!dec->is_i4x4_) {    // parse DC
     int16_t dc[16] = { 0 };
     const int ctx = mb->dc_nz_ + left_mb->dc_nz_;
-    const int last = GetCoeffs(token_br, (Proba_t)dec->proba_.coeffs_[1],
-                               ctx, q->y2_mat_, 0, dc);
-    mb->dc_nz_ = left_mb->dc_nz_ = (last >= 0);
+    mb->dc_nz_ = left_mb->dc_nz_ =
+        GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[1],
+                  ctx, q->y2_mat_, 0, dc);
     first = 1;
-    ac_prob = (Proba_t)dec->proba_.coeffs_[0];
+    ac_prob = (ProbaArray)dec->proba_.coeffs_[0];
     VP8TransformWHT(dc, dst);
   } else {
     first = 0;
-    ac_prob = (Proba_t)dec->proba_.coeffs_[3];
+    ac_prob = (ProbaArray)dec->proba_.coeffs_[3];
   }
 
   memcpy(tnz, kUnpackTab[mb->nz_ & 0xf], sizeof(tnz));
@@ -442,11 +452,10 @@ static int ParseResiduals(VP8Decoder* const dec,
 
     for (x = 0; x < 4; ++x) {
       const int ctx = l + tnz[x];
-      const int last = GetCoeffs(token_br, ac_prob, ctx,
-                                 q->y1_mat_, first, dst);
+      l = GetCoeffs(token_br, ac_prob, ctx,
+                    q->y1_mat_, first, dst);
       nz_dc[x] = (dst[0] != 0);
-      nz_ac[x] = (last > 0);
-      tnz[x] = l = (last >= 0);
+      nz_ac[x] = tnz[x] = l;
       dst += 16;
     }
     lnz[y] = l;
@@ -463,12 +472,10 @@ static int ParseResiduals(VP8Decoder* const dec,
       int l = lnz[ch + y];
       for (x = 0; x < 2; ++x) {
         const int ctx = l + tnz[ch + x];
-        const int last =
-            GetCoeffs(token_br, (Proba_t)dec->proba_.coeffs_[2],
+        l = GetCoeffs(token_br, (ProbaArray)dec->proba_.coeffs_[2],
                       ctx, q->uv_mat_, 0, dst);
         nz_dc[y * 2 + x] = (dst[0] != 0);
-        nz_ac[y * 2 + x] = (last > 0);
-        tnz[ch + x] = l = (last >= 0);
+        nz_ac[y * 2 + x] = tnz[ch + x] = l;
         dst += 16;
       }
       lnz[ch + y] = l;
@@ -537,7 +544,7 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
       VP8ReconstructBlock(dec);
 
       // Store data and save block's filtering params
-      VP8StoreBlock(dec, io);
+      VP8StoreBlock(dec);
     }
     if (!ok) {
       break;
@@ -550,9 +557,12 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) {
   }
 
   // Finish
+#ifndef ONLY_KEYFRAME_CODE
   if (!dec->update_proba_) {
     dec->proba_ = dec->proba_saved_;
   }
+#endif
+
   return ok;
 }
 
diff --git a/third_party/libwebp/vp8i.h b/third_party/libwebp/vp8i.h
index 79f0b2c..76985b8 100644
--- a/third_party/libwebp/vp8i.h
+++ b/third_party/libwebp/vp8i.h
@@ -29,9 +29,9 @@ enum { B_DC_PRED = 0,   // 4x4 modes
        B_TM_PRED,
        B_VE_PRED,
        B_HE_PRED,
-       B_LD_PRED,
        B_RD_PRED,
        B_VR_PRED,
+       B_LD_PRED,
        B_VL_PRED,
        B_HD_PRED,
        B_HU_PRED,
@@ -47,13 +47,6 @@ enum { B_DC_PRED = 0,   // 4x4 modes
        B_DC_PRED_NOLEFT = 5,
        B_DC_PRED_NOTOPLEFT = 6 };
 
-#ifndef ONLY_KEYFRAME_CODE
-// inter prediction modes
-enum {
-  LEFT4 = 0, ABOVE4 = 1, ZERO4 = 2, NEW4 = 3,
-  NEARESTMV, NEARMV, ZEROMV, NEWMV, SPLITMV };
-#endif
-
 enum { MB_FEATURE_TREE_PROBS = 3,
        NUM_MB_SEGMENTS = 4,
        NUM_REF_LF_DELTAS = 4,
@@ -177,10 +170,10 @@ struct VP8Decoder {
   VP8BitReader br_;
 
   // headers
-  VP8FrameHeader    frm_hdr_;
-  VP8PictureHeader  pic_hdr_;
-  VP8FilterHeader   filter_hdr_;
-  VP8SegmentHeader  segment_hdr_;
+  VP8FrameHeader   frm_hdr_;
+  VP8PictureHeader pic_hdr_;
+  VP8FilterHeader  filter_hdr_;
+  VP8SegmentHeader segment_hdr_;
 
   // dimension, in macroblock units.
   int mb_w_, mb_h_;
@@ -201,10 +194,14 @@ struct VP8Decoder {
   VP8QuantMatrix dqm_[NUM_MB_SEGMENTS];
 
   // probabilities
-  VP8Proba proba_, proba_saved_;
-  int update_proba_;
+  VP8Proba proba_;
   int use_skip_proba_;
-  uint8_t skip_p_, intra_p_, last_p_, golden_p_;
+  uint8_t skip_p_;
+#ifndef ONLY_KEYFRAME_CODE
+  uint8_t intra_p_, last_p_, golden_p_;
+  VP8Proba proba_saved_;
+  int update_proba_;
+#endif
 
   // Boundary data cache and persistent buffers.
   uint8_t* intra_t_;     // top intra modes values: 4 * mb_w_
@@ -264,7 +261,7 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io);
 // Predict a block and add residual
 void VP8ReconstructBlock(VP8Decoder* const dec);
 // Store a block, along with filtering params
-void VP8StoreBlock(VP8Decoder* const dec, VP8Io* const io);
+void VP8StoreBlock(VP8Decoder* const dec);
 // Finalize and transmit a complete row
 void VP8FinishRow(VP8Decoder* const dec, VP8Io* io);
 
diff --git a/third_party/libwebp/webp.c b/third_party/libwebp/webp.c
index 15a189d..f9a0eb9 100644
--- a/third_party/libwebp/webp.c
+++ b/third_party/libwebp/webp.c
@@ -76,87 +76,98 @@ typedef enum { MODE_RGB = 0, MODE_RGBA = 1,
 // we interpolate u/v as:
 //  ([9*a + 3*b + 3*c +   d    3*a + 9*b + 3*c +   d] + [8 8]) / 16
 //  ([3*a +   b + 9*c + 3*d      a + 3*b + 3*c + 9*d]   [8 8]) / 16
-#define MIX_ODD(a, b, c, d)        \
-  ((9 * (a) + 3 * ((b) + (c)) + (d) + 0x00080008u) >> 4)
-#define MIX_EVEN(a, b, c, d)       \
-  ((9 * (c) + 3 * ((d) + (a)) + (b) + 0x00080008u) >> 4)
 
 // We process u and v together stashed into 32bit (16bit each).
-// Note that we could store the pair (3*t_uv + uv, t_uv + 3*uv)
-// instead of (t_uv, uv), into a 64bit variable. Doing so, we could
-// simplify the MIXing a bit and save two multiplies. TODO(skal).
 #define LOAD_UV(u,v) ((u) | ((v) << 16))
 
-// Macro festival, so we can define all of rgb/bgr/rgba/bgra cases
-// for odd and even lines
-#define UPSCALE_FUNC(FUNC_NAME, MIX, FUNC, XSTEP)                        \
-static void FUNC_NAME(const uint8_t* cur_y,                              \
-                      const uint8_t* cur_u, const uint8_t* cur_v,        \
-                      const uint8_t* top_u, const uint8_t* top_v,        \
-                      int len, uint8_t* dst) {                           \
-  int x;                                                                 \
-  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */  \
-  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */      \
-  uint32_t uv0 = MIX(tl_uv, tl_uv, l_uv, l_uv);                          \
-  FUNC(cur_y[0], uv0 & 0xff, (uv0 >> 16), dst);                          \
-  len -= 1;   /* first pixel is done. */                                 \
-  for (x = 1; x <= (len >> 1); ++x) {                                    \
-    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */ \
-    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */     \
-    const uint32_t uv0  = MIX(tl_uv, t_uv, l_uv, uv);                    \
-    const uint32_t uv1  = MIX(t_uv, tl_uv, uv, l_uv);                    \
-    FUNC(cur_y[2*x-1], uv0 & 0xff, (uv0 >> 16), dst + (2*x-1) * XSTEP);  \
-    FUNC(cur_y[2*x  ], uv1 & 0xff, (uv1 >> 16), dst + (2*x  ) * XSTEP);  \
-    tl_uv = t_uv;                                                        \
-    l_uv = uv;                                                           \
-  }                                                                      \
-  if (len & 1) {                                                         \
-    uv0 = MIX(tl_uv, tl_uv, l_uv, l_uv);                                 \
-    FUNC(cur_y[len], uv0 & 0xff, (uv0 >> 16), dst + len * XSTEP);        \
-  }                                                                      \
-}                                                                        \
+#define UPSCALE_FUNC(FUNC_NAME, FUNC, XSTEP)                                   \
+static inline void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y,    \
+                             const uint8_t* top_u, const uint8_t* top_v,       \
+                             const uint8_t* cur_u, const uint8_t* cur_v,       \
+                             uint8_t* top_dst, uint8_t* bottom_dst, int len) { \
+  int x;                                                                       \
+  const int last_pixel_pair = (len - 1) >> 1;                                  \
+  uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]);   /* top-left sample */        \
+  uint32_t l_uv  = LOAD_UV(cur_u[0], cur_v[0]);   /* left-sample */            \
+  if (top_y) {                                                                 \
+    const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;                \
+    FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst);                          \
+  }                                                                            \
+  if (bottom_y) {                                                              \
+    const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;                \
+    FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst);                    \
+  }                                                                            \
+  for (x = 1; x <= last_pixel_pair; ++x) {                                     \
+    const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]);  /* top sample */       \
+    const uint32_t uv   = LOAD_UV(cur_u[x], cur_v[x]);  /* sample */           \
+    /* precompute invariant values associated with first and second diagonals*/\
+    const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u;               \
+    const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3;                   \
+    const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3;                    \
+    if (top_y) {                                                               \
+      const uint32_t uv0 = (diag_12 + tl_uv) >> 1;                             \
+      const uint32_t uv1 = (diag_03 + t_uv) >> 1;                              \
+      FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                          \
+           top_dst + (2 * x - 1) * XSTEP);                                     \
+      FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16),                          \
+           top_dst + (2 * x - 0) * XSTEP);                                     \
+    }                                                                          \
+    if (bottom_y) {                                                            \
+      const uint32_t uv0 = (diag_03 + l_uv) >> 1;                              \
+      const uint32_t uv1 = (diag_12 + uv) >> 1;                                \
+      FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16),                       \
+           bottom_dst + (2 * x - 1) * XSTEP);                                  \
+      FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16),                       \
+           bottom_dst + (2 * x + 0) * XSTEP);                                  \
+    }                                                                          \
+    tl_uv = t_uv;                                                              \
+    l_uv = uv;                                                                 \
+  }                                                                            \
+  if (!(len & 1)) {                                                            \
+    if (top_y) {                                                               \
+      const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2;              \
+      FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16),                            \
+           top_dst + (len - 1) * XSTEP);                                       \
+    }                                                                          \
+    if (bottom_y) {                                                            \
+      const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2;              \
+      FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16),                         \
+           bottom_dst + (len - 1) * XSTEP);                                    \
+    }                                                                          \
+  }                                                                            \
+}
 
 // All variants implemented.
-UPSCALE_FUNC(UpscaleEvenRgb,  MIX_EVEN, VP8YuvToRgb,  3)
-UPSCALE_FUNC(UpscaleOddRgb,   MIX_ODD,  VP8YuvToRgb,  3)
-UPSCALE_FUNC(UpscaleEvenBgr,  MIX_EVEN, VP8YuvToBgr,  3)
-UPSCALE_FUNC(UpscaleOddBgr,   MIX_ODD,  VP8YuvToBgr,  3)
-UPSCALE_FUNC(UpscaleEvenRgba, MIX_EVEN, VP8YuvToRgba, 4)
-UPSCALE_FUNC(UpscaleOddRgba,  MIX_ODD,  VP8YuvToRgba, 4)
-UPSCALE_FUNC(UpscaleEvenBgra, MIX_EVEN, VP8YuvToBgra, 4)
-UPSCALE_FUNC(UpscaleOddBgra,  MIX_ODD,  VP8YuvToBgra, 4)
+UPSCALE_FUNC(UpscaleRgbLinePair,  VP8YuvToRgb,  3)
+UPSCALE_FUNC(UpscaleBgrLinePair,  VP8YuvToBgr,  3)
+UPSCALE_FUNC(UpscaleRgbaLinePair, VP8YuvToRgba, 4)
+UPSCALE_FUNC(UpscaleBgraLinePair, VP8YuvToBgra, 4)
 
 // Main driver function.
-static inline void UpscaleLine(const uint8_t* cur_y,
-                               const uint8_t* cur_u, const uint8_t* cur_v,
-                               const uint8_t* top_u, const uint8_t* top_v,
-                               int len, uint8_t* dst, int odd, CSP_MODE mode) {
-  if (odd) {
-    if (mode == MODE_RGB) {
-      UpscaleOddRgb(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
-    } else if (mode == MODE_BGR) {
-      UpscaleOddBgr(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
-    } else if (mode == MODE_RGBA) {
-      UpscaleOddRgba(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
-    } else {
-      UpscaleOddBgra(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
-    }
+static inline
+void UpscaleLinePair(const uint8_t* top_y, const uint8_t* bottom_y,
+                     const uint8_t* top_u, const uint8_t* top_v,
+                     const uint8_t* cur_u, const uint8_t* cur_v,
+                     uint8_t* top_dst, uint8_t* bottom_dst, int len,
+                     CSP_MODE mode) {
+  if (mode == MODE_RGB) {
+    UpscaleRgbLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
+                       top_dst, bottom_dst, len);
+  } else if (mode == MODE_BGR) {
+    UpscaleBgrLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
+                       top_dst, bottom_dst, len);
+  } else if (mode == MODE_RGBA) {
+    UpscaleRgbaLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
+                        top_dst, bottom_dst, len);
   } else {
-    if (mode == MODE_RGB) {
-      UpscaleEvenRgb(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
-    } else if (mode == MODE_BGR) {
-      UpscaleEvenBgr(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
-    } else if (mode == MODE_RGBA) {
-      UpscaleEvenRgba(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
-    } else {
-      UpscaleEvenBgra(cur_y, cur_u, cur_v, top_u, top_v, len, dst);
-    }
+    assert(mode == MODE_BGRA);
+    UpscaleBgraLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v,
+                        top_dst, bottom_dst, len);
   }
 }
+
 #undef LOAD_UV
 #undef UPSCALE_FUNC
-#undef MIX_ODD
-#undef MIX_EVEN
 
 #endif  // FANCY_UPSCALING
 
@@ -180,6 +191,8 @@ static void CustomPut(const VP8Io* io) {
   const int uv_w = (w + 1) / 2;
   assert(!(io->mb_y & 1));
 
+  if (w <= 0 || mb_h <= 0) return;
+
   if (p->mode == MODE_YUV) {
     uint8_t* const y_dst = p->output + io->mb_y * p->stride;
     uint8_t* const u_dst = p->u + (io->mb_y >> 1) * p->u_stride;
@@ -196,65 +209,53 @@ static void CustomPut(const VP8Io* io) {
     uint8_t* dst = p->output + io->mb_y * p->stride;
     if (io->fancy_upscaling) {
 #ifdef FANCY_UPSCALING
-      const uint8_t* cur_y;
+      const uint8_t* cur_y = io->y;
       const uint8_t* cur_u = io->u;
       const uint8_t* cur_v = io->v;
       const uint8_t* top_u = p->top_u;
       const uint8_t* top_v = p->top_v;
       int y = io->mb_y;
-      int y_end = io->mb_y + io->mb_h - 1;
-      if (y > 0) {
-        // If mid-fly, we need to finish the previous line.
-        cur_y = p->top_y;
-        dst -= p->stride;
-        y -= 1;
+      int y_end = io->mb_y + io->mb_h;
+      if (y == 0) {
+        // First line is special cased. We mirror the u/v samples at boundary.
+        UpscaleLinePair(NULL, cur_y, cur_u, cur_v, cur_u, cur_v,
+                        NULL, dst, w, p->mode);
       } else {
-        // else we "replicate" the u/v sample of the first line
+        // We can finish the left-over line from the previous call.
+        UpscaleLinePair(p->top_y, cur_y, top_u, top_v, cur_u, cur_v,
+                        dst - p->stride, dst, w, p->mode);
+      }
+      // Loop over each pair of output rows.
+      for (; y + 2 < y_end; y += 2) {
         top_u = cur_u;
         top_v = cur_v;
-        // and start with the top line
-        cur_y = io->y;
+        cur_u += io->uv_stride;
+        cur_v += io->uv_stride;
+        dst += 2 * p->stride;
+        cur_y += 2 * io->y_stride;
+        UpscaleLinePair(cur_y - io->y_stride, cur_y,
+                        top_u, top_v, cur_u, cur_v,
+                        dst - p->stride, dst, w, p->mode);
       }
-      if (y_end >= io->height - 1) {
-        // for the very last rows, we can process them right now
-        y_end = io->height;
+      // move to last row
+      cur_y += io->y_stride;
+      if (y_end != io->height) {
+        // Save the unfinished samples for next call (as we're not done yet).
+        memcpy(p->top_y, cur_y, w * sizeof(*p->top_y));
+        memcpy(p->top_u, cur_u, uv_w * sizeof(*p->top_u));
+        memcpy(p->top_v, cur_v, uv_w * sizeof(*p->top_v));
       } else {
-        // we won't process the very last line this time,
-        // waiting for the next call instead.
-      }
-
-      // Loop over each output row.
-      for (; y < y_end; ++y) {
-        if (y & 1) {   // odd lines
-          UpscaleLine(cur_y, cur_u, cur_v, top_u, top_v, w, dst, 1, p->mode);
-        } else {       // even lines
-          UpscaleLine(cur_y, cur_u, cur_v, top_u, top_v, w, dst, 0, p->mode);
-          top_u = cur_u;
-          top_v = cur_v;
-          if (y < io->height - 2) {
-            cur_u += io->uv_stride;
-            cur_v += io->uv_stride;
-          }
+        // Process the very last row of an even-sized picture.
+        if (!(y_end & 1)) {
+          UpscaleLinePair(cur_y, NULL, cur_u, cur_v, cur_u, cur_v,
+                          dst + p->stride, NULL, w, p->mode);
         }
-        dst += p->stride;
-        if (cur_y == p->top_y) {
-          cur_y = io->y;
-        } else {
-          cur_y += io->y_stride;
-        }
-      }
-      // Save the unfinished samples for next call (if we're not done yet).
-      if (y < io->height - 1) {
-        memcpy(p->top_y, cur_y, w * sizeof(*p->top_y));
-        memcpy(p->top_u, top_u, uv_w * sizeof(*p->top_u));
-        memcpy(p->top_v, top_v, uv_w * sizeof(*p->top_v));
       }
 #else
       assert(0);  // shouldn't happen.
 #endif
     } else {
       // Point-sampling U/V upscaler.
-      // Could be implemented with special MIX functions, too.
       int j;
       for (j = 0; j < mb_h; ++j) {
         const uint8_t* y_src = io->y + j * io->y_stride;