diff options
author | jzern@chromium.org <jzern@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-01-09 22:25:42 +0000 |
---|---|---|
committer | jzern@chromium.org <jzern@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2012-01-09 22:25:42 +0000 |
commit | b7afef7463d25d0a6c366760ee507e62e4c052bd (patch) | |
tree | 85c318db49dfdb548cbb16a2e32f0c8d41d4a0cb /third_party | |
parent | dd32d8a6f739d059b2965a9b7ba8a6eabe96ecc3 (diff) | |
download | chromium_src-b7afef7463d25d0a6c366760ee507e62e4c052bd.zip chromium_src-b7afef7463d25d0a6c366760ee507e62e4c052bd.tar.gz chromium_src-b7afef7463d25d0a6c366760ee507e62e4c052bd.tar.bz2 |
libwebp: update snapshot to v0.1.3
adds sse2 optimizations for the encoder & decoder.
BUG=108376
TEST=webkit layout tests
Review URL: http://codereview.chromium.org/8529002
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@116933 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'third_party')
53 files changed, 8652 insertions, 1533 deletions
diff --git a/third_party/libwebp/README.chromium b/third_party/libwebp/README.chromium index 5e6b403..58871aa 100644 --- a/third_party/libwebp/README.chromium +++ b/third_party/libwebp/README.chromium @@ -1,16 +1,22 @@ -Name: libwebpdecode +Name: WebP image encoder/decoder Short Name: libwebp URL: http://code.google.com/speed/webp -Version: unknown -Revision: 0.1.2 -Included In Release: Yes +Version: v0.1.3 +License File: LICENSE +Security Critical: Yes Description: -Here is a copy of libwebp-decode, from the upstream project: - http://code.google.com/p/webp/downloads/detail?name=libwebp-0.1.2.tar.gz +Source archive: + http://code.google.com/p/webp/downloads/detail?name=libwebp-0.1.3.tar.gz -The project files do not include from the distribution: - examples/ +WebP is an image format that does lossy compression of digital photographic +images. WebP consists of a codec based on VP8, that Google open-sourced in May +2010 and a container based on RIFF. Webmasters, web developers and browser +developers can use WebP to compress, archive and distribute digital images more +efficiently. Local changes: * Removed from types.h the risky: #ifdef ANSI / #define inline /#endif + * Removed examples/, documentation and build related files, keeping only + the contents of src/. + * Merged COPYING/PATENTS to LICENSE diff --git a/third_party/libwebp/dec/alpha.c b/third_party/libwebp/dec/alpha.c new file mode 100644 index 0000000..3052ced --- /dev/null +++ b/third_party/libwebp/dec/alpha.c @@ -0,0 +1,69 @@ +// Copyright 2011 Google Inc. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Alpha-plane decompression. 
+// +// Author: Skal (pascal.massimino@gmail.com) + +#include <stdlib.h> +#include "vp8i.h" + +#ifdef WEBP_EXPERIMENTAL_FEATURES + +#include "zlib.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +//------------------------------------------------------------------------------ + +const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec, + int row, int num_rows) { + uint8_t* output = dec->alpha_plane_; + const int stride = dec->pic_hdr_.width_; + if (row < 0 || row + num_rows > dec->pic_hdr_.height_) { + return NULL; // sanity check + } + if (row == 0) { + // TODO(skal): for now, we just decompress everything during the first call. + // Later, we'll decode progressively, but we need to store the + // z_stream state. + const uint8_t* data = dec->alpha_data_; + size_t data_size = dec->alpha_data_size_; + const size_t output_size = stride * dec->pic_hdr_.height_; + int ret = Z_OK; + z_stream strm; + + memset(&strm, 0, sizeof(strm)); + if (inflateInit(&strm) != Z_OK) { + return 0; + } + strm.avail_in = data_size; + strm.next_in = (unsigned char*)data; + do { + strm.avail_out = output_size; + strm.next_out = output; + ret = inflate(&strm, Z_NO_FLUSH); + if (ret == Z_NEED_DICT || ret == Z_DATA_ERROR || ret == Z_MEM_ERROR) { + break; + } + } while (strm.avail_out == 0); + + inflateEnd(&strm); + if (ret != Z_STREAM_END) { + return NULL; // error + } + } + return output + row * stride; +} + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif + +#endif // WEBP_EXPERIMENTAL_FEATURES diff --git a/third_party/libwebp/dec/buffer.c b/third_party/libwebp/dec/buffer.c new file mode 100644 index 0000000..5de5e6f --- /dev/null +++ b/third_party/libwebp/dec/buffer.c @@ -0,0 +1,198 @@ +// Copyright 2011 Google Inc. 
+// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Everything about WebPDecBuffer +// +// Author: Skal (pascal.massimino@gmail.com) + +#include <stdlib.h> +#include "vp8i.h" +#include "webpi.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +//------------------------------------------------------------------------------ +// WebPDecBuffer + +// Number of bytes per pixel for the different color-spaces. +static const int kModeBpp[MODE_LAST] = { 3, 4, 3, 4, 4, 2, 2, 1, 1 }; + +static VP8StatusCode CheckDecBuffer(const WebPDecBuffer* const buffer) { + int ok = 1; + WEBP_CSP_MODE mode = buffer->colorspace; + const int width = buffer->width; + const int height = buffer->height; + if (mode >= MODE_YUV) { // YUV checks + const WebPYUVABuffer* const buf = &buffer->u.YUVA; + const int size = buf->y_stride * height; + const int u_size = buf->u_stride * ((height + 1) / 2); + const int v_size = buf->v_stride * ((height + 1) / 2); + const int a_size = buf->a_stride * height; + ok &= (size <= buf->y_size); + ok &= (u_size <= buf->u_size); + ok &= (v_size <= buf->v_size); + ok &= (a_size <= buf->a_size); + ok &= (buf->y_stride >= width); + ok &= (buf->u_stride >= (width + 1) / 2); + ok &= (buf->v_stride >= (width + 1) / 2); + if (buf->a) { + ok &= (buf->a_stride >= width); + } + } else { // RGB checks + const WebPRGBABuffer* const buf = &buffer->u.RGBA; + ok &= (buf->stride * height <= buf->size); + ok &= (buf->stride >= width * kModeBpp[mode]); + } + return ok ? 
VP8_STATUS_OK : VP8_STATUS_INVALID_PARAM; +} + +static VP8StatusCode AllocateBuffer(WebPDecBuffer* const buffer) { + const int w = buffer->width; + const int h = buffer->height; + + if (w <= 0 || h <= 0) { + return VP8_STATUS_INVALID_PARAM; + } + + if (!buffer->is_external_memory && buffer->private_memory == NULL) { + uint8_t* output; + WEBP_CSP_MODE mode = buffer->colorspace; + int stride; + int uv_stride = 0, a_stride = 0; + int uv_size = 0; + uint64_t size, a_size = 0, total_size; + // We need memory and it hasn't been allocated yet. + // => initialize output buffer, now that dimensions are known. + stride = w * kModeBpp[mode]; + size = (uint64_t)stride * h; + + if (mode >= MODE_YUV) { + uv_stride = (w + 1) / 2; + uv_size = (uint64_t)uv_stride * ((h + 1) / 2); + if (mode == MODE_YUVA) { + a_stride = w; + a_size = (uint64_t)a_stride * h; + } + } + total_size = size + 2 * uv_size + a_size; + + // Security/sanity checks + if (((size_t)total_size != total_size) || (total_size >= (1ULL << 40))) { + return VP8_STATUS_INVALID_PARAM; + } + + buffer->private_memory = output = (uint8_t*)malloc((size_t)total_size); + if (output == NULL) { + return VP8_STATUS_OUT_OF_MEMORY; + } + + if (mode >= MODE_YUV) { // YUVA initialization + WebPYUVABuffer* const buf = &buffer->u.YUVA; + buf->y = output; + buf->y_stride = stride; + buf->y_size = size; + buf->u = output + size; + buf->u_stride = uv_stride; + buf->u_size = uv_size; + buf->v = output + size + uv_size; + buf->v_stride = uv_stride; + buf->v_size = uv_size; + if (mode == MODE_YUVA) { + buf->a = output + size + 2 * uv_size; + } + buf->a_size = a_size; + buf->a_stride = a_stride; + } else { // RGBA initialization + WebPRGBABuffer* const buf = &buffer->u.RGBA; + buf->rgba = output; + buf->stride = stride; + buf->size = size; + } + } + return CheckDecBuffer(buffer); +} + +VP8StatusCode WebPAllocateDecBuffer(int w, int h, + const WebPDecoderOptions* const options, + WebPDecBuffer* const out) { + if (out == NULL || w <= 0 || h <= 
0) { + return VP8_STATUS_INVALID_PARAM; + } + if (options != NULL) { // First, apply options if there is any. + if (options->use_cropping) { + const int cw = options->crop_width; + const int ch = options->crop_height; + const int x = options->crop_left & ~1; + const int y = options->crop_top & ~1; + if (x < 0 || y < 0 || cw <= 0 || ch <= 0 || x + cw > w || y + ch > h) { + return VP8_STATUS_INVALID_PARAM; // out of frame boundary. + } + w = cw; + h = ch; + } + if (options->use_scaling) { + if (options->scaled_width <= 0 || options->scaled_height <= 0) { + return VP8_STATUS_INVALID_PARAM; + } + w = options->scaled_width; + h = options->scaled_height; + } + } + out->width = w; + out->height = h; + + // Then, allocate buffer for real + return AllocateBuffer(out); +} + +//------------------------------------------------------------------------------ +// constructors / destructors + +int WebPInitDecBufferInternal(WebPDecBuffer* const buffer, int version) { + if (version != WEBP_DECODER_ABI_VERSION) return 0; // version mismatch + if (!buffer) return 0; + memset(buffer, 0, sizeof(*buffer)); + return 1; +} + +void WebPFreeDecBuffer(WebPDecBuffer* const buffer) { + if (buffer) { + if (!buffer->is_external_memory) + free(buffer->private_memory); + buffer->private_memory = NULL; + } +} + +void WebPCopyDecBuffer(const WebPDecBuffer* const src, + WebPDecBuffer* const dst) { + if (src && dst) { + *dst = *src; + if (src->private_memory) { + dst->is_external_memory = 1; // dst buffer doesn't own the memory. + dst->private_memory = NULL; + } + } +} + +// Copy and transfer ownership from src to dst (beware of parameter order!) 
+void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst) { + if (src && dst) { + *dst = *src; + if (src->private_memory) { + src->is_external_memory = 1; // src relinquishes ownership + src->private_memory = NULL; + } + } +} + +//------------------------------------------------------------------------------ + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif diff --git a/third_party/libwebp/dec/frame.c b/third_party/libwebp/dec/frame.c index 44c6357..887e565 100644 --- a/third_party/libwebp/dec/frame.c +++ b/third_party/libwebp/dec/frame.c @@ -10,7 +10,7 @@ // Author: Skal (pascal.massimino@gmail.com) #include <stdlib.h> -#include "vp8i.h" +#include "./vp8i.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { @@ -18,25 +18,84 @@ extern "C" { #define ALIGN_MASK (32 - 1) -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ +// For multi-threaded decoding we need to use 3 rows of 16 pixels as delay line. +// +// Reason is: the deblocking filter cannot deblock the bottom horizontal edges +// immediately, and needs to wait for first few rows of the next macroblock to +// be decoded. Hence, deblocking is lagging behind by 4 or 8 pixels (depending +// on strength). +// With two threads, the vertical positions of the rows being decoded are: +// Decode: [ 0..15][16..31][32..47][48..63][64..79][... +// Deblock: [ 0..11][12..27][28..43][44..59][... +// If we use two threads and two caches of 16 pixels, the sequence would be: +// Decode: [ 0..15][16..31][ 0..15!!][16..31][ 0..15][... +// Deblock: [ 0..11][12..27!!][-4..11][12..27][... +// The problem occurs during row [12..15!!] that both the decoding and +// deblocking threads are writing simultaneously. +// With 3 cache lines, one get a safe write pattern: +// Decode: [ 0..15][16..31][32..47][ 0..15][16..31][32..47][0.. 
+// Deblock: [ 0..11][12..27][28..43][-4..11][12..27][28... +// Note that multi-threaded output _without_ deblocking can make use of two +// cache lines of 16 pixels only, since there's no lagging behind. The decoding +// and output process have non-concurrent writing: +// Decode: [ 0..15][16..31][ 0..15][16..31][... +// io->put: [ 0..15][16..31][ 0..15][... + +#define MT_CACHE_LINES 3 +#define ST_CACHE_LINES 1 // 1 cache row only for single-threaded case + +// Initialize multi/single-thread worker +static int InitThreadContext(VP8Decoder* const dec) { + dec->cache_id_ = 0; + if (dec->use_threads_) { + WebPWorker* const worker = &dec->worker_; + if (!WebPWorkerReset(worker)) { + return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY, + "thread initialization failed."); + } + worker->data1 = dec; + worker->data2 = (void*)&dec->thread_ctx_.io_; + worker->hook = (WebPWorkerHook)VP8FinishRow; + dec->num_caches_ = + (dec->filter_type_ > 0) ? MT_CACHE_LINES : MT_CACHE_LINES - 1; + } else { + dec->num_caches_ = ST_CACHE_LINES; + } + return 1; +} + +//------------------------------------------------------------------------------ // Memory setup -// how many extra luma lines are needed for caching, given a filtering level -static const uint8_t kFilterExtraRows[3] = { 0, 4, 8 }; +// kFilterExtraRows[] = How many extra lines are needed on the MB boundary +// for caching, given a filtering level. +// Simple filter: up to 2 luma samples are read and 1 is written. +// Complex filter: up to 4 luma samples are read and 3 are written. Same for +// U/V, so it's 8 samples total (because of the 2x upsampling). 
+static const uint8_t kFilterExtraRows[3] = { 0, 2, 8 }; -int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) { +static int AllocateMemory(VP8Decoder* const dec) { + const int num_caches = dec->num_caches_; const int mb_w = dec->mb_w_; const int intra_pred_mode_size = 4 * mb_w * sizeof(uint8_t); const int top_size = (16 + 8 + 8) * mb_w; - const int info_size = (mb_w + 1) * sizeof(VP8MB); + const int mb_info_size = (mb_w + 1) * sizeof(VP8MB); + const int f_info_size = + (dec->filter_type_ > 0) ? + mb_w * (dec->use_threads_ ? 2 : 1) * sizeof(VP8FInfo) + : 0; const int yuv_size = YUV_SIZE * sizeof(*dec->yuv_b_); const int coeffs_size = 384 * sizeof(*dec->coeffs_); - const int cache_height = (16 + kFilterExtraRows[dec->filter_type_]) * 3 / 2; + const int cache_height = (16 * num_caches + + kFilterExtraRows[dec->filter_type_]) * 3 / 2; const int cache_size = top_size * cache_height; + const int alpha_size = + dec->alpha_data_ ? (dec->pic_hdr_.width_ * dec->pic_hdr_.height_) : 0; const int needed = intra_pred_mode_size - + top_size + info_size + + top_size + mb_info_size + f_info_size + yuv_size + coeffs_size - + cache_size + ALIGN_MASK; + + cache_size + alpha_size + ALIGN_MASK; uint8_t* mem; if (needed > dec->mem_size_) { @@ -62,7 +121,18 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) { mem += 8 * mb_w; dec->mb_info_ = ((VP8MB*)mem) + 1; - mem += info_size; + mem += mb_info_size; + + dec->f_info_ = f_info_size ? (VP8FInfo*)mem : NULL; + mem += f_info_size; + dec->thread_ctx_.id_ = 0; + dec->thread_ctx_.f_info_ = dec->f_info_; + if (dec->use_threads_) { + // secondary cache line. The deblocking process needs to make use of the + // filtering strength from previous macroblock row, while the new ones + // are being decoded in parallel. We'll just swap the pointers. 
+ dec->thread_ctx_.f_info_ += mb_w; + } mem = (uint8_t*)((uintptr_t)(mem + ALIGN_MASK) & ~ALIGN_MASK); assert((yuv_size & ALIGN_MASK) == 0); @@ -79,36 +149,48 @@ int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) { const int extra_y = extra_rows * dec->cache_y_stride_; const int extra_uv = (extra_rows / 2) * dec->cache_uv_stride_; dec->cache_y_ = ((uint8_t*)mem) + extra_y; - dec->cache_u_ = dec->cache_y_ + 16 * dec->cache_y_stride_ + extra_uv; - dec->cache_v_ = dec->cache_u_ + 8 * dec->cache_uv_stride_ + extra_uv; + dec->cache_u_ = dec->cache_y_ + + 16 * num_caches * dec->cache_y_stride_ + extra_uv; + dec->cache_v_ = dec->cache_u_ + + 8 * num_caches * dec->cache_uv_stride_ + extra_uv; + dec->cache_id_ = 0; } mem += cache_size; + // alpha plane + dec->alpha_plane_ = alpha_size ? (uint8_t*)mem : NULL; + mem += alpha_size; + // note: left-info is initialized once for all. - memset(dec->mb_info_ - 1, 0, (mb_w + 1) * sizeof(*dec->mb_info_)); + memset(dec->mb_info_ - 1, 0, mb_info_size); // initialize top memset(dec->intra_t_, B_DC_PRED, intra_pred_mode_size); + return 1; +} + +static void InitIo(VP8Decoder* const dec, VP8Io* io) { // prepare 'io' - io->width = dec->pic_hdr_.width_; - io->height = dec->pic_hdr_.height_; io->mb_y = 0; io->y = dec->cache_y_; io->u = dec->cache_u_; io->v = dec->cache_v_; io->y_stride = dec->cache_y_stride_; io->uv_stride = dec->cache_uv_stride_; - io->fancy_upscaling = 0; // default - - // Init critical function pointers and look-up tables. - VP8DspInitTables(); - VP8DspInit(); + io->fancy_upsampling = 0; // default + io->a = NULL; +} +int VP8InitFrame(VP8Decoder* const dec, VP8Io* io) { + if (!InitThreadContext(dec)) return 0; // call first. Sets dec->num_caches_. + if (!AllocateMemory(dec)) return 0; + InitIo(dec, io); + VP8DspInit(); // Init critical function pointers and look-up tables. 
return 1; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Filtering static inline int hev_thresh_from_level(int level, int keyframe) { @@ -119,12 +201,13 @@ static inline int hev_thresh_from_level(int level, int keyframe) { } } -static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) { - VP8MB* const mb = dec->mb_info_ + mb_x; - uint8_t* const y_dst = dec->cache_y_ + mb_x * 16; +static void DoFilter(const VP8Decoder* const dec, int mb_x, int mb_y) { + const VP8ThreadContext* const ctx = &dec->thread_ctx_; const int y_bps = dec->cache_y_stride_; - const int level = mb->f_level_; - const int ilevel = mb->f_ilevel_; + VP8FInfo* const f_info = ctx->f_info_ + mb_x; + uint8_t* const y_dst = dec->cache_y_ + ctx->id_ * 16 * y_bps + mb_x * 16; + const int level = f_info->f_level_; + const int ilevel = f_info->f_ilevel_; const int limit = 2 * level + ilevel; if (level == 0) { return; @@ -133,26 +216,26 @@ static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) { if (mb_x > 0) { VP8SimpleHFilter16(y_dst, y_bps, limit + 4); } - if (mb->f_inner_) { + if (f_info->f_inner_) { VP8SimpleHFilter16i(y_dst, y_bps, limit); } if (mb_y > 0) { VP8SimpleVFilter16(y_dst, y_bps, limit + 4); } - if (mb->f_inner_) { + if (f_info->f_inner_) { VP8SimpleVFilter16i(y_dst, y_bps, limit); } } else { // complex - uint8_t* const u_dst = dec->cache_u_ + mb_x * 8; - uint8_t* const v_dst = dec->cache_v_ + mb_x * 8; const int uv_bps = dec->cache_uv_stride_; + uint8_t* const u_dst = dec->cache_u_ + ctx->id_ * 8 * uv_bps + mb_x * 8; + uint8_t* const v_dst = dec->cache_v_ + ctx->id_ * 8 * uv_bps + mb_x * 8; const int hev_thresh = hev_thresh_from_level(level, dec->frm_hdr_.key_frame_); if (mb_x > 0) { VP8HFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh); VP8HFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh); } - if (mb->f_inner_) { + if (f_info->f_inner_) { 
VP8HFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh); VP8HFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh); } @@ -160,16 +243,29 @@ static void DoFilter(VP8Decoder* const dec, int mb_x, int mb_y) { VP8VFilter16(y_dst, y_bps, limit + 4, ilevel, hev_thresh); VP8VFilter8(u_dst, v_dst, uv_bps, limit + 4, ilevel, hev_thresh); } - if (mb->f_inner_) { + if (f_info->f_inner_) { VP8VFilter16i(y_dst, y_bps, limit, ilevel, hev_thresh); VP8VFilter8i(u_dst, v_dst, uv_bps, limit, ilevel, hev_thresh); } } } +// Filter the decoded macroblock row (if needed) +static void FilterRow(const VP8Decoder* const dec) { + int mb_x; + const int mb_y = dec->thread_ctx_.mb_y_; + assert(dec->thread_ctx_.filter_row_); + for (mb_x = dec->tl_mb_x_; mb_x < dec->br_mb_x_; ++mb_x) { + DoFilter(dec, mb_x, mb_y); + } +} + +//------------------------------------------------------------------------------ + void VP8StoreBlock(VP8Decoder* const dec) { if (dec->filter_type_ > 0) { - VP8MB* const info = dec->mb_info_ + dec->mb_x_; + VP8FInfo* const info = dec->f_info_ + dec->mb_x_; + const int skip = dec->mb_info_[dec->mb_x_].skip_; int level = dec->filter_levels_[dec->segment_]; if (dec->filter_hdr_.use_lf_delta_) { // TODO(skal): only CURRENT is handled for now. @@ -193,14 +289,16 @@ void VP8StoreBlock(VP8Decoder* const dec) { } info->f_ilevel_ = (level < 1) ? 
1 : level; - info->f_inner_ = (!info->skip_ || dec->is_i4x4_); + info->f_inner_ = (!skip || dec->is_i4x4_); } { // Transfer samples to row cache int y; - uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16; - uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8; - uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8; + const int y_offset = dec->cache_id_ * 16 * dec->cache_y_stride_; + const int uv_offset = dec->cache_id_ * 8 * dec->cache_uv_stride_; + uint8_t* const ydst = dec->cache_y_ + dec->mb_x_ * 16 + y_offset; + uint8_t* const udst = dec->cache_u_ + dec->mb_x_ * 8 + uv_offset; + uint8_t* const vdst = dec->cache_v_ + dec->mb_x_ * 8 + uv_offset; for (y = 0; y < 16; ++y) { memcpy(ydst + y * dec->cache_y_stride_, dec->yuv_b_ + Y_OFF + y * BPS, 16); @@ -214,56 +312,205 @@ void VP8StoreBlock(VP8Decoder* const dec) { } } +//------------------------------------------------------------------------------ +// This function is called after a row of macroblocks is finished decoding. +// It also takes into account the following restrictions: +// * In case of in-loop filtering, we must hold off sending some of the bottom +// pixels as they are yet unfiltered. They will be when the next macroblock +// row is decoded. Meanwhile, we must preserve them by rotating them in the +// cache area. This doesn't hold for the very bottom row of the uncropped +// picture of course. +// * we must clip the remaining pixels against the cropping area. The VP8Io +// struct must have the following fields set correctly before calling put(): + +#define MACROBLOCK_VPOS(mb_y) ((mb_y) * 16) // vertical position of a MB + +// Finalize and transmit a complete row. Return false in case of user-abort. 
int VP8FinishRow(VP8Decoder* const dec, VP8Io* io) { + int ok = 1; + const VP8ThreadContext* const ctx = &dec->thread_ctx_; const int extra_y_rows = kFilterExtraRows[dec->filter_type_]; const int ysize = extra_y_rows * dec->cache_y_stride_; const int uvsize = (extra_y_rows / 2) * dec->cache_uv_stride_; - const int first_row = (dec->mb_y_ == 0); - const int last_row = (dec->mb_y_ >= dec->mb_h_ - 1); - uint8_t* const ydst = dec->cache_y_ - ysize; - uint8_t* const udst = dec->cache_u_ - uvsize; - uint8_t* const vdst = dec->cache_v_ - uvsize; - if (dec->filter_type_ > 0) { - int mb_x; - for (mb_x = 0; mb_x < dec->mb_w_; ++mb_x) { - DoFilter(dec, mb_x, dec->mb_y_); - } + const int y_offset = ctx->id_ * 16 * dec->cache_y_stride_; + const int uv_offset = ctx->id_ * 8 * dec->cache_uv_stride_; + uint8_t* const ydst = dec->cache_y_ - ysize + y_offset; + uint8_t* const udst = dec->cache_u_ - uvsize + uv_offset; + uint8_t* const vdst = dec->cache_v_ - uvsize + uv_offset; + const int first_row = (ctx->mb_y_ == 0); + const int last_row = (ctx->mb_y_ >= dec->br_mb_y_ - 1); + int y_start = MACROBLOCK_VPOS(ctx->mb_y_); + int y_end = MACROBLOCK_VPOS(ctx->mb_y_ + 1); + + if (ctx->filter_row_) { + FilterRow(dec); } + if (io->put) { - int y_start = dec->mb_y_ * 16; - int y_end = y_start + 16; if (!first_row) { y_start -= extra_y_rows; io->y = ydst; io->u = udst; io->v = vdst; } else { - io->y = dec->cache_y_; - io->u = dec->cache_u_; - io->v = dec->cache_v_; + io->y = dec->cache_y_ + y_offset; + io->u = dec->cache_u_ + uv_offset; + io->v = dec->cache_v_ + uv_offset; } + if (!last_row) { y_end -= extra_y_rows; } - if (y_end > io->height) { - y_end = io->height; + if (y_end > io->crop_bottom) { + y_end = io->crop_bottom; // make sure we don't overflow on last row. 
+ } + io->a = NULL; +#ifdef WEBP_EXPERIMENTAL_FEATURES + if (dec->alpha_data_) { + io->a = VP8DecompressAlphaRows(dec, y_start, y_end - y_start); + if (io->a == NULL) { + return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR, + "Could not decode alpha data."); + } } - io->mb_y = y_start; - io->mb_h = y_end - y_start; - if (!io->put(io)) { - return 0; +#endif + if (y_start < io->crop_top) { + const int delta_y = io->crop_top - y_start; + y_start = io->crop_top; + assert(!(delta_y & 1)); + io->y += dec->cache_y_stride_ * delta_y; + io->u += dec->cache_uv_stride_ * (delta_y >> 1); + io->v += dec->cache_uv_stride_ * (delta_y >> 1); + if (io->a) { + io->a += io->width * delta_y; + } + } + if (y_start < y_end) { + io->y += io->crop_left; + io->u += io->crop_left >> 1; + io->v += io->crop_left >> 1; + if (io->a) { + io->a += io->crop_left; + } + io->mb_y = y_start - io->crop_top; + io->mb_w = io->crop_right - io->crop_left; + io->mb_h = y_end - y_start; + ok = io->put(io); } } - // rotate top samples - if (!last_row) { - memcpy(ydst, ydst + 16 * dec->cache_y_stride_, ysize); - memcpy(udst, udst + 8 * dec->cache_uv_stride_, uvsize); - memcpy(vdst, vdst + 8 * dec->cache_uv_stride_, uvsize); + // rotate top samples if needed + if (ctx->id_ + 1 == dec->num_caches_) { + if (!last_row) { + memcpy(dec->cache_y_ - ysize, ydst + 16 * dec->cache_y_stride_, ysize); + memcpy(dec->cache_u_ - uvsize, udst + 8 * dec->cache_uv_stride_, uvsize); + memcpy(dec->cache_v_ - uvsize, vdst + 8 * dec->cache_uv_stride_, uvsize); + } } - return 1; + + return ok; +} + +#undef MACROBLOCK_VPOS + +//------------------------------------------------------------------------------ + +int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io) { + int ok = 1; + VP8ThreadContext* const ctx = &dec->thread_ctx_; + if (!dec->use_threads_) { + // ctx->id_ and ctx->f_info_ are already set + ctx->mb_y_ = dec->mb_y_; + ctx->filter_row_ = dec->filter_row_; + ok = VP8FinishRow(dec, io); + } else { + WebPWorker* const 
worker = &dec->worker_; + // Finish previous job *before* updating context + ok &= WebPWorkerSync(worker); + assert(worker->status_ == OK); + if (ok) { // spawn a new deblocking/output job + ctx->io_ = *io; + ctx->id_ = dec->cache_id_; + ctx->mb_y_ = dec->mb_y_; + ctx->filter_row_ = dec->filter_row_; + if (ctx->filter_row_) { // just swap filter info + VP8FInfo* const tmp = ctx->f_info_; + ctx->f_info_ = dec->f_info_; + dec->f_info_ = tmp; + } + WebPWorkerLaunch(worker); + if (++dec->cache_id_ == dec->num_caches_) { + dec->cache_id_ = 0; + } + } + } + return ok; +} + +//------------------------------------------------------------------------------ +// Finish setting up the decoding parameter once user's setup() is called. + +VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io) { + // Call setup() first. This may trigger additional decoding features on 'io'. + // Note: Afterward, we must call teardown() no matter what. + if (io->setup && !io->setup(io)) { + VP8SetError(dec, VP8_STATUS_USER_ABORT, "Frame setup failed"); + return dec->status_; + } + + // Disable filtering per user request + if (io->bypass_filtering) { + dec->filter_type_ = 0; + } + // TODO(skal): filter type / strength / sharpness forcing + + // Define the area where we can skip in-loop filtering, in case of cropping. + // + // 'Simple' filter reads two luma samples outside of the macroblock and + // filters one. It doesn't filter the chroma samples. Hence, we can + // avoid doing the in-loop filtering before crop_top/crop_left position. + // For the 'Complex' filter, 3 samples are read and up to 3 are filtered. + // Means: there's a dependency chain that goes all the way up to the + // top-left corner of the picture (MB #0). We must filter all the previous + // macroblocks. + // TODO(skal): add an 'approximate_decoding' option, that won't produce + // a 1:1 bit-exactness for complex filtering? 
+ { + const int extra_pixels = kFilterExtraRows[dec->filter_type_]; + if (dec->filter_type_ == 2) { + // For complex filter, we need to preserve the dependency chain. + dec->tl_mb_x_ = 0; + dec->tl_mb_y_ = 0; + } else { + // For simple filter, we can filter only the cropped region. + dec->tl_mb_y_ = io->crop_top >> 4; + dec->tl_mb_x_ = io->crop_left >> 4; + } + // We need some 'extra' pixels on the right/bottom. + dec->br_mb_y_ = (io->crop_bottom + 15 + extra_pixels) >> 4; + dec->br_mb_x_ = (io->crop_right + 15 + extra_pixels) >> 4; + if (dec->br_mb_x_ > dec->mb_w_) { + dec->br_mb_x_ = dec->mb_w_; + } + if (dec->br_mb_y_ > dec->mb_h_) { + dec->br_mb_y_ = dec->mb_h_; + } + } + return VP8_STATUS_OK; +} + +int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io) { + int ok = 1; + if (dec->use_threads_) { + ok = WebPWorkerSync(&dec->worker_); + } + + if (io->teardown) { + io->teardown(io); + } + return ok; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Main reconstruction function. 
static const int kScan[16] = { @@ -358,7 +605,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) { uint8_t* const dst = y_dst + kScan[n]; VP8PredLuma4[dec->imodes_[n]](dst); if (dec->non_zero_ac_ & (1 << n)) { - VP8Transform(coeffs + n * 16, dst); + VP8Transform(coeffs + n * 16, dst, 0); } else if (dec->non_zero_ & (1 << n)) { // only DC is present VP8TransformDC(coeffs + n * 16, dst); } @@ -370,7 +617,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) { for (n = 0; n < 16; n++) { uint8_t* const dst = y_dst + kScan[n]; if (dec->non_zero_ac_ & (1 << n)) { - VP8Transform(coeffs + n * 16, dst); + VP8Transform(coeffs + n * 16, dst, 0); } else if (dec->non_zero_ & (1 << n)) { // only DC is present VP8TransformDC(coeffs + n * 16, dst); } @@ -410,7 +657,7 @@ void VP8ReconstructBlock(VP8Decoder* const dec) { } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/dec/idec.c b/third_party/libwebp/dec/idec.c index d49ceb0..048d3c5 100644 --- a/third_party/libwebp/dec/idec.c +++ b/third_party/libwebp/dec/idec.c @@ -15,15 +15,11 @@ #include "webpi.h" #include "vp8i.h" -#include "yuv.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -#define RIFF_HEADER_SIZE 20 -#define VP8_HEADER_SIZE 10 -#define WEBP_HEADER_SIZE (RIFF_HEADER_SIZE + VP8_HEADER_SIZE) #define CHUNK_SIZE 4096 #define MAX_MB_SIZE 4096 @@ -32,14 +28,20 @@ extern "C" { // Decoding states. State normally flows like HEADER->PARTS0->DATA->DONE. // If there is any error the decoder goes into state ERROR. -typedef enum { STATE_HEADER = 0, STATE_PARTS0 = 1, - STATE_DATA = 2, STATE_DONE = 3, - STATE_ERROR = 4 +typedef enum { + STATE_PRE_VP8, // All data before that of the first VP8 chunk. + STATE_VP8_FRAME_HEADER, // For VP8 Frame header (within VP8 chunk). 
+ STATE_VP8_PARTS0, + STATE_VP8_DATA, + STATE_DONE, + STATE_ERROR } DecState; // Operating state for the MemBuffer -typedef enum { MEM_MODE_NONE = 0, - MEM_MODE_APPEND, MEM_MODE_MAP +typedef enum { + MEM_MODE_NONE = 0, + MEM_MODE_APPEND, + MEM_MODE_MAP } MemBufferMode; // storage for partition #0 and partial data (in a rolling fashion) @@ -56,12 +58,13 @@ typedef struct { struct WebPIDecoder { DecState state_; // current decoding state - int w_, h_; // width and height WebPDecParams params_; // Params to store output info VP8Decoder* dec_; VP8Io io_; - MemBuffer mem_; // memory buffer + MemBuffer mem_; // input memory buffer. + WebPDecBuffer output_; // output buffer (when no external one is supplied) + uint32_t vp8_size_; // VP8 size extracted from VP8 Header. }; // MB context to restore in case VP8DecodeMB() fails @@ -229,43 +232,63 @@ static void RestoreContext(const MBContext* context, VP8Decoder* const dec, //------------------------------------------------------------------------------ -static VP8StatusCode IDecError(WebPIDecoder* idec, VP8StatusCode error) { +static VP8StatusCode IDecError(WebPIDecoder* const idec, VP8StatusCode error) { + if (idec->state_ == STATE_VP8_DATA) { + VP8Io* const io = &idec->io_; + if (io->teardown) { + io->teardown(io); + } + } idec->state_ = STATE_ERROR; return error; } -// Header -static VP8StatusCode DecodeHeader(WebPIDecoder* const idec) { - int width, height; - uint32_t curr_size, riff_header_size, bits; - WebPDecParams* params = &idec->params_; +static void ChangeState(WebPIDecoder* const idec, DecState new_state, + uint32_t consumed_bytes) { + idec->state_ = new_state; + idec->mem_.start_ += consumed_bytes; + assert(idec->mem_.start_ <= idec->mem_.end_); +} + +// Headers +static VP8StatusCode DecodeWebPHeaders(WebPIDecoder* const idec) { const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_; + uint32_t curr_size = MemDataSize(&idec->mem_); + uint32_t vp8_size; + uint32_t bytes_skipped; + VP8StatusCode status; - if 
(MemDataSize(&idec->mem_) < WEBP_HEADER_SIZE) { - return VP8_STATUS_SUSPENDED; + status = WebPParseHeaders(&data, &curr_size, &vp8_size, &bytes_skipped); + if (status == VP8_STATUS_NOT_ENOUGH_DATA) { + return VP8_STATUS_SUSPENDED; // We haven't found a VP8 chunk yet. + } else if (status == VP8_STATUS_OK) { + idec->vp8_size_ = vp8_size; + ChangeState(idec, STATE_VP8_FRAME_HEADER, bytes_skipped); + return VP8_STATUS_OK; // We have skipped all pre-VP8 chunks. + } else { + return IDecError(idec, status); } +} - if (!WebPInitDecParams(data, idec->mem_.end_, &width, &height, params)) { - return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR); - } +static VP8StatusCode DecodeVP8FrameHeader(WebPIDecoder* const idec) { + const uint8_t* data = idec->mem_.buf_ + idec->mem_.start_; + const uint32_t curr_size = MemDataSize(&idec->mem_); + uint32_t bits; - // Validate and Skip over RIFF header - curr_size = MemDataSize(&idec->mem_); - if (!WebPCheckRIFFHeader(&data, &curr_size)) { + if (curr_size < VP8_FRAME_HEADER_SIZE) { + // Not enough data bytes to extract VP8 Frame Header. 
+ return VP8_STATUS_SUSPENDED; + } + if (!VP8GetInfo(data, curr_size, idec->vp8_size_, NULL, NULL, NULL)) { return IDecError(idec, VP8_STATUS_BITSTREAM_ERROR); } - riff_header_size = idec->mem_.end_ - curr_size; - bits = data[0] | (data[1] << 8) | (data[2] << 16); - idec->mem_.part0_size_ = (bits >> 5) + VP8_HEADER_SIZE; - idec->mem_.start_ += riff_header_size; - assert(idec->mem_.start_ <= idec->mem_.end_); + bits = data[0] | (data[1] << 8) | (data[2] << 16); + idec->mem_.part0_size_ = (bits >> 5) + VP8_FRAME_HEADER_SIZE; - idec->w_ = width; - idec->h_ = height; - idec->io_.data_size -= riff_header_size; + idec->io_.data_size = curr_size; idec->io_.data = data; - idec->state_ = STATE_PARTS0; + idec->state_ = STATE_VP8_PARTS0; return VP8_STATUS_OK; } @@ -298,14 +321,13 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) { VP8Decoder* const dec = idec->dec_; VP8Io* const io = &idec->io_; const WebPDecParams* const params = &idec->params_; - const WEBP_CSP_MODE mode = params->mode; + WebPDecBuffer* const output = params->output; // Wait till we have enough data for the whole partition #0 if (MemDataSize(&idec->mem_) < idec->mem_.part0_size_) { return VP8_STATUS_SUSPENDED; } - io->opaque = &idec->params_; if (!VP8GetHeaders(dec, io)) { const VP8StatusCode status = dec->status_; if (status == VP8_STATUS_SUSPENDED || @@ -316,36 +338,35 @@ static VP8StatusCode DecodePartition0(WebPIDecoder* const idec) { return IDecError(idec, status); } - if (!WebPCheckDecParams(io, params)) { - return IDecError(idec, VP8_STATUS_INVALID_PARAM); + // Allocate/Verify output buffer now + dec->status_ = WebPAllocateDecBuffer(io->width, io->height, params->options, + output); + if (dec->status_ != VP8_STATUS_OK) { + return IDecError(idec, dec->status_); } - if (mode != MODE_YUV) { - VP8YUVInit(); - } - - // allocate memory and prepare everything. 
- if (!VP8InitFrame(dec, io)) { + if (!CopyParts0Data(idec)) { return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY); } - if (io->setup && !io->setup(io)) { - return IDecError(idec, VP8_STATUS_USER_ABORT); - } - // disable filtering per user request (_after_ setup() is called) - if (io->bypass_filtering) dec->filter_type_ = 0; - - if (!CopyParts0Data(idec)) { - return IDecError(idec, VP8_STATUS_OUT_OF_MEMORY); + // Finish setting up the decoding parameters. Will call io->setup(). + if (VP8EnterCritical(dec, io) != VP8_STATUS_OK) { + return IDecError(idec, dec->status_); } - idec->state_ = STATE_DATA; + // Note: past this point, teardown() must always be called + // in case of error. + idec->state_ = STATE_VP8_DATA; + // Allocate memory and prepare everything. + if (!VP8InitFrame(dec, io)) { + return IDecError(idec, dec->status_); + } return VP8_STATUS_OK; } // Remaining partitions static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { - VP8BitReader* br; + VP8BitReader* br; VP8Decoder* const dec = idec->dec_; VP8Io* const io = &idec->io_; @@ -355,12 +376,8 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { for (; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) { VP8BitReader* token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)]; if (dec->mb_x_ == 0) { - VP8MB* const left = dec->mb_info_ - 1; - left->nz_ = 0; - left->dc_nz_ = 0; - memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_)); + VP8InitScanline(dec); } - for (; dec->mb_x_ < dec->mb_w_; dec->mb_x_++) { MBContext context; SaveContext(dec, token_br, &context); @@ -383,14 +400,14 @@ static VP8StatusCode DecodeRemaining(WebPIDecoder* const idec) { assert(idec->mem_.start_ <= idec->mem_.end_); } } - if (!VP8FinishRow(dec, io)) { + if (!VP8ProcessRow(dec, io)) { return IDecError(idec, VP8_STATUS_USER_ABORT); } dec->mb_x_ = 0; } - - if (io->teardown) { - io->teardown(io); + // Synchronize the thread and check for errors. 
+ if (!VP8ExitCritical(dec, io)) { + return IDecError(idec, VP8_STATUS_USER_ABORT); } dec->ready_ = 0; idec->state_ = STATE_DONE; @@ -403,14 +420,17 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) { VP8StatusCode status = VP8_STATUS_SUSPENDED; assert(idec->dec_); - if (idec->state_ == STATE_HEADER) { - status = DecodeHeader(idec); + if (idec->state_ == STATE_PRE_VP8) { + status = DecodeWebPHeaders(idec); + } + if (idec->state_ == STATE_VP8_FRAME_HEADER) { + status = DecodeVP8FrameHeader(idec); } - if (idec->state_ == STATE_PARTS0) { + if (idec->state_ == STATE_VP8_PARTS0) { status = DecodePartition0(idec); } - if (idec->state_ == STATE_DATA) { - return DecodeRemaining(idec); + if (idec->state_ == STATE_VP8_DATA) { + status = DecodeRemaining(idec); } return status; } @@ -418,9 +438,11 @@ static VP8StatusCode IDecode(WebPIDecoder* idec) { //------------------------------------------------------------------------------ // Public functions -WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) { +WebPIDecoder* WebPINewDecoder(WebPDecBuffer* const output_buffer) { WebPIDecoder* idec = (WebPIDecoder*)calloc(1, sizeof(WebPIDecoder)); - if (!idec) return NULL; + if (idec == NULL) { + return NULL; + } idec->dec_ = VP8New(); if (idec->dec_ == NULL) { @@ -428,53 +450,97 @@ WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) { return NULL; } - idec->state_ = STATE_HEADER; - idec->params_.mode = mode; + idec->state_ = STATE_PRE_VP8; InitMemBuffer(&idec->mem_); + WebPInitDecBuffer(&idec->output_); VP8InitIo(&idec->io_); - WebPInitCustomIo(&idec->io_); + + WebPResetDecParams(&idec->params_); + idec->params_.output = output_buffer ? output_buffer : &idec->output_; + WebPInitCustomIo(&idec->params_, &idec->io_); // Plug the I/O functions. 
+ +#ifdef WEBP_USE_THREAD + idec->dec_->use_threads_ = idec->params_.options && + (idec->params_.options->use_threads > 0); +#else + idec->dec_->use_threads_ = 0; +#endif + idec->vp8_size_ = 0; + + return idec; +} + +WebPIDecoder* WebPIDecode(const uint8_t* data, uint32_t data_size, + WebPDecoderConfig* const config) { + WebPIDecoder* idec; + + // Parse the bitstream's features, if requested: + if (data != NULL && data_size > 0 && config != NULL) { + if (WebPGetFeatures(data, data_size, &config->input) != VP8_STATUS_OK) { + return NULL; + } + } + // Create an instance of the incremental decoder + idec = WebPINewDecoder(config ? &config->output : NULL); + if (!idec) { + return NULL; + } + // Finish initialization + if (config != NULL) { + idec->params_.options = &config->options; + } return idec; } void WebPIDelete(WebPIDecoder* const idec) { if (!idec) return; VP8Delete(idec->dec_); - WebPClearDecParams(&idec->params_); ClearMemBuffer(&idec->mem_); + WebPFreeDecBuffer(&idec->output_); free(idec); } //------------------------------------------------------------------------------ +// Wrapper toward WebPINewDecoder + +WebPIDecoder* WebPINew(WEBP_CSP_MODE mode) { + WebPIDecoder* const idec = WebPINewDecoder(NULL); + if (!idec) return NULL; + idec->output_.colorspace = mode; + return idec; +} WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer, int output_buffer_size, int output_stride) { WebPIDecoder* idec; - if (mode == MODE_YUV) return NULL; - idec = WebPINew(mode); - if (idec == NULL) return NULL; - idec->params_.output = output_buffer; - idec->params_.stride = output_stride; - idec->params_.output_size = output_buffer_size; - idec->params_.external_buffer = 1; + if (mode >= MODE_YUV) return NULL; + idec = WebPINewDecoder(NULL); + if (!idec) return NULL; + idec->output_.colorspace = mode; + idec->output_.is_external_memory = 1; + idec->output_.u.RGBA.rgba = output_buffer; + idec->output_.u.RGBA.stride = output_stride; + idec->output_.u.RGBA.size = 
output_buffer_size; return idec; } WebPIDecoder* WebPINewYUV(uint8_t* luma, int luma_size, int luma_stride, uint8_t* u, int u_size, int u_stride, uint8_t* v, int v_size, int v_stride) { - WebPIDecoder* idec = WebPINew(MODE_YUV); - if (idec == NULL) return NULL; - idec->params_.output = luma; - idec->params_.stride = luma_stride; - idec->params_.output_size = luma_size; - idec->params_.u = u; - idec->params_.u_stride = u_stride; - idec->params_.output_u_size = u_size; - idec->params_.v = v; - idec->params_.v_stride = v_stride; - idec->params_.output_v_size = v_size; - idec->params_.external_buffer = 1; + WebPIDecoder* const idec = WebPINewDecoder(NULL); + if (!idec) return NULL; + idec->output_.colorspace = MODE_YUV; + idec->output_.is_external_memory = 1; + idec->output_.u.YUVA.y = luma; + idec->output_.u.YUVA.y_stride = luma_stride; + idec->output_.u.YUVA.y_size = luma_size; + idec->output_.u.YUVA.u = u; + idec->output_.u.YUVA.u_stride = u_stride; + idec->output_.u.YUVA.u_size = u_size; + idec->output_.u.YUVA.v = v; + idec->output_.u.YUVA.v_stride = v_stride; + idec->output_.u.YUVA.v_size = v_size; return idec; } @@ -538,38 +604,81 @@ VP8StatusCode WebPIUpdate(WebPIDecoder* const idec, const uint8_t* data, //------------------------------------------------------------------------------ -uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int *last_y, +static const WebPDecBuffer* GetOutputBuffer(const WebPIDecoder* const idec) { + if (!idec || !idec->dec_ || idec->state_ <= STATE_VP8_PARTS0) { + return NULL; + } + return idec->params_.output; +} + +const WebPDecBuffer* WebPIDecodedArea(const WebPIDecoder* const idec, + int* const left, int* const top, + int* const width, int* const height) { + const WebPDecBuffer* const src = GetOutputBuffer(idec); + if (left) *left = 0; + if (top) *top = 0; + // TODO(skal): later include handling of rotations. 
+ if (src) { + if (width) *width = src->width; + if (height) *height = idec->params_.last_y; + } else { + if (width) *width = 0; + if (height) *height = 0; + } + return src; +} + +uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int* last_y, int* width, int* height, int* stride) { - if (!idec || !idec->dec_ || idec->params_.mode != MODE_RGB || - idec->state_ <= STATE_PARTS0) { + const WebPDecBuffer* const src = GetOutputBuffer(idec); + if (!src) return NULL; + if (src->colorspace >= MODE_YUV) { return NULL; } if (last_y) *last_y = idec->params_.last_y; - if (width) *width = idec->w_; - if (height) *height = idec->h_; - if (stride) *stride = idec->params_.stride; + if (width) *width = src->width; + if (height) *height = src->height; + if (stride) *stride = src->u.RGBA.stride; - return idec->params_.output; + return src->u.RGBA.rgba; } -uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int *last_y, - uint8_t** u, uint8_t** v, int* width, int* height, - int *stride, int* uv_stride) { - if (!idec || !idec->dec_ || idec->params_.mode != MODE_YUV || - idec->state_ <= STATE_PARTS0) { +uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int* last_y, + uint8_t** u, uint8_t** v, + int* width, int* height, int *stride, int* uv_stride) { + const WebPDecBuffer* const src = GetOutputBuffer(idec); + if (!src) return NULL; + if (src->colorspace < MODE_YUV) { return NULL; } if (last_y) *last_y = idec->params_.last_y; - if (u) *u = idec->params_.u; - if (v) *v = idec->params_.v; - if (width) *width = idec->w_; - if (height) *height = idec->h_; - if (stride) *stride = idec->params_.stride; - if (uv_stride) *uv_stride = idec->params_.u_stride; + if (u) *u = src->u.YUVA.u; + if (v) *v = src->u.YUVA.v; + if (width) *width = src->width; + if (height) *height = src->height; + if (stride) *stride = src->u.YUVA.y_stride; + if (uv_stride) *uv_stride = src->u.YUVA.u_stride; + + return src->u.YUVA.y; +} - return idec->params_.output; +int WebPISetIOHooks(WebPIDecoder* const 
idec, + VP8IoPutHook put, + VP8IoSetupHook setup, + VP8IoTeardownHook teardown, + void* user_data) { + if (!idec || !idec->dec_ || idec->state_ > STATE_PRE_VP8) { + return 0; + } + + idec->io_.put = put; + idec->io_.setup = setup; + idec->io_.teardown = teardown; + idec->io_.opaque = user_data; + + return 1; } #if defined(__cplusplus) || defined(c_plusplus) diff --git a/third_party/libwebp/dec/io.c b/third_party/libwebp/dec/io.c new file mode 100644 index 0000000..405df3a --- /dev/null +++ b/third_party/libwebp/dec/io.c @@ -0,0 +1,668 @@ +// Copyright 2011 Google Inc. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// functions for sample output. +// +// Author: Skal (pascal.massimino@gmail.com) + +#include <assert.h> +#include <stdlib.h> +#include "../dec/vp8i.h" +#include "./webpi.h" +#include "../dsp/dsp.h" +#include "../dsp/yuv.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +//------------------------------------------------------------------------------ +// Main YUV<->RGB conversion functions + +static int EmitYUV(const VP8Io* const io, WebPDecParams* const p) { + WebPDecBuffer* output = p->output; + const WebPYUVABuffer* const buf = &output->u.YUVA; + uint8_t* const y_dst = buf->y + io->mb_y * buf->y_stride; + uint8_t* const u_dst = buf->u + (io->mb_y >> 1) * buf->u_stride; + uint8_t* const v_dst = buf->v + (io->mb_y >> 1) * buf->v_stride; + const int mb_w = io->mb_w; + const int mb_h = io->mb_h; + const int uv_w = (mb_w + 1) / 2; + int j; + for (j = 0; j < mb_h; ++j) { + memcpy(y_dst + j * buf->y_stride, io->y + j * io->y_stride, mb_w); + } + for (j = 0; j < (mb_h + 1) / 2; ++j) { + memcpy(u_dst + j * buf->u_stride, io->u + j * io->uv_stride, uv_w); + memcpy(v_dst + j * 
buf->v_stride, io->v + j * io->uv_stride, uv_w); + } + return io->mb_h; +} + +// Point-sampling U/V sampler. +static int EmitSampledRGB(const VP8Io* const io, WebPDecParams* const p) { + WebPDecBuffer* output = p->output; + const WebPRGBABuffer* const buf = &output->u.RGBA; + uint8_t* dst = buf->rgba + io->mb_y * buf->stride; + const uint8_t* y_src = io->y; + const uint8_t* u_src = io->u; + const uint8_t* v_src = io->v; + const WebPSampleLinePairFunc sample = WebPSamplers[output->colorspace]; + const int mb_w = io->mb_w; + const int last = io->mb_h - 1; + int j; + for (j = 0; j < last; j += 2) { + sample(y_src, y_src + io->y_stride, u_src, v_src, + dst, dst + buf->stride, mb_w); + y_src += 2 * io->y_stride; + u_src += io->uv_stride; + v_src += io->uv_stride; + dst += 2 * buf->stride; + } + if (j == last) { // Just do the last line twice + sample(y_src, y_src, u_src, v_src, dst, dst, mb_w); + } + return io->mb_h; +} + +//------------------------------------------------------------------------------ +// YUV444 -> RGB conversion + +#if 0 // TODO(skal): this is for future rescaling. 
+static int EmitRGB(const VP8Io* const io, WebPDecParams* const p) { + WebPDecBuffer* output = p->output; + const WebPRGBABuffer* const buf = &output->u.RGBA; + uint8_t* dst = buf->rgba + io->mb_y * buf->stride; + const uint8_t* y_src = io->y; + const uint8_t* u_src = io->u; + const uint8_t* v_src = io->v; + const WebPYUV444Converter convert = WebPYUV444Converters[output->colorspace]; + const int mb_w = io->mb_w; + const int last = io->mb_h; + int j; + for (j = 0; j < last; ++j) { + convert(y_src, u_src, v_src, dst, mb_w); + y_src += io->y_stride; + u_src += io->uv_stride; + v_src += io->uv_stride; + dst += buf->stride; + } + return io->mb_h; +} +#endif + +//------------------------------------------------------------------------------ +// Fancy upsampling + +#ifdef FANCY_UPSAMPLING +static int EmitFancyRGB(const VP8Io* const io, WebPDecParams* const p) { + int num_lines_out = io->mb_h; // a priori guess + const WebPRGBABuffer* const buf = &p->output->u.RGBA; + uint8_t* dst = buf->rgba + io->mb_y * buf->stride; + const WebPUpsampleLinePairFunc upsample = + io->a ? WebPUpsamplersKeepAlpha[p->output->colorspace] + : WebPUpsamplers[p->output->colorspace]; + const uint8_t* cur_y = io->y; + const uint8_t* cur_u = io->u; + const uint8_t* cur_v = io->v; + const uint8_t* top_u = p->tmp_u; + const uint8_t* top_v = p->tmp_v; + int y = io->mb_y; + int y_end = io->mb_y + io->mb_h; + const int mb_w = io->mb_w; + const int uv_w = (mb_w + 1) / 2; + + if (y == 0) { + // First line is special cased. We mirror the u/v samples at boundary. + upsample(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, NULL, dst, mb_w); + } else { + // We can finish the left-over line from previous call. + // Warning! Don't overwrite the alpha values (if any), as they + // are not lagging one line behind but are already written. + upsample(p->tmp_y, cur_y, top_u, top_v, cur_u, cur_v, + dst - buf->stride, dst, mb_w); + num_lines_out++; + } + // Loop over each output pairs of row. 
+ for (; y + 2 < y_end; y += 2) { + top_u = cur_u; + top_v = cur_v; + cur_u += io->uv_stride; + cur_v += io->uv_stride; + dst += 2 * buf->stride; + cur_y += 2 * io->y_stride; + upsample(cur_y - io->y_stride, cur_y, + top_u, top_v, cur_u, cur_v, + dst - buf->stride, dst, mb_w); + } + // move to last row + cur_y += io->y_stride; + if (io->crop_top + y_end < io->crop_bottom) { + // Save the unfinished samples for next call (as we're not done yet). + memcpy(p->tmp_y, cur_y, mb_w * sizeof(*p->tmp_y)); + memcpy(p->tmp_u, cur_u, uv_w * sizeof(*p->tmp_u)); + memcpy(p->tmp_v, cur_v, uv_w * sizeof(*p->tmp_v)); + // The fancy upsampler leaves a row unfinished behind + // (except for the very last row) + num_lines_out--; + } else { + // Process the very last row of even-sized picture + if (!(y_end & 1)) { + upsample(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, + dst + buf->stride, NULL, mb_w); + } + } + return num_lines_out; +} + +#endif /* FANCY_UPSAMPLING */ + +//------------------------------------------------------------------------------ + +#ifdef WEBP_EXPERIMENTAL_FEATURES +static int EmitAlphaYUV(const VP8Io* const io, WebPDecParams* const p) { + const int mb_w = io->mb_w; + const int mb_h = io->mb_h; + int j; + const WebPYUVABuffer* const buf = &p->output->u.YUVA; + uint8_t* dst = buf->a + io->mb_y * buf->a_stride; + const uint8_t* alpha = io->a; + if (alpha) { + for (j = 0; j < mb_h; ++j) { + memcpy(dst, alpha, mb_w * sizeof(*dst)); + alpha += io->width; + dst += buf->a_stride; + } + } + return 0; +} + +static int EmitAlphaRGB(const VP8Io* const io, WebPDecParams* const p) { + const int mb_w = io->mb_w; + const int mb_h = io->mb_h; + int i, j; + const WebPRGBABuffer* const buf = &p->output->u.RGBA; + uint8_t* dst = buf->rgba + io->mb_y * buf->stride; + const uint8_t* alpha = io->a; + if (alpha) { + for (j = 0; j < mb_h; ++j) { + for (i = 0; i < mb_w; ++i) { + dst[4 * i + 3] = alpha[i]; + } + alpha += io->width; + dst += buf->stride; + } + } + return 0; +} + +#endif /* 
WEBP_EXPERIMENTAL_FEATURES */ + +//------------------------------------------------------------------------------ +// Simple picture rescaler + +// TODO(skal): start a common library for encoder and decoder, and factorize +// this code in. + +#define RFIX 30 +#define MULT(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX) + +static void InitRescaler(WebPRescaler* const wrk, + int src_width, int src_height, + uint8_t* dst, + int dst_width, int dst_height, int dst_stride, + int x_add, int x_sub, int y_add, int y_sub, + int32_t* work) { + wrk->x_expand = (src_width < dst_width); + wrk->src_width = src_width; + wrk->src_height = src_height; + wrk->dst_width = dst_width; + wrk->dst_height = dst_height; + wrk->dst = dst; + wrk->dst_stride = dst_stride; + // for 'x_expand', we use bilinear interpolation + wrk->x_add = wrk->x_expand ? (x_sub - 1) : x_add - x_sub; + wrk->x_sub = wrk->x_expand ? (x_add - 1) : x_sub; + wrk->y_accum = y_add; + wrk->y_add = y_add; + wrk->y_sub = y_sub; + wrk->fx_scale = (1 << RFIX) / x_sub; + wrk->fy_scale = (1 << RFIX) / y_sub; + wrk->fxy_scale = wrk->x_expand ? + ((int64_t)dst_height << RFIX) / (x_sub * src_height) : + ((int64_t)dst_height << RFIX) / (x_add * src_height); + wrk->irow = work; + wrk->frow = work + dst_width; +} + +static inline void ImportRow(const uint8_t* const src, + WebPRescaler* const wrk) { + int x_in = 0; + int x_out; + int accum = 0; + if (!wrk->x_expand) { + int sum = 0; + for (x_out = 0; x_out < wrk->dst_width; ++x_out) { + accum += wrk->x_add; + for (; accum > 0; accum -= wrk->x_sub) { + sum += src[x_in++]; + } + { // Emit next horizontal pixel. 
+ const int32_t base = src[x_in++]; + const int32_t frac = base * (-accum); + wrk->frow[x_out] = (sum + base) * wrk->x_sub - frac; + // fresh fractional start for next pixel + sum = MULT(frac, wrk->fx_scale); + } + } + } else { // simple bilinear interpolation + int left = src[0], right = src[0]; + for (x_out = 0; x_out < wrk->dst_width; ++x_out) { + if (accum < 0) { + left = right; + right = src[++x_in]; + accum += wrk->x_add; + } + wrk->frow[x_out] = right * wrk->x_add + (left - right) * accum; + accum -= wrk->x_sub; + } + } + // Accumulate the new row's contribution + for (x_out = 0; x_out < wrk->dst_width; ++x_out) { + wrk->irow[x_out] += wrk->frow[x_out]; + } +} + +static void ExportRow(WebPRescaler* const wrk) { + int x_out; + const int yscale = wrk->fy_scale * (-wrk->y_accum); + assert(wrk->y_accum <= 0); + for (x_out = 0; x_out < wrk->dst_width; ++x_out) { + const int frac = MULT(wrk->frow[x_out], yscale); + const int v = (int)MULT(wrk->irow[x_out] - frac, wrk->fxy_scale); + wrk->dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255; + wrk->irow[x_out] = frac; // new fractional start + } + wrk->y_accum += wrk->y_add; + wrk->dst += wrk->dst_stride; +} + +#undef MULT +#undef RFIX + +//------------------------------------------------------------------------------ +// YUV rescaling (no final RGB conversion needed) + +static int Rescale(const uint8_t* src, int src_stride, + int new_lines, WebPRescaler* const wrk) { + int num_lines_out = 0; + while (new_lines-- > 0) { // import new contribution of one source row. 
+ ImportRow(src, wrk); + src += src_stride; + wrk->y_accum -= wrk->y_sub; + while (wrk->y_accum <= 0) { // emit output row(s) + ExportRow(wrk); + num_lines_out++; + } + } + return num_lines_out; +} + +static int EmitRescaledYUV(const VP8Io* const io, WebPDecParams* const p) { + const int mb_h = io->mb_h; + const int uv_mb_h = (mb_h + 1) >> 1; + const int num_lines_out = Rescale(io->y, io->y_stride, mb_h, &p->scaler_y); + Rescale(io->u, io->uv_stride, uv_mb_h, &p->scaler_u); + Rescale(io->v, io->uv_stride, uv_mb_h, &p->scaler_v); + return num_lines_out; +} + +static int EmitRescaledAlphaYUV(const VP8Io* const io, WebPDecParams* const p) { + if (io->a) { + Rescale(io->a, io->width, io->mb_h, &p->scaler_a); + } + return 0; +} + +static int IsAlphaMode(WEBP_CSP_MODE mode) { + return (mode == MODE_RGBA || mode == MODE_BGRA || mode == MODE_ARGB || + mode == MODE_RGBA_4444 || mode == MODE_YUVA); +} + +static int InitYUVRescaler(const VP8Io* const io, WebPDecParams* const p) { + const int has_alpha = IsAlphaMode(p->output->colorspace); + const WebPYUVABuffer* const buf = &p->output->u.YUVA; + const int out_width = io->scaled_width; + const int out_height = io->scaled_height; + const int uv_out_width = (out_width + 1) >> 1; + const int uv_out_height = (out_height + 1) >> 1; + const int uv_in_width = (io->mb_w + 1) >> 1; + const int uv_in_height = (io->mb_h + 1) >> 1; + const size_t work_size = 2 * out_width; // scratch memory for luma rescaler + const size_t uv_work_size = 2 * uv_out_width; // and for each u/v ones + size_t tmp_size; + int32_t* work; + + tmp_size = work_size + 2 * uv_work_size; + if (has_alpha) { + tmp_size += work_size; + } + p->memory = calloc(1, tmp_size * sizeof(*work)); + if (p->memory == NULL) { + return 0; // memory error + } + work = (int32_t*)p->memory; + InitRescaler(&p->scaler_y, io->mb_w, io->mb_h, + buf->y, out_width, out_height, buf->y_stride, + io->mb_w, out_width, io->mb_h, out_height, + work); + InitRescaler(&p->scaler_u, uv_in_width, 
uv_in_height, + buf->u, uv_out_width, uv_out_height, buf->u_stride, + uv_in_width, uv_out_width, + uv_in_height, uv_out_height, + work + work_size); + InitRescaler(&p->scaler_v, uv_in_width, uv_in_height, + buf->v, uv_out_width, uv_out_height, buf->v_stride, + uv_in_width, uv_out_width, + uv_in_height, uv_out_height, + work + work_size + uv_work_size); + p->emit = EmitRescaledYUV; + if (has_alpha) { + InitRescaler(&p->scaler_a, io->mb_w, io->mb_h, + buf->a, out_width, out_height, buf->a_stride, + io->mb_w, out_width, io->mb_h, out_height, + work + work_size + 2 * uv_work_size); + p->emit_alpha = EmitRescaledAlphaYUV; + } + return 1; +} + +//------------------------------------------------------------------------------ +// RGBA rescaling + +// import new contributions until one row is ready to be output, or all input +// is consumed. +static int Import(const uint8_t* src, int src_stride, + int new_lines, WebPRescaler* const wrk) { + int num_lines_in = 0; + while (num_lines_in < new_lines && wrk->y_accum > 0) { + ImportRow(src, wrk); + src += src_stride; + ++num_lines_in; + wrk->y_accum -= wrk->y_sub; + } + return num_lines_in; +} + +static int ExportRGB(WebPDecParams* const p, int y_pos) { + const WebPYUV444Converter convert = + WebPYUV444Converters[p->output->colorspace]; + const WebPRGBABuffer* const buf = &p->output->u.RGBA; + uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride; + int num_lines_out = 0; + // For RGB rescaling, because of the YUV420, current scan position + // U/V can be +1/-1 line from the Y one. Hence the double test. 
+ while (p->scaler_y.y_accum <= 0 && p->scaler_u.y_accum <= 0) { + assert(p->last_y + y_pos + num_lines_out < p->output->height); + assert(p->scaler_u.y_accum == p->scaler_v.y_accum); + ExportRow(&p->scaler_y); + ExportRow(&p->scaler_u); + ExportRow(&p->scaler_v); + convert(p->scaler_y.dst, p->scaler_u.dst, p->scaler_v.dst, + dst, p->scaler_y.dst_width); + dst += buf->stride; + num_lines_out++; + } + return num_lines_out; +} + +static int EmitRescaledRGB(const VP8Io* const io, WebPDecParams* const p) { + const int mb_h = io->mb_h; + const int uv_mb_h = (mb_h + 1) >> 1; + int j = 0, uv_j = 0; + int num_lines_out = 0; + while (j < mb_h) { + const int y_lines_in = Import(io->y + j * io->y_stride, io->y_stride, + mb_h - j, &p->scaler_y); + const int u_lines_in = Import(io->u + uv_j * io->uv_stride, io->uv_stride, + uv_mb_h - uv_j, &p->scaler_u); + const int v_lines_in = Import(io->v + uv_j * io->uv_stride, io->uv_stride, + uv_mb_h - uv_j, &p->scaler_v); + (void)v_lines_in; // remove a gcc warning + assert(u_lines_in == v_lines_in); + j += y_lines_in; + uv_j += u_lines_in; + num_lines_out += ExportRGB(p, num_lines_out); + } + return num_lines_out; +} + +static int ExportAlpha(WebPDecParams* const p, int y_pos) { + const WebPRGBABuffer* const buf = &p->output->u.RGBA; + uint8_t* dst = buf->rgba + (p->last_y + y_pos) * buf->stride; + int num_lines_out = 0; + while (p->scaler_a.y_accum <= 0) { + int i; + assert(p->last_y + y_pos + num_lines_out < p->output->height); + ExportRow(&p->scaler_a); + for (i = 0; i < p->scaler_a.dst_width; ++i) { + dst[4 * i + 3] = p->scaler_a.dst[i]; + } + dst += buf->stride; + num_lines_out++; + } + return num_lines_out; +} + +static int EmitRescaledAlphaRGB(const VP8Io* const io, WebPDecParams* const p) { + if (io->a) { + int j = 0, pos = 0; + while (j < io->mb_h) { + j += Import(io->a + j * io->width, io->width, io->mb_h - j, &p->scaler_a); + pos += ExportAlpha(p, pos); + } + } + return 0; +} + +static int InitRGBRescaler(const VP8Io* const 
io, WebPDecParams* const p) { + const int has_alpha = IsAlphaMode(p->output->colorspace); + const int out_width = io->scaled_width; + const int out_height = io->scaled_height; + const int uv_in_width = (io->mb_w + 1) >> 1; + const int uv_in_height = (io->mb_h + 1) >> 1; + const size_t work_size = 2 * out_width; // scratch memory for one rescaler + int32_t* work; // rescalers work area + uint8_t* tmp; // tmp storage for scaled YUV444 samples before RGB conversion + size_t tmp_size1, tmp_size2; + + tmp_size1 = 3 * work_size; + tmp_size2 = 3 * out_width; + if (has_alpha) { + tmp_size1 += work_size; + tmp_size2 += out_width; + } + p->memory = + calloc(1, tmp_size1 * sizeof(*work) + tmp_size2 * sizeof(*tmp)); + if (p->memory == NULL) { + return 0; // memory error + } + work = (int32_t*)p->memory; + tmp = (uint8_t*)(work + tmp_size1); + InitRescaler(&p->scaler_y, io->mb_w, io->mb_h, + tmp + 0 * out_width, out_width, out_height, 0, + io->mb_w, out_width, io->mb_h, out_height, + work + 0 * work_size); + InitRescaler(&p->scaler_u, uv_in_width, uv_in_height, + tmp + 1 * out_width, out_width, out_height, 0, + io->mb_w, 2 * out_width, io->mb_h, 2 * out_height, + work + 1 * work_size); + InitRescaler(&p->scaler_v, uv_in_width, uv_in_height, + tmp + 2 * out_width, out_width, out_height, 0, + io->mb_w, 2 * out_width, io->mb_h, 2 * out_height, + work + 2 * work_size); + p->emit = EmitRescaledRGB; + + if (has_alpha) { + InitRescaler(&p->scaler_a, io->mb_w, io->mb_h, + tmp + 3 * out_width, out_width, out_height, 0, + io->mb_w, out_width, io->mb_h, out_height, + work + 3 * work_size); + p->emit_alpha = EmitRescaledAlphaRGB; + } + return 1; +} + +//------------------------------------------------------------------------------ +// Default custom functions + +// Setup crop_xxx fields, mb_w and mb_h +static int InitFromOptions(const WebPDecoderOptions* const options, + VP8Io* const io) { + const int W = io->width; + const int H = io->height; + int x = 0, y = 0, w = W, h = H; + + // 
Cropping + io->use_cropping = (options != NULL) && (options->use_cropping > 0); + if (io->use_cropping) { + w = options->crop_width; + h = options->crop_height; + // TODO(skal): take colorspace into account. Don't assume YUV420. + x = options->crop_left & ~1; + y = options->crop_top & ~1; + if (x < 0 || y < 0 || w <= 0 || h <= 0 || x + w > W || y + h > H) { + return 0; // out of frame boundary error + } + } + io->crop_left = x; + io->crop_top = y; + io->crop_right = x + w; + io->crop_bottom = y + h; + io->mb_w = w; + io->mb_h = h; + + // Scaling + io->use_scaling = (options != NULL) && (options->use_scaling > 0); + if (io->use_scaling) { + if (options->scaled_width <= 0 || options->scaled_height <= 0) { + return 0; + } + io->scaled_width = options->scaled_width; + io->scaled_height = options->scaled_height; + } + + // Filter + io->bypass_filtering = options && options->bypass_filtering; + + // Fancy upsampler +#ifdef FANCY_UPSAMPLING + io->fancy_upsampling = (options == NULL) || (!options->no_fancy_upsampling); +#endif + + if (io->use_scaling) { + // disable filter (only for large downscaling ratio). + io->bypass_filtering = (io->scaled_width < W * 3 / 4) && + (io->scaled_height < H * 3 / 4); + io->fancy_upsampling = 0; + } + return 1; +} + +static int CustomSetup(VP8Io* io) { + WebPDecParams* const p = (WebPDecParams*)io->opaque; + const int is_rgb = (p->output->colorspace < MODE_YUV); + + p->memory = NULL; + p->emit = NULL; + p->emit_alpha = NULL; + if (!InitFromOptions(p->options, io)) { + return 0; + } + + if (io->use_scaling) { + const int ok = is_rgb ? InitRGBRescaler(io, p) : InitYUVRescaler(io, p); + if (!ok) { + return 0; // memory error + } + } else { + if (is_rgb) { + p->emit = EmitSampledRGB; // default +#ifdef FANCY_UPSAMPLING + if (io->fancy_upsampling) { + const int uv_width = (io->mb_w + 1) >> 1; + p->memory = malloc(io->mb_w + 2 * uv_width); + if (p->memory == NULL) { + return 0; // memory error. 
+ } + p->tmp_y = (uint8_t*)p->memory; + p->tmp_u = p->tmp_y + io->mb_w; + p->tmp_v = p->tmp_u + uv_width; + p->emit = EmitFancyRGB; + WebPInitUpsamplers(); + } +#endif + } else { + p->emit = EmitYUV; + } +#ifdef WEBP_EXPERIMENTAL_FEATURES + if (IsAlphaMode(p->output->colorspace)) { + // We need transparency output + p->emit_alpha = is_rgb ? EmitAlphaRGB : EmitAlphaYUV; + } +#endif + } + + if (is_rgb) { + VP8YUVInit(); + } + return 1; +} + +//------------------------------------------------------------------------------ + +static int CustomPut(const VP8Io* io) { + WebPDecParams* p = (WebPDecParams*)io->opaque; + const int mb_w = io->mb_w; + const int mb_h = io->mb_h; + int num_lines_out; + assert(!(io->mb_y & 1)); + + if (mb_w <= 0 || mb_h <= 0) { + return 0; + } + num_lines_out = p->emit(io, p); + if (p->emit_alpha) { + p->emit_alpha(io, p); + } + p->last_y += num_lines_out; + return 1; +} + +//------------------------------------------------------------------------------ + +static void CustomTeardown(const VP8Io* io) { + WebPDecParams* const p = (WebPDecParams*)io->opaque; + free(p->memory); + p->memory = NULL; +} + +//------------------------------------------------------------------------------ +// Main entry point + +void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io) { + io->put = CustomPut; + io->setup = CustomSetup; + io->teardown = CustomTeardown; + io->opaque = params; +} + +//------------------------------------------------------------------------------ + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif diff --git a/third_party/libwebp/dec/layer.c b/third_party/libwebp/dec/layer.c new file mode 100644 index 0000000..f7d41e0 --- /dev/null +++ b/third_party/libwebp/dec/layer.c @@ -0,0 +1,34 @@ +// Copyright 2011 Google Inc. 
+// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Enhancement layer (for YUV444/422) +// +// Author: Skal (pascal.massimino@gmail.com) + +#include <assert.h> +#include <stdlib.h> +#include "vp8i.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +//------------------------------------------------------------------------------ + +int VP8DecodeLayer(VP8Decoder* const dec) { + assert(dec); + assert(dec->layer_data_size_ > 0); + (void)dec; + + // TODO: handle enhancement layer here. + + return 1; +} + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif diff --git a/third_party/libwebp/dec/quant.c b/third_party/libwebp/dec/quant.c index 47edbf5..aee4fd3 100644 --- a/third_party/libwebp/dec/quant.c +++ b/third_party/libwebp/dec/quant.c @@ -58,7 +58,7 @@ static const uint16_t kAcTable[128] = { 249, 254, 259, 264, 269, 274, 279, 284 }; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Paragraph 9.6 void VP8ParseQuant(VP8Decoder* const dec) { @@ -104,7 +104,7 @@ void VP8ParseQuant(VP8Decoder* const dec) { } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/dec/tree.c b/third_party/libwebp/dec/tree.c index ed6caad..7055216 100644 --- a/third_party/libwebp/dec/tree.c +++ b/third_party/libwebp/dec/tree.c @@ -65,7 +65,7 @@ static const int8_t kMVRef4[6] = { }; #endif -//----------------------------------------------------------------------------- 
+//------------------------------------------------------------------------------ // Default probabilities // Inter @@ -385,7 +385,7 @@ void VP8ParseIntraMode(VP8BitReader* const br, VP8Decoder* const dec) { : VP8GetBit(br, 183) ? TM_PRED : H_PRED; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Paragraph 13 static const uint8_t diff --git a/third_party/libwebp/dec/vp8.c b/third_party/libwebp/dec/vp8.c index 43a0c35..9149284 100644 --- a/third_party/libwebp/dec/vp8.c +++ b/third_party/libwebp/dec/vp8.c @@ -11,18 +11,19 @@ #include <stdlib.h> #include "vp8i.h" +#include "webpi.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ int WebPGetDecoderVersion(void) { return (DEC_MAJ_VERSION << 16) | (DEC_MIN_VERSION << 8) | DEC_REV_VERSION; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // VP8Decoder static void SetOk(VP8Decoder* const dec) { @@ -43,6 +44,7 @@ VP8Decoder* VP8New(void) { VP8Decoder* dec = (VP8Decoder*)calloc(1, sizeof(VP8Decoder)); if (dec) { SetOk(dec); + WebPWorkerInit(&dec->worker_); dec->ready_ = 0; } return dec; @@ -74,7 +76,56 @@ int VP8SetError(VP8Decoder* const dec, return 0; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ + +int VP8GetInfo(const uint8_t* data, uint32_t data_size, uint32_t chunk_size, + int* width, int* height, int* has_alpha) { + if (data_size < 10) { + return 0; // not enough data + } + // check signature + if (data[3] != 0x9d || data[4] != 0x01 || data[5] != 0x2a) { + return 0; // Wrong signature. 
+ } else { + const uint32_t bits = data[0] | (data[1] << 8) | (data[2] << 16); + const int key_frame = !(bits & 1); + const int w = ((data[7] << 8) | data[6]) & 0x3fff; + const int h = ((data[9] << 8) | data[8]) & 0x3fff; + + if (has_alpha) { +#ifdef WEBP_EXPERIMENTAL_FEATURES + if (data_size < 11) return 0; + *has_alpha = !!(data[10] & 0x80); // the colorspace_ bit +#else + *has_alpha = 0; +#endif + } + if (!key_frame) { // Not a keyframe. + return 0; + } + + if (((bits >> 1) & 7) > 3) { + return 0; // unknown profile + } + if (!((bits >> 4) & 1)) { + return 0; // first frame is invisible! + } + if (((bits >> 5)) >= chunk_size) { // partition_length + return 0; // inconsistent size information. + } + + if (width) { + *width = w; + } + if (height) { + *height = h; + } + + return 1; + } +} + +//------------------------------------------------------------------------------ // Header parsing static void ResetSegmentHeader(VP8SegmentHeader* const hdr) { @@ -194,14 +245,12 @@ static int ParseFilterHeader(VP8BitReader* br, VP8Decoder* const dec) { return !br->eof_; } -static inline uint32_t get_le32(const uint8_t* const data) { - return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24); -} - // Topmost call int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { - uint8_t* buf; + const uint8_t* buf; uint32_t buf_size; + uint32_t vp8_chunk_size; + uint32_t bytes_skipped; VP8FrameHeader* frm_hdr; VP8PictureHeader* pic_hdr; VP8BitReader* br; @@ -216,41 +265,19 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { "null VP8Io passed to VP8GetHeaders()"); } - buf = (uint8_t *)io->data; + buf = io->data; buf_size = io->data_size; - if (buf == NULL || buf_size <= 4) { - return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA, - "Not enough data to parse frame header"); + + // Process Pre-VP8 chunks. 
+ status = WebPParseHeaders(&buf, &buf_size, &vp8_chunk_size, &bytes_skipped); + if (status != VP8_STATUS_OK) { + return VP8SetError(dec, status, "Incorrect/incomplete header."); } - // Skip over valid RIFF headers - if (!memcmp(buf, "RIFF", 4)) { - uint32_t riff_size; - uint32_t chunk_size; - if (buf_size < 20 + 4) { - return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA, - "RIFF: Truncated header."); - } - if (memcmp(buf + 8, "WEBP", 4)) { // wrong image file signature - return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR, - "RIFF: WEBP signature not found."); - } - riff_size = get_le32(buf + 4); - if (riff_size < 12) { - return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA, - "RIFF: Truncated header."); - } - if (memcmp(buf + 12, "VP8 ", 4)) { - return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR, - "RIFF: Invalid compression format."); - } - chunk_size = get_le32(buf + 16); - if (chunk_size > riff_size - 12) { - return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR, - "RIFF: Inconsistent size information."); - } - buf += 20; - buf_size -= 20; + // Process the VP8 frame header. 
+ if (buf_size < 4) { + return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA, + "Truncated header."); } // Paragraph 9.1 @@ -291,8 +318,17 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { dec->mb_w_ = (pic_hdr->width_ + 15) >> 4; dec->mb_h_ = (pic_hdr->height_ + 15) >> 4; + // Setup default output area (can be later modified during io->setup()) io->width = pic_hdr->width_; io->height = pic_hdr->height_; + io->use_scaling = 0; + io->use_cropping = 0; + io->crop_top = 0; + io->crop_left = 0; + io->crop_right = io->width; + io->crop_bottom = io->height; + io->mb_w = io->width; // sanity check + io->mb_h = io->height; // ditto VP8ResetProba(&dec->proba_); ResetSegmentHeader(&dec->segment_hdr_); @@ -305,6 +341,10 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA, "bad partition length"); } + + dec->alpha_data_ = NULL; + dec->alpha_data_size_ = 0; + br = &dec->br_; VP8InitBitReader(br, buf, buf + frm_hdr->partition_length_); buf += frm_hdr->partition_length_; @@ -368,12 +408,42 @@ int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io) { VP8ParseProba(br, dec); +#ifdef WEBP_EXPERIMENTAL_FEATURES + // Extensions + if (dec->pic_hdr_.colorspace_) { + const size_t kTrailerSize = 8; + const uint8_t kTrailerMarker = 0x01; + const uint8_t* ext_buf = buf - kTrailerSize; + size_t size; + + if (frm_hdr->partition_length_ < kTrailerSize || + ext_buf[kTrailerSize - 1] != kTrailerMarker) { + Error: + return VP8SetError(dec, VP8_STATUS_BITSTREAM_ERROR, + "RIFF: Inconsistent extra information."); + } + // Alpha + size = (ext_buf[4] << 0) | (ext_buf[5] << 8) | (ext_buf[6] << 16); + if (frm_hdr->partition_length_ < size + kTrailerSize) { + goto Error; + } + dec->alpha_data_ = (size > 0) ? 
ext_buf - size : NULL; + dec->alpha_data_size_ = size; + + // Layer + size = (ext_buf[0] << 0) | (ext_buf[1] << 8) | (ext_buf[2] << 16); + dec->layer_data_size_ = size; + dec->layer_data_ = NULL; // will be set later + dec->layer_colorspace_ = ext_buf[3]; + } +#endif + // sanitized state dec->ready_ = 1; return 1; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Residual decoding (Paragraph 13.2 / 13.3) static const uint8_t kBands[16 + 1] = { @@ -386,7 +456,7 @@ static const uint8_t kCat4[] = { 176, 155, 140, 135, 0 }; static const uint8_t kCat5[] = { 180, 157, 141, 134, 130, 0 }; static const uint8_t kCat6[] = { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129, 0 }; -static const uint8_t * const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 }; +static const uint8_t* const kCat3456[] = { kCat3, kCat4, kCat5, kCat6 }; static const uint8_t kZigzag[16] = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 }; @@ -422,7 +492,8 @@ static int GetCoeffs(VP8BitReader* const br, ProbaArray prob, if (!VP8GetBit(br, p[7])) { v = 5 + VP8GetBit(br, 159); } else { - v = 7 + 2 * VP8GetBit(br, 165) + VP8GetBit(br, 145); + v = 7 + 2 * VP8GetBit(br, 165); + v += VP8GetBit(br, 145); } } else { const uint8_t* tab; @@ -551,7 +622,7 @@ static void ParseResiduals(VP8Decoder* const dec, } #undef PACK -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Main loop int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) { @@ -588,16 +659,21 @@ int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br) { return (!token_br->eof_); } +void VP8InitScanline(VP8Decoder* const dec) { + VP8MB* const left = dec->mb_info_ - 1; + left->nz_ = 0; + left->dc_nz_ = 0; + memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_)); + dec->filter_row_ = + 
(dec->filter_type_ > 0) && + (dec->mb_y_ >= dec->tl_mb_y_) && (dec->mb_y_ <= dec->br_mb_y_); +} + static int ParseFrame(VP8Decoder* const dec, VP8Io* io) { - for (dec->mb_y_ = 0; dec->mb_y_ < dec->mb_h_; ++dec->mb_y_) { - VP8MB* const left = dec->mb_info_ - 1; + for (dec->mb_y_ = 0; dec->mb_y_ < dec->br_mb_y_; ++dec->mb_y_) { VP8BitReader* const token_br = &dec->parts_[dec->mb_y_ & (dec->num_parts_ - 1)]; - - left->nz_ = 0; - left->dc_nz_ = 0; - memset(dec->intra_l_, B_DC_PRED, sizeof(dec->intra_l_)); - + VP8InitScanline(dec); for (dec->mb_x_ = 0; dec->mb_x_ < dec->mb_w_; dec->mb_x_++) { if (!VP8DecodeMB(dec, token_br)) { return VP8SetError(dec, VP8_STATUS_NOT_ENOUGH_DATA, @@ -608,11 +684,13 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) { // Store data and save block's filtering params VP8StoreBlock(dec); } - if (!VP8FinishRow(dec, io)) { - return VP8SetError(dec, VP8_STATUS_USER_ABORT, - "Output aborted."); + if (!VP8ProcessRow(dec, io)) { + return VP8SetError(dec, VP8_STATUS_USER_ABORT, "Output aborted."); } } + if (dec->use_threads_ && !WebPWorkerSync(&dec->worker_)) { + return 0; + } // Finish #ifndef ONLY_KEYFRAME_CODE @@ -621,11 +699,20 @@ static int ParseFrame(VP8Decoder* const dec, VP8Io* io) { } #endif +#ifdef WEBP_EXPERIMENTAL_FEATURES + if (dec->layer_data_size_ > 0) { + if (!VP8DecodeLayer(dec)) { + return 0; + } + } +#endif + return 1; } // Main entry point int VP8Decode(VP8Decoder* const dec, VP8Io* const io) { + int ok = 0; if (dec == NULL) { return 0; } @@ -641,32 +728,22 @@ int VP8Decode(VP8Decoder* const dec, VP8Io* const io) { } assert(dec->ready_); - // will allocate memory and prepare everything. - if (!VP8InitFrame(dec, io)) { - VP8Clear(dec); - return VP8SetError(dec, VP8_STATUS_OUT_OF_MEMORY, - "Allocation failed"); - } + // Finish setting up the decoding parameter. Will call io->setup(). + ok = (VP8EnterCritical(dec, io) == VP8_STATUS_OK); + if (ok) { // good to go. + // Will allocate memory and prepare everything. 
+ if (ok) ok = VP8InitFrame(dec, io); - if (io->setup && !io->setup(io)) { - VP8Clear(dec); - return VP8SetError(dec, VP8_STATUS_USER_ABORT, - "Frame setup failed"); - } + // Main decoding loop + if (ok) ok = ParseFrame(dec, io); - // Disable filtering per user request (_after_ setup() is called) - if (io->bypass_filtering) dec->filter_type_ = 0; + // Exit. + ok &= VP8ExitCritical(dec, io); + } - // Main decoding loop - { - const int ret = ParseFrame(dec, io); - if (io->teardown) { - io->teardown(io); - } - if (!ret) { - VP8Clear(dec); - return 0; - } + if (!ok) { + VP8Clear(dec); + return 0; } dec->ready_ = 0; @@ -677,6 +754,9 @@ void VP8Clear(VP8Decoder* const dec) { if (dec == NULL) { return; } + if (dec->use_threads_) { + WebPWorkerEnd(&dec->worker_); + } if (dec->mem_) { free(dec->mem_); } @@ -686,7 +766,7 @@ void VP8Clear(VP8Decoder* const dec) { dec->ready_ = 0; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/dec/vp8i.h b/third_party/libwebp/dec/vp8i.h index b2ad9a3..2cbdef22 100644 --- a/third_party/libwebp/dec/vp8i.h +++ b/third_party/libwebp/dec/vp8i.h @@ -13,19 +13,21 @@ #define WEBP_DEC_VP8I_H_ #include <string.h> // for memcpy() -#include "bits.h" +#include "../utils/bit_reader.h" +#include "../utils/thread.h" +#include "../dsp/dsp.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Various defines and enums // version numbers #define DEC_MAJ_VERSION 0 #define DEC_MIN_VERSION 1 -#define DEC_REV_VERSION 2 +#define DEC_REV_VERSION 3 #define ONLY_KEYFRAME_CODE // to remove any code related to P-Frames @@ -95,7 +97,7 @@ enum { MB_FEATURE_TREE_PROBS = 3, #define 
U_OFF (Y_OFF + BPS * 16 + BPS) #define V_OFF (U_OFF + 16) -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Headers typedef struct { @@ -144,19 +146,19 @@ typedef struct { int mode_lf_delta_[NUM_MODE_LF_DELTAS]; } VP8FilterHeader; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Informations about the macroblocks. -typedef struct { - // block type - uint8_t skip_:1; - // filter specs - uint8_t f_level_:6; // filter strength: 0..63 - uint8_t f_ilevel_:6; // inner limit: 1..63 - uint8_t f_inner_:1; // do inner filtering? - // cbp - uint8_t nz_; // non-zero AC/DC coeffs - uint8_t dc_nz_; // non-zero DC coeffs +typedef struct { // filter specs + unsigned int f_level_:6; // filter strength: 0..63 + unsigned int f_ilevel_:6; // inner limit: 1..63 + unsigned int f_inner_:1; // do inner filtering? 
+} VP8FInfo; + +typedef struct { // used for syntax-parsing + unsigned int nz_; // non-zero AC/DC coeffs + unsigned int dc_nz_:1; // non-zero DC coeffs + unsigned int skip_:1; // block type } VP8MB; // Dequantization matrices @@ -164,7 +166,16 @@ typedef struct { uint16_t y1_mat_[2], y2_mat_[2], uv_mat_[2]; // [DC / AC] } VP8QuantMatrix; -//----------------------------------------------------------------------------- +// Persistent information needed by the parallel processing +typedef struct { + int id_; // cache row to process (in [0..2]) + int mb_y_; // macroblock position of the row + int filter_row_; // true if row-filtering is needed + VP8FInfo* f_info_; // filter strengths + VP8Io io_; // copy of the VP8Io to pass to put() +} VP8ThreadContext; + +//------------------------------------------------------------------------------ // VP8Decoder: the main opaque structure handed over to user struct VP8Decoder { @@ -181,9 +192,20 @@ struct VP8Decoder { VP8FilterHeader filter_hdr_; VP8SegmentHeader segment_hdr_; + // Worker + WebPWorker worker_; + int use_threads_; // use multi-thread + int cache_id_; // current cache row + int num_caches_; // number of cached rows of 16 pixels (1, 2 or 3) + VP8ThreadContext thread_ctx_; // Thread context + // dimension, in macroblock units. int mb_w_, mb_h_; + // Macroblock to process/filter, depending on cropping and filter_type. + int tl_mb_x_, tl_mb_y_; // top-left MB that must be in-loop filtered + int br_mb_x_, br_mb_y_; // last bottom-right MB that must be decoded + // number of partitions. int num_parts_; // per-partition boolean decoders. @@ -212,10 +234,11 @@ struct VP8Decoder { // Boundary data cache and persistent buffers. 
uint8_t* intra_t_; // top intra modes values: 4 * mb_w_ uint8_t intra_l_[4]; // left intra modes values - uint8_t *y_t_; // top luma samples: 16 * mb_w_ - uint8_t *u_t_, *v_t_; // top u/v samples: 8 * mb_w_ each + uint8_t* y_t_; // top luma samples: 16 * mb_w_ + uint8_t* u_t_, *v_t_; // top u/v samples: 8 * mb_w_ each - VP8MB* mb_info_; // contextual macroblock infos (mb_w_ + 1) + VP8MB* mb_info_; // contextual macroblock info (mb_w_ + 1) + VP8FInfo* f_info_; // filter strength info uint8_t* yuv_b_; // main block for Y/U/V (size = YUV_SIZE) int16_t* coeffs_; // 384 coeffs = (16+8+8) * 4*4 @@ -244,17 +267,35 @@ struct VP8Decoder { uint32_t non_zero_ac_; // Filtering side-info - int filter_type_; // 0=off, 1=simple, 2=complex + int filter_type_; // 0=off, 1=simple, 2=complex + int filter_row_; // per-row flag uint8_t filter_levels_[NUM_MB_SEGMENTS]; // precalculated per-segment + + // extensions + const uint8_t* alpha_data_; // compressed alpha data (if present) + size_t alpha_data_size_; + uint8_t* alpha_plane_; // output + + int layer_colorspace_; + const uint8_t* layer_data_; // compressed layer data (if present) + size_t layer_data_size_; }; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // internal functions. Not public. // in vp8.c int VP8SetError(VP8Decoder* const dec, VP8StatusCode error, const char * const msg); +// Validates the VP8 data-header and retrieve basic header information viz width +// and height. Returns 0 in case of formatting error. *width/*height/*has_alpha +// can be passed NULL. 
+int VP8GetInfo(const uint8_t* data, + uint32_t data_size, // data available so far + uint32_t chunk_size, // total data size expect in the chunk + int *width, int *height, int *has_alpha); + // in tree.c void VP8ResetProba(VP8Proba* const proba); void VP8ParseProba(VP8BitReader* const br, VP8Decoder* const dec); @@ -267,59 +308,38 @@ void VP8ParseQuant(VP8Decoder* const dec); int VP8InitFrame(VP8Decoder* const dec, VP8Io* io); // Predict a block and add residual void VP8ReconstructBlock(VP8Decoder* const dec); +// Call io->setup() and finish setting up scan parameters. +// After this call returns, one must always call VP8ExitCritical() with the +// same parameters. Both functions should be used in pair. Returns VP8_STATUS_OK +// if ok, otherwise sets and returns the error status on *dec. +VP8StatusCode VP8EnterCritical(VP8Decoder* const dec, VP8Io* const io); +// Must always be called in pair with VP8EnterCritical(). +// Returns false in case of error. +int VP8ExitCritical(VP8Decoder* const dec, VP8Io* const io); +// Filter the decoded macroblock row (if needed) +int VP8FinishRow(VP8Decoder* const dec, VP8Io* io); // multi threaded call +// Process the last decoded row (filtering + output) +int VP8ProcessRow(VP8Decoder* const dec, VP8Io* const io); // Store a block, along with filtering params void VP8StoreBlock(VP8Decoder* const dec); // Finalize and transmit a complete row. Return false in case of user-abort. -int VP8FinishRow(VP8Decoder* const dec, VP8Io* io); +int VP8FinishRow(VP8Decoder* const dec, VP8Io* const io); +// To be called at the start of a new scanline, to initialize predictors. +void VP8InitScanline(VP8Decoder* const dec); // Decode one macroblock. Returns false if there is not enough data. 
int VP8DecodeMB(VP8Decoder* const dec, VP8BitReader* const token_br); -// in dsp.c -typedef void (*VP8Idct)(const int16_t* coeffs, uint8_t* dst); -extern VP8Idct VP8Transform; -extern VP8Idct VP8TransformUV; -extern VP8Idct VP8TransformDC; -extern VP8Idct VP8TransformDCUV; -extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out); - -// *dst is the destination block, with stride BPS. Boundary samples are -// assumed accessible when needed. -typedef void (*VP8PredFunc)(uint8_t *dst); -extern VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES]; -extern VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES]; -extern VP8PredFunc VP8PredLuma4[NUM_BMODES]; - -void VP8DspInit(void); // must be called before anything using the above -void VP8DspInitTables(void); // needs to be called no matter what. - -// simple filter (only for luma) -typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh); -extern VP8SimpleFilterFunc VP8SimpleVFilter16; -extern VP8SimpleFilterFunc VP8SimpleHFilter16; -extern VP8SimpleFilterFunc VP8SimpleVFilter16i; // filter 3 inner edges -extern VP8SimpleFilterFunc VP8SimpleHFilter16i; - -// regular filter (on both macroblock edges and inner edges) -typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride, - int thresh, int ithresh, int hev_t); -typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride, - int thresh, int ithresh, int hev_t); -// on outter edge -extern VP8LumaFilterFunc VP8VFilter16; -extern VP8LumaFilterFunc VP8HFilter16; -extern VP8ChromaFilterFunc VP8VFilter8; -extern VP8ChromaFilterFunc VP8HFilter8; - -// on inner edge -extern VP8LumaFilterFunc VP8VFilter16i; // filtering 3 inner edges altogether -extern VP8LumaFilterFunc VP8HFilter16i; -extern VP8ChromaFilterFunc VP8VFilter8i; // filtering u and v altogether -extern VP8ChromaFilterFunc VP8HFilter8i; - -//----------------------------------------------------------------------------- +// in alpha.c +const uint8_t* VP8DecompressAlphaRows(VP8Decoder* const dec, + int 
row, int num_rows); + +// in layer.c +int VP8DecodeLayer(VP8Decoder* const dec); + +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" #endif -#endif // WEBP_DEC_VP8I_H_ +#endif /* WEBP_DEC_VP8I_H_ */ diff --git a/third_party/libwebp/dec/webp.c b/third_party/libwebp/dec/webp.c index 3bf6f55..91ac75f 100644 --- a/third_party/libwebp/dec/webp.c +++ b/third_party/libwebp/dec/webp.c @@ -12,24 +12,29 @@ #include <stdlib.h> #include "vp8i.h" #include "webpi.h" -#include "yuv.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -#define FANCY_UPSCALING // undefined to remove fancy upscaling support - -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // RIFF layout is: -// 0ffset tag +// Offset tag // 0...3 "RIFF" 4-byte tag // 4...7 size of image data (including metadata) starting at offset 8 // 8...11 "WEBP" our form-type signature +// The RIFF container (12 bytes) is followed by appropriate chunks: // 12..15 "VP8 ": 4-bytes tags, describing the raw video format used // 16..19 size of the raw VP8 image data, starting at offset 20 // 20.... the VP8 bytes -// There can be extra chunks after the "VP8 " chunk (ICMT, ICOP, ...) +// Or, +// 12..15 "VP8X": 4-bytes tags, describing the extended-VP8 chunk. +// 16..19 size of the VP8X chunk starting at offset 20. +// 20..23 VP8X flags bit-map corresponding to the chunk-types present. +// 24..27 Width of the Canvas Image. +// 28..31 Height of the Canvas Image. +// There can be extra chunks after the "VP8X" chunk (ICCP, TILE, FRM, VP8, +// META ...) // All 32-bits sizes are in little-endian order. 
// Note: chunk data must be padded to multiple of 2 in size @@ -37,472 +42,312 @@ static inline uint32_t get_le32(const uint8_t* const data) { return data[0] | (data[1] << 8) | (data[2] << 16) | (data[3] << 24); } -// If a RIFF container is detected, validate it and skip over it. -uint32_t WebPCheckRIFFHeader(const uint8_t** data_ptr, - uint32_t *data_size_ptr) { - uint32_t chunk_size = 0xffffffffu; - if (*data_size_ptr >= 10 + 20 && !memcmp(*data_ptr, "RIFF", 4)) { - if (memcmp(*data_ptr + 8, "WEBP", 4)) { - return 0; // wrong image file signature +VP8StatusCode WebPParseRIFF(const uint8_t** data, uint32_t* data_size, + uint32_t* riff_size) { + assert(data); + assert(data_size); + assert(riff_size); + + if (*data_size >= RIFF_HEADER_SIZE && + !memcmp(*data, "RIFF", TAG_SIZE)) { + if (memcmp(*data + 8, "WEBP", TAG_SIZE)) { + return VP8_STATUS_BITSTREAM_ERROR; // Wrong image file signature. } else { - const uint32_t riff_size = get_le32(*data_ptr + 4); - if (riff_size < 12) { - return 0; // we should have at least one chunk - } - if (memcmp(*data_ptr + 12, "VP8 ", 4)) { - return 0; // invalid compression format - } - chunk_size = get_le32(*data_ptr + 16); - if (chunk_size > riff_size - 12) { - return 0; // inconsistent size information. + *riff_size = get_le32(*data + TAG_SIZE); + // Check that we have at least one chunk (i.e "WEBP" + "VP8?nnnn"). + if (*riff_size < TAG_SIZE + CHUNK_HEADER_SIZE) { + return VP8_STATUS_BITSTREAM_ERROR; } // We have a RIFF container. Skip it. - *data_ptr += 20; - *data_size_ptr -= 20; - // Note: we don't report error for odd-sized chunks. 
+ *data += RIFF_HEADER_SIZE; + *data_size -= RIFF_HEADER_SIZE; } - return chunk_size; - } - return *data_size_ptr; -} - -//----------------------------------------------------------------------------- -// Fancy upscaling - -#ifdef FANCY_UPSCALING - -// Given samples laid out in a square as: -// [a b] -// [c d] -// we interpolate u/v as: -// ([9*a + 3*b + 3*c + d 3*a + 9*b + 3*c + d] + [8 8]) / 16 -// ([3*a + b + 9*c + 3*d a + 3*b + 3*c + 9*d] [8 8]) / 16 - -// We process u and v together stashed into 32bit (16bit each). -#define LOAD_UV(u,v) ((u) | ((v) << 16)) - -#define UPSCALE_FUNC(FUNC_NAME, FUNC, XSTEP) \ -static inline void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ - const uint8_t* top_u, const uint8_t* top_v, \ - const uint8_t* cur_u, const uint8_t* cur_v, \ - uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ - int x; \ - const int last_pixel_pair = (len - 1) >> 1; \ - uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \ - uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]); /* left-sample */ \ - if (top_y) { \ - const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \ - FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \ - } \ - if (bottom_y) { \ - const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \ - FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \ - } \ - for (x = 1; x <= last_pixel_pair; ++x) { \ - const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \ - const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \ - /* precompute invariant values associated with first and second diagonals*/\ - const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \ - const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \ - const uint32_t diag_03 = (avg + 2 * (tl_uv + uv)) >> 3; \ - if (top_y) { \ - const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \ - const uint32_t uv1 = (diag_03 + t_uv) >> 1; \ - FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \ - top_dst + (2 * x - 1) * XSTEP); \ - 
FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \ - top_dst + (2 * x - 0) * XSTEP); \ - } \ - if (bottom_y) { \ - const uint32_t uv0 = (diag_03 + l_uv) >> 1; \ - const uint32_t uv1 = (diag_12 + uv) >> 1; \ - FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \ - bottom_dst + (2 * x - 1) * XSTEP); \ - FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \ - bottom_dst + (2 * x + 0) * XSTEP); \ - } \ - tl_uv = t_uv; \ - l_uv = uv; \ - } \ - if (!(len & 1)) { \ - if (top_y) { \ - const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \ - FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \ - top_dst + (len - 1) * XSTEP); \ - } \ - if (bottom_y) { \ - const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \ - FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \ - bottom_dst + (len - 1) * XSTEP); \ - } \ - } \ -} - -// All variants implemented. -UPSCALE_FUNC(UpscaleRgbLinePair, VP8YuvToRgb, 3) -UPSCALE_FUNC(UpscaleBgrLinePair, VP8YuvToBgr, 3) -UPSCALE_FUNC(UpscaleRgbaLinePair, VP8YuvToRgba, 4) -UPSCALE_FUNC(UpscaleBgraLinePair, VP8YuvToBgra, 4) - -// Main driver function. -static inline -void UpscaleLinePair(const uint8_t* top_y, const uint8_t* bottom_y, - const uint8_t* top_u, const uint8_t* top_v, - const uint8_t* cur_u, const uint8_t* cur_v, - uint8_t* top_dst, uint8_t* bottom_dst, int len, - WEBP_CSP_MODE mode) { - if (mode == MODE_RGB) { - UpscaleRgbLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v, - top_dst, bottom_dst, len); - } else if (mode == MODE_BGR) { - UpscaleBgrLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v, - top_dst, bottom_dst, len); - } else if (mode == MODE_RGBA) { - UpscaleRgbaLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v, - top_dst, bottom_dst, len); } else { - assert(mode == MODE_BGRA); - UpscaleBgraLinePair(top_y, bottom_y, top_u, top_v, cur_u, cur_v, - top_dst, bottom_dst, len); + *riff_size = 0; // Did not get full RIFF Header. 
} + return VP8_STATUS_OK; } -#undef LOAD_UV -#undef UPSCALE_FUNC - -#endif // FANCY_UPSCALING +VP8StatusCode WebPParseVP8X(const uint8_t** data, uint32_t* data_size, + uint32_t* bytes_skipped, + int* width, int* height, uint32_t* flags) { + assert(data); + assert(data_size); + assert(bytes_skipped); -//----------------------------------------------------------------------------- -// Main conversion driver. + *bytes_skipped = 0; -static int CustomPut(const VP8Io* io) { - WebPDecParams *p = (WebPDecParams*)io->opaque; - const int w = io->width; - const int mb_h = io->mb_h; - const int uv_w = (w + 1) / 2; - assert(!(io->mb_y & 1)); - - if (w <= 0 || mb_h <= 0) { - return 0; + if (*data_size < CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE) { + return VP8_STATUS_NOT_ENOUGH_DATA; // Insufficient data. } - p->last_y = io->mb_y + io->mb_h; // a priori guess - if (p->mode == MODE_YUV) { - uint8_t* const y_dst = p->output + io->mb_y * p->stride; - uint8_t* const u_dst = p->u + (io->mb_y >> 1) * p->u_stride; - uint8_t* const v_dst = p->v + (io->mb_y >> 1) * p->v_stride; - int j; - for (j = 0; j < mb_h; ++j) { - memcpy(y_dst + j * p->stride, io->y + j * io->y_stride, w); + if (!memcmp(*data, "VP8X", TAG_SIZE)) { + const uint32_t chunk_size = get_le32(*data + TAG_SIZE); + if (chunk_size != VP8X_CHUNK_SIZE) { + return VP8_STATUS_BITSTREAM_ERROR; // Wrong chunk size. } - for (j = 0; j < (mb_h + 1) / 2; ++j) { - memcpy(u_dst + j * p->u_stride, io->u + j * io->uv_stride, uv_w); - memcpy(v_dst + j * p->v_stride, io->v + j * io->uv_stride, uv_w); + if (flags) { + *flags = get_le32(*data + 8); } - } else { - uint8_t* dst = p->output + io->mb_y * p->stride; - if (io->fancy_upscaling) { -#ifdef FANCY_UPSCALING - const uint8_t* cur_y = io->y; - const uint8_t* cur_u = io->u; - const uint8_t* cur_v = io->v; - const uint8_t* top_u = p->top_u; - const uint8_t* top_v = p->top_v; - int y = io->mb_y; - int y_end = io->mb_y + io->mb_h; - if (y == 0) { - // First line is special cased. 
We mirror the u/v samples at boundary. - UpscaleLinePair(NULL, cur_y, cur_u, cur_v, cur_u, cur_v, - NULL, dst, w, p->mode); - } else { - // We can finish the left-over line from previous call - UpscaleLinePair(p->top_y, cur_y, top_u, top_v, cur_u, cur_v, - dst - p->stride, dst, w, p->mode); - } - // Loop over each output pairs of row. - for (; y + 2 < y_end; y += 2) { - top_u = cur_u; - top_v = cur_v; - cur_u += io->uv_stride; - cur_v += io->uv_stride; - dst += 2 * p->stride; - cur_y += 2 * io->y_stride; - UpscaleLinePair(cur_y - io->y_stride, cur_y, - top_u, top_v, cur_u, cur_v, - dst - p->stride, dst, w, p->mode); - } - // move to last row - cur_y += io->y_stride; - if (y_end != io->height) { - // Save the unfinished samples for next call (as we're not done yet). - memcpy(p->top_y, cur_y, w * sizeof(*p->top_y)); - memcpy(p->top_u, cur_u, uv_w * sizeof(*p->top_u)); - memcpy(p->top_v, cur_v, uv_w * sizeof(*p->top_v)); - // The fancy upscaler leaves a row unfinished behind - // (except for the very last row) - p->last_y -= 1; - } else { - // Process the very last row of even-sized picture - if (!(y_end & 1)) { - UpscaleLinePair(cur_y, NULL, cur_u, cur_v, cur_u, cur_v, - dst + p->stride, NULL, w, p->mode); - } - } -#else - assert(0); // shouldn't happen. -#endif - } else { - // Point-sampling U/V upscaler. 
- int j; - for (j = 0; j < mb_h; ++j) { - const uint8_t* y_src = io->y + j * io->y_stride; - int i; - for (i = 0; i < w; ++i) { - const int y = y_src[i]; - const int u = io->u[(j / 2) * io->uv_stride + (i / 2)]; - const int v = io->v[(j / 2) * io->uv_stride + (i / 2)]; - if (p->mode == MODE_RGB) { - VP8YuvToRgb(y, u, v, dst + i * 3); - } else if (p->mode == MODE_BGR) { - VP8YuvToBgr(y, u, v, dst + i * 3); - } else if (p->mode == MODE_RGBA) { - VP8YuvToRgba(y, u, v, dst + i * 4); - } else { - VP8YuvToBgra(y, u, v, dst + i * 4); - } - } - dst += p->stride; - } + if (width) { + *width = get_le32(*data + 12); } + if (height) { + *height = get_le32(*data + 16); + } + // We have consumed 20 bytes from VP8X. Skip them. + *bytes_skipped = CHUNK_HEADER_SIZE + VP8X_CHUNK_SIZE; + *data += *bytes_skipped; + *data_size -= *bytes_skipped; } - return 1; + return VP8_STATUS_OK; } -//----------------------------------------------------------------------------- - -static int CustomSetup(VP8Io* io) { -#ifdef FANCY_UPSCALING - WebPDecParams *p = (WebPDecParams*)io->opaque; - p->top_y = p->top_u = p->top_v = NULL; - if (p->mode != MODE_YUV) { - const int uv_width = (io->width + 1) >> 1; - p->top_y = (uint8_t*)malloc(io->width + 2 * uv_width); - if (p->top_y == NULL) { - return 0; // memory error. +VP8StatusCode WebPParseOptionalChunks(const uint8_t** data, uint32_t* data_size, + uint32_t riff_size, + uint32_t* bytes_skipped) { + const uint8_t* buf; + uint32_t buf_size; + + assert(data); + assert(data_size); + assert(bytes_skipped); + + buf = *data; + buf_size = *data_size; + *bytes_skipped = 0; + + while (1) { + uint32_t chunk_size; + uint32_t cur_skip_size; + const uint32_t bytes_skipped_header = TAG_SIZE + // "WEBP". + CHUNK_HEADER_SIZE + // "VP8Xnnnn". + VP8X_CHUNK_SIZE; // Data. + *data = buf; + *data_size = buf_size; + + if (buf_size < CHUNK_HEADER_SIZE) { // Insufficient data. 
+ return VP8_STATUS_NOT_ENOUGH_DATA; + } + + chunk_size = get_le32(buf + TAG_SIZE); + cur_skip_size = CHUNK_HEADER_SIZE + chunk_size; + + // Check that total bytes skipped along with current chunk size + // does not exceed riff_size. + if (riff_size > 0 && + (bytes_skipped_header + *bytes_skipped + cur_skip_size > riff_size)) { + return VP8_STATUS_BITSTREAM_ERROR; // Not a valid chunk size. + } + + if (buf_size < cur_skip_size) { // Insufficient data. + return VP8_STATUS_NOT_ENOUGH_DATA; + } + + if (!memcmp(buf, "VP8 ", TAG_SIZE)) { // A valid VP8 header. + return VP8_STATUS_OK; // Found. } - p->top_u = p->top_y + io->width; - p->top_v = p->top_u + uv_width; - io->fancy_upscaling = 1; // activate fancy upscaling + + // We have a full & valid chunk; skip it. + buf += cur_skip_size; + buf_size -= cur_skip_size; + *bytes_skipped += cur_skip_size; } -#endif - return 1; } -static void CustomTeardown(const VP8Io* io) { -#ifdef FANCY_UPSCALING - WebPDecParams *p = (WebPDecParams*)io->opaque; - if (p->top_y) { - free(p->top_y); - p->top_y = p->top_u = p->top_v = NULL; +VP8StatusCode WebPParseVP8Header(const uint8_t** data, uint32_t* data_size, + uint32_t riff_size, uint32_t* bytes_skipped, + uint32_t* vp8_chunk_size) { + assert(data); + assert(data_size); + assert(bytes_skipped); + assert(vp8_chunk_size); + + *bytes_skipped = 0; + *vp8_chunk_size = 0; + + if (*data_size < CHUNK_HEADER_SIZE) { + return VP8_STATUS_NOT_ENOUGH_DATA; // Insufficient data. } -#endif -} -void WebPInitCustomIo(VP8Io* const io) { - io->put = CustomPut; - io->setup = CustomSetup; - io->teardown = CustomTeardown; + if (!memcmp(*data, "VP8 ", TAG_SIZE)) { + *vp8_chunk_size = get_le32(*data + TAG_SIZE); + if (riff_size >= TAG_SIZE + CHUNK_HEADER_SIZE && // "WEBP" + "VP8 nnnn". + (*vp8_chunk_size > riff_size - (TAG_SIZE + CHUNK_HEADER_SIZE))) { + return VP8_STATUS_BITSTREAM_ERROR; // Inconsistent size information. + } + // We have consumed CHUNK_HEADER_SIZE bytes from VP8 Header. Skip them. 
+ *bytes_skipped = CHUNK_HEADER_SIZE; + *data += *bytes_skipped; + *data_size -= *bytes_skipped; + } + return VP8_STATUS_OK; } -//----------------------------------------------------------------------------- -// Init/Check/Free decoding parameters and buffer - -int WebPInitDecParams(const uint8_t* data, uint32_t data_size, int* width, - int* height, WebPDecParams* const params) { - int w, h; +VP8StatusCode WebPParseHeaders(const uint8_t** data, uint32_t* data_size, + uint32_t* vp8_size, uint32_t* bytes_skipped) { + const uint8_t* buf; + uint32_t buf_size; + uint32_t riff_size; + uint32_t vp8_size_tmp; + uint32_t optional_data_size; + uint32_t vp8x_skip_size; + uint32_t vp8_skip_size; + VP8StatusCode status; + + assert(data); + assert(data_size); + assert(vp8_size); + assert(bytes_skipped); + + buf = *data; + buf_size = *data_size; + + *vp8_size = 0; + *bytes_skipped = 0; + + if (buf == NULL || buf_size < RIFF_HEADER_SIZE) { + return VP8_STATUS_NOT_ENOUGH_DATA; + } - if (!WebPGetInfo(data, data_size, &w, &h)) { - return 0; + // Skip over RIFF header. + if (WebPParseRIFF(&buf, &buf_size, &riff_size) != VP8_STATUS_OK) { + return VP8_STATUS_BITSTREAM_ERROR; // Wrong RIFF Header. } - if (width) *width = w; - if (height) *height = h; - - if (!params->external_buffer) { - int stride; - int uv_stride = 0; - int size; - int uv_size = 0; - uint8_t* output; - WEBP_CSP_MODE mode = params->mode; - - // initialize output buffer, now that dimensions are known. - stride = (mode == MODE_RGB || mode == MODE_BGR) ? 3 * w - : (mode == MODE_RGBA || mode == MODE_BGRA) ? 4 * w - : w; - size = stride * h; - - if (mode == MODE_YUV) { - uv_stride = (w + 1) / 2; - uv_size = uv_stride * ((h + 1) / 2); - } - output = (uint8_t*)malloc(size + 2 * uv_size); - if (!output) { - return 0; + // Skip over VP8X header. + status = WebPParseVP8X(&buf, &buf_size, &vp8x_skip_size, NULL, NULL, NULL); + if (status != VP8_STATUS_OK) { + return status; // Wrong VP8X Chunk / Insufficient data. 
+ } + if (vp8x_skip_size > 0) { + // Skip over optional chunks. + status = WebPParseOptionalChunks(&buf, &buf_size, riff_size, + &optional_data_size); + if (status != VP8_STATUS_OK) { + return status; // Found an invalid chunk size / Insufficient data. } + } - params->output = output; - params->stride = stride; - params->output_size = size; - if (mode == MODE_YUV) { - params->u = output + size; - params->u_stride = uv_stride; - params->output_u_size = uv_size; - params->v = output + size + uv_size; - params->v_stride = uv_stride; - params->output_v_size = uv_size; - } + // Skip over VP8 chunk header. + status = WebPParseVP8Header(&buf, &buf_size, riff_size, &vp8_skip_size, + &vp8_size_tmp); + if (status != VP8_STATUS_OK) { + return status; // Invalid VP8 header / Insufficient data. + } + if (vp8_skip_size > 0) { + *vp8_size = vp8_size_tmp; } - return 1; -} -int WebPCheckDecParams(const VP8Io* io, const WebPDecParams* params) { - int ok = 1; - WEBP_CSP_MODE mode = params->mode; - ok &= (params->stride * io->height <= params->output_size); - if (mode == MODE_RGB || mode == MODE_BGR) { - ok &= (params->stride >= io->width * 3); - } else if (mode == MODE_RGBA || mode == MODE_BGRA) { - ok &= (params->stride >= io->width * 4); - } else { - // some extra checks for U/V - const int u_size = params->u_stride * ((io->height + 1) / 2); - const int v_size = params->v_stride * ((io->height + 1) / 2); - ok &= (params->stride >= io->width); - ok &= (params->u_stride >= (io->width + 1) / 2) && - (params->v_stride >= (io->width + 1) / 2); - ok &= (u_size <= params->output_u_size && - v_size <= params->output_v_size); - } - return ok; + *bytes_skipped = buf - *data; + assert(*bytes_skipped == *data_size - buf_size); + *data = buf; + *data_size = buf_size; + return VP8_STATUS_OK; } -void WebPClearDecParams(WebPDecParams* params) { - if (!params->external_buffer) { - free(params->output); +//------------------------------------------------------------------------------ +// 
WebPDecParams + +void WebPResetDecParams(WebPDecParams* const params) { + if (params) { + memset(params, 0, sizeof(*params)); } - memset(params, 0, sizeof(*params)); } -//----------------------------------------------------------------------------- -// "Into" variants +//------------------------------------------------------------------------------ +// "Into" decoding variants -static uint8_t* DecodeInto(WEBP_CSP_MODE mode, - const uint8_t* data, uint32_t data_size, - WebPDecParams* params) { +// Main flow +static VP8StatusCode DecodeInto(const uint8_t* data, uint32_t data_size, + WebPDecParams* const params) { VP8Decoder* dec = VP8New(); + VP8StatusCode status = VP8_STATUS_OK; VP8Io io; - int ok = 1; + assert(params); if (dec == NULL) { - return NULL; + return VP8_STATUS_INVALID_PARAM; } VP8InitIo(&io); io.data = data; io.data_size = data_size; + WebPInitCustomIo(params, &io); // Plug the I/O functions. - params->mode = mode; - io.opaque = params; - WebPInitCustomIo(&io); +#ifdef WEBP_USE_THREAD + dec->use_threads_ = params->options && (params->options->use_threads > 0); +#else + dec->use_threads_ = 0; +#endif + // Decode bitstream header, update io->width/io->height. if (!VP8GetHeaders(dec, &io)) { - VP8Delete(dec); - return NULL; - } - - // check output buffers - ok = WebPCheckDecParams(&io, params); - if (!ok) { - VP8Delete(dec); - return NULL; - } - - if (mode != MODE_YUV) { - VP8YUVInit(); + status = VP8_STATUS_BITSTREAM_ERROR; + } else { + // Allocate/check output buffers. + status = WebPAllocateDecBuffer(io.width, io.height, params->options, + params->output); + if (status == VP8_STATUS_OK) { + // Decode + if (!VP8Decode(dec, &io)) { + status = dec->status_; + } + } } - - ok = VP8Decode(dec, &io); VP8Delete(dec); - return ok ? 
params->output : NULL; + if (status != VP8_STATUS_OK) { + WebPFreeDecBuffer(params->output); + } + return status; } -uint8_t* WebPDecodeRGBInto(const uint8_t* data, uint32_t data_size, - uint8_t* output, int output_size, - int output_stride) { +// Helpers +static uint8_t* DecodeIntoRGBABuffer(WEBP_CSP_MODE colorspace, + const uint8_t* data, uint32_t data_size, + uint8_t* rgba, int stride, int size) { WebPDecParams params; - - if (output == NULL) { + WebPDecBuffer buf; + if (rgba == NULL) { + return NULL; + } + WebPInitDecBuffer(&buf); + WebPResetDecParams(&params); + params.output = &buf; + buf.colorspace = colorspace; + buf.u.RGBA.rgba = rgba; + buf.u.RGBA.stride = stride; + buf.u.RGBA.size = size; + buf.is_external_memory = 1; + if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) { return NULL; } + return rgba; +} - params.output = output; - params.stride = output_stride; - params.output_size = output_size; - params.output_u_size = 0; - params.output_v_size = 0; - return DecodeInto(MODE_RGB, data, data_size, &params); +uint8_t* WebPDecodeRGBInto(const uint8_t* data, uint32_t data_size, + uint8_t* output, int size, int stride) { + return DecodeIntoRGBABuffer(MODE_RGB, data, data_size, output, stride, size); } uint8_t* WebPDecodeRGBAInto(const uint8_t* data, uint32_t data_size, - uint8_t* output, int output_size, - int output_stride) { - WebPDecParams params; - - if (output == NULL) { - return NULL; - } + uint8_t* output, int size, int stride) { + return DecodeIntoRGBABuffer(MODE_RGBA, data, data_size, output, stride, size); +} - params.output = output; - params.stride = output_stride; - params.output_size = output_size; - params.output_u_size = 0; - params.output_v_size = 0; - return DecodeInto(MODE_RGBA, data, data_size, &params); +uint8_t* WebPDecodeARGBInto(const uint8_t* data, uint32_t data_size, + uint8_t* output, int size, int stride) { + return DecodeIntoRGBABuffer(MODE_ARGB, data, data_size, output, stride, size); } uint8_t* WebPDecodeBGRInto(const uint8_t* data,
uint32_t data_size, - uint8_t* output, int output_size, - int output_stride) { - WebPDecParams params; - - if (output == NULL) { - return NULL; - } - - params.output = output; - params.stride = output_stride; - params.output_size = output_size; - params.output_u_size = 0; - params.output_v_size = 0; - return DecodeInto(MODE_BGR, data, data_size, &params); + uint8_t* output, int size, int stride) { + return DecodeIntoRGBABuffer(MODE_BGR, data, data_size, output, stride, size); } uint8_t* WebPDecodeBGRAInto(const uint8_t* data, uint32_t data_size, - uint8_t* output, int output_size, - int output_stride) { - WebPDecParams params; - - if (output == NULL) { - return NULL; - } - - params.output = output; - params.stride = output_stride; - params.output_size = output_size; - params.output_u_size = 0; - params.output_v_size = 0; - return DecodeInto(MODE_BGRA, data, data_size, &params); + uint8_t* output, int size, int stride) { + return DecodeIntoRGBABuffer(MODE_BGRA, data, data_size, output, stride, size); } uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size, @@ -510,132 +355,241 @@ uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size, uint8_t* u, int u_size, int u_stride, uint8_t* v, int v_size, int v_stride) { WebPDecParams params; - - if (luma == NULL) { + WebPDecBuffer output; + if (luma == NULL) return NULL; + WebPInitDecBuffer(&output); + WebPResetDecParams(&params); + params.output = &output; + output.colorspace = MODE_YUV; + output.u.YUVA.y = luma; + output.u.YUVA.y_stride = luma_stride; + output.u.YUVA.y_size = luma_size; + output.u.YUVA.u = u; + output.u.YUVA.u_stride = u_stride; + output.u.YUVA.u_size = u_size; + output.u.YUVA.v = v; + output.u.YUVA.v_stride = v_stride; + output.u.YUVA.v_size = v_size; + output.is_external_memory = 1; + if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) { return NULL; } - - params.output = luma; - params.stride = luma_stride; - params.output_size = luma_size; - params.u = u; - params.u_stride = u_stride; -
params.output_u_size = u_size; - params.v = v; - params.v_stride = v_stride; - params.output_v_size = v_size; - return DecodeInto(MODE_YUV, data, data_size, &params); + return luma; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ static uint8_t* Decode(WEBP_CSP_MODE mode, const uint8_t* data, uint32_t data_size, int* width, int* height, - WebPDecParams* params_out) { - uint8_t* output; + WebPDecBuffer* keep_info) { WebPDecParams params; + WebPDecBuffer output; - memset(&params, 0, sizeof(params)); - params.mode = mode; - if (!WebPInitDecParams(data, data_size, width, height, &params)) { + WebPInitDecBuffer(&output); + WebPResetDecParams(&params); + params.output = &output; + output.colorspace = mode; + + // Retrieve (and report back) the required dimensions from bitstream. + if (!WebPGetInfo(data, data_size, &output.width, &output.height)) { return NULL; } + if (width) *width = output.width; + if (height) *height = output.height; - params.output_size = params.stride * (*height); - params.output_u_size = params.output_v_size = - params.u_stride * ((*height + 1) / 2); - output = DecodeInto(mode, data, data_size, &params); - if (!output) { - WebPClearDecParams(&params); + // Decode + if (DecodeInto(data, data_size, &params) != VP8_STATUS_OK) { + return NULL; } - if (params_out) { - *params_out = params; + if (keep_info) { // keep track of the side-info + WebPCopyDecBuffer(&output, keep_info); } - return output; + // return decoded samples (don't clear 'output'!) + return (mode >= MODE_YUV) ?
output.u.YUVA.y : output.u.RGBA.rgba; } uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size, - int *width, int *height) { + int* width, int* height) { return Decode(MODE_RGB, data, data_size, width, height, NULL); } uint8_t* WebPDecodeRGBA(const uint8_t* data, uint32_t data_size, - int *width, int *height) { + int* width, int* height) { return Decode(MODE_RGBA, data, data_size, width, height, NULL); } +uint8_t* WebPDecodeARGB(const uint8_t* data, uint32_t data_size, + int* width, int* height) { + return Decode(MODE_ARGB, data, data_size, width, height, NULL); +} + uint8_t* WebPDecodeBGR(const uint8_t* data, uint32_t data_size, - int *width, int *height) { + int* width, int* height) { return Decode(MODE_BGR, data, data_size, width, height, NULL); } uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size, - int *width, int *height) { + int* width, int* height) { return Decode(MODE_BGRA, data, data_size, width, height, NULL); } uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size, - int *width, int *height, uint8_t** u, uint8_t** v, - int *stride, int* uv_stride) { - WebPDecParams params; + int* width, int* height, uint8_t** u, uint8_t** v, + int* stride, int* uv_stride) { + WebPDecBuffer output; // only to preserve the side-infos uint8_t* const out = Decode(MODE_YUV, data, data_size, - width, height, &params); + width, height, &output); if (out) { - *u = params.u; - *v = params.v; - *stride = params.stride; - *uv_stride = params.u_stride; - assert(params.u_stride == params.v_stride); + const WebPYUVABuffer* const buf = &output.u.YUVA; + *u = buf->u; + *v = buf->v; + *stride = buf->y_stride; + *uv_stride = buf->u_stride; + assert(buf->u_stride == buf->v_stride); } return out; } -//----------------------------------------------------------------------------- +static void DefaultFeatures(WebPBitstreamFeatures* const features) { + assert(features); + memset(features, 0, sizeof(*features)); + features->bitstream_version = 0; +} + +static
VP8StatusCode GetFeatures(const uint8_t* data, uint32_t data_size, + WebPBitstreamFeatures* const features) { + uint32_t vp8_chunk_size = 0; + uint32_t riff_size = 0; + uint32_t flags = 0; + uint32_t vp8x_skip_size = 0; + uint32_t vp8_skip_size = 0; + VP8StatusCode status; + + if (features == NULL) { + return VP8_STATUS_INVALID_PARAM; + } + DefaultFeatures(features); + + if (data == NULL) { + return VP8_STATUS_INVALID_PARAM; + } + + // Skip over RIFF header. + status = WebPParseRIFF(&data, &data_size, &riff_size); + if (status != VP8_STATUS_OK) { + return status; // Wrong RIFF Header / Insufficient data. + } + + // Skip over VP8X. + status = WebPParseVP8X(&data, &data_size, &vp8x_skip_size, &features->width, + &features->height, &flags); + if (status != VP8_STATUS_OK) { + return status; // Wrong VP8X / insufficient data. + + } + if (vp8x_skip_size > 0) { + return VP8_STATUS_OK; // Return features from VP8X header. + } + + // Skip over VP8 header. + status = WebPParseVP8Header(&data, &data_size, riff_size, &vp8_skip_size, + &vp8_chunk_size); + if (status != VP8_STATUS_OK) { + return status; // Wrong VP8 Chunk-header / insufficient data. + } + if (vp8_skip_size == 0) { + vp8_chunk_size = data_size; // No VP8 chunk wrapper over raw VP8 data. + } + + // Validates raw VP8 data. + if (!VP8GetInfo(data, data_size, vp8_chunk_size, + &features->width, &features->height, &features->has_alpha)) { + return VP8_STATUS_BITSTREAM_ERROR; + } + + return VP8_STATUS_OK; // Return features from VP8 header. 
+} + +//------------------------------------------------------------------------------ // WebPGetInfo() int WebPGetInfo(const uint8_t* data, uint32_t data_size, - int *width, int *height) { - const uint32_t chunk_size = WebPCheckRIFFHeader(&data, &data_size); - if (!chunk_size) { - return 0; // unsupported RIFF header - } - // Validate raw video data - if (data_size < 10) { - return 0; // not enough data - } - // check signature - if (data[3] != 0x9d || data[4] != 0x01 || data[5] != 0x2a) { - return 0; // Wrong signature. - } else { - const uint32_t bits = data[0] | (data[1] << 8) | (data[2] << 16); - const int key_frame = !(bits & 1); - const int w = ((data[7] << 8) | data[6]) & 0x3fff; - const int h = ((data[9] << 8) | data[8]) & 0x3fff; + int* width, int* height) { + WebPBitstreamFeatures features; - if (!key_frame) { // Not a keyframe. - return 0; - } + if (GetFeatures(data, data_size, &features) != VP8_STATUS_OK) { + return 0; + } - if (((bits >> 1) & 7) > 3) { - return 0; // unknown profile - } - if (!((bits >> 4) & 1)) { - return 0; // first frame is invisible! - } - if (((bits >> 5)) >= chunk_size) { // partition_length - return 0; // inconsistent size information. 
- } + if (width) { + *width = features.width; + } + if (height) { + *height = features.height; + } - if (width) { - *width = w; - } - if (height) { - *height = h; - } + return 1; +} - return 1; +//------------------------------------------------------------------------------ +// Advance decoding API + +int WebPInitDecoderConfigInternal(WebPDecoderConfig* const config, + int version) { + if (version != WEBP_DECODER_ABI_VERSION) { + return 0; // version mismatch } + if (config == NULL) { + return 0; + } + memset(config, 0, sizeof(*config)); + DefaultFeatures(&config->input); + WebPInitDecBuffer(&config->output); + return 1; +} + +VP8StatusCode WebPGetFeaturesInternal(const uint8_t* data, uint32_t data_size, + WebPBitstreamFeatures* const features, + int version) { + VP8StatusCode status; + if (version != WEBP_DECODER_ABI_VERSION) { + return VP8_STATUS_INVALID_PARAM; // version mismatch + } + if (features == NULL) { + return VP8_STATUS_INVALID_PARAM; + } + + status = GetFeatures(data, data_size, features); + if (status == VP8_STATUS_NOT_ENOUGH_DATA) { + return VP8_STATUS_BITSTREAM_ERROR; // Not enough data treated as error. + } + return status; +} + +VP8StatusCode WebPDecode(const uint8_t* data, uint32_t data_size, + WebPDecoderConfig* const config) { + WebPDecParams params; + VP8StatusCode status; + + if (!config) { + return VP8_STATUS_INVALID_PARAM; + } + + status = GetFeatures(data, data_size, &config->input); + if (status != VP8_STATUS_OK) { + if (status == VP8_STATUS_NOT_ENOUGH_DATA) { + return VP8_STATUS_BITSTREAM_ERROR; // Not enough data treated as error. 
} + return status; + } + + WebPResetDecParams(&params); + params.output = &config->output; + params.options = &config->options; + status = DecodeInto(data, data_size, &params); + + return status; } #if defined(__cplusplus) || defined(c_plusplus) diff --git a/third_party/libwebp/dec/webpi.h b/third_party/libwebp/dec/webpi.h index cf5bc0e..6c14460 100644 --- a/third_party/libwebp/dec/webpi.h +++ b/third_party/libwebp/dec/webpi.h @@ -9,55 +9,155 @@ // // Author: somnath@google.com (Somnath Banerjee) -#ifndef WEBP_DEC_WEBPI_H -#define WEBP_DEC_WEBPI_H +#ifndef WEBP_DEC_WEBPI_H_ +#define WEBP_DEC_WEBPI_H_ #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -#include "webp/decode_vp8.h" +#include "../webp/decode_vp8.h" -// Decoding output parameters. +//------------------------------------------------------------------------------ +// WebPDecParams: Decoding output parameters. Transient internal object. + +typedef struct WebPDecParams WebPDecParams; +typedef int (*OutputFunc)(const VP8Io* const io, WebPDecParams* const p); + +// Structure use for on-the-fly rescaling typedef struct { - uint8_t* output; // rgb(a) or luma - uint8_t *u, *v; // chroma u/v - uint8_t *top_y, *top_u, *top_v; // cache for the fancy upscaler - int stride; // rgb(a) stride or luma stride - int u_stride; // chroma-u stride - int v_stride; // chroma-v stride - WEBP_CSP_MODE mode; // rgb(a) or yuv - int last_y; // coordinate of the line that was last output - int output_size; // size of 'output' buffer - int output_u_size; // size of 'u' buffer - int output_v_size; // size of 'v' buffer - int external_buffer; // If true, the output buffers are externally owned -} WebPDecParams; - -// If a RIFF container is detected, validate it and skip over it.
Returns -// VP8 bit-stream size if RIFF header is valid else returns 0 -uint32_t WebPCheckRIFFHeader(const uint8_t** data_ptr, - uint32_t *data_size_ptr); - -// Initializes VP8Io with custom setup, io and teardown functions -void WebPInitCustomIo(VP8Io* const io); - -// Initializes params_out by allocating output buffer and setting the -// stride information. It also outputs width and height information of -// the WebP image. Returns 1 if succeeds. -int WebPInitDecParams(const uint8_t* data, uint32_t data_size, int* width, - int* height, WebPDecParams* const params_out); - -// Verifies various size configurations (e.g stride >= width, specified -// output size <= stride * height etc.). Returns 0 if checks fail. -int WebPCheckDecParams(const VP8Io* io, const WebPDecParams* params); - -// Deallocate memory allocated by WebPInitDecParams() and reset the -// WebPDecParams object. -void WebPClearDecParams(WebPDecParams* params); + int x_expand; // true if we're expanding in the x direction + int fy_scale, fx_scale; // fixed-point scaling factor + int64_t fxy_scale; // '' + // we need hpel-precise add/sub increments, for the downsampled U/V planes. + int y_accum; // vertical accumulator + int y_add, y_sub; // vertical increments (add ~= src, sub ~= dst) + int x_add, x_sub; // horizontal increments (add ~= src, sub ~= dst) + int src_width, src_height; // source dimensions + int dst_width, dst_height; // destination dimensions + uint8_t* dst; + int dst_stride; + int32_t* irow, *frow; // work buffer +} WebPRescaler; + +struct WebPDecParams { + WebPDecBuffer* output; // output buffer. + uint8_t* tmp_y, *tmp_u, *tmp_v; // cache for the fancy upsampler + // or used for tmp rescaling + + int last_y; // coordinate of the line that was last output + const WebPDecoderOptions* options; // if not NULL, use alt decoding features + // rescalers + WebPRescaler scaler_y, scaler_u, scaler_v, scaler_a; + void* memory; // overall scratch memory for the output work. 
+ OutputFunc emit; // output RGB or YUV samples + OutputFunc emit_alpha; // output alpha channel +}; + +// Should be called first, before any use of the WebPDecParams object. +void WebPResetDecParams(WebPDecParams* const params); + +//------------------------------------------------------------------------------ +// Header parsing helpers + +#define TAG_SIZE 4 +#define CHUNK_HEADER_SIZE 8 +#define RIFF_HEADER_SIZE 12 +#define FRAME_CHUNK_SIZE 20 +#define LOOP_CHUNK_SIZE 4 +#define TILE_CHUNK_SIZE 8 +#define VP8X_CHUNK_SIZE 12 +#define VP8_FRAME_HEADER_SIZE 10 // Size of the frame header within VP8 data. + +// Validates the RIFF container (if detected) and skips over it. +// If a RIFF container is detected, +// Returns VP8_STATUS_BITSTREAM_ERROR for invalid header, and +// VP8_STATUS_OK otherwise. +// In case there are not enough bytes (partial RIFF container), return 0 for +// riff_size. Else return the riff_size extracted from the header. +VP8StatusCode WebPParseRIFF(const uint8_t** data, uint32_t* data_size, + uint32_t* riff_size); + +// Validates the VP8X Header and skips over it. +// Returns VP8_STATUS_BITSTREAM_ERROR for invalid VP8X header, +// VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and +// VP8_STATUS_OK otherwise. +// If a VP8X chunk is found, bytes_skipped is set to the total number of bytes +// that are skipped; also Width, Height & Flags are set to the corresponding +// fields extracted from the VP8X chunk. +VP8StatusCode WebPParseVP8X(const uint8_t** data, uint32_t* data_size, + uint32_t* bytes_skipped, + int* width, int* height, uint32_t* flags); + +// Skips to the next VP8 chunk header in the data given the size of the RIFF +// chunk 'riff_size'. +// Returns VP8_STATUS_BITSTREAM_ERROR if any invalid chunk size is encountered, +// VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and +// VP8_STATUS_OK otherwise. +// If a VP8 chunk is found, bytes_skipped is set to the total number of bytes +// that are skipped. 
+VP8StatusCode WebPParseOptionalChunks(const uint8_t** data, uint32_t* data_size, + uint32_t riff_size, + uint32_t* bytes_skipped); + +// Validates the VP8 Header ("VP8 nnnn") and skips over it. +// Returns VP8_STATUS_BITSTREAM_ERROR for invalid (vp8_chunk_size greater than +// riff_size) VP8 header, +// VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data, and +// VP8_STATUS_OK otherwise. +// If a VP8 chunk is found, bytes_skipped is set to the total number of bytes +// that are skipped and vp8_chunk_size is set to the corresponding size +// extracted from the VP8 chunk header. +// For a partial VP8 chunk, vp8_chunk_size is set to 0. +VP8StatusCode WebPParseVP8Header(const uint8_t** data, uint32_t* data_size, + uint32_t riff_size, uint32_t* bytes_skipped, + uint32_t* vp8_chunk_size); + +// Skips over all valid chunks prior to the first VP8 frame header. +// Returns VP8_STATUS_OK on success, +// VP8_STATUS_BITSTREAM_ERROR if an invalid header/chunk is found, and +// VP8_STATUS_NOT_ENOUGH_DATA in case of insufficient data. +// Also, data, data_size, vp8_size & bytes_skipped are updated appropriately +// on success, where +// vp8_size is the size of VP8 chunk data (extracted from VP8 chunk header) and +// bytes_skipped is set to the total number of bytes that are skipped. +VP8StatusCode WebPParseHeaders(const uint8_t** data, uint32_t* data_size, + uint32_t* vp8_size, uint32_t* bytes_skipped); + +//------------------------------------------------------------------------------ +// Misc utils + +// Initializes VP8Io with custom setup, io and teardown functions. The default +// hooks will use the supplied 'params' as io->opaque handle. +void WebPInitCustomIo(WebPDecParams* const params, VP8Io* const io); + +//------------------------------------------------------------------------------ +// Internal functions regarding WebPDecBuffer memory (in buffer.c). +// Don't really need to be externally visible for now. 
+ +// Prepare 'buffer' with the requested initial dimensions width/height. +// If no external storage is supplied, initializes buffer by allocating output +// memory and setting up the stride information. Validate the parameters. Return +// an error code in case of problem (no memory, or invalid stride / size / +// dimension / etc.). If *options is not NULL, also verify that the options' +// parameters are valid and apply them to the width/height dimensions of the +// output buffer. This takes cropping / scaling / rotation into account. +VP8StatusCode WebPAllocateDecBuffer(int width, int height, + const WebPDecoderOptions* const options, + WebPDecBuffer* const buffer); + +// Copy 'src' into 'dst' buffer, making sure 'dst' is not marked as owner of the +// memory (still held by 'src'). +void WebPCopyDecBuffer(const WebPDecBuffer* const src, + WebPDecBuffer* const dst); + +// Copy and transfer ownership from src to dst (beware of parameter order!) +void WebPGrabDecBuffer(WebPDecBuffer* const src, WebPDecBuffer* const dst); + +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" #endif -#endif // WEBP_DEC_WEBPI_H +#endif /* WEBP_DEC_WEBPI_H_ */ diff --git a/third_party/libwebp/dec/yuv.h b/third_party/libwebp/dec/yuv.h deleted file mode 100644 index 50e63f9..0000000 --- a/third_party/libwebp/dec/yuv.h +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright 2010 Google Inc. 
-// -// This code is licensed under the same terms as WebM: -// Software License Agreement: http://www.webmproject.org/license/software/ -// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ -// ----------------------------------------------------------------------------- -// -// inline YUV->RGB conversion function -// -// Author: Skal (pascal.massimino@gmail.com) - -#ifndef WEBP_DEC_YUV_H_ -#define WEBP_DEC_YUV_H_ - -#include "webp/decode_vp8.h" - -#if defined(__cplusplus) || defined(c_plusplus) -extern "C" { -#endif - -enum { YUV_FIX = 16, // fixed-point precision - YUV_RANGE_MIN = -227, // min value of r/g/b output - YUV_RANGE_MAX = 256 + 226 // max value of r/g/b output -}; -extern int16_t VP8kVToR[256], VP8kUToB[256]; -extern int32_t VP8kVToG[256], VP8kUToG[256]; -extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN]; - -inline static void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v, - uint8_t* const rgb) { - const int r_off = VP8kVToR[v]; - const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX; - const int b_off = VP8kUToB[u]; - rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN]; - rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN]; - rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN]; -} - -inline static void VP8YuvToRgba(int y, int u, int v, uint8_t* const rgba) { - VP8YuvToRgb(y, u, v, rgba); - rgba[3] = 0xff; -} - -inline static void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v, - uint8_t* const bgr) { - const int r_off = VP8kVToR[v]; - const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX; - const int b_off = VP8kUToB[u]; - bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN]; - bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN]; - bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN]; -} - -inline static void VP8YuvToBgra(int y, int u, int v, uint8_t* const bgra) { - VP8YuvToBgr(y, u, v, bgra); - bgra[3] = 0xff; -} - -// Must be called before everything, to initialize the tables. 
-void VP8YUVInit(void); - -#if defined(__cplusplus) || defined(c_plusplus) -} // extern "C" -#endif - -#endif // WEBP_DEC_YUV_H_ diff --git a/third_party/libwebp/dsp/cpu.c b/third_party/libwebp/dsp/cpu.c new file mode 100644 index 0000000..1afc25a --- /dev/null +++ b/third_party/libwebp/dsp/cpu.c @@ -0,0 +1,70 @@ +// Copyright 2011 Google Inc. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// CPU detection +// +// Author: Christian Duvivier (cduvivier@google.com) + +#include <stddef.h> // for NULL + +#include "./dsp.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +//------------------------------------------------------------------------------ +// SSE2 detection. +// + +#if defined(__pic__) && defined(__i386__) +static inline void GetCPUInfo(int cpu_info[4], int info_type) { + __asm__ volatile ( + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type)); +} +#elif defined(__i386__) || defined(__x86_64__) +static inline void GetCPUInfo(int cpu_info[4], int info_type) { + __asm__ volatile ( + "cpuid\n" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type)); +} +#elif defined(_MSC_VER) // Visual C++ +#define GetCPUInfo __cpuid +#endif + +#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) +static int x86CPUInfo(CPUFeature feature) { + int cpu_info[4]; + GetCPUInfo(cpu_info, 1); + if (feature == kSSE2) { + return 0 != (cpu_info[3] & 0x04000000); + } + if (feature == kSSE3) { + return 0 != (cpu_info[2] & 0x00000001); + } + return 0; +} +VP8CPUInfo VP8GetCPUInfo = x86CPUInfo; +#elif defined(__ARM_NEON__) +// define a dummy function to 
enable turning off NEON at runtime by setting +// VP8GetCPUInfo = NULL +static int armCPUInfo(CPUFeature feature) { + return 1; +} +VP8CPUInfo VP8GetCPUInfo = armCPUInfo; +#else +VP8CPUInfo VP8GetCPUInfo = NULL; +#endif + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif diff --git a/third_party/libwebp/dec/dsp.c b/third_party/libwebp/dsp/dec.c index efde49d..2f53222 100644 --- a/third_party/libwebp/dec/dsp.c +++ b/third_party/libwebp/dsp/dec.c @@ -5,21 +5,18 @@ // Additional IP Rights Grant: http://www.webmproject.org/license/additional/ // ----------------------------------------------------------------------------- // -// speed-critical functions. +// Speed-critical decoding functions. // // Author: Skal (pascal.massimino@gmail.com) -#include "vp8i.h" - -#if defined(__SSE2__) -#include <emmintrin.h> -#endif +#include "./dsp.h" +#include "../dec/vp8i.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // run-time tables (~4k) static uint8_t abs0[255 + 255 + 1]; // abs(i) @@ -32,7 +29,7 @@ static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255] // and make sure it's set to true _last_ (so as to be thread-safe) static volatile int tables_ok = 0; -void VP8DspInitTables(void) { +static void DspInitTables(void) { if (!tables_ok) { int i; for (i = -255; i <= 255; ++i) { @@ -56,7 +53,7 @@ static inline uint8_t clip_8b(int v) { return (!(v & ~0xff)) ? v : (v < 0) ? 
0 : 255; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) #define STORE(x, y, v) \ @@ -66,7 +63,7 @@ static const int kC1 = 20091 + (1 << 16); static const int kC2 = 35468; #define MUL(a, b) (((a) * (b)) >> 16) -static void Transform(const int16_t* in, uint8_t* dst) { +static void TransformOne(const int16_t* in, uint8_t* dst) { int C[4 * 4], *tmp; int i; tmp = C; @@ -106,11 +103,16 @@ static void Transform(const int16_t* in, uint8_t* dst) { } #undef MUL +static void TransformTwo(const int16_t* in, uint8_t* dst, int do_two) { + TransformOne(in, dst); + if (do_two) { + TransformOne(in + 16, dst + 4); + } +} + static void TransformUV(const int16_t* in, uint8_t* dst) { - Transform(in + 0 * 16, dst); - Transform(in + 1 * 16, dst + 4); - Transform(in + 2 * 16, dst + 4 * BPS); - Transform(in + 3 * 16, dst + 4 * BPS + 4); + VP8Transform(in + 0 * 16, dst, 1); + VP8Transform(in + 2 * 16, dst + 4 * BPS, 1); } static void TransformDC(const int16_t *in, uint8_t* dst) { @@ -132,13 +134,7 @@ static void TransformDCUV(const int16_t* in, uint8_t* dst) { #undef STORE -// default C implementations: -VP8Idct VP8Transform = Transform; -VP8Idct VP8TransformUV = TransformUV; -VP8Idct VP8TransformDC = TransformDC; -VP8Idct VP8TransformDCUV = TransformDCUV; - -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Paragraph 14.3 static void TransformWHT(const int16_t* in, int16_t* out) { @@ -170,10 +166,10 @@ static void TransformWHT(const int16_t* in, int16_t* out) { void (*VP8TransformWHT)(const int16_t* in, int16_t* out) = TransformWHT; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Intra predictions -#define OUT(x, 
y) dst[(x) + (y) * BPS] +#define DST(x, y) dst[(x) + (y) * BPS] static inline void TrueMotion(uint8_t *dst, int size) { const uint8_t* top = dst - BPS; @@ -192,7 +188,7 @@ static void TM4(uint8_t *dst) { TrueMotion(dst, 4); } static void TM8uv(uint8_t *dst) { TrueMotion(dst, 8); } static void TM16(uint8_t *dst) { TrueMotion(dst, 16); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // 16x16 static void VE16(uint8_t *dst) { // vertical @@ -248,7 +244,7 @@ static void DC16NoTopLeft(uint8_t *dst) { // DC with no top and left samples Put16(0x80, dst); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // 4x4 #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) @@ -298,13 +294,13 @@ static void RD4(uint8_t *dst) { // Down-right const int B = dst[1 - BPS]; const int C = dst[2 - BPS]; const int D = dst[3 - BPS]; - OUT(0, 3) = AVG3(J, K, L); - OUT(0, 2) = OUT(1, 3) = AVG3(I, J, K); - OUT(0, 1) = OUT(1, 2) = OUT(2, 3) = AVG3(X, I, J); - OUT(0, 0) = OUT(1, 1) = OUT(2, 2) = OUT(3, 3) = AVG3(A, X, I); - OUT(1, 0) = OUT(2, 1) = OUT(3, 2) = AVG3(B, A, X); - OUT(2, 0) = OUT(3, 1) = AVG3(C, B, A); - OUT(3, 0) = AVG3(D, C, B); + DST(0, 3) = AVG3(J, K, L); + DST(0, 2) = DST(1, 3) = AVG3(I, J, K); + DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J); + DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I); + DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X); + DST(2, 0) = DST(3, 1) = AVG3(C, B, A); + DST(3, 0) = AVG3(D, C, B); } static void LD4(uint8_t *dst) { // Down-Left @@ -316,13 +312,13 @@ static void LD4(uint8_t *dst) { // Down-Left const int F = dst[5 - BPS]; const int G = dst[6 - BPS]; const int H = dst[7 - BPS]; - OUT(0, 0) = AVG3(A, B, C); - OUT(1, 0) = OUT(0, 1) = AVG3(B, C, D); - OUT(2, 0) = OUT(1, 1) = OUT(0, 2) = AVG3(C, D, E); - 
OUT(3, 0) = OUT(2, 1) = OUT(1, 2) = OUT(0, 3) = AVG3(D, E, F); - OUT(3, 1) = OUT(2, 2) = OUT(1, 3) = AVG3(E, F, G); - OUT(3, 2) = OUT(2, 3) = AVG3(F, G, H); - OUT(3, 3) = AVG3(G, H, H); + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = AVG3(G, H, H); } static void VR4(uint8_t *dst) { // Vertical-Right @@ -334,17 +330,17 @@ static void VR4(uint8_t *dst) { // Vertical-Right const int B = dst[1 - BPS]; const int C = dst[2 - BPS]; const int D = dst[3 - BPS]; - OUT(0, 0) = OUT(1, 2) = AVG2(X, A); - OUT(1, 0) = OUT(2, 2) = AVG2(A, B); - OUT(2, 0) = OUT(3, 2) = AVG2(B, C); - OUT(3, 0) = AVG2(C, D); + DST(0, 0) = DST(1, 2) = AVG2(X, A); + DST(1, 0) = DST(2, 2) = AVG2(A, B); + DST(2, 0) = DST(3, 2) = AVG2(B, C); + DST(3, 0) = AVG2(C, D); - OUT(0, 3) = AVG3(K, J, I); - OUT(0, 2) = AVG3(J, I, X); - OUT(0, 1) = OUT(1, 3) = AVG3(I, X, A); - OUT(1, 1) = OUT(2, 3) = AVG3(X, A, B); - OUT(2, 1) = OUT(3, 3) = AVG3(A, B, C); - OUT(3, 1) = AVG3(B, C, D); + DST(0, 3) = AVG3(K, J, I); + DST(0, 2) = AVG3(J, I, X); + DST(0, 1) = DST(1, 3) = AVG3(I, X, A); + DST(1, 1) = DST(2, 3) = AVG3(X, A, B); + DST(2, 1) = DST(3, 3) = AVG3(A, B, C); + DST(3, 1) = AVG3(B, C, D); } static void VL4(uint8_t *dst) { // Vertical-Left @@ -356,17 +352,17 @@ static void VL4(uint8_t *dst) { // Vertical-Left const int F = dst[5 - BPS]; const int G = dst[6 - BPS]; const int H = dst[7 - BPS]; - OUT(0, 0) = AVG2(A, B); - OUT(1, 0) = OUT(0, 2) = AVG2(B, C); - OUT(2, 0) = OUT(1, 2) = AVG2(C, D); - OUT(3, 0) = OUT(2, 2) = AVG2(D, E); + DST(0, 0) = AVG2(A, B); + DST(1, 0) = DST(0, 2) = AVG2(B, C); + DST(2, 0) = DST(1, 2) = AVG2(C, D); + DST(3, 0) = DST(2, 2) = AVG2(D, E); - OUT(0, 1) = AVG3(A, B, C); - OUT(1, 1) = OUT(0, 3) = AVG3(B, C, D); - OUT(2, 1) = OUT(1, 3) = AVG3(C, D, 
E); - OUT(3, 1) = OUT(2, 3) = AVG3(D, E, F); - OUT(3, 2) = AVG3(E, F, G); - OUT(3, 3) = AVG3(F, G, H); + DST(0, 1) = AVG3(A, B, C); + DST(1, 1) = DST(0, 3) = AVG3(B, C, D); + DST(2, 1) = DST(1, 3) = AVG3(C, D, E); + DST(3, 1) = DST(2, 3) = AVG3(D, E, F); + DST(3, 2) = AVG3(E, F, G); + DST(3, 3) = AVG3(F, G, H); } static void HU4(uint8_t *dst) { // Horizontal-Up @@ -374,14 +370,14 @@ static void HU4(uint8_t *dst) { // Horizontal-Up const int J = dst[-1 + 1 * BPS]; const int K = dst[-1 + 2 * BPS]; const int L = dst[-1 + 3 * BPS]; - OUT(0, 0) = AVG2(I, J); - OUT(2, 0) = OUT(0, 1) = AVG2(J, K); - OUT(2, 1) = OUT(0, 2) = AVG2(K, L); - OUT(1, 0) = AVG3(I, J, K); - OUT(3, 0) = OUT(1, 1) = AVG3(J, K, L); - OUT(3, 1) = OUT(1, 2) = AVG3(K, L, L); - OUT(3, 2) = OUT(2, 2) = - OUT(0, 3) = OUT(1, 3) = OUT(2, 3) = OUT(3, 3) = L; + DST(0, 0) = AVG2(I, J); + DST(2, 0) = DST(0, 1) = AVG2(J, K); + DST(2, 1) = DST(0, 2) = AVG2(K, L); + DST(1, 0) = AVG3(I, J, K); + DST(3, 0) = DST(1, 1) = AVG3(J, K, L); + DST(3, 1) = DST(1, 2) = AVG3(K, L, L); + DST(3, 2) = DST(2, 2) = + DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; } static void HD4(uint8_t *dst) { // Horizontal-Down @@ -394,23 +390,24 @@ static void HD4(uint8_t *dst) { // Horizontal-Down const int B = dst[1 - BPS]; const int C = dst[2 - BPS]; - OUT(0, 0) = OUT(2, 1) = AVG2(I, X); - OUT(0, 1) = OUT(2, 2) = AVG2(J, I); - OUT(0, 2) = OUT(2, 3) = AVG2(K, J); - OUT(0, 3) = AVG2(L, K); + DST(0, 0) = DST(2, 1) = AVG2(I, X); + DST(0, 1) = DST(2, 2) = AVG2(J, I); + DST(0, 2) = DST(2, 3) = AVG2(K, J); + DST(0, 3) = AVG2(L, K); - OUT(3, 0) = AVG3(A, B, C); - OUT(2, 0) = AVG3(X, A, B); - OUT(1, 0) = OUT(3, 1) = AVG3(I, X, A); - OUT(1, 1) = OUT(3, 2) = AVG3(J, I, X); - OUT(1, 2) = OUT(3, 3) = AVG3(K, J, I); - OUT(1, 3) = AVG3(L, K, J); + DST(3, 0) = AVG3(A, B, C); + DST(2, 0) = AVG3(X, A, B); + DST(1, 0) = DST(3, 1) = AVG3(I, X, A); + DST(1, 1) = DST(3, 2) = AVG3(J, I, X); + DST(1, 2) = DST(3, 3) = AVG3(K, J, I); + DST(1, 3) = AVG3(L, K, J); 
} +#undef DST #undef AVG3 #undef AVG2 -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Chroma static void VE8uv(uint8_t *dst) { // vertical @@ -467,24 +464,24 @@ static void DC8uvNoTopLeft(uint8_t *dst) { // DC with nothing Put8x8uv(0x8080808080808080ULL, dst); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // default C implementations -VP8PredFunc VP8PredLuma4[NUM_BMODES] = { +VP8PredFunc VP8PredLuma4[/* NUM_BMODES */] = { DC4, TM4, VE4, HE4, RD4, VR4, LD4, VL4, HD4, HU4 }; -VP8PredFunc VP8PredLuma16[NUM_B_DC_MODES] = { +VP8PredFunc VP8PredLuma16[/*NUM_B_DC_MODES */] = { DC16, TM16, VE16, HE16, DC16NoTop, DC16NoLeft, DC16NoTopLeft }; -VP8PredFunc VP8PredChroma8[NUM_B_DC_MODES] = { +VP8PredFunc VP8PredChroma8[/*NUM_B_DC_MODES */] = { DC8uv, TM8uv, VE8uv, HE8uv, DC8uvNoTop, DC8uvNoLeft, DC8uvNoTopLeft }; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Edge filtering functions // 4 pixels in, 2 pixels out @@ -546,7 +543,7 @@ static inline int needs_filter2(const uint8_t* p, int step, int t, int it) { abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Simple In-loop filtering (Paragraph 15.2) static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { @@ -583,7 +580,7 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Complex In-loop filtering (Paragraph 15.3) 
static inline void FilterLoop26(uint8_t* p, int hstride, int vstride, int size, @@ -669,26 +666,62 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride, FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ -void (*VP8VFilter16)(uint8_t*, int, int, int, int) = VFilter16; -void (*VP8HFilter16)(uint8_t*, int, int, int, int) = HFilter16; -void (*VP8VFilter8)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8; -void (*VP8HFilter8)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8; -void (*VP8VFilter16i)(uint8_t*, int, int, int, int) = VFilter16i; -void (*VP8HFilter16i)(uint8_t*, int, int, int, int) = HFilter16i; -void (*VP8VFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = VFilter8i; -void (*VP8HFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8i; +VP8DecIdct2 VP8Transform; +VP8DecIdct VP8TransformUV; +VP8DecIdct VP8TransformDC; +VP8DecIdct VP8TransformDCUV; -void (*VP8SimpleVFilter16)(uint8_t*, int, int) = SimpleVFilter16; -void (*VP8SimpleHFilter16)(uint8_t*, int, int) = SimpleHFilter16; -void (*VP8SimpleVFilter16i)(uint8_t*, int, int) = SimpleVFilter16i; -void (*VP8SimpleHFilter16i)(uint8_t*, int, int) = SimpleHFilter16i; +VP8LumaFilterFunc VP8VFilter16; +VP8LumaFilterFunc VP8HFilter16; +VP8ChromaFilterFunc VP8VFilter8; +VP8ChromaFilterFunc VP8HFilter8; +VP8LumaFilterFunc VP8VFilter16i; +VP8LumaFilterFunc VP8HFilter16i; +VP8ChromaFilterFunc VP8VFilter8i; +VP8ChromaFilterFunc VP8HFilter8i; +VP8SimpleFilterFunc VP8SimpleVFilter16; +VP8SimpleFilterFunc VP8SimpleHFilter16; +VP8SimpleFilterFunc VP8SimpleVFilter16i; +VP8SimpleFilterFunc VP8SimpleHFilter16i; -//----------------------------------------------------------------------------- +extern void VP8DspInitSSE2(void); +extern void VP8DspInitNEON(void); void VP8DspInit(void) { - // later we'll plug some SSE2 variant here + 
DspInitTables(); + + VP8Transform = TransformTwo; + VP8TransformUV = TransformUV; + VP8TransformDC = TransformDC; + VP8TransformDCUV = TransformDCUV; + + VP8VFilter16 = VFilter16; + VP8HFilter16 = HFilter16; + VP8VFilter8 = VFilter8; + VP8HFilter8 = HFilter8; + VP8VFilter16i = VFilter16i; + VP8HFilter16i = HFilter16i; + VP8VFilter8i = VFilter8i; + VP8HFilter8i = HFilter8i; + VP8SimpleVFilter16 = SimpleVFilter16; + VP8SimpleHFilter16 = SimpleHFilter16; + VP8SimpleVFilter16i = SimpleVFilter16i; + VP8SimpleHFilter16i = SimpleHFilter16i; + + // If defined, use CPUInfo() to overwrite some pointers with faster versions. + if (VP8GetCPUInfo) { +#if defined(__SSE2__) || defined(_MSC_VER) + if (VP8GetCPUInfo(kSSE2)) { + VP8DspInitSSE2(); + } +#elif defined(__GNUC__) && defined(__ARM_NEON__) + if (VP8GetCPUInfo(kNEON)) { + VP8DspInitNEON(); + } +#endif + } } #if defined(__cplusplus) || defined(c_plusplus) diff --git a/third_party/libwebp/dsp/dec_neon.c b/third_party/libwebp/dsp/dec_neon.c new file mode 100644 index 0000000..e633a30 --- /dev/null +++ b/third_party/libwebp/dsp/dec_neon.c @@ -0,0 +1,168 @@ +// Copyright 2011 Google Inc. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// ARM NEON version of dsp functions and loop filtering. 
+// +// Author: somnath@google.com (Somnath Banerjee) + +#if defined(__GNUC__) && defined(__ARM_NEON__) + +#include "../dec/vp8i.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#define QRegs "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", \ + "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15" + +#define FLIP_SIGN_BIT2(a, b, s) \ + "veor " #a "," #a "," #s " \n" \ + "veor " #b "," #b "," #s " \n" \ + +#define FLIP_SIGN_BIT4(a, b, c, d, s) \ + FLIP_SIGN_BIT2(a, b, s) \ + FLIP_SIGN_BIT2(c, d, s) \ + +#define NEEDS_FILTER(p1, p0, q0, q1, thresh, mask) \ + "vabd.u8 q15," #p0 "," #q0 " \n" /* abs(p0 - q0) */ \ + "vabd.u8 q14," #p1 "," #q1 " \n" /* abs(p1 - q1) */ \ + "vqadd.u8 q15, q15, q15 \n" /* abs(p0 - q0) * 2 */ \ + "vshr.u8 q14, q14, #1 \n" /* abs(p1 - q1) / 2 */ \ + "vqadd.u8 q15, q15, q14 \n" /* abs(p0 - q0) * 2 + abs(p1 - q1) / 2 */ \ + "vdup.8 q14, " #thresh " \n" \ + "vcge.u8 " #mask ", q14, q15 \n" /* mask <= thresh */ + +#define GET_BASE_DELTA(p1, p0, q0, q1, o) \ + "vqsub.s8 q15," #q0 "," #p0 " \n" /* (q0 - p0) */ \ + "vqsub.s8 " #o "," #p1 "," #q1 " \n" /* (p1 - q1) */ \ + "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 1 * (p0 - q0) */ \ + "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 2 * (p0 - q0) */ \ + "vqadd.s8 " #o "," #o ", q15 \n" /* (p1 - q1) + 3 * (p0 - q0) */ + +#define DO_SIMPLE_FILTER(p0, q0, fl) \ + "vmov.i8 q15, #0x03 \n" \ + "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 3 */ \ + "vshr.s8 q15, q15, #3 \n" /* filter1 >> 3 */ \ + "vqadd.s8 " #p0 "," #p0 ", q15 \n" /* p0 += filter1 */ \ + \ + "vmov.i8 q15, #0x04 \n" \ + "vqadd.s8 q15, q15, " #fl " \n" /* filter1 = filter + 4 */ \ + "vshr.s8 q15, q15, #3 \n" /* filter2 >> 3 */ \ + "vqsub.s8 " #q0 "," #q0 ", q15 \n" /* q0 -= filter2 */ + +// Applies filter on 2 pixels (p0 and q0) +#define DO_FILTER2(p1, p0, q0, q1, thresh) \ + NEEDS_FILTER(p1, p0, q0, q1, thresh, q9) /* filter mask in q9 */ \ + "vmov.i8 q10, #0x80 \n" /* sign bit */ \ + 
FLIP_SIGN_BIT4(p1, p0, q0, q1, q10) /* convert to signed value */ \ + GET_BASE_DELTA(p1, p0, q0, q1, q11) /* get filter level */ \ + "vand q9, q9, q11 \n" /* apply filter mask */ \ + DO_SIMPLE_FILTER(p0, q0, q9) /* apply filter */ \ + FLIP_SIGN_BIT2(p0, q0, q10) + +// Load/Store vertical edge +#define LOAD8x4(c1, c2, c3, c4, b1, b2, stride) \ + "vld4.8 {" #c1"[0], " #c2"[0], " #c3"[0], " #c4"[0]}," #b1 "," #stride"\n" \ + "vld4.8 {" #c1"[1], " #c2"[1], " #c3"[1], " #c4"[1]}," #b2 "," #stride"\n" \ + "vld4.8 {" #c1"[2], " #c2"[2], " #c3"[2], " #c4"[2]}," #b1 "," #stride"\n" \ + "vld4.8 {" #c1"[3], " #c2"[3], " #c3"[3], " #c4"[3]}," #b2 "," #stride"\n" \ + "vld4.8 {" #c1"[4], " #c2"[4], " #c3"[4], " #c4"[4]}," #b1 "," #stride"\n" \ + "vld4.8 {" #c1"[5], " #c2"[5], " #c3"[5], " #c4"[5]}," #b2 "," #stride"\n" \ + "vld4.8 {" #c1"[6], " #c2"[6], " #c3"[6], " #c4"[6]}," #b1 "," #stride"\n" \ + "vld4.8 {" #c1"[7], " #c2"[7], " #c3"[7], " #c4"[7]}," #b2 "," #stride"\n" + +#define STORE8x2(c1, c2, p,stride) \ + "vst2.8 {" #c1"[0], " #c2"[0]}," #p "," #stride " \n" \ + "vst2.8 {" #c1"[1], " #c2"[1]}," #p "," #stride " \n" \ + "vst2.8 {" #c1"[2], " #c2"[2]}," #p "," #stride " \n" \ + "vst2.8 {" #c1"[3], " #c2"[3]}," #p "," #stride " \n" \ + "vst2.8 {" #c1"[4], " #c2"[4]}," #p "," #stride " \n" \ + "vst2.8 {" #c1"[5], " #c2"[5]}," #p "," #stride " \n" \ + "vst2.8 {" #c1"[6], " #c2"[6]}," #p "," #stride " \n" \ + "vst2.8 {" #c1"[7], " #c2"[7]}," #p "," #stride " \n" + +//----------------------------------------------------------------------------- +// Simple In-loop filtering (Paragraph 15.2) + +static void SimpleVFilter16NEON(uint8_t* p, int stride, int thresh) { + __asm__ volatile ( + "sub %[p], %[p], %[stride], lsl #1 \n" // p -= 2 * stride + + "vld1.u8 {q1}, [%[p]], %[stride] \n" // p1 + "vld1.u8 {q2}, [%[p]], %[stride] \n" // p0 + "vld1.u8 {q3}, [%[p]], %[stride] \n" // q0 + "vld1.u8 {q4}, [%[p]] \n" // q1 + + DO_FILTER2(q1, q2, q3, q4, %[thresh]) + + "sub %[p], %[p], 
%[stride], lsl #1 \n" // p -= 2 * stride + + "vst1.u8 {q2}, [%[p]], %[stride] \n" // store op0 + "vst1.u8 {q3}, [%[p]] \n" // store oq0 + : [p] "+r"(p) + : [stride] "r"(stride), [thresh] "r"(thresh) + : "memory", QRegs + ); +} + +static void SimpleHFilter16NEON(uint8_t* p, int stride, int thresh) { + __asm__ volatile ( + "sub r4, %[p], #2 \n" // base1 = p - 2 + "lsl r6, %[stride], #1 \n" // r6 = 2 * stride + "add r5, r4, %[stride] \n" // base2 = base1 + stride + + LOAD8x4(d2, d3, d4, d5, [r4], [r5], r6) + LOAD8x4(d6, d7, d8, d9, [r4], [r5], r6) + "vswp d3, d6 \n" // p1:q1 p0:q3 + "vswp d5, d8 \n" // q0:q2 q1:q4 + "vswp q2, q3 \n" // p1:q1 p0:q2 q0:q3 q1:q4 + + DO_FILTER2(q1, q2, q3, q4, %[thresh]) + + "sub %[p], %[p], #1 \n" // p - 1 + + "vswp d5, d6 \n" + STORE8x2(d4, d5, [%[p]], %[stride]) + STORE8x2(d6, d7, [%[p]], %[stride]) + + : [p] "+r"(p) + : [stride] "r"(stride), [thresh] "r"(thresh) + : "memory", "r4", "r5", "r6", QRegs + ); +} + +static void SimpleVFilter16iNEON(uint8_t* p, int stride, int thresh) { + int k; + for (k = 3; k > 0; --k) { + p += 4 * stride; + SimpleVFilter16NEON(p, stride, thresh); + } +} + +static void SimpleHFilter16iNEON(uint8_t* p, int stride, int thresh) { + int k; + for (k = 3; k > 0; --k) { + p += 4; + SimpleHFilter16NEON(p, stride, thresh); + } +} + +extern void VP8DspInitNEON(void); + +void VP8DspInitNEON(void) { + VP8SimpleVFilter16 = SimpleVFilter16NEON; + VP8SimpleHFilter16 = SimpleHFilter16NEON; + VP8SimpleVFilter16i = SimpleVFilter16iNEON; + VP8SimpleHFilter16i = SimpleHFilter16iNEON; +} + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif + +#endif // __GNUC__ && __ARM_NEON__ diff --git a/third_party/libwebp/dsp/dec_sse2.c b/third_party/libwebp/dsp/dec_sse2.c new file mode 100644 index 0000000..625ec94 --- /dev/null +++ b/third_party/libwebp/dsp/dec_sse2.c @@ -0,0 +1,898 @@ +// Copyright 2011 Google Inc. 
+// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// SSE2 version of some decoding functions (idct, loop filtering). +// +// Author: somnath@google.com (Somnath Banerjee) +// cduvivier@google.com (Christian Duvivier) + +#if defined(__SSE2__) || defined(_MSC_VER) + +#include <emmintrin.h> +#include "../dec/vp8i.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +//------------------------------------------------------------------------------ +// Transforms (Paragraph 14.4) + +static void TransformSSE2(const int16_t* in, uint8_t* dst, int do_two) { + // This implementation makes use of 16-bit fixed point versions of two + // multiply constants: + // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 + // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 + // + // To be able to use signed 16-bit integers, we use the following trick to + // have constants within range: + // - Associated constants are obtained by subtracting the 16-bit fixed point + // version of one: + // k = K - (1 << 16) => K = k + (1 << 16) + // K1 = 85267 => k1 = 20091 + // K2 = 35468 => k2 = -30068 + // - The multiplication of a variable by a constant become the sum of the + // variable and the multiplication of that variable by the associated + // constant: + // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x + const __m128i k1 = _mm_set1_epi16(20091); + const __m128i k2 = _mm_set1_epi16(-30068); + __m128i T0, T1, T2, T3; + + // Load and concatenate the transform coefficients (we'll do two transforms + // in parallel). In the case of only one transform, the second half of the + // vectors will just contain random value we'll never use nor store. 
+ __m128i in0, in1, in2, in3; + { + in0 = _mm_loadl_epi64((__m128i*)&in[0]); + in1 = _mm_loadl_epi64((__m128i*)&in[4]); + in2 = _mm_loadl_epi64((__m128i*)&in[8]); + in3 = _mm_loadl_epi64((__m128i*)&in[12]); + // a00 a10 a20 a30 x x x x + // a01 a11 a21 a31 x x x x + // a02 a12 a22 a32 x x x x + // a03 a13 a23 a33 x x x x + if (do_two) { + const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]); + const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]); + const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]); + const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]); + in0 = _mm_unpacklo_epi64(in0, inB0); + in1 = _mm_unpacklo_epi64(in1, inB1); + in2 = _mm_unpacklo_epi64(in2, inB2); + in3 = _mm_unpacklo_epi64(in3, inB3); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + } + + // Vertical pass and subsequent transpose. + { + // First pass, c and d calculations are longer because of the "trick" + // multiplications. + const __m128i a = _mm_add_epi16(in0, in2); + const __m128i b = _mm_sub_epi16(in0, in2); + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + const __m128i c1 = _mm_mulhi_epi16(in1, k2); + const __m128i c2 = _mm_mulhi_epi16(in3, k1); + const __m128i c3 = _mm_sub_epi16(in1, in3); + const __m128i c4 = _mm_sub_epi16(c1, c2); + const __m128i c = _mm_add_epi16(c3, c4); + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + const __m128i d1 = _mm_mulhi_epi16(in1, k1); + const __m128i d2 = _mm_mulhi_epi16(in3, k2); + const __m128i d3 = _mm_add_epi16(in1, in3); + const __m128i d4 = _mm_add_epi16(d1, d2); + const __m128i d = _mm_add_epi16(d3, d4); + + // Second pass. + const __m128i tmp0 = _mm_add_epi16(a, d); + const __m128i tmp1 = _mm_add_epi16(b, c); + const __m128i tmp2 = _mm_sub_epi16(b, c); + const __m128i tmp3 = _mm_sub_epi16(a, d); + + // Transpose the two 4x4. 
+ // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); + const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); + const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); + const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); + const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 a22 b32 b03 b13 b23 b33 + T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); + T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); + T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); + T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + + // Horizontal pass and subsequent transpose. + { + // First pass, c and d calculations are longer because of the "trick" + // multiplications. 
+ const __m128i four = _mm_set1_epi16(4); + const __m128i dc = _mm_add_epi16(T0, four); + const __m128i a = _mm_add_epi16(dc, T2); + const __m128i b = _mm_sub_epi16(dc, T2); + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + const __m128i c1 = _mm_mulhi_epi16(T1, k2); + const __m128i c2 = _mm_mulhi_epi16(T3, k1); + const __m128i c3 = _mm_sub_epi16(T1, T3); + const __m128i c4 = _mm_sub_epi16(c1, c2); + const __m128i c = _mm_add_epi16(c3, c4); + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + const __m128i d1 = _mm_mulhi_epi16(T1, k1); + const __m128i d2 = _mm_mulhi_epi16(T3, k2); + const __m128i d3 = _mm_add_epi16(T1, T3); + const __m128i d4 = _mm_add_epi16(d1, d2); + const __m128i d = _mm_add_epi16(d3, d4); + + // Second pass. + const __m128i tmp0 = _mm_add_epi16(a, d); + const __m128i tmp1 = _mm_add_epi16(b, c); + const __m128i tmp2 = _mm_sub_epi16(b, c); + const __m128i tmp3 = _mm_sub_epi16(a, d); + const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); + const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); + const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); + const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); + + // Transpose the two 4x4. 
+ // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); + const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); + const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); + const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); + const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 a22 b32 b03 b13 b23 b33 + T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); + T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); + T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); + T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + + // Add inverse transform to 'dst' and store. + { + const __m128i zero = _mm_set1_epi16(0); + // Load the reference(s). + __m128i dst0, dst1, dst2, dst3; + if (do_two) { + // Load eight bytes/pixels per line. + dst0 = _mm_loadl_epi64((__m128i*)&dst[0 * BPS]); + dst1 = _mm_loadl_epi64((__m128i*)&dst[1 * BPS]); + dst2 = _mm_loadl_epi64((__m128i*)&dst[2 * BPS]); + dst3 = _mm_loadl_epi64((__m128i*)&dst[3 * BPS]); + } else { + // Load four bytes/pixels per line. 
+ dst0 = _mm_cvtsi32_si128(*(int*)&dst[0 * BPS]); + dst1 = _mm_cvtsi32_si128(*(int*)&dst[1 * BPS]); + dst2 = _mm_cvtsi32_si128(*(int*)&dst[2 * BPS]); + dst3 = _mm_cvtsi32_si128(*(int*)&dst[3 * BPS]); + } + // Convert to 16b. + dst0 = _mm_unpacklo_epi8(dst0, zero); + dst1 = _mm_unpacklo_epi8(dst1, zero); + dst2 = _mm_unpacklo_epi8(dst2, zero); + dst3 = _mm_unpacklo_epi8(dst3, zero); + // Add the inverse transform(s). + dst0 = _mm_add_epi16(dst0, T0); + dst1 = _mm_add_epi16(dst1, T1); + dst2 = _mm_add_epi16(dst2, T2); + dst3 = _mm_add_epi16(dst3, T3); + // Unsigned saturate to 8b. + dst0 = _mm_packus_epi16(dst0, dst0); + dst1 = _mm_packus_epi16(dst1, dst1); + dst2 = _mm_packus_epi16(dst2, dst2); + dst3 = _mm_packus_epi16(dst3, dst3); + // Store the results. + if (do_two) { + // Store eight bytes/pixels per line. + _mm_storel_epi64((__m128i*)&dst[0 * BPS], dst0); + _mm_storel_epi64((__m128i*)&dst[1 * BPS], dst1); + _mm_storel_epi64((__m128i*)&dst[2 * BPS], dst2); + _mm_storel_epi64((__m128i*)&dst[3 * BPS], dst3); + } else { + // Store four bytes/pixels per line. + *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(dst0); + *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(dst1); + *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(dst2); + *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(dst3); + } + } +} + +//------------------------------------------------------------------------------ +// Loop Filter (Paragraph 15) + +// Compute abs(p - q) = subs(p - q) OR subs(q - p) +#define MM_ABS(p, q) _mm_or_si128( \ + _mm_subs_epu8((q), (p)), \ + _mm_subs_epu8((p), (q))) + +// Shift each byte of "a" by N bits while preserving by the sign bit. +// +// It first shifts the lower bytes of the words and then the upper bytes and +// then merges the results together. 
#define SIGNED_SHIFT_N(a, N) do { \
  __m128i t = (a); \
  t = _mm_slli_epi16(t, 8);   /* low byte moved to the high byte ... */ \
  t = _mm_srai_epi16(t, (N)); /* ... so the 16-bit shift sign-extends it */ \
  t = _mm_srli_epi16(t, 8);   /* back in place, upper byte cleared */ \
  \
  (a) = _mm_srai_epi16((a), (N) + 8); /* high byte: shift, drop low byte */ \
  (a) = _mm_slli_epi16((a), 8); \
  \
  (a) = _mm_or_si128(t, (a)); \
} while (0)

// Toggles the sign bit of every byte of 'a' and 'b', i.e. converts between
// unsigned [0, 255] and signed [-128, 127] representations.
// Expects a variable 'sign_bit' (0x80 in every byte) in the caller's scope.
#define FLIP_SIGN_BIT2(a, b) do { \
  (a) = _mm_xor_si128((a), sign_bit); \
  (b) = _mm_xor_si128((b), sign_bit); \
} while (0)

#define FLIP_SIGN_BIT4(a, b, c, d) do { \
  FLIP_SIGN_BIT2(a, b); \
  FLIP_SIGN_BIT2(c, d); \
} while (0)

// Sets each byte of 'not_hev' to 0xff where both abs(p1 - p0) and
// abs(q1 - q0) are <= hev_thresh, and to 0x00 otherwise.
// The pixel arguments must still be in unsigned representation.
#define GET_NOTHEV(p1, p0, q0, q1, hev_thresh, not_hev) do { \
  const __m128i zero = _mm_setzero_si128(); \
  const __m128i t1 = MM_ABS(p1, p0); \
  const __m128i t2 = MM_ABS(q1, q0); \
  \
  const __m128i h = _mm_set1_epi8((char)(hev_thresh)); \
  const __m128i t3 = _mm_subs_epu8(t1, h); /* abs(p1 - p0) - hev_thresh */ \
  const __m128i t4 = _mm_subs_epu8(t2, h); /* abs(q1 - q0) - hev_thresh */ \
  \
  (not_hev) = _mm_or_si128(t3, t4); \
  /* both saturated differences are zero <=> neither exceeds hev_thresh */ \
  (not_hev) = _mm_cmpeq_epi8((not_hev), zero); \
} while (0)

// o = p1 - q1 + 3 * (q0 - p0), on signed bytes, saturating at every step.
#define GET_BASE_DELTA(p1, p0, q0, q1, o) do { \
  const __m128i qp0 = _mm_subs_epi8((q0), (p0)); /* q0 - p0 */ \
  (o) = _mm_subs_epi8((p1), (q1)); /* p1 - q1 */ \
  (o) = _mm_adds_epi8((o), qp0); /* p1 - q1 + 1 * (q0 - p0) */ \
  (o) = _mm_adds_epi8((o), qp0); /* p1 - q1 + 2 * (q0 - p0) */ \
  (o) = _mm_adds_epi8((o), qp0); /* p1 - q1 + 3 * (q0 - p0) */ \
} while (0)

// q0 -= (fl + 4) >> 3 and p0 += (fl + 3) >> 3, on signed bytes with
// saturation. 'fl' itself is left untouched.
#define DO_SIMPLE_FILTER(p0, q0, fl) do { \
  const __m128i three = _mm_set1_epi8(3); \
  const __m128i four = _mm_set1_epi8(4); \
  __m128i v3 = _mm_adds_epi8((fl), three); \
  __m128i v4 = _mm_adds_epi8((fl), four); \
  \
  /* Do +4 side */ \
  SIGNED_SHIFT_N(v4, 3); /* v4 >> 3 */ \
  (q0) = _mm_subs_epi8((q0), v4); /* q0 -= v4 */ \
  \
  /* Now do +3 side */ \
  SIGNED_SHIFT_N(v3, 3); /* v3 >> 3 */ \
  (p0) = _mm_adds_epi8((p0), v3); /* p0 += v3 */ \
} while (0)

// Updates values of 2 pixels at MB edge during complex filtering.
+// Update operations: +// q = q - a and p = p + a; where a = [(a_hi >> 7), (a_lo >> 7)] +#define UPDATE_2PIXELS(pi, qi, a_lo, a_hi) { \ + const __m128i a_lo7 = _mm_srai_epi16(a_lo, 7); \ + const __m128i a_hi7 = _mm_srai_epi16(a_hi, 7); \ + const __m128i a = _mm_packs_epi16(a_lo7, a_hi7); \ + pi = _mm_adds_epi8(pi, a); \ + qi = _mm_subs_epi8(qi, a); \ +} + +static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0, + const __m128i* q1, int thresh, __m128i *mask) { + __m128i t1 = MM_ABS(*p1, *q1); // abs(p1 - q1) + *mask = _mm_set1_epi8(0xFE); + t1 = _mm_and_si128(t1, *mask); // set lsb of each byte to zero + t1 = _mm_srli_epi16(t1, 1); // abs(p1 - q1) / 2 + + *mask = MM_ABS(*p0, *q0); // abs(p0 - q0) + *mask = _mm_adds_epu8(*mask, *mask); // abs(p0 - q0) * 2 + *mask = _mm_adds_epu8(*mask, t1); // abs(p0 - q0) * 2 + abs(p1 - q1) / 2 + + t1 = _mm_set1_epi8(thresh); + *mask = _mm_subs_epu8(*mask, t1); // mask <= thresh + *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128()); +} + +//------------------------------------------------------------------------------ +// Edge filtering functions + +// Applies filter on 2 pixels (p0 and q0) +static inline void DoFilter2(const __m128i* p1, __m128i* p0, __m128i* q0, + const __m128i* q1, int thresh) { + __m128i a, mask; + const __m128i sign_bit = _mm_set1_epi8(0x80); + const __m128i p1s = _mm_xor_si128(*p1, sign_bit); + const __m128i q1s = _mm_xor_si128(*q1, sign_bit); + + NeedsFilter(p1, p0, q0, q1, thresh, &mask); + + // convert to signed values + FLIP_SIGN_BIT2(*p0, *q0); + + GET_BASE_DELTA(p1s, *p0, *q0, q1s, a); + a = _mm_and_si128(a, mask); // mask filter values we don't care about + DO_SIMPLE_FILTER(*p0, *q0, a); + + // unoffset + FLIP_SIGN_BIT2(*p0, *q0); +} + +// Applies filter on 4 pixels (p1, p0, q0 and q1) +static inline void DoFilter4(__m128i* p1, __m128i *p0, __m128i* q0, __m128i* q1, + const __m128i* mask, int hev_thresh) { + __m128i not_hev; + __m128i t1, t2, t3; + const __m128i sign_bit = 
_mm_set1_epi8(0x80); + + // compute hev mask + GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); + + // convert to signed values + FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); + + t1 = _mm_subs_epi8(*p1, *q1); // p1 - q1 + t1 = _mm_andnot_si128(not_hev, t1); // hev(p1 - q1) + t2 = _mm_subs_epi8(*q0, *p0); // q0 - p0 + t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 1 * (q0 - p0) + t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 2 * (q0 - p0) + t1 = _mm_adds_epi8(t1, t2); // hev(p1 - q1) + 3 * (q0 - p0) + t1 = _mm_and_si128(t1, *mask); // mask filter values we don't care about + + // Do +4 side + t2 = _mm_set1_epi8(4); + t2 = _mm_adds_epi8(t1, t2); // 3 * (q0 - p0) + (p1 - q1) + 4 + SIGNED_SHIFT_N(t2, 3); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3 + t3 = t2; // save t2 + *q0 = _mm_subs_epi8(*q0, t2); // q0 -= t2 + + // Now do +3 side + t2 = _mm_set1_epi8(3); + t2 = _mm_adds_epi8(t1, t2); // +3 instead of +4 + SIGNED_SHIFT_N(t2, 3); // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3 + *p0 = _mm_adds_epi8(*p0, t2); // p0 += t2 + + t2 = _mm_set1_epi8(1); + t3 = _mm_adds_epi8(t3, t2); + SIGNED_SHIFT_N(t3, 1); // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 4 + + t3 = _mm_and_si128(not_hev, t3); // if !hev + *q1 = _mm_subs_epi8(*q1, t3); // q1 -= t3 + *p1 = _mm_adds_epi8(*p1, t3); // p1 += t3 + + // unoffset + FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); +} + +// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2) +static inline void DoFilter6(__m128i *p2, __m128i* p1, __m128i *p0, + __m128i* q0, __m128i* q1, __m128i *q2, + const __m128i* mask, int hev_thresh) { + __m128i a, not_hev; + const __m128i sign_bit = _mm_set1_epi8(0x80); + + // compute hev mask + GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev); + + // convert to signed values + FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); + FLIP_SIGN_BIT2(*p2, *q2); + + GET_BASE_DELTA(*p1, *p0, *q0, *q1, a); + + { // do simple filter on pixels with hev + const __m128i m = _mm_andnot_si128(not_hev, *mask); + const __m128i f = _mm_and_si128(a, m); + 
DO_SIMPLE_FILTER(*p0, *q0, f); + } + { // do strong filter on pixels with not hev + const __m128i zero = _mm_setzero_si128(); + const __m128i nine = _mm_set1_epi16(0x0900); + const __m128i sixty_three = _mm_set1_epi16(63); + + const __m128i m = _mm_and_si128(not_hev, *mask); + const __m128i f = _mm_and_si128(a, m); + const __m128i f_lo = _mm_unpacklo_epi8(zero, f); + const __m128i f_hi = _mm_unpackhi_epi8(zero, f); + + const __m128i f9_lo = _mm_mulhi_epi16(f_lo, nine); // Filter (lo) * 9 + const __m128i f9_hi = _mm_mulhi_epi16(f_hi, nine); // Filter (hi) * 9 + const __m128i f18_lo = _mm_add_epi16(f9_lo, f9_lo); // Filter (lo) * 18 + const __m128i f18_hi = _mm_add_epi16(f9_hi, f9_hi); // Filter (hi) * 18 + + const __m128i a2_lo = _mm_add_epi16(f9_lo, sixty_three); // Filter * 9 + 63 + const __m128i a2_hi = _mm_add_epi16(f9_hi, sixty_three); // Filter * 9 + 63 + + const __m128i a1_lo = _mm_add_epi16(f18_lo, sixty_three); // F... * 18 + 63 + const __m128i a1_hi = _mm_add_epi16(f18_hi, sixty_three); // F... * 18 + 63 + + const __m128i a0_lo = _mm_add_epi16(f18_lo, a2_lo); // Filter * 27 + 63 + const __m128i a0_hi = _mm_add_epi16(f18_hi, a2_hi); // Filter * 27 + 63 + + UPDATE_2PIXELS(*p2, *q2, a2_lo, a2_hi); + UPDATE_2PIXELS(*p1, *q1, a1_lo, a1_hi); + UPDATE_2PIXELS(*p0, *q0, a0_lo, a0_hi); + } + + // unoffset + FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1); + FLIP_SIGN_BIT2(*p2, *q2); +} + +// reads 8 rows across a vertical edge. +// +// TODO(somnath): Investigate _mm_shuffle* also see if it can be broken into +// two Load4x4() to avoid code duplication. 
// Loads 4 bytes from each of the 8 rows b, b + stride, ..., b + 7 * stride
// and transposes them: *p receives columns 0 (low 8 bytes) and 1 (high 8
// bytes), *q receives columns 2 and 3. Register diagrams below list bytes
// from most to least significant; 'rc' means row r, column c.
// NOTE(review): rows are read through an 'int*' cast of a uint8_t pointer,
// which relies on unaligned, type-punned access being tolerated.
static inline void Load8x4(const uint8_t* b, int stride,
                           __m128i* p, __m128i* q) {
  __m128i t1, t2;

  // Load 0th, 1st, 4th and 5th rows
  __m128i r0 = _mm_cvtsi32_si128(*((int*)&b[0 * stride])); // 03 02 01 00
  __m128i r1 = _mm_cvtsi32_si128(*((int*)&b[1 * stride])); // 13 12 11 10
  __m128i r4 = _mm_cvtsi32_si128(*((int*)&b[4 * stride])); // 43 42 41 40
  __m128i r5 = _mm_cvtsi32_si128(*((int*)&b[5 * stride])); // 53 52 51 50

  r0 = _mm_unpacklo_epi32(r0, r4); // 43 42 41 40 03 02 01 00
  r1 = _mm_unpacklo_epi32(r1, r5); // 53 52 51 50 13 12 11 10

  // t1 = 53 43 52 42 51 41 50 40 13 03 12 02 11 01 10 00
  t1 = _mm_unpacklo_epi8(r0, r1);

  // Load 2nd, 3rd, 6th and 7th rows
  r0 = _mm_cvtsi32_si128(*((int*)&b[2 * stride])); // 23 22 21 20
  r1 = _mm_cvtsi32_si128(*((int*)&b[3 * stride])); // 33 32 31 30
  r4 = _mm_cvtsi32_si128(*((int*)&b[6 * stride])); // 63 62 61 60
  r5 = _mm_cvtsi32_si128(*((int*)&b[7 * stride])); // 73 72 71 70

  r0 = _mm_unpacklo_epi32(r0, r4); // 63 62 61 60 23 22 21 20
  r1 = _mm_unpacklo_epi32(r1, r5); // 73 72 71 70 33 32 31 30

  // t2 = 73 63 72 62 71 61 70 60 33 23 32 22 31 21 30 20
  t2 = _mm_unpacklo_epi8(r0, r1);

  // After the 16-bit interleave below:
  // t1 = 33 23 13 03 32 22 12 02 31 21 11 01 30 20 10 00
  // t2 = 73 63 53 43 72 62 52 42 71 61 51 41 70 60 50 40
  r0 = t1;
  t1 = _mm_unpacklo_epi16(t1, t2);
  t2 = _mm_unpackhi_epi16(r0, t2);

  // *p = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  // *q = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
  *p = _mm_unpacklo_epi32(t1, t2);
  *q = _mm_unpackhi_epi32(t1, t2);
}

// Loads a 16-row x 4-column strip and transposes it so that each output
// register holds one column (16 pixels): *p1, *p0, *q0, *q1 receive columns
// 0, 1, 2 and 3 respectively. Rows 0..7 are read from r0, rows 8..15 from r8.
static inline void Load16x4(const uint8_t* r0, const uint8_t* r8, int stride,
                            __m128i* p1, __m128i* p0,
                            __m128i* q0, __m128i* q1) {
  __m128i t1, t2;
  // Assume the pixels around the edge (|) are numbered as follows
  //                00 01 | 02 03
  //                10 11 | 12 13
  //                 ...  |  ...
  //                e0 e1 | e2 e3
  //                f0 f1 | f2 f3
  //
  // r0 is pointing to the 0th row (00)
  // r8 is pointing to the 8th row (80)

  // Load
  // p1 = 71 61 51 41 31 21 11 01 70 60 50 40 30 20 10 00
  // q0 = 73 63 53 43 33 23 13 03 72 62 52 42 32 22 12 02
  // p0 = f1 e1 d1 c1 b1 a1 91 81 f0 e0 d0 c0 b0 a0 90 80
  // q1 = f3 e3 d3 c3 b3 a3 93 83 f2 e2 d2 c2 b2 a2 92 82
  Load8x4(r0, stride, p1, q0);
  Load8x4(r8, stride, p0, q1);

  t1 = *p1;
  t2 = *q0;
  // p1 = f0 e0 d0 c0 b0 a0 90 80 70 60 50 40 30 20 10 00
  // p0 = f1 e1 d1 c1 b1 a1 91 81 71 61 51 41 31 21 11 01
  // q0 = f2 e2 d2 c2 b2 a2 92 82 72 62 52 42 32 22 12 02
  // q1 = f3 e3 d3 c3 b3 a3 93 83 73 63 53 43 33 23 13 03
  *p1 = _mm_unpacklo_epi64(t1, *p0);
  *p0 = _mm_unpackhi_epi64(t1, *p0);
  *q0 = _mm_unpacklo_epi64(t2, *q1);
  *q1 = _mm_unpackhi_epi64(t2, *q1);
}

// Writes the four 32-bit lanes of *x to four consecutive rows of 'dst',
// 4 bytes per row, lowest lane first. *x is shifted out in the process.
// NOTE(review): same type-punned store through 'int32_t*' as in Load8x4.
static inline void Store4x4(__m128i* x, uint8_t* dst, int stride) {
  int i;
  for (i = 0; i < 4; ++i, dst += stride) {
    *((int32_t*)dst) = _mm_cvtsi128_si32(*x);
    *x = _mm_srli_si128(*x, 4);
  }
}

// Transpose back and store
// Inverse of Load16x4: the four column registers are interleaved back into
// rows and written as 16 rows of 4 bytes (rows 0..7 at r0, 8..15 at r8).
// All four registers are overwritten with intermediate values.
static inline void Store16x4(uint8_t* r0, uint8_t* r8, int stride, __m128i* p1,
                             __m128i* p0, __m128i* q0, __m128i* q1) {
  __m128i t1;

  // p0 = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
  // p1 = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
  t1 = *p0;
  *p0 = _mm_unpacklo_epi8(*p1, t1);
  *p1 = _mm_unpackhi_epi8(*p1, t1);

  // q0 = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
  // q1 = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
  t1 = *q0;
  *q0 = _mm_unpacklo_epi8(t1, *q1);
  *q1 = _mm_unpackhi_epi8(t1, *q1);

  // p0 = 33 32 31 30 23 22 21 20 13 12 11 10 03 02 01 00
  // q0 = 73 72 71 70 63 62 61 60 53 52 51 50 43 42 41 40
  t1 = *p0;
  *p0 = _mm_unpacklo_epi16(t1, *q0);
  *q0 = _mm_unpackhi_epi16(t1, *q0);

  // p1 = b3 b2 b1 b0 a3 a2 a1 a0 93 92 91 90 83 82 81 80
  // q1 = f3 f2 f1 f0 e3 e2 e1 e0 d3 d2 d1 d0 c3 c2 c1 c0
  t1 = *p1;
  *p1 = _mm_unpacklo_epi16(t1, *q1);
  *q1 = _mm_unpackhi_epi16(t1, *q1);

  Store4x4(p0, r0, stride);
  r0 += 4 * stride;
  Store4x4(q0, r0, stride);

  Store4x4(p1, r8, stride);
  r8 += 4 * stride;
  Store4x4(q1, r8, stride);
}

//------------------------------------------------------------------------------
// Simple In-loop filtering (Paragraph 15.2)

// Simple filter across the horizontal edge just above row 'p', 16 pixels
// wide. Reads rows p - 2 * stride .. p + stride, rewrites rows p - stride
// and p.
static void SimpleVFilter16SSE2(uint8_t* p, int stride, int thresh) {
  // Load
  __m128i p1 = _mm_loadu_si128((__m128i*)&p[-2 * stride]);
  __m128i p0 = _mm_loadu_si128((__m128i*)&p[-stride]);
  __m128i q0 = _mm_loadu_si128((__m128i*)&p[0]);
  __m128i q1 = _mm_loadu_si128((__m128i*)&p[stride]);

  DoFilter2(&p1, &p0, &q0, &q1, thresh);

  // Store
  _mm_storeu_si128((__m128i*)&p[-stride], p0);
  _mm_storeu_si128((__m128i*)p, q0);
}

// Simple filter across the vertical edge just left of column 'p', 16 rows
// tall. All four columns p - 2 .. p + 1 are written back, although DoFilter2
// only changes the two middle ones.
static void SimpleHFilter16SSE2(uint8_t* p, int stride, int thresh) {
  __m128i p1, p0, q0, q1;

  p -= 2; // beginning of p1

  Load16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
  DoFilter2(&p1, &p0, &q0, &q1, thresh);
  Store16x4(p, p + 8 * stride, stride, &p1, &p0, &q0, &q1);
}

// Simple filter on the three inner horizontal edges (4, 8, 12 rows below p).
static void SimpleVFilter16iSSE2(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4 * stride;
    SimpleVFilter16SSE2(p, stride, thresh);
  }
}

// Simple filter on the three inner vertical edges (4, 8, 12 columns right
// of p).
static void SimpleHFilter16iSSE2(uint8_t* p, int stride, int thresh) {
  int k;
  for (k = 3; k > 0; --k) {
    p += 4;
    SimpleHFilter16SSE2(p, stride, thresh);
  }
}

//------------------------------------------------------------------------------
// Complex In-loop filtering (Paragraph 15.3)

// m = per-byte max of abs(p3 - p2), abs(p2 - p1), abs(p1 - p0).
#define MAX_DIFF1(p3, p2, p1, p0, m) { \
  m = MM_ABS(p3, p2); \
  m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
  m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
}

// Same as MAX_DIFF1, but accumulates into the value already held by m.
#define MAX_DIFF2(p3, p2, p1, p0, m) { \
  m = _mm_max_epu8(m, MM_ABS(p3, p2)); \
  m = _mm_max_epu8(m, MM_ABS(p2, p1)); \
  m = _mm_max_epu8(m, MM_ABS(p1, p0)); \
}

// Loads four consecutive 16-byte rows starting at p into e1..e4.
#define LOAD_H_EDGES4(p, stride, e1, e2, e3, e4) { \
  e1 = _mm_loadu_si128((__m128i*)&(p)[0 * stride]); \
  e2 = _mm_loadu_si128((__m128i*)&(p)[1 * stride]); \
  e3 = _mm_loadu_si128((__m128i*)&(p)[2 * stride]); \
  e4 = _mm_loadu_si128((__m128i*)&(p)[3 * stride]); \
}

// Loads 8 bytes from u[stride] into the low half of p and 8 bytes from
// v[stride] into the high half.
#define LOADUV_H_EDGE(p, u, v, stride) { \
  p = _mm_loadl_epi64((__m128i*)&(u)[(stride)]); \
  p = _mm_unpacklo_epi64(p, _mm_loadl_epi64((__m128i*)&(v)[(stride)])); \
}

// Loads four consecutive combined u/v rows into e1..e4.
#define LOADUV_H_EDGES4(u, v, stride, e1, e2, e3, e4) { \
  LOADUV_H_EDGE(e1, u, v, 0 * stride); \
  LOADUV_H_EDGE(e2, u, v, 1 * stride); \
  LOADUV_H_EDGE(e3, u, v, 2 * stride); \
  LOADUV_H_EDGE(e4, u, v, 3 * stride); \
}

// Stores the low half of p to u[stride] and the high half to v[stride].
// Note that p is modified (shifted right by 8 bytes) in the process.
#define STOREUV(p, u, v, stride) { \
  _mm_storel_epi64((__m128i*)&u[(stride)], p); \
  p = _mm_srli_si128(p, 8); \
  _mm_storel_epi64((__m128i*)&v[(stride)], p); \
}

// On entry, mask holds the max of the interior differences (from MAX_DIFF1 /
// MAX_DIFF2). On exit, each byte of mask is 0xff where that max is
// <= ithresh AND NeedsFilter() accepts the pixel for 'thresh'; 0x00 otherwise.
#define COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask) { \
  __m128i fl_yes; \
  const __m128i it = _mm_set1_epi8(ithresh); \
  mask = _mm_subs_epu8(mask, it); \
  mask = _mm_cmpeq_epi8(mask, _mm_setzero_si128()); \
  NeedsFilter(&p1, &p0, &q0, &q1, thresh, &fl_yes); \
  mask = _mm_and_si128(mask, fl_yes); \
}

// on macroblock edges
// Filters the horizontal edge just above row p, 16 pixels wide. Reads rows
// p - 4 * stride .. p + 3 * stride and rewrites the six rows p2..q2.
static void VFilter16SSE2(uint8_t* p, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  __m128i t1;
  __m128i mask;
  __m128i p2, p1, p0, q0, q1, q2;

  // Load p3, p2, p1, p0
  LOAD_H_EDGES4(p - 4 * stride, stride, t1, p2, p1, p0);
  MAX_DIFF1(t1, p2, p1, p0, mask);

  // Load q0, q1, q2, q3
  LOAD_H_EDGES4(p, stride, q0, q1, q2, t1);
  MAX_DIFF2(t1, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // Store
  _mm_storeu_si128((__m128i*)&p[-3 * stride], p2);
  _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
  _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
  _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
  _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
  _mm_storeu_si128((__m128i*)&p[2 * stride], q2);
}

// Filters the vertical edge just left of column p, 16 rows tall. Columns
// p - 4 .. p + 3 are loaded (transposed), filtered and all stored back.
static void HFilter16SSE2(uint8_t* p, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  uint8_t* const b = p - 4;
  Load16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0
  MAX_DIFF1(p3, p2, p1, p0, mask);

  Load16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
  MAX_DIFF2(q3, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  Store16x4(b, b + 8 * stride, stride, &p3, &p2, &p1, &p0);
  Store16x4(p, p + 8 * stride, stride, &q0, &q1, &q2, &q3);
}

// on three inner edges
// Each iteration loads the 4 rows above and the 4 rows below one inner
// horizontal edge and filters p1, p0, q0, q1 with DoFilter4.
static void VFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  int k;
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;

  for (k = 3; k > 0; --k) {
    // Load p3, p2, p1, p0
    LOAD_H_EDGES4(p, stride, t2, t1, p1, p0);
    MAX_DIFF1(t2, t1, p1, p0, mask);

    p += 4 * stride;

    // Load q0, q1, q2, q3
    LOAD_H_EDGES4(p, stride, q0, q1, t1, t2);
    MAX_DIFF2(t2, t1, q1, q0, mask);

    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

    // Store
    _mm_storeu_si128((__m128i*)&p[-2 * stride], p1);
    _mm_storeu_si128((__m128i*)&p[-1 * stride], p0);
    _mm_storeu_si128((__m128i*)&p[0 * stride], q0);
    _mm_storeu_si128((__m128i*)&p[1 * stride], q1);
  }
}

// Same as VFilter16iSSE2 for the three inner vertical edges; columns are
// loaded and stored through the transposing Load16x4 / Store16x4 helpers.
static void HFilter16iSSE2(uint8_t* p, int stride,
                           int thresh, int ithresh, int hev_thresh) {
  int k;
  uint8_t* b;
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;

  for (k = 3; k > 0; --k) {
    b = p;
    Load16x4(b, b + 8 * stride, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
    MAX_DIFF1(t2, t1, p1, p0, mask);

    b += 4; // beginning of q0
    Load16x4(b, b + 8 * stride, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
    MAX_DIFF2(t2, t1, q1, q0, mask);

    COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
    DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

    b -= 2; // beginning of p1
    Store16x4(b, b + 8 * stride, stride, &p1, &p0, &q0, &q1);

    p += 4;
  }
}

// 8-pixels wide variant, for chroma filtering
// The u row occupies the low half of each register and the v row the high
// half, so both planes are filtered in one pass.
static void VFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
                         int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, p2, p1, p0, q0, q1, q2;

  // Load p3, p2, p1, p0
  LOADUV_H_EDGES4(u - 4 * stride, v - 4 * stride, stride, t1, p2, p1, p0);
  MAX_DIFF1(t1, p2, p1, p0, mask);

  // Load q0, q1, q2, q3
  LOADUV_H_EDGES4(u, v, stride, q0, q1, q2, t1);
  MAX_DIFF2(t1, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  // Store
  STOREUV(p2, u, v, -3 * stride);
  STOREUV(p1, u, v, -2 * stride);
  STOREUV(p0, u, v, -1 * stride);
  STOREUV(q0, u, v, 0 * stride);
  STOREUV(q1, u, v, 1 * stride);
  STOREUV(q2, u, v, 2 * stride);
}

// Vertical-edge counterpart of VFilter8SSE2: Load16x4 takes its first 8 rows
// from the u plane and its last 8 rows from the v plane.
static void HFilter8SSE2(uint8_t* u, uint8_t* v, int stride,
                         int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i p3, p2, p1, p0, q0, q1, q2, q3;

  uint8_t* const tu = u - 4;
  uint8_t* const tv = v - 4;
  Load16x4(tu, tv, stride, &p3, &p2, &p1, &p0); // p3, p2, p1, p0
  MAX_DIFF1(p3, p2, p1, p0, mask);

  Load16x4(u, v, stride, &q0, &q1, &q2, &q3); // q0, q1, q2, q3
  MAX_DIFF2(q3, q2, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter6(&p2, &p1, &p0, &q0, &q1, &q2, &mask, hev_thresh);

  Store16x4(tu, tv, stride, &p3, &p2, &p1, &p0);
  Store16x4(u, v, stride, &q0, &q1, &q2, &q3);
}

// Filters the single inner horizontal chroma edge located 4 rows below u / v.
static void VFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;

  // Load p3, p2, p1, p0
  LOADUV_H_EDGES4(u, v, stride, t2, t1, p1, p0);
  MAX_DIFF1(t2, t1, p1, p0, mask);

  u += 4 * stride;
  v += 4 * stride;

  // Load q0, q1, q2, q3
  LOADUV_H_EDGES4(u, v, stride, q0, q1, t1, t2);
  MAX_DIFF2(t2, t1, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  // Store
  STOREUV(p1, u, v, -2 * stride);
  STOREUV(p0, u, v, -1 * stride);
  STOREUV(q0, u, v, 0 * stride);
  STOREUV(q1, u, v, 1 * stride);
}

// Filters the single inner vertical chroma edge located 4 columns right of
// u / v.
static void HFilter8iSSE2(uint8_t* u, uint8_t* v, int stride,
                          int thresh, int ithresh, int hev_thresh) {
  __m128i mask;
  __m128i t1, t2, p1, p0, q0, q1;
  Load16x4(u, v, stride, &t2, &t1, &p1, &p0); // p3, p2, p1, p0
  MAX_DIFF1(t2, t1, p1, p0, mask);

  u += 4; // beginning of q0
  v += 4;
  Load16x4(u, v, stride, &q0, &q1, &t1, &t2); // q0, q1, q2, q3
  MAX_DIFF2(t2, t1, q1, q0, mask);

  COMPLEX_FL_MASK(p1, p0, q0, q1, thresh, ithresh, mask);
  DoFilter4(&p1, &p0, &q0, &q1, &mask, hev_thresh);

  u -= 2; // beginning of p1
  v -= 2;
  Store16x4(u, v, stride, &p1, &p0, &q0, &q1);
}

extern void VP8DspInitSSE2(void);

// Points the decoder's transform and loop-filter function pointers at the
// SSE2 implementations defined in this file.
void VP8DspInitSSE2(void) {
  VP8Transform = TransformSSE2;

  VP8VFilter16 = VFilter16SSE2;
  VP8HFilter16 = HFilter16SSE2;
  VP8VFilter8 = VFilter8SSE2;
  VP8HFilter8 = HFilter8SSE2;
  VP8VFilter16i = VFilter16iSSE2;
  VP8HFilter16i = HFilter16iSSE2;
  VP8VFilter8i = VFilter8iSSE2;
  VP8HFilter8i = HFilter8iSSE2;

  VP8SimpleVFilter16 = SimpleVFilter16SSE2;
  VP8SimpleHFilter16 = SimpleHFilter16SSE2;
  VP8SimpleVFilter16i = SimpleVFilter16iSSE2;
  VP8SimpleHFilter16i = SimpleHFilter16iSSE2;
}

#if defined(__cplusplus) || defined(c_plusplus)
} // extern "C"
#endif

#endif //__SSE2__ || _MSC_VER
diff --git a/third_party/libwebp/dsp/dsp.h b/third_party/libwebp/dsp/dsp.h
new file mode 100644
index 0000000..aa6fd3d
--- /dev/null
+++ b/third_party/libwebp/dsp/dsp.h
@@ -0,0 +1,175 @@
// Copyright 2011 Google Inc.
//
// This code is licensed under the same terms as WebM:
// Software License Agreement: http://www.webmproject.org/license/software/
// Additional IP Rights Grant: http://www.webmproject.org/license/additional/
// -----------------------------------------------------------------------------
//
// Speed-critical functions.
//
// Author: Skal (pascal.massimino@gmail.com)

#ifndef WEBP_DSP_DSP_H_
#define WEBP_DSP_DSP_H_

#include "../webp/types.h"

#if defined(__cplusplus) || defined(c_plusplus)
extern "C" {
#endif

//------------------------------------------------------------------------------
// CPU detection

// Instruction-set extensions that can be queried through VP8GetCPUInfo.
typedef enum {
  kSSE2,
  kSSE3,
  kNEON
} CPUFeature;
// returns true if the CPU supports the feature.
typedef int (*VP8CPUInfo)(CPUFeature feature);
extern VP8CPUInfo VP8GetCPUInfo;

//------------------------------------------------------------------------------
// Encoding

int VP8GetAlpha(const int histo[]);

// Transforms
// VP8Idct: Does one or two inverse transforms. If do_two is set, the transforms
//          will be done for (ref, in, dst) and (ref + 4, in + 16, dst + 4).
typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                        int do_two);
typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out);
typedef void (*VP8WHT)(const int16_t* in, int16_t* out);
extern VP8Idct VP8ITransform;
extern VP8Fdct VP8FTransform;
extern VP8WHT VP8ITransformWHT;
extern VP8WHT VP8FTransformWHT;
// Predictions
// *dst is the destination block. *top and *left can be NULL.
+typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left, + const uint8_t* top); +typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top); +extern VP8Intra4Preds VP8EncPredLuma4; +extern VP8IntraPreds VP8EncPredLuma16; +extern VP8IntraPreds VP8EncPredChroma8; + +typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref); +extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4; +typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref, + const uint16_t* const weights); +extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16; + +typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst); +extern VP8BlockCopy VP8Copy4x4; +extern VP8BlockCopy VP8Copy8x8; +extern VP8BlockCopy VP8Copy16x16; +// Quantization +struct VP8Matrix; // forward declaration +typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16], + int n, const struct VP8Matrix* const mtx); +extern VP8QuantizeBlock VP8EncQuantizeBlock; + +// Compute susceptibility based on DCT-coeff histograms: +// the higher, the "easier" the macroblock is to compress. +typedef int (*VP8CHisto)(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block); +extern const int VP8DspScan[16 + 4 + 4]; +extern VP8CHisto VP8CollectHistogram; + +void VP8EncDspInit(void); // must be called before using any of the above + +//------------------------------------------------------------------------------ +// Decoding + +typedef void (*VP8DecIdct)(const int16_t* coeffs, uint8_t* dst); +// when doing two transforms, coeffs is actually int16_t[2][16]. +typedef void (*VP8DecIdct2)(const int16_t* coeffs, uint8_t* dst, int do_two); +extern VP8DecIdct2 VP8Transform; +extern VP8DecIdct VP8TransformUV; +extern VP8DecIdct VP8TransformDC; +extern VP8DecIdct VP8TransformDCUV; +extern void (*VP8TransformWHT)(const int16_t* in, int16_t* out); + +// *dst is the destination block, with stride BPS. Boundary samples are +// assumed accessible when needed. 
+typedef void (*VP8PredFunc)(uint8_t* dst); +extern VP8PredFunc VP8PredLuma16[/* NUM_B_DC_MODES */]; +extern VP8PredFunc VP8PredChroma8[/* NUM_B_DC_MODES */]; +extern VP8PredFunc VP8PredLuma4[/* NUM_BMODES */]; + +// simple filter (only for luma) +typedef void (*VP8SimpleFilterFunc)(uint8_t* p, int stride, int thresh); +extern VP8SimpleFilterFunc VP8SimpleVFilter16; +extern VP8SimpleFilterFunc VP8SimpleHFilter16; +extern VP8SimpleFilterFunc VP8SimpleVFilter16i; // filter 3 inner edges +extern VP8SimpleFilterFunc VP8SimpleHFilter16i; + +// regular filter (on both macroblock edges and inner edges) +typedef void (*VP8LumaFilterFunc)(uint8_t* luma, int stride, + int thresh, int ithresh, int hev_t); +typedef void (*VP8ChromaFilterFunc)(uint8_t* u, uint8_t* v, int stride, + int thresh, int ithresh, int hev_t); +// on outer edge +extern VP8LumaFilterFunc VP8VFilter16; +extern VP8LumaFilterFunc VP8HFilter16; +extern VP8ChromaFilterFunc VP8VFilter8; +extern VP8ChromaFilterFunc VP8HFilter8; + +// on inner edge +extern VP8LumaFilterFunc VP8VFilter16i; // filtering 3 inner edges altogether +extern VP8LumaFilterFunc VP8HFilter16i; +extern VP8ChromaFilterFunc VP8VFilter8i; // filtering u and v altogether +extern VP8ChromaFilterFunc VP8HFilter8i; + +// must be called before anything using the above +extern void VP8DspInit(void); + +//------------------------------------------------------------------------------ +// WebP I/O + +#define FANCY_UPSAMPLING // undefined to remove fancy upsampling support + +#ifdef FANCY_UPSAMPLING +typedef void (*WebPUpsampleLinePairFunc)( + const uint8_t* top_y, const uint8_t* bottom_y, + const uint8_t* top_u, const uint8_t* top_v, + const uint8_t* cur_u, const uint8_t* cur_v, + uint8_t* top_dst, uint8_t* bottom_dst, int len); + + +// Fancy upsampling functions to convert YUV to RGB(A) modes +extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; +extern WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[/* MODE_LAST */]; + +// Initializes 
SSE2 version of the fancy upsamplers. +void WebPInitUpsamplersSSE2(void); + +#endif // FANCY_UPSAMPLING + +// Point-sampling methods. +typedef void (*WebPSampleLinePairFunc)( + const uint8_t* top_y, const uint8_t* bottom_y, + const uint8_t* u, const uint8_t* v, + uint8_t* top_dst, uint8_t* bottom_dst, int len); + +extern const WebPSampleLinePairFunc WebPSamplers[/* MODE_LAST */]; + +// YUV444->RGB converters +typedef void (*WebPYUV444Converter)(const uint8_t* y, + const uint8_t* u, const uint8_t* v, + uint8_t* dst, int len); + +extern const WebPYUV444Converter WebPYUV444Converters[/* MODE_LAST */]; + +// Main function to be called +void WebPInitUpsamplers(void); + +//------------------------------------------------------------------------------ + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif + +#endif /* WEBP_DSP_DSP_H_ */ diff --git a/third_party/libwebp/dsp/enc.c b/third_party/libwebp/dsp/enc.c new file mode 100644 index 0000000..11eea7c --- /dev/null +++ b/third_party/libwebp/dsp/enc.c @@ -0,0 +1,744 @@ +// Copyright 2011 Google Inc. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Speed-critical encoding functions. +// +// Author: Skal (pascal.massimino@gmail.com) + +#include "../enc/vp8enci.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +//------------------------------------------------------------------------------ +// Compute susceptibility based on DCT-coeff histograms: +// the higher, the "easier" the macroblock is to compress. + +static int ClipAlpha(int alpha) { + return alpha < 0 ? 0 : alpha > 255 ? 
255 : alpha; +} + +int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) { + int num = 0, den = 0, val = 0; + int k; + int alpha; + // note: changing this loop to avoid the numerous "k + 1" slows things down. + for (k = 0; k < MAX_COEFF_THRESH; ++k) { + if (histo[k + 1]) { + val += histo[k + 1]; + num += val * (k + 1); + den += (k + 1) * (k + 1); + } + } + // we scale the value to a usable [0..255] range + alpha = den ? 10 * num / den - 5 : 0; + return ClipAlpha(alpha); +} + +const int VP8DspScan[16 + 4 + 4] = { + // Luma + 0 + 0 * BPS, 4 + 0 * BPS, 8 + 0 * BPS, 12 + 0 * BPS, + 0 + 4 * BPS, 4 + 4 * BPS, 8 + 4 * BPS, 12 + 4 * BPS, + 0 + 8 * BPS, 4 + 8 * BPS, 8 + 8 * BPS, 12 + 8 * BPS, + 0 + 12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS, + + 0 + 0 * BPS, 4 + 0 * BPS, 0 + 4 * BPS, 4 + 4 * BPS, // U + 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V +}; + +static int CollectHistogram(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block) { + int histo[MAX_COEFF_THRESH + 1] = { 0 }; + int16_t out[16]; + int j, k; + for (j = start_block; j < end_block; ++j) { + VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); + + // Convert coefficients to bin (within out[]). + for (k = 0; k < 16; ++k) { + const int v = abs(out[k]) >> 2; + out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v; + } + + // Use bin to update histogram. + for (k = 0; k < 16; ++k) { + histo[out[k]]++; + } + } + + return VP8GetAlpha(histo); +} + +//------------------------------------------------------------------------------ +// run-time tables (~4k) + +static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255] + +// We declare this variable 'volatile' to prevent instruction reordering +// and make sure it's set to true _last_ (so as to be thread-safe) +static volatile int tables_ok = 0; + +static void InitTables(void) { + if (!tables_ok) { + int i; + for (i = -255; i <= 255 + 255; ++i) { + clip1[255 + i] = (i < 0) ? 0 : (i > 255) ? 
255 : i; + } + tables_ok = 1; + } +} + +static inline uint8_t clip_8b(int v) { + return (!(v & ~0xff)) ? v : v < 0 ? 0 : 255; +} + +//------------------------------------------------------------------------------ +// Transforms (Paragraph 14.4) + +#define STORE(x, y, v) \ + dst[(x) + (y) * BPS] = clip_8b(ref[(x) + (y) * BPS] + ((v) >> 3)) + +static const int kC1 = 20091 + (1 << 16); +static const int kC2 = 35468; +#define MUL(a, b) (((a) * (b)) >> 16) + +static inline void ITransformOne(const uint8_t* ref, const int16_t* in, + uint8_t* dst) { + int C[4 * 4], *tmp; + int i; + tmp = C; + for (i = 0; i < 4; ++i) { // vertical pass + const int a = in[0] + in[8]; + const int b = in[0] - in[8]; + const int c = MUL(in[4], kC2) - MUL(in[12], kC1); + const int d = MUL(in[4], kC1) + MUL(in[12], kC2); + tmp[0] = a + d; + tmp[1] = b + c; + tmp[2] = b - c; + tmp[3] = a - d; + tmp += 4; + in++; + } + + tmp = C; + for (i = 0; i < 4; ++i) { // horizontal pass + const int dc = tmp[0] + 4; + const int a = dc + tmp[8]; + const int b = dc - tmp[8]; + const int c = MUL(tmp[4], kC2) - MUL(tmp[12], kC1); + const int d = MUL(tmp[4], kC1) + MUL(tmp[12], kC2); + STORE(0, i, a + d); + STORE(1, i, b + c); + STORE(2, i, b - c); + STORE(3, i, a - d); + tmp++; + } +} + +static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { + ITransformOne(ref, in, dst); + if (do_two) { + ITransformOne(ref + 4, in + 16, dst + 4); + } +} + +static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { + int i; + int tmp[16]; + for (i = 0; i < 4; ++i, src += BPS, ref += BPS) { + const int d0 = src[0] - ref[0]; + const int d1 = src[1] - ref[1]; + const int d2 = src[2] - ref[2]; + const int d3 = src[3] - ref[3]; + const int a0 = (d0 + d3) << 3; + const int a1 = (d1 + d2) << 3; + const int a2 = (d1 - d2) << 3; + const int a3 = (d0 - d3) << 3; + tmp[0 + i * 4] = (a0 + a1); + tmp[1 + i * 4] = (a2 * 2217 + a3 * 5352 + 14500) >> 12; + tmp[2 + i * 4] = (a0 - a1); + 
tmp[3 + i * 4] = (a3 * 2217 - a2 * 5352 + 7500) >> 12; + } + for (i = 0; i < 4; ++i) { + const int a0 = (tmp[0 + i] + tmp[12 + i]); + const int a1 = (tmp[4 + i] + tmp[ 8 + i]); + const int a2 = (tmp[4 + i] - tmp[ 8 + i]); + const int a3 = (tmp[0 + i] - tmp[12 + i]); + out[0 + i] = (a0 + a1 + 7) >> 4; + out[4 + i] = ((a2 * 2217 + a3 * 5352 + 12000) >> 16) + (a3 != 0); + out[8 + i] = (a0 - a1 + 7) >> 4; + out[12+ i] = ((a3 * 2217 - a2 * 5352 + 51000) >> 16); + } +} + +static void ITransformWHT(const int16_t* in, int16_t* out) { + int tmp[16]; + int i; + for (i = 0; i < 4; ++i) { + const int a0 = in[0 + i] + in[12 + i]; + const int a1 = in[4 + i] + in[ 8 + i]; + const int a2 = in[4 + i] - in[ 8 + i]; + const int a3 = in[0 + i] - in[12 + i]; + tmp[0 + i] = a0 + a1; + tmp[8 + i] = a0 - a1; + tmp[4 + i] = a3 + a2; + tmp[12 + i] = a3 - a2; + } + for (i = 0; i < 4; ++i) { + const int dc = tmp[0 + i * 4] + 3; // w/ rounder + const int a0 = dc + tmp[3 + i * 4]; + const int a1 = tmp[1 + i * 4] + tmp[2 + i * 4]; + const int a2 = tmp[1 + i * 4] - tmp[2 + i * 4]; + const int a3 = dc - tmp[3 + i * 4]; + out[ 0] = (a0 + a1) >> 3; + out[16] = (a3 + a2) >> 3; + out[32] = (a0 - a1) >> 3; + out[48] = (a3 - a2) >> 3; + out += 64; + } +} + +static void FTransformWHT(const int16_t* in, int16_t* out) { + int tmp[16]; + int i; + for (i = 0; i < 4; ++i, in += 64) { + const int a0 = (in[0 * 16] + in[2 * 16]) << 2; + const int a1 = (in[1 * 16] + in[3 * 16]) << 2; + const int a2 = (in[1 * 16] - in[3 * 16]) << 2; + const int a3 = (in[0 * 16] - in[2 * 16]) << 2; + tmp[0 + i * 4] = (a0 + a1) + (a0 != 0); + tmp[1 + i * 4] = a3 + a2; + tmp[2 + i * 4] = a3 - a2; + tmp[3 + i * 4] = a0 - a1; + } + for (i = 0; i < 4; ++i) { + const int a0 = (tmp[0 + i] + tmp[8 + i]); + const int a1 = (tmp[4 + i] + tmp[12+ i]); + const int a2 = (tmp[4 + i] - tmp[12+ i]); + const int a3 = (tmp[0 + i] - tmp[8 + i]); + const int b0 = a0 + a1; + const int b1 = a3 + a2; + const int b2 = a3 - a2; + const int b3 = a0 - a1; + 
out[ 0 + i] = (b0 + (b0 > 0) + 3) >> 3; + out[ 4 + i] = (b1 + (b1 > 0) + 3) >> 3; + out[ 8 + i] = (b2 + (b2 > 0) + 3) >> 3; + out[12 + i] = (b3 + (b3 > 0) + 3) >> 3; + } +} + +#undef MUL +#undef STORE + +//------------------------------------------------------------------------------ +// Intra predictions + +#define DST(x, y) dst[(x) + (y) * BPS] + +static inline void Fill(uint8_t* dst, int value, int size) { + int j; + for (j = 0; j < size; ++j) { + memset(dst + j * BPS, value, size); + } +} + +static inline void VerticalPred(uint8_t* dst, const uint8_t* top, int size) { + int j; + if (top) { + for (j = 0; j < size; ++j) memcpy(dst + j * BPS, top, size); + } else { + Fill(dst, 127, size); + } +} + +static inline void HorizontalPred(uint8_t* dst, const uint8_t* left, int size) { + if (left) { + int j; + for (j = 0; j < size; ++j) { + memset(dst + j * BPS, left[j], size); + } + } else { + Fill(dst, 129, size); + } +} + +static inline void TrueMotion(uint8_t* dst, const uint8_t* left, + const uint8_t* top, int size) { + int y; + if (left) { + if (top) { + const uint8_t* const clip = clip1 + 255 - left[-1]; + for (y = 0; y < size; ++y) { + const uint8_t* const clip_table = clip + left[y]; + int x; + for (x = 0; x < size; ++x) { + dst[x] = clip_table[top[x]]; + } + dst += BPS; + } + } else { + HorizontalPred(dst, left, size); + } + } else { + // true motion without left samples (hence: with default 129 value) + // is equivalent to VE prediction where you just copy the top samples. + // Note that if top samples are not available, the default value is + // then 129, and not 127 as in the VerticalPred case. 
+ if (top) { + VerticalPred(dst, top, size); + } else { + Fill(dst, 129, size); + } + } +} + +static inline void DCMode(uint8_t* dst, const uint8_t* left, + const uint8_t* top, + int size, int round, int shift) { + int DC = 0; + int j; + if (top) { + for (j = 0; j < size; ++j) DC += top[j]; + if (left) { // top and left present + for (j = 0; j < size; ++j) DC += left[j]; + } else { // top, but no left + DC += DC; + } + DC = (DC + round) >> shift; + } else if (left) { // left but no top + for (j = 0; j < size; ++j) DC += left[j]; + DC += DC; + DC = (DC + round) >> shift; + } else { // no top, no left, nothing. + DC = 0x80; + } + Fill(dst, DC, size); +} + +//------------------------------------------------------------------------------ +// Chroma 8x8 prediction (paragraph 12.2) + +static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, + const uint8_t* top) { + // U block + DCMode(C8DC8 + dst, left, top, 8, 8, 4); + VerticalPred(C8VE8 + dst, top, 8); + HorizontalPred(C8HE8 + dst, left, 8); + TrueMotion(C8TM8 + dst, left, top, 8); + // V block + dst += 8; + if (top) top += 8; + if (left) left += 16; + DCMode(C8DC8 + dst, left, top, 8, 8, 4); + VerticalPred(C8VE8 + dst, top, 8); + HorizontalPred(C8HE8 + dst, left, 8); + TrueMotion(C8TM8 + dst, left, top, 8); +} + +//------------------------------------------------------------------------------ +// luma 16x16 prediction (paragraph 12.3) + +static void Intra16Preds(uint8_t* dst, + const uint8_t* left, const uint8_t* top) { + DCMode(I16DC16 + dst, left, top, 16, 16, 5); + VerticalPred(I16VE16 + dst, top, 16); + HorizontalPred(I16HE16 + dst, left, 16); + TrueMotion(I16TM16 + dst, left, top, 16); +} + +//------------------------------------------------------------------------------ +// luma 4x4 prediction + +#define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) +#define AVG2(a, b) (((a) + (b) + 1) >> 1) + +static void VE4(uint8_t* dst, const uint8_t* top) { // vertical + const uint8_t vals[4] = { + AVG3(top[-1], 
top[0], top[1]), + AVG3(top[ 0], top[1], top[2]), + AVG3(top[ 1], top[2], top[3]), + AVG3(top[ 2], top[3], top[4]) + }; + int i; + for (i = 0; i < 4; ++i) { + memcpy(dst + i * BPS, vals, 4); + } +} + +static void HE4(uint8_t* dst, const uint8_t* top) { // horizontal + const int X = top[-1]; + const int I = top[-2]; + const int J = top[-3]; + const int K = top[-4]; + const int L = top[-5]; + *(uint32_t*)(dst + 0 * BPS) = 0x01010101U * AVG3(X, I, J); + *(uint32_t*)(dst + 1 * BPS) = 0x01010101U * AVG3(I, J, K); + *(uint32_t*)(dst + 2 * BPS) = 0x01010101U * AVG3(J, K, L); + *(uint32_t*)(dst + 3 * BPS) = 0x01010101U * AVG3(K, L, L); +} + +static void DC4(uint8_t* dst, const uint8_t* top) { + uint32_t dc = 4; + int i; + for (i = 0; i < 4; ++i) dc += top[i] + top[-5 + i]; + Fill(dst, dc >> 3, 4); +} + +static void RD4(uint8_t* dst, const uint8_t* top) { + const int X = top[-1]; + const int I = top[-2]; + const int J = top[-3]; + const int K = top[-4]; + const int L = top[-5]; + const int A = top[0]; + const int B = top[1]; + const int C = top[2]; + const int D = top[3]; + DST(0, 3) = AVG3(J, K, L); + DST(0, 2) = DST(1, 3) = AVG3(I, J, K); + DST(0, 1) = DST(1, 2) = DST(2, 3) = AVG3(X, I, J); + DST(0, 0) = DST(1, 1) = DST(2, 2) = DST(3, 3) = AVG3(A, X, I); + DST(1, 0) = DST(2, 1) = DST(3, 2) = AVG3(B, A, X); + DST(2, 0) = DST(3, 1) = AVG3(C, B, A); + DST(3, 0) = AVG3(D, C, B); +} + +static void LD4(uint8_t* dst, const uint8_t* top) { + const int A = top[0]; + const int B = top[1]; + const int C = top[2]; + const int D = top[3]; + const int E = top[4]; + const int F = top[5]; + const int G = top[6]; + const int H = top[7]; + DST(0, 0) = AVG3(A, B, C); + DST(1, 0) = DST(0, 1) = AVG3(B, C, D); + DST(2, 0) = DST(1, 1) = DST(0, 2) = AVG3(C, D, E); + DST(3, 0) = DST(2, 1) = DST(1, 2) = DST(0, 3) = AVG3(D, E, F); + DST(3, 1) = DST(2, 2) = DST(1, 3) = AVG3(E, F, G); + DST(3, 2) = DST(2, 3) = AVG3(F, G, H); + DST(3, 3) = AVG3(G, H, H); +} + +static void VR4(uint8_t* dst, const 
uint8_t* top) { + const int X = top[-1]; + const int I = top[-2]; + const int J = top[-3]; + const int K = top[-4]; + const int A = top[0]; + const int B = top[1]; + const int C = top[2]; + const int D = top[3]; + DST(0, 0) = DST(1, 2) = AVG2(X, A); + DST(1, 0) = DST(2, 2) = AVG2(A, B); + DST(2, 0) = DST(3, 2) = AVG2(B, C); + DST(3, 0) = AVG2(C, D); + + DST(0, 3) = AVG3(K, J, I); + DST(0, 2) = AVG3(J, I, X); + DST(0, 1) = DST(1, 3) = AVG3(I, X, A); + DST(1, 1) = DST(2, 3) = AVG3(X, A, B); + DST(2, 1) = DST(3, 3) = AVG3(A, B, C); + DST(3, 1) = AVG3(B, C, D); +} + +static void VL4(uint8_t* dst, const uint8_t* top) { + const int A = top[0]; + const int B = top[1]; + const int C = top[2]; + const int D = top[3]; + const int E = top[4]; + const int F = top[5]; + const int G = top[6]; + const int H = top[7]; + DST(0, 0) = AVG2(A, B); + DST(1, 0) = DST(0, 2) = AVG2(B, C); + DST(2, 0) = DST(1, 2) = AVG2(C, D); + DST(3, 0) = DST(2, 2) = AVG2(D, E); + + DST(0, 1) = AVG3(A, B, C); + DST(1, 1) = DST(0, 3) = AVG3(B, C, D); + DST(2, 1) = DST(1, 3) = AVG3(C, D, E); + DST(3, 1) = DST(2, 3) = AVG3(D, E, F); + DST(3, 2) = AVG3(E, F, G); + DST(3, 3) = AVG3(F, G, H); +} + +static void HU4(uint8_t* dst, const uint8_t* top) { + const int I = top[-2]; + const int J = top[-3]; + const int K = top[-4]; + const int L = top[-5]; + DST(0, 0) = AVG2(I, J); + DST(2, 0) = DST(0, 1) = AVG2(J, K); + DST(2, 1) = DST(0, 2) = AVG2(K, L); + DST(1, 0) = AVG3(I, J, K); + DST(3, 0) = DST(1, 1) = AVG3(J, K, L); + DST(3, 1) = DST(1, 2) = AVG3(K, L, L); + DST(3, 2) = DST(2, 2) = + DST(0, 3) = DST(1, 3) = DST(2, 3) = DST(3, 3) = L; +} + +static void HD4(uint8_t* dst, const uint8_t* top) { + const int X = top[-1]; + const int I = top[-2]; + const int J = top[-3]; + const int K = top[-4]; + const int L = top[-5]; + const int A = top[0]; + const int B = top[1]; + const int C = top[2]; + + DST(0, 0) = DST(2, 1) = AVG2(I, X); + DST(0, 1) = DST(2, 2) = AVG2(J, I); + DST(0, 2) = DST(2, 3) = AVG2(K, J); + DST(0, 3) 
= AVG2(L, K); + + DST(3, 0) = AVG3(A, B, C); + DST(2, 0) = AVG3(X, A, B); + DST(1, 0) = DST(3, 1) = AVG3(I, X, A); + DST(1, 1) = DST(3, 2) = AVG3(J, I, X); + DST(1, 2) = DST(3, 3) = AVG3(K, J, I); + DST(1, 3) = AVG3(L, K, J); +} + +static void TM4(uint8_t* dst, const uint8_t* top) { + int x, y; + const uint8_t* const clip = clip1 + 255 - top[-1]; + for (y = 0; y < 4; ++y) { + const uint8_t* const clip_table = clip + top[-2 - y]; + for (x = 0; x < 4; ++x) { + dst[x] = clip_table[top[x]]; + } + dst += BPS; + } +} + +#undef DST +#undef AVG3 +#undef AVG2 + +// Left samples are top[-5 .. -2], top_left is top[-1], top are +// located at top[0..3], and top right is top[4..7] +static void Intra4Preds(uint8_t* dst, const uint8_t* top) { + DC4(I4DC4 + dst, top); + TM4(I4TM4 + dst, top); + VE4(I4VE4 + dst, top); + HE4(I4HE4 + dst, top); + RD4(I4RD4 + dst, top); + VR4(I4VR4 + dst, top); + LD4(I4LD4 + dst, top); + VL4(I4VL4 + dst, top); + HD4(I4HD4 + dst, top); + HU4(I4HU4 + dst, top); +} + +//------------------------------------------------------------------------------ +// Metric + +static inline int GetSSE(const uint8_t* a, const uint8_t* b, int w, int h) { + int count = 0; + int y, x; + for (y = 0; y < h; ++y) { + for (x = 0; x < w; ++x) { + const int diff = (int)a[x] - b[x]; + count += diff * diff; + } + a += BPS; + b += BPS; + } + return count; +} + +static int SSE16x16(const uint8_t* a, const uint8_t* b) { + return GetSSE(a, b, 16, 16); +} +static int SSE16x8(const uint8_t* a, const uint8_t* b) { + return GetSSE(a, b, 16, 8); +} +static int SSE8x8(const uint8_t* a, const uint8_t* b) { + return GetSSE(a, b, 8, 8); +} +static int SSE4x4(const uint8_t* a, const uint8_t* b) { + return GetSSE(a, b, 4, 4); +} + +//------------------------------------------------------------------------------ +// Texture distortion +// +// We try to match the spectral content (weighted) between source and +// reconstructed samples. 
+ +// Hadamard transform +// Returns the weighted sum of the absolute value of transformed coefficients. +static int TTransform(const uint8_t* in, const uint16_t* w) { + int sum = 0; + int tmp[16]; + int i; + // horizontal pass + for (i = 0; i < 4; ++i, in += BPS) { + const int a0 = (in[0] + in[2]) << 2; + const int a1 = (in[1] + in[3]) << 2; + const int a2 = (in[1] - in[3]) << 2; + const int a3 = (in[0] - in[2]) << 2; + tmp[0 + i * 4] = a0 + a1 + (a0 != 0); + tmp[1 + i * 4] = a3 + a2; + tmp[2 + i * 4] = a3 - a2; + tmp[3 + i * 4] = a0 - a1; + } + // vertical pass + for (i = 0; i < 4; ++i, ++w) { + const int a0 = (tmp[0 + i] + tmp[8 + i]); + const int a1 = (tmp[4 + i] + tmp[12+ i]); + const int a2 = (tmp[4 + i] - tmp[12+ i]); + const int a3 = (tmp[0 + i] - tmp[8 + i]); + const int b0 = a0 + a1; + const int b1 = a3 + a2; + const int b2 = a3 - a2; + const int b3 = a0 - a1; + // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3 + sum += w[ 0] * ((abs(b0) + 3) >> 3); + sum += w[ 4] * ((abs(b1) + 3) >> 3); + sum += w[ 8] * ((abs(b2) + 3) >> 3); + sum += w[12] * ((abs(b3) + 3) >> 3); + } + return sum; +} + +static int Disto4x4(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + const int sum1 = TTransform(a, w); + const int sum2 = TTransform(b, w); + return (abs(sum2 - sum1) + 8) >> 4; +} + +static int Disto16x16(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + int D = 0; + int x, y; + for (y = 0; y < 16 * BPS; y += 4 * BPS) { + for (x = 0; x < 16; x += 4) { + D += Disto4x4(a + x + y, b + x + y, w); + } + } + return D; +} + +//------------------------------------------------------------------------------ +// Quantization +// + +static const uint8_t kZigzag[16] = { + 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 +}; + +// Simple quantization +static int QuantizeBlock(int16_t in[16], int16_t out[16], + int n, const VP8Matrix* const mtx) { + int last = -1; + for (; n < 16; ++n) { + const int j = kZigzag[n]; + const 
int sign = (in[j] < 0); + int coeff = (sign ? -in[j] : in[j]) + mtx->sharpen_[j]; + if (coeff > 2047) coeff = 2047; + if (coeff > mtx->zthresh_[j]) { + const int Q = mtx->q_[j]; + const int iQ = mtx->iq_[j]; + const int B = mtx->bias_[j]; + out[n] = QUANTDIV(coeff, iQ, B); + if (sign) out[n] = -out[n]; + in[j] = out[n] * Q; + if (out[n]) last = n; + } else { + out[n] = 0; + in[j] = 0; + } + } + return (last >= 0); +} + +//------------------------------------------------------------------------------ +// Block copy + +static inline void Copy(const uint8_t* src, uint8_t* dst, int size) { + int y; + for (y = 0; y < size; ++y) { + memcpy(dst, src, size); + src += BPS; + dst += BPS; + } +} + +static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); } +static void Copy8x8(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 8); } +static void Copy16x16(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 16); } + +//------------------------------------------------------------------------------ +// Initialization + +// Speed-critical function pointers. We have to initialize them to the default +// implementations within VP8EncDspInit(). 
+VP8CHisto VP8CollectHistogram; +VP8Idct VP8ITransform; +VP8Fdct VP8FTransform; +VP8WHT VP8ITransformWHT; +VP8WHT VP8FTransformWHT; +VP8Intra4Preds VP8EncPredLuma4; +VP8IntraPreds VP8EncPredLuma16; +VP8IntraPreds VP8EncPredChroma8; +VP8Metric VP8SSE16x16; +VP8Metric VP8SSE8x8; +VP8Metric VP8SSE16x8; +VP8Metric VP8SSE4x4; +VP8WMetric VP8TDisto4x4; +VP8WMetric VP8TDisto16x16; +VP8QuantizeBlock VP8EncQuantizeBlock; +VP8BlockCopy VP8Copy4x4; +VP8BlockCopy VP8Copy8x8; +VP8BlockCopy VP8Copy16x16; + +extern void VP8EncDspInitSSE2(void); + +void VP8EncDspInit(void) { + InitTables(); + + // default C implementations + VP8CollectHistogram = CollectHistogram; + VP8ITransform = ITransform; + VP8FTransform = FTransform; + VP8ITransformWHT = ITransformWHT; + VP8FTransformWHT = FTransformWHT; + VP8EncPredLuma4 = Intra4Preds; + VP8EncPredLuma16 = Intra16Preds; + VP8EncPredChroma8 = IntraChromaPreds; + VP8SSE16x16 = SSE16x16; + VP8SSE8x8 = SSE8x8; + VP8SSE16x8 = SSE16x8; + VP8SSE4x4 = SSE4x4; + VP8TDisto4x4 = Disto4x4; + VP8TDisto16x16 = Disto16x16; + VP8EncQuantizeBlock = QuantizeBlock; + VP8Copy4x4 = Copy4x4; + VP8Copy8x8 = Copy8x8; + VP8Copy16x16 = Copy16x16; + + // If defined, use CPUInfo() to overwrite some pointers with faster versions. + if (VP8GetCPUInfo) { +#if defined(__SSE2__) || defined(_MSC_VER) + if (VP8GetCPUInfo(kSSE2)) { + VP8EncDspInitSSE2(); + } +#endif + } +} + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif diff --git a/third_party/libwebp/dsp/enc_sse2.c b/third_party/libwebp/dsp/enc_sse2.c new file mode 100644 index 0000000..fac8000 --- /dev/null +++ b/third_party/libwebp/dsp/enc_sse2.c @@ -0,0 +1,834 @@ +// Copyright 2011 Google Inc. 
+// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// SSE2 version of speed-critical encoding functions. +// +// Author: Christian Duvivier (cduvivier@google.com) + +#if defined(__SSE2__) || defined(_MSC_VER) +#include <emmintrin.h> + +#include "../enc/vp8enci.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +//------------------------------------------------------------------------------ +// Compute susceptibility based on DCT-coeff histograms: +// the higher, the "easier" the macroblock is to compress. + +static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block) { + int histo[MAX_COEFF_THRESH + 1] = { 0 }; + int16_t out[16]; + int j, k; + const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); + for (j = start_block; j < end_block; ++j) { + VP8FTransform(ref + VP8DspScan[j], pred + VP8DspScan[j], out); + + // Convert coefficients to bin (within out[]). + { + // Load. 
+ const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]); + const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]); + // sign(out) = out >> 15 (0x0000 if positive, 0xffff if negative) + const __m128i sign0 = _mm_srai_epi16(out0, 15); + const __m128i sign1 = _mm_srai_epi16(out1, 15); + // abs(out) = (out ^ sign) - sign + const __m128i xor0 = _mm_xor_si128(out0, sign0); + const __m128i xor1 = _mm_xor_si128(out1, sign1); + const __m128i abs0 = _mm_sub_epi16(xor0, sign0); + const __m128i abs1 = _mm_sub_epi16(xor1, sign1); + // v = abs(out) >> 2 + const __m128i v0 = _mm_srai_epi16(abs0, 2); + const __m128i v1 = _mm_srai_epi16(abs1, 2); + // bin = min(v, MAX_COEFF_THRESH) + const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh); + const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh); + // Store. + _mm_storeu_si128((__m128i*)&out[0], bin0); + _mm_storeu_si128((__m128i*)&out[8], bin1); + } + + // Use bin to update histogram. + for (k = 0; k < 16; ++k) { + histo[out[k]]++; + } + } + + return VP8GetAlpha(histo); +} + +//------------------------------------------------------------------------------ +// Transforms (Paragraph 14.4) + +// Does one or two inverse transforms. 
+static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { + // This implementation makes use of 16-bit fixed point versions of two + // multiply constants: + // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 + // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 + // + // To be able to use signed 16-bit integers, we use the following trick to + // have constants within range: + // - Associated constants are obtained by subtracting the 16-bit fixed point + // version of one: + // k = K - (1 << 16) => K = k + (1 << 16) + // K1 = 85627 => k1 = 20091 + // K2 = 35468 => k2 = -30068 + // - The multiplication of a variable by a constant becomes the sum of the + // variable and the multiplication of that variable by the associated + // constant: + // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x + const __m128i k1 = _mm_set1_epi16(20091); + const __m128i k2 = _mm_set1_epi16(-30068); + __m128i T0, T1, T2, T3; + + // Load and concatenate the transform coefficients (we'll do two inverse + // transforms in parallel). In the case of only one inverse transform, the + // second half of the vectors will just contain random values we'll never + // use nor store. 
+ __m128i in0, in1, in2, in3; + { + in0 = _mm_loadl_epi64((__m128i*)&in[0]); + in1 = _mm_loadl_epi64((__m128i*)&in[4]); + in2 = _mm_loadl_epi64((__m128i*)&in[8]); + in3 = _mm_loadl_epi64((__m128i*)&in[12]); + // a00 a10 a20 a30 x x x x + // a01 a11 a21 a31 x x x x + // a02 a12 a22 a32 x x x x + // a03 a13 a23 a33 x x x x + if (do_two) { + const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]); + const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]); + const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]); + const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]); + in0 = _mm_unpacklo_epi64(in0, inB0); + in1 = _mm_unpacklo_epi64(in1, inB1); + in2 = _mm_unpacklo_epi64(in2, inB2); + in3 = _mm_unpacklo_epi64(in3, inB3); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + } + + // Vertical pass and subsequent transpose. + { + // First pass, c and d calculations are longer because of the "trick" + // multiplications. + const __m128i a = _mm_add_epi16(in0, in2); + const __m128i b = _mm_sub_epi16(in0, in2); + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + const __m128i c1 = _mm_mulhi_epi16(in1, k2); + const __m128i c2 = _mm_mulhi_epi16(in3, k1); + const __m128i c3 = _mm_sub_epi16(in1, in3); + const __m128i c4 = _mm_sub_epi16(c1, c2); + const __m128i c = _mm_add_epi16(c3, c4); + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + const __m128i d1 = _mm_mulhi_epi16(in1, k1); + const __m128i d2 = _mm_mulhi_epi16(in3, k2); + const __m128i d3 = _mm_add_epi16(in1, in3); + const __m128i d4 = _mm_add_epi16(d1, d2); + const __m128i d = _mm_add_epi16(d3, d4); + + // Second pass. + const __m128i tmp0 = _mm_add_epi16(a, d); + const __m128i tmp1 = _mm_add_epi16(b, c); + const __m128i tmp2 = _mm_sub_epi16(b, c); + const __m128i tmp3 = _mm_sub_epi16(a, d); + + // Transpose the two 4x4. 
+ // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); + const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); + const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); + const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); + const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 b22 b32 b03 b13 b23 b33 + T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); + T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); + T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); + T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + + // Horizontal pass and subsequent transpose. + { + // First pass, c and d calculations are longer because of the "trick" + // multiplications. 
+ const __m128i four = _mm_set1_epi16(4); + const __m128i dc = _mm_add_epi16(T0, four); + const __m128i a = _mm_add_epi16(dc, T2); + const __m128i b = _mm_sub_epi16(dc, T2); + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + const __m128i c1 = _mm_mulhi_epi16(T1, k2); + const __m128i c2 = _mm_mulhi_epi16(T3, k1); + const __m128i c3 = _mm_sub_epi16(T1, T3); + const __m128i c4 = _mm_sub_epi16(c1, c2); + const __m128i c = _mm_add_epi16(c3, c4); + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + const __m128i d1 = _mm_mulhi_epi16(T1, k1); + const __m128i d2 = _mm_mulhi_epi16(T3, k2); + const __m128i d3 = _mm_add_epi16(T1, T3); + const __m128i d4 = _mm_add_epi16(d1, d2); + const __m128i d = _mm_add_epi16(d3, d4); + + // Second pass. + const __m128i tmp0 = _mm_add_epi16(a, d); + const __m128i tmp1 = _mm_add_epi16(b, c); + const __m128i tmp2 = _mm_sub_epi16(b, c); + const __m128i tmp3 = _mm_sub_epi16(a, d); + const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); + const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); + const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); + const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); + + // Transpose the two 4x4. 
+ // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); + const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); + const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); + const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); + const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 b22 b32 b03 b13 b23 b33 + T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); + T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); + T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); + T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + + // Add inverse transform to 'ref' and store. + { + const __m128i zero = _mm_set1_epi16(0); + // Load the reference(s). + __m128i ref0, ref1, ref2, ref3; + if (do_two) { + // Load eight bytes/pixels per line. + ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]); + ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]); + ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]); + ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]); + } else { + // Load four bytes/pixels per line.
+ ref0 = _mm_cvtsi32_si128(*(int*)&ref[0 * BPS]); + ref1 = _mm_cvtsi32_si128(*(int*)&ref[1 * BPS]); + ref2 = _mm_cvtsi32_si128(*(int*)&ref[2 * BPS]); + ref3 = _mm_cvtsi32_si128(*(int*)&ref[3 * BPS]); + } + // Convert to 16b. + ref0 = _mm_unpacklo_epi8(ref0, zero); + ref1 = _mm_unpacklo_epi8(ref1, zero); + ref2 = _mm_unpacklo_epi8(ref2, zero); + ref3 = _mm_unpacklo_epi8(ref3, zero); + // Add the inverse transform(s). + ref0 = _mm_add_epi16(ref0, T0); + ref1 = _mm_add_epi16(ref1, T1); + ref2 = _mm_add_epi16(ref2, T2); + ref3 = _mm_add_epi16(ref3, T3); + // Unsigned saturate to 8b. + ref0 = _mm_packus_epi16(ref0, ref0); + ref1 = _mm_packus_epi16(ref1, ref1); + ref2 = _mm_packus_epi16(ref2, ref2); + ref3 = _mm_packus_epi16(ref3, ref3); + // Store the results. + if (do_two) { + // Store eight bytes/pixels per line. + _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0); + _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1); + _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2); + _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3); + } else { + // Store four bytes/pixels per line. + *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0); + *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1); + *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2); + *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3); + } + } +} + +static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, + int16_t* out) { + const __m128i zero = _mm_setzero_si128(); + const __m128i seven = _mm_set1_epi16(7); + const __m128i k7500 = _mm_set1_epi32(7500); + const __m128i k14500 = _mm_set1_epi32(14500); + const __m128i k51000 = _mm_set1_epi32(51000); + const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); + const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, + 5352, 2217, 5352, 2217); + const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, + 2217, -5352, 2217, -5352); + + __m128i v01, v32; + + // Difference between src and ref and initial transpose. 
+ { + // Load src and convert to 16b. + const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]); + const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]); + const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]); + const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]); + const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); + const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); + const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); + const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); + // Load ref and convert to 16b. + const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]); + const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]); + const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]); + const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]); + const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); + const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); + const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); + const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); + // Compute difference. + const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); + const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); + const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); + const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); + + // Transpose. + // 00 01 02 03 0 0 0 0 + // 10 11 12 13 0 0 0 0 + // 20 21 22 23 0 0 0 0 + // 30 31 32 33 0 0 0 0 + const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1); + const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); + v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); + v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); + // a02 a12 a22 a32 a03 a13 a23 a33 + // a00 a10 a20 a30 a01 a11 a21 a31 + // a03 a13 a23 a33 a02 a12 a22 a32 + } + + // First pass and subsequent transpose. + { + // Same operations are done on the (0,3) and (1,2) pairs. 
+ // b0 = (a0 + a3) << 3 + // b1 = (a1 + a2) << 3 + // b3 = (a0 - a3) << 3 + // b2 = (a1 - a2) << 3 + const __m128i a01 = _mm_add_epi16(v01, v32); + const __m128i a32 = _mm_sub_epi16(v01, v32); + const __m128i b01 = _mm_slli_epi16(a01, 3); + const __m128i b32 = _mm_slli_epi16(a32, 3); + const __m128i b11 = _mm_unpackhi_epi64(b01, b01); + const __m128i b22 = _mm_unpackhi_epi64(b32, b32); + + // e0 = b0 + b1 + // e2 = b0 - b1 + const __m128i e0 = _mm_add_epi16(b01, b11); + const __m128i e2 = _mm_sub_epi16(b01, b11); + const __m128i e02 = _mm_unpacklo_epi64(e0, e2); + + // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12 + // e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12 + const __m128i b23 = _mm_unpacklo_epi16(b22, b32); + const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); + const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); + const __m128i d1 = _mm_add_epi32(c1, k14500); + const __m128i d3 = _mm_add_epi32(c3, k7500); + const __m128i e1 = _mm_srai_epi32(d1, 12); + const __m128i e3 = _mm_srai_epi32(d3, 12); + const __m128i e13 = _mm_packs_epi32(e1, e3); + + // Transpose. + // 00 01 02 03 20 21 22 23 + // 10 11 12 13 30 31 32 33 + const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13); + const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); + v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); + v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); + // 02 12 22 32 03 13 23 33 + // 00 10 20 30 01 11 21 31 + // 03 13 23 33 02 12 22 32 + } + + // Second pass + { + // Same operations are done on the (0,3) and (1,2) pairs. 
+ // a0 = v0 + v3 + // a1 = v1 + v2 + // a3 = v0 - v3 + // a2 = v1 - v2 + const __m128i a01 = _mm_add_epi16(v01, v32); + const __m128i a32 = _mm_sub_epi16(v01, v32); + const __m128i a11 = _mm_unpackhi_epi64(a01, a01); + const __m128i a22 = _mm_unpackhi_epi64(a32, a32); + + // d0 = (a0 + a1 + 7) >> 4; + // d2 = (a0 - a1 + 7) >> 4; + const __m128i b0 = _mm_add_epi16(a01, a11); + const __m128i b2 = _mm_sub_epi16(a01, a11); + const __m128i c0 = _mm_add_epi16(b0, seven); + const __m128i c2 = _mm_add_epi16(b2, seven); + const __m128i d0 = _mm_srai_epi16(c0, 4); + const __m128i d2 = _mm_srai_epi16(c2, 4); + + // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) + // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) + const __m128i b23 = _mm_unpacklo_epi16(a22, a32); + const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); + const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); + const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); + const __m128i d3 = _mm_add_epi32(c3, k51000); + const __m128i e1 = _mm_srai_epi32(d1, 16); + const __m128i e3 = _mm_srai_epi32(d3, 16); + const __m128i f1 = _mm_packs_epi32(e1, e1); + const __m128i f3 = _mm_packs_epi32(e3, e3); + // f1 = f1 + (a3 != 0); + // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the + // desired (0, 1), we add one earlier through k12000_plus_one. + const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); + + _mm_storel_epi64((__m128i*)&out[ 0], d0); + _mm_storel_epi64((__m128i*)&out[ 4], g1); + _mm_storel_epi64((__m128i*)&out[ 8], d2); + _mm_storel_epi64((__m128i*)&out[12], f3); + } +} + +//------------------------------------------------------------------------------ +// Metric + +static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) { + const __m128i zero = _mm_set1_epi16(0); + + // Load values. 
+ const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]); + const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]); + const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]); + const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]); + const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]); + const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]); + const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]); + const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]); + + // Combine pair of lines and convert to 16b. + const __m128i a01 = _mm_unpacklo_epi32(a0, a1); + const __m128i a23 = _mm_unpacklo_epi32(a2, a3); + const __m128i b01 = _mm_unpacklo_epi32(b0, b1); + const __m128i b23 = _mm_unpacklo_epi32(b2, b3); + const __m128i a01s = _mm_unpacklo_epi8(a01, zero); + const __m128i a23s = _mm_unpacklo_epi8(a23, zero); + const __m128i b01s = _mm_unpacklo_epi8(b01, zero); + const __m128i b23s = _mm_unpacklo_epi8(b23, zero); + + // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2 + // TODO(cduvivier): Disassemble and figure out why this is fastest. We don't + // need absolute values, there is no need to do calculation + // in 8bit as we are already in 16bit, ... Yet this is what + // benchmarks the fastest! + const __m128i d0 = _mm_subs_epu8(a01s, b01s); + const __m128i d1 = _mm_subs_epu8(b01s, a01s); + const __m128i d2 = _mm_subs_epu8(a23s, b23s); + const __m128i d3 = _mm_subs_epu8(b23s, a23s); + + // Square and add them all together.
+ const __m128i madd0 = _mm_madd_epi16(d0, d0); + const __m128i madd1 = _mm_madd_epi16(d1, d1); + const __m128i madd2 = _mm_madd_epi16(d2, d2); + const __m128i madd3 = _mm_madd_epi16(d3, d3); + const __m128i sum0 = _mm_add_epi32(madd0, madd1); + const __m128i sum1 = _mm_add_epi32(madd2, madd3); + const __m128i sum2 = _mm_add_epi32(sum0, sum1); + int32_t tmp[4]; + _mm_storeu_si128((__m128i*)tmp, sum2); + return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); +} + +//------------------------------------------------------------------------------ +// Texture distortion +// +// We try to match the spectral content (weighted) between source and +// reconstructed samples. + +// Hadamard transform +// Returns the difference between the weighted sums of the absolute values of +// the transformed coefficients of inA and inB. +static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB, + const uint16_t* const w) { + int32_t sum[4]; + __m128i tmp_0, tmp_1, tmp_2, tmp_3; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i three = _mm_set1_epi16(3); + + // Load, combine and transpose inputs. + { + const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]); + const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]); + const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]); + const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]); + const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]); + const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]); + const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]); + const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]); + + // Combine inA and inB (we'll do two transforms in parallel).
+ const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); + const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); + const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); + const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); + // a00 b00 a01 b01 a02 b02 a03 b03 x x x x x x x x + // a10 b10 a11 b11 a12 b12 a13 b13 x x x x x x x x + // a20 b20 a21 b21 a22 b22 a23 b23 x x x x x x x x + // a30 b30 a31 b31 a32 b32 a33 b33 x x x x x x x x + + // Transpose the two 4x4, discarding the unused upper bytes. + const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); + const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); + // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 + // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 + const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); + // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 + + // Convert to 16b. + tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero); + tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero); + tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero); + tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + + // Horizontal pass and subsequent transpose. + { + // Calculate a and b (two 4x4 at once).
+ const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2); + const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2); + const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2); + const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2); + // b0_extra = (a0 != 0); + const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one); + const __m128i b0_base = _mm_add_epi16(a0, a1); + const __m128i b1 = _mm_add_epi16(a3, a2); + const __m128i b2 = _mm_sub_epi16(a3, a2); + const __m128i b3 = _mm_sub_epi16(a0, a1); + const __m128i b0 = _mm_add_epi16(b0_base, b0_extra); + // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + + // Transpose the two 4x4. + const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); + const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); + const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); + const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); + const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 b22 b32 b03 b13 b23 b33 + tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); + tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); + tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); + tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13
a23 a33 b03 b13 b23 b33 + } + + // Vertical pass and difference of weighted sums. + { + // Load all inputs. + // TODO(cduvivier): Make variable declarations and allocations aligned so + // we can use _mm_load_si128 instead of _mm_loadu_si128. + const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]); + const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]); + + // Calculate a and b (two 4x4 at once). + const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); + const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); + const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); + const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); + const __m128i b0 = _mm_add_epi16(a0, a1); + const __m128i b1 = _mm_add_epi16(a3, a2); + const __m128i b2 = _mm_sub_epi16(a3, a2); + const __m128i b3 = _mm_sub_epi16(a0, a1); + + // Separate the transforms of inA and inB. + __m128i A_b0 = _mm_unpacklo_epi64(b0, b1); + __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); + __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); + __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); + + { + // sign(b) = b >> 15 (0x0000 if positive, 0xffff if negative) + const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15); + const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15); + const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15); + const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15); + + // b = abs(b) = (b ^ sign) - sign + A_b0 = _mm_xor_si128(A_b0, sign_A_b0); + A_b2 = _mm_xor_si128(A_b2, sign_A_b2); + B_b0 = _mm_xor_si128(B_b0, sign_B_b0); + B_b2 = _mm_xor_si128(B_b2, sign_B_b2); + A_b0 = _mm_sub_epi16(A_b0, sign_A_b0); + A_b2 = _mm_sub_epi16(A_b2, sign_A_b2); + B_b0 = _mm_sub_epi16(B_b0, sign_B_b0); + B_b2 = _mm_sub_epi16(B_b2, sign_B_b2); + } + + // b = abs(b) + 3 + A_b0 = _mm_add_epi16(A_b0, three); + A_b2 = _mm_add_epi16(A_b2, three); + B_b0 = _mm_add_epi16(B_b0, three); + B_b2 = _mm_add_epi16(B_b2, three); + + // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3 + // b = (abs(b) + 3) >> 3 + A_b0 = _mm_srai_epi16(A_b0, 3); + A_b2 = _mm_srai_epi16(A_b2, 3); + B_b0 = 
_mm_srai_epi16(B_b0, 3); + B_b2 = _mm_srai_epi16(B_b2, 3); + + // weighted sums + A_b0 = _mm_madd_epi16(A_b0, w_0); + A_b2 = _mm_madd_epi16(A_b2, w_8); + B_b0 = _mm_madd_epi16(B_b0, w_0); + B_b2 = _mm_madd_epi16(B_b2, w_8); + A_b0 = _mm_add_epi32(A_b0, A_b2); + B_b0 = _mm_add_epi32(B_b0, B_b2); + + // difference of weighted sums + A_b0 = _mm_sub_epi32(A_b0, B_b0); + _mm_storeu_si128((__m128i*)&sum[0], A_b0); + } + return sum[0] + sum[1] + sum[2] + sum[3]; +} + +static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + const int diff_sum = TTransformSSE2(a, b, w); + return (abs(diff_sum) + 8) >> 4; +} + +static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + int D = 0; + int x, y; + for (y = 0; y < 16 * BPS; y += 4 * BPS) { + for (x = 0; x < 16; x += 4) { + D += Disto4x4SSE2(a + x + y, b + x + y, w); + } + } + return D; +} + + +//------------------------------------------------------------------------------ +// Quantization +// + +// Simple quantization +static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16], + int n, const VP8Matrix* const mtx) { + const __m128i max_coeff_2047 = _mm_set1_epi16(2047); + const __m128i zero = _mm_set1_epi16(0); + __m128i sign0, sign8; + __m128i coeff0, coeff8; + __m128i out0, out8; + __m128i packed_out; + + // Load all inputs. + // TODO(cduvivier): Make variable declarations and allocations aligned so that + // we can use _mm_load_si128 instead of _mm_loadu_si128. 
+ __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); + __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); + const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]); + const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]); + const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]); + const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]); + const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); + const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); + const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]); + const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]); + const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]); + const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]); + + // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative) + sign0 = _mm_srai_epi16(in0, 15); + sign8 = _mm_srai_epi16(in8, 15); + + // coeff = abs(in) = (in ^ sign) - sign + coeff0 = _mm_xor_si128(in0, sign0); + coeff8 = _mm_xor_si128(in8, sign8); + coeff0 = _mm_sub_epi16(coeff0, sign0); + coeff8 = _mm_sub_epi16(coeff8, sign8); + + // coeff = abs(in) + sharpen + coeff0 = _mm_add_epi16(coeff0, sharpen0); + coeff8 = _mm_add_epi16(coeff8, sharpen8); + + // if (coeff > 2047) coeff = 2047 + coeff0 = _mm_min_epi16(coeff0, max_coeff_2047); + coeff8 = _mm_min_epi16(coeff8, max_coeff_2047); + + // out = (coeff * iQ + B) >> QFIX; + { + // doing calculations with 32b precision (QFIX=17) + // out = (coeff * iQ) + __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); + __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); + __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); + __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); + __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); + __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); + __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); + __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); + // expand bias from 16b to 32b + 
__m128i bias_00 = _mm_unpacklo_epi16(bias0, zero); + __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero); + __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero); + __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero); + // out = (coeff * iQ + B) + out_00 = _mm_add_epi32(out_00, bias_00); + out_04 = _mm_add_epi32(out_04, bias_04); + out_08 = _mm_add_epi32(out_08, bias_08); + out_12 = _mm_add_epi32(out_12, bias_12); + // out = (coeff * iQ + B) >> QFIX; + out_00 = _mm_srai_epi32(out_00, QFIX); + out_04 = _mm_srai_epi32(out_04, QFIX); + out_08 = _mm_srai_epi32(out_08, QFIX); + out_12 = _mm_srai_epi32(out_12, QFIX); + // pack result as 16b + out0 = _mm_packs_epi32(out_00, out_04); + out8 = _mm_packs_epi32(out_08, out_12); + } + + // get sign back (if (sign[j]) out_n = -out_n) + out0 = _mm_xor_si128(out0, sign0); + out8 = _mm_xor_si128(out8, sign8); + out0 = _mm_sub_epi16(out0, sign0); + out8 = _mm_sub_epi16(out8, sign8); + + // in = out * Q + in0 = _mm_mullo_epi16(out0, q0); + in8 = _mm_mullo_epi16(out8, q8); + + // if (coeff <= mtx->zthresh_) {in=0; out=0;} + { + __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0); + __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8); + in0 = _mm_and_si128(in0, cmp0); + in8 = _mm_and_si128(in8, cmp8); + _mm_storeu_si128((__m128i*)&in[0], in0); + _mm_storeu_si128((__m128i*)&in[8], in8); + out0 = _mm_and_si128(out0, cmp0); + out8 = _mm_and_si128(out8, cmp8); + } + + // zigzag the output before storing it. + // + // The zigzag pattern can almost be reproduced with a small sequence of + // shuffles. After it, we only need to swap the 7th (ending up in third + // position instead of twelfth) and 8th values. 
+ { + __m128i outZ0, outZ8; + outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); + outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); + outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2)); + outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1)); + outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0)); + outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0)); + _mm_storeu_si128((__m128i*)&out[0], outZ0); + _mm_storeu_si128((__m128i*)&out[8], outZ8); + packed_out = _mm_packs_epi16(outZ0, outZ8); + } + { + const int16_t outZ_12 = out[12]; + const int16_t outZ_3 = out[3]; + out[3] = outZ_12; + out[12] = outZ_3; + } + + // detect if all 'out' values are zeroes or not + { + int32_t tmp[4]; + _mm_storeu_si128((__m128i*)tmp, packed_out); + if (n) { + tmp[0] &= ~0xff; + } + return (tmp[3] || tmp[2] || tmp[1] || tmp[0]); + } +} + +extern void VP8EncDspInitSSE2(void); +void VP8EncDspInitSSE2(void) { + VP8CollectHistogram = CollectHistogramSSE2; + VP8EncQuantizeBlock = QuantizeBlockSSE2; + VP8ITransform = ITransformSSE2; + VP8FTransform = FTransformSSE2; + VP8SSE4x4 = SSE4x4SSE2; + VP8TDisto4x4 = Disto4x4SSE2; + VP8TDisto16x16 = Disto16x16SSE2; +} + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif + +#endif //__SSE2__ diff --git a/third_party/libwebp/dsp/upsampling.c b/third_party/libwebp/dsp/upsampling.c new file mode 100644 index 0000000..c88a17a --- /dev/null +++ b/third_party/libwebp/dsp/upsampling.c @@ -0,0 +1,226 @@ +// Copyright 2011 Google Inc. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// YUV to RGB upsampling functions. 
+// +// Author: somnath@google.com (Somnath Banerjee) + +#include "./dsp.h" +#include "./yuv.h" +#include "../dec/webpi.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +//------------------------------------------------------------------------------ +// Fancy upsampler + +#ifdef FANCY_UPSAMPLING + +// Fancy upsampling functions to convert YUV to RGB +WebPUpsampleLinePairFunc WebPUpsamplers[MODE_LAST]; +WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[MODE_LAST]; + +// Given samples laid out in a square as: +// [a b] +// [c d] +// we interpolate u/v as: +// ([9*a + 3*b + 3*c + d, 3*a + 9*b + c + 3*d] + [8 8]) / 16 +// ([3*a + b + 9*c + 3*d, a + 3*b + 3*c + 9*d] + [8 8]) / 16 + +// We process u and v together stashed into 32bit (16bit each). +#define LOAD_UV(u,v) ((u) | ((v) << 16)) + +#define UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ +static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ + const uint8_t* top_u, const uint8_t* top_v, \ + const uint8_t* cur_u, const uint8_t* cur_v, \ + uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ + int x; \ + const int last_pixel_pair = (len - 1) >> 1; \ + uint32_t tl_uv = LOAD_UV(top_u[0], top_v[0]); /* top-left sample */ \ + uint32_t l_uv = LOAD_UV(cur_u[0], cur_v[0]); /* left-sample */ \ + if (top_y) { \ + const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \ + FUNC(top_y[0], uv0 & 0xff, (uv0 >> 16), top_dst); \ + } \ + if (bottom_y) { \ + const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \ + FUNC(bottom_y[0], uv0 & 0xff, (uv0 >> 16), bottom_dst); \ + } \ + for (x = 1; x <= last_pixel_pair; ++x) { \ + const uint32_t t_uv = LOAD_UV(top_u[x], top_v[x]); /* top sample */ \ + const uint32_t uv = LOAD_UV(cur_u[x], cur_v[x]); /* sample */ \ + /* precompute invariant values associated with first and second diagonals*/\ + const uint32_t avg = tl_uv + t_uv + l_uv + uv + 0x00080008u; \ + const uint32_t diag_12 = (avg + 2 * (t_uv + l_uv)) >> 3; \ + const uint32_t diag_03 = (avg
+ 2 * (tl_uv + uv)) >> 3; \ + if (top_y) { \ + const uint32_t uv0 = (diag_12 + tl_uv) >> 1; \ + const uint32_t uv1 = (diag_03 + t_uv) >> 1; \ + FUNC(top_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \ + top_dst + (2 * x - 1) * XSTEP); \ + FUNC(top_y[2 * x - 0], uv1 & 0xff, (uv1 >> 16), \ + top_dst + (2 * x - 0) * XSTEP); \ + } \ + if (bottom_y) { \ + const uint32_t uv0 = (diag_03 + l_uv) >> 1; \ + const uint32_t uv1 = (diag_12 + uv) >> 1; \ + FUNC(bottom_y[2 * x - 1], uv0 & 0xff, (uv0 >> 16), \ + bottom_dst + (2 * x - 1) * XSTEP); \ + FUNC(bottom_y[2 * x + 0], uv1 & 0xff, (uv1 >> 16), \ + bottom_dst + (2 * x + 0) * XSTEP); \ + } \ + tl_uv = t_uv; \ + l_uv = uv; \ + } \ + if (!(len & 1)) { \ + if (top_y) { \ + const uint32_t uv0 = (3 * tl_uv + l_uv + 0x00020002u) >> 2; \ + FUNC(top_y[len - 1], uv0 & 0xff, (uv0 >> 16), \ + top_dst + (len - 1) * XSTEP); \ + } \ + if (bottom_y) { \ + const uint32_t uv0 = (3 * l_uv + tl_uv + 0x00020002u) >> 2; \ + FUNC(bottom_y[len - 1], uv0 & 0xff, (uv0 >> 16), \ + bottom_dst + (len - 1) * XSTEP); \ + } \ + } \ +} + +// All variants implemented. 
+UPSAMPLE_FUNC(UpsampleRgbLinePair, VP8YuvToRgb, 3) +UPSAMPLE_FUNC(UpsampleBgrLinePair, VP8YuvToBgr, 3) +UPSAMPLE_FUNC(UpsampleRgbaLinePair, VP8YuvToRgba, 4) +UPSAMPLE_FUNC(UpsampleBgraLinePair, VP8YuvToBgra, 4) +UPSAMPLE_FUNC(UpsampleArgbLinePair, VP8YuvToArgb, 4) +UPSAMPLE_FUNC(UpsampleRgba4444LinePair, VP8YuvToRgba4444, 2) +UPSAMPLE_FUNC(UpsampleRgb565LinePair, VP8YuvToRgb565, 2) +// These two don't erase the alpha value +UPSAMPLE_FUNC(UpsampleRgbKeepAlphaLinePair, VP8YuvToRgb, 4) +UPSAMPLE_FUNC(UpsampleBgrKeepAlphaLinePair, VP8YuvToBgr, 4) +UPSAMPLE_FUNC(UpsampleArgbKeepAlphaLinePair, VP8YuvToArgbKeepA, 4) +UPSAMPLE_FUNC(UpsampleRgba4444KeepAlphaLinePair, VP8YuvToRgba4444KeepA, 2) + +#undef LOAD_UV +#undef UPSAMPLE_FUNC + +#endif // FANCY_UPSAMPLING + +//------------------------------------------------------------------------------ +// simple point-sampling + +#define SAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ +static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ + const uint8_t* u, const uint8_t* v, \ + uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ + int i; \ + for (i = 0; i < len - 1; i += 2) { \ + FUNC(top_y[0], u[0], v[0], top_dst); \ + FUNC(top_y[1], u[0], v[0], top_dst + XSTEP); \ + FUNC(bottom_y[0], u[0], v[0], bottom_dst); \ + FUNC(bottom_y[1], u[0], v[0], bottom_dst + XSTEP); \ + top_y += 2; \ + bottom_y += 2; \ + u++; \ + v++; \ + top_dst += 2 * XSTEP; \ + bottom_dst += 2 * XSTEP; \ + } \ + if (i == len - 1) { /* last one */ \ + FUNC(top_y[0], u[0], v[0], top_dst); \ + FUNC(bottom_y[0], u[0], v[0], bottom_dst); \ + } \ +} + +// All variants implemented. 
+SAMPLE_FUNC(SampleRgbLinePair, VP8YuvToRgb, 3) +SAMPLE_FUNC(SampleBgrLinePair, VP8YuvToBgr, 3) +SAMPLE_FUNC(SampleRgbaLinePair, VP8YuvToRgba, 4) +SAMPLE_FUNC(SampleBgraLinePair, VP8YuvToBgra, 4) +SAMPLE_FUNC(SampleArgbLinePair, VP8YuvToArgb, 4) +SAMPLE_FUNC(SampleRgba4444LinePair, VP8YuvToRgba4444, 2) +SAMPLE_FUNC(SampleRgb565LinePair, VP8YuvToRgb565, 2) + +#undef SAMPLE_FUNC + +const WebPSampleLinePairFunc WebPSamplers[MODE_LAST] = { + SampleRgbLinePair, // MODE_RGB + SampleRgbaLinePair, // MODE_RGBA + SampleBgrLinePair, // MODE_BGR + SampleBgraLinePair, // MODE_BGRA + SampleArgbLinePair, // MODE_ARGB + SampleRgba4444LinePair, // MODE_RGBA_4444 + SampleRgb565LinePair // MODE_RGB_565 +}; + +//------------------------------------------------------------------------------ +// YUV444 converter + +#define YUV444_FUNC(FUNC_NAME, FUNC, XSTEP) \ +static void FUNC_NAME(const uint8_t* y, const uint8_t* u, const uint8_t* v, \ + uint8_t* dst, int len) { \ + int i; \ + for (i = 0; i < len; ++i) FUNC(y[i], u[i], v[i], &dst[i * XSTEP]); \ +} + +YUV444_FUNC(Yuv444ToRgb, VP8YuvToRgb, 3) +YUV444_FUNC(Yuv444ToBgr, VP8YuvToBgr, 3) +YUV444_FUNC(Yuv444ToRgba, VP8YuvToRgba, 4) +YUV444_FUNC(Yuv444ToBgra, VP8YuvToBgra, 4) +YUV444_FUNC(Yuv444ToArgb, VP8YuvToArgb, 4) +YUV444_FUNC(Yuv444ToRgba4444, VP8YuvToRgba4444, 2) +YUV444_FUNC(Yuv444ToRgb565, VP8YuvToRgb565, 2) + +#undef YUV444_FUNC + +const WebPYUV444Converter WebPYUV444Converters[MODE_LAST] = { + Yuv444ToRgb, // MODE_RGB + Yuv444ToRgba, // MODE_RGBA + Yuv444ToBgr, // MODE_BGR + Yuv444ToBgra, // MODE_BGRA + Yuv444ToArgb, // MODE_ARGB + Yuv444ToRgba4444, // MODE_RGBA_4444 + Yuv444ToRgb565 // MODE_RGB_565 +}; + +//------------------------------------------------------------------------------ +// Main call + +void WebPInitUpsamplers(void) { +#ifdef FANCY_UPSAMPLING + WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePair; + WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePair; + WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePair; + 
WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePair; + WebPUpsamplers[MODE_ARGB] = UpsampleArgbLinePair; + WebPUpsamplers[MODE_RGBA_4444] = UpsampleRgba4444LinePair; + WebPUpsamplers[MODE_RGB_565] = UpsampleRgb565LinePair; + + WebPUpsamplersKeepAlpha[MODE_RGB] = UpsampleRgbLinePair; + WebPUpsamplersKeepAlpha[MODE_RGBA] = UpsampleRgbKeepAlphaLinePair; + WebPUpsamplersKeepAlpha[MODE_BGR] = UpsampleBgrLinePair; + WebPUpsamplersKeepAlpha[MODE_BGRA] = UpsampleBgrKeepAlphaLinePair; + WebPUpsamplersKeepAlpha[MODE_ARGB] = UpsampleArgbKeepAlphaLinePair; + WebPUpsamplersKeepAlpha[MODE_RGBA_4444] = UpsampleRgba4444KeepAlphaLinePair; + WebPUpsamplersKeepAlpha[MODE_RGB_565] = UpsampleRgb565LinePair; + + // If defined, use CPUInfo() to overwrite some pointers with faster versions. + if (VP8GetCPUInfo) { +#if defined(__SSE2__) || defined(_MSC_VER) + if (VP8GetCPUInfo(kSSE2)) { + WebPInitUpsamplersSSE2(); + } +#endif + } +#endif // FANCY_UPSAMPLING +} + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif diff --git a/third_party/libwebp/dsp/upsampling_sse2.c b/third_party/libwebp/dsp/upsampling_sse2.c new file mode 100644 index 0000000..30eb6a9 --- /dev/null +++ b/third_party/libwebp/dsp/upsampling_sse2.c @@ -0,0 +1,215 @@ +// Copyright 2011 Google Inc. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// SSE2 version of YUV to RGB upsampling functions. 
+// +// Author: somnath@google.com (Somnath Banerjee) + +#if defined(__SSE2__) || defined(_MSC_VER) + +#include <assert.h> +#include <emmintrin.h> +#include <string.h> +#include "./dsp.h" +#include "./yuv.h" +#include "../dec/webpi.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#ifdef FANCY_UPSAMPLING + +// We compute (9*a + 3*b + 3*c + d + 8) / 16 as follows +// u = (9*a + 3*b + 3*c + d + 8) / 16 +// = (a + (a + 3*b + 3*c + d) / 8 + 1) / 2 +// = (a + m + 1) / 2 +// where m = (a + 3*b + 3*c + d) / 8 +// = ((a + b + c + d) / 2 + b + c) / 4 +// +// Let's say k = (a + b + c + d) / 4. +// We can compute k as +// k = (s + t + 1) / 2 - ((a^d) | (b^c) | (s^t)) & 1 +// where s = (a + d + 1) / 2 and t = (b + c + 1) / 2 +// +// Then m can be written as +// m = (k + t + 1) / 2 - (((b^c) & (s^t)) | (k^t)) & 1 + +// Computes out = (k + in + 1) / 2 - ((ij & (s^t)) | (k^in)) & 1 +#define GET_M(ij, in, out) do { \ + const __m128i tmp0 = _mm_avg_epu8(k, (in)); /* (k + in + 1) / 2 */ \ + const __m128i tmp1 = _mm_and_si128((ij), st); /* (ij) & (s^t) */ \ + const __m128i tmp2 = _mm_xor_si128(k, (in)); /* (k^in) */ \ + const __m128i tmp3 = _mm_or_si128(tmp1, tmp2); /* ((ij) & (s^t)) | (k^in) */\ + const __m128i tmp4 = _mm_and_si128(tmp3, one); /* & 1 -> lsb_correction */ \ + (out) = _mm_sub_epi8(tmp0, tmp4); /* (k + in + 1) / 2 - lsb_correction */ \ +} while (0) + +// pack and store two alternating pixel rows +#define PACK_AND_STORE(a, b, da, db, out) do { \ + const __m128i ta = _mm_avg_epu8(a, da); /* (9a + 3b + 3c + d + 8) / 16 */ \ + const __m128i tb = _mm_avg_epu8(b, db); /* (3a + 9b + c + 3d + 8) / 16 */ \ + const __m128i t1 = _mm_unpacklo_epi8(ta, tb); \ + const __m128i t2 = _mm_unpackhi_epi8(ta, tb); \ + _mm_store_si128(((__m128i*)(out)) + 0, t1); \ + _mm_store_si128(((__m128i*)(out)) + 1, t2); \ +} while (0) + +// Loads 17 pixels each from rows r1 and r2 and generates 32 pixels. 
+#define UPSAMPLE_32PIXELS(r1, r2, out) { \ + const __m128i one = _mm_set1_epi8(1); \ + const __m128i a = _mm_loadu_si128((__m128i*)&(r1)[0]); \ + const __m128i b = _mm_loadu_si128((__m128i*)&(r1)[1]); \ + const __m128i c = _mm_loadu_si128((__m128i*)&(r2)[0]); \ + const __m128i d = _mm_loadu_si128((__m128i*)&(r2)[1]); \ + \ + const __m128i s = _mm_avg_epu8(a, d); /* s = (a + d + 1) / 2 */ \ + const __m128i t = _mm_avg_epu8(b, c); /* t = (b + c + 1) / 2 */ \ + const __m128i st = _mm_xor_si128(s, t); /* st = s^t */ \ + \ + const __m128i ad = _mm_xor_si128(a, d); /* ad = a^d */ \ + const __m128i bc = _mm_xor_si128(b, c); /* bc = b^c */ \ + \ + const __m128i t1 = _mm_or_si128(ad, bc); /* (a^d) | (b^c) */ \ + const __m128i t2 = _mm_or_si128(t1, st); /* (a^d) | (b^c) | (s^t) */ \ + const __m128i t3 = _mm_and_si128(t2, one); /* (a^d) | (b^c) | (s^t) & 1 */ \ + const __m128i t4 = _mm_avg_epu8(s, t); \ + const __m128i k = _mm_sub_epi8(t4, t3); /* k = (a + b + c + d) / 4 */ \ + __m128i diag1, diag2; \ + \ + GET_M(bc, t, diag1); /* diag1 = (a + 3b + 3c + d) / 8 */ \ + GET_M(ad, s, diag2); /* diag2 = (3a + b + c + 3d) / 8 */ \ + \ + /* pack the alternate pixels */ \ + PACK_AND_STORE(a, b, diag1, diag2, &(out)[0 * 32]); \ + PACK_AND_STORE(c, d, diag2, diag1, &(out)[2 * 32]); \ +} + +// Turn the macro into a function for reducing code-size when non-critical +static void Upsample32Pixels(const uint8_t r1[], const uint8_t r2[], + uint8_t* const out) { + UPSAMPLE_32PIXELS(r1, r2, out); +} + +#define UPSAMPLE_LAST_BLOCK(tb, bb, num_pixels, out) { \ + uint8_t r1[17], r2[17]; \ + memcpy(r1, (tb), (num_pixels)); \ + memcpy(r2, (bb), (num_pixels)); \ + /* replicate last byte */ \ + memset(r1 + (num_pixels), r1[(num_pixels) - 1], 17 - (num_pixels)); \ + memset(r2 + (num_pixels), r2[(num_pixels) - 1], 17 - (num_pixels)); \ + /* using the shared function instead of the macro saves ~3k code size */ \ + Upsample32Pixels(r1, r2, out); \ +} + +#define CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, 
uv, \ + top_dst, bottom_dst, cur_x, num_pixels) { \ + int n; \ + if (top_y) { \ + for (n = 0; n < (num_pixels); ++n) { \ + FUNC(top_y[(cur_x) + n], (uv)[n], (uv)[32 + n], \ + top_dst + ((cur_x) + n) * XSTEP); \ + } \ + } \ + if (bottom_y) { \ + for (n = 0; n < (num_pixels); ++n) { \ + FUNC(bottom_y[(cur_x) + n], (uv)[64 + n], (uv)[64 + 32 + n], \ + bottom_dst + ((cur_x) + n) * XSTEP); \ + } \ + } \ +} + +#define SSE2_UPSAMPLE_FUNC(FUNC_NAME, FUNC, XSTEP) \ +static void FUNC_NAME(const uint8_t* top_y, const uint8_t* bottom_y, \ + const uint8_t* top_u, const uint8_t* top_v, \ + const uint8_t* cur_u, const uint8_t* cur_v, \ + uint8_t* top_dst, uint8_t* bottom_dst, int len) { \ + int b; \ + /* 16 byte aligned array to cache reconstructed u and v */ \ + uint8_t uv_buf[4 * 32 + 15]; \ + uint8_t* const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15); \ + const int uv_len = (len + 1) >> 1; \ + /* 17 pixels must be read-able for each block */ \ + const int num_blocks = (uv_len - 1) >> 4; \ + const int leftover = uv_len - num_blocks * 16; \ + const int last_pos = 1 + 32 * num_blocks; \ + \ + const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1; \ + const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1; \ + \ + assert(len > 0); \ + /* Treat the first pixel in regular way */ \ + if (top_y) { \ + const int u0 = (top_u[0] + u_diag) >> 1; \ + const int v0 = (top_v[0] + v_diag) >> 1; \ + FUNC(top_y[0], u0, v0, top_dst); \ + } \ + if (bottom_y) { \ + const int u0 = (cur_u[0] + u_diag) >> 1; \ + const int v0 = (cur_v[0] + v_diag) >> 1; \ + FUNC(bottom_y[0], u0, v0, bottom_dst); \ + } \ + \ + for (b = 0; b < num_blocks; ++b) { \ + UPSAMPLE_32PIXELS(top_u, cur_u, r_uv + 0 * 32); \ + UPSAMPLE_32PIXELS(top_v, cur_v, r_uv + 1 * 32); \ + CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \ + 32 * b + 1, 32) \ + top_u += 16; \ + cur_u += 16; \ + top_v += 16; \ + cur_v += 16; \ + } \ + \ + UPSAMPLE_LAST_BLOCK(top_u, cur_u, leftover, r_uv + 0 * 32); \ + 
UPSAMPLE_LAST_BLOCK(top_v, cur_v, leftover, r_uv + 1 * 32); \ + CONVERT2RGB(FUNC, XSTEP, top_y, bottom_y, r_uv, top_dst, bottom_dst, \ + last_pos, len - last_pos); \ +} + +// SSE2 variants of the fancy upsampler. +SSE2_UPSAMPLE_FUNC(UpsampleRgbLinePairSSE2, VP8YuvToRgb, 3) +SSE2_UPSAMPLE_FUNC(UpsampleBgrLinePairSSE2, VP8YuvToBgr, 3) +SSE2_UPSAMPLE_FUNC(UpsampleRgbaLinePairSSE2, VP8YuvToRgba, 4) +SSE2_UPSAMPLE_FUNC(UpsampleBgraLinePairSSE2, VP8YuvToBgra, 4) +// These two don't erase the alpha value +SSE2_UPSAMPLE_FUNC(UpsampleRgbKeepAlphaLinePairSSE2, VP8YuvToRgb, 4) +SSE2_UPSAMPLE_FUNC(UpsampleBgrKeepAlphaLinePairSSE2, VP8YuvToBgr, 4) + +#undef GET_M +#undef PACK_AND_STORE +#undef UPSAMPLE_32PIXELS +#undef UPSAMPLE_LAST_BLOCK +#undef CONVERT2RGB +#undef SSE2_UPSAMPLE_FUNC + +//------------------------------------------------------------------------------ + +extern WebPUpsampleLinePairFunc WebPUpsamplers[/* MODE_LAST */]; +extern WebPUpsampleLinePairFunc WebPUpsamplersKeepAlpha[/* MODE_LAST */]; + +#endif // FANCY_UPSAMPLING + +void WebPInitUpsamplersSSE2(void) { +#ifdef FANCY_UPSAMPLING + WebPUpsamplers[MODE_RGB] = UpsampleRgbLinePairSSE2; + WebPUpsamplers[MODE_RGBA] = UpsampleRgbaLinePairSSE2; + WebPUpsamplers[MODE_BGR] = UpsampleBgrLinePairSSE2; + WebPUpsamplers[MODE_BGRA] = UpsampleBgraLinePairSSE2; + + WebPUpsamplersKeepAlpha[MODE_RGB] = UpsampleRgbLinePairSSE2; + WebPUpsamplersKeepAlpha[MODE_RGBA] = UpsampleRgbKeepAlphaLinePairSSE2; + WebPUpsamplersKeepAlpha[MODE_BGR] = UpsampleBgrLinePairSSE2; + WebPUpsamplersKeepAlpha[MODE_BGRA] = UpsampleBgrKeepAlphaLinePairSSE2; +#endif // FANCY_UPSAMPLING +} + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif + +#endif //__SSE2__ || _MSC_VER diff --git a/third_party/libwebp/dec/yuv.c b/third_party/libwebp/dsp/yuv.c index ac448ee..ee9ea16 100644 --- a/third_party/libwebp/dec/yuv.c +++ b/third_party/libwebp/dsp/yuv.c @@ -9,7 +9,7 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "yuv.h" 
+#include "./yuv.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { @@ -20,9 +20,14 @@ enum { YUV_HALF = 1 << (YUV_FIX - 1) }; int16_t VP8kVToR[256], VP8kUToB[256]; int32_t VP8kVToG[256], VP8kUToG[256]; uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN]; +uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN]; static int done = 0; +static inline uint8_t clip(int v, int max_value) { + return v < 0 ? 0 : v > max_value ? max_value : v; +} + void VP8YUVInit(void) { int i; if (done) { @@ -36,7 +41,8 @@ void VP8YUVInit(void) { } for (i = YUV_RANGE_MIN; i < YUV_RANGE_MAX; ++i) { const int k = ((i - 16) * 76283 + YUV_HALF) >> YUV_FIX; - VP8kClip[i - YUV_RANGE_MIN] = (k < 0) ? 0 : (k > 255) ? 255 : k; + VP8kClip[i - YUV_RANGE_MIN] = clip(k, 255); + VP8kClip4Bits[i - YUV_RANGE_MIN] = clip((k + 8) >> 4, 15); } done = 1; } diff --git a/third_party/libwebp/dsp/yuv.h b/third_party/libwebp/dsp/yuv.h new file mode 100644 index 0000000..4fa9bed --- /dev/null +++ b/third_party/libwebp/dsp/yuv.h @@ -0,0 +1,109 @@ +// Copyright 2010 Google Inc. 
+// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// inline YUV->RGB conversion function +// +// Author: Skal (pascal.massimino@gmail.com) + +#ifndef WEBP_DSP_YUV_H_ +#define WEBP_DSP_YUV_H_ + +#include "../webp/decode_vp8.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +enum { YUV_FIX = 16, // fixed-point precision + YUV_RANGE_MIN = -227, // min value of r/g/b output + YUV_RANGE_MAX = 256 + 226 // max value of r/g/b output +}; +extern int16_t VP8kVToR[256], VP8kUToB[256]; +extern int32_t VP8kVToG[256], VP8kUToG[256]; +extern uint8_t VP8kClip[YUV_RANGE_MAX - YUV_RANGE_MIN]; +extern uint8_t VP8kClip4Bits[YUV_RANGE_MAX - YUV_RANGE_MIN]; + +static inline void VP8YuvToRgb(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const rgb) { + const int r_off = VP8kVToR[v]; + const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX; + const int b_off = VP8kUToB[u]; + rgb[0] = VP8kClip[y + r_off - YUV_RANGE_MIN]; + rgb[1] = VP8kClip[y + g_off - YUV_RANGE_MIN]; + rgb[2] = VP8kClip[y + b_off - YUV_RANGE_MIN]; +} + +static inline void VP8YuvToRgb565(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const rgb) { + const int r_off = VP8kVToR[v]; + const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX; + const int b_off = VP8kUToB[u]; + rgb[0] = ((VP8kClip[y + r_off - YUV_RANGE_MIN] & 0xf8) | + (VP8kClip[y + g_off - YUV_RANGE_MIN] >> 5)); + rgb[1] = (((VP8kClip[y + g_off - YUV_RANGE_MIN] << 3) & 0xe0) | + (VP8kClip[y + b_off - YUV_RANGE_MIN] >> 3)); +} + +static inline void VP8YuvToArgbKeepA(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const argb) { + // Don't update Alpha (argb[0]) + VP8YuvToRgb(y, u, v, argb + 1); +} + +static inline void VP8YuvToArgb(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const argb) { + 
argb[0] = 0xff; + VP8YuvToArgbKeepA(y, u, v, argb); +} + +static inline void VP8YuvToRgba4444KeepA(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const argb) { + const int r_off = VP8kVToR[v]; + const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX; + const int b_off = VP8kUToB[u]; + // Don't update Alpha (last 4 bits of argb[1]) + argb[0] = ((VP8kClip4Bits[y + r_off - YUV_RANGE_MIN] << 4) | + VP8kClip4Bits[y + g_off - YUV_RANGE_MIN]); + argb[1] = (argb[1] & 0x0f) | (VP8kClip4Bits[y + b_off - YUV_RANGE_MIN] << 4); +} + +static inline void VP8YuvToRgba4444(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const argb) { + argb[1] = 0x0f; + VP8YuvToRgba4444KeepA(y, u, v, argb); +} + +static inline void VP8YuvToBgr(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const bgr) { + const int r_off = VP8kVToR[v]; + const int g_off = (VP8kVToG[v] + VP8kUToG[u]) >> YUV_FIX; + const int b_off = VP8kUToB[u]; + bgr[0] = VP8kClip[y + b_off - YUV_RANGE_MIN]; + bgr[1] = VP8kClip[y + g_off - YUV_RANGE_MIN]; + bgr[2] = VP8kClip[y + r_off - YUV_RANGE_MIN]; +} + +static inline void VP8YuvToBgra(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const bgra) { + VP8YuvToBgr(y, u, v, bgra); + bgra[3] = 0xff; +} + +static inline void VP8YuvToRgba(uint8_t y, uint8_t u, uint8_t v, + uint8_t* const rgba) { + VP8YuvToRgb(y, u, v, rgba); + rgba[3] = 0xff; +} + +// Must be called before everything, to initialize the tables. +void VP8YUVInit(void); + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif + +#endif /* WEBP_DSP_YUV_H_ */ diff --git a/third_party/libwebp/enc/alpha.c b/third_party/libwebp/enc/alpha.c new file mode 100644 index 0000000..2ea054d --- /dev/null +++ b/third_party/libwebp/enc/alpha.c @@ -0,0 +1,114 @@ +// Copyright 2011 Google Inc. 
+// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Alpha-plane compression. +// +// Author: Skal (pascal.massimino@gmail.com) + +#include <assert.h> +#include <stdlib.h> +#include "vp8enci.h" + +#ifdef WEBP_EXPERIMENTAL_FEATURES +#include "zlib.h" +#endif + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#ifdef WEBP_EXPERIMENTAL_FEATURES + +#define CHUNK_SIZE 8192 + +//------------------------------------------------------------------------------ + +static int CompressAlpha(const uint8_t* data, size_t data_size, + uint8_t** output, size_t* output_size, + int algo) { + int ret = Z_OK; + z_stream strm; + unsigned char chunk[CHUNK_SIZE]; + + *output = NULL; + *output_size = 0; + memset(&strm, 0, sizeof(strm)); + if (deflateInit(&strm, algo ? 
Z_BEST_SPEED : Z_BEST_COMPRESSION) != Z_OK) { + return 0; + } + strm.next_in = (unsigned char*)data; + strm.avail_in = data_size; + do { + size_t size_out; + + strm.next_out = chunk; + strm.avail_out = CHUNK_SIZE; + ret = deflate(&strm, Z_FINISH); + if (ret == Z_STREAM_ERROR) { + break; + } + size_out = CHUNK_SIZE - strm.avail_out; + if (size_out) { + size_t new_size = *output_size + size_out; + uint8_t* new_output = realloc(*output, new_size); + if (new_output == NULL) { + ret = Z_MEM_ERROR; + break; + } + memcpy(new_output + *output_size, chunk, size_out); + *output_size = new_size; + *output = new_output; + } + } while (ret != Z_STREAM_END || strm.avail_out == 0); + + deflateEnd(&strm); + if (ret != Z_STREAM_END) { + /* Clear both outputs: *output was just freed, and VP8EncDeleteAlpha() + * free()s the same enc->alpha_data_ pointer again. */ + free(*output); + *output = NULL; + *output_size = 0; + return 0; + } + return 1; +} + +#endif /* WEBP_EXPERIMENTAL_FEATURES */ + +void VP8EncInitAlpha(VP8Encoder* enc) { + enc->has_alpha_ = (enc->pic_->a != NULL); + enc->alpha_data_ = NULL; + enc->alpha_data_size_ = 0; +} + +void VP8EncCodeAlphaBlock(VP8EncIterator* it) { + (void)it; + // Nothing for now. We just ZLIB-compress in the end. 
+} + +int VP8EncFinishAlpha(VP8Encoder* enc) { + if (enc->has_alpha_) { +#ifdef WEBP_EXPERIMENTAL_FEATURES + const WebPPicture* pic = enc->pic_; + assert(pic->a); + if (!CompressAlpha(pic->a, pic->width * pic->height, + &enc->alpha_data_, &enc->alpha_data_size_, + enc->config_->alpha_compression)) { + return 0; + } +#endif + } + return 1; +} + +void VP8EncDeleteAlpha(VP8Encoder* enc) { + free(enc->alpha_data_); + enc->alpha_data_ = NULL; + enc->alpha_data_size_ = 0; + enc->has_alpha_ = 0; +} + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif diff --git a/third_party/libwebp/enc/analysis.c b/third_party/libwebp/enc/analysis.c index 41e12e8..c6609f8 100644 --- a/third_party/libwebp/enc/analysis.c +++ b/third_party/libwebp/enc/analysis.c @@ -20,52 +20,13 @@ extern "C" { #endif -#define MAX_COEFF_THRESH 64 #define MAX_ITERS_K_MEANS 6 -//----------------------------------------------------------------------------- -// Compute susceptibility based on DCT-coeff histograms: -// the higher, the "easier" the macroblock is to compress. - static int ClipAlpha(int alpha) { return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha; } -static int GetAlpha(const int histo[MAX_COEFF_THRESH]) { - int num = 0, den = 0, val = 0; - int k; - int alpha; - for (k = 0; k < MAX_COEFF_THRESH; ++k) { - if (histo[k]) { - val += histo[k]; - num += val * (k + 1); - den += (k + 1) * (k + 1); - } - } - // we scale the value to a usable [0..255] range - alpha = den ? 10 * num / den - 5 : 0; - return ClipAlpha(alpha); -} - -static int CollectHistogram(const uint8_t* ref, const uint8_t* pred, - int start_block, int end_block) { - int histo[MAX_COEFF_THRESH] = { 0 }; - int16_t out[16]; - int j, k; - for (j = start_block; j < end_block; ++j) { - VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out); - for (k = 0; k < 16; ++k) { - const int v = abs(out[k]) >> 2; - if (v) { - const int bin = (v > MAX_COEFF_THRESH) ? 
MAX_COEFF_THRESH : v; - histo[bin - 1]++; - } - } - } - return GetAlpha(histo); -} - -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Smooth the segment map by replacing isolated block by the majority of its // neighbours. @@ -86,11 +47,11 @@ static void SmoothSegmentMap(VP8Encoder* const enc) { cnt[mb[-w - 1].segment_]++; // top-left cnt[mb[-w + 0].segment_]++; // top cnt[mb[-w + 1].segment_]++; // top-right - cnt[mb[ - 1].segment_]++; // left - cnt[mb[ + 1].segment_]++; // right - cnt[mb[ w - 1].segment_]++; // bottom-left - cnt[mb[ w + 0].segment_]++; // bottom - cnt[mb[ w + 1].segment_]++; // bottom-right + cnt[mb[ - 1].segment_]++; // left + cnt[mb[ + 1].segment_]++; // right + cnt[mb[ w - 1].segment_]++; // bottom-left + cnt[mb[ w + 0].segment_]++; // bottom + cnt[mb[ w + 1].segment_]++; // bottom-right for (n = 0; n < NUM_MB_SEGMENTS; ++n) { if (cnt[n] >= majority_cnt_3_x_3_grid) { majority_seg = n; @@ -108,7 +69,7 @@ static void SmoothSegmentMap(VP8Encoder* const enc) { free(tmp); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Finalize Segment probability based on the coding tree static int GetProba(int a, int b) { @@ -178,7 +139,7 @@ static void SetSegmentAlphas(VP8Encoder* const enc, } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Simplified k-Means, to assign Nb segments based on alpha-histogram static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) { @@ -259,7 +220,7 @@ static void AssignSegments(VP8Encoder* const enc, const int alphas[256]) { SetSegmentAlphas(enc, centers, weighted_average); // pick some alphas. 
} -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Macroblock analysis: collect histogram for each mode, deduce the maximal // susceptibility and set best modes for this macroblock. // Segment assignment is done later. @@ -278,9 +239,9 @@ static int MBAnalyzeBestIntra16Mode(VP8EncIterator* const it) { VP8MakeLuma16Preds(it); for (mode = 0; mode < max_mode; ++mode) { - const int alpha = CollectHistogram(it->yuv_in_ + Y_OFF, - it->yuv_p_ + VP8I16ModeOffsets[mode], - 0, 16); + const int alpha = VP8CollectHistogram(it->yuv_in_ + Y_OFF, + it->yuv_p_ + VP8I16ModeOffsets[mode], + 0, 16); if (alpha > best_alpha) { best_alpha = alpha; best_mode = mode; @@ -303,9 +264,9 @@ static int MBAnalyzeBestIntra4Mode(VP8EncIterator* const it, VP8MakeIntra4Preds(it); for (mode = 0; mode < max_mode; ++mode) { - const int alpha = CollectHistogram(src, - it->yuv_p_ + VP8I4ModeOffsets[mode], - 0, 1); + const int alpha = VP8CollectHistogram(src, + it->yuv_p_ + VP8I4ModeOffsets[mode], + 0, 1); if (alpha > best_mode_alpha) { best_mode_alpha = alpha; modes[it->i4_] = mode; @@ -329,9 +290,9 @@ static int MBAnalyzeBestUVMode(VP8EncIterator* const it) { int mode; VP8MakeChroma8Preds(it); for (mode = 0; mode < max_mode; ++mode) { - const int alpha = CollectHistogram(it->yuv_in_ + U_OFF, - it->yuv_p_ + VP8UVModeOffsets[mode], - 16, 16 + 4 + 4); + const int alpha = VP8CollectHistogram(it->yuv_in_ + U_OFF, + it->yuv_p_ + VP8UVModeOffsets[mode], + 16, 16 + 4 + 4); if (alpha > best_alpha) { best_alpha = alpha; best_mode = mode; @@ -367,7 +328,7 @@ static void MBAnalyze(VP8EncIterator* const it, it->mb_->alpha_ = best_alpha; // Informative only. 
} -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Main analysis loop: // Collect all susceptibilities for each macroblock and record their // distribution in alphas[]. Segments is assigned a-posteriori, based on diff --git a/third_party/libwebp/enc/config.c b/third_party/libwebp/enc/config.c index 86ef5ce2..1a74f68 100644 --- a/third_party/libwebp/enc/config.c +++ b/third_party/libwebp/enc/config.c @@ -10,15 +10,15 @@ // Author: Skal (pascal.massimino@gmail.com) #include <assert.h> -#include "webp/encode.h" +#include "../webp/encode.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // WebPConfig -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ int WebPConfigInitInternal(WebPConfig* const config, WebPPreset preset, float quality, int version) { @@ -41,6 +41,8 @@ int WebPConfigInitInternal(WebPConfig* const config, config->show_compressed = 0; config->preprocessing = 0; config->autofilter = 0; + config->alpha_compression = 0; + config->partition_limit = 0; // TODO(skal): tune. 
switch (preset) { @@ -105,10 +107,14 @@ int WebPValidateConfig(const WebPConfig* const config) { return 0; if (config->partitions < 0 || config->partitions > 3) return 0; + if (config->partition_limit < 0 || config->partition_limit > 100) + return 0; + if (config->alpha_compression < 0) + return 0; return 1; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/enc/cost.c b/third_party/libwebp/enc/cost.c index f765598..656d1ea 100644 --- a/third_party/libwebp/enc/cost.c +++ b/third_party/libwebp/enc/cost.c @@ -17,7 +17,7 @@ extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Boolean-cost cost table const uint16_t VP8EntropyCost[256] = { @@ -49,13 +49,13 @@ const uint16_t VP8EntropyCost[256] = { 10, 9, 7, 6, 4, 3 }; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Level cost tables // For each given level, the following table given the pattern of contexts // to use for coding it (in [][0]) as well as the bit value to use for // each context (in [][1]). 
-static const uint16_t kLevelCodes[MAX_VARIABLE_LEVEL][2] = { +const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2] = { {0x001, 0x000}, {0x007, 0x001}, {0x00f, 0x005}, {0x00f, 0x00d}, {0x033, 0x003}, {0x033, 0x003}, {0x033, 0x023}, {0x033, 0x023}, {0x033, 0x023}, {0x033, 0x023}, {0x0d3, 0x013}, @@ -337,8 +337,8 @@ const uint16_t VP8LevelFixedCosts[2048] = { }; static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) { - int pattern = kLevelCodes[level - 1][0]; - int bits = kLevelCodes[level - 1][1]; + int pattern = VP8LevelCodes[level - 1][0]; + int bits = VP8LevelCodes[level - 1][1]; int cost = 0; int i; for (i = 2; pattern; ++i) { @@ -351,7 +351,7 @@ static int VariableLevelCost(int level, const uint8_t probas[NUM_PROBAS]) { return cost; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Pre-calc level costs once for all void VP8CalculateLevelCosts(VP8Proba* const proba) { @@ -374,12 +374,13 @@ void VP8CalculateLevelCosts(VP8Proba* const proba) { } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Mode cost tables. // These are the fixed probabilities (in the coding trees) turned into bit-cost // by calling VP8BitCost(). const uint16_t VP8FixedCostsUV[4] = { 302, 984, 439, 642 }; +// note: these values include the fixed VP8BitCost(1, 145) mode selection cost. 
const uint16_t VP8FixedCostsI16[4] = { 663, 919, 872, 919 }; const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = { { { 251, 1362, 1934, 2085, 2314, 2230, 1839, 1988, 2437, 2348 }, @@ -484,7 +485,7 @@ const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES] = { { 516, 1378, 1569, 1110, 1798, 1798, 1198, 2199, 1543, 712 } }, }; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/enc/cost.h b/third_party/libwebp/enc/cost.h index b80bb10..4b00c85 100644 --- a/third_party/libwebp/enc/cost.h +++ b/third_party/libwebp/enc/cost.h @@ -27,11 +27,13 @@ static inline int VP8BitCost(int bit, uint8_t proba) { } // Cost of coding 'nb' 1's and 'total-nb' 0's using 'proba' probability. -static inline uint64_t VP8BranchCost(uint64_t nb, uint64_t total, uint8_t proba) { +static inline uint64_t VP8BranchCost(uint64_t nb, uint64_t total, + uint8_t proba) { return nb * VP8BitCost(1, proba) + (total - nb) * VP8BitCost(0, proba); } // Level cost calculations +extern const uint16_t VP8LevelCodes[MAX_VARIABLE_LEVEL][2]; void VP8CalculateLevelCosts(VP8Proba* const proba); static inline int VP8LevelCost(const uint16_t* const table, int level) { return VP8LevelFixedCosts[level] @@ -43,10 +45,10 @@ extern const uint16_t VP8FixedCostsUV[4]; extern const uint16_t VP8FixedCostsI16[4]; extern const uint16_t VP8FixedCostsI4[NUM_BMODES][NUM_BMODES][NUM_BMODES]; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" #endif -#endif // WEBP_ENC_COST_H_ +#endif /* WEBP_ENC_COST_H_ */ diff --git a/third_party/libwebp/enc/dsp.c b/third_party/libwebp/enc/enc.c index 45f977c..1d77ab3 100644 --- 
a/third_party/libwebp/enc/dsp.c +++ b/third_party/libwebp/enc/enc.c @@ -16,7 +16,55 @@ extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ +// Compute susceptibility based on DCT-coeff histograms: +// the higher, the "easier" the macroblock is to compress. + +static int ClipAlpha(int alpha) { + return alpha < 0 ? 0 : alpha > 255 ? 255 : alpha; +} + +int VP8GetAlpha(const int histo[MAX_COEFF_THRESH + 1]) { + int num = 0, den = 0, val = 0; + int k; + int alpha; + // note: changing this loop to avoid the numerous "k + 1" slows things down. + for (k = 0; k < MAX_COEFF_THRESH; ++k) { + if (histo[k + 1]) { + val += histo[k + 1]; + num += val * (k + 1); + den += (k + 1) * (k + 1); + } + } + // we scale the value to a usable [0..255] range + alpha = den ? 10 * num / den - 5 : 0; + return ClipAlpha(alpha); +} + +static int CollectHistogram(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block) { + int histo[MAX_COEFF_THRESH + 1] = { 0 }; + int16_t out[16]; + int j, k; + for (j = start_block; j < end_block; ++j) { + VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out); + + // Convert coefficients to bin (within out[]). + for (k = 0; k < 16; ++k) { + const int v = abs(out[k]) >> 2; + out[k] = (v > MAX_COEFF_THRESH) ? MAX_COEFF_THRESH : v; + } + + // Use bin to update histogram. + for (k = 0; k < 16; ++k) { + histo[out[k]]++; + } + } + + return VP8GetAlpha(histo); +} + +//------------------------------------------------------------------------------ // run-time tables (~4k) static uint8_t clip1[255 + 510 + 1]; // clips [-255,510] to [0,255] @@ -39,7 +87,7 @@ static inline uint8_t clip_8b(int v) { return (!(v & ~0xff)) ? v : v < 0 ? 
0 : 255; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Transforms (Paragraph 14.4) #define STORE(x, y, v) \ @@ -49,7 +97,8 @@ static const int kC1 = 20091 + (1 << 16); static const int kC2 = 35468; #define MUL(a, b) (((a) * (b)) >> 16) -static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst) { +static inline void ITransformOne(const uint8_t* ref, const int16_t* in, + uint8_t* dst) { int C[4 * 4], *tmp; int i; tmp = C; @@ -81,6 +130,14 @@ static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst) { } } +static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { + ITransformOne(ref, in, dst); + if (do_two) { + ITransformOne(ref + 4, in + 16, dst + 4); + } +} + static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { int i; int tmp[16]; @@ -166,16 +223,10 @@ static void FTransformWHT(const int16_t* in, int16_t* out) { } } -// default C implementations: -VP8Idct VP8ITransform = ITransform; -VP8Fdct VP8FTransform = FTransform; -VP8WHT VP8ITransformWHT = ITransformWHT; -VP8WHT VP8FTransformWHT = FTransformWHT; - #undef MUL #undef STORE -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Intra predictions #define OUT(x, y) dst[(x) + (y) * BPS] @@ -260,7 +311,7 @@ static inline void DCMode(uint8_t* dst, const uint8_t* left, Fill(dst, DC, size); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Chroma 8x8 prediction (paragraph 12.2) static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, @@ -280,7 +331,7 @@ static void IntraChromaPreds(uint8_t* dst, const uint8_t* left, TrueMotion(C8TM8 + dst, left, top, 8); } 
-//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // luma 16x16 prediction (paragraph 12.3) static void Intra16Preds(uint8_t* dst, @@ -291,7 +342,7 @@ static void Intra16Preds(uint8_t* dst, TrueMotion(I16TM16 + dst, left, top, 16); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // luma 4x4 prediction #define AVG3(a, b, c) (((a) + 2 * (b) + (c) + 2) >> 2) @@ -478,12 +529,7 @@ static void Intra4Preds(uint8_t* dst, const uint8_t* top) { HU4(I4HU4 + dst, top); } -// default C implementations -VP8Intra4Preds VP8EncPredLuma4 = Intra4Preds; -VP8IntraPreds VP8EncPredLuma16 = Intra16Preds; -VP8IntraPreds VP8EncPredChroma8 = IntraChromaPreds; - -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Metric static inline int GetSSE(const uint8_t* a, const uint8_t* b, int w, int h) { @@ -513,22 +559,19 @@ static int SSE4x4(const uint8_t* a, const uint8_t* b) { return GetSSE(a, b, 4, 4); } -// default C implementations -VP8Metric VP8SSE16x16 = SSE16x16; -VP8Metric VP8SSE8x8 = SSE8x8; -VP8Metric VP8SSE16x8 = SSE16x8; -VP8Metric VP8SSE4x4 = SSE4x4; - -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Texture distortion // // We try to match the spectral content (weighted) between source and // reconstructed samples. // Hadamard transform -static void TTransform(const uint8_t* in, int16_t* out) { +// Returns the weighted sum of the absolute value of transformed coefficients. 
+static int TTransform(const uint8_t* in, const uint16_t* w) { + int sum = 0; int tmp[16]; int i; + // horizontal pass for (i = 0; i < 4; ++i, in += BPS) { const int a0 = (in[0] + in[2]) << 2; const int a1 = (in[1] + in[3]) << 2; @@ -539,7 +582,8 @@ static void TTransform(const uint8_t* in, int16_t* out) { tmp[2 + i * 4] = a3 - a2; tmp[3 + i * 4] = a0 - a1; } - for (i = 0; i < 4; ++i) { + // vertical pass + for (i = 0; i < 4; ++i, ++w) { const int a0 = (tmp[0 + i] + tmp[8 + i]); const int a1 = (tmp[4 + i] + tmp[12+ i]); const int a2 = (tmp[4 + i] - tmp[12+ i]); @@ -548,24 +592,20 @@ static void TTransform(const uint8_t* in, int16_t* out) { const int b1 = a3 + a2; const int b2 = a3 - a2; const int b3 = a0 - a1; - out[ 0 + i] = (b0 + (b0 < 0) + 3) >> 3; - out[ 4 + i] = (b1 + (b1 < 0) + 3) >> 3; - out[ 8 + i] = (b2 + (b2 < 0) + 3) >> 3; - out[12 + i] = (b3 + (b3 < 0) + 3) >> 3; + // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3 + sum += w[ 0] * ((abs(b0) + 3) >> 3); + sum += w[ 4] * ((abs(b1) + 3) >> 3); + sum += w[ 8] * ((abs(b2) + 3) >> 3); + sum += w[12] * ((abs(b3) + 3) >> 3); } + return sum; } static int Disto4x4(const uint8_t* const a, const uint8_t* const b, const uint16_t* const w) { - int16_t tmp1[16], tmp2[16]; - int k; - int D; - TTransform(a, tmp1); - TTransform(b, tmp2); - D = 0; - for (k = 0; k < 16; ++k) - D += w[k] * (abs(tmp2[k]) - abs(tmp1[k])); - return (abs(D) + 8) >> 4; + const int sum1 = TTransform(a, w); + const int sum2 = TTransform(b, w); + return (abs(sum2 - sum1) + 8) >> 4; } static int Disto16x16(const uint8_t* const a, const uint8_t* const b, @@ -580,10 +620,7 @@ static int Disto16x16(const uint8_t* const a, const uint8_t* const b, return D; } -VP8WMetric VP8TDisto4x4 = Disto4x4; -VP8WMetric VP8TDisto16x16 = Disto16x16; - -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Quantization // @@ -612,10 +649,7 @@ static int 
QuantizeBlock(int16_t in[16], int16_t out[16], return (last >= 0); } -// default C implementation -VP8QuantizeBlock VP8EncQuantizeBlock = QuantizeBlock; - -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Block copy static inline void Copy(const uint8_t* src, uint8_t* dst, int size) { @@ -631,15 +665,104 @@ static void Copy4x4(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 4); } static void Copy8x8(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 8); } static void Copy16x16(const uint8_t* src, uint8_t* dst) { Copy(src, dst, 16); } -// default C implementations -VP8BlockCopy VP8Copy4x4 = Copy4x4; -VP8BlockCopy VP8Copy8x8 = Copy8x8; -VP8BlockCopy VP8Copy16x16 = Copy16x16; +//------------------------------------------------------------------------------ +// SSE2 detection. +// + +#if defined(__pic__) && defined(__i386__) +static inline void GetCPUInfo(int cpu_info[4], int info_type) { + __asm__ volatile ( + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type)); +} +#elif defined(__i386__) || defined(__x86_64__) +static inline void GetCPUInfo(int cpu_info[4], int info_type) { + __asm__ volatile ( + "cpuid\n" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) + : "a"(info_type)); +} +#elif defined(_MSC_VER) // Visual C++ +#define GetCPUInfo __cpuid +#endif + +#if defined(__i386__) || defined(__x86_64__) || defined(_MSC_VER) +static int x86CPUInfo(CPUFeature feature) { + int cpu_info[4]; + GetCPUInfo(cpu_info, 1); + if (feature == kSSE2) { + return 0 != (cpu_info[3] & 0x04000000); + } + if (feature == kSSE3) { + return 0 != (cpu_info[2] & 0x00000001); + } + return 0; +} +VP8CPUInfo VP8EncGetCPUInfo = x86CPUInfo; +#else +VP8CPUInfo VP8EncGetCPUInfo = NULL; +#endif 
-//----------------------------------------------------------------------------- +// Speed-critical function pointers. We have to initialize them to the default +// implementations within VP8EncDspInit(). +VP8CHisto VP8CollectHistogram; +VP8Idct VP8ITransform; +VP8Fdct VP8FTransform; +VP8WHT VP8ITransformWHT; +VP8WHT VP8FTransformWHT; +VP8Intra4Preds VP8EncPredLuma4; +VP8IntraPreds VP8EncPredLuma16; +VP8IntraPreds VP8EncPredChroma8; +VP8Metric VP8SSE16x16; +VP8Metric VP8SSE8x8; +VP8Metric VP8SSE16x8; +VP8Metric VP8SSE4x4; +VP8WMetric VP8TDisto4x4; +VP8WMetric VP8TDisto16x16; +VP8QuantizeBlock VP8EncQuantizeBlock; +VP8BlockCopy VP8Copy4x4; +VP8BlockCopy VP8Copy8x8; +VP8BlockCopy VP8Copy16x16; + +extern void VP8EncDspInitSSE2(void); void VP8EncDspInit(void) { InitTables(); + + // default C implementations + VP8CollectHistogram = CollectHistogram; + VP8ITransform = ITransform; + VP8FTransform = FTransform; + VP8ITransformWHT = ITransformWHT; + VP8FTransformWHT = FTransformWHT; + VP8EncPredLuma4 = Intra4Preds; + VP8EncPredLuma16 = Intra16Preds; + VP8EncPredChroma8 = IntraChromaPreds; + VP8SSE16x16 = SSE16x16; + VP8SSE8x8 = SSE8x8; + VP8SSE16x8 = SSE16x8; + VP8SSE4x4 = SSE4x4; + VP8TDisto4x4 = Disto4x4; + VP8TDisto16x16 = Disto16x16; + VP8EncQuantizeBlock = QuantizeBlock; + VP8Copy4x4 = Copy4x4; + VP8Copy8x8 = Copy8x8; + VP8Copy16x16 = Copy16x16; + + // If defined, use CPUInfo() to overwrite some pointers with faster versions. + if (VP8EncGetCPUInfo) { + if (VP8EncGetCPUInfo(kSSE2)) { +#if defined(__SSE2__) || defined(_MSC_VER) + VP8EncDspInitSSE2(); +#endif + } + if (VP8EncGetCPUInfo(kSSE3)) { + // later we'll plug some SSE3 variant here + } + } } #if defined(__cplusplus) || defined(c_plusplus) diff --git a/third_party/libwebp/enc/enc_sse2.c b/third_party/libwebp/enc/enc_sse2.c new file mode 100644 index 0000000..22d2d62 --- /dev/null +++ b/third_party/libwebp/enc/enc_sse2.c @@ -0,0 +1,834 @@ +// Copyright 2011 Google Inc. 
+// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// SSE2 version of speed-critical functions. +// +// Author: Christian Duvivier (cduvivier@google.com) + +#if defined(__SSE2__) || defined(_MSC_VER) +#include <emmintrin.h> + +#include "vp8enci.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +//------------------------------------------------------------------------------ +// Compute susceptibility based on DCT-coeff histograms: +// the higher, the "easier" the macroblock is to compress. + +static int CollectHistogramSSE2(const uint8_t* ref, const uint8_t* pred, + int start_block, int end_block) { + int histo[MAX_COEFF_THRESH + 1] = { 0 }; + int16_t out[16]; + int j, k; + const __m128i max_coeff_thresh = _mm_set1_epi16(MAX_COEFF_THRESH); + for (j = start_block; j < end_block; ++j) { + VP8FTransform(ref + VP8Scan[j], pred + VP8Scan[j], out); + + // Convert coefficients to bin (within out[]). + { + // Load. 
+ const __m128i out0 = _mm_loadu_si128((__m128i*)&out[0]); + const __m128i out1 = _mm_loadu_si128((__m128i*)&out[8]); + // sign(out) = out >> 15 (0x0000 if positive, 0xffff if negative) + const __m128i sign0 = _mm_srai_epi16(out0, 15); + const __m128i sign1 = _mm_srai_epi16(out1, 15); + // abs(out) = (out ^ sign) - sign + const __m128i xor0 = _mm_xor_si128(out0, sign0); + const __m128i xor1 = _mm_xor_si128(out1, sign1); + const __m128i abs0 = _mm_sub_epi16(xor0, sign0); + const __m128i abs1 = _mm_sub_epi16(xor1, sign1); + // v = abs(out) >> 2 + const __m128i v0 = _mm_srai_epi16(abs0, 2); + const __m128i v1 = _mm_srai_epi16(abs1, 2); + // bin = min(v, MAX_COEFF_THRESH) + const __m128i bin0 = _mm_min_epi16(v0, max_coeff_thresh); + const __m128i bin1 = _mm_min_epi16(v1, max_coeff_thresh); + // Store. + _mm_storeu_si128((__m128i*)&out[0], bin0); + _mm_storeu_si128((__m128i*)&out[8], bin1); + } + + // Use bin to update histogram. + for (k = 0; k < 16; ++k) { + histo[out[k]]++; + } + } + + return VP8GetAlpha(histo); +} + +//------------------------------------------------------------------------------ +// Transforms (Paragraph 14.4) + +// Does one or two inverse transforms. 
+static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, + int do_two) { + // This implementation makes use of 16-bit fixed point versions of two + // multiply constants: + // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 + // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 + // + // To be able to use signed 16-bit integers, we use the following trick to + // have constants within range: + // - Associated constants are obtained by subtracting the 16-bit fixed point + // version of one: + // k = K - (1 << 16) => K = k + (1 << 16) + // K1 = 85267 => k1 = 20091 + // K2 = 35468 => k2 = -30068 + // - The multiplication of a variable by a constant become the sum of the + // variable and the multiplication of that variable by the associated + // constant: + // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x + const __m128i k1 = _mm_set1_epi16(20091); + const __m128i k2 = _mm_set1_epi16(-30068); + __m128i T0, T1, T2, T3; + + // Load and concatenate the transform coefficients (we'll do two inverse + // transforms in parallel). In the case of only one inverse transform, the + // second half of the vectors will just contain random value we'll never + // use nor store. 
+ __m128i in0, in1, in2, in3; + { + in0 = _mm_loadl_epi64((__m128i*)&in[0]); + in1 = _mm_loadl_epi64((__m128i*)&in[4]); + in2 = _mm_loadl_epi64((__m128i*)&in[8]); + in3 = _mm_loadl_epi64((__m128i*)&in[12]); + // a00 a10 a20 a30 x x x x + // a01 a11 a21 a31 x x x x + // a02 a12 a22 a32 x x x x + // a03 a13 a23 a33 x x x x + if (do_two) { + const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]); + const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]); + const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]); + const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]); + in0 = _mm_unpacklo_epi64(in0, inB0); + in1 = _mm_unpacklo_epi64(in1, inB1); + in2 = _mm_unpacklo_epi64(in2, inB2); + in3 = _mm_unpacklo_epi64(in3, inB3); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + } + + // Vertical pass and subsequent transpose. + { + // First pass, c and d calculations are longer because of the "trick" + // multiplications. + const __m128i a = _mm_add_epi16(in0, in2); + const __m128i b = _mm_sub_epi16(in0, in2); + // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 + const __m128i c1 = _mm_mulhi_epi16(in1, k2); + const __m128i c2 = _mm_mulhi_epi16(in3, k1); + const __m128i c3 = _mm_sub_epi16(in1, in3); + const __m128i c4 = _mm_sub_epi16(c1, c2); + const __m128i c = _mm_add_epi16(c3, c4); + // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 + const __m128i d1 = _mm_mulhi_epi16(in1, k1); + const __m128i d2 = _mm_mulhi_epi16(in3, k2); + const __m128i d3 = _mm_add_epi16(in1, in3); + const __m128i d4 = _mm_add_epi16(d1, d2); + const __m128i d = _mm_add_epi16(d3, d4); + + // Second pass. + const __m128i tmp0 = _mm_add_epi16(a, d); + const __m128i tmp1 = _mm_add_epi16(b, c); + const __m128i tmp2 = _mm_sub_epi16(b, c); + const __m128i tmp3 = _mm_sub_epi16(a, d); + + // Transpose the two 4x4. 
+ // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); + const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); + const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); + const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); + const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 a22 b32 b03 b13 b23 b33 + T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); + T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); + T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); + T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + + // Horizontal pass and subsequent transpose. + { + // First pass, c and d calculations are longer because of the "trick" + // multiplications. 
+ const __m128i four = _mm_set1_epi16(4); + const __m128i dc = _mm_add_epi16(T0, four); + const __m128i a = _mm_add_epi16(dc, T2); + const __m128i b = _mm_sub_epi16(dc, T2); + // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 + const __m128i c1 = _mm_mulhi_epi16(T1, k2); + const __m128i c2 = _mm_mulhi_epi16(T3, k1); + const __m128i c3 = _mm_sub_epi16(T1, T3); + const __m128i c4 = _mm_sub_epi16(c1, c2); + const __m128i c = _mm_add_epi16(c3, c4); + // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 + const __m128i d1 = _mm_mulhi_epi16(T1, k1); + const __m128i d2 = _mm_mulhi_epi16(T3, k2); + const __m128i d3 = _mm_add_epi16(T1, T3); + const __m128i d4 = _mm_add_epi16(d1, d2); + const __m128i d = _mm_add_epi16(d3, d4); + + // Second pass. + const __m128i tmp0 = _mm_add_epi16(a, d); + const __m128i tmp1 = _mm_add_epi16(b, c); + const __m128i tmp2 = _mm_sub_epi16(b, c); + const __m128i tmp3 = _mm_sub_epi16(a, d); + const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); + const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); + const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); + const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); + + // Transpose the two 4x4. 
+ // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); + const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); + const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); + const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); + const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 a22 b32 b03 b13 b23 b33 + T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); + T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); + T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); + T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + + // Add inverse transform to 'ref' and store. + { + const __m128i zero = _mm_set1_epi16(0); + // Load the reference(s). + __m128i ref0, ref1, ref2, ref3; + if (do_two) { + // Load eight bytes/pixels per line. + ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]); + ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]); + ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]); + ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]); + } else { + // Load four bytes/pixels per line. 
+ ref0 = _mm_cvtsi32_si128(*(int*)&ref[0 * BPS]); + ref1 = _mm_cvtsi32_si128(*(int*)&ref[1 * BPS]); + ref2 = _mm_cvtsi32_si128(*(int*)&ref[2 * BPS]); + ref3 = _mm_cvtsi32_si128(*(int*)&ref[3 * BPS]); + } + // Convert to 16b. + ref0 = _mm_unpacklo_epi8(ref0, zero); + ref1 = _mm_unpacklo_epi8(ref1, zero); + ref2 = _mm_unpacklo_epi8(ref2, zero); + ref3 = _mm_unpacklo_epi8(ref3, zero); + // Add the inverse transform(s). + ref0 = _mm_add_epi16(ref0, T0); + ref1 = _mm_add_epi16(ref1, T1); + ref2 = _mm_add_epi16(ref2, T2); + ref3 = _mm_add_epi16(ref3, T3); + // Unsigned saturate to 8b. + ref0 = _mm_packus_epi16(ref0, ref0); + ref1 = _mm_packus_epi16(ref1, ref1); + ref2 = _mm_packus_epi16(ref2, ref2); + ref3 = _mm_packus_epi16(ref3, ref3); + // Store the results. + if (do_two) { + // Store eight bytes/pixels per line. + _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0); + _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1); + _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2); + _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3); + } else { + // Store four bytes/pixels per line. + *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0); + *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1); + *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2); + *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3); + } + } +} + +static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, + int16_t* out) { + const __m128i zero = _mm_setzero_si128(); + const __m128i seven = _mm_set1_epi16(7); + const __m128i k7500 = _mm_set1_epi32(7500); + const __m128i k14500 = _mm_set1_epi32(14500); + const __m128i k51000 = _mm_set1_epi32(51000); + const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); + const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, + 5352, 2217, 5352, 2217); + const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, + 2217, -5352, 2217, -5352); + + __m128i v01, v32; + + // Difference between src and ref and initial transpose. 
+ { + // Load src and convert to 16b. + const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]); + const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]); + const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]); + const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]); + const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); + const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); + const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); + const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); + // Load ref and convert to 16b. + const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]); + const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]); + const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]); + const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]); + const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); + const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); + const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); + const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); + // Compute difference. + const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); + const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); + const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); + const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); + + // Transpose. + // 00 01 02 03 0 0 0 0 + // 10 11 12 13 0 0 0 0 + // 20 21 22 23 0 0 0 0 + // 30 31 32 33 0 0 0 0 + const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1); + const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); + v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); + v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); + // a02 a12 a22 a32 a03 a13 a23 a33 + // a00 a10 a20 a30 a01 a11 a21 a31 + // a03 a13 a23 a33 a02 a12 a22 a32 + } + + // First pass and subsequent transpose. + { + // Same operations are done on the (0,3) and (1,2) pairs. 
+ // b0 = (a0 + a3) << 3 + // b1 = (a1 + a2) << 3 + // b3 = (a0 - a3) << 3 + // b2 = (a1 - a2) << 3 + const __m128i a01 = _mm_add_epi16(v01, v32); + const __m128i a32 = _mm_sub_epi16(v01, v32); + const __m128i b01 = _mm_slli_epi16(a01, 3); + const __m128i b32 = _mm_slli_epi16(a32, 3); + const __m128i b11 = _mm_unpackhi_epi64(b01, b01); + const __m128i b22 = _mm_unpackhi_epi64(b32, b32); + + // e0 = b0 + b1 + // e2 = b0 - b1 + const __m128i e0 = _mm_add_epi16(b01, b11); + const __m128i e2 = _mm_sub_epi16(b01, b11); + const __m128i e02 = _mm_unpacklo_epi64(e0, e2); + + // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12 + // e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12 + const __m128i b23 = _mm_unpacklo_epi16(b22, b32); + const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); + const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); + const __m128i d1 = _mm_add_epi32(c1, k14500); + const __m128i d3 = _mm_add_epi32(c3, k7500); + const __m128i e1 = _mm_srai_epi32(d1, 12); + const __m128i e3 = _mm_srai_epi32(d3, 12); + const __m128i e13 = _mm_packs_epi32(e1, e3); + + // Transpose. + // 00 01 02 03 20 21 22 23 + // 10 11 12 13 30 31 32 33 + const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13); + const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13); + // 00 10 01 11 02 12 03 13 + // 20 30 21 31 22 32 23 33 + const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); + v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); + v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); + // 02 12 22 32 03 13 23 33 + // 00 10 20 30 01 11 21 31 + // 03 13 23 33 02 12 22 32 + } + + // Second pass + { + // Same operations are done on the (0,3) and (1,2) pairs. 
+ // a0 = v0 + v3 + // a1 = v1 + v2 + // a3 = v0 - v3 + // a2 = v1 - v2 + const __m128i a01 = _mm_add_epi16(v01, v32); + const __m128i a32 = _mm_sub_epi16(v01, v32); + const __m128i a11 = _mm_unpackhi_epi64(a01, a01); + const __m128i a22 = _mm_unpackhi_epi64(a32, a32); + + // d0 = (a0 + a1 + 7) >> 4; + // d2 = (a0 - a1 + 7) >> 4; + const __m128i b0 = _mm_add_epi16(a01, a11); + const __m128i b2 = _mm_sub_epi16(a01, a11); + const __m128i c0 = _mm_add_epi16(b0, seven); + const __m128i c2 = _mm_add_epi16(b2, seven); + const __m128i d0 = _mm_srai_epi16(c0, 4); + const __m128i d2 = _mm_srai_epi16(c2, 4); + + // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) + // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) + const __m128i b23 = _mm_unpacklo_epi16(a22, a32); + const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); + const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); + const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); + const __m128i d3 = _mm_add_epi32(c3, k51000); + const __m128i e1 = _mm_srai_epi32(d1, 16); + const __m128i e3 = _mm_srai_epi32(d3, 16); + const __m128i f1 = _mm_packs_epi32(e1, e1); + const __m128i f3 = _mm_packs_epi32(e3, e3); + // f1 = f1 + (a3 != 0); + // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the + // desired (0, 1), we add one earlier through k12000_plus_one. + const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); + + _mm_storel_epi64((__m128i*)&out[ 0], d0); + _mm_storel_epi64((__m128i*)&out[ 4], g1); + _mm_storel_epi64((__m128i*)&out[ 8], d2); + _mm_storel_epi64((__m128i*)&out[12], f3); + } +} + +//------------------------------------------------------------------------------ +// Metric + +static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) { + const __m128i zero = _mm_set1_epi16(0); + + // Load values. 
+ const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]); + const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]); + const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]); + const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]); + const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]); + const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]); + const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]); + const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]); + + // Combine pair of lines and convert to 16b. + const __m128i a01 = _mm_unpacklo_epi32(a0, a1); + const __m128i a23 = _mm_unpacklo_epi32(a2, a3); + const __m128i b01 = _mm_unpacklo_epi32(b0, b1); + const __m128i b23 = _mm_unpacklo_epi32(b2, b3); + const __m128i a01s = _mm_unpacklo_epi8(a01, zero); + const __m128i a23s = _mm_unpacklo_epi8(a23, zero); + const __m128i b01s = _mm_unpacklo_epi8(b01, zero); + const __m128i b23s = _mm_unpacklo_epi8(b23, zero); + + // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2 + // TODO(cduvivier): Dissassemble and figure out why this is fastest. We don't + // need absolute values, there is no need to do calculation + // in 8bit as we are already in 16bit, ... Yet this is what + // benchmarks the fastest! + const __m128i d0 = _mm_subs_epu8(a01s, b01s); + const __m128i d1 = _mm_subs_epu8(b01s, a01s); + const __m128i d2 = _mm_subs_epu8(a23s, b23s); + const __m128i d3 = _mm_subs_epu8(b23s, a23s); + + // Square and add them all together. 
+ const __m128i madd0 = _mm_madd_epi16(d0, d0); + const __m128i madd1 = _mm_madd_epi16(d1, d1); + const __m128i madd2 = _mm_madd_epi16(d2, d2); + const __m128i madd3 = _mm_madd_epi16(d3, d3); + const __m128i sum0 = _mm_add_epi32(madd0, madd1); + const __m128i sum1 = _mm_add_epi32(madd2, madd3); + const __m128i sum2 = _mm_add_epi32(sum0, sum1); + int32_t tmp[4]; + _mm_storeu_si128((__m128i*)tmp, sum2); + return (tmp[3] + tmp[2] + tmp[1] + tmp[0]); +} + +//------------------------------------------------------------------------------ +// Texture distortion +// +// We try to match the spectral content (weighted) between source and +// reconstructed samples. + +// Hadamard transform +// Returns the difference between the weighted sum of the absolute value of +// transformed coefficients. +static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB, + const uint16_t* const w) { + int32_t sum[4]; + __m128i tmp_0, tmp_1, tmp_2, tmp_3; + const __m128i zero = _mm_setzero_si128(); + const __m128i one = _mm_set1_epi16(1); + const __m128i three = _mm_set1_epi16(3); + + // Load, combine and tranpose inputs. + { + const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]); + const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]); + const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]); + const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]); + const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]); + const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]); + const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]); + const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]); + + // Combine inA and inB (we'll do two transforms in parallel). 
+ const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); + const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); + const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); + const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); + // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 + // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 + // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 + // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 + + // Transpose the two 4x4, discarding the filling zeroes. + const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); + const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); + // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 + // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 + const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); + // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 + + // Convert to 16b. + tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero); + tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero); + tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero); + tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 a23 a33 b03 b13 b23 b33 + } + + // Horizontal pass and subsequent transpose. + { + // Calculate a and b (two 4x4 at once). 
+ const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2); + const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2); + const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2); + const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2); + // b0_extra = (a0 != 0); + const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one); + const __m128i b0_base = _mm_add_epi16(a0, a1); + const __m128i b1 = _mm_add_epi16(a3, a2); + const __m128i b2 = _mm_sub_epi16(a3, a2); + const __m128i b3 = _mm_sub_epi16(a0, a1); + const __m128i b0 = _mm_add_epi16(b0_base, b0_extra); + // a00 a01 a02 a03 b00 b01 b02 b03 + // a10 a11 a12 a13 b10 b11 b12 b13 + // a20 a21 a22 a23 b20 b21 b22 b23 + // a30 a31 a32 a33 b30 b31 b32 b33 + + // Transpose the two 4x4. + const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); + const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); + const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); + const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); + // a00 a10 a01 a11 a02 a12 a03 a13 + // a20 a30 a21 a31 a22 a32 a23 a33 + // b00 b10 b01 b11 b02 b12 b03 b13 + // b20 b30 b21 b31 b22 b32 b23 b33 + const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); + const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); + const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); + // a00 a10 a20 a30 a01 a11 a21 a31 + // b00 b10 b20 b30 b01 b11 b21 b31 + // a02 a12 a22 a32 a03 a13 a23 a33 + // b02 b12 a22 b32 b03 b13 b23 b33 + tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); + tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); + tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); + tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); + // a00 a10 a20 a30 b00 b10 b20 b30 + // a01 a11 a21 a31 b01 b11 b21 b31 + // a02 a12 a22 a32 b02 b12 b22 b32 + // a03 a13 
a23 a33 b03 b13 b23 b33 + } + + // Vertical pass and difference of weighted sums. + { + // Load all inputs. + // TODO(cduvivier): Make variable declarations and allocations aligned so + // we can use _mm_load_si128 instead of _mm_loadu_si128. + const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]); + const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]); + + // Calculate a and b (two 4x4 at once). + const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); + const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); + const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); + const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); + const __m128i b0 = _mm_add_epi16(a0, a1); + const __m128i b1 = _mm_add_epi16(a3, a2); + const __m128i b2 = _mm_sub_epi16(a3, a2); + const __m128i b3 = _mm_sub_epi16(a0, a1); + + // Separate the transforms of inA and inB. + __m128i A_b0 = _mm_unpacklo_epi64(b0, b1); + __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); + __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); + __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); + + { + // sign(b) = b >> 15 (0x0000 if positive, 0xffff if negative) + const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15); + const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15); + const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15); + const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15); + + // b = abs(b) = (b ^ sign) - sign + A_b0 = _mm_xor_si128(A_b0, sign_A_b0); + A_b2 = _mm_xor_si128(A_b2, sign_A_b2); + B_b0 = _mm_xor_si128(B_b0, sign_B_b0); + B_b2 = _mm_xor_si128(B_b2, sign_B_b2); + A_b0 = _mm_sub_epi16(A_b0, sign_A_b0); + A_b2 = _mm_sub_epi16(A_b2, sign_A_b2); + B_b0 = _mm_sub_epi16(B_b0, sign_B_b0); + B_b2 = _mm_sub_epi16(B_b2, sign_B_b2); + } + + // b = abs(b) + 3 + A_b0 = _mm_add_epi16(A_b0, three); + A_b2 = _mm_add_epi16(A_b2, three); + B_b0 = _mm_add_epi16(B_b0, three); + B_b2 = _mm_add_epi16(B_b2, three); + + // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3 + // b = (abs(b) + 3) >> 3 + A_b0 = _mm_srai_epi16(A_b0, 3); + A_b2 = _mm_srai_epi16(A_b2, 3); + B_b0 = 
_mm_srai_epi16(B_b0, 3); + B_b2 = _mm_srai_epi16(B_b2, 3); + + // weighted sums + A_b0 = _mm_madd_epi16(A_b0, w_0); + A_b2 = _mm_madd_epi16(A_b2, w_8); + B_b0 = _mm_madd_epi16(B_b0, w_0); + B_b2 = _mm_madd_epi16(B_b2, w_8); + A_b0 = _mm_add_epi32(A_b0, A_b2); + B_b0 = _mm_add_epi32(B_b0, B_b2); + + // difference of weighted sums + A_b0 = _mm_sub_epi32(A_b0, B_b0); + _mm_storeu_si128((__m128i*)&sum[0], A_b0); + } + return sum[0] + sum[1] + sum[2] + sum[3]; +} + +static int Disto4x4SSE2(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + const int diff_sum = TTransformSSE2(a, b, w); + return (abs(diff_sum) + 8) >> 4; +} + +static int Disto16x16SSE2(const uint8_t* const a, const uint8_t* const b, + const uint16_t* const w) { + int D = 0; + int x, y; + for (y = 0; y < 16 * BPS; y += 4 * BPS) { + for (x = 0; x < 16; x += 4) { + D += Disto4x4SSE2(a + x + y, b + x + y, w); + } + } + return D; +} + + +//------------------------------------------------------------------------------ +// Quantization +// + +// Simple quantization +static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16], + int n, const VP8Matrix* const mtx) { + const __m128i max_coeff_2047 = _mm_set1_epi16(2047); + const __m128i zero = _mm_set1_epi16(0); + __m128i sign0, sign8; + __m128i coeff0, coeff8; + __m128i out0, out8; + __m128i packed_out; + + // Load all inputs. + // TODO(cduvivier): Make variable declarations and allocations aligned so that + // we can use _mm_load_si128 instead of _mm_loadu_si128. 
+ __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); + __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); + const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]); + const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]); + const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]); + const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]); + const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); + const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); + const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]); + const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]); + const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]); + const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]); + + // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative) + sign0 = _mm_srai_epi16(in0, 15); + sign8 = _mm_srai_epi16(in8, 15); + + // coeff = abs(in) = (in ^ sign) - sign + coeff0 = _mm_xor_si128(in0, sign0); + coeff8 = _mm_xor_si128(in8, sign8); + coeff0 = _mm_sub_epi16(coeff0, sign0); + coeff8 = _mm_sub_epi16(coeff8, sign8); + + // coeff = abs(in) + sharpen + coeff0 = _mm_add_epi16(coeff0, sharpen0); + coeff8 = _mm_add_epi16(coeff8, sharpen8); + + // if (coeff > 2047) coeff = 2047 + coeff0 = _mm_min_epi16(coeff0, max_coeff_2047); + coeff8 = _mm_min_epi16(coeff8, max_coeff_2047); + + // out = (coeff * iQ + B) >> QFIX; + { + // doing calculations with 32b precision (QFIX=17) + // out = (coeff * iQ) + __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); + __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); + __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); + __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); + __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); + __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); + __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); + __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); + // expand bias from 16b to 32b + 
__m128i bias_00 = _mm_unpacklo_epi16(bias0, zero); + __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero); + __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero); + __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero); + // out = (coeff * iQ + B) + out_00 = _mm_add_epi32(out_00, bias_00); + out_04 = _mm_add_epi32(out_04, bias_04); + out_08 = _mm_add_epi32(out_08, bias_08); + out_12 = _mm_add_epi32(out_12, bias_12); + // out = (coeff * iQ + B) >> QFIX; + out_00 = _mm_srai_epi32(out_00, QFIX); + out_04 = _mm_srai_epi32(out_04, QFIX); + out_08 = _mm_srai_epi32(out_08, QFIX); + out_12 = _mm_srai_epi32(out_12, QFIX); + // pack result as 16b + out0 = _mm_packs_epi32(out_00, out_04); + out8 = _mm_packs_epi32(out_08, out_12); + } + + // get sign back (if (sign[j]) out_n = -out_n) + out0 = _mm_xor_si128(out0, sign0); + out8 = _mm_xor_si128(out8, sign8); + out0 = _mm_sub_epi16(out0, sign0); + out8 = _mm_sub_epi16(out8, sign8); + + // in = out * Q + in0 = _mm_mullo_epi16(out0, q0); + in8 = _mm_mullo_epi16(out8, q8); + + // if (coeff <= mtx->zthresh_) {in=0; out=0;} + { + __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0); + __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8); + in0 = _mm_and_si128(in0, cmp0); + in8 = _mm_and_si128(in8, cmp8); + _mm_storeu_si128((__m128i*)&in[0], in0); + _mm_storeu_si128((__m128i*)&in[8], in8); + out0 = _mm_and_si128(out0, cmp0); + out8 = _mm_and_si128(out8, cmp8); + } + + // zigzag the output before storing it. + // + // The zigzag pattern can almost be reproduced with a small sequence of + // shuffles. After it, we only need to swap the 7th (ending up in third + // position instead of twelfth) and 8th values. 
+ { + __m128i outZ0, outZ8; + outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); + outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); + outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2)); + outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1)); + outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0)); + outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0)); + _mm_storeu_si128((__m128i*)&out[0], outZ0); + _mm_storeu_si128((__m128i*)&out[8], outZ8); + packed_out = _mm_packs_epi16(outZ0, outZ8); + } + { + const int16_t outZ_12 = out[12]; + const int16_t outZ_3 = out[3]; + out[3] = outZ_12; + out[12] = outZ_3; + } + + // detect if all 'out' values are zeroes or not + { + int32_t tmp[4]; + _mm_storeu_si128((__m128i*)tmp, packed_out); + if (n) { + tmp[0] &= ~0xff; + } + return (tmp[3] || tmp[2] || tmp[1] || tmp[0]); + } +} + +extern void VP8EncDspInitSSE2(void); +void VP8EncDspInitSSE2(void) { + VP8CollectHistogram = CollectHistogramSSE2; + VP8EncQuantizeBlock = QuantizeBlockSSE2; + VP8ITransform = ITransformSSE2; + VP8FTransform = FTransformSSE2; + VP8SSE4x4 = SSE4x4SSE2; + VP8TDisto4x4 = Disto4x4SSE2; + VP8TDisto16x16 = Disto16x16SSE2; +} + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif + +#endif //__SSE2__ diff --git a/third_party/libwebp/enc/filter.c b/third_party/libwebp/enc/filter.c index a0a42b0..04b3610 100644 --- a/third_party/libwebp/enc/filter.c +++ b/third_party/libwebp/enc/filter.c @@ -45,7 +45,7 @@ static void InitTables(void) { } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Edge filtering functions // 4 pixels in, 2 pixels out @@ -92,7 +92,7 @@ static inline int needs_filter2(const uint8_t* p, int step, int t, int it) { abs0[255 + q2 - q1] <= it && abs0[255 + q1 - q0] <= it; } -//----------------------------------------------------------------------------- 
+//------------------------------------------------------------------------------ // Simple In-loop filtering (Paragraph 15.2) static void SimpleVFilter16(uint8_t* p, int stride, int thresh) { @@ -129,7 +129,7 @@ static void SimpleHFilter16i(uint8_t* p, int stride, int thresh) { } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Complex In-loop filtering (Paragraph 15.3) static inline void FilterLoop24(uint8_t* p, int hstride, int vstride, int size, @@ -177,7 +177,7 @@ static void HFilter8i(uint8_t* u, uint8_t* v, int stride, FilterLoop24(v + 4, 1, stride, 8, thresh, ithresh, hev_thresh); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ void (*VP8EncVFilter16i)(uint8_t*, int, int, int, int) = VFilter16i; void (*VP8EncHFilter16i)(uint8_t*, int, int, int, int) = HFilter16i; @@ -187,7 +187,7 @@ void (*VP8EncHFilter8i)(uint8_t*, uint8_t*, int, int, int, int) = HFilter8i; void (*VP8EncSimpleVFilter16i)(uint8_t*, int, int) = SimpleVFilter16i; void (*VP8EncSimpleHFilter16i)(uint8_t*, int, int) = SimpleHFilter16i; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Paragraph 15.4: compute the inner-edge filtering strength static int GetILevel(int sharpness, int level) { @@ -229,7 +229,7 @@ static void DoFilter(const VP8EncIterator* const it, int level) { } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // SSIM metric enum { KERNEL = 3 }; @@ -302,7 +302,7 @@ static double GetMBSSIM(const uint8_t* yuv1, const uint8_t* yuv2) { return GetSSIM(&s); } 
-//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Exposed APIs: Encoder should call the following 3 functions to adjust // loop filter strength diff --git a/third_party/libwebp/enc/frame.c b/third_party/libwebp/enc/frame.c index 9864c1d..eebea73 100644 --- a/third_party/libwebp/enc/frame.c +++ b/third_party/libwebp/enc/frame.c @@ -37,7 +37,7 @@ typedef struct { CostArray* cost; } VP8Residual; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Tables for level coding const uint8_t VP8EncBands[16 + 1] = { @@ -51,18 +51,16 @@ static const uint8_t kCat5[] = { 180, 157, 141, 134, 130 }; static const uint8_t kCat6[] = { 254, 254, 243, 230, 196, 177, 153, 140, 133, 130, 129 }; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Reset the statistics about: number of skips, token proba, level cost,... static void ResetStats(VP8Encoder* const enc, int precalc_cost) { VP8Proba* const proba = &enc->proba_; if (precalc_cost) VP8CalculateLevelCosts(proba); proba->nb_skip_ = 0; - proba->nb_i4_ = 0; - proba->nb_i16_ = 0; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Skip decision probability static int CalcSkipProba(uint64_t nb, uint64_t total) { @@ -86,7 +84,7 @@ static int FinalizeSkipProba(VP8Encoder* const enc) { return size; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Recording of token probabilities. 
static void ResetTokenStats(VP8Encoder* const enc) { @@ -101,6 +99,9 @@ static int Record(int bit, uint64_t* const stats) { return bit; } +// We keep the table free variant around for reference, in case. +#define USE_LEVEL_CODE_TABLE + // Simulate block coding, but only record statistics. // Note: no need to record the fixed probas. static int RecordCoeffs(int ctx, VP8Residual* res) { @@ -111,14 +112,16 @@ static int RecordCoeffs(int ctx, VP8Residual* res) { } while (1) { - const int v = abs(res->coeffs[n++]); + int v = res->coeffs[n++]; if (!Record(v != 0, s[1])) { s = res->stats[VP8EncBands[n]][0]; continue; } - if (!Record(v > 1, s[2])) { + if (!Record(2u < (unsigned int)(v + 1), s[2])) { // v = -1 or 1 s = res->stats[VP8EncBands[n]][1]; } else { + v = abs(v); +#if !defined(USE_LEVEL_CODE_TABLE) if (!Record(v > 4, s[3])) { if (Record(v != 2, s[4])) Record(v == 4, s[5]); @@ -129,6 +132,20 @@ static int RecordCoeffs(int ctx, VP8Residual* res) { } else { Record((v >= 3 + (8 << 3)), s[10]); } +#else + if (v > MAX_VARIABLE_LEVEL) + v = MAX_VARIABLE_LEVEL; + + { + const int bits = VP8LevelCodes[v - 1][1]; + int pattern = VP8LevelCodes[v - 1][0]; + int i; + for (i = 0; (pattern >>= 1) != 0; ++i) { + const int mask = 2 << i; + if (pattern & 1) Record(!!(bits & mask), s[3 + i]); + } + } +#endif s = res->stats[VP8EncBands[n]][2]; } if (n == 16 || !Record(n <= res->last, s[0])) { @@ -174,7 +191,7 @@ static int FinalizeTokenProbas(VP8Encoder* const enc) { return size; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // helper functions for residuals struct VP8Residual. 
static void InitResidual(int first, int coeff_type, @@ -199,7 +216,7 @@ static void SetResidualCoeffs(const int16_t* const coeffs, res->coeffs = coeffs; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Mode costs static int GetResidualCost(int ctx, const VP8Residual* const res) { @@ -213,16 +230,18 @@ static int GetResidualCost(int ctx, const VP8Residual* const res) { return cost; } while (n <= res->last) { - const int v = abs(res->coeffs[n++]); - cost += VP8LevelCost(t, v); + const int v = res->coeffs[n++]; if (v == 0) { + cost += VP8LevelCost(t, 0); p = res->prob[VP8EncBands[n]][0]; t = res->cost[VP8EncBands[n]][0]; continue; - } else if (v == 1) { + } else if (2u >= (unsigned int)(v + 1)) { // v = -1 or 1 + cost += VP8LevelCost(t, 1); p = res->prob[VP8EncBands[n]][1]; t = res->cost[VP8EncBands[n]][1]; } else { + cost += VP8LevelCost(t, abs(v)); p = res->prob[VP8EncBands[n]][2]; t = res->cost[VP8EncBands[n]][2]; } @@ -292,7 +311,7 @@ int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd) { return R; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Coefficient coding static int PutCoeffs(VP8BitWriter* const bw, int ctx, const VP8Residual* res) { @@ -462,7 +481,7 @@ static void RecordResiduals(VP8EncIterator* const it, VP8IteratorBytesToNz(it); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // ExtraInfo map / Debug function #if SEGMENT_VISU @@ -525,7 +544,7 @@ static void StoreSideInfo(const VP8EncIterator* const it) { #endif } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Main 
loops // // VP8EncLoop(): does the final bitstream coding. @@ -568,6 +587,14 @@ int VP8EncLoop(VP8Encoder* const enc) { } else { // reset predictors after a skip ResetAfterSkip(&it); } +#ifdef WEBP_EXPERIMENTAL_FEATURES + if (enc->has_alpha_) { + VP8EncCodeAlphaBlock(&it); + } + if (enc->use_layer_) { + VP8EncCodeLayerBlock(&it); + } +#endif StoreSideInfo(&it); VP8StoreFilterStats(&it); VP8IteratorExport(&it); @@ -589,7 +616,7 @@ int VP8EncLoop(VP8Encoder* const enc) { return 1; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // VP8StatLoop(): only collect statistics (number of skips, token usage, ...) // This is used for deciding optimal probabilities. It also // modifies the quantizer value if some target (size, PNSR) @@ -664,7 +691,7 @@ int VP8StatLoop(VP8Encoder* const enc) { } // binary search for a size close to target - for (pass = 0; pass < enc->config_->pass || (dqs[pass] > 0); ++pass) { + for (pass = 0; pass < enc->config_->pass && (dqs[pass] > 0); ++pass) { const int rd_opt = 1; float PSNR; int criterion; @@ -688,7 +715,7 @@ int VP8StatLoop(VP8Encoder* const enc) { return 1; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/enc/iterator.c b/third_party/libwebp/enc/iterator.c index 991644d..ad22970 100644 --- a/third_party/libwebp/enc/iterator.c +++ b/third_party/libwebp/enc/iterator.c @@ -17,9 +17,9 @@ extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // VP8Iterator -//----------------------------------------------------------------------------- 
+//------------------------------------------------------------------------------ static void InitLeft(VP8EncIterator* const it) { const VP8Encoder* const enc = it->enc_; @@ -68,7 +68,7 @@ void VP8IteratorInit(VP8Encoder* const enc, VP8EncIterator* const it) { VP8IteratorReset(it); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Import the source samples into the cache. Takes care of replicating // boundary pixels if necessary. @@ -122,7 +122,7 @@ void VP8IteratorImport(const VP8EncIterator* const it) { } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Copy back the compressed samples into user space if requested. void VP8IteratorExport(const VP8EncIterator* const it) { @@ -148,16 +148,18 @@ void VP8IteratorExport(const VP8EncIterator* const it) { memcpy(ydst + i * pic->y_stride, ysrc + i * BPS, w); } // U/V plane - w = (w + 1) / 2; - h = (h + 1) / 2; - for (i = 0; i < h; ++i) { - memcpy(udst + i * pic->uv_stride, usrc + i * BPS, w); - memcpy(vdst + i * pic->uv_stride, vsrc + i * BPS, w); + { + const int uv_w = (w + 1) / 2; + const int uv_h = (h + 1) / 2; + for (i = 0; i < uv_h; ++i) { + memcpy(udst + i * pic->uv_stride, usrc + i * BPS, uv_w); + memcpy(vdst + i * pic->uv_stride, vsrc + i * BPS, uv_w); + } } } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Non-zero contexts setup/teardown // Nz bits: @@ -214,7 +216,8 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) { nz |= (it->top_nz_[6] << 22) | (it->top_nz_[7] << 23); nz |= (it->top_nz_[8] << 24); // we propagate the _top_ bit, esp. 
for intra4 // left - nz |= (it->left_nz_[0] << 3) | (it->left_nz_[1] << 7) | (it->left_nz_[2] << 11); + nz |= (it->left_nz_[0] << 3) | (it->left_nz_[1] << 7); + nz |= (it->left_nz_[2] << 11); nz |= (it->left_nz_[4] << 17) | (it->left_nz_[6] << 21); *it->nz_ = nz; @@ -222,7 +225,7 @@ void VP8IteratorBytesToNz(VP8EncIterator* const it) { #undef BIT -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Advance to the next position, doing the bookeeping. int VP8IteratorNext(VP8EncIterator* const it, @@ -267,7 +270,7 @@ int VP8IteratorNext(VP8EncIterator* const it, return (0 < --it->done_); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Helper function to set mode properties void VP8SetIntra16Mode(const VP8EncIterator* const it, int mode) { @@ -304,7 +307,7 @@ void VP8SetSegment(const VP8EncIterator* const it, int segment) { it->mb_->segment_ = segment; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Intra4x4 sub-blocks iteration // // We store and update the boundary samples into an array of 37 pixels. They @@ -399,7 +402,7 @@ int VP8IteratorRotateI4(VP8EncIterator* const it, return 1; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/enc/layer.c b/third_party/libwebp/enc/layer.c new file mode 100644 index 0000000..65e4cd8 --- /dev/null +++ b/third_party/libwebp/enc/layer.c @@ -0,0 +1,55 @@ +// Copyright 2011 Google Inc. 
+// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Enhancement layer (for YUV444/422) +// +// Author: Skal (pascal.massimino@gmail.com) + +#include <assert.h> +#include <stdlib.h> +#include "vp8enci.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#ifdef WEBP_EXPERIMENTAL_FEATURES + +#endif /* WEBP_EXPERIMENTAL_FEATURES */ + +//------------------------------------------------------------------------------ + +void VP8EncInitLayer(VP8Encoder* const enc) { + enc->use_layer_ = (enc->pic_->u0 != NULL); + enc->layer_data_size_ = 0; + enc->layer_data_ = NULL; + if (enc->use_layer_) { + VP8BitWriterInit(&enc->layer_bw_, enc->mb_w_ * enc->mb_h_ * 3); + } +} + +void VP8EncCodeLayerBlock(VP8EncIterator* it) { + (void)it; // remove a warning +#ifdef WEBP_EXPERIMENTAL_FEATURES +#endif /* WEBP_EXPERIMENTAL_FEATURES */ +} + +int VP8EncFinishLayer(VP8Encoder* const enc) { + if (enc->use_layer_) { + enc->layer_data_ = VP8BitWriterFinish(&enc->layer_bw_); + enc->layer_data_size_ = VP8BitWriterSize(&enc->layer_bw_); + } + return 1; +} + +void VP8EncDeleteLayer(VP8Encoder* enc) { + free(enc->layer_data_); +} + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif diff --git a/third_party/libwebp/enc/picture.c b/third_party/libwebp/enc/picture.c index 6c12ea4..2f3c96e 100644 --- a/third_party/libwebp/enc/picture.c +++ b/third_party/libwebp/enc/picture.c @@ -9,6 +9,7 @@ // // Author: Skal (pascal.massimino@gmail.com) +#include <assert.h> #include <stdlib.h> #include "vp8enci.h" @@ -16,54 +17,122 @@ extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // 
WebPPicture -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ int WebPPictureAlloc(WebPPicture* const picture) { if (picture) { + const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK; + const int has_alpha = picture->colorspace & WEBP_CSP_ALPHA_BIT; const int width = picture->width; const int height = picture->height; + const int y_stride = width; const int uv_width = (width + 1) / 2; const int uv_height = (height + 1) / 2; - const uint64_t y_size = (uint64_t)width * height; - const uint64_t uv_size = (uint64_t)uv_width * uv_height; - const uint64_t total_size = y_size + 2 * uv_size; + const int uv_stride = uv_width; + int uv0_stride = 0; + int a_width, a_stride; + uint64_t y_size, uv_size, uv0_size, a_size, total_size; + uint8_t* mem; + + // U/V + switch (uv_csp) { + case WEBP_YUV420: + break; +#ifdef WEBP_EXPERIMENTAL_FEATURES + case WEBP_YUV400: // for now, we'll just reset the U/V samples + break; + case WEBP_YUV422: + uv0_stride = uv_width; + break; + case WEBP_YUV444: + uv0_stride = width; + break; +#endif + default: + return 0; + } + uv0_size = height * uv0_stride; + + // alpha + a_width = has_alpha ? 
width : 0; + a_stride = a_width; + y_size = (uint64_t)y_stride * height; + uv_size = (uint64_t)uv_stride * uv_height; + a_size = (uint64_t)a_stride * height; + + total_size = y_size + a_size + 2 * uv_size + 2 * uv0_size; + // Security and validation checks - if (uv_width <= 0 || uv_height <= 0 || // check param error + if (width <= 0 || height <= 0 || // check for luma/alpha param error + uv_width < 0 || uv_height < 0 || // check for u/v param error y_size >= (1ULL << 40) || // check for reasonable global size (size_t)total_size != total_size) { // check for overflow on 32bit return 0; } - picture->y_stride = width; - picture->uv_stride = uv_width; + picture->y_stride = y_stride; + picture->uv_stride = uv_stride; + picture->a_stride = a_stride; + picture->uv0_stride = uv0_stride; WebPPictureFree(picture); // erase previous buffer - picture->y = (uint8_t*)malloc((size_t)total_size); - if (picture->y == NULL) return 0; - picture->u = picture->y + y_size; - picture->v = picture->u + uv_size; + mem = (uint8_t*)malloc((size_t)total_size); + if (mem == NULL) return 0; + + picture->y = mem; + mem += y_size; + + picture->u = mem; + mem += uv_size; + picture->v = mem; + mem += uv_size; + + if (a_size) { + picture->a = mem; + mem += a_size; + } + if (uv0_size) { + picture->u0 = mem; + mem += uv0_size; + picture->v0 = mem; + mem += uv0_size; + } } return 1; } +// Grab the 'specs' (writer, *opaque, width, height...) from 'src' and copy them +// into 'dst'. Mark 'dst' as not owning any memory. 'src' can be NULL. +static void WebPPictureGrabSpecs(const WebPPicture* const src, + WebPPicture* const dst) { + if (src) *dst = *src; + dst->y = dst->u = dst->v = NULL; + dst->u0 = dst->v0 = NULL; + dst->a = NULL; +} + +// Release memory owned by 'picture'. 
void WebPPictureFree(WebPPicture* const picture) { if (picture) { free(picture->y); - picture->y = picture->u = picture->v = NULL; + WebPPictureGrabSpecs(NULL, picture); } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ +// Picture copying int WebPPictureCopy(const WebPPicture* const src, WebPPicture* const dst) { int y; if (src == NULL || dst == NULL) return 0; if (src == dst) return 1; - *dst = *src; - dst->y = NULL; + + WebPPictureGrabSpecs(src, dst); if (!WebPPictureAlloc(dst)) return 0; + for (y = 0; y < dst->height; ++y) { - memcpy(dst->y + y * dst->y_stride, src->y + y * src->y_stride, src->width); + memcpy(dst->y + y * dst->y_stride, + src->y + y * src->y_stride, src->width); } for (y = 0; y < (dst->height + 1) / 2; ++y) { memcpy(dst->u + y * dst->uv_stride, @@ -71,9 +140,32 @@ int WebPPictureCopy(const WebPPicture* const src, WebPPicture* const dst) { memcpy(dst->v + y * dst->uv_stride, src->v + y * src->uv_stride, (src->width + 1) / 2); } +#ifdef WEBP_EXPERIMENTAL_FEATURES + if (dst->a != NULL) { + for (y = 0; y < dst->height; ++y) { + memcpy(dst->a + y * dst->a_stride, + src->a + y * src->a_stride, src->width); + } + } + if (dst->u0 != NULL) { + int uv0_width = src->width; + if ((dst->colorspace & WEBP_CSP_UV_MASK) == WEBP_YUV422) { + uv0_width = (uv0_width + 1) / 2; + } + for (y = 0; y < dst->height; ++y) { + memcpy(dst->u0 + y * dst->uv0_stride, + src->u0 + y * src->uv0_stride, uv0_width); + memcpy(dst->v0 + y * dst->uv0_stride, + src->v0 + y * src->uv0_stride, uv0_width); + } + } +#endif return 1; } +//------------------------------------------------------------------------------ +// Picture cropping + int WebPPictureCrop(WebPPicture* const pic, int left, int top, int width, int height) { WebPPicture tmp; @@ -84,8 +176,7 @@ int WebPPictureCrop(WebPPicture* const pic, if (left < 0 || ((left + width + 1) & ~1) > pic->width) return 0; if 
(top < 0 || ((top + height + 1) & ~1) > pic->height) return 0; - tmp = *pic; - tmp.y = NULL; + WebPPictureGrabSpecs(pic, &tmp); tmp.width = width; tmp.height = height; if (!WebPPictureAlloc(&tmp)) return 0; @@ -99,12 +190,189 @@ int WebPPictureCrop(WebPPicture* const pic, memcpy(tmp.u + y * tmp.uv_stride, pic->u + offset, (width + 1) / 2); memcpy(tmp.v + y * tmp.uv_stride, pic->v + offset, (width + 1) / 2); } + +#ifdef WEBP_EXPERIMENTAL_FEATURES + if (tmp.a) { + for (y = 0; y < height; ++y) { + memcpy(tmp.a + y * tmp.a_stride, + pic->a + (top + y) * pic->a_stride + left, width); + } + } + if (tmp.u0) { + int w = width; + int l = left; + if (tmp.colorspace == WEBP_YUV422) { + w = (w + 1) / 2; + l = (l + 1) / 2; + } + for (y = 0; y < height; ++y) { + memcpy(tmp.u0 + y * tmp.uv0_stride, + pic->u0 + (top + y) * pic->uv0_stride + l, w); + memcpy(tmp.v0 + y * tmp.uv0_stride, + pic->v0 + (top + y) * pic->uv0_stride + l, w); + } + } +#endif + + WebPPictureFree(pic); + *pic = tmp; + return 1; +} + +//------------------------------------------------------------------------------ +// Simple picture rescaler + +#define RFIX 30 +#define MULT(x,y) (((int64_t)(x) * (y) + (1 << (RFIX - 1))) >> RFIX) +static inline void ImportRow(const uint8_t* src, int src_width, + int32_t* frow, int32_t* irow, int dst_width) { + const int x_expand = (src_width < dst_width); + const int fx_scale = (1 << RFIX) / dst_width; + int x_in = 0; + int x_out; + int x_accum = 0; + if (!x_expand) { + int sum = 0; + for (x_out = 0; x_out < dst_width; ++x_out) { + x_accum += src_width - dst_width; + for (; x_accum > 0; x_accum -= dst_width) { + sum += src[x_in++]; + } + { // Emit next horizontal pixel. 
+ const int32_t base = src[x_in++]; + const int32_t frac = base * (-x_accum); + frow[x_out] = (sum + base) * dst_width - frac; + sum = MULT(frac, fx_scale); // fresh fractional start for next pixel + } + } + } else { // simple bilinear interpolation + int left = src[0], right = src[0]; + for (x_out = 0; x_out < dst_width; ++x_out) { + if (x_accum < 0) { + left = right; + right = src[++x_in]; + x_accum += dst_width - 1; + } + frow[x_out] = right * (dst_width - 1) + (left - right) * x_accum; + x_accum -= src_width - 1; + } + } + // Accumulate the new row's contribution + for (x_out = 0; x_out < dst_width; ++x_out) { + irow[x_out] += frow[x_out]; + } +} + +static void ExportRow(int32_t* frow, int32_t* irow, uint8_t* dst, int dst_width, + const int yscale, const int64_t fxy_scale) { + int x_out; + for (x_out = 0; x_out < dst_width; ++x_out) { + const int frac = MULT(frow[x_out], yscale); + const int v = (int)(MULT(irow[x_out] - frac, fxy_scale)); + dst[x_out] = (!(v & ~0xff)) ? v : (v < 0) ? 0 : 255; + irow[x_out] = frac; // new fractional start + } +} + +static void RescalePlane(const uint8_t* src, + int src_width, int src_height, int src_stride, + uint8_t* dst, + int dst_width, int dst_height, int dst_stride, + int32_t* const work) { + const int x_expand = (src_width < dst_width); + const int fy_scale = (1 << RFIX) / dst_height; + const int64_t fxy_scale = x_expand ? + ((int64_t)dst_height << RFIX) / (dst_width * src_height) : + ((int64_t)dst_height << RFIX) / (src_width * src_height); + int y_accum = src_height; + int y; + int32_t* irow = work; // integral contribution + int32_t* frow = work + dst_width; // fractional contribution + + memset(work, 0, 2 * dst_width * sizeof(*work)); + for (y = 0; y < src_height; ++y) { + // import new contribution of one source row. 
+ ImportRow(src, src_width, frow, irow, dst_width); + src += src_stride; + // emit output row(s) + y_accum -= dst_height; + for (; y_accum <= 0; y_accum += src_height) { + const int yscale = fy_scale * (-y_accum); + ExportRow(frow, irow, dst, dst_width, yscale, fxy_scale); + dst += dst_stride; + } + } +} +#undef MULT +#undef RFIX + +int WebPPictureRescale(WebPPicture* const pic, int width, int height) { + WebPPicture tmp; + int prev_width, prev_height; + int32_t* work; + + if (pic == NULL) return 0; + prev_width = pic->width; + prev_height = pic->height; + // if width is unspecified, scale original proportionally to height ratio. + if (width == 0) { + width = (prev_width * height + prev_height / 2) / prev_height; + } + // if height is unspecified, scale original proportionally to width ratio. + if (height == 0) { + height = (prev_height * width + prev_width / 2) / prev_width; + } + // Check if the overall dimensions still make sense. + if (width <= 0 || height <= 0) return 0; + + WebPPictureGrabSpecs(pic, &tmp); + tmp.width = width; + tmp.height = height; + if (!WebPPictureAlloc(&tmp)) return 0; + + work = malloc(2 * width * sizeof(int32_t)); + if (work == NULL) { + WebPPictureFree(&tmp); + return 0; + } + + RescalePlane(pic->y, prev_width, prev_height, pic->y_stride, + tmp.y, width, height, tmp.y_stride, work); + RescalePlane(pic->u, + (prev_width + 1) / 2, (prev_height + 1) / 2, pic->uv_stride, + tmp.u, + (width + 1) / 2, (height + 1) / 2, tmp.uv_stride, work); + RescalePlane(pic->v, + (prev_width + 1) / 2, (prev_height + 1) / 2, pic->uv_stride, + tmp.v, + (width + 1) / 2, (height + 1) / 2, tmp.uv_stride, work); + +#ifdef WEBP_EXPERIMENTAL_FEATURES + if (tmp.a) { + RescalePlane(pic->a, prev_width, prev_height, pic->a_stride, + tmp.a, width, height, tmp.a_stride, work); + } + if (tmp.u0) { + int s = 1; + if ((tmp.colorspace & WEBP_CSP_UV_MASK) == WEBP_YUV422) { + s = 2; + } + RescalePlane( + pic->u0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride, + 
tmp.u0, (width + s / 2) / s, height, tmp.uv0_stride, work); + RescalePlane( + pic->v0, (prev_width + s / 2) / s, prev_height, pic->uv0_stride, + tmp.v0, (width + s / 2) / s, height, tmp.uv0_stride, work); + } +#endif + WebPPictureFree(pic); + free(work); *pic = tmp; return 1; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Write-to-memory typedef struct { @@ -150,7 +418,7 @@ static int WebPMemoryWrite(const uint8_t* data, size_t data_size, return 1; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // RGB -> YUV conversion // The exact naming is Y'CbCr, following the ITU-R BT.601 standard. // More information at: http://en.wikipedia.org/wiki/YCbCr @@ -196,36 +464,98 @@ static inline int rgb_to_v(int r, int g, int b) { picture->v[dst] = rgb_to_v(r, g, b); \ } +#define RGB_TO_UV0(x_in, x_out, y, SUM) { \ + const int src = (step * (x_in) + (y) * rgb_stride); \ + const int dst = (x_out) + (y) * picture->uv0_stride; \ + const int r = SUM(r_ptr + src); \ + const int g = SUM(g_ptr + src); \ + const int b = SUM(b_ptr + src); \ + picture->u0[dst] = rgb_to_u(r, g, b); \ + picture->v0[dst] = rgb_to_v(r, g, b); \ +} + +static void MakeGray(WebPPicture* const picture) { + int y; + const int uv_width = (picture->width + 1) >> 1; + for (y = 0; y < ((picture->height + 1) >> 1); ++y) { + memset(picture->u + y * picture->uv_stride, 128, uv_width); + memset(picture->v + y * picture->uv_stride, 128, uv_width); + } +} + static int Import(WebPPicture* const picture, const uint8_t* const rgb, int rgb_stride, - int step, int swap) { + int step, int swap_rb, int import_alpha) { + const WebPEncCSP uv_csp = picture->colorspace & WEBP_CSP_UV_MASK; int x, y; - const uint8_t* const r_ptr = rgb + (swap ? 2 : 0); + const uint8_t* const r_ptr = rgb + (swap_rb ? 
2 : 0); const uint8_t* const g_ptr = rgb + 1; - const uint8_t* const b_ptr = rgb + (swap ? 0 : 2); + const uint8_t* const b_ptr = rgb + (swap_rb ? 0 : 2); + const int width = picture->width; + const int height = picture->height; - for (y = 0; y < picture->height; ++y) { - for (x = 0; x < picture->width; ++x) { + // Import luma plane + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { const int offset = step * x + y * rgb_stride; picture->y[x + y * picture->y_stride] = rgb_to_y(r_ptr[offset], g_ptr[offset], b_ptr[offset]); } } - for (y = 0; y < (picture->height >> 1); ++y) { - for (x = 0; x < (picture->width >> 1); ++x) { - RGB_TO_UV(x, y, SUM4); + + // Downsample U/V plane + if (uv_csp != WEBP_YUV400) { + for (y = 0; y < (height >> 1); ++y) { + for (x = 0; x < (width >> 1); ++x) { + RGB_TO_UV(x, y, SUM4); + } + if (picture->width & 1) { + RGB_TO_UV(x, y, SUM2V); + } } - if (picture->width & 1) { - RGB_TO_UV(x, y, SUM2V); + if (height & 1) { + for (x = 0; x < (width >> 1); ++x) { + RGB_TO_UV(x, y, SUM2H); + } + if (width & 1) { + RGB_TO_UV(x, y, SUM1); + } } - } - if (picture->height & 1) { - for (x = 0; x < (picture->width >> 1); ++x) { - RGB_TO_UV(x, y, SUM2H); + +#ifdef WEBP_EXPERIMENTAL_FEATURES + // Store original U/V samples too + if (uv_csp == WEBP_YUV422) { + for (y = 0; y < height; ++y) { + for (x = 0; x < (width >> 1); ++x) { + RGB_TO_UV0(2 * x, x, y, SUM2H); + } + if (width & 1) { + RGB_TO_UV0(2 * x, x, y, SUM1); + } + } + } else if (uv_csp == WEBP_YUV444) { + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + RGB_TO_UV0(x, x, y, SUM1); + } + } } - if (picture->width & 1) { - RGB_TO_UV(x, y, SUM1); +#endif + } else { + MakeGray(picture); + } + + if (import_alpha) { +#ifdef WEBP_EXPERIMENTAL_FEATURES + const uint8_t* const a_ptr = rgb + 3; + assert(step >= 4); + for (y = 0; y < height; ++y) { + for (x = 0; x < width; ++x) { + picture->a[x + y * picture->a_stride] = + a_ptr[step * x + y * rgb_stride]; + } } +#endif } return 1; } 
@@ -237,34 +567,38 @@ static int Import(WebPPicture* const picture, int WebPPictureImportRGB(WebPPicture* const picture, const uint8_t* const rgb, int rgb_stride) { + picture->colorspace &= ~WEBP_CSP_ALPHA_BIT; if (!WebPPictureAlloc(picture)) return 0; - return Import(picture, rgb, rgb_stride, 3, 0); + return Import(picture, rgb, rgb_stride, 3, 0, 0); } int WebPPictureImportBGR(WebPPicture* const picture, const uint8_t* const rgb, int rgb_stride) { + picture->colorspace &= ~WEBP_CSP_ALPHA_BIT; if (!WebPPictureAlloc(picture)) return 0; - return Import(picture, rgb, rgb_stride, 3, 1); + return Import(picture, rgb, rgb_stride, 3, 1, 0); } int WebPPictureImportRGBA(WebPPicture* const picture, const uint8_t* const rgba, int rgba_stride) { + picture->colorspace |= WEBP_CSP_ALPHA_BIT; if (!WebPPictureAlloc(picture)) return 0; - return Import(picture, rgba, rgba_stride, 4, 0); + return Import(picture, rgba, rgba_stride, 4, 0, 1); } int WebPPictureImportBGRA(WebPPicture* const picture, const uint8_t* const rgba, int rgba_stride) { + picture->colorspace |= WEBP_CSP_ALPHA_BIT; if (!WebPPictureAlloc(picture)) return 0; - return Import(picture, rgba, rgba_stride, 4, 1); + return Import(picture, rgba, rgba_stride, 4, 1, 1); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Simplest call: typedef int (*Importer)(WebPPicture* const, const uint8_t* const, int); -static size_t Encode(const uint8_t* rgb, int width, int height, int stride, +static size_t Encode(const uint8_t* rgba, int width, int height, int stride, Importer import, float quality_factor, uint8_t** output) { size_t output_size = 0; WebPPicture pic; @@ -286,7 +620,7 @@ static size_t Encode(const uint8_t* rgb, int width, int height, int stride, wrt.size = &output_size; InitMemoryWriter(&wrt); - ok = import(&pic, rgb, stride) && WebPEncode(&config, &pic); + ok = import(&pic, rgba, stride) && 
WebPEncode(&config, &pic); WebPPictureFree(&pic); if (!ok) { free(*output); @@ -309,7 +643,7 @@ ENCODE_FUNC(WebPEncodeBGRA, WebPPictureImportBGRA); #undef ENCODE_FUNC -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/enc/quant.c b/third_party/libwebp/enc/quant.c index e439919..27cdbfb 100644 --- a/third_party/libwebp/enc/quant.c +++ b/third_party/libwebp/enc/quant.c @@ -33,13 +33,13 @@ extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ static inline int clip(int v, int m, int M) { return v < m ? m : v > M ? M : v; } -const uint8_t VP8Zigzag[16] = { +static const uint8_t kZigzag[16] = { 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15 }; @@ -132,7 +132,7 @@ static const uint8_t kFreqSharpening[16] = { 90, 90, 90, 90 }; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Initialize quantization parameters in VP8Matrix // Returns the average quantizer @@ -143,7 +143,7 @@ static int ExpandMatrix(VP8Matrix* const m, int type) { m->q_[i] = m->q_[1]; } for (i = 0; i < 16; ++i) { - const int j = VP8Zigzag[i]; + const int j = kZigzag[i]; const int bias = kBiasMatrices[type][j]; m->iq_[j] = (1 << QFIX) / m->q_[j]; m->bias_[j] = BIAS(bias); @@ -192,7 +192,7 @@ static void SetupMatrices(VP8Encoder* enc) { } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Initialize filtering parameters // Very small filter-strength values have close to no visual effect. 
So we can @@ -214,7 +214,7 @@ static void SetupFilterStrength(VP8Encoder* const enc) { enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Note: if you change the values below, remember that the max range // allowed by the syntax for DQ_UV is [-16,16]. @@ -286,7 +286,7 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality) { SetupFilterStrength(enc); // initialize segments' filtering, eventually } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Form the predictions in cache // Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index @@ -316,7 +316,7 @@ void VP8MakeIntra4Preds(const VP8EncIterator* const it) { VP8EncPredLuma4(it->yuv_p_, it->i4_top_); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Quantize // Layout: @@ -341,7 +341,7 @@ const int VP8Scan[16 + 4 + 4] = { 8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS // V }; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Distortion measurement static const uint16_t kWeightY[16] = { @@ -384,7 +384,7 @@ static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) { dst->score += src->score; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Performs trellis-optimized quantization. // Trellis @@ -440,7 +440,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it, // compute maximal distortion. 
max_error = 0; for (n = first; n < 16; ++n) { - const int j = VP8Zigzag[n]; + const int j = kZigzag[n]; const int err = in[j] * in[j]; max_error += kWeightTrellis[j] * err; if (err > thresh) last = n; @@ -464,7 +464,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it, // traverse trellis. for (n = first; n <= last; ++n) { - const int j = VP8Zigzag[n]; + const int j = kZigzag[n]; const int Q = mtx->q_[j]; const int iQ = mtx->iq_[j]; const int B = BIAS(0x00); // neutral bias @@ -560,7 +560,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it, for (; n >= first; --n) { const Node* const node = &NODE(n, best_node); - const int j = VP8Zigzag[n]; + const int j = kZigzag[n]; out[n] = node->sign ? -node->level : node->level; nz |= (node->level != 0); in[j] = out[n] * mtx->q_[j]; @@ -571,7 +571,7 @@ static int TrellisQuantizeBlock(const VP8EncIterator* const it, #undef NODE -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Performs: difference, transform, quantize, back-transform, add // all at once. Output is the reconstructed block in *yuv_out, and the // quantized levels in *levels. 
@@ -615,8 +615,8 @@ static int ReconstructIntra16(VP8EncIterator* const it, // Transform back VP8ITransformWHT(dc_tmp, tmp[0]); - for (n = 0; n < 16; ++n) { - VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n]); + for (n = 0; n < 16; n += 2) { + VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1); } return nz; @@ -642,7 +642,7 @@ static int ReconstructIntra4(VP8EncIterator* const it, } else { nz = VP8EncQuantizeBlock(tmp, levels, 0, &dqm->y1_); } - VP8ITransform(ref, tmp, yuv_out); + VP8ITransform(ref, tmp, yuv_out, 0); return nz; } @@ -666,8 +666,8 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd, for (x = 0; x < 2; ++x, ++n) { const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y]; const int non_zero = - TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2, &dqm->uv_, - dqm->lambda_trellis_uv_); + TrellisQuantizeBlock(it, tmp[n], rd->uv_levels[n], ctx, 2, + &dqm->uv_, dqm->lambda_trellis_uv_); it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero; nz |= non_zero << n; } @@ -679,13 +679,13 @@ static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd, } } - for (n = 0; n < 8; ++n) { - VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n]); + for (n = 0; n < 8; n += 2) { + VP8ITransform(ref + VP8Scan[16 + n], tmp[n], yuv_out + VP8Scan[16 + n], 1); } return (nz << 16); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost. // Pick the mode is lower RD-cost = Rate + lamba * Distortion. 
@@ -738,7 +738,7 @@ static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* const rd) { VP8SetIntra16Mode(it, rd->mode_i16); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // return the cost array corresponding to the surrounding prediction modes. static const uint16_t* GetCostModeI4(VP8EncIterator* const it, @@ -757,10 +757,15 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) { const int tlambda = dqm->tlambda_; const uint8_t* const src0 = it->yuv_in_ + Y_OFF; uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF; + int total_header_bits = 0; VP8ModeScore rd_best; + if (enc->max_i4_header_bits_ == 0) { + return 0; + } + InitScore(&rd_best); - rd_best.score = 0; + rd_best.score = 211; // '211' is the value of VP8BitCost(0, 145) VP8IteratorStartI4(it); do { VP8ModeScore rd_i4; @@ -799,7 +804,9 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) { } SetRDScore(dqm->lambda_mode_, &rd_i4); AddScore(&rd_best, &rd_i4); - if (rd_best.score >= rd->score) { + total_header_bits += mode_costs[best_mode]; + if (rd_best.score >= rd->score || + total_header_bits > enc->max_i4_header_bits_) { return 0; } // Copy selected samples if not in the right place already. 
@@ -817,7 +824,7 @@ static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) { return 1; // select intra4x4 over intra16x16 } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) { VP8Encoder* const enc = it->enc_; @@ -855,7 +862,7 @@ static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) { AddScore(rd, &rd_best); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Final reconstruction and quantization. static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) { @@ -882,7 +889,7 @@ static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) { rd->nz = nz; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Entry point int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt) { diff --git a/third_party/libwebp/enc/syntax.c b/third_party/libwebp/enc/syntax.c index a788f3c..f119018 100644 --- a/third_party/libwebp/enc/syntax.c +++ b/third_party/libwebp/enc/syntax.c @@ -26,7 +26,7 @@ extern "C" { #define MAX_PARTITION0_SIZE (1 << 19) // max size of mode partition #define MAX_PARTITION_SIZE (1 << 24) // max size for token partition -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Writers for header's various pieces (in order of appearance) // Main keyframe header @@ -39,26 +39,31 @@ static void PutLE32(uint8_t* const data, uint32_t val) { } static int PutHeader(int profile, size_t size0, size_t total_size, - const WebPPicture* const pic) { + WebPPicture* 
const pic) { uint8_t buf[KHEADER_SIZE]; uint8_t RIFF[KRIFF_SIZE] = { 'R', 'I', 'F', 'F', 0, 0, 0, 0, 'W', 'E', 'B', 'P', 'V', 'P', '8', ' ' }; uint32_t bits; - if (size0 >= MAX_PARTITION0_SIZE) { - return 0; // partition #0 is too big to fit + if (size0 >= MAX_PARTITION0_SIZE) { // partition #0 is too big to fit + return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION0_OVERFLOW); } - PutLE32(RIFF + 4, total_size + KSIZE_OFFSET); - PutLE32(RIFF + 16, total_size); - if (!pic->writer(RIFF, sizeof(RIFF), pic)) - return 0; + if (total_size > 0xfffffffeU - KRIFF_SIZE) { + return WebPEncodingSetError(pic, VP8_ENC_ERROR_FILE_TOO_BIG); + } + + PutLE32(RIFF + 4, (uint32_t)(total_size + KSIZE_OFFSET)); + PutLE32(RIFF + 16, (uint32_t)total_size); + if (!pic->writer(RIFF, sizeof(RIFF), pic)) { + return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_WRITE); + } - bits = 0 // keyframe (1b) - | (profile << 1) // profile (3b) - | (1 << 4) // visible (1b) - | (size0 << 5); // partition length (19b) + bits = 0 // keyframe (1b) + | (profile << 1) // profile (3b) + | (1 << 4) // visible (1b) + | ((uint32_t)size0 << 5); // partition length (19b) buf[0] = bits & 0xff; buf[1] = (bits >> 8) & 0xff; buf[2] = (bits >> 16) & 0xff; @@ -138,13 +143,13 @@ static void PutQuant(VP8BitWriter* const bw, // Partition sizes static int EmitPartitionsSize(const VP8Encoder* const enc, - const WebPPicture* const pic) { + WebPPicture* const pic) { uint8_t buf[3 * (MAX_NUM_PARTITIONS - 1)]; int p; for (p = 0; p < enc->num_parts_ - 1; ++p) { const size_t part_size = VP8BitWriterSize(enc->parts_ + p); if (part_size >= MAX_PARTITION_SIZE) { - return 0; // partition is too big to fit + return WebPEncodingSetError(pic, VP8_ENC_ERROR_PARTITION_OVERFLOW); } buf[3 * p + 0] = (part_size >> 0) & 0xff; buf[3 * p + 1] = (part_size >> 8) & 0xff; @@ -153,16 +158,69 @@ static int EmitPartitionsSize(const VP8Encoder* const enc, return p ? 
pic->writer(buf, 3 * p, pic) : 1; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ + +#ifdef WEBP_EXPERIMENTAL_FEATURES + +#define KTRAILER_SIZE 8 + +static void PutLE24(uint8_t* buf, size_t value) { + buf[0] = (value >> 0) & 0xff; + buf[1] = (value >> 8) & 0xff; + buf[2] = (value >> 16) & 0xff; +} + +static int WriteExtensions(VP8Encoder* const enc) { + uint8_t buffer[KTRAILER_SIZE]; + VP8BitWriter* const bw = &enc->bw_; + WebPPicture* const pic = enc->pic_; + + // Layer (bytes 0..3) + PutLE24(buffer + 0, enc->layer_data_size_); + buffer[3] = enc->pic_->colorspace & WEBP_CSP_UV_MASK; + if (enc->layer_data_size_ > 0) { + assert(enc->use_layer_); + // append layer data to last partition + if (!VP8BitWriterAppend(&enc->parts_[enc->num_parts_ - 1], + enc->layer_data_, enc->layer_data_size_)) { + return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY); + } + } + // Alpha (bytes 4..6) + PutLE24(buffer + 4, enc->alpha_data_size_); + if (enc->alpha_data_size_ > 0) { + assert(enc->has_alpha_); + if (!VP8BitWriterAppend(bw, enc->alpha_data_, enc->alpha_data_size_)) { + return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY); + } + } + + buffer[KTRAILER_SIZE - 1] = 0x01; // marker + if (!VP8BitWriterAppend(bw, buffer, KTRAILER_SIZE)) { + return WebPEncodingSetError(pic, VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY); + } + return 1; +} + +#endif /* WEBP_EXPERIMENTAL_FEATURES */ + +//------------------------------------------------------------------------------ static size_t GeneratePartition0(VP8Encoder* const enc) { VP8BitWriter* const bw = &enc->bw_; const int mb_size = enc->mb_w_ * enc->mb_h_; uint64_t pos1, pos2, pos3; +#ifdef WEBP_EXPERIMENTAL_FEATURES + const int need_extensions = enc->has_alpha_ || enc->use_layer_; +#endif pos1 = VP8BitWriterPos(bw); VP8BitWriterInit(bw, mb_size * 7 / 8); // ~7 bits per macroblock +#ifdef 
WEBP_EXPERIMENTAL_FEATURES + VP8PutBitUniform(bw, need_extensions); // extensions +#else VP8PutBitUniform(bw, 0); // colorspace +#endif VP8PutBitUniform(bw, 0); // clamp type PutSegmentHeader(bw, enc); @@ -174,11 +232,20 @@ static size_t GeneratePartition0(VP8Encoder* const enc) { pos2 = VP8BitWriterPos(bw); VP8CodeIntraModes(enc); VP8BitWriterFinish(bw); + +#ifdef WEBP_EXPERIMENTAL_FEATURES + if (need_extensions && !WriteExtensions(enc)) { + return 0; + } +#endif + pos3 = VP8BitWriterPos(bw); if (enc->pic_->stats) { enc->pic_->stats->header_bytes[0] = (int)((pos2 - pos1 + 7) >> 3); enc->pic_->stats->header_bytes[1] = (int)((pos3 - pos2 + 7) >> 3); + enc->pic_->stats->alpha_data_size = (int)enc->alpha_data_size_; + enc->pic_->stats->layer_data_size = (int)enc->layer_data_size_; } return !bw->error_; } @@ -191,7 +258,7 @@ int VP8EncWrite(VP8Encoder* const enc) { int p; // Partition #0 with header and partition sizes - ok = GeneratePartition0(enc); + ok = !!GeneratePartition0(enc); // Compute total size (for the RIFF header) coded_size = KHEADER_SIZE + VP8BitWriterSize(bw) + 3 * (enc->num_parts_ - 1); @@ -226,11 +293,11 @@ int VP8EncWrite(VP8Encoder* const enc) { ok = pic->writer(pad_byte, 1, pic); } - enc->coded_size_ = coded_size + KRIFF_SIZE; + enc->coded_size_ = (int)coded_size + KRIFF_SIZE; return ok; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/enc/tree.c b/third_party/libwebp/enc/tree.c index b1a9aa4..60f6343 100644 --- a/third_party/libwebp/enc/tree.c +++ b/third_party/libwebp/enc/tree.c @@ -15,7 +15,7 @@ extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Default probabilities // Paragraph 13.5 @@ -343,7 +343,7 @@ 
void VP8CodeIntraModes(VP8Encoder* const enc) { } while (VP8IteratorNext(&it, 0)); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Paragraph 13 const uint8_t diff --git a/third_party/libwebp/enc/vp8enci.h b/third_party/libwebp/enc/vp8enci.h index b19450d..357523a 100644 --- a/third_party/libwebp/enc/vp8enci.h +++ b/third_party/libwebp/enc/vp8enci.h @@ -13,20 +13,24 @@ #define WEBP_ENC_VP8ENCI_H_ #include "string.h" // for memcpy() -#include "webp/encode.h" -#include "bit_writer.h" +#include "../webp/encode.h" +#include "../dsp/dsp.h" +#include "../utils/bit_writer.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Various defines and enums // version numbers #define ENC_MAJ_VERSION 0 #define ENC_MIN_VERSION 1 -#define ENC_REV_VERSION 2 +#define ENC_REV_VERSION 3 + +// size of histogram used by CollectHistogram. +#define MAX_COEFF_THRESH 64 // intra prediction modes enum { B_DC_PRED = 0, // 4x4 modes @@ -158,7 +162,7 @@ static inline int QUANTDIV(int n, int iQ, int B) { } extern const uint8_t VP8Zigzag[16]; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Headers typedef uint8_t ProbaArray[NUM_CTX][NUM_PROBAS]; @@ -184,7 +188,7 @@ typedef struct { StatsArray stats_[NUM_TYPES][NUM_BANDS]; // 7.4k CostArray level_cost_[NUM_TYPES][NUM_BANDS]; // 11.4k int use_skip_proba_; // Note: we always use skip_proba for now. - int nb_skip_, nb_i4_, nb_i16_; // block type counters + int nb_skip_; // number of skipped blocks } VP8Proba; // Filter parameters. 
Not actually used in the code (we don't perform @@ -196,19 +200,19 @@ typedef struct { int i4x4_lf_delta_; // delta filter level for i4x4 relative to i16x16 } VP8FilterHeader; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Informations about the macroblocks. typedef struct { // block type - uint8_t type_:2; // 0=i4x4, 1=i16x16 - uint8_t uv_mode_:2; - uint8_t skip_:1; - uint8_t segment_:2; + unsigned int type_:2; // 0=i4x4, 1=i16x16 + unsigned int uv_mode_:2; + unsigned int skip_:1; + unsigned int segment_:2; uint8_t alpha_; // quantization-susceptibility } VP8MBInfo; -typedef struct { +typedef struct VP8Matrix { uint16_t q_[16]; // quantizer steps uint16_t iq_[16]; // reciprocals, fixed point. uint16_t bias_[16]; // rounding bias @@ -258,7 +262,7 @@ typedef struct { uint8_t* preds_; // intra mode predictors (4x4 blocks) uint32_t* nz_; // non-zero pattern uint8_t i4_boundary_[37]; // 32+5 boundary samples needed by intra4x4 - uint8_t* i4_top_; // pointer to the current *top boundary sample + uint8_t* i4_top_; // pointer to the current top boundary sample int i4_; // current intra4x4 mode being tested int top_nz_[9]; // top-non-zero context. int left_nz_[9]; // left-non-zero. left_nz[8] is independent. 
@@ -302,7 +306,7 @@ void VP8SetSkip(const VP8EncIterator* const it, int skip); void VP8SetSegment(const VP8EncIterator* const it, int segment); void VP8IteratorResetCosts(VP8EncIterator* const it); -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // VP8Encoder struct VP8Encoder { @@ -326,6 +330,17 @@ struct VP8Encoder { VP8BitWriter bw_; // part0 VP8BitWriter parts_[MAX_NUM_PARTITIONS]; // token partitions + // transparency blob + int has_alpha_; + uint8_t* alpha_data_; // non-NULL if transparency is present + size_t alpha_data_size_; + + // enhancement layer + int use_layer_; + VP8BitWriter layer_bw_; + uint8_t* layer_data_; + size_t layer_data_size_; + // quantization info (one set of DC/AC dequant factor per segment) VP8SegmentInfo dqm_[NUM_MB_SEGMENTS]; int base_quant_; // nominal quantizer value. Only used @@ -345,8 +360,9 @@ struct VP8Encoder { int block_count_[3]; // quality/speed settings - int method_; // 0=fastest, 6=best/slowest. - int rd_opt_level_; // Deduced from method_. + int method_; // 0=fastest, 6=best/slowest. + int rd_opt_level_; // Deduced from method_. + int max_i4_header_bits_; // partition #0 safeness factor // Memory VP8MBInfo* mb_info_; // contextual macroblock infos (mb_w_ + 1) @@ -366,7 +382,7 @@ struct VP8Encoder { LFStats *lf_stats_; // autofilter stats (if NULL, autofilter is off) }; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // internal functions. Not public. // in tree.c @@ -403,6 +419,10 @@ int VP8GetCostUV(VP8EncIterator* const it, const VP8ModeScore* const rd); int VP8EncLoop(VP8Encoder* const enc); int VP8StatLoop(VP8Encoder* const enc); + // in webpenc.c +// Assign an error code to a picture. Return false for convenience. 
+int WebPEncodingSetError(WebPPicture* const pic, WebPEncodingError error); + // in analysis.c // Main analysis loop. Decides the segmentations and complexity. // Assigns a first guess for Intra16 and uvmode_ prediction modes. @@ -414,58 +434,27 @@ void VP8SetSegmentParams(VP8Encoder* const enc, float quality); // Pick best modes and fills the levels. Returns true if skipped. int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd, int rd_opt); - // in dsp.c -// Transforms -typedef void (*VP8Idct)(const uint8_t* ref, const int16_t* in, uint8_t* dst); -typedef void (*VP8Fdct)(const uint8_t* src, const uint8_t* ref, int16_t* out); -typedef void (*VP8WHT)(const int16_t* in, int16_t* out); -extern VP8Idct VP8ITransform; -extern VP8Fdct VP8FTransform; -extern VP8WHT VP8ITransformWHT; -extern VP8WHT VP8FTransformWHT; -// Predictions -// *dst is the destination block. *top, *top_right and *left can be NULL. -typedef void (*VP8IntraPreds)(uint8_t *dst, const uint8_t* left, - const uint8_t* top); -typedef void (*VP8Intra4Preds)(uint8_t *dst, const uint8_t* top); -extern VP8Intra4Preds VP8EncPredLuma4; -extern VP8IntraPreds VP8EncPredLuma16; -extern VP8IntraPreds VP8EncPredChroma8; - -typedef int (*VP8Metric)(const uint8_t* pix, const uint8_t* ref); -extern VP8Metric VP8SSE16x16, VP8SSE16x8, VP8SSE8x8, VP8SSE4x4; -typedef int (*VP8WMetric)(const uint8_t* pix, const uint8_t* ref, - const uint16_t* const weights); -extern VP8WMetric VP8TDisto4x4, VP8TDisto16x16; - -typedef void (*VP8BlockCopy)(const uint8_t* src, uint8_t* dst); -extern VP8BlockCopy VP8Copy4x4; -extern VP8BlockCopy VP8Copy8x8; -extern VP8BlockCopy VP8Copy16x16; -// Quantization -typedef int (*VP8QuantizeBlock)(int16_t in[16], int16_t out[16], - int n, const VP8Matrix* const mtx); -extern VP8QuantizeBlock VP8EncQuantizeBlock; - -typedef enum { - kSSE2, - kSSE3 -} CPUFeature; -// returns true if the CPU supports the feature. 
-typedef int (*VP8CPUInfo)(CPUFeature feature); -extern VP8CPUInfo CPUInfo; - -void VP8EncDspInit(void); // must be called before using any of the above + // in alpha.c +void VP8EncInitAlpha(VP8Encoder* enc); // initialize alpha compression +void VP8EncCodeAlphaBlock(VP8EncIterator* it); // analyze or code a macroblock +int VP8EncFinishAlpha(VP8Encoder* enc); // finalize compressed data +void VP8EncDeleteAlpha(VP8Encoder* enc); // delete compressed data + + // in layer.c +void VP8EncInitLayer(VP8Encoder* const enc); // init everything +void VP8EncCodeLayerBlock(VP8EncIterator* it); // code one more macroblock +int VP8EncFinishLayer(VP8Encoder* const enc); // finalize coding +void VP8EncDeleteLayer(VP8Encoder* enc); // reclaim memory // in filter.c extern void VP8InitFilter(VP8EncIterator* const it); extern void VP8StoreFilterStats(VP8EncIterator* const it); extern void VP8AdjustFilterStrength(VP8EncIterator* const it); -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" #endif -#endif // WEBP_ENC_VP8ENCI_H_ +#endif /* WEBP_ENC_VP8ENCI_H_ */ diff --git a/third_party/libwebp/enc/webpenc.c b/third_party/libwebp/enc/webpenc.c index 59221d7..00c7d61 100644 --- a/third_party/libwebp/enc/webpenc.c +++ b/third_party/libwebp/enc/webpenc.c @@ -9,6 +9,7 @@ // // Author: Skal (pascal.massimino@gmail.com) +#include <assert.h> #include <stdlib.h> #include <string.h> #include <math.h> @@ -25,17 +26,15 @@ extern "C" { #include <stdio.h> #endif -#define MAX_DIMENSION 16384 // maximum width/height allowed by the spec - -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ int WebPGetEncoderVersion(void) { return (ENC_MAJ_VERSION << 16) | (ENC_MIN_VERSION << 8) | ENC_REV_VERSION; } 
-//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // WebPPicture -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ static int DummyWriter(const uint8_t* data, size_t data_size, const WebPPicture* const picture) { @@ -53,13 +52,14 @@ int WebPPictureInitInternal(WebPPicture* const picture, int version) { if (picture) { memset(picture, 0, sizeof(*picture)); picture->writer = DummyWriter; + WebPEncodingSetError(picture, VP8_ENC_OK); } return 1; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // VP8Encoder -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ static void ResetSegmentHeader(VP8Encoder* const enc) { VP8SegmentHeader* const hdr = &enc->segment_hdr_; @@ -110,11 +110,15 @@ static void ResetBoundaryPredictions(VP8Encoder* const enc) { static void MapConfigToTools(VP8Encoder* const enc) { const int method = enc->config_->method; + const int limit = 100 - enc->config_->partition_limit; enc->method_ = method; enc->rd_opt_level_ = (method >= 6) ? 3 : (method >= 5) ? 2 : (method >= 3) ? 1 : 0; + enc->max_i4_header_bits_ = + 256 * 16 * 16 * // upper bound: up to 16bit per 4x4 block + (limit * limit) / (100 * 100); // ... modulated with a quadratic curve. } // Memory scaling with dimensions: @@ -155,7 +159,8 @@ static VP8Encoder* InitEncoder(const WebPConfig* const config, 16 + 16 + 16 + 8 + 1 + // left y/u/v 2 * ALIGN_CST) // align all * sizeof(uint8_t); - const size_t lf_stats_size = config->autofilter ? sizeof(LFStats) : 0; + const size_t lf_stats_size = + config->autofilter ? 
sizeof(LFStats) + ALIGN_CST : 0; VP8Encoder* enc; uint8_t* mem; size_t size = sizeof(VP8Encoder) + ALIGN_CST // main struct @@ -193,7 +198,10 @@ static VP8Encoder* InitEncoder(const WebPConfig* const config, printf("===================================\n"); #endif mem = (uint8_t*)malloc(size); - if (mem == NULL) return NULL; + if (mem == NULL) { + WebPEncodingSetError(picture, VP8_ENC_ERROR_OUT_OF_MEMORY); + return NULL; + } enc = (VP8Encoder*)mem; mem = (uint8_t*)DO_ALIGN(mem + sizeof(*enc)); memset(enc, 0, sizeof(*enc)); @@ -215,7 +223,7 @@ static VP8Encoder* InitEncoder(const WebPConfig* const config, mem += preds_w * preds_h * sizeof(uint8_t); enc->nz_ = 1 + (uint32_t*)mem; mem += nz_size; - enc->lf_stats_ = lf_stats_size ? (LFStats*)mem : NULL; + enc->lf_stats_ = lf_stats_size ? (LFStats*)DO_ALIGN(mem) : NULL; mem += lf_stats_size; // top samples (all 16-aligned) @@ -242,14 +250,25 @@ static VP8Encoder* InitEncoder(const WebPConfig* const config, ResetFilterHeader(enc); ResetBoundaryPredictions(enc); +#ifdef WEBP_EXPERIMENTAL_FEATURES + VP8EncInitAlpha(enc); + VP8EncInitLayer(enc); +#endif + return enc; } static void DeleteEncoder(VP8Encoder* enc) { - free(enc); + if (enc) { +#ifdef WEBP_EXPERIMENTAL_FEATURES + VP8EncDeleteAlpha(enc); + VP8EncDeleteLayer(enc); +#endif + free(enc); + } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ static double GetPSNR(uint64_t err, uint64_t size) { return err ? 10. * log10(255. * 255. 
* size / err) : 99.; @@ -284,31 +303,46 @@ static void StoreStats(VP8Encoder* const enc) { } } -//----------------------------------------------------------------------------- +int WebPEncodingSetError(WebPPicture* const pic, WebPEncodingError error) { + assert((int)error <= VP8_ENC_ERROR_BAD_WRITE); + assert((int)error >= VP8_ENC_OK); + pic->error_code = error; + return 0; +} + +//------------------------------------------------------------------------------ int WebPEncode(const WebPConfig* const config, WebPPicture* const pic) { VP8Encoder* enc; int ok; - if (config == NULL || pic == NULL) - return 0; // bad params + if (pic == NULL) + return 0; + WebPEncodingSetError(pic, VP8_ENC_OK); // all ok so far + if (config == NULL) // bad params + return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER); if (!WebPValidateConfig(config)) - return 0; // invalid config. + return WebPEncodingSetError(pic, VP8_ENC_ERROR_INVALID_CONFIGURATION); if (pic->width <= 0 || pic->height <= 0) - return 0; // invalid parameters + return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION); if (pic->y == NULL || pic->u == NULL || pic->v == NULL) - return 0; // invalid parameters - if (pic->width >= MAX_DIMENSION || pic->height >= MAX_DIMENSION) - return 0; // image is too big + return WebPEncodingSetError(pic, VP8_ENC_ERROR_NULL_PARAMETER); + if (pic->width > WEBP_MAX_DIMENSION || pic->height > WEBP_MAX_DIMENSION) + return WebPEncodingSetError(pic, VP8_ENC_ERROR_BAD_DIMENSION); enc = InitEncoder(config, pic); - if (enc == NULL) return 0; + if (enc == NULL) return 0; // pic->error is already set. 
ok = VP8EncAnalyze(enc) && VP8StatLoop(enc) && VP8EncLoop(enc) +#ifdef WEBP_EXPERIMENTAL_FEATURES + && VP8EncFinishAlpha(enc) + && VP8EncFinishLayer(enc) +#endif && VP8EncWrite(enc); StoreStats(enc); DeleteEncoder(enc); + return ok; } diff --git a/third_party/libwebp/libwebp.gyp b/third_party/libwebp/libwebp.gyp index 6df7a08..576fa9c 100644 --- a/third_party/libwebp/libwebp.gyp +++ b/third_party/libwebp/libwebp.gyp @@ -1,4 +1,4 @@ -# Copyright (c) 2010 The Chromium Authors. All rights reserved. +# Copyright (c) 2012 The Chromium Authors. All rights reserved. # Use of this source code is governed by a BSD-style license that can be # found in the LICENSE file. @@ -14,19 +14,19 @@ 'type': 'static_library', 'include_dirs': ['.'], 'sources': [ + 'enc/alpha.c', 'enc/analysis.c', - 'enc/bit_writer.c', 'enc/config.c', 'enc/cost.c', - 'enc/dsp.c', 'enc/filter.c', 'enc/frame.c', 'enc/iterator.c', + 'enc/layer.c', 'enc/picture.c', 'enc/quant.c', 'enc/syntax.c', 'enc/tree.c', - 'enc/webpenc.c' + 'enc/webpenc.c', ], }, { @@ -34,15 +34,42 @@ 'type': 'static_library', 'include_dirs': ['.'], 'sources': [ - 'dec/bits.c', - 'dec/dsp.c', + 'dec/alpha.c', + 'dec/buffer.c', 'dec/frame.c', 'dec/idec.c', + 'dec/io.c', + 'dec/layer.c', 'dec/quant.c', 'dec/tree.c', 'dec/vp8.c', 'dec/webp.c', - 'dec/yuv.c', + ], + }, + { + 'target_name': 'libwebp_dsp', + 'type': 'static_library', + 'include_dirs': ['.'], + 'sources': [ + 'dsp/cpu.c', + 'dsp/dec.c', + 'dsp/dec_neon.c', + 'dsp/dec_sse2.c', + 'dsp/enc.c', + 'dsp/enc_sse2.c', + 'dsp/upsampling.c', + 'dsp/upsampling_sse2.c', + 'dsp/yuv.c', + ], + }, + { + 'target_name': 'libwebp_utils', + 'type': 'static_library', + 'include_dirs': ['.'], + 'sources': [ + 'utils/bit_reader.c', + 'utils/bit_writer.c', + 'utils/thread.c', ], }, { @@ -51,6 +78,8 @@ 'dependencies' : [ 'libwebp_enc', 'libwebp_dec', + 'libwebp_dsp', + 'libwebp_utils', ], 'direct_dependent_settings': { 'include_dirs': ['.'], diff --git a/third_party/libwebp/dec/bits.c 
b/third_party/libwebp/utils/bit_reader.c index da3b777..c37efa7 100644 --- a/third_party/libwebp/dec/bits.c +++ b/third_party/libwebp/utils/bit_reader.c @@ -9,13 +9,13 @@ // // Author: Skal (pascal.massimino@gmail.com) -#include "bits.h" +#include "./bit_reader.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // VP8BitReader void VP8InitBitReader(VP8BitReader* const br, @@ -56,7 +56,7 @@ const uint8_t kVP8NewRange[128] = { 241, 243, 245, 247, 249, 251, 253, 127 }; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Higher-level calls uint32_t VP8GetValue(VP8BitReader* const br, int bits) { @@ -72,7 +72,7 @@ int32_t VP8GetSignedValue(VP8BitReader* const br, int bits) { return VP8Get(br) ? -value : value; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/dec/bits.h b/third_party/libwebp/utils/bit_reader.h index 82e4c3a..d65a74e 100644 --- a/third_party/libwebp/dec/bits.h +++ b/third_party/libwebp/utils/bit_reader.h @@ -9,20 +9,21 @@ // // Author: Skal (pascal.massimino@gmail.com) -#ifndef WEBP_DEC_BITS_H_ -#define WEBP_DEC_BITS_H_ +#ifndef WEBP_UTILS_BIT_READER_H_ +#define WEBP_UTILS_BIT_READER_H_ #include <assert.h> -#include "webp/decode_vp8.h" +#include "../webp/decode_vp8.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Bitreader and code-tree reader -typedef struct { +typedef 
struct VP8BitReader VP8BitReader; +struct VP8BitReader { const uint8_t* buf_; // next byte to be read const uint8_t* buf_end_; // end of read buffer int eof_; // true if input is exhausted @@ -31,7 +32,7 @@ typedef struct { uint32_t range_; // current range minus 1. In [127, 254] interval. uint32_t value_; // current value int missing_; // number of missing bits in value_ (8bit) -} VP8BitReader; +}; // Initialize the bit reader and the boolean decoder. void VP8InitBitReader(VP8BitReader* const br, @@ -61,15 +62,16 @@ static inline uint32_t VP8GetByte(VP8BitReader* const br) { static inline uint32_t VP8BitUpdate(VP8BitReader* const br, uint32_t split) { uint32_t bit; + const uint32_t value_split = (split + 1) << 8; // Make sure we have a least 8 bits in 'value_' if (br->missing_ > 0) { br->value_ |= VP8GetByte(br) << br->missing_; br->missing_ -= 8; } - bit = ((br->value_ >> 8) > split); + bit = (br->value_ >= value_split); if (bit) { br->range_ -= split + 1; - br->value_ -= (split + 1) << 8; + br->value_ -= value_split; } else { br->range_ = split; } @@ -104,4 +106,4 @@ static inline int VP8GetSigned(VP8BitReader* const br, int v) { } // extern "C" #endif -#endif // WEBP_DEC_BITS_H_ +#endif /* WEBP_UTILS_BIT_READER_H_ */ diff --git a/third_party/libwebp/enc/bit_writer.c b/third_party/libwebp/utils/bit_writer.c index 3656a7e..9ed8275 100644 --- a/third_party/libwebp/enc/bit_writer.c +++ b/third_party/libwebp/utils/bit_writer.c @@ -10,14 +10,15 @@ // Author: Skal (pascal.massimino@gmail.com) #include <assert.h> +#include <string.h> // for memcpy() #include <stdlib.h> -#include "vp8enci.h" +#include "./bit_writer.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // VP8BitWriter static int BitWriterResize(VP8BitWriter* const bw, size_t extra_size) { @@ -68,7 +69,7 @@ static void 
kFlush(VP8BitWriter* const bw) { } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // renormalization static const uint8_t kNorm[128] = { // renorm_sizes[i] = 8 - log2(i) @@ -84,7 +85,7 @@ static const uint8_t kNorm[128] = { // renorm_sizes[i] = 8 - log2(i) }; // range = ((range + 1) << kVP8Log2Range[range]) - 1 -const uint8_t kNewRange[128] = { +static const uint8_t kNewRange[128] = { 127, 127, 191, 127, 159, 191, 223, 127, 143, 159, 175, 191, 207, 223, 239, 127, 135, 143, 151, 159, 167, 175, 183, 191, 199, 207, 215, 223, 231, 239, 247, 127, 131, 135, 139, 143, 147, 151, 155, 159, 163, 167, 171, 175, 179, @@ -147,7 +148,7 @@ void VP8PutSignedValue(VP8BitWriter* const bw, int value, int nb_bits) { } } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ int VP8BitWriterInit(VP8BitWriter* const bw, size_t expected_size) { bw->range_ = 255 - 1; @@ -168,7 +169,17 @@ uint8_t* VP8BitWriterFinish(VP8BitWriter* const bw) { return bw->buf_; } -//----------------------------------------------------------------------------- +int VP8BitWriterAppend(VP8BitWriter* const bw, + const uint8_t* data, size_t size) { + assert(data); + if (bw->nb_bits_ != -8) return 0; // kFlush() must have been called + if (!BitWriterResize(bw, size)) return 0; + memcpy(bw->buf_ + bw->pos_, data, size); + bw->pos_ += size; + return 1; +} + +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/enc/bit_writer.h b/third_party/libwebp/utils/bit_writer.h index 3773c9c..1368a34 100644 --- a/third_party/libwebp/enc/bit_writer.h +++ b/third_party/libwebp/utils/bit_writer.h @@ -9,16 +9,16 @@ // // Author: Skal (pascal.massimino@gmail.com) -#ifndef 
WEBP_ENC_BIT_WRITER_H_ -#define WEBP_ENC_BIT_WRITER_H_ +#ifndef WEBP_UTILS_BIT_WRITER_H_ +#define WEBP_UTILS_BIT_WRITER_H_ -#include "vp8enci.h" +#include "../webp/types.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Bit-writing typedef struct VP8BitWriter VP8BitWriter; @@ -39,6 +39,8 @@ int VP8PutBit(VP8BitWriter* const bw, int bit, int prob); int VP8PutBitUniform(VP8BitWriter* const bw, int bit); void VP8PutValue(VP8BitWriter* const bw, int value, int nb_bits); void VP8PutSignedValue(VP8BitWriter* const bw, int value, int nb_bits); +int VP8BitWriterAppend(VP8BitWriter* const bw, + const uint8_t* data, size_t size); // return approximate write position (in bits) static inline uint64_t VP8BitWriterPos(const VP8BitWriter* const bw) { @@ -52,10 +54,10 @@ static inline size_t VP8BitWriterSize(const VP8BitWriter* const bw) { return bw->pos_; } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" #endif -#endif // WEBP_ENC_BIT_WRITER_H_ +#endif /* WEBP_UTILS_BIT_WRITER_H_ */ diff --git a/third_party/libwebp/utils/thread.c b/third_party/libwebp/utils/thread.c new file mode 100644 index 0000000..c8c5e9d --- /dev/null +++ b/third_party/libwebp/utils/thread.c @@ -0,0 +1,243 @@ +// Copyright 2011 Google Inc. 
+// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Multi-threaded worker +// +// Author: skal@google.com (Pascal Massimino) + +#include <assert.h> +#include <string.h> // for memset() +#include "./thread.h" + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#ifdef WEBP_USE_THREAD + +#if defined(_WIN32) + +//------------------------------------------------------------------------------ +// simplistic pthread emulation layer + +#include <process.h> + +// _beginthreadex requires __stdcall +#define THREADFN unsigned int __stdcall +#define THREAD_RETURN(val) (unsigned int)((DWORD_PTR)val) + +static int pthread_create(pthread_t* const thread, const void* attr, + unsigned int (__stdcall *start)(void*), void* arg) { + (void)attr; + *thread = (pthread_t)_beginthreadex(NULL, /* void *security */ + 0, /* unsigned stack_size */ + start, + arg, + 0, /* unsigned initflag */ + NULL); /* unsigned *thrdaddr */ + if (*thread == NULL) return 1; + SetThreadPriority(*thread, THREAD_PRIORITY_ABOVE_NORMAL); + return 0; +} + +static int pthread_join(pthread_t thread, void** value_ptr) { + (void)value_ptr; + return (WaitForSingleObject(thread, INFINITE) != WAIT_OBJECT_0 || + CloseHandle(thread) == 0); +} + +// Mutex +static int pthread_mutex_init(pthread_mutex_t* const mutex, void* mutexattr) { + (void)mutexattr; + InitializeCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_lock(pthread_mutex_t* const mutex) { + EnterCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_unlock(pthread_mutex_t* const mutex) { + LeaveCriticalSection(mutex); + return 0; +} + +static int pthread_mutex_destroy(pthread_mutex_t* const mutex) { + DeleteCriticalSection(mutex); + return 0; +} + +// Condition 
+static int pthread_cond_destroy(pthread_cond_t* const condition) { + int ok = 1; + ok &= (CloseHandle(condition->waiting_sem_) != 0); + ok &= (CloseHandle(condition->received_sem_) != 0); + ok &= (CloseHandle(condition->signal_event_) != 0); + return !ok; +} + +static int pthread_cond_init(pthread_cond_t* const condition, void* cond_attr) { + (void)cond_attr; + condition->waiting_sem_ = CreateSemaphore(NULL, 0, 1, NULL); + condition->received_sem_ = CreateSemaphore(NULL, 0, 1, NULL); + condition->signal_event_ = CreateEvent(NULL, FALSE, FALSE, NULL); + if (condition->waiting_sem_ == NULL || + condition->received_sem_ == NULL || + condition->signal_event_ == NULL) { + pthread_cond_destroy(condition); + return 1; + } + return 0; +} + +static int pthread_cond_signal(pthread_cond_t* const condition) { + int ok = 1; + if (WaitForSingleObject(condition->waiting_sem_, 0) == WAIT_OBJECT_0) { + // a thread is waiting in pthread_cond_wait: allow it to be notified + ok = SetEvent(condition->signal_event_); + // wait until the event is consumed so the signaler cannot consume + // the event via its own pthread_cond_wait. 
+ ok &= (WaitForSingleObject(condition->received_sem_, INFINITE) != + WAIT_OBJECT_0); + } + return !ok; +} + +static int pthread_cond_wait(pthread_cond_t* const condition, + pthread_mutex_t* const mutex) { + int ok; + // note that there is a consumer available so the signal isn't dropped in + // pthread_cond_signal + if (!ReleaseSemaphore(condition->waiting_sem_, 1, NULL)) + return 1; + // now unlock the mutex so pthread_cond_signal may be issued + pthread_mutex_unlock(mutex); + ok = (WaitForSingleObject(condition->signal_event_, INFINITE) == + WAIT_OBJECT_0); + ok &= ReleaseSemaphore(condition->received_sem_, 1, NULL); + pthread_mutex_lock(mutex); + return !ok; +} + +#else // _WIN32 +# define THREADFN void* +# define THREAD_RETURN(val) val +#endif + +//------------------------------------------------------------------------------ + +static THREADFN WebPWorkerThreadLoop(void *ptr) { // thread loop + WebPWorker* const worker = (WebPWorker*)ptr; + int done = 0; + while (!done) { + pthread_mutex_lock(&worker->mutex_); + while (worker->status_ == OK) { // wait in idling mode + pthread_cond_wait(&worker->condition_, &worker->mutex_); + } + if (worker->status_ == WORK) { + if (worker->hook) { + worker->had_error |= !worker->hook(worker->data1, worker->data2); + } + worker->status_ = OK; + } else if (worker->status_ == NOT_OK) { // finish the worker + done = 1; + } + // signal to the main thread that we're done (for Sync()) + pthread_cond_signal(&worker->condition_); + pthread_mutex_unlock(&worker->mutex_); + } + return THREAD_RETURN(NULL); // Thread is finished +} + +// main thread state control +static void WebPWorkerChangeState(WebPWorker* const worker, + WebPWorkerStatus new_status) { + // no-op when attempting to change state on a thread that didn't come up + if (worker->status_ < OK) return; + + pthread_mutex_lock(&worker->mutex_); + // wait for the worker to finish + while (worker->status_ != OK) { + pthread_cond_wait(&worker->condition_, &worker->mutex_); + } + // 
assign new status and release the working thread if needed + if (new_status != OK) { + worker->status_ = new_status; + pthread_cond_signal(&worker->condition_); + } + pthread_mutex_unlock(&worker->mutex_); +} + +#endif + +//------------------------------------------------------------------------------ + +void WebPWorkerInit(WebPWorker* const worker) { + memset(worker, 0, sizeof(*worker)); + worker->status_ = NOT_OK; +} + +int WebPWorkerSync(WebPWorker* const worker) { +#ifdef WEBP_USE_THREAD + WebPWorkerChangeState(worker, OK); +#endif + assert(worker->status_ <= OK); + return !worker->had_error; +} + +int WebPWorkerReset(WebPWorker* const worker) { + int ok = 1; + worker->had_error = 0; + if (worker->status_ < OK) { +#ifdef WEBP_USE_THREAD + if (pthread_mutex_init(&worker->mutex_, NULL) || + pthread_cond_init(&worker->condition_, NULL)) { + return 0; + } + pthread_mutex_lock(&worker->mutex_); + ok = !pthread_create(&worker->thread_, NULL, WebPWorkerThreadLoop, worker); + if (ok) worker->status_ = OK; + pthread_mutex_unlock(&worker->mutex_); +#else + worker->status_ = OK; +#endif + } else if (worker->status_ > OK) { + ok = WebPWorkerSync(worker); + } + assert(!ok || (worker->status_ == OK)); + return ok; +} + +void WebPWorkerLaunch(WebPWorker* const worker) { +#ifdef WEBP_USE_THREAD + WebPWorkerChangeState(worker, WORK); +#else + if (worker->hook) + worker->had_error |= !worker->hook(worker->data1, worker->data2); +#endif +} + +void WebPWorkerEnd(WebPWorker* const worker) { + if (worker->status_ >= OK) { +#ifdef WEBP_USE_THREAD + WebPWorkerChangeState(worker, NOT_OK); + pthread_join(worker->thread_, NULL); + pthread_mutex_destroy(&worker->mutex_); + pthread_cond_destroy(&worker->condition_); +#else + worker->status_ = NOT_OK; +#endif + } + assert(worker->status_ == NOT_OK); +} + +//------------------------------------------------------------------------------ + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif diff --git 
a/third_party/libwebp/utils/thread.h b/third_party/libwebp/utils/thread.h new file mode 100644 index 0000000..2c40f72 --- /dev/null +++ b/third_party/libwebp/utils/thread.h @@ -0,0 +1,86 @@ +// Copyright 2011 Google Inc. +// +// This code is licensed under the same terms as WebM: +// Software License Agreement: http://www.webmproject.org/license/software/ +// Additional IP Rights Grant: http://www.webmproject.org/license/additional/ +// ----------------------------------------------------------------------------- +// +// Multi-threaded worker +// +// Author: skal@google.com (Pascal Massimino) + +#ifndef WEBP_UTILS_THREAD_H_ +#define WEBP_UTILS_THREAD_H_ + +#if defined(__cplusplus) || defined(c_plusplus) +extern "C" { +#endif + +#if WEBP_USE_THREAD + +#if defined(_WIN32) + +#include <windows.h> +typedef HANDLE pthread_t; +typedef CRITICAL_SECTION pthread_mutex_t; +typedef struct { + HANDLE waiting_sem_; + HANDLE received_sem_; + HANDLE signal_event_; +} pthread_cond_t; + +#else + +#include <pthread.h> + +#endif /* _WIN32 */ +#endif /* WEBP_USE_THREAD */ + +// State of the worker thread object +typedef enum { + NOT_OK = 0, // object is unusable + OK, // ready to work + WORK // busy finishing the current task +} WebPWorkerStatus; + +// Function to be called by the worker thread. Takes two opaque pointers as +// arguments (data1 and data2), and should return false in case of error. +typedef int (*WebPWorkerHook)(void*, void*); + +// Synchronize object used to launch job in the worker thread +typedef struct { +#if WEBP_USE_THREAD + pthread_mutex_t mutex_; + pthread_cond_t condition_; + pthread_t thread_; +#endif + WebPWorkerStatus status_; + WebPWorkerHook hook; // hook to call + void* data1; // first argument passed to 'hook' + void* data2; // second argument passed to 'hook' + int had_error; // return value of the last call to 'hook' +} WebPWorker; + +// Must be called first, before any other method. 
+void WebPWorkerInit(WebPWorker* const worker); +// Must be called to initialize the object and spawn the thread. Re-entrant. +// Will potentially launch the thread. Returns false in case of error. +int WebPWorkerReset(WebPWorker* const worker); +// Make sure the previous work is finished. Returns true if worker->had_error +// was not set and no error condition was triggered by the working thread. +int WebPWorkerSync(WebPWorker* const worker); +// Trigger the thread to call hook() with data1 and data2 as arguments. These +// hook/data1/data2 can be changed at any time before calling this function, +// but must not be changed afterward until the next call to WebPWorkerSync(). +void WebPWorkerLaunch(WebPWorker* const worker); +// Kill the thread and terminate the object. To use the object again, one +// must call WebPWorkerReset() again. +void WebPWorkerEnd(WebPWorker* const worker); + +//------------------------------------------------------------------------------ + +#if defined(__cplusplus) || defined(c_plusplus) +} // extern "C" +#endif + +#endif /* WEBP_UTILS_THREAD_H_ */ diff --git a/third_party/libwebp/webp/decode.h b/third_party/libwebp/webp/decode.h index 6c63d54..fe5fa4a 100644 --- a/third_party/libwebp/webp/decode.h +++ b/third_party/libwebp/webp/decode.h @@ -12,40 +12,46 @@ #ifndef WEBP_WEBP_DECODE_H_ #define WEBP_WEBP_DECODE_H_ -#include "webp/types.h" +#include "./types.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif +#define WEBP_DECODER_ABI_VERSION 0x0002 + // Return the decoder's version number, packed in hexadecimal using 8bits for // each of major/minor/revision. E.g: v2.5.7 is 0x020507. -int WebPGetDecoderVersion(void); +WEBP_EXTERN(int) WebPGetDecoderVersion(void); // Retrieve basic header information: width, height. // This function will also validate the header and return 0 in // case of formatting error. // Pointers *width/*height can be passed NULL if deemed irrelevant. 
-int WebPGetInfo(const uint8_t* data, uint32_t data_size, - int *width, int *height); +WEBP_EXTERN(int) WebPGetInfo(const uint8_t* data, uint32_t data_size, + int* width, int* height); // Decodes WEBP images pointed to by *data and returns RGB samples, along // with the dimensions in *width and *height. // The returned pointer should be deleted calling free(). // Returns NULL in case of error. -uint8_t* WebPDecodeRGB(const uint8_t* data, uint32_t data_size, - int *width, int *height); +WEBP_EXTERN(uint8_t*) WebPDecodeRGB(const uint8_t* data, uint32_t data_size, + int* width, int* height); // Same as WebPDecodeRGB, but returning RGBA data. -uint8_t* WebPDecodeRGBA(const uint8_t* data, uint32_t data_size, - int *width, int *height); +WEBP_EXTERN(uint8_t*) WebPDecodeRGBA(const uint8_t* data, uint32_t data_size, + int* width, int* height); + +// Same as WebPDecodeRGBA, but returning ARGB data. +WEBP_EXTERN(uint8_t*) WebPDecodeARGB(const uint8_t* data, uint32_t data_size, + int* width, int* height); // This variant decode to BGR instead of RGB. -uint8_t* WebPDecodeBGR(const uint8_t* data, uint32_t data_size, - int *width, int *height); +WEBP_EXTERN(uint8_t*) WebPDecodeBGR(const uint8_t* data, uint32_t data_size, + int* width, int* height); // This variant decodes to BGRA instead of RGBA. -uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size, - int *width, int *height); +WEBP_EXTERN(uint8_t*) WebPDecodeBGRA(const uint8_t* data, uint32_t data_size, + int* width, int* height); // Decode WEBP images stored in *data in Y'UV format(*). The pointer returned is // the Y samples buffer. Upon return, *u and *v will point to the U and V @@ -56,11 +62,12 @@ uint8_t* WebPDecodeBGRA(const uint8_t* data, uint32_t data_size, // have a common stride returned as '*uv_stride'. // Return NULL in case of error. // (*) Also named Y'CbCr. 
See: http://en.wikipedia.org/wiki/YCbCr -uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size, - int *width, int *height, uint8_t** u, uint8_t** v, - int *stride, int* uv_stride); +WEBP_EXTERN(uint8_t*) WebPDecodeYUV(const uint8_t* data, uint32_t data_size, + int* width, int* height, + uint8_t** u, uint8_t** v, + int* stride, int* uv_stride); -// These three functions are variants of the above ones, that decode the image +// These five functions are variants of the above ones, that decode the image // directly into a pre-allocated buffer 'output_buffer'. The maximum storage // available in this buffer is indicated by 'output_buffer_size'. If this // storage is not sufficient (or an error occurred), NULL is returned. @@ -68,19 +75,22 @@ uint8_t* WebPDecodeYUV(const uint8_t* data, uint32_t data_size, // The parameter 'output_stride' specifies the distance (in bytes) // between scanlines. Hence, output_buffer_size is expected to be at least // output_stride x picture-height. -uint8_t* WebPDecodeRGBInto(const uint8_t* data, uint32_t data_size, - uint8_t* output_buffer, int output_buffer_size, - int output_stride); -uint8_t* WebPDecodeRGBAInto(const uint8_t* data, uint32_t data_size, - uint8_t* output_buffer, int output_buffer_size, - int output_stride); +WEBP_EXTERN(uint8_t*) WebPDecodeRGBInto( + const uint8_t* data, uint32_t data_size, + uint8_t* output_buffer, int output_buffer_size, int output_stride); +WEBP_EXTERN(uint8_t*) WebPDecodeRGBAInto( + const uint8_t* data, uint32_t data_size, + uint8_t* output_buffer, int output_buffer_size, int output_stride); +WEBP_EXTERN(uint8_t*) WebPDecodeARGBInto( + const uint8_t* data, uint32_t data_size, + uint8_t* output_buffer, int output_buffer_size, int output_stride); // BGR variants -uint8_t* WebPDecodeBGRInto(const uint8_t* data, uint32_t data_size, - uint8_t* output_buffer, int output_buffer_size, - int output_stride); -uint8_t* WebPDecodeBGRAInto(const uint8_t* data, uint32_t data_size, - uint8_t* output_buffer, 
int output_buffer_size, - int output_stride); +WEBP_EXTERN(uint8_t*) WebPDecodeBGRInto( + const uint8_t* data, uint32_t data_size, + uint8_t* output_buffer, int output_buffer_size, int output_stride); +WEBP_EXTERN(uint8_t*) WebPDecodeBGRAInto( + const uint8_t* data, uint32_t data_size, + uint8_t* output_buffer, int output_buffer_size, int output_stride); // WebPDecodeYUVInto() is a variant of WebPDecodeYUV() that operates directly // into pre-allocated luma/chroma plane buffers. This function requires the @@ -89,19 +99,72 @@ uint8_t* WebPDecodeBGRAInto(const uint8_t* data, uint32_t data_size, // 'u_size' and 'v_size' respectively. // Pointer to the luma plane ('*luma') is returned or NULL if an error occurred // during decoding (or because some buffers were found to be too small). -uint8_t* WebPDecodeYUVInto(const uint8_t* data, uint32_t data_size, - uint8_t* luma, int luma_size, int luma_stride, - uint8_t* u, int u_size, int u_stride, - uint8_t* v, int v_size, int v_stride); +WEBP_EXTERN(uint8_t*) WebPDecodeYUVInto( + const uint8_t* data, uint32_t data_size, + uint8_t* luma, int luma_size, int luma_stride, + uint8_t* u, int u_size, int u_stride, + uint8_t* v, int v_size, int v_stride); -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ +// Output colorspaces and buffer -// Output colorspaces +// Colorspaces typedef enum { MODE_RGB = 0, MODE_RGBA = 1, MODE_BGR = 2, MODE_BGRA = 3, - MODE_YUV = 4 } WEBP_CSP_MODE; + MODE_ARGB = 4, MODE_RGBA_4444 = 5, + MODE_RGB_565 = 6, + // YUV modes must come after RGB ones. + MODE_YUV = 7, MODE_YUVA = 8, // yuv 4:2:0 + MODE_LAST = 9 + } WEBP_CSP_MODE; + +// Generic structure for describing the sample buffer. +typedef struct { // view as RGBA + uint8_t* rgba; // pointer to RGBA samples + int stride; // stride in bytes from one scanline to the next. + int size; // total size of the *rgba buffer. 
+} WebPRGBABuffer; + +typedef struct { // view as YUVA + uint8_t* y, *u, *v, *a; // pointer to luma, chroma U/V, alpha samples + int y_stride; // luma stride + int u_stride, v_stride; // chroma strides + int a_stride; // alpha stride + int y_size; // luma plane size + int u_size, v_size; // chroma planes size + int a_size; // alpha-plane size +} WebPYUVABuffer; + +// Output buffer +typedef struct { + WEBP_CSP_MODE colorspace; // Colorspace. + int width, height; // Dimensions. + int is_external_memory; // If true, 'internal_memory' pointer is not used. + union { + WebPRGBABuffer RGBA; + WebPYUVABuffer YUVA; + } u; // Nameless union of buffer parameters. + uint8_t* private_memory; // Internally allocated memory (only when + // is_external_memory is false). Should not be used + // externally, but accessed via the buffer union. +} WebPDecBuffer; + +// Internal, version-checked, entry point +WEBP_EXTERN(int) WebPInitDecBufferInternal(WebPDecBuffer* const, int); + +// Initialize the structure as empty. Must be called before any other use. +// Returns false in case of version mismatch +static inline int WebPInitDecBuffer(WebPDecBuffer* const buffer) { + return WebPInitDecBufferInternal(buffer, WEBP_DECODER_ABI_VERSION); +} +// Free any memory associated with the buffer. Must always be called last. +// Note: doesn't free the 'buffer' structure itself. +WEBP_EXTERN(void) WebPFreeDecBuffer(WebPDecBuffer* const buffer); + +//------------------------------------------------------------------------------ // Enumeration of the status codes + typedef enum { VP8_STATUS_OK = 0, VP8_STATUS_OUT_OF_MEMORY, @@ -113,11 +176,11 @@ typedef enum { VP8_STATUS_NOT_ENOUGH_DATA } VP8StatusCode; -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Incremental decoding // -// This API allows streamlined decoding of partial data. 
-// Picture can be incrementally decoded as data become available thanks to the +// This API allows streamlined decoding of partial data. +// Picture can be incrementally decoded as data become available thanks to the // WebPIDecoder object. This object can be left in a SUSPENDED state if the // picture is only partially decoded, pending additional input. // Code example: @@ -138,16 +201,26 @@ typedef enum { typedef struct WebPIDecoder WebPIDecoder; +// Creates a new incremental decoder with the supplied buffer parameter. +// This output_buffer can be passed NULL, in which case a default output buffer +// is used (with MODE_RGB). Otherwise, an internal reference to 'output_buffer' +// is kept, which means that the lifespan of 'output_buffer' must be larger than +// that of the returned WebPIDecoder object. +// Returns NULL if the allocation failed. +WEBP_EXTERN(WebPIDecoder*) WebPINewDecoder(WebPDecBuffer* const output_buffer); + // Creates a WebPIDecoder object. Returns NULL in case of failure. -WebPIDecoder* WebPINew(WEBP_CSP_MODE mode); +// TODO(skal): DEPRECATED. Prefer using WebPINewDecoder(). +WEBP_EXTERN(WebPIDecoder*) WebPINew(WEBP_CSP_MODE mode); // This function allocates and initializes an incremental-decoder object, which // will output the r/g/b(/a) samples specified by 'mode' into a preallocated // buffer 'output_buffer'. The size of this buffer is at least // 'output_buffer_size' and the stride (distance in bytes between two scanlines) // is specified by 'output_stride'. Returns NULL if the allocation failed. -WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer, - int output_buffer_size, int output_stride); +WEBP_EXTERN(WebPIDecoder*) WebPINewRGB( + WEBP_CSP_MODE mode, + uint8_t* output_buffer, int output_buffer_size, int output_stride); // This function allocates and initializes an incremental-decoder object, which // will output the raw luma/chroma samples into a preallocated planes. 
The luma @@ -156,41 +229,165 @@ WebPIDecoder* WebPINewRGB(WEBP_CSP_MODE mode, uint8_t* output_buffer, // 'u_size' and 'u_stride' parameters, and the chroma-v plane by 'v', 'v_size' // and 'v_size'. // Returns NULL if the allocation failed. -WebPIDecoder* WebPINewYUV(uint8_t* luma, int luma_size, int luma_stride, - uint8_t* u, int u_size, int u_stride, - uint8_t* v, int v_size, int v_stride); +WEBP_EXTERN(WebPIDecoder*) WebPINewYUV( + uint8_t* luma, int luma_size, int luma_stride, + uint8_t* u, int u_size, int u_stride, + uint8_t* v, int v_size, int v_stride); -// Deletes the WebpBuffer object and associated memory. Must always be called +// Deletes the WebPIDecoder object and associated memory. Must always be called // if WebPINew, WebPINewRGB or WebPINewYUV succeeded. -void WebPIDelete(WebPIDecoder* const idec); +WEBP_EXTERN(void) WebPIDelete(WebPIDecoder* const idec); // Copies and decodes the next available data. Returns VP8_STATUS_OK when // the image is successfully decoded. Returns VP8_STATUS_SUSPENDED when more // data is expected. Returns error in other cases. -VP8StatusCode WebPIAppend(WebPIDecoder* const idec, const uint8_t* data, - uint32_t data_size); +WEBP_EXTERN(VP8StatusCode) WebPIAppend( + WebPIDecoder* const idec, const uint8_t* data, uint32_t data_size); // A variant of the above function to be used when data buffer contains // partial data from the beginning. In this case data buffer is not copied // to the internal memory. // Note that the value of the 'data' pointer can change between calls to // WebPIUpdate, for instance when the data buffer is resized to fit larger data. -VP8StatusCode WebPIUpdate(WebPIDecoder* const idec, const uint8_t* data, - uint32_t data_size); +WEBP_EXTERN(VP8StatusCode) WebPIUpdate( + WebPIDecoder* const idec, const uint8_t* data, uint32_t data_size); -// Returns the RGB image decoded so far. Returns NULL if output params are not -// initialized yet. *last_y is the index of last decoded row in raster scan -// order. 
Some pointers (*last_y, *width etc.) can be NULL if corresponding -// information is not needed. -uint8_t* WebPIDecGetRGB(const WebPIDecoder* const idec, int *last_y, - int* width, int* height, int* stride); +// Returns the r/g/b/(a) image decoded so far. Returns NULL if output params +// are not initialized yet. The r/g/b/(a) output type corresponds to the mode +// specified in WebPINew()/WebPINewRGB(). *last_y is the index of last decoded +// row in raster scan order. Some pointers (*last_y, *width etc.) can be NULL if +// corresponding information is not needed. +WEBP_EXTERN(uint8_t*) WebPIDecGetRGB( + const WebPIDecoder* const idec, int* last_y, + int* width, int* height, int* stride); // Same as above function to get YUV image. Returns pointer to the luma plane // or NULL in case of error. -uint8_t* WebPIDecGetYUV(const WebPIDecoder* const idec, int* last_y, - uint8_t** u, uint8_t** v, - int* width, int* height, int* stride, int* uv_stride); +WEBP_EXTERN(uint8_t*) WebPIDecGetYUV( + const WebPIDecoder* const idec, int* last_y, + uint8_t** u, uint8_t** v, + int* width, int* height, int* stride, int* uv_stride); + +// Generic call to retrieve information about the displayable area. +// If non NULL, the left/right/width/height pointers are filled with the visible +// rectangular area so far. +// Returns NULL in case the incremental decoder object is in an invalid state. +// Otherwise returns the pointer to the internal representation. This structure +// is read-only, tied to WebPIDecoder's lifespan and should not be modified. 
+WEBP_EXTERN(const WebPDecBuffer*) WebPIDecodedArea( + const WebPIDecoder* const idec, + int* const left, int* const top, + int* const width, int* const height); + +//------------------------------------------------------------------------------ +// Advanced decoding parametrization +// +// Code sample for using the advanced decoding API +/* + // A) Init a configuration object + WebPDecoderConfig config; + CHECK(WebPInitDecoderConfig(&config)); + + // B) optional: retrieve the bitstream's features. + CHECK(WebPGetFeatures(data, data_size, &config.input) == VP8_STATUS_OK); + + // C) Adjust 'config', if needed + config.no_fancy = 1; + config.output.colorspace = MODE_BGRA; + // etc. + + // Note that you can also make config.output point to an externally + // supplied memory buffer, provided it's big enough to store the decoded + // picture. Otherwise, config.output will just be used to allocate memory + // and store the decoded picture. + + // D) Decode! + CHECK(WebPDecode(data, data_size, &config) == VP8_STATUS_OK); + + // E) Decoded image is now in config.output (and config.output.u.RGBA) + + // F) Reclaim memory allocated in config's object. It's safe to call + // this function even if the memory is external and wasn't allocated + // by WebPDecode(). + WebPFreeDecBuffer(&config.output); +*/ + +// Features gathered from the bitstream +typedef struct { + int width; // the original width, as read from the bitstream + int height; // the original height, as read from the bitstream + int has_alpha; // true if bitstream contains an alpha channel + int no_incremental_decoding; // if true, using incremental decoding is not + // recommended. + int rotate; // TODO(later) + int uv_sampling; // should be 0 for now. TODO(later) + int bitstream_version; // should be 0 for now. 
TODO(later) +} WebPBitstreamFeatures; + +// Internal, version-checked, entry point +WEBP_EXTERN(VP8StatusCode) WebPGetFeaturesInternal( + const uint8_t*, uint32_t, WebPBitstreamFeatures* const, int); + +// Retrieve features from the bitstream. The *features structure is filled +// with information gathered from the bitstream. +// Returns false in case of error or version mismatch. +// In case of error, features->bitstream_status will reflect the error code. +static inline + VP8StatusCode WebPGetFeatures(const uint8_t* data, uint32_t data_size, + WebPBitstreamFeatures* const features) { + return WebPGetFeaturesInternal(data, data_size, features, + WEBP_DECODER_ABI_VERSION); +} + +// Decoding options +typedef struct { + int bypass_filtering; // if true, skip the in-loop filtering + int no_fancy_upsampling; // if true, use faster pointwise upsampler + int use_cropping; // if true, cropping is applied _first_ + int crop_left, crop_top; // top-left position for cropping. + // Will be snapped to even values. + int crop_width, crop_height; // dimension of the cropping area + int use_scaling; // if true, scaling is applied _afterward_ + int scaled_width, scaled_height; // final resolution + int force_rotation; // forced rotation (to be applied _last_) + int no_enhancement; // if true, discard enhancement layer + int use_threads; // if true, use multi-threaded decoding +} WebPDecoderOptions; + +// Main object storing the configuration for advanced decoding. +typedef struct { + WebPBitstreamFeatures input; // Immutable bitstream features (optional) + WebPDecBuffer output; // Output buffer (can point to external mem) + WebPDecoderOptions options; // Decoding options +} WebPDecoderConfig; + +// Internal, version-checked, entry point +WEBP_EXTERN(int) WebPInitDecoderConfigInternal(WebPDecoderConfig* const, int); + +// Initialize the configuration as empty. This function must always be +// called first, unless WebPGetFeatures() is to be called. 
+// Returns false in case of mismatched version. +static inline int WebPInitDecoderConfig(WebPDecoderConfig* const config) { + return WebPInitDecoderConfigInternal(config, WEBP_DECODER_ABI_VERSION); +} + +// Instantiate a new incremental decoder object with requested configuration. +// The bitstream can be passed using *data and data_size parameter, +// in which case the features will be parsed and stored into config->input. +// Otherwise, 'data' can be NULL and now parsing will occur. +// Note that 'config' can be NULL too, in which case a default configuration is +// used. +// The return WebPIDecoder object must always be deleted calling WebPIDelete(). +// Returns NULL in case of error (and config->status will then reflect +// the error condition). +WEBP_EXTERN(WebPIDecoder*) WebPIDecode(const uint8_t* data, uint32_t data_size, + WebPDecoderConfig* const config); +// Non-incremental version. This version decodes the full data at once, taking +// 'config' into account. Return decoding status (VP8_STATUS_OK if decoding +// was successful). +WEBP_EXTERN(VP8StatusCode) WebPDecode(const uint8_t* data, uint32_t data_size, + WebPDecoderConfig* const config); #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/webp/decode_vp8.h b/third_party/libwebp/webp/decode_vp8.h index 153a4c5..1ff4bda 100644 --- a/third_party/libwebp/webp/decode_vp8.h +++ b/third_party/libwebp/webp/decode_vp8.h @@ -12,18 +12,16 @@ #ifndef WEBP_WEBP_DECODE_VP8_H_ #define WEBP_WEBP_DECODE_VP8_H_ -#include "webp/decode.h" +#include "./decode.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -#define WEBP_DECODER_ABI_VERSION 0x0001 - -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Lower-level API // -// Thes functions provide fine-grained control of the decoding process. 
+// These functions provide fine-grained control of the decoding process. // The call flow should resemble: // // VP8Io io; @@ -40,14 +38,22 @@ extern "C" { // Input / Output typedef struct VP8Io VP8Io; +typedef int (*VP8IoPutHook)(const VP8Io* io); +typedef int (*VP8IoSetupHook)(VP8Io* io); +typedef void (*VP8IoTeardownHook)(const VP8Io* io); + struct VP8Io { // set by VP8GetHeaders() - int width, height; // picture dimensions, in pixels + int width, height; // picture dimensions, in pixels (invariable). + // These are the original, uncropped dimensions. + // The actual area passed to put() is stored + // in mb_w / mb_h fields. // set before calling put() int mb_y; // position of the current rows (in pixels) + int mb_w; // number of columns in the sample int mb_h; // number of rows in the sample - const uint8_t *y, *u, *v; // rows to copy (in yuv420 format) + const uint8_t* y, *u, *v; // rows to copy (in yuv420 format) int y_stride; // row stride for luma int uv_stride; // row stride for chroma @@ -56,20 +62,24 @@ struct VP8Io { // called when fresh samples are available. Currently, samples are in // YUV420 format, and can be up to width x 24 in size (depending on the // in-loop filtering level, e.g.). Should return false in case of error - // or abort request. - int (*put)(const VP8Io* io); + // or abort request. The actual size of the area to update is mb_w x mb_h + // in size, taking cropping into account. + VP8IoPutHook put; // called just before starting to decode the blocks. - // Should returns 0 in case of error. - int (*setup)(VP8Io* io); + // Must return false in case of setup error, true otherwise. If false is + // returned, teardown() will NOT be called. But if the setup succeeded + // and true is returned, then teardown() will always be called afterward. + VP8IoSetupHook setup; - // called just after block decoding is finished (or when an error occurred). 
- void (*teardown)(const VP8Io* io); + // Called just after block decoding is finished (or when an error occurred + // during put()). Is NOT called if setup() failed. + VP8IoTeardownHook teardown; // this is a recommendation for the user-side yuv->rgb converter. This flag // is set when calling setup() hook and can be overwritten by it. It then // can be taken into consideration during the put() method. - int fancy_upscaling; + int fancy_upsampling; // Input buffer. uint32_t data_size; @@ -80,16 +90,36 @@ struct VP8Io { // of more visible blocking. Note that output will also be non-compliant // with the VP8 specifications. int bypass_filtering; + + // Cropping parameters. + int use_cropping; + int crop_left, crop_right, crop_top, crop_bottom; + + // Scaling parameters. + int use_scaling; + int scaled_width, scaled_height; + + // pointer to the alpha data (if present) corresponding to the rows + const uint8_t* a; }; // Internal, version-checked, entry point -int VP8InitIoInternal(VP8Io* const, int); +WEBP_EXTERN(int) VP8InitIoInternal(VP8Io* const, int); + +// Set the custom IO function pointers and user-data. The setter for IO hooks +// should be called before initiating incremental decoding. Returns true if +// WebPIDecoder object is successfully modified, false otherwise. +WEBP_EXTERN(int) WebPISetIOHooks(WebPIDecoder* const idec, + VP8IoPutHook put, + VP8IoSetupHook setup, + VP8IoTeardownHook teardown, + void* user_data); // Main decoding object. This is an opaque structure. typedef struct VP8Decoder VP8Decoder; // Create a new decoder object. -VP8Decoder* VP8New(void); +WEBP_EXTERN(VP8Decoder*) VP8New(void); // Must be called to make sure 'io' is initialized properly. // Returns false in case of version mismatch. Upon such failure, no other @@ -99,26 +129,26 @@ static inline int VP8InitIo(VP8Io* const io) { } // Start decoding a new picture. Returns true if ok. 
-int VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io); +WEBP_EXTERN(int) VP8GetHeaders(VP8Decoder* const dec, VP8Io* const io); // Decode a picture. Will call VP8GetHeaders() if it wasn't done already. // Returns false in case of error. -int VP8Decode(VP8Decoder* const dec, VP8Io* const io); +WEBP_EXTERN(int) VP8Decode(VP8Decoder* const dec, VP8Io* const io); // Return current status of the decoder: -VP8StatusCode VP8Status(VP8Decoder* const dec); +WEBP_EXTERN(VP8StatusCode) VP8Status(VP8Decoder* const dec); // return readable string corresponding to the last status. -const char* VP8StatusMessage(VP8Decoder* const dec); +WEBP_EXTERN(const char*) VP8StatusMessage(VP8Decoder* const dec); // Resets the decoder in its initial state, reclaiming memory. // Not a mandatory call between calls to VP8Decode(). -void VP8Clear(VP8Decoder* const dec); +WEBP_EXTERN(void) VP8Clear(VP8Decoder* const dec); // Destroy the decoder object. -void VP8Delete(VP8Decoder* const dec); +WEBP_EXTERN(void) VP8Delete(VP8Decoder* const dec); -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/webp/encode.h b/third_party/libwebp/webp/encode.h index e0cc5dc..31f0539 100644 --- a/third_party/libwebp/webp/encode.h +++ b/third_party/libwebp/webp/encode.h @@ -14,35 +14,38 @@ #include <stdlib.h> -#include "webp/types.h" +#include "./types.h" #if defined(__cplusplus) || defined(c_plusplus) extern "C" { #endif -#define WEBP_ENCODER_ABI_VERSION 0x0001 +#define WEBP_ENCODER_ABI_VERSION 0x0002 // Return the encoder's version number, packed in hexadecimal using 8bits for // each of major/minor/revision. E.g: v2.5.7 is 0x020507. 
-int WebPGetEncoderVersion(void); +WEBP_EXTERN(int) WebPGetEncoderVersion(void); -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // One-stop-shop call! No questions asked: // Returns the size of the compressed data (pointed to by *output), or 0 if // an error occurred. The compressed data must be released by the caller // using the call 'free(*output)'. -// Currently, alpha values are discarded. -size_t WebPEncodeRGB(const uint8_t* rgb, int width, int height, int stride, - float quality_factor, uint8_t** output); -size_t WebPEncodeBGR(const uint8_t* bgr, int width, int height, int stride, - float quality_factor, uint8_t** output); -size_t WebPEncodeRGBA(const uint8_t* rgba, int width, int height, int stride, - float quality_factor, uint8_t** output); -size_t WebPEncodeBGRA(const uint8_t* bgra, int width, int height, int stride, - float quality_factor, uint8_t** output); - -//----------------------------------------------------------------------------- +WEBP_EXTERN(size_t) WebPEncodeRGB(const uint8_t* rgb, + int width, int height, int stride, + float quality_factor, uint8_t** output); +WEBP_EXTERN(size_t) WebPEncodeBGR(const uint8_t* bgr, + int width, int height, int stride, + float quality_factor, uint8_t** output); +WEBP_EXTERN(size_t) WebPEncodeRGBA(const uint8_t* rgba, + int width, int height, int stride, + float quality_factor, uint8_t** output); +WEBP_EXTERN(size_t) WebPEncodeBGRA(const uint8_t* bgra, + int width, int height, int stride, + float quality_factor, uint8_t** output); + +//------------------------------------------------------------------------------ // Coding parameters typedef struct { @@ -66,6 +69,9 @@ typedef struct { int preprocessing; // preprocessing filter (0=none, 1=segment-smooth) int partitions; // log2(number of token partitions) in [0..3] // Default is set to 0 for easier progressive decoding. 
+ int partition_limit; // quality degradation allowed to fit the 512k limit on + // prediction modes coding (0=no degradation, 100=full) + int alpha_compression; // Algorithm for optimizing the alpha plane (0 = none) } WebPConfig; // Enumerate some predefined settings for WebPConfig, depending on the type @@ -80,7 +86,8 @@ typedef enum { } WebPPreset; // Internal, version-checked, entry point -int WebPConfigInitInternal(WebPConfig* const, WebPPreset, float, int); +WEBP_EXTERN(int) WebPConfigInitInternal( + WebPConfig* const, WebPPreset, float, int); // Should always be called, to initialize a fresh WebPConfig structure before // modification. Returns 0 in case of version mismatch. WebPConfigInit() must @@ -101,25 +108,28 @@ static inline int WebPConfigPreset(WebPConfig* const config, } // Returns 1 if all parameters are in valid range and the configuration is OK. -int WebPValidateConfig(const WebPConfig* const config); +WEBP_EXTERN(int) WebPValidateConfig(const WebPConfig* const config); -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // Input / Output typedef struct WebPPicture WebPPicture; // main structure for I/O -// non-essential structure for storing auxilliary statistics +// non-essential structure for storing auxiliary statistics typedef struct { float PSNR[4]; // peak-signal-to-noise ratio for Y/U/V/All int coded_size; // final size int block_count[3]; // number of intra4/intra16/skipped macroblocks - int header_bytes[2]; // approximative number of bytes spent for header + int header_bytes[2]; // approximate number of bytes spent for header // and mode-partition #0 - int residual_bytes[3][4]; // approximative number of bytes spent for + int residual_bytes[3][4]; // approximate number of bytes spent for // DC/AC/uv coefficients for each (0..3) segments. 
int segment_size[4]; // number of macroblocks in each segments int segment_quant[4]; // quantizer values for each segments int segment_level[4]; // filtering strength for each segments [0..63] + + int alpha_data_size; // size of the transparency data + int layer_data_size; // size of the enhancement layer data } WebPAuxStats; // Signature for output function. Should return 1 if writing was successful. @@ -128,13 +138,46 @@ typedef struct { typedef int (*WebPWriterFunction)(const uint8_t* data, size_t data_size, const WebPPicture* const picture); +typedef enum { + // chroma sampling + WEBP_YUV420 = 0, // 4:2:0 + WEBP_YUV422 = 1, // 4:2:2 + WEBP_YUV444 = 2, // 4:4:4 + WEBP_YUV400 = 3, // grayscale + WEBP_CSP_UV_MASK = 3, // bit-mask to get the UV sampling factors + // alpha channel variants + WEBP_YUV420A = 4, + WEBP_YUV422A = 5, + WEBP_YUV444A = 6, + WEBP_YUV400A = 7, // grayscale + alpha + WEBP_CSP_ALPHA_BIT = 4 // bit that is set if alpha is present +} WebPEncCSP; + +// Encoding error conditions. +typedef enum { + VP8_ENC_OK = 0, + VP8_ENC_ERROR_OUT_OF_MEMORY, // memory error allocating objects + VP8_ENC_ERROR_BITSTREAM_OUT_OF_MEMORY, // memory error while flushing bits + VP8_ENC_ERROR_NULL_PARAMETER, // a pointer parameter is NULL + VP8_ENC_ERROR_INVALID_CONFIGURATION, // configuration is invalid + VP8_ENC_ERROR_BAD_DIMENSION, // picture has invalid width/height + VP8_ENC_ERROR_PARTITION0_OVERFLOW, // partition is bigger than 512k + VP8_ENC_ERROR_PARTITION_OVERFLOW, // partition is bigger than 16M + VP8_ENC_ERROR_BAD_WRITE, // error while flushing bytes + VP8_ENC_ERROR_FILE_TOO_BIG, // file is bigger than 4G +} WebPEncodingError; + +// maximum width/height allowed (inclusive), in pixels +#define WEBP_MAX_DIMENSION 16383 + struct WebPPicture { // input - int colorspace; // colorspace: should be 0 for now (=Y'CbCr). - int width, height; // dimensions. + WebPEncCSP colorspace; // colorspace: should be YUV420 for now (=Y'CbCr). 
+ int width, height; // dimensions (less or equal to WEBP_MAX_DIMENSION) uint8_t *y, *u, *v; // pointers to luma/chroma planes. int y_stride, uv_stride; // luma/chroma strides. - uint8_t *a; // pointer to the alpha plane (unused for now). + uint8_t *a; // pointer to the alpha plane + int a_stride; // stride of the alpha plane // output WebPWriterFunction writer; // can be NULL @@ -152,10 +195,16 @@ struct WebPPicture { // where to store statistics, if not NULL: WebPAuxStats* stats; + + // original samples (for non-YUV420 modes) + uint8_t *u0, *v0; + int uv0_stride; + + WebPEncodingError error_code; // error code in case of problem. }; // Internal, version-checked, entry point -int WebPPictureInitInternal(WebPPicture* const, int); +WEBP_EXTERN(int) WebPPictureInitInternal(WebPPicture* const, int); // Should always be called, to initialize the structure. Returns 0 in case of // version mismatch. WebPPictureInit() must have succeeded before using the @@ -164,54 +213,64 @@ static inline int WebPPictureInit(WebPPicture* const picture) { return WebPPictureInitInternal(picture, WEBP_ENCODER_ABI_VERSION); } -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ // WebPPicture utils // Convenience allocation / deallocation based on picture->width/height: -// Allocate y/u/v buffers as per width/height specification. +// Allocate y/u/v buffers as per colorspace/width/height specification. // Note! This function will free the previous buffer if needed. // Returns 0 in case of memory error. -int WebPPictureAlloc(WebPPicture* const picture); +WEBP_EXTERN(int) WebPPictureAlloc(WebPPicture* const picture); // Release memory allocated by WebPPictureAlloc() or WebPPictureImport*() // Note that this function does _not_ free the memory pointed to by 'picture'. 
-void WebPPictureFree(WebPPicture* const picture); +WEBP_EXTERN(void) WebPPictureFree(WebPPicture* const picture); // Copy the pixels of *src into *dst, using WebPPictureAlloc. // Returns 0 in case of memory allocation error. -int WebPPictureCopy(const WebPPicture* const src, WebPPicture* const dst); +WEBP_EXTERN(int) WebPPictureCopy(const WebPPicture* const src, + WebPPicture* const dst); // self-crops a picture to the rectangle defined by top/left/width/height. // Returns 0 in case of memory allocation error, or if the rectangle is // outside of the source picture. -int WebPPictureCrop(WebPPicture* const picture, - int left, int top, int width, int height); +WEBP_EXTERN(int) WebPPictureCrop(WebPPicture* const picture, + int left, int top, int width, int height); + +// Rescale a picture to new dimension width x height. +// Now gamma correction is applied. +// Returns false in case of error (invalid parameter or insufficient memory). +WEBP_EXTERN(int) WebPPictureRescale(WebPPicture* const pic, + int width, int height); -// Colorspace conversion function. Previous buffer will be free'd, if any. +// Colorspace conversion function to import RGB samples. +// Previous buffer will be free'd, if any. // *rgb buffer should have a size of at least height * rgb_stride. // Returns 0 in case of memory error. -int WebPPictureImportRGB(WebPPicture* const picture, - const uint8_t* const rgb, int rgb_stride); -// Same, but for RGBA buffer. Alpha information is ignored. 
-int WebPPictureImportRGBA(WebPPicture* const picture, - const uint8_t* const rgba, int rgba_stride); - -// Variant of the above, but taking BGR input: -int WebPPictureImportBGR(WebPPicture* const picture, - const uint8_t* const bgr, int bgr_stride); -int WebPPictureImportBGRA(WebPPicture* const picture, - const uint8_t* const bgra, int bgra_stride); - -//----------------------------------------------------------------------------- +WEBP_EXTERN(int) WebPPictureImportRGB( + WebPPicture* const picture, const uint8_t* const rgb, int rgb_stride); +// Same, but for RGBA buffer +WEBP_EXTERN(int) WebPPictureImportRGBA( + WebPPicture* const picture, const uint8_t* const rgba, int rgba_stride); + +// Variant of the above, but taking BGR(A) input: +WEBP_EXTERN(int) WebPPictureImportBGR( + WebPPicture* const picture, const uint8_t* const bgr, int bgr_stride); +WEBP_EXTERN(int) WebPPictureImportBGRA( + WebPPicture* const picture, const uint8_t* const bgra, int bgra_stride); + +//------------------------------------------------------------------------------ // Main call -// Main encoding call, after config and picture have been initialiazed. -// 'picture' must be less than 16384x16384 in dimension, and the 'config' object -// must be a valid one. +// Main encoding call, after config and picture have been initialized. +// 'picture' must be less than 16384x16384 in dimension (cf WEBP_MAX_DIMENSION), +// and the 'config' object must be a valid one. // Returns false in case of error, true otherwise. -int WebPEncode(const WebPConfig* const config, WebPPicture* const picture); +// In case of error, picture->error_code is updated accordingly. 
+WEBP_EXTERN(int) WebPEncode( + const WebPConfig* const config, WebPPicture* const picture); -//----------------------------------------------------------------------------- +//------------------------------------------------------------------------------ #if defined(__cplusplus) || defined(c_plusplus) } // extern "C" diff --git a/third_party/libwebp/webp/types.h b/third_party/libwebp/webp/types.h index a6d2cd4..2882364 100644 --- a/third_party/libwebp/webp/types.h +++ b/third_party/libwebp/webp/types.h @@ -26,4 +26,10 @@ typedef long long int int64_t; #define inline __forceinline #endif /* _MSC_VER */ +#ifndef WEBP_EXTERN +// This explicitly marks library functions and allows for changing the +// signature for e.g., Windows DLL builds. +#define WEBP_EXTERN(type) extern type +#endif /* WEBP_EXTERN */ + #endif /* WEBP_WEBP_TYPES_H_ */ |