aboutsummaryrefslogtreecommitdiffstats
path: root/contrib
diff options
context:
space:
mode:
authorDinesh K Garg <dineshg@codeaurora.org>2010-12-28 15:43:58 -0800
committerSteve Kondik <shade@chemlab.org>2012-07-10 12:53:19 -0700
commitfb9d57017268c5cbe74145e3a677b473b14e0e36 (patch)
treef8d046cee468510ebcc4e58fccc0a3535080cbba /contrib
parent4c5554b04e73f89d4a9bab8cbcec1943d8c274be (diff)
downloadexternal_libpng-fb9d57017268c5cbe74145e3a677b473b14e0e36.zip
external_libpng-fb9d57017268c5cbe74145e3a677b473b14e0e36.tar.gz
external_libpng-fb9d57017268c5cbe74145e3a677b473b14e0e36.tar.bz2
VeNum optimizations to libpng to improve PNG decode time
Set correct counter in neon routine for SUB filter type. Enable Neon optimizations for all filter types and pixel depths. Change-Id: Ica0d39e828a9e0cba59cbc3632830e4eb3e59607 (cherry picked from commit b912f64bc4bb174fc055cda58e303faaa640b8b1) Conflicts: pngrutil.c
Diffstat (limited to 'contrib')
-rw-r--r--contrib/pngneon/png_read_filter_row_neon.s1170
1 files changed, 1170 insertions, 0 deletions
diff --git a/contrib/pngneon/png_read_filter_row_neon.s b/contrib/pngneon/png_read_filter_row_neon.s
new file mode 100644
index 0000000..1a45745
--- /dev/null
+++ b/contrib/pngneon/png_read_filter_row_neon.s
@@ -0,0 +1,1170 @@
+#; Copyright (c) 2010-2011, Code Aurora Forum. All rights reserved.
+#;
+#; Redistribution and use in source and binary forms, with or without
+#; modification, are permitted provided that the following conditions are
+#; met:
+#; * Redistributions of source code must retain the above copyright
+#; notice, this list of conditions and the following disclaimer.
+#; * Redistributions in binary form must reproduce the above
+#; copyright notice, this list of conditions and the following
+#; disclaimer in the documentation and/or other materials provided
+#; with the distribution.
+#; * Neither the name of Code Aurora Forum, Inc. nor the names of its
+#; contributors may be used to endorse or promote products derived
+#; from this software without specific prior written permission.
+#;
+#; THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
+#; WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+#; MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
+#; ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+#; BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+#; CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+#; SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+#; BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+#; WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+#; OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+#; IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#;==============================================================================
+
+ .code 32 @; Code is ARM ISA
+#;==============================================================================
+
+ .global png_read_filter_row_neon
+
+#;==============================================================================
+#; png_read_filter_row_neon: reverses the PNG filter of one row, in place.
+#; INPUTS: r0 rowbytes (bytes in the current row), r1 pixel_depth (bits/pixel)
+#; r2 row (start of current row), r3 prev_row (start of previous row)
+#; [sp,#0] filter: 1 = SUB, 2 = UP, 3 = AVG, 4 = PAETH;
+#; any other value (0 included) returns without touching the row
+#; r12 is used as scratch by every path.
+#; NOTE: Don't touch r5-r11
+#;==============================================================================
+.balign 32
+.type png_read_filter_row_neon, %function
+png_read_filter_row_neon:
+
+ ldr r12,[sp] @; r12 = filter type (stack argument)
+
+ cmp r12,#4 @; 4 -> PAETH
+ beq paeth_filter
+
+ cmp r12,#3 @; 3 -> AVG
+ beq avg_filter
+
+ cmp r12,#2 @; 2 -> UP
+ beq up_filter
+
+ cmp r12,#1 @; 1 -> SUB
+ beq sub_filter
+
+ @; everything else, including filter type 0,
+ @; needs no work: take the plain return
+
+ b DONE
+
+ #;; ---------------
+ #;; SUB filter type
+ #;; ---------------
+ #;; Adds to every byte the reconstructed byte bpp positions to its
+ #;; left; the first pixel of the row is left as it is.
+sub_filter:
+
+ stmdb sp!, {r4} @; r4 is scratch for the 1/2 bpp paths
+
+ add r12,r1,#7 @; bpp = bytes per pixel
+ lsr r12,r12,#3 @; = (pixel_depth + 7) >> 3
+ mov r1,r12 @; counter starts at bpp: pixel 0 needs no work
+
+ #;; r0 = rowbytes
+ #;; r1 = byte counter, starts at bpp
+ #;; r2 = row pointer
+ #;; r12 = bpp = counter/pointer step
+
+ cmp r1,r0 @; a one-pixel row has nothing to add
+ beq sub_filter_exit
+
+ cmp r12,#8
+ beq sub_filter_8bpp
+
+ cmp r12,#6
+ beq sub_filter_6bpp
+
+ cmp r12,#4
+ beq sub_filter_4bpp
+
+ cmp r12,#3
+ beq sub_filter_3bpp
+
+ cmp r12,#2
+ beq sub_filter_2bpp
+
+ cmp r12,#1
+ beq sub_filter_1bpp
+ @; any other bpp: the row is returned untouched
+sub_filter_exit:
+ b sub_filter_DONE @; restores r4 and returns
+
+
+sub_filter_1bpp:
+
+ #;; ----------------------------
+ #;; SUB filter, 1 byte per pixel
+ #;; ----------------------------
+
+ lsrs r4,r0,#4 @; r4 = floor(rowbytes/16)
+ @; = iteration count for loop16
+ beq sub_filter_1bpp_16bytes_done @; fewer than 16 bytes: byte loop only
+
+ vmov.i8 d21, #0 @; d21 = 0, zero fill for the vext below
+ vld1.8 {d16,d17}, [r2] @; load 16 pixels
+ @; d16 = a b c d e f g h
+ @; d17 = i j k l m n o p
+
+ mov r1, #0 @; r1 now counts the bytes completed
+sub_filter_1bpp_16bytes:
+ @; Running sum over 16 bytes: each shift moves
+ @; the partial sums up one byte lane, each add
+ @; folds the raw bytes (d16/d17) back in.
+
+ vshl.i64 d18, d16, #8 @; d18 = 0 a b c d e f g
+ vadd.i8 d18, d16, d18 @; d18 = a a+b b+c c+d d+e e+f f+g g+h
+
+ vshl.i64 d18, d18, #8 @; d18 = 0 a a+b b+c c+d d+e e+f f+g
+ vadd.i8 d18, d16, d18 @; d18 = a a+b a+b+c b+c+d c+d+e d+e+f e+f+g f+g+h
+
+ vshl.i64 d18, d18, #8 @; shift add continuously to propagate the sum of previous
+ vadd.i8 d18, d16, d18 @; and current pixels
+
+ vshl.i64 d18, d18, #8
+ vadd.i8 d18, d16, d18
+
+ vshl.i64 d18, d18, #8
+ vadd.i8 d18, d16, d18
+
+ vshl.i64 d18, d18, #8
+ vadd.i8 d18, d16, d18
+
+ vshl.i64 d18, d18, #8
+ vadd.i8 d18, d16, d18 @; maximum data size for shift is 64 bits i.e. doubleword.
+ @; after computing the value of all the pixels in the double word
+ @; extract the last computed value which will be used by
+ @; the next set of pixels (i.e next doubleword)
+ vext.8 d22, d18, d21, #7 @; d22[0] = d18[7] = a+b+c+d+e+f+g+h, lanes 1-7 = 0
+ vadd.i8 d17, d17, d22 @; d17 = a+b+c+d+e+f+g+h+i j k l m n o p
+
+ vshl.i64 d19, d17, #8 @; continue shift-add as the first half
+ vadd.i8 d19, d17, d19
+
+ vshl.i64 d19, d19, #8
+ vadd.i8 d19, d17, d19
+
+ vshl.i64 d19, d19, #8
+ vadd.i8 d19, d17, d19
+
+ vshl.i64 d19, d19, #8
+ vadd.i8 d19, d17, d19
+
+ vshl.i64 d19, d19, #8
+ vadd.i8 d19, d17, d19
+
+ vshl.i64 d19, d19, #8
+ vadd.i8 d19, d17, d19
+
+ vshl.i64 d19, d19, #8
+ vadd.i8 d19, d17, d19
+
+ vst1.8 {d18,d19},[r2]! @; store the result back
+
+ add r1, r1, #16 @; add 16 to the loop counter(no of bytes completed)
+ subs r4,r4,#1 @; decrement iteration count
+ beq sub_filter_1bpp_16bytes_adjust
+
+
+ vext.8 d22, d19, d21, #7 @; more iterations to go
+ @; extract the last computed value
+ vld1.8 {d16,d17}, [r2] @; load the next 16 bytes
+ vadd.i8 d16, d16, d22 @; set up the input by adding the previous pixel
+ @; value to the input
+ b sub_filter_1bpp_16bytes
+
+sub_filter_1bpp_16bytes_adjust:
+
+ cmp r1, r0 @; no more pixels left .. exit
+ sub r2, r2, #1 @; more pixels remaining
+ @; r2 points to the current pixel adjust it
+ @; so that it points to the prev pixel for the below loop
+ beq sub_filter_DONE @; tests the cmp above; sub does not set flags
+
+sub_filter_1bpp_16bytes_done:
+ @; byte-at-a-time loop for the rest of the row;
+ @; exits only when the counter equals rowbytes
+ vld1.8 {d0[0]},[r2]! @; load 1 byte (1 pixel) into D0[0]
+ @; increment row pointer
+sub_filter_1bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne below
+
+ vld1.8 {d2[0]},[r2] @; load 1 byte (current pixel) into D2[0]
+
+ vadd.i8 d0,d0,d2 @; vector add 1 byte of previous pixel with
+ @; 1 byte of current pixel
+ vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel) back
+ @; into row pointer location and increment
+ @; row pointer
+
+ bne sub_filter_1bpp_loop @; loop back until loop counter == rowbytes
+
+ b sub_filter_DONE @; return
+
+ #;; -----------------------------
+ #;; SUB filter, 2 bytes per pixel
+ #;; -----------------------------
+sub_filter_2bpp:
+
+ lsrs r4,r0,#4 @; r4 = floor(rowbytes/16)
+ @; = iteration count for loop16
+ beq sub_filter_2bpp_16bytes_done @; fewer than 16 bytes: pixel loop only
+
+ vmov.i8 d21, #0 @; d21 = 0, zero fill for the vext below
+ vld1.8 {d16,d17}, [r2] @; load 16 bytes to q8
+ @; d16 = a b c d e f g h
+ @; d17 = i j k l m n o p
+ mov r1, #0 @; r1 now counts the bytes completed
+sub_filter_2bpp_16bytes:
+ @; running sum per pixel: 4 pixels per doubleword, so 3 shift-add steps
+ vshl.i64 d18, d16, #16 @; each pixel is 2bytes .. shift by 16 bits to get previous pixel
+ vadd.i8 d18, d16, d18 @; add to the current pixel
+
+ vshl.i64 d18, d18, #16 @; shift-add to propagate the computed sum as the case for 1bpp
+ vadd.i8 d18, d16, d18
+
+ vshl.i64 d18, d18, #16
+ vadd.i8 d18, d16, d18
+
+
+ vext.8 d22, d18, d21, #6 @; extract the last computed value (i.e. last 2 bytes)
+ vadd.i8 d17, d17, d22 @; add the last computed pixel to the input
+
+ vshl.i64 d19, d17, #16
+ vadd.i8 d19, d17, d19
+
+ vshl.i64 d19, d19, #16
+ vadd.i8 d19, d17, d19
+
+ vshl.i64 d19, d19, #16
+ vadd.i8 d19, d17, d19
+
+
+ vst1.8 {d18,d19},[r2]! @; store the result back
+
+
+ add r1, r1, #16 @; add 16 to the loop counter(no of bytes completed)
+ subs r4,r4,#1 @; decrement iteration count
+ beq sub_filter_2bpp_16bytes_adjust
+
+
+ vext.8 d22, d19, d21, #6 @; extract the last computed value
+ @; add the last computed pixel to the input
+ vld1.8 {d16,d17}, [r2]
+ vadd.i8 d16, d16, d22
+
+ b sub_filter_2bpp_16bytes
+
+
+sub_filter_2bpp_16bytes_adjust:
+
+ cmp r1, r0 @; no more pixels left .. exit
+ sub r2, r2, #2 @; more pixels remaining
+ @; r2 points to the current pixel adjust it
+ @; so that it points to the prev pixel for the below loop
+ beq sub_filter_DONE @; tests the cmp above; sub does not set flags
+
+sub_filter_2bpp_16bytes_done:
+ @; pixel-at-a-time loop for the rest of the row
+ vld1.16 {d0[0]},[r2]! @; load 2 bytes (1 pixel) into D0[0]
+ @; increment row pointer
+sub_filter_2bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne below
+
+ vld1.16 {d2[0]},[r2] @; load 2 bytes (current pixel) into D2[0]
+ vadd.i8 d0,d0,d2 @; vector add 2 bytes of previous pixel with
+ @; 2 bytes of current pixel
+ vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel) back
+ @; into row pointer location and increment
+ @; row pointer
+
+ bne sub_filter_2bpp_loop @; loop back until loop counter == rowbytes
+ @
+ b sub_filter_DONE @ ; return
+
+ #;; -----------------------------
+ #;; SUB filter, 3 bytes per pixel
+ #;; -----------------------------
+sub_filter_3bpp:
+ vld1.32 {d0[0]},[r2],r12 @; d0[0] = first pixel (kept as is) plus one
+ @; unused byte; row pointer += bpp
+sub_filter_3bpp_loop:
+ vld1.32 {d2[0]},[r2] @; d2[0] = raw current pixel + 1 extra byte
+ @; NOTE(review): on the last pixel that byte
+ @; lies past rowbytes - presumably the row
+ @; buffer has slack; confirm with the caller
+ add r1,r1,r12 @; loop counter += bpp
+ vadd.i8 d0,d0,d2 @; current += left, each byte modulo 256;
+ @; d0 is the left pixel of the next pass
+ vst1.16 {d0[0]},[r2]! @; write back bytes 0-1 of the result
+ vst1.8 {d0[2]},[r2]! @; write back byte 2; row pointer is now on
+ @; the next pixel
+ cmp r1,r0 @; reached rowbytes?
+
+ bne sub_filter_3bpp_loop @; no: loop until loop counter == rowbytes
+
+
+ b sub_filter_DONE @; return
+
+ #;; -----------------------------
+ #;; SUB filter, 4 bytes per pixel
+ #;; -----------------------------
+sub_filter_4bpp:
+ vld1.32 {d0[0]},[r2]! @; d0[0] = first pixel, which stays as it is;
+ @; step to the second pixel
+sub_filter_4bpp_loop:
+ @; invariant: d0[0] = reconstructed pixel to
+ @; the left of [r2], r1 = bytes done so far
+
+ vld1.32 {d2[0]},[r2] @; d2[0] = raw current pixel
+ add r1,r1,r12 @; loop counter += bpp
+ vadd.i8 d0,d0,d2 @; current += left, each byte modulo 256
+ vst1.32 {d0[0]},[r2]! @; write the result back and step to the
+ @; next pixel
+ cmp r1,r0 @; reached rowbytes?
+
+ bne sub_filter_4bpp_loop @; no: loop until loop counter == rowbytes
+
+
+ b sub_filter_DONE @; return
+
+ #;; -----------------------------
+ #;; SUB filter, 6 bytes per pixel
+ #;; -----------------------------
+sub_filter_6bpp:
+ vld1.8 {d0},[r2],r12 @; d0 = first pixel (kept as is) plus two
+ @; unused bytes; row pointer += bpp
+sub_filter_6bpp_loop:
+ vld1.8 {d2},[r2] @; d2 = raw current pixel + 2 extra bytes
+ @; NOTE(review): on the last pixel those bytes
+ @; lie past rowbytes - presumably the row
+ @; buffer has slack; confirm with the caller
+ add r1,r1,r12 @; loop counter += bpp
+ vadd.i8 d0,d0,d2 @; current += left, each byte modulo 256;
+ @; only lanes 0-5 are ever stored
+ vst1.32 {d0[0]},[r2]! @; write back bytes 0-3 of the result
+ vst1.16 {d0[2]},[r2]! @; write back bytes 4-5; row pointer is now
+ @; on the next pixel
+ cmp r1,r0 @; reached rowbytes?
+
+ bne sub_filter_6bpp_loop @; no: loop until loop counter == rowbytes
+
+
+ b sub_filter_DONE @; return
+
+ #;; -----------------------------
+ #;; SUB filter, 8 bytes per pixel
+ #;; -----------------------------
+sub_filter_8bpp:
+ vld1.8 {d0},[r2]! @; d0 = first pixel, which stays as it is;
+ @; step to the second pixel
+sub_filter_8bpp_loop:
+ @; invariant: d0 = reconstructed pixel to the
+ @; left of [r2], r1 = bytes done so far
+ vld1.8 {d2},[r2] @; d2 = raw current pixel
+ add r1,r1,r12 @; loop counter += bpp
+ vadd.i8 d0,d0,d2 @; current += left, each byte modulo 256
+ vst1.8 {d0},[r2]! @; write the result back and step to the
+ @; next pixel
+ cmp r1,r0 @; reached rowbytes?
+
+
+ bne sub_filter_8bpp_loop @; no: loop until loop counter == rowbytes
+
+ b sub_filter_DONE @; return
+
+sub_filter_DONE:
+ @; common exit of every SUB variant
+ ldmia sp!, {r4} @; restore the r4 saved on entry to sub_filter
+ bx lr @; return to the caller
+
+ #;; --------------
+ #;; UP filter type
+ #;; --------------
+up_filter:
+
+ #;; r0 = rowbytes
+ #;; r1 = pixel_depth (not required for UP filter type)
+ #;; r2 = row pointer
+ #;; r3 = previous row pointer
+ #;; Adds prev_row to row byte by byte, in chunks of 32, 16, 8, 4, 2
+ #;; and 1 bytes; r0 is reduced to the bytes still left after each chunk.
+ lsrs r1,r0,#5 @; r1 = floor(rowbytes/32)
+ @; = iteration count for loop32
+ beq up_filter_32bytes_proc_done
+
+
+up_filter_32bytes_proc:
+
+
+ mov r12, r2 @; r12 = read cursor in current row; r2 stays the write cursor
+
+ vld1.8 {q0},[r3]! @; load 32 bytes from previous
+ vld1.8 {q2},[r3]! @; row and increment pointer
+ @
+ @
+ vld1.8 {q1},[r12]! @; load 32 bytes from current row
+ vld1.8 {q3},[r12]! @
+ @
+ @
+ @
+ vadd.i8 q0,q0,q1 @; vector add of 2 x 16 bytes
+ vadd.i8 q2,q2,q3 @
+ @
+ @
+ @
+ vst1.8 {q0},[r2]! @; store 32 bytes to current row
+ vst1.8 {q2},[r2]! @
+ @; and increment pointer
+ sub r0,r0,#32 @; subtract 32 from rowbytes
+ subs r1,r1,#1 @; decrement iteration count
+ bne up_filter_32bytes_proc
+
+
+
+up_filter_32bytes_proc_done:
+
+ lsrs r1,r0,#4 @; r1 = floor(rowbytes/16)
+ @; = iteration count for loop16
+ beq up_filter_16bytes_proc_done
+
+up_filter_16bytes_proc:
+
+ vld1.8 {q0},[r3]! @; load 16 bytes from previous
+ @; row and increment pointer
+ vld1.8 {q1},[r2] @; load 16 bytes from current row
+ vadd.i8 q0,q0,q1 @; vector add of 16 bytes
+ vst1.8 {q0},[r2]! @; store 16 bytes to current row
+ @; and increment pointer
+ sub r0,r0,#16 @; subtract 16 from rowbytes
+ subs r1,r1,#1 @; decrement iteration count
+ bne up_filter_16bytes_proc
+
+up_filter_16bytes_proc_done:
+ @; fewer than 16 bytes remain, so each stage below runs at most once
+ lsrs r1,r0,#3 @; r1 = floor(rowbytes/8)
+ beq up_filter_8bytes_proc_done
+
+up_filter_8bytes_proc:
+
+ vld1.8 {d0},[r3]! @; load 8 bytes from previous
+ @; row and increment pointer
+ vld1.8 {d2},[r2] @; load 8 bytes from current row
+ vadd.i8 d0,d0,d2 @; vector add 8 bytes
+ vst1.8 {d0},[r2]! @; store 8 bytes to current row
+ @; and increment pointer
+ sub r0,r0,#8 @; subtract 8 from rowbytes
+
+up_filter_8bytes_proc_done:
+
+ lsrs r1,r0,#2 @; r1 = floor(rowbytes/4)
+ beq up_filter_4bytes_proc_done
+
+up_filter_4bytes_proc:
+
+ vld1.32 {d0[0]},[r3]! @; load 4 bytes from previous row
+ @; and increment pointer
+ vld1.32 {d2[0]},[r2] @; load 4 bytes from current row
+ vadd.i8 d0,d0,d2 @; vector add 4 bytes
+ vst1.32 {d0[0]},[r2]! @; store 4 bytes to current row
+ @; and increment pointer
+ sub r0,r0,#4 @; subtract 4 from rowbytes
+
+up_filter_4bytes_proc_done:
+
+ lsrs r1,r0,#1 @; r1 = floor(rowbytes/2)
+ beq up_filter_2bytes_proc_done
+
+up_filter_2bytes_proc:
+
+ vld1.16 {d0[0]},[r3]! @; load 2 bytes from previous row
+ @; and increment pointer
+ vld1.16 {d2[0]},[r2] @; load 2 bytes from current row
+ vadd.i8 d0,d0,d2 @; vector add 2 bytes
+ vst1.16 {d0[0]},[r2]! @; store 2 bytes to current row
+ @; and increment pointer
+ sub r0,r0,#2 @; subtract 2 from rowbytes
+
+up_filter_2bytes_proc_done:
+
+ cmp r0,#0 @; at most one byte can be left here
+ beq up_filter_1byte_proc_done
+
+up_filter_1byte_proc:
+
+ vld1.8 {d0[0]},[r3]! @; load 1 byte from previous row
+ @; and increment pointer
+ vld1.8 {d2[0]},[r2] @; load 1 byte from current row
+ vadd.i8 d0,d0,d2 @; vector add 1 byte
+ vst1.8 {d0[0]},[r2]! @; store 1 byte to current row
+ @; and increment pointer
+up_filter_1byte_proc_done:
+
+ b DONE
+
+ #;; ---------------
+ #;; AVG filter type
+ #;; ---------------
+avg_filter:
+ @; work out bpp, then jump to the loop written for that pixel size
+ add r12,r1,#7 @; bpp = bytes per pixel
+ lsr r12,r12,#3 @; = (pixel_depth + 7) >> 3
+ mov r1,r12 @; counter starts at bpp (one pixel)
+
+ #;; r0 = rowbytes
+ #;; r1 = byte counter, starts at bpp
+ #;; r2 = row pointer
+ #;; r3 = previous row pointer
+ #;; r12 = bpp = counter/pointer step
+
+ cmp r12,#8
+ beq avg_filter_8bpp
+
+ cmp r12,#6
+ beq avg_filter_6bpp
+
+ cmp r12,#4
+ beq avg_filter_4bpp
+
+ cmp r12,#3
+ beq avg_filter_3bpp
+
+ cmp r12,#2
+ beq avg_filter_2bpp
+
+ cmp r12,#1
+ beq avg_filter_1bpp
+ @; any other bpp: the row is returned untouched
+avg_filter_exit:
+ b DONE @; return
+
+ #;; ----------------------------
+ #;; AVG filter, 1 byte per pixel
+ #;; ----------------------------
+avg_filter_1bpp:
+ @; x += (a + b) >> 1 per byte; a = left pixel (absent for pixel 0), b = above
+ cmp r1,r0 @; Z = single-pixel row, consumed by the beq
+ @; below (NEON ops leave the flags alone)
+ vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr
+ @; row into d0[0]
+ vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev
+ @; row into d1[0]
+ @; increment prev row pointer
+ vsra.u8 d0,d1,#1 @; pixel 0 has no left neighbour:
+ @; x += b >> 1
+ vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel x)
+ @; increment curr row pointer
+ @; updated pixel x is now pixel a
+ beq DONE
+
+avg_filter_1bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne below
+
+ @; here d0[0] = pixel a (left, reconstructed)
+ vld1.8 {d2[0]},[r2] @; load 1 byte (pixel x) from curr
+ @; row into d2[0]
+ vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev
+ @; row into d1[0]
+ vhadd.u8 d1,d0,d1 @; d1[0] = (a + b) >> 1 in one halving add;
+ @; the sum is not truncated before the shift
+ vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
+ vst1.8 {d0[0]},[r2]! @; store 1 byte (updated pixel x)
+ @; increment curr row pointer
+ bne avg_filter_1bpp_loop
+
+ b DONE @; exit loop when
+ @; loop counter == rowbytes
+ #;; -----------------------------
+ #;; AVG filter, 2 bytes per pixel
+ #;; -----------------------------
+avg_filter_2bpp:
+ @; x += (a + b) >> 1 per byte; a = left pixel (absent for pixel 0), b = above
+ cmp r1,r0 @; Z = single-pixel row, consumed by the beq
+ @; below (NEON ops leave the flags alone)
+ vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr
+ @; row into d0[0]
+ vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev
+ @; row into d1[0]
+ @; increment prev row pointer
+ vsra.u8 d0,d1,#1 @; pixel 0 has no left neighbour:
+ @; x += b >> 1
+ vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
+ @; increment curr row pointer
+ @; updated pixel x is now pixel a
+ beq DONE
+
+avg_filter_2bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne below
+
+ @; here d0[0] = pixel a (left, reconstructed)
+ vld1.16 {d2[0]},[r2] @; load 2 bytes (pixel x) from curr
+ @; row into d2[0]
+ vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev
+ @; row into d1[0]
+ vhadd.u8 d1,d0,d1 @; d1[0] = (a + b) >> 1 in one halving add;
+ @; the sum is not truncated before the shift
+ vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
+ vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
+ @; increment curr row pointer
+
+ bne avg_filter_2bpp_loop
+
+ b DONE @; exit loop when
+ @; loop counter == rowbytes
+
+ #;; -----------------------------
+ #;; AVG filter, 3 bytes per pixel
+ #;; -----------------------------
+avg_filter_3bpp:
+ @; x += (a + b) >> 1 per byte; a = left pixel (absent for pixel 0), b = above
+ cmp r1,r0 @; Z = single-pixel row, consumed by the beq
+ @; below (NEON ops leave the flags alone)
+ vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra
+ @; byte) from curr row into d0[0]
+ vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
+ @; byte) from prev row into d1[0]
+ @; increment prev row pointer
+ vsra.u8 d0,d1,#1 @; pixel 0 has no left neighbour:
+ @; x += b >> 1
+ vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
+ @; increment curr row pointer
+ vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel x)
+ @; increment curr row pointer
+ @; updated pixel x is now pixel a
+ beq DONE
+
+avg_filter_3bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne below
+ @; NOTE(review): the extra byte of the last loads lies past rowbytes
+ vld1.32 {d2[0]},[r2] @; load 4 bytes (pixel x + 1 extra
+ @; byte) from curr row into d2[0]
+ vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
+ @; byte) from prev row into d1[0]
+ vhadd.u8 d1,d0,d1 @; d1[0] = (a + b) >> 1 in one halving add;
+ @; the sum is not truncated before the shift
+ vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
+ vst1.16 {d0[0]},[r2]! @; store 2 bytes (updated pixel x)
+ @; increment curr row pointer
+ vst1.8 {d0[2]},[r2]! @; store 1 byte (updated pixel x)
+ @; increment curr row pointer
+
+ bne avg_filter_3bpp_loop
+
+ b DONE @; exit loop when
+ @; loop counter == rowbytes
+ #;; -----------------------------
+ #;; AVG filter, 4 bytes per pixel
+ #;; -----------------------------
+avg_filter_4bpp:
+ @; x += (a + b) >> 1 per byte; a = left pixel (absent for pixel 0), b = above
+ cmp r1,r0 @; Z = single-pixel row, consumed by the beq
+ @; below (NEON ops leave the flags alone)
+ vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr
+ @; row into d0[0]
+ vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev
+ @; row into d1[0]
+ @; increment prev row pointer
+ vsra.u8 d0,d1,#1 @; pixel 0 has no left neighbour:
+ @; x += b >> 1
+ vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
+ @; increment curr row pointer
+ @; updated pixel x is now pixel a
+ beq DONE
+
+avg_filter_4bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne below
+
+ @; here d0[0] = pixel a (left, reconstructed)
+ vld1.32 {d2[0]},[r2] @; load 4 bytes (pixel x) from curr
+ @; row into d2[0]
+ vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev
+ @; row into d1[0]
+ vhadd.u8 d1,d0,d1 @; d1[0] = (a + b) >> 1 in one halving add;
+ @; the sum is not truncated before the shift
+ vadd.i8 d0,d2,d1 @; d0[0] = x + ((a + b)/2)
+ vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
+ @; increment curr row pointer
+ bne avg_filter_4bpp_loop
+
+ b DONE @; exit loop when
+ @; loop counter == rowbytes
+ #;; -----------------------------
+ #;; AVG filter, 6 bytes per pixel
+ #;; -----------------------------
+avg_filter_6bpp:
+ @; x += (a + b) >> 1 per byte; a = left pixel (absent for pixel 0), b = above
+ cmp r1,r0 @; Z = single-pixel row, consumed by the beq
+ @; below (NEON ops leave the flags alone)
+ vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra
+ @; bytes) from curr row into d0
+ vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra
+ @; bytes) from prev row into d1
+ @; increment prev row pointer
+ vsra.u8 d0,d1,#1 @; pixel 0 has no left neighbour:
+ @; x += b >> 1
+ vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
+ @; increment curr row pointer
+ @; updated pixel x is now pixel a
+ vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel x)
+ @; increment curr row pointer
+ @; updated pixel x is now pixel a
+ beq DONE
+
+avg_filter_6bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne below
+
+ @; NOTE(review): the 2 extra bytes of the last loads lie past rowbytes
+ vld1.8 {d2},[r2] @; load 8 bytes (pixel x + 2 extra
+ @; bytes) from curr row into d2
+ vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra
+ @; bytes) from prev row into d1
+ vhadd.u8 d1,d0,d1 @; d1 = (a + b) >> 1 in one halving add;
+ @; the sum is not truncated before the shift
+ vadd.i8 d0,d2,d1 @; d0 = x + ((a + b)/2)
+ vst1.32 {d0[0]},[r2]! @; store 4 bytes (updated pixel x)
+ @; increment curr row pointer
+ vst1.16 {d0[2]},[r2]! @; store 2 bytes (updated pixel x)
+ @; increment curr row pointer
+ bne avg_filter_6bpp_loop
+
+ b DONE @; exit loop when
+ @; loop counter == rowbytes
+ #;; -----------------------------
+ #;; AVG filter, 8 bytes per pixel
+ #;; -----------------------------
+avg_filter_8bpp:
+ @; x += (a + b) >> 1 per byte; a = left pixel (absent for pixel 0), b = above
+ cmp r1,r0 @; Z = single-pixel row, consumed by the beq
+ @; below (NEON ops leave the flags alone)
+ vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr
+ @; row into d0
+ vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev
+ @; row into d1
+ @; increment prev row pointer
+ vsra.u8 d0,d1,#1 @; pixel 0 has no left neighbour:
+ @; x += b >> 1
+ vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel x)
+ @; increment curr row pointer
+ @; updated pixel x is now pixel a
+ beq DONE
+avg_filter_8bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne below
+
+ @; here d0 = pixel a (left, reconstructed)
+ vld1.8 {d2},[r2] @; load 8 bytes (pixel x) from curr
+ @; row into d2
+ vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev
+ @; row into d1
+ vhadd.u8 d1,d0,d1 @; d1 = (a + b) >> 1 in one halving add;
+ @; the sum is not truncated before the shift
+ vadd.i8 d0,d2,d1 @; d0 = x + ((a + b)/2)
+ vst1.8 {d0},[r2]! @; store 8 bytes (updated pixel x)
+ @; increment curr row pointer
+ bne avg_filter_8bpp_loop
+
+ b DONE @; exit loop when
+ @; loop counter == rowbytes
+ #;; -----------------
+ #;; PAETH filter type
+ #;; -----------------
+paeth_filter:
+
+ vpush {q4-q7} @; the PAETH loops use q4-q7 as scratch
+ add r12,r1,#7 @; bpp = bytes per pixel
+ lsr r12,r12,#3 @; = (pixel_depth + 7) >> 3
+ mov r1,r12 @; counter starts at bpp (one pixel)
+
+ #;; r0 = rowbytes
+ #;; r1 = byte counter, starts at bpp
+ #;; r2 = row pointer
+ #;; r3 = previous row pointer
+ #;; r12 = bpp = counter/pointer step
+
+ @; jump to the loop written for this pixel size
+ cmp r12,#8
+ beq paeth_filter_8bpp
+
+ cmp r12,#6
+ beq paeth_filter_6bpp
+
+ cmp r12,#4
+ beq paeth_filter_4bpp
+
+ cmp r12,#3
+ beq paeth_filter_3bpp
+
+ cmp r12,#2
+ beq paeth_filter_2bpp
+
+ cmp r12,#1
+ beq paeth_filter_1bpp
+ @; any other bpp: the row is returned untouched
+paeth_filter_exit:
+ b paeth_filter_DONE @; restores q4-q7 and returns
+
+ #;; ------------------------------
+ #;; PAETH filter, 1 byte per pixel
+ #;; ------------------------------
+paeth_filter_1bpp:
+ @; first pixel: only the pixel above (b) is added to x
+ cmp r1, r0 @; Z = single-pixel row, consumed by the beq below
+
+ vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr
+ @; row into d0[0]
+ vld1.8 {d1[0]},[r3]! @; load 1 byte (pixel b) from prev
+ @; row into d1[0]
+ @; increment prev row pointer
+ vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
+ vst1.8 {d2[0]},[r2]! @; store 1 byte (updated pixel x)
+ @; increment curr row pointer
+
+ beq paeth_filter_DONE
+
+paeth_filter_1bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne at the end of the loop
+ @; p = whichever of a, b, c has the smallest distance
+ @; pa, pb, pc (ties go to a, then b); then x += p
+ #;; d1[0] = c (b in the previous loop iteration)
+ #;; d2[0] = a (x in the previous loop iteration)
+ vld1.8 {d3[0]},[r3]! @; load 1 byte (pixel b) from prev
+ @; row into d3[0]
+ vld1.8 {d0[0]},[r2] @; load 1 byte (pixel x) from curr
+ @; row into d0[0]
+ vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
+ vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
+ vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
+ vaddl.u8 q5,d2,d3 @; q5 = a + b
+ vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
+
+ vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
+ vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
+ vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
+ vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
+ vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
+ vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
+ @; masks are now one byte per lane (0xFF / 0x00)
+ vand d2,d2,d10 @; d2 = a where 1, 0 where 0
+ vbsl d14,d3,d1 @; d14 = b where 1, c where 0
+ vmvn d10,d10 @; invert d10
+ vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
+ vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
+ vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
+ vmov d1,d3 @; d1 = b (c for next iteration)
+ vst1.8 {d2[0]},[r2]! @; store 1 byte (updated pixel x)
+
+
+ bne paeth_filter_1bpp_loop
+
+ b paeth_filter_DONE @; exit loop when
+ @; loop counter == rowbytes
+ #;; -------------------------------
+ #;; PAETH filter, 2 bytes per pixel
+ #;; -------------------------------
+paeth_filter_2bpp:
+ @; first pixel: only the pixel above (b) is added to x
+ cmp r1, r0 @; Z = single-pixel row, consumed by the beq below
+
+ vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr
+ @; row into d0[0]
+ vld1.16 {d1[0]},[r3]! @; load 2 bytes (pixel b) from prev
+ @; row into d1[0]
+ @; increment prev row pointer
+ vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
+ vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
+ @; increment curr row pointer
+ beq paeth_filter_DONE
+
+paeth_filter_2bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne at the end of the loop
+ @; per byte: p = a, b or c by smallest pa, pb, pc (ties: a, then b); x += p
+ #;; d1[0] = c (b in the previous loop iteration)
+ #;; d2[0] = a (x in the previous loop iteration)
+ vld1.16 {d3[0]},[r3]! @; load 2 bytes (pixel b) from prev
+ @; row into d3[0]
+ vld1.16 {d0[0]},[r2] @; load 2 bytes (pixel x) from curr
+ @; row into d0[0]
+ vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
+ vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
+ vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
+ vaddl.u8 q5,d2,d3 @; q5 = a + b
+ vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
+
+ vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
+ vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
+ vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
+ vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
+ vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
+ vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
+ @; masks are now one byte per lane (0xFF / 0x00)
+ vand d2,d2,d10 @; d2 = a where 1, 0 where 0
+ vbsl d14,d3,d1 @; d14 = b where 1, c where 0
+ vmvn d10,d10 @; invert d10
+ vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
+ vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
+ vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
+ vmov d1,d3 @; d1 = b (c for next iteration)
+ vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
+ @; increment curr row pointer
+ bne paeth_filter_2bpp_loop
+
+ b paeth_filter_DONE @; exit loop when
+ @; loop counter == rowbytes
+ #;; -------------------------------
+ #;; PAETH filter, 3 bytes per pixel
+ #;; -------------------------------
+paeth_filter_3bpp:
+ @; first pixel: only the pixel above (b) is added to x
+ cmp r1, r0 @; Z = single-pixel row, consumed by the beq below
+
+ vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra
+ @; byte) from curr row into d0[0]
+ vld1.32 {d1[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
+ @; byte) from prev row into d1[0]
+ @; increment prev row pointer
+ vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
+ vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
+ @; increment curr row pointer
+ vst1.8 {d2[2]},[r2]! @; store 1 byte (updated pixel x)
+ @; increment curr row pointer
+ beq paeth_filter_DONE
+
+paeth_filter_3bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne at the end of the loop
+ @; per byte: p = a, b or c by smallest pa, pb, pc (ties: a, then b); x += p
+ @; NOTE(review): the extra byte of the last loads lies past rowbytes
+ #;; d1[0] = c (b in the previous loop iteration)
+ #;; d2[0] = a (x in the previous loop iteration)
+ vld1.32 {d3[0]},[r3],r12 @; load 4 bytes (pixel b + 1 extra
+ @; byte) from prev row into d3[0]
+ vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x + 1 extra
+ @; byte) from curr row into d0[0]
+ vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
+ vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
+ vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
+ vaddl.u8 q5,d2,d3 @; q5 = a + b
+ vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
+ @
+ vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
+ vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
+ vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
+ vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
+ vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
+ vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
+ @; masks are now one byte per lane (0xFF / 0x00)
+ vand d2,d2,d10 @; d2 = a where 1, 0 where 0
+ vbsl d14,d3,d1 @; d14 = b where 1, c where 0
+ vmvn d10,d10 @; invert d10
+ vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
+ vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
+ vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
+ vmov d1,d3 @; d1 = b (c for next iteration)
+ vst1.16 {d2[0]},[r2]! @; store 2 bytes (updated pixel x)
+ @; increment curr row pointer
+ vst1.8 {d2[2]},[r2]! @; store 1 byte (updated pixel x)
+ @; increment curr row pointer
+ bne paeth_filter_3bpp_loop
+
+ b paeth_filter_DONE @; exit loop when
+ @; loop counter == rowbytes
+ #;; -------------------------------
+ #;; PAETH filter, 4 bytes per pixel
+ #;; -------------------------------
+paeth_filter_4bpp:
+ @; first pixel: only the pixel above (b) is added to x
+ cmp r1, r0 @; Z = single-pixel row, consumed by the beq below
+
+ vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr
+ @; row into d0[0]
+ vld1.32 {d1[0]},[r3]! @; load 4 bytes (pixel b) from prev
+ @; row into d1[0]
+ @; increment prev row pointer
+ vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
+ vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
+ @; increment curr row pointer
+ beq paeth_filter_DONE
+
+paeth_filter_4bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne at the end of the loop
+ @; per byte: p = a, b or c by smallest pa, pb, pc (ties: a, then b); x += p
+
+ #;; d1[0] = c (b in the previous loop iteration)
+ #;; d2[0] = a (x in the previous loop iteration)
+ vld1.32 {d3[0]},[r3]! @; load 4 bytes (pixel b) from prev
+ @; row into d3[0]
+ vld1.32 {d0[0]},[r2] @; load 4 bytes (pixel x) from curr
+ @; row into d0[0]
+ vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
+ vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
+ vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
+ vaddl.u8 q5,d2,d3 @; q5 = a + b
+ vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
+ @
+ vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
+ vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
+ vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
+ vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
+ vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
+ vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
+ @; masks are now one byte per lane (0xFF / 0x00)
+ vand d2,d2,d10 @; d2 = a where 1, 0 where 0
+ vbsl d14,d3,d1 @; d14 = b where 1, c where 0
+ vmvn d10,d10 @; invert d10
+ vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
+ vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
+ vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
+ vmov d1,d3 @; d1 = b (c for next iteration)
+ vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
+ @; increment curr row pointer
+ bne paeth_filter_4bpp_loop
+
+ b paeth_filter_DONE @; exit loop when
+ @; loop counter == rowbytes
+ #;; -------------------------------
+ #;; PAETH filter, 6 bytes per pixel
+ #;; -------------------------------
+paeth_filter_6bpp:
+ cmp r1, r0 @; Z = single-pixel row, consumed by the beq below
+ @; first pixel: only the pixel above (b) is added to x
+ vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra
+ @; bytes) from curr row into d0
+ vld1.8 {d1},[r3],r12 @; load 8 bytes (pixel b + 2 extra
+ @; bytes) from prev row into d1
+ @; increment prev row pointer
+ vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
+ vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
+ @; increment curr row pointer
+ vst1.16 {d2[2]},[r2]! @; store 2 bytes (updated pixel x)
+ @; increment curr row pointer
+ beq paeth_filter_DONE
+
+paeth_filter_6bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne at the end of the loop
+ @; per byte: p = a, b or c by smallest pa, pb, pc (ties: a, then b); x += p
+ @; NOTE(review): the 2 extra bytes of the last loads lie past rowbytes
+ #;; d1 = c (b in the previous loop iteration)
+ #;; d2 = a (x in the previous loop iteration)
+ vld1.8 {d3},[r3],r12 @; load 8 bytes (pixel b + 2 extra
+ @; bytes) from prev row into d3
+ vld1.8 {d0},[r2] @; load 8 bytes (pixel x + 2 extra
+ @; bytes) from curr row into d0
+ vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
+ vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
+ vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
+ vaddl.u8 q5,d2,d3 @; q5 = a + b
+ vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
+
+ vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
+ vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
+ vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
+ vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
+ vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
+ vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
+ @; masks are now one byte per lane (0xFF / 0x00)
+ vand d2,d2,d10 @; d2 = a where 1, 0 where 0
+ vbsl d14,d3,d1 @; d14 = b where 1, c where 0
+ vmvn d10,d10 @; invert d10
+ vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
+ vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
+ vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
+ vmov d1,d3 @; d1 = b (c for next iteration)
+ vst1.32 {d2[0]},[r2]! @; store 4 bytes (updated pixel x)
+ @; increment curr row pointer
+ vst1.16 {d2[2]},[r2]! @; store 2 bytes (updated pixel x)
+ @; increment curr row pointer
+ bne paeth_filter_6bpp_loop
+
+ b paeth_filter_DONE @; exit loop when
+ @; loop counter == rowbytes
+ #;; -------------------------------
+ #;; PAETH filter, 8 bytes per pixel
+ #;; -------------------------------
+paeth_filter_8bpp:
+ cmp r1, r0 @; Z = single-pixel row, consumed by the beq below
+ @; first pixel: only the pixel above (b) is added to x
+ vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr
+ @; row into d0
+ vld1.8 {d1},[r3]! @; load 8 bytes (pixel b) from prev
+ @; row into d1
+ @; increment prev row pointer
+ vadd.i8 d2,d0,d1 @; d2 = x + b = updated pixel x
+ vst1.8 {d2},[r2]! @; store 8 bytes (updated pixel x)
+ @; increment curr row pointer
+ beq paeth_filter_DONE
+
+paeth_filter_8bpp_loop:
+ add r1,r1,r12 @; loop counter += bpp
+ cmp r1,r0 @; flags are consumed by the bne at the end of the loop
+ @; per byte: p = a, b or c by smallest pa, pb, pc (ties: a, then b); x += p
+
+ #;; d1 = c (b in the previous loop iteration)
+ #;; d2 = a (x in the previous loop iteration)
+ vld1.8 {d3},[r3]! @; load 8 bytes (pixel b) from prev
+ @; row into d3
+ vld1.8 {d0},[r2] @; load 8 bytes (pixel x) from curr
+ @; row into d0
+ vshll.u8 q4,d1,#1 @; q4 = c<<1 = 2c
+ vabdl.u8 q3,d2,d1 @; q3 = pb = abs(a - c)
+ vabdl.u8 q2,d3,d1 @; q2 = pa = abs(b - c)
+ vaddl.u8 q5,d2,d3 @; q5 = a + b
+ vabd.u16 q4,q5,q4 @; q4 = pc = abs(a + b - 2c)
+ @
+ vcle.s16 q5,q2,q3 @; q5 = (pa <= pb)
+ vcle.s16 q6,q2,q4 @; q6 = (pa <= pc)
+ vand q5,q5,q6 @; q5 = ((pa <= pb) && (pa <= pc))
+ vcle.s16 q7,q3,q4 @; q7 = (pb <= pc)
+ vshrn.u16 d10,q5,#8 @; d10 = ((pa <= pb) && (pa <= pc))
+ vshrn.u16 d14,q7,#8 @; d14 = (pb <= pc)
+ @; masks are now one byte per lane (0xFF / 0x00)
+ vand d2,d2,d10 @; d2 = a where 1, 0 where 0
+ vbsl d14,d3,d1 @; d14 = b where 1, c where 0
+ vmvn d10,d10 @; invert d10
+ vand d14,d14,d10 @; d14 = b/c where 1, 0 where 0
+ vadd.i8 d2,d2,d14 @; d2 = p = a/b/c where appropriate
+ vadd.i8 d2,d2,d0 @; d2 = x + p (updated pixel x)
+ vmov d1,d3 @; d1 = b (c for next iteration)
+ vst1.8 {d2},[r2]! @; store 8 bytes (updated pixel x)
+ @; increment curr row pointer
+ bne paeth_filter_8bpp_loop
+
+ b paeth_filter_DONE @; exit loop when
+ @; loop counter == rowbytes
+paeth_filter_DONE:
+ @; every PAETH path leaves through here
+ vpop {q4-q7} @; undo the vpush done on entry to paeth_filter
+ bx lr @; return to the caller
+
+DONE:
+ bx lr @; plain return for the paths that saved nothing
+
+
+.size png_read_filter_row_neon, .-png_read_filter_row_neon
+ .END