| author | Tim Redfern <tim@eclectronics.org> | 2013-09-05 17:57:22 +0100 |
|---|---|---|
| committer | Tim Redfern <tim@eclectronics.org> | 2013-09-05 17:57:22 +0100 |
| commit | 8992cb1d0d07edc33d274f6d7924ecdf6f83d994 (patch) | |
| tree | 3a2c86846b7eec8137c1507e623fc7018f13d453 /ffmpeg/libavcodec/x86/pngdsp.asm | |
| parent | 741fb4b9e135cfb161a749db88713229038577bb (diff) | |
making act segmenter
Diffstat (limited to 'ffmpeg/libavcodec/x86/pngdsp.asm')
| -rw-r--r-- | ffmpeg/libavcodec/x86/pngdsp.asm | 173 |
1 file changed, 173 insertions, 0 deletions
```diff
diff --git a/ffmpeg/libavcodec/x86/pngdsp.asm b/ffmpeg/libavcodec/x86/pngdsp.asm
new file mode 100644
index 0000000..c05f3da
--- /dev/null
+++ b/ffmpeg/libavcodec/x86/pngdsp.asm
@@ -0,0 +1,173 @@
+;******************************************************************************
+;* x86 optimizations for PNG decoding
+;*
+;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
+;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
+;*
+;* This file is part of Libav.
+;*
+;* Libav is free software; you can redistribute it and/or
+;* modify it under the terms of the GNU Lesser General Public
+;* License as published by the Free Software Foundation; either
+;* version 2.1 of the License, or (at your option) any later version.
+;*
+;* Libav is distributed in the hope that it will be useful,
+;* but WITHOUT ANY WARRANTY; without even the implied warranty of
+;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+;* Lesser General Public License for more details.
+;*
+;* You should have received a copy of the GNU Lesser General Public
+;* License along with Libav; if not, write to the Free Software
+;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+;******************************************************************************
+
+%include "libavutil/x86/x86util.asm"
+
+SECTION_RODATA
+
+cextern pw_255
+
+SECTION_TEXT
+
+; %1 = nr. of xmm registers used
+%macro ADD_BYTES_FN 1
+cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
+%if ARCH_X86_64
+    movsxd             waq, wad
+%endif
+    xor                 iq, iq
+
+    ; vector loop
+    mov                 wq, waq
+    and                waq, ~(mmsize*2-1)
+    jmp .end_v
+.loop_v:
+    mova                m0, [src1q+iq]
+    mova                m1, [src1q+iq+mmsize]
+    paddb               m0, [src2q+iq]
+    paddb               m1, [src2q+iq+mmsize]
+    mova  [dstq+iq       ], m0
+    mova  [dstq+iq+mmsize], m1
+    add                 iq, mmsize*2
+.end_v:
+    cmp                 iq, waq
+    jl .loop_v
+
+%if mmsize == 16
+    ; vector loop
+    mov                waq, wq
+    and                waq, ~7
+    jmp .end_l
+.loop_l:
+    movq               mm0, [src1q+iq]
+    paddb              mm0, [src2q+iq]
+    movq         [dstq+iq], mm0
+    add                 iq, 8
+.end_l:
+    cmp                 iq, waq
+    jl .loop_l
+%endif
+
+    ; scalar loop for leftover
+    jmp .end_s
+.loop_s:
+    mov                wab, [src1q+iq]
+    add                wab, [src2q+iq]
+    mov          [dstq+iq], wab
+    inc                 iq
+.end_s:
+    cmp                 iq, wq
+    jl .loop_s
+    REP_RET
+%endmacro
+
+%if ARCH_X86_32
+INIT_MMX mmx
+ADD_BYTES_FN 0
+%endif
+
+INIT_XMM sse2
+ADD_BYTES_FN 2
+
+%macro ADD_PAETH_PRED_FN 1
+cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
+%if ARCH_X86_64
+    movsxd            bppq, bppd
+    movsxd              wq, wd
+%endif
+    lea               endq, [dstq+wq-(mmsize/2-1)]
+    sub               topq, dstq
+    sub               srcq, dstq
+    sub               dstq, bppq
+    pxor                m7, m7
+
+    PUSH              dstq
+    lea              cntrq, [bppq-1]
+    shr              cntrq, 2 + mmsize/16
+.bpp_loop:
+    lea               dstq, [dstq+cntrq*(mmsize/2)]
+    movh                m0, [dstq]
+    movh                m1, [topq+dstq]
+    punpcklbw           m0, m7
+    punpcklbw           m1, m7
+    add               dstq, bppq
+.loop:
+    mova                m2, m1
+    movh                m1, [topq+dstq]
+    mova                m3, m2
+    punpcklbw           m1, m7
+    mova                m4, m2
+    psubw               m3, m1
+    psubw               m4, m0
+    mova                m5, m3
+    paddw               m5, m4
+%if cpuflag(ssse3)
+    pabsw               m3, m3
+    pabsw               m4, m4
+    pabsw               m5, m5
+%else ; !cpuflag(ssse3)
+    psubw               m7, m5
+    pmaxsw              m5, m7
+    pxor                m6, m6
+    pxor                m7, m7
+    psubw               m6, m3
+    psubw               m7, m4
+    pmaxsw              m3, m6
+    pmaxsw              m4, m7
+    pxor                m7, m7
+%endif ; cpuflag(ssse3)
+    mova                m6, m4
+    pminsw              m6, m5
+    pcmpgtw             m3, m6
+    pcmpgtw             m4, m5
+    mova                m6, m4
+    pand                m4, m3
+    pandn               m6, m3
+    pandn               m3, m0
+    movh                m0, [srcq+dstq]
+    pand                m6, m1
+    pand                m2, m4
+    punpcklbw           m0, m7
+    paddw               m0, m6
+    paddw               m3, m2
+    paddw               m0, m3
+    pand                m0, [pw_255]
+    mova                m3, m0
+    packuswb            m3, m3
+    movh            [dstq], m3
+    add               dstq, bppq
+    cmp               dstq, endq
+    jle .loop
+
+    mov               dstq, [rsp]
+    dec              cntrq
+    jge .bpp_loop
+    POP               dstq
+    RET
+%endmacro
+
+INIT_MMX mmxext
+ADD_PAETH_PRED_FN 0
+
+INIT_MMX ssse3
+ADD_PAETH_PRED_FN 0
```
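Since the SIMD above is dense, here is a minimal scalar C sketch of what the two routines compute: `add_bytes_l2` sums two byte arrays modulo 256, and `add_png_paeth_prediction` undoes PNG filter type 4 (Paeth) on one row per the PNG specification. This is a reference reading of the code, not libavcodec's actual C implementation; the function names and the demo in `main` are illustrative, and this sketch applies the spec definition uniformly (out-of-row neighbours treated as zero), whereas the real decoder may split boundary handling between C and asm.

```c
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

/* Scalar reference for add_bytes_l2: dst[i] = src1[i] + src2[i] (mod 256).
 * The asm does the same with paddb, two mmsize-wide loads per iteration,
 * then mops up the leftover bytes in a scalar loop. */
static void add_bytes_l2_c(uint8_t *dst, const uint8_t *src1,
                           const uint8_t *src2, int w)
{
    for (int i = 0; i < w; i++)
        dst[i] = src1[i] + src2[i];
}

/* Paeth predictor from the PNG spec: pick whichever of left (a), above (b),
 * up-left (c) is closest to p = a + b - c. Note p - a = b - c,
 * p - b = a - c, and p - c = (a - c) + (b - c), which is why the asm only
 * forms the differences c - b (m3), c - a (m4), and their sum (m5) before
 * taking absolute values and comparing. */
static int paeth(int a, int b, int c)
{
    int pa = abs(b - c);         /* |p - a| */
    int pb = abs(a - c);         /* |p - b| */
    int pc = abs(a + b - 2 * c); /* |p - c| */
    if (pa <= pb && pa <= pc)
        return a;
    return pb <= pc ? b : c;
}

/* Scalar reference for add_png_paeth_prediction: reconstruct one row from
 * the filtered bytes in src and the reconstructed row above in top. The
 * implicit uint8_t truncation plays the role of `pand m0, [pw_255]`. */
static void add_paeth_prediction_c(uint8_t *dst, const uint8_t *src,
                                   const uint8_t *top, int w, int bpp)
{
    for (int i = 0; i < w; i++) {
        int a = i >= bpp ? dst[i - bpp] : 0;
        int b = top[i];
        int c = i >= bpp ? top[i - bpp] : 0;
        dst[i] = src[i] + paeth(a, b, c);
    }
}

int main(void)
{
    /* Smoke test: unfilter one 4-byte row of grayscale (bpp = 1) pixels. */
    uint8_t top[4]  = { 10, 20, 30, 40 };
    uint8_t filt[4] = { 1, 2, 3, 4 };
    uint8_t row[4];
    add_paeth_prediction_c(row, filt, top, 4, 1);
    for (int i = 0; i < 4; i++)
        printf("%d ", row[i]);
    printf("\n");

    uint8_t sum[4];
    add_bytes_l2_c(sum, top, filt, 4);
    printf("%d %d %d %d\n", sum[0], sum[1], sum[2], sum[3]);
    return 0;
}
```

The branchless structure of the asm follows from SIMD's lack of per-lane branching: instead of the spec's if/else chain, the predictor is selected with `pcmpgtw` masks combined via `pand`/`pandn`, and on pre-SSSE3 CPUs `pabsw` is emulated by negating with `psubw` and taking `pmaxsw` against the negation.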
