diff options
Diffstat (limited to 'compat/zlib/contrib/masmx86')
-rw-r--r-- | compat/zlib/contrib/masmx86/bld_ml32.bat | 4 | ||||
-rw-r--r-- | compat/zlib/contrib/masmx86/inffas32.asm | 2166 | ||||
-rw-r--r-- | compat/zlib/contrib/masmx86/match686.asm | 478 | ||||
-rw-r--r-- | compat/zlib/contrib/masmx86/readme.txt | 14 |
4 files changed, 1573 insertions, 1089 deletions
diff --git a/compat/zlib/contrib/masmx86/bld_ml32.bat b/compat/zlib/contrib/masmx86/bld_ml32.bat index 99144d0..fcf5755 100644 --- a/compat/zlib/contrib/masmx86/bld_ml32.bat +++ b/compat/zlib/contrib/masmx86/bld_ml32.bat @@ -1,2 +1,2 @@ -ml /coff /Zi /c /Flgvmat32.lst gvmat32.asm
-ml /coff /Zi /c /Flinffas32.lst inffas32.asm
+ml /coff /Zi /c /Flmatch686.lst match686.asm +ml /coff /Zi /c /Flinffas32.lst inffas32.asm diff --git a/compat/zlib/contrib/masmx86/inffas32.asm b/compat/zlib/contrib/masmx86/inffas32.asm index 4a20512..14f9d35 100644 --- a/compat/zlib/contrib/masmx86/inffas32.asm +++ b/compat/zlib/contrib/masmx86/inffas32.asm @@ -1,1083 +1,1083 @@ -;/* inffas32.asm is a hand tuned assembler version of inffast.c -- fast decoding
-; *
-; * inffas32.asm is derivated from inffas86.c, with translation of assembly code
-; *
-; * Copyright (C) 1995-2003 Mark Adler
-; * For conditions of distribution and use, see copyright notice in zlib.h
-; *
-; * Copyright (C) 2003 Chris Anderson <christop@charm.net>
-; * Please use the copyright conditions above.
-; *
-; * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from
-; * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at
-; * the moment. I have successfully compiled and tested this code with gcc2.96,
-; * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S
-; * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX
-; * enabled. I will attempt to merge the MMX code into this version. Newer
-; * versions of this and inffast.S can be found at
-; * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/
-; *
-; * 2005 : modification by Gilles Vollant
-; */
-; For Visual C++ 4.x and higher and ML 6.x and higher
-; ml.exe is in directory \MASM611C of Win95 DDK
-; ml.exe is also distributed in http://www.masm32.com/masmdl.htm
-; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/
-;
-;
-; compile with command line option
-; ml /coff /Zi /c /Flinffas32.lst inffas32.asm
-
-; if you define NO_GZIP (see inflate.h), compile with
-; ml /coff /Zi /c /Flinffas32.lst /DNO_GUNZIP inffas32.asm
-
-
-; zlib122sup is 0 fort zlib 1.2.2.1 and lower
-; zlib122sup is 8 fort zlib 1.2.2.2 and more (with addition of dmax and head
-; in inflate_state in inflate.h)
-zlib1222sup equ 8
-
-
-IFDEF GUNZIP
- INFLATE_MODE_TYPE equ 11
- INFLATE_MODE_BAD equ 26
-ELSE
- IFNDEF NO_GUNZIP
- INFLATE_MODE_TYPE equ 11
- INFLATE_MODE_BAD equ 26
- ELSE
- INFLATE_MODE_TYPE equ 3
- INFLATE_MODE_BAD equ 17
- ENDIF
-ENDIF
-
-
-; 75 "inffast.S"
-;FILE "inffast.S"
-
-;;;GLOBAL _inflate_fast
-
-;;;SECTION .text
-
-
-
- .586p
- .mmx
-
- name inflate_fast_x86
- .MODEL FLAT
-
-_DATA segment
-inflate_fast_use_mmx:
- dd 1
-
-
-_TEXT segment
-PUBLIC _inflate_fast
-
-ALIGN 4
-_inflate_fast:
- jmp inflate_fast_entry
-
-
-
-ALIGN 4
- db 'Fast decoding Code from Chris Anderson'
- db 0
-
-ALIGN 4
-invalid_literal_length_code_msg:
- db 'invalid literal/length code'
- db 0
-
-ALIGN 4
-invalid_distance_code_msg:
- db 'invalid distance code'
- db 0
-
-ALIGN 4
-invalid_distance_too_far_msg:
- db 'invalid distance too far back'
- db 0
-
-
-ALIGN 4
-inflate_fast_mask:
-dd 0
-dd 1
-dd 3
-dd 7
-dd 15
-dd 31
-dd 63
-dd 127
-dd 255
-dd 511
-dd 1023
-dd 2047
-dd 4095
-dd 8191
-dd 16383
-dd 32767
-dd 65535
-dd 131071
-dd 262143
-dd 524287
-dd 1048575
-dd 2097151
-dd 4194303
-dd 8388607
-dd 16777215
-dd 33554431
-dd 67108863
-dd 134217727
-dd 268435455
-dd 536870911
-dd 1073741823
-dd 2147483647
-dd 4294967295
-
-
-mode_state equ 0 ;/* state->mode */
-wsize_state equ (32+zlib1222sup) ;/* state->wsize */
-write_state equ (36+4+zlib1222sup) ;/* state->write */
-window_state equ (40+4+zlib1222sup) ;/* state->window */
-hold_state equ (44+4+zlib1222sup) ;/* state->hold */
-bits_state equ (48+4+zlib1222sup) ;/* state->bits */
-lencode_state equ (64+4+zlib1222sup) ;/* state->lencode */
-distcode_state equ (68+4+zlib1222sup) ;/* state->distcode */
-lenbits_state equ (72+4+zlib1222sup) ;/* state->lenbits */
-distbits_state equ (76+4+zlib1222sup) ;/* state->distbits */
-
-
-;;SECTION .text
-; 205 "inffast.S"
-;GLOBAL inflate_fast_use_mmx
-
-;SECTION .data
-
-
-; GLOBAL inflate_fast_use_mmx:object
-;.size inflate_fast_use_mmx, 4
-; 226 "inffast.S"
-;SECTION .text
-
-ALIGN 4
-inflate_fast_entry:
- push edi
- push esi
- push ebp
- push ebx
- pushfd
- sub esp,64
- cld
-
-
-
-
- mov esi, [esp+88]
- mov edi, [esi+28]
-
-
-
-
-
-
-
- mov edx, [esi+4]
- mov eax, [esi+0]
-
- add edx,eax
- sub edx,11
-
- mov [esp+44],eax
- mov [esp+20],edx
-
- mov ebp, [esp+92]
- mov ecx, [esi+16]
- mov ebx, [esi+12]
-
- sub ebp,ecx
- neg ebp
- add ebp,ebx
-
- sub ecx,257
- add ecx,ebx
-
- mov [esp+60],ebx
- mov [esp+40],ebp
- mov [esp+16],ecx
-; 285 "inffast.S"
- mov eax, [edi+lencode_state]
- mov ecx, [edi+distcode_state]
-
- mov [esp+8],eax
- mov [esp+12],ecx
-
- mov eax,1
- mov ecx, [edi+lenbits_state]
- shl eax,cl
- dec eax
- mov [esp+0],eax
-
- mov eax,1
- mov ecx, [edi+distbits_state]
- shl eax,cl
- dec eax
- mov [esp+4],eax
-
- mov eax, [edi+wsize_state]
- mov ecx, [edi+write_state]
- mov edx, [edi+window_state]
-
- mov [esp+52],eax
- mov [esp+48],ecx
- mov [esp+56],edx
-
- mov ebp, [edi+hold_state]
- mov ebx, [edi+bits_state]
-; 321 "inffast.S"
- mov esi, [esp+44]
- mov ecx, [esp+20]
- cmp ecx,esi
- ja L_align_long
-
- add ecx,11
- sub ecx,esi
- mov eax,12
- sub eax,ecx
- lea edi, [esp+28]
- rep movsb
- mov ecx,eax
- xor eax,eax
- rep stosb
- lea esi, [esp+28]
- mov [esp+20],esi
- jmp L_is_aligned
-
-
-L_align_long:
- test esi,3
- jz L_is_aligned
- xor eax,eax
- mov al, [esi]
- inc esi
- mov ecx,ebx
- add ebx,8
- shl eax,cl
- or ebp,eax
- jmp L_align_long
-
-L_is_aligned:
- mov edi, [esp+60]
-; 366 "inffast.S"
-L_check_mmx:
- cmp dword ptr [inflate_fast_use_mmx],2
- je L_init_mmx
- ja L_do_loop
-
- push eax
- push ebx
- push ecx
- push edx
- pushfd
- mov eax, [esp]
- xor dword ptr [esp],0200000h
-
-
-
-
- popfd
- pushfd
- pop edx
- xor edx,eax
- jz L_dont_use_mmx
- xor eax,eax
- cpuid
- cmp ebx,0756e6547h
- jne L_dont_use_mmx
- cmp ecx,06c65746eh
- jne L_dont_use_mmx
- cmp edx,049656e69h
- jne L_dont_use_mmx
- mov eax,1
- cpuid
- shr eax,8
- and eax,15
- cmp eax,6
- jne L_dont_use_mmx
- test edx,0800000h
- jnz L_use_mmx
- jmp L_dont_use_mmx
-L_use_mmx:
- mov dword ptr [inflate_fast_use_mmx],2
- jmp L_check_mmx_pop
-L_dont_use_mmx:
- mov dword ptr [inflate_fast_use_mmx],3
-L_check_mmx_pop:
- pop edx
- pop ecx
- pop ebx
- pop eax
- jmp L_check_mmx
-; 426 "inffast.S"
-ALIGN 4
-L_do_loop:
-; 437 "inffast.S"
- cmp bl,15
- ja L_get_length_code
-
- xor eax,eax
- lodsw
- mov cl,bl
- add bl,16
- shl eax,cl
- or ebp,eax
-
-L_get_length_code:
- mov edx, [esp+0]
- mov ecx, [esp+8]
- and edx,ebp
- mov eax, [ecx+edx*4]
-
-L_dolen:
-
-
-
-
-
-
- mov cl,ah
- sub bl,ah
- shr ebp,cl
-
-
-
-
-
-
- test al,al
- jnz L_test_for_length_base
-
- shr eax,16
- stosb
-
-L_while_test:
-
-
- cmp [esp+16],edi
- jbe L_break_loop
-
- cmp [esp+20],esi
- ja L_do_loop
- jmp L_break_loop
-
-L_test_for_length_base:
-; 502 "inffast.S"
- mov edx,eax
- shr edx,16
- mov cl,al
-
- test al,16
- jz L_test_for_second_level_length
- and cl,15
- jz L_save_len
- cmp bl,cl
- jae L_add_bits_to_len
-
- mov ch,cl
- xor eax,eax
- lodsw
- mov cl,bl
- add bl,16
- shl eax,cl
- or ebp,eax
- mov cl,ch
-
-L_add_bits_to_len:
- mov eax,1
- shl eax,cl
- dec eax
- sub bl,cl
- and eax,ebp
- shr ebp,cl
- add edx,eax
-
-L_save_len:
- mov [esp+24],edx
-
-
-L_decode_distance:
-; 549 "inffast.S"
- cmp bl,15
- ja L_get_distance_code
-
- xor eax,eax
- lodsw
- mov cl,bl
- add bl,16
- shl eax,cl
- or ebp,eax
-
-L_get_distance_code:
- mov edx, [esp+4]
- mov ecx, [esp+12]
- and edx,ebp
- mov eax, [ecx+edx*4]
-
-
-L_dodist:
- mov edx,eax
- shr edx,16
- mov cl,ah
- sub bl,ah
- shr ebp,cl
-; 584 "inffast.S"
- mov cl,al
-
- test al,16
- jz L_test_for_second_level_dist
- and cl,15
- jz L_check_dist_one
- cmp bl,cl
- jae L_add_bits_to_dist
-
- mov ch,cl
- xor eax,eax
- lodsw
- mov cl,bl
- add bl,16
- shl eax,cl
- or ebp,eax
- mov cl,ch
-
-L_add_bits_to_dist:
- mov eax,1
- shl eax,cl
- dec eax
- sub bl,cl
- and eax,ebp
- shr ebp,cl
- add edx,eax
- jmp L_check_window
-
-L_check_window:
-; 625 "inffast.S"
- mov [esp+44],esi
- mov eax,edi
- sub eax, [esp+40]
-
- cmp eax,edx
- jb L_clip_window
-
- mov ecx, [esp+24]
- mov esi,edi
- sub esi,edx
-
- sub ecx,3
- mov al, [esi]
- mov [edi],al
- mov al, [esi+1]
- mov dl, [esi+2]
- add esi,3
- mov [edi+1],al
- mov [edi+2],dl
- add edi,3
- rep movsb
-
- mov esi, [esp+44]
- jmp L_while_test
-
-ALIGN 4
-L_check_dist_one:
- cmp edx,1
- jne L_check_window
- cmp [esp+40],edi
- je L_check_window
-
- dec edi
- mov ecx, [esp+24]
- mov al, [edi]
- sub ecx,3
-
- mov [edi+1],al
- mov [edi+2],al
- mov [edi+3],al
- add edi,4
- rep stosb
-
- jmp L_while_test
-
-ALIGN 4
-L_test_for_second_level_length:
-
-
-
-
- test al,64
- jnz L_test_for_end_of_block
-
- mov eax,1
- shl eax,cl
- dec eax
- and eax,ebp
- add eax,edx
- mov edx, [esp+8]
- mov eax, [edx+eax*4]
- jmp L_dolen
-
-ALIGN 4
-L_test_for_second_level_dist:
-
-
-
-
- test al,64
- jnz L_invalid_distance_code
-
- mov eax,1
- shl eax,cl
- dec eax
- and eax,ebp
- add eax,edx
- mov edx, [esp+12]
- mov eax, [edx+eax*4]
- jmp L_dodist
-
-ALIGN 4
-L_clip_window:
-; 721 "inffast.S"
- mov ecx,eax
- mov eax, [esp+52]
- neg ecx
- mov esi, [esp+56]
-
- cmp eax,edx
- jb L_invalid_distance_too_far
-
- add ecx,edx
- cmp dword ptr [esp+48],0
- jne L_wrap_around_window
-
- sub eax,ecx
- add esi,eax
-; 749 "inffast.S"
- mov eax, [esp+24]
- cmp eax,ecx
- jbe L_do_copy1
-
- sub eax,ecx
- rep movsb
- mov esi,edi
- sub esi,edx
- jmp L_do_copy1
-
- cmp eax,ecx
- jbe L_do_copy1
-
- sub eax,ecx
- rep movsb
- mov esi,edi
- sub esi,edx
- jmp L_do_copy1
-
-L_wrap_around_window:
-; 793 "inffast.S"
- mov eax, [esp+48]
- cmp ecx,eax
- jbe L_contiguous_in_window
-
- add esi, [esp+52]
- add esi,eax
- sub esi,ecx
- sub ecx,eax
-
-
- mov eax, [esp+24]
- cmp eax,ecx
- jbe L_do_copy1
-
- sub eax,ecx
- rep movsb
- mov esi, [esp+56]
- mov ecx, [esp+48]
- cmp eax,ecx
- jbe L_do_copy1
-
- sub eax,ecx
- rep movsb
- mov esi,edi
- sub esi,edx
- jmp L_do_copy1
-
-L_contiguous_in_window:
-; 836 "inffast.S"
- add esi,eax
- sub esi,ecx
-
-
- mov eax, [esp+24]
- cmp eax,ecx
- jbe L_do_copy1
-
- sub eax,ecx
- rep movsb
- mov esi,edi
- sub esi,edx
-
-L_do_copy1:
-; 862 "inffast.S"
- mov ecx,eax
- rep movsb
-
- mov esi, [esp+44]
- jmp L_while_test
-; 878 "inffast.S"
-ALIGN 4
-L_init_mmx:
- emms
-
-
-
-
-
- movd mm0,ebp
- mov ebp,ebx
-; 896 "inffast.S"
- movd mm4,[esp+0]
- movq mm3,mm4
- movd mm5,[esp+4]
- movq mm2,mm5
- pxor mm1,mm1
- mov ebx, [esp+8]
- jmp L_do_loop_mmx
-
-ALIGN 4
-L_do_loop_mmx:
- psrlq mm0,mm1
-
- cmp ebp,32
- ja L_get_length_code_mmx
-
- movd mm6,ebp
- movd mm7,[esi]
- add esi,4
- psllq mm7,mm6
- add ebp,32
- por mm0,mm7
-
-L_get_length_code_mmx:
- pand mm4,mm0
- movd eax,mm4
- movq mm4,mm3
- mov eax, [ebx+eax*4]
-
-L_dolen_mmx:
- movzx ecx,ah
- movd mm1,ecx
- sub ebp,ecx
-
- test al,al
- jnz L_test_for_length_base_mmx
-
- shr eax,16
- stosb
-
-L_while_test_mmx:
-
-
- cmp [esp+16],edi
- jbe L_break_loop
-
- cmp [esp+20],esi
- ja L_do_loop_mmx
- jmp L_break_loop
-
-L_test_for_length_base_mmx:
-
- mov edx,eax
- shr edx,16
-
- test al,16
- jz L_test_for_second_level_length_mmx
- and eax,15
- jz L_decode_distance_mmx
-
- psrlq mm0,mm1
- movd mm1,eax
- movd ecx,mm0
- sub ebp,eax
- and ecx, [inflate_fast_mask+eax*4]
- add edx,ecx
-
-L_decode_distance_mmx:
- psrlq mm0,mm1
-
- cmp ebp,32
- ja L_get_dist_code_mmx
-
- movd mm6,ebp
- movd mm7,[esi]
- add esi,4
- psllq mm7,mm6
- add ebp,32
- por mm0,mm7
-
-L_get_dist_code_mmx:
- mov ebx, [esp+12]
- pand mm5,mm0
- movd eax,mm5
- movq mm5,mm2
- mov eax, [ebx+eax*4]
-
-L_dodist_mmx:
-
- movzx ecx,ah
- mov ebx,eax
- shr ebx,16
- sub ebp,ecx
- movd mm1,ecx
-
- test al,16
- jz L_test_for_second_level_dist_mmx
- and eax,15
- jz L_check_dist_one_mmx
-
-L_add_bits_to_dist_mmx:
- psrlq mm0,mm1
- movd mm1,eax
- movd ecx,mm0
- sub ebp,eax
- and ecx, [inflate_fast_mask+eax*4]
- add ebx,ecx
-
-L_check_window_mmx:
- mov [esp+44],esi
- mov eax,edi
- sub eax, [esp+40]
-
- cmp eax,ebx
- jb L_clip_window_mmx
-
- mov ecx,edx
- mov esi,edi
- sub esi,ebx
-
- sub ecx,3
- mov al, [esi]
- mov [edi],al
- mov al, [esi+1]
- mov dl, [esi+2]
- add esi,3
- mov [edi+1],al
- mov [edi+2],dl
- add edi,3
- rep movsb
-
- mov esi, [esp+44]
- mov ebx, [esp+8]
- jmp L_while_test_mmx
-
-ALIGN 4
-L_check_dist_one_mmx:
- cmp ebx,1
- jne L_check_window_mmx
- cmp [esp+40],edi
- je L_check_window_mmx
-
- dec edi
- mov ecx,edx
- mov al, [edi]
- sub ecx,3
-
- mov [edi+1],al
- mov [edi+2],al
- mov [edi+3],al
- add edi,4
- rep stosb
-
- mov ebx, [esp+8]
- jmp L_while_test_mmx
-
-ALIGN 4
-L_test_for_second_level_length_mmx:
- test al,64
- jnz L_test_for_end_of_block
-
- and eax,15
- psrlq mm0,mm1
- movd ecx,mm0
- and ecx, [inflate_fast_mask+eax*4]
- add ecx,edx
- mov eax, [ebx+ecx*4]
- jmp L_dolen_mmx
-
-ALIGN 4
-L_test_for_second_level_dist_mmx:
- test al,64
- jnz L_invalid_distance_code
-
- and eax,15
- psrlq mm0,mm1
- movd ecx,mm0
- and ecx, [inflate_fast_mask+eax*4]
- mov eax, [esp+12]
- add ecx,ebx
- mov eax, [eax+ecx*4]
- jmp L_dodist_mmx
-
-ALIGN 4
-L_clip_window_mmx:
-
- mov ecx,eax
- mov eax, [esp+52]
- neg ecx
- mov esi, [esp+56]
-
- cmp eax,ebx
- jb L_invalid_distance_too_far
-
- add ecx,ebx
- cmp dword ptr [esp+48],0
- jne L_wrap_around_window_mmx
-
- sub eax,ecx
- add esi,eax
-
- cmp edx,ecx
- jbe L_do_copy1_mmx
-
- sub edx,ecx
- rep movsb
- mov esi,edi
- sub esi,ebx
- jmp L_do_copy1_mmx
-
- cmp edx,ecx
- jbe L_do_copy1_mmx
-
- sub edx,ecx
- rep movsb
- mov esi,edi
- sub esi,ebx
- jmp L_do_copy1_mmx
-
-L_wrap_around_window_mmx:
-
- mov eax, [esp+48]
- cmp ecx,eax
- jbe L_contiguous_in_window_mmx
-
- add esi, [esp+52]
- add esi,eax
- sub esi,ecx
- sub ecx,eax
-
-
- cmp edx,ecx
- jbe L_do_copy1_mmx
-
- sub edx,ecx
- rep movsb
- mov esi, [esp+56]
- mov ecx, [esp+48]
- cmp edx,ecx
- jbe L_do_copy1_mmx
-
- sub edx,ecx
- rep movsb
- mov esi,edi
- sub esi,ebx
- jmp L_do_copy1_mmx
-
-L_contiguous_in_window_mmx:
-
- add esi,eax
- sub esi,ecx
-
-
- cmp edx,ecx
- jbe L_do_copy1_mmx
-
- sub edx,ecx
- rep movsb
- mov esi,edi
- sub esi,ebx
-
-L_do_copy1_mmx:
-
-
- mov ecx,edx
- rep movsb
-
- mov esi, [esp+44]
- mov ebx, [esp+8]
- jmp L_while_test_mmx
-; 1174 "inffast.S"
-L_invalid_distance_code:
-
-
-
-
-
- mov ecx, invalid_distance_code_msg
- mov edx,INFLATE_MODE_BAD
- jmp L_update_stream_state
-
-L_test_for_end_of_block:
-
-
-
-
-
- test al,32
- jz L_invalid_literal_length_code
-
- mov ecx,0
- mov edx,INFLATE_MODE_TYPE
- jmp L_update_stream_state
-
-L_invalid_literal_length_code:
-
-
-
-
-
- mov ecx, invalid_literal_length_code_msg
- mov edx,INFLATE_MODE_BAD
- jmp L_update_stream_state
-
-L_invalid_distance_too_far:
-
-
-
- mov esi, [esp+44]
- mov ecx, invalid_distance_too_far_msg
- mov edx,INFLATE_MODE_BAD
- jmp L_update_stream_state
-
-L_update_stream_state:
-
- mov eax, [esp+88]
- test ecx,ecx
- jz L_skip_msg
- mov [eax+24],ecx
-L_skip_msg:
- mov eax, [eax+28]
- mov [eax+mode_state],edx
- jmp L_break_loop
-
-ALIGN 4
-L_break_loop:
-; 1243 "inffast.S"
- cmp dword ptr [inflate_fast_use_mmx],2
- jne L_update_next_in
-
-
-
- mov ebx,ebp
-
-L_update_next_in:
-; 1266 "inffast.S"
- mov eax, [esp+88]
- mov ecx,ebx
- mov edx, [eax+28]
- shr ecx,3
- sub esi,ecx
- shl ecx,3
- sub ebx,ecx
- mov [eax+12],edi
- mov [edx+bits_state],ebx
- mov ecx,ebx
-
- lea ebx, [esp+28]
- cmp [esp+20],ebx
- jne L_buf_not_used
-
- sub esi,ebx
- mov ebx, [eax+0]
- mov [esp+20],ebx
- add esi,ebx
- mov ebx, [eax+4]
- sub ebx,11
- add [esp+20],ebx
-
-L_buf_not_used:
- mov [eax+0],esi
-
- mov ebx,1
- shl ebx,cl
- dec ebx
-
-
-
-
-
- cmp dword ptr [inflate_fast_use_mmx],2
- jne L_update_hold
-
-
-
- psrlq mm0,mm1
- movd ebp,mm0
-
- emms
-
-L_update_hold:
-
-
-
- and ebp,ebx
- mov [edx+hold_state],ebp
-
-
-
-
- mov ebx, [esp+20]
- cmp ebx,esi
- jbe L_last_is_smaller
-
- sub ebx,esi
- add ebx,11
- mov [eax+4],ebx
- jmp L_fixup_out
-L_last_is_smaller:
- sub esi,ebx
- neg esi
- add esi,11
- mov [eax+4],esi
-
-
-
-
-L_fixup_out:
-
- mov ebx, [esp+16]
- cmp ebx,edi
- jbe L_end_is_smaller
-
- sub ebx,edi
- add ebx,257
- mov [eax+16],ebx
- jmp L_done
-L_end_is_smaller:
- sub edi,ebx
- neg edi
- add edi,257
- mov [eax+16],edi
-
-
-
-
-
-L_done:
- add esp,64
- popfd
- pop ebx
- pop ebp
- pop esi
- pop edi
- ret
-
-_TEXT ends
-end
+;/* inffas32.asm is a hand tuned assembler version of inffast.c -- fast decoding +; * +; * inffas32.asm is derivated from inffas86.c, with translation of assembly code +; * +; * Copyright (C) 1995-2003 Mark Adler +; * For conditions of distribution and use, see copyright notice in zlib.h +; * +; * Copyright (C) 2003 Chris Anderson <christop@charm.net> +; * Please use the copyright conditions above. +; * +; * Mar-13-2003 -- Most of this is derived from inffast.S which is derived from +; * the gcc -S output of zlib-1.2.0/inffast.c. Zlib-1.2.0 is in beta release at +; * the moment. I have successfully compiled and tested this code with gcc2.96, +; * gcc3.2, icc5.0, msvc6.0. It is very close to the speed of inffast.S +; * compiled with gcc -DNO_MMX, but inffast.S is still faster on the P3 with MMX +; * enabled. I will attempt to merge the MMX code into this version. Newer +; * versions of this and inffast.S can be found at +; * http://www.eetbeetee.com/zlib/ and http://www.charm.net/~christop/zlib/ +; * +; * 2005 : modification by Gilles Vollant +; */ +; For Visual C++ 4.x and higher and ML 6.x and higher +; ml.exe is in directory \MASM611C of Win95 DDK +; ml.exe is also distributed in http://www.masm32.com/masmdl.htm +; and in VC++2003 toolkit at http://msdn.microsoft.com/visualc/vctoolkit2003/ +; +; +; compile with command line option +; ml /coff /Zi /c /Flinffas32.lst inffas32.asm + +; if you define NO_GZIP (see inflate.h), compile with +; ml /coff /Zi /c /Flinffas32.lst /DNO_GUNZIP inffas32.asm + + +; zlib122sup is 0 fort zlib 1.2.2.1 and lower +; zlib122sup is 8 fort zlib 1.2.2.2 and more (with addition of dmax and head +; in inflate_state in inflate.h) +zlib1222sup equ 8 + + +IFDEF GUNZIP + INFLATE_MODE_TYPE equ 11 + INFLATE_MODE_BAD equ 26 +ELSE + IFNDEF NO_GUNZIP + INFLATE_MODE_TYPE equ 11 + INFLATE_MODE_BAD equ 26 + ELSE + INFLATE_MODE_TYPE equ 3 + INFLATE_MODE_BAD equ 17 + ENDIF +ENDIF + + +; 75 "inffast.S" +;FILE "inffast.S" + +;;;GLOBAL _inflate_fast + +;;;SECTION .text + + + + .586p + .mmx + + name inflate_fast_x86 + .MODEL FLAT + +_DATA segment +inflate_fast_use_mmx: + dd 1 + + +_TEXT segment +PUBLIC _inflate_fast + +ALIGN 4 +_inflate_fast: + jmp inflate_fast_entry + + + +ALIGN 4 + db 'Fast decoding Code from Chris Anderson' + db 0 + +ALIGN 4 +invalid_literal_length_code_msg: + db 'invalid literal/length code' + db 0 + +ALIGN 4 +invalid_distance_code_msg: + db 'invalid distance code' + db 0 + +ALIGN 4 +invalid_distance_too_far_msg: + db 'invalid distance too far back' + db 0 + + +ALIGN 4 +inflate_fast_mask: +dd 0 +dd 1 +dd 3 +dd 7 +dd 15 +dd 31 +dd 63 +dd 127 +dd 255 +dd 511 +dd 1023 +dd 2047 +dd 4095 +dd 8191 +dd 16383 +dd 32767 +dd 65535 +dd 131071 +dd 262143 +dd 524287 +dd 1048575 +dd 2097151 +dd 4194303 +dd 8388607 +dd 16777215 +dd 33554431 +dd 67108863 +dd 134217727 +dd 268435455 +dd 536870911 +dd 1073741823 +dd 2147483647 +dd 4294967295 + + +mode_state equ 0 ;/* state->mode */ +wsize_state equ (32+zlib1222sup) ;/* state->wsize */ +write_state equ (36+4+zlib1222sup) ;/* state->write */ +window_state equ (40+4+zlib1222sup) ;/* state->window */ +hold_state equ (44+4+zlib1222sup) ;/* state->hold */ +bits_state equ (48+4+zlib1222sup) ;/* state->bits */ +lencode_state equ (64+4+zlib1222sup) ;/* state->lencode */ +distcode_state equ (68+4+zlib1222sup) ;/* state->distcode */ +lenbits_state equ (72+4+zlib1222sup) ;/* state->lenbits */ +distbits_state equ (76+4+zlib1222sup) ;/* state->distbits */ + + +;;SECTION .text +; 205 "inffast.S" +;GLOBAL inflate_fast_use_mmx + +;SECTION .data + + +; GLOBAL inflate_fast_use_mmx:object +;.size inflate_fast_use_mmx, 4 +; 226 "inffast.S" +;SECTION .text + +ALIGN 4 +inflate_fast_entry: + push edi + push esi + push ebp + push ebx + pushfd + sub esp,64 + cld + + + + + mov esi, [esp+88] + mov edi, [esi+28] + + + + + + + + mov edx, [esi+4] + mov eax, [esi+0] + + add edx,eax + sub edx,11 + + mov [esp+44],eax + mov [esp+20],edx + + mov ebp, [esp+92] + mov ecx, [esi+16] + mov ebx, [esi+12] + + sub ebp,ecx + neg ebp + add ebp,ebx + + sub ecx,257 + add ecx,ebx + + mov [esp+60],ebx + mov [esp+40],ebp + mov [esp+16],ecx +; 285 "inffast.S" + mov eax, [edi+lencode_state] + mov ecx, [edi+distcode_state] + + mov [esp+8],eax + mov [esp+12],ecx + + mov eax,1 + mov ecx, [edi+lenbits_state] + shl eax,cl + dec eax + mov [esp+0],eax + + mov eax,1 + mov ecx, [edi+distbits_state] + shl eax,cl + dec eax + mov [esp+4],eax + + mov eax, [edi+wsize_state] + mov ecx, [edi+write_state] + mov edx, [edi+window_state] + + mov [esp+52],eax + mov [esp+48],ecx + mov [esp+56],edx + + mov ebp, [edi+hold_state] + mov ebx, [edi+bits_state] +; 321 "inffast.S" + mov esi, [esp+44] + mov ecx, [esp+20] + cmp ecx,esi + ja L_align_long + + add ecx,11 + sub ecx,esi + mov eax,12 + sub eax,ecx + lea edi, [esp+28] + rep movsb + mov ecx,eax + xor eax,eax + rep stosb + lea esi, [esp+28] + mov [esp+20],esi + jmp L_is_aligned + + +L_align_long: + test esi,3 + jz L_is_aligned + xor eax,eax + mov al, [esi] + inc esi + mov ecx,ebx + add ebx,8 + shl eax,cl + or ebp,eax + jmp L_align_long + +L_is_aligned: + mov edi, [esp+60] +; 366 "inffast.S" +L_check_mmx: + cmp dword ptr [inflate_fast_use_mmx],2 + je L_init_mmx + ja L_do_loop + + push eax + push ebx + push ecx + push edx + pushfd + mov eax, [esp] + xor dword ptr [esp],0200000h + + + + + popfd + pushfd + pop edx + xor edx,eax + jz L_dont_use_mmx + xor eax,eax + cpuid + cmp ebx,0756e6547h + jne L_dont_use_mmx + cmp ecx,06c65746eh + jne L_dont_use_mmx + cmp edx,049656e69h + jne L_dont_use_mmx + mov eax,1 + cpuid + shr eax,8 + and eax,15 + cmp eax,6 + jne L_dont_use_mmx + test edx,0800000h + jnz L_use_mmx + jmp L_dont_use_mmx +L_use_mmx: + mov dword ptr [inflate_fast_use_mmx],2 + jmp L_check_mmx_pop +L_dont_use_mmx: + mov dword ptr [inflate_fast_use_mmx],3 +L_check_mmx_pop: + pop edx + pop ecx + pop ebx + pop eax + jmp L_check_mmx +; 426 "inffast.S" +ALIGN 4 +L_do_loop: +; 437 "inffast.S" + cmp bl,15 + ja L_get_length_code + + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + +L_get_length_code: + mov edx, [esp+0] + mov ecx, [esp+8] + and edx,ebp + mov eax, [ecx+edx*4] + +L_dolen: + + + + + + + mov cl,ah + sub bl,ah + shr ebp,cl + + + + + + + test al,al + jnz L_test_for_length_base + + shr eax,16 + stosb + +L_while_test: + + + cmp [esp+16],edi + jbe L_break_loop + + cmp [esp+20],esi + ja L_do_loop + jmp L_break_loop + +L_test_for_length_base: +; 502 "inffast.S" + mov edx,eax + shr edx,16 + mov cl,al + + test al,16 + jz L_test_for_second_level_length + and cl,15 + jz L_save_len + cmp bl,cl + jae L_add_bits_to_len + + mov ch,cl + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + mov cl,ch + +L_add_bits_to_len: + mov eax,1 + shl eax,cl + dec eax + sub bl,cl + and eax,ebp + shr ebp,cl + add edx,eax + +L_save_len: + mov [esp+24],edx + + +L_decode_distance: +; 549 "inffast.S" + cmp bl,15 + ja L_get_distance_code + + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + +L_get_distance_code: + mov edx, [esp+4] + mov ecx, [esp+12] + and edx,ebp + mov eax, [ecx+edx*4] + + +L_dodist: + mov edx,eax + shr edx,16 + mov cl,ah + sub bl,ah + shr ebp,cl +; 584 "inffast.S" + mov cl,al + + test al,16 + jz L_test_for_second_level_dist + and cl,15 + jz L_check_dist_one + cmp bl,cl + jae L_add_bits_to_dist + + mov ch,cl + xor eax,eax + lodsw + mov cl,bl + add bl,16 + shl eax,cl + or ebp,eax + mov cl,ch + +L_add_bits_to_dist: + mov eax,1 + shl eax,cl + dec eax + sub bl,cl + and eax,ebp + shr ebp,cl + add edx,eax + jmp L_check_window + +L_check_window: +; 625 "inffast.S" + mov [esp+44],esi + mov eax,edi + sub eax, [esp+40] + + cmp eax,edx + jb L_clip_window + + mov ecx, [esp+24] + mov esi,edi + sub esi,edx + + sub ecx,3 + mov al, [esi] + mov [edi],al + mov al, [esi+1] + mov dl, [esi+2] + add esi,3 + mov [edi+1],al + mov [edi+2],dl + add edi,3 + rep movsb + + mov esi, [esp+44] + jmp L_while_test + +ALIGN 4 +L_check_dist_one: + cmp edx,1 + jne L_check_window + cmp [esp+40],edi + je L_check_window + + dec edi + mov ecx, [esp+24] + mov al, [edi] + sub ecx,3 + + mov [edi+1],al + mov [edi+2],al + mov [edi+3],al + add edi,4 + rep stosb + + jmp L_while_test + +ALIGN 4 +L_test_for_second_level_length: + + + + + test al,64 + jnz L_test_for_end_of_block + + mov eax,1 + shl eax,cl + dec eax + and eax,ebp + add eax,edx + mov edx, [esp+8] + mov eax, [edx+eax*4] + jmp L_dolen + +ALIGN 4 +L_test_for_second_level_dist: + + + + + test al,64 + jnz L_invalid_distance_code + + mov eax,1 + shl eax,cl + dec eax + and eax,ebp + add eax,edx + mov edx, [esp+12] + mov eax, [edx+eax*4] + jmp L_dodist + +ALIGN 4 +L_clip_window: +; 721 "inffast.S" + mov ecx,eax + mov eax, [esp+52] + neg ecx + mov esi, [esp+56] + + cmp eax,edx + jb L_invalid_distance_too_far + + add ecx,edx + cmp dword ptr [esp+48],0 + jne L_wrap_around_window + + sub eax,ecx + add esi,eax +; 749 "inffast.S" + mov eax, [esp+24] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + jmp L_do_copy1 + + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + jmp L_do_copy1 + +L_wrap_around_window: +; 793 "inffast.S" + mov eax, [esp+48] + cmp ecx,eax + jbe L_contiguous_in_window + + add esi, [esp+52] + add esi,eax + sub esi,ecx + sub ecx,eax + + + mov eax, [esp+24] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi, [esp+56] + mov ecx, [esp+48] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + jmp L_do_copy1 + +L_contiguous_in_window: +; 836 "inffast.S" + add esi,eax + sub esi,ecx + + + mov eax, [esp+24] + cmp eax,ecx + jbe L_do_copy1 + + sub eax,ecx + rep movsb + mov esi,edi + sub esi,edx + +L_do_copy1: +; 862 "inffast.S" + mov ecx,eax + rep movsb + + mov esi, [esp+44] + jmp L_while_test +; 878 "inffast.S" +ALIGN 4 +L_init_mmx: + emms + + + + + + movd mm0,ebp + mov ebp,ebx +; 896 "inffast.S" + movd mm4,dword ptr [esp+0] + movq mm3,mm4 + movd mm5,dword ptr [esp+4] + movq mm2,mm5 + pxor mm1,mm1 + mov ebx, [esp+8] + jmp L_do_loop_mmx + +ALIGN 4 +L_do_loop_mmx: + psrlq mm0,mm1 + + cmp ebp,32 + ja L_get_length_code_mmx + + movd mm6,ebp + movd mm7,dword ptr [esi] + add esi,4 + psllq mm7,mm6 + add ebp,32 + por mm0,mm7 + +L_get_length_code_mmx: + pand mm4,mm0 + movd eax,mm4 + movq mm4,mm3 + mov eax, [ebx+eax*4] + +L_dolen_mmx: + movzx ecx,ah + movd mm1,ecx + sub ebp,ecx + + test al,al + jnz L_test_for_length_base_mmx + + shr eax,16 + stosb + +L_while_test_mmx: + + + cmp [esp+16],edi + jbe L_break_loop + + cmp [esp+20],esi + ja L_do_loop_mmx + jmp L_break_loop + +L_test_for_length_base_mmx: + + mov edx,eax + shr edx,16 + + test al,16 + jz L_test_for_second_level_length_mmx + and eax,15 + jz L_decode_distance_mmx + + psrlq mm0,mm1 + movd mm1,eax + movd ecx,mm0 + sub ebp,eax + and ecx, [inflate_fast_mask+eax*4] + add edx,ecx + +L_decode_distance_mmx: + psrlq mm0,mm1 + + cmp ebp,32 + ja L_get_dist_code_mmx + + movd mm6,ebp + movd mm7,dword ptr [esi] + add esi,4 + psllq mm7,mm6 + add ebp,32 + por mm0,mm7 + +L_get_dist_code_mmx: + mov ebx, [esp+12] + pand mm5,mm0 + movd eax,mm5 + movq mm5,mm2 + mov eax, [ebx+eax*4] + +L_dodist_mmx: + + movzx ecx,ah + mov ebx,eax + shr ebx,16 + sub ebp,ecx + movd mm1,ecx + + test al,16 + jz L_test_for_second_level_dist_mmx + and eax,15 + jz L_check_dist_one_mmx + +L_add_bits_to_dist_mmx: + psrlq mm0,mm1 + movd mm1,eax + movd ecx,mm0 + sub ebp,eax + and ecx, [inflate_fast_mask+eax*4] + add ebx,ecx + +L_check_window_mmx: + mov [esp+44],esi + mov eax,edi + sub eax, [esp+40] + + cmp eax,ebx + jb L_clip_window_mmx + + mov ecx,edx + mov esi,edi + sub esi,ebx + + sub ecx,3 + mov al, [esi] + mov [edi],al + mov al, [esi+1] + mov dl, [esi+2] + add esi,3 + mov [edi+1],al + mov [edi+2],dl + add edi,3 + rep movsb + + mov esi, [esp+44] + mov ebx, [esp+8] + jmp L_while_test_mmx + +ALIGN 4 +L_check_dist_one_mmx: + cmp ebx,1 + jne L_check_window_mmx + cmp [esp+40],edi + je L_check_window_mmx + + dec edi + mov ecx,edx + mov al, [edi] + sub ecx,3 + + mov [edi+1],al + mov [edi+2],al + mov [edi+3],al + add edi,4 + rep stosb + + mov ebx, [esp+8] + jmp L_while_test_mmx + +ALIGN 4 +L_test_for_second_level_length_mmx: + test al,64 + jnz L_test_for_end_of_block + + and eax,15 + psrlq mm0,mm1 + movd ecx,mm0 + and ecx, [inflate_fast_mask+eax*4] + add ecx,edx + mov eax, [ebx+ecx*4] + jmp L_dolen_mmx + +ALIGN 4 +L_test_for_second_level_dist_mmx: + test al,64 + jnz L_invalid_distance_code + + and eax,15 + psrlq mm0,mm1 + movd ecx,mm0 + and ecx, [inflate_fast_mask+eax*4] + mov eax, [esp+12] + add ecx,ebx + mov eax, [eax+ecx*4] + jmp L_dodist_mmx + +ALIGN 4 +L_clip_window_mmx: + + mov ecx,eax + mov eax, [esp+52] + neg ecx + mov esi, [esp+56] + + cmp eax,ebx + jb L_invalid_distance_too_far + + add ecx,ebx + cmp dword ptr [esp+48],0 + jne L_wrap_around_window_mmx + + sub eax,ecx + add esi,eax + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + jmp L_do_copy1_mmx + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + jmp L_do_copy1_mmx + +L_wrap_around_window_mmx: + + mov eax, [esp+48] + cmp ecx,eax + jbe L_contiguous_in_window_mmx + + add esi, [esp+52] + add esi,eax + sub esi,ecx + sub ecx,eax + + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi, [esp+56] + mov ecx, [esp+48] + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + jmp L_do_copy1_mmx + +L_contiguous_in_window_mmx: + + add esi,eax + sub esi,ecx + + + cmp edx,ecx + jbe L_do_copy1_mmx + + sub edx,ecx + rep movsb + mov esi,edi + sub esi,ebx + +L_do_copy1_mmx: + + + mov ecx,edx + rep movsb + + mov esi, [esp+44] + mov ebx, [esp+8] + jmp L_while_test_mmx +; 1174 "inffast.S" +L_invalid_distance_code: + + + + + + mov ecx, invalid_distance_code_msg + mov edx,INFLATE_MODE_BAD + jmp L_update_stream_state + +L_test_for_end_of_block: + + + + + + test al,32 + jz L_invalid_literal_length_code + + mov ecx,0 + mov edx,INFLATE_MODE_TYPE + jmp L_update_stream_state + +L_invalid_literal_length_code: + + + + + + mov ecx, invalid_literal_length_code_msg + mov edx,INFLATE_MODE_BAD + jmp L_update_stream_state + +L_invalid_distance_too_far: + + + + mov esi, [esp+44] + mov ecx, invalid_distance_too_far_msg + mov edx,INFLATE_MODE_BAD + jmp L_update_stream_state + +L_update_stream_state: + + mov eax, [esp+88] + test ecx,ecx + jz L_skip_msg + mov [eax+24],ecx +L_skip_msg: + mov eax, [eax+28] + mov [eax+mode_state],edx + jmp L_break_loop + +ALIGN 4 +L_break_loop: +; 1243 "inffast.S" + cmp dword ptr [inflate_fast_use_mmx],2 + jne L_update_next_in + + + + mov ebx,ebp + +L_update_next_in: +; 1266 "inffast.S" + mov eax, [esp+88] + mov ecx,ebx + mov edx, [eax+28] + shr ecx,3 + sub esi,ecx + shl ecx,3 + sub ebx,ecx + mov [eax+12],edi + mov [edx+bits_state],ebx + mov ecx,ebx + + lea ebx, [esp+28] + cmp [esp+20],ebx + jne L_buf_not_used + + sub esi,ebx + mov ebx, [eax+0] + mov [esp+20],ebx + add esi,ebx + mov ebx, [eax+4] + sub ebx,11 + add [esp+20],ebx + +L_buf_not_used: + mov [eax+0],esi + + mov ebx,1 + shl ebx,cl + dec ebx + + + + + + cmp dword ptr [inflate_fast_use_mmx],2 + jne L_update_hold + + + + psrlq mm0,mm1 + movd ebp,mm0 + + emms + +L_update_hold: + + + + and ebp,ebx + mov [edx+hold_state],ebp + + + + + mov ebx, [esp+20] + cmp ebx,esi + jbe L_last_is_smaller + + sub ebx,esi + add ebx,11 + mov [eax+4],ebx + jmp L_fixup_out +L_last_is_smaller: + sub esi,ebx + neg esi + add esi,11 + mov [eax+4],esi + + + + +L_fixup_out: + + mov ebx, [esp+16] + cmp ebx,edi + jbe L_end_is_smaller + + sub ebx,edi + add ebx,257 + mov [eax+16],ebx + jmp L_done +L_end_is_smaller: + sub edi,ebx + neg edi + add edi,257 + mov [eax+16],edi + + + + + +L_done: + add esp,64 + popfd + pop ebx + pop ebp + pop esi + pop edi + ret + +_TEXT ends +end diff --git a/compat/zlib/contrib/masmx86/match686.asm b/compat/zlib/contrib/masmx86/match686.asm new file mode 100644 index 0000000..21ae704 --- /dev/null +++ b/compat/zlib/contrib/masmx86/match686.asm @@ -0,0 +1,478 @@ +; match686.asm -- Asm portion of the optimized longest_match for 32 bits x86 +; Copyright (C) 1995-1996 Jean-loup Gailly, Brian Raiter and Gilles Vollant. +; File written by Gilles Vollant, by converting match686.S from Brian Raiter +; for MASM. This is as assembly version of longest_match +; from Jean-loup Gailly in deflate.c +; +; http://www.zlib.net +; http://www.winimage.com/zLibDll +; http://www.muppetlabs.com/~breadbox/software/assembly.html +; +; For Visual C++ 4.x and higher and ML 6.x and higher +; ml.exe is distributed in +; http://www.microsoft.com/downloads/details.aspx?FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64 +; +; this file contain two implementation of longest_match +; +; this longest_match was written by Brian raiter (1998), optimized for Pentium Pro +; (and the faster known version of match_init on modern Core 2 Duo and AMD Phenom) +; +; for using an assembly version of longest_match, you need define ASMV in project +; +; compile the asm file running +; ml /coff /Zi /c /Flmatch686.lst match686.asm +; and do not include match686.obj in your project +; +; note: contrib of zLib 1.2.3 and earlier contained both a deprecated version for +; Pentium (prior Pentium Pro) and this version for Pentium Pro and modern processor +; with autoselect (with cpu detection code) +; if you want support the old pentium optimization, you can still use these version +; +; this file is not optimized for old pentium, but it compatible with all x86 32 bits +; processor (starting 80386) +; +; +; see below : zlib1222add must be adjuster if you use a zlib version < 1.2.2.2 + +;uInt longest_match(s, cur_match) +; deflate_state *s; +; IPos cur_match; /* current match */ + + NbStack equ 76 + cur_match equ dword ptr[esp+NbStack-0] + str_s equ dword ptr[esp+NbStack-4] +; 5 dword on top (ret,ebp,esi,edi,ebx) + adrret equ dword ptr[esp+NbStack-8] + pushebp equ dword ptr[esp+NbStack-12] + pushedi equ dword ptr[esp+NbStack-16] + pushesi equ dword ptr[esp+NbStack-20] + pushebx equ dword ptr[esp+NbStack-24] + + chain_length equ dword ptr [esp+NbStack-28] + limit equ dword ptr [esp+NbStack-32] + best_len equ dword ptr [esp+NbStack-36] + window equ dword ptr [esp+NbStack-40] + prev equ dword ptr [esp+NbStack-44] + scan_start equ word ptr [esp+NbStack-48] + wmask equ dword ptr [esp+NbStack-52] + match_start_ptr equ dword ptr [esp+NbStack-56] + nice_match equ dword ptr [esp+NbStack-60] + scan equ dword ptr [esp+NbStack-64] + + windowlen equ dword ptr [esp+NbStack-68] + match_start equ dword ptr [esp+NbStack-72] + strend equ dword ptr [esp+NbStack-76] + NbStackAdd equ (NbStack-24) + + .386p + + name gvmatch + .MODEL FLAT + + + +; all the +zlib1222add offsets are due to the addition of fields +; in zlib in the deflate_state structure since the asm code was first written +; (if you compile with zlib 1.0.4 or older, use "zlib1222add equ (-4)"). +; (if you compile with zlib between 1.0.5 and 1.2.2.1, use "zlib1222add equ 0"). +; if you compile with zlib 1.2.2.2 or later , use "zlib1222add equ 8"). + + zlib1222add equ 8 + +; Note : these value are good with a 8 bytes boundary pack structure + dep_chain_length equ 74h+zlib1222add + dep_window equ 30h+zlib1222add + dep_strstart equ 64h+zlib1222add + dep_prev_length equ 70h+zlib1222add + dep_nice_match equ 88h+zlib1222add + dep_w_size equ 24h+zlib1222add + dep_prev equ 38h+zlib1222add + dep_w_mask equ 2ch+zlib1222add + dep_good_match equ 84h+zlib1222add + dep_match_start equ 68h+zlib1222add + dep_lookahead equ 6ch+zlib1222add + + +_TEXT segment + +IFDEF NOUNDERLINE + public longest_match + public match_init +ELSE + public _longest_match + public _match_init +ENDIF + + MAX_MATCH equ 258 + MIN_MATCH equ 3 + MIN_LOOKAHEAD equ (MAX_MATCH+MIN_MATCH+1) + + + +MAX_MATCH equ 258 +MIN_MATCH equ 3 +MIN_LOOKAHEAD equ (MAX_MATCH + MIN_MATCH + 1) +MAX_MATCH_8_ equ ((MAX_MATCH + 7) AND 0FFF0h) + + +;;; stack frame offsets + +chainlenwmask equ esp + 0 ; high word: current chain len + ; low word: s->wmask +window equ esp + 4 ; local copy of s->window +windowbestlen equ esp + 8 ; s->window + bestlen +scanstart equ esp + 16 ; first two bytes of string +scanend equ esp + 12 ; last two bytes of string +scanalign equ esp + 20 ; dword-misalignment of string +nicematch equ esp + 24 ; a good enough match size +bestlen equ esp + 28 ; size of best match so far +scan equ esp + 32 ; ptr to string wanting match + +LocalVarsSize equ 36 +; saved ebx byte esp + 36 +; saved edi byte esp + 40 +; saved esi byte esp + 44 +; saved ebp byte esp + 48 +; return address byte esp + 52 +deflatestate equ esp + 56 ; the function arguments +curmatch equ esp + 60 + +;;; Offsets for fields in the deflate_state structure. These numbers +;;; are calculated from the definition of deflate_state, with the +;;; assumption that the compiler will dword-align the fields. (Thus, +;;; changing the definition of deflate_state could easily cause this +;;; program to crash horribly, without so much as a warning at +;;; compile time. Sigh.) + +dsWSize equ 36+zlib1222add +dsWMask equ 44+zlib1222add +dsWindow equ 48+zlib1222add +dsPrev equ 56+zlib1222add +dsMatchLen equ 88+zlib1222add +dsPrevMatch equ 92+zlib1222add +dsStrStart equ 100+zlib1222add +dsMatchStart equ 104+zlib1222add +dsLookahead equ 108+zlib1222add +dsPrevLen equ 112+zlib1222add +dsMaxChainLen equ 116+zlib1222add +dsGoodMatch equ 132+zlib1222add +dsNiceMatch equ 136+zlib1222add + + +;;; match686.asm -- Pentium-Pro-optimized version of longest_match() +;;; Written for zlib 1.1.2 +;;; Copyright (C) 1998 Brian Raiter <breadbox@muppetlabs.com> +;;; You can look at http://www.muppetlabs.com/~breadbox/software/assembly.html +;;; +;; +;; This software is provided 'as-is', without any express or implied +;; warranty. In no event will the authors be held liable for any damages +;; arising from the use of this software. +;; +;; Permission is granted to anyone to use this software for any purpose, +;; including commercial applications, and to alter it and redistribute it +;; freely, subject to the following restrictions: +;; +;; 1. The origin of this software must not be misrepresented; you must not +;; claim that you wrote the original software. If you use this software +;; in a product, an acknowledgment in the product documentation would be +;; appreciated but is not required. +;; 2. Altered source versions must be plainly marked as such, and must not be +;; misrepresented as being the original software +;; 3. This notice may not be removed or altered from any source distribution. +;; + +;GLOBAL _longest_match, _match_init + + +;SECTION .text + +;;; uInt longest_match(deflate_state *deflatestate, IPos curmatch) + +;_longest_match: + IFDEF NOUNDERLINE + longest_match proc near + ELSE + _longest_match proc near + ENDIF + +;;; Save registers that the compiler may be using, and adjust esp to +;;; make room for our stack frame. + + push ebp + push edi + push esi + push ebx + sub esp, LocalVarsSize + +;;; Retrieve the function arguments. ecx will hold cur_match +;;; throughout the entire function. edx will hold the pointer to the +;;; deflate_state structure during the function's setup (before +;;; entering the main loop. + + mov edx, [deflatestate] + mov ecx, [curmatch] + +;;; uInt wmask = s->w_mask; +;;; unsigned chain_length = s->max_chain_length; +;;; if (s->prev_length >= s->good_match) { +;;; chain_length >>= 2; +;;; } + + mov eax, [edx + dsPrevLen] + mov ebx, [edx + dsGoodMatch] + cmp eax, ebx + mov eax, [edx + dsWMask] + mov ebx, [edx + dsMaxChainLen] + jl LastMatchGood + shr ebx, 2 +LastMatchGood: + +;;; chainlen is decremented once beforehand so that the function can +;;; use the sign flag instead of the zero flag for the exit test. +;;; It is then shifted into the high word, to make room for the wmask +;;; value, which it will always accompany. + + dec ebx + shl ebx, 16 + or ebx, eax + mov [chainlenwmask], ebx + +;;; if ((uInt)nice_match > s->lookahead) nice_match = s->lookahead; + + mov eax, [edx + dsNiceMatch] + mov ebx, [edx + dsLookahead] + cmp ebx, eax + jl LookaheadLess + mov ebx, eax +LookaheadLess: mov [nicematch], ebx + +;;; register Bytef *scan = s->window + s->strstart; + + mov esi, [edx + dsWindow] + mov [window], esi + mov ebp, [edx + dsStrStart] + lea edi, [esi + ebp] + mov [scan], edi + +;;; Determine how many bytes the scan ptr is off from being +;;; dword-aligned. + + mov eax, edi + neg eax + and eax, 3 + mov [scanalign], eax + +;;; IPos limit = s->strstart > (IPos)MAX_DIST(s) ? +;;; s->strstart - (IPos)MAX_DIST(s) : NIL; + + mov eax, [edx + dsWSize] + sub eax, MIN_LOOKAHEAD + sub ebp, eax + jg LimitPositive + xor ebp, ebp +LimitPositive: + +;;; int best_len = s->prev_length; + + mov eax, [edx + dsPrevLen] + mov [bestlen], eax + +;;; Store the sum of s->window + best_len in esi locally, and in esi. + + add esi, eax + mov [windowbestlen], esi + +;;; register ush scan_start = *(ushf*)scan; +;;; register ush scan_end = *(ushf*)(scan+best_len-1); +;;; Posf *prev = s->prev; + + movzx ebx, word ptr [edi] + mov [scanstart], ebx + movzx ebx, word ptr [edi + eax - 1] + mov [scanend], ebx + mov edi, [edx + dsPrev] + +;;; Jump into the main loop. + + mov edx, [chainlenwmask] + jmp short LoopEntry + +align 4 + +;;; do { +;;; match = s->window + cur_match; +;;; if (*(ushf*)(match+best_len-1) != scan_end || +;;; *(ushf*)match != scan_start) continue; +;;; [...] +;;; } while ((cur_match = prev[cur_match & wmask]) > limit +;;; && --chain_length != 0); +;;; +;;; Here is the inner loop of the function. The function will spend the +;;; majority of its time in this loop, and majority of that time will +;;; be spent in the first ten instructions. +;;; +;;; Within this loop: +;;; ebx = scanend +;;; ecx = curmatch +;;; edx = chainlenwmask - i.e., ((chainlen << 16) | wmask) +;;; esi = windowbestlen - i.e., (window + bestlen) +;;; edi = prev +;;; ebp = limit + +LookupLoop: + and ecx, edx + movzx ecx, word ptr [edi + ecx*2] + cmp ecx, ebp + jbe LeaveNow + sub edx, 00010000h + js LeaveNow +LoopEntry: movzx eax, word ptr [esi + ecx - 1] + cmp eax, ebx + jnz LookupLoop + mov eax, [window] + movzx eax, word ptr [eax + ecx] + cmp eax, [scanstart] + jnz LookupLoop + +;;; Store the current value of chainlen. + + mov [chainlenwmask], edx + +;;; Point edi to the string under scrutiny, and esi to the string we +;;; are hoping to match it up with. In actuality, esi and edi are +;;; both pointed (MAX_MATCH_8 - scanalign) bytes ahead, and edx is +;;; initialized to -(MAX_MATCH_8 - scanalign). + + mov esi, [window] + mov edi, [scan] + add esi, ecx + mov eax, [scanalign] + mov edx, 0fffffef8h; -(MAX_MATCH_8) + lea edi, [edi + eax + 0108h] ;MAX_MATCH_8] + lea esi, [esi + eax + 0108h] ;MAX_MATCH_8] + +;;; Test the strings for equality, 8 bytes at a time. At the end, +;;; adjust edx so that it is offset to the exact byte that mismatched. +;;; +;;; We already know at this point that the first three bytes of the +;;; strings match each other, and they can be safely passed over before +;;; starting the compare loop. So what this code does is skip over 0-3 +;;; bytes, as much as necessary in order to dword-align the edi +;;; pointer. (esi will still be misaligned three times out of four.) +;;; +;;; It should be confessed that this loop usually does not represent +;;; much of the total running time. Replacing it with a more +;;; straightforward "rep cmpsb" would not drastically degrade +;;; performance. + +LoopCmps: + mov eax, [esi + edx] + xor eax, [edi + edx] + jnz LeaveLoopCmps + mov eax, [esi + edx + 4] + xor eax, [edi + edx + 4] + jnz LeaveLoopCmps4 + add edx, 8 + jnz LoopCmps + jmp short LenMaximum +LeaveLoopCmps4: add edx, 4 +LeaveLoopCmps: test eax, 0000FFFFh + jnz LenLower + add edx, 2 + shr eax, 16 +LenLower: sub al, 1 + adc edx, 0 + +;;; Calculate the length of the match. If it is longer than MAX_MATCH, +;;; then automatically accept it as the best possible match and leave. + + lea eax, [edi + edx] + mov edi, [scan] + sub eax, edi + cmp eax, MAX_MATCH + jge LenMaximum + +;;; If the length of the match is not longer than the best match we +;;; have so far, then forget it and return to the lookup loop. + + mov edx, [deflatestate] + mov ebx, [bestlen] + cmp eax, ebx + jg LongerMatch + mov esi, [windowbestlen] + mov edi, [edx + dsPrev] + mov ebx, [scanend] + mov edx, [chainlenwmask] + jmp LookupLoop + +;;; s->match_start = cur_match; +;;; best_len = len; +;;; if (len >= nice_match) break; +;;; scan_end = *(ushf*)(scan+best_len-1); + +LongerMatch: mov ebx, [nicematch] + mov [bestlen], eax + mov [edx + dsMatchStart], ecx + cmp eax, ebx + jge LeaveNow + mov esi, [window] + add esi, eax + mov [windowbestlen], esi + movzx ebx, word ptr [edi + eax - 1] + mov edi, [edx + dsPrev] + mov [scanend], ebx + mov edx, [chainlenwmask] + jmp LookupLoop + +;;; Accept the current string, with the maximum possible length. + +LenMaximum: mov edx, [deflatestate] + mov dword ptr [bestlen], MAX_MATCH + mov [edx + dsMatchStart], ecx + +;;; if ((uInt)best_len <= s->lookahead) return (uInt)best_len; +;;; return s->lookahead; + +LeaveNow: + mov edx, [deflatestate] + mov ebx, [bestlen] + mov eax, [edx + dsLookahead] + cmp ebx, eax + jg LookaheadRet + mov eax, ebx +LookaheadRet: + +;;; Restore the stack and return from whence we came. + + add esp, LocalVarsSize + pop ebx + pop esi + pop edi + pop ebp + + ret +; please don't remove this string ! +; Your can freely use match686 in any free or commercial app if you don't remove the string in the binary! + db 0dh,0ah,"asm686 with masm, optimised assembly code from Brian Raiter, written 1998",0dh,0ah + + + IFDEF NOUNDERLINE + longest_match endp + ELSE + _longest_match endp + ENDIF + + IFDEF NOUNDERLINE + match_init proc near + ret + match_init endp + ELSE + _match_init proc near + ret + _match_init endp + ENDIF + + +_TEXT ends +end diff --git a/compat/zlib/contrib/masmx86/readme.txt b/compat/zlib/contrib/masmx86/readme.txt index 7b57167..3f88886 100644 --- a/compat/zlib/contrib/masmx86/readme.txt +++ b/compat/zlib/contrib/masmx86/readme.txt @@ -7,15 +7,21 @@ longest_match() and inflate_fast(). Use instructions ---------------- -Copy these files into the zlib source directory, then run the -appropriate makefile, as suggested below. +Assemble using MASM, and copy the object files into the zlib source +directory, then run the appropriate makefile, as suggested below. You can +donwload MASM from here: + http://www.microsoft.com/downloads/details.aspx?displaylang=en&FamilyID=7a1c9da0-0510-44a2-b042-7ef370530c64 + +You can also get objects files here: + + http://www.winimage.com/zLibDll/zlib124_masm_obj.zip Build instructions ------------------ * With Microsoft C and MASM: -nmake -f win32/Makefile.msc LOC="-DASMV -DASMINF" OBJA="gvmat32c.obj gvmat32.obj inffas32.obj" +nmake -f win32/Makefile.msc LOC="-DASMV -DASMINF" OBJA="match686.obj inffas32.obj" * With Borland C and TASM: -make -f win32/Makefile.bor LOCAL_ZLIB="-DASMV -DASMINF" OBJA="gvmat32c.obj gvmat32.obj inffas32.obj" OBJPA="+gvmat32c.obj+gvmat32.obj+inffas32.obj" +make -f win32/Makefile.bor LOCAL_ZLIB="-DASMV -DASMINF" OBJA="match686.obj inffas32.obj" OBJPA="+match686c.obj+match686.obj+inffas32.obj" |