stream_encoder_asm.nasm revision c74663799493f2b1e6123c18def94295d0afab7b
1;  vim:filetype=nasm ts=8
2
3;  libFLAC - Free Lossless Audio Codec library
4;  Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
5;
6;  Redistribution and use in source and binary forms, with or without
7;  modification, are permitted provided that the following conditions
8;  are met:
9;
10;  - Redistributions of source code must retain the above copyright
11;  notice, this list of conditions and the following disclaimer.
12;
13;  - Redistributions in binary form must reproduce the above copyright
14;  notice, this list of conditions and the following disclaimer in the
15;  documentation and/or other materials provided with the distribution.
16;
17;  - Neither the name of the Xiph.org Foundation nor the names of its
18;  contributors may be used to endorse or promote products derived from
19;  this software without specific prior written permission.
20;
21;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
22;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
23;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
24;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
25;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
26;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
27;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
28;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
29;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
30;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
31;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32
33%include "nasm.h"
34
35	data_section
36
37cglobal precompute_partition_info_sums_32bit_asm_ia32_
38
39	code_section
40
41
; **********************************************************************
;
; void precompute_partition_info_sums_32bit_asm_ia32_(
; 	const FLAC__int32 residual[],
; 	FLAC__uint64 abs_residual_partition_sums[],
; 	unsigned blocksize,
; 	unsigned predictor_order,
; 	unsigned min_partition_order,
; 	unsigned max_partition_order
; )
;
; Purpose:
;   Pass 1: for each of the (1u << max_partition_order) partitions, store the
;   sum of abs(residual[i]) over that partition into
;   abs_residual_partition_sums[0 .. partitions-1].  The first partition ends
;   at (blocksize >> max_partition_order) - predictor_order; every later
;   partition is (blocksize >> max_partition_order) samples long.
;   Pass 2: for every partition order from max_partition_order-1 down to
;   min_partition_order, append the pairwise sums of the previous order's
;   entries directly after the entries already written.
;
; Interface (as implemented below):
;   - 32-bit code; the six arguments are read from the stack at [esp+4] ..
;     [esp+24] on entry and are left in place by the plain 'ret'.
;   - ebp, ebx, esi and edi are saved and restored; eax, ecx, edx and the
;     flags are clobbered.  Nothing is returned in a register.
;   - 8 bytes of stack are used for locals.
;
; Limitations:
;   - Pass 1 accumulates each partition in a 32-bit register and stores 0 in
;     the upper dword, so a partition sum >= 2^32 silently wraps.  Pass 2 uses
;     full 64-bit add/adc.
;   - 'end' is computed with unsigned arithmetic, so the caller must guarantee
;     (blocksize >> max_partition_order) >= predictor_order; otherwise 'end'
;     wraps to a huge value and the summation runs far past residual[].
;   - A partition that contains no samples (end <= residual_sample) yields a
;     sum of 0 and reads nothing from residual[].
;
	ALIGN 16
cident precompute_partition_info_sums_32bit_asm_ia32_

	;; peppered throughout the code at major checkpoints are keys like this, showing where things live at that point
	;; [esp + 4]	const FLAC__int32 residual[]
	;; [esp + 8]	FLAC__uint64 abs_residual_partition_sums[]
	;; [esp + 12]	unsigned blocksize
	;; [esp + 16]	unsigned predictor_order
	;; [esp + 20]	unsigned min_partition_order
	;; [esp + 24]	unsigned max_partition_order
	push	ebp
	push	ebx
	push	esi
	push	edi
	sub	esp, 8			; room for two dword locals
	;; [esp + 28]	const FLAC__int32 residual[]
	;; [esp + 32]	FLAC__uint64 abs_residual_partition_sums[]
	;; [esp + 36]	unsigned blocksize
	;; [esp + 40]	unsigned predictor_order
	;; [esp + 44]	unsigned min_partition_order
	;; [esp + 48]	unsigned max_partition_order
	;; [esp]	partitions
	;; [esp + 4]	default_partition_samples

	mov	ecx, [esp + 48]
	mov	eax, 1
	shl	eax, cl
	mov	[esp], eax		; [esp] <- partitions = 1u << max_partition_order;
	mov	eax, [esp + 36]
	shr	eax, cl
	mov	[esp + 4], eax		; [esp + 4] <- default_partition_samples = blocksize >> max_partition_order;

	;
	; first do max_partition_order
	;
	; register roles for this pass:
	;   ebp = residual[]		esi = residual_sample	edi = end
	;   ecx = partition		ebx = abs_residual_partition_sum
	;   eax/edx = scratch for abs()
	;
	mov	edi, [esp + 4]
	sub	edi, [esp + 40]		; edi <- end = (unsigned)(-(int)predictor_order) + default_partition_samples
	xor	esi, esi		; esi <- residual_sample = 0
	xor	ecx, ecx		; ecx <- partition = 0
	mov	ebp, [esp + 28]		; ebp <- residual[]
	xor	ebx, ebx		; ebx <- abs_residual_partition_sum = 0;
	; The updates to 'end' and 'abs_residual_partition_sum' live at the bottom of
	; the partition loop (and in the initialization above) so the inner loop body
	; can sit on a 16-byte boundary.  Both the initial entry and the partition
	; loop's back-edge go to the loop *test*, so an empty partition is skipped
	; instead of consuming one sample that belongs to the next partition.
	jmp	short .test1
	ALIGN	16
.loop1:					;   for( ; residual_sample < end; residual_sample++)
	mov	eax, [ebp + esi * 4]
	cdq				;     edx <- sign mask of eax (0 or -1)
	xor	eax, edx
	sub	eax, edx		;     eax <- abs(residual[residual_sample])
	add	ebx, eax		;     abs_residual_partition_sum += abs(residual[residual_sample]);
	; NOTE: a carry out of the 32-bit add above is not detected; see "Limitations" in the header.
	add	esi, byte 1
.test1:					; for(partition = residual_sample = 0; partition < partitions; partition++) {
	cmp	esi, edi		;   unsigned compare: residual_sample < end ?
	jb	.loop1
.next1:
	add	edi, [esp + 4]		;   end += default_partition_samples;
	mov	eax, [esp + 32]
	mov	[eax + ecx * 8], ebx	;   abs_residual_partition_sums[partition] = abs_residual_partition_sum;
	mov	[eax + ecx * 8 + 4], dword 0	;   (upper 32 bits of the 64-bit slot are always 0 in this pass)
	xor	ebx, ebx		;   abs_residual_partition_sum = 0;
	add	ecx, byte 1
	cmp	ecx, [esp]		; /* partitions >= 1, so the partition loop itself can be tested at the bottom */
	jb	.test1
.next0:					; }
	;
	; now merge partitions for lower orders
	;
	; register roles for this pass:
	;   esi = &a_r_p_s[from_partition]	edi = &a_r_p_s[to_partition]
	;   ecx = partition_order (signed)	edx = partitions left in this order
	;   ebx:eax = 64-bit pairwise sum
	;
	mov	esi, [esp + 32]		; esi <- abs_residual_partition_sums[from_partition==0];
	mov	eax, [esp]
	lea	edi, [esi + eax * 8]	; edi <- abs_residual_partition_sums[to_partition==partitions];
	mov	ecx, [esp + 48]
	sub	ecx, byte 1		; ecx <- partition_order = (int)max_partition_order - 1;
	ALIGN 16
.loop2:					; for(; partition_order >= (int)min_partition_order; partition_order--) {
	cmp	ecx, [esp + 44]
	jl	.next2			;   signed compare: partition_order is -1 when max_partition_order == 0
	mov	edx, 1
	shl	edx, cl			;   const unsigned partitions = 1u << partition_order;
	ALIGN 16
.loop3:					;   for(i = 0; i < partitions; i++) {
	mov	eax, [esi]
	mov	ebx, [esi + 4]
	add	eax, [esi + 8]
	adc	ebx, [esi + 12]		;     64-bit add: carry from the low dwords goes into the high dwords
	mov	[edi], eax
	mov	[edi + 4], ebx		;     a_r_p_s[to_partition] = a_r_p_s[from_partition] + a_r_p_s[from_partition+1];
	add	esi, byte 16		;     from_partition += 2;
	add	edi, byte 8		;     to_partition++;
	sub	edx, byte 1
	jnz	.loop3			;   }
	sub	ecx, byte 1
	jmp	.loop2			; }
.next2:

	add	esp, 8
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret
154
155end
156
157%ifdef OBJ_FORMAT_elf
158	section .note.GNU-stack noalloc
159%endif
160