summaryrefslogtreecommitdiffstats
path: root/libFLAC/ia32/bitreader_asm.nasm
blob: 5d1bbfa446435757d9bb5274e8222b05ca938eb4 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
;  vim:filetype=nasm ts=8

;  libFLAC - Free Lossless Audio Codec library
;  Copyright (C) 2001,2002,2003,2004,2005,2006,2007  Josh Coalson
;
;  Redistribution and use in source and binary forms, with or without
;  modification, are permitted provided that the following conditions
;  are met:
;
;  - Redistributions of source code must retain the above copyright
;  notice, this list of conditions and the following disclaimer.
;
;  - Redistributions in binary form must reproduce the above copyright
;  notice, this list of conditions and the following disclaimer in the
;  documentation and/or other materials provided with the distribution.
;
;  - Neither the name of the Xiph.org Foundation nor the names of its
;  contributors may be used to endorse or promote products derived from
;  this software without specific prior written permission.
;
;  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
;  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
;  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
;  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
;  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
;  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
;  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
;  PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
;  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
;  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
;  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

; The helper macros used in this file (data_section, code_section, cextern,
; cglobal, cident) are not defined here; presumably they come from nasm.h --
; confirm there before changing how symbols are declared.
%include "nasm.h"

	data_section

; External symbols referenced by the routine below (C prototypes in the trailing comments).
cextern FLAC__crc16_table		; unsigned FLAC__crc16_table[256];
cextern bitreader_read_from_client_	; FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br);

; The single entry point defined by this file.
cglobal FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap

	code_section


; **********************************************************************
;
; FLAC__bool FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter)
;
; Reads 'nvals' Rice-coded signed values from the bit reader 'br' into vals[].
; Each value is a unary part (a run of 0 bits ended by a 1 stop bit) followed by
; a 'parameter'-bit binary part; the combined unsigned value 'uval' is then
; mapped to a signed int as (uval >> 1) ^ -(int)(uval & 1).
;
; ABI:     32-bit x86, all four arguments on the stack (see the [esp + N] keys below).
; Returns: eax = 1 on success (also when nvals == 0);
;          eax = 0 as soon as bitreader_read_from_client_() returns 0.  In that case
;          vals[] may be partially filled and br->consumed_words/consumed_bits hold
;          the position that was flushed just before the failing read.
; Saves:   ebp, ebx, esi, edi (pushed on entry, popped at .end).
; Clobbers: eax, ecx, edx, flags.  The nvals and vals argument slots on the stack
;          are used as the loop counter and output cursor and are modified.
; Calls:   bitreader_read_from_client_(br); reads FLAC__crc16_table.
; Stack:   one dword local, 'ucbits', at [esp] once the prologue has run.
;
; Some details like assertions and other checking is performed by the caller.
	ALIGN 16
cident FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap

	;ASSERT(0 != br);
	;ASSERT(0 != br->buffer);
	; WATCHOUT: code only works if sizeof(brword)==32; we can make things much faster with this assertion
	;ASSERT(FLAC__BITS_PER_WORD == 32);
	;ASSERT(parameter < 32);
	; the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it

	;; peppered throughout the code at major checkpoints are keys like this as to where things are at that point in time
	;; [esp + 16]	unsigned parameter
	;; [esp + 12]	unsigned nvals
	;; [esp + 8]	int vals[]
	;; [esp + 4]	FLAC__BitReader *br
	mov	eax, [esp + 12]		; if(nvals == 0)
	test	eax, eax		;   (test clears CF, so the 'ja' below is taken exactly when eax != 0)
	ja	.nvals_gt_0
	mov	eax, 1			;   return true;
	ret

.nvals_gt_0:
	; prologue: save the registers we use that must be preserved, and make room for 'ucbits'
	push	ebp
	push	ebx
	push	esi
	push	edi
	sub	esp, 4
	;; [esp + 36]	unsigned parameter
	;; [esp + 32]	unsigned nvals
	;; [esp + 28]	int vals[]
	;; [esp + 24]	FLAC__BitReader *br
	;; [esp]	ucbits
	mov	ebp, [esp + 24]		; ebp <- br  (br->buffer is the field at offset 0, i.e. [ebp])
	mov	esi, [ebp + 16]		; esi <- br->consumed_words (aka 'cwords' in the C version)
	mov	ecx, [ebp + 20]		; ecx <- br->consumed_bits  (aka 'cbits'  in the C version)
	xor	edi, edi		; edi <- 0  'uval'
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	;; [ebp]	br->buffer
	;; [ebp + 8]	br->words
	;; [ebp + 12]	br->bytes
	;; [ebp + 16]	br->consumed_words
	;; [ebp + 20]	br->consumed_bits
	;; [ebp + 24]	br->read_crc
	;; [ebp + 28]	br->crc16_align

					; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
					;   i.e. the number of buffered bits that have not been consumed yet
	mov	eax, [ebp + 8]		;   eax <- br->words
	sub	eax, esi		;   eax <- br->words-cwords
	shl	eax, 2			;   eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD
	add	eax, [ebp + 12]		;   eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
	shl	eax, 3			;   eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
	sub	eax, ecx		;   eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
	mov	[esp], eax		;   ucbits <- eax

	ALIGN 16
.val_loop:				; while(1) {

	;
	; read unary part
	;
.unary_loop:				;   while(1) {
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	cmp	esi, [ebp + 8]		;     while(cwords < br->words)   /* if we've not consumed up to a partial tail word... */
	jae	near .c1_next1
.c1_loop:				;     {
	mov	ebx, [ebp]
	mov	eax, [ebx + 4*esi]	;       b = br->buffer[cwords]
	mov	edx, eax		;       edx = br->buffer[cwords] (saved for later use)
	shl	eax, cl 		;       b = br->buffer[cwords] << cbits
	test	eax, eax		;         (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
	jz	near .c1_next2		;       if(b) {
	bsr	ebx, eax		;         ebx = index of the highest set bit of b
	not	ebx
	and	ebx, 31			;         ebx = 'i' = # of leading 0 bits in 'b' (eax)
	add	ecx, ebx		;         cbits += i;
	add	edi, ebx		;         uval += i;
	add	ecx, byte 1		;         cbits++; /* skip over stop bit */
	test	ecx, ~31		;         ZF set <=> cbits < FLAC__BITS_PER_WORD
	jz	near .break1 		;         if cbits < FLAC__BITS_PER_WORD the word is not finished: goto break1
					;         else /* cbits >= FLAC__BITS_PER_WORD; faster way of testing if(cbits == FLAC__BITS_PER_WORD) */ {
					;           crc16_update_word_(br, br->buffer[cwords]);
	push	edi			;		[need more registers]
	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	mov	edi, _FLAC__crc16_table
%else
	mov	edi, FLAC__crc16_table
%endif
	;; eax (ax)	crc a.k.a. br->read_crc
	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
	;; ecx		br->crc16_align
	;; edx		byteswapped brword to CRC
	;; esi		cwords
	;; edi		unsigned FLAC__crc16_table[]
	;; ebp		br
	test	ecx, ecx		;		switch(br->crc16_align) ...
	jnz	.c0b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
.c0b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
.c0b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shr	edx, 16			;		bring the remaining two bytes down into dl/dh
.c0b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
.c0b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
	movzx	eax, ax			;		keep only the low 16 bits; the shifts above leave stale bits in the upper half of eax
	mov	[ebp + 24], eax		;		br->read_crc <- crc
	pop	edi			;		edi <- uval again

	add	esi, byte 1		;           cwords++;
	xor	ecx, ecx		;           cbits = 0;
					;         }
	jmp	near .break1		;         goto break1;
	;; this section relocated out of the way for performance
	;; crc16_align != 0: skip the leading bytes of the word that must not be CRC'd.
	;; align == 8 enters at byte 1, align == 16 at byte 2, anything else at byte 3.
.c0b4:
	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
	cmp	ecx, 8
	je	.c0b1
	shr	edx, 16
	cmp	ecx, 16
	je	.c0b2
	jmp	.c0b3

	;; this section relocated out of the way for performance
	;; same dispatch as .c0b4, for the CRC copy used by .c1_next2
.c1b4:
	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
	cmp	ecx, 8
	je	.c1b1
	shr	edx, 16
	cmp	ecx, 16
	je	.c1b2
	jmp	.c1b3

.c1_next2:				;       } else {
	;; ecx		cbits
	;; edx		current brword 'b'
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	add	edi, 32
	sub	edi, ecx		;         uval += FLAC__BITS_PER_WORD - cbits;
					;         crc16_update_word_(br, br->buffer[cwords]);
	push	edi			;		[need more registers]
	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	mov	edi, _FLAC__crc16_table
%else
	mov	edi, FLAC__crc16_table
%endif
	;; eax (ax)	crc a.k.a. br->read_crc
	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
	;; ecx		br->crc16_align
	;; edx		byteswapped brword to CRC
	;; esi		cwords
	;; edi		unsigned FLAC__crc16_table[]
	;; ebp		br
	test	ecx, ecx		;		switch(br->crc16_align) ...
	jnz	.c1b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
.c1b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
.c1b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shr	edx, 16			;		bring the remaining two bytes down into dl/dh
.c1b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
.c1b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
	movzx	eax, ax			;		keep only the low 16 bits of the crc
	mov	[ebp + 24], eax		;		br->read_crc <- crc
	pop	edi			;		edi <- uval again

	add	esi, byte 1		;         cwords++;
	xor	ecx, ecx		;         cbits = 0;
					;         /* didn't find stop bit yet, have to keep going... */
					;       }

	cmp	esi, [ebp + 8]		;     } while(cwords < br->words)   /* if we've not consumed up to a partial tail word... */
	jb	near .c1_loop

.c1_next1:
	; at this point we've eaten up all the whole words; have to try
	; reading through any tail bytes before calling the read callback.
	; this is a repeat of the above logic adjusted for the fact we
	; don't have a whole word.  note though if the client is feeding
	; us data a byte at a time (unlikely), br->consumed_bits may not
	; be zero.
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	mov	edx, [ebp + 12]		;     edx <- br->bytes
	test	edx, edx
	jz	.read1			;     if(br->bytes) {  [NOTE: this case is rare so it doesn't have to be all that fast ]
	mov	ebx, [ebp]
	shl	edx, 3			;       edx <- const unsigned end = br->bytes * 8;
	mov	eax, [ebx + 4*esi]	;       b = br->buffer[cwords]
	xchg	edx, ecx		;       [edx <- cbits , ecx <- end]  (variable shifts need the count in cl)
	mov	ebx, 0xffffffff		;       ebx <- FLAC__WORD_ALL_ONES
	shr	ebx, cl			;       ebx <- FLAC__WORD_ALL_ONES >> end
	not	ebx			;       ebx <- ~(FLAC__WORD_ALL_ONES >> end)   [mask of the 'end' valid high bits]
	xchg	edx, ecx		;       [edx <- end , ecx <- cbits]
	and	eax, ebx		;       b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end));
	shl	eax, cl 		;       b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits;
	test	eax, eax		;         (still have to test since cbits may be 0, thus ZF not updated for shl eax,0)
	jz	.c1_next3		;       if(b) {
	bsr	ebx, eax
	not	ebx
	and	ebx, 31			;         ebx = 'i' = # of leading 0 bits in 'b' (eax)
	add	ecx, ebx		;         cbits += i;
	add	edi, ebx		;         uval += i;
	add	ecx, byte 1		;         cbits++; /* skip over stop bit */
	jmp	short .break1 		;         goto break1;
.c1_next3:				;       } else {
	sub	edi, ecx
	add	edi, edx		;         uval += end - cbits;
	mov	ecx, edx		;         cbits = end;
					;         /* every valid bit of the tail word has now been consumed, so the
					;            new position is exactly 'end'.  (Adding 'end' to the old cbits
					;            would over-count by the old cbits whenever it was non-zero,
					;            corrupting both the position and the ucbits computed below.) */
					;         /* didn't find stop bit yet, have to keep going... */
					;       }
					;     }
.read1:
	; flush registers and read; bitreader_read_from_client_() does
	; not touch br->consumed_bits at all but we still need to set
	; it in case it fails and we have to return false.
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	mov	[ebp + 16], esi		;     br->consumed_words = cwords;
	mov	[ebp + 20], ecx		;     br->consumed_bits = cbits;
	push	ecx			;     /* save */
	push	ebp			;     /* push br argument */
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	call	_bitreader_read_from_client_
%else
	call	bitreader_read_from_client_
%endif
	pop	edx			;     /* discard, unused */
	pop	ecx			;     /* restore */
	; eax (the call's return value) is not written by anything below until it is tested
	mov	esi, [ebp + 16]		;     cwords = br->consumed_words;
					;     ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
	mov	ebx, [ebp + 8]		;       ebx <- br->words
	sub	ebx, esi		;       ebx <- br->words-cwords
	shl	ebx, 2			;       ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
	add	ebx, [ebp + 12]		;       ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
	shl	ebx, 3			;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
	sub	ebx, ecx		;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
	add	ebx, edi		;       ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval
					;           + uval to offset our count by the # of unary bits already
					;           consumed before the read, because we will add these back
					;           in all at once at break1
	mov	[esp], ebx		;       ucbits <- ebx
	test	eax, eax		;     if(!bitreader_read_from_client_(br))
	jnz	near .unary_loop
	jmp	.end			;       return false; /* eax (the return value) is already 0 */
					;   } /* end while(1) unary part */

	ALIGN 16
.break1:
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	;; [esp]	ucbits
	sub	[esp], edi		;   ucbits -= uval;
	sub	dword [esp], byte 1	;   ucbits--; /* account for stop bit */

	;
	; read binary part
	;
	mov	ebx, [esp + 36]		;   ebx <- parameter
	test	ebx, ebx		;   if(parameter) {
	jz	near .break2
.read2:
	cmp	[esp], ebx		;     while(ucbits < parameter) {
	jae	.c2_next1
	; flush registers and read; bitreader_read_from_client_() does
	; not touch br->consumed_bits at all but we still need to set
	; it in case it fails and we have to return false.
	mov	[ebp + 16], esi		;       br->consumed_words = cwords;
	mov	[ebp + 20], ecx		;       br->consumed_bits = cbits;
	push	ecx			;       /* save */
	push	ebp			;       /* push br argument */
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	call	_bitreader_read_from_client_
%else
	call	bitreader_read_from_client_
%endif
	pop	edx			;       /* discard, unused */
	pop	ecx			;       /* restore */
	; eax (the call's return value) is not written by anything below until it is tested
	mov	esi, [ebp + 16]		;       cwords = br->consumed_words;
					;       ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits;
	mov	edx, [ebp + 8]		;         edx <- br->words
	sub	edx, esi		;         edx <- br->words-cwords
	shl	edx, 2			;         edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD
	add	edx, [ebp + 12]		;         edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes
	shl	edx, 3			;         edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8
	sub	edx, ecx		;         edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits
	mov	[esp], edx		;         ucbits <- edx
	test	eax, eax		;       if(!bitreader_read_from_client_(br))
	jnz	.read2
	jmp	.end			;         return false; /* eax (the return value) is already 0 */
					;     }
.c2_next1:
	;; ebx		parameter
	;; ecx		cbits
	;; esi		cwords
	;; edi		uval
	;; ebp		br
	;; [esp]	ucbits
	cmp	esi, [ebp + 8]		;     if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */
	jae	near .c2_next2
	test	ecx, ecx		;       if(cbits) {
	jz	near .c2_next3		;         /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
	mov	eax, 32
	mov	edx, [ebp]
	sub	eax, ecx		;         const unsigned n = FLAC__BITS_PER_WORD - cbits;
	mov	edx, [edx + 4*esi]	;         const brword word = br->buffer[cwords];
	cmp	ebx, eax		;         if(parameter < n) {
	jae	.c2_next4
					;           uval <<= parameter;
					;           uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter);
	shl	edx, cl			;           drop the already-consumed bits so the unread bits start at the MSB
	xchg	ebx, ecx		;           ebx <- cbits, ecx <- parameter (shift count must be in cl)
	shld	edi, edx, cl		;           shift the top 'parameter' bits of edx into the bottom of uval
	add	ebx, ecx		;           cbits += parameter;
	xchg	ebx, ecx		;           ebx <- parameter, ecx <- cbits
	jmp	.break2			;           goto break2;
					;         }
.c2_next4:
					;         uval <<= n;
					;         uval |= word & (FLAC__WORD_ALL_ONES >> cbits);
%if 1
	rol	edx, cl			;            @@@@@@OPT: may be faster to use rol to save edx so we can restore it for CRC'ing
					;            @@@@@@OPT: or put parameter in ch instead and free up ebx completely again
%else
	shl	edx, cl
%endif
	xchg	eax, ecx		;            eax <- cbits, ecx <- n
	shld	edi, edx, cl		;            shift the n unread bits of this word into uval
	xchg	eax, ecx		;            eax <- n, ecx <- cbits
%if 1
	ror	edx, cl			;            restored.
%else
	mov	edx, [ebp]
	mov	edx, [edx + 4*esi]
%endif
					;         crc16_update_word_(br, br->buffer[cwords]);
	push	edi			;		[need more registers]
	push	ebx			;		[need more registers]
	push	eax			;		[need more registers]
	bswap	edx			;		edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier
	mov	ecx, [ebp + 28]		;		ecx <- br->crc16_align
	mov	eax, [ebp + 24]		;		ax <- br->read_crc (a.k.a. crc)
%ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE
	mov	edi, _FLAC__crc16_table
%else
	mov	edi, FLAC__crc16_table
%endif
	;; eax (ax)	crc a.k.a. br->read_crc
	;; ebx (bl)	intermediate result index into FLAC__crc16_table[]
	;; ecx		br->crc16_align
	;; edx		byteswapped brword to CRC
	;; esi		cwords
	;; edi		unsigned FLAC__crc16_table[]
	;; ebp		br
	test	ecx, ecx		;		switch(br->crc16_align) ...
	jnz	.c2b4			;		[br->crc16_align is 0 the vast majority of the time so we optimize the common case]
.c2b0:	xor	dl, ah			;		dl <- (crc>>8)^(word>>24)
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word>>24)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)]
.c2b1:	xor	dh, ah			;		dh <- (crc>>8)^((word>>16)&0xff))
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))]
	shr	edx, 16			;		bring the remaining two bytes down into dl/dh
.c2b2:	xor	dl, ah			;		dl <- (crc>>8)^((word>>8)&0xff))
	movzx	ebx, dl
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))]
.c2b3:	xor	dh, ah			;		dh <- (crc>>8)^(word&0xff)
	movzx	ebx, dh
	mov	ecx, [ebx*4 + edi]	;		cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)]
	shl	eax, 8			;		ax <- (crc<<8)
	xor	eax, ecx		;		crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)]
	movzx	eax, ax			;		keep only the low 16 bits of the crc
	mov	[ebp + 24], eax		;		br->read_crc <- crc
	pop	eax			;		eax <- n
	pop	ebx			;		ebx <- parameter
	pop	edi			;		edi <- uval
	add	esi, byte 1		;         cwords++;
	mov	ecx, ebx
	sub	ecx, eax		;         cbits = parameter - n;
	jz	.break2			;         if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */
					;           uval <<= cbits;
					;           uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
	mov	eax, [ebp]
	mov	eax, [eax + 4*esi]
	shld	edi, eax, cl
					;         }
	jmp	.break2			;         goto break2;

	;; this section relocated out of the way for performance
	;; same dispatch as .c0b4, for the CRC copy used by .c2_next4
.c2b4:
	mov	[ebp + 28], dword 0	;		br->crc16_align <- 0
	cmp	ecx, 8
	je	.c2b1
	shr	edx, 16
	cmp	ecx, 16
	je	.c2b2
	jmp	.c2b3

.c2_next3:				;       } else {
	mov	ecx, ebx		;         cbits = parameter;
					;         uval <<= cbits;
					;         uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits));
	mov	eax, [ebp]
	mov	eax, [eax + 4*esi]
	shld	edi, eax, cl
	jmp	.break2			;         goto break2;
					;       }
.c2_next2:				;     } else {
	; in this case we're starting our read at a partial tail word;
	; the reader has guaranteed that we have at least 'parameter'
	; bits available to read, which makes this case simpler.
					;       uval <<= parameter;
					;       if(cbits) {
					;         /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */
					;         uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter);
					;         cbits += parameter;
					;         goto break2;
					;       } else {
					;         cbits = parameter;
					;         uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits);
					;         goto break2;
					;       }
					;       the above is much shorter in assembly:
	mov	eax, [ebp]
	mov	eax, [eax + 4*esi]	;       eax <- br->buffer[cwords]
	shl	eax, cl			;       eax <- br->buffer[cwords] << cbits
	add	ecx, ebx		;       cbits += parameter
	xchg	ebx, ecx		;       ebx <- cbits, ecx <- parameter
	shld	edi, eax, cl		;       uval <- (uval << parameter) | (next 'parameter' bits of the tail word)
	xchg	ebx, ecx		;       ebx <- parameter, ecx <- cbits
					;     }
					;   }
.break2:
	sub	[esp], ebx		;   ucbits -= parameter;

	;
	; compose the value
	;
	mov	ebx, [esp + 28]		;   ebx <- vals
	mov	edx, edi		;   edx <- uval
	and	edi, 1			;   edi <- uval & 1
	shr	edx, 1			;   edx <- uval >> 1
	neg	edi			;   edi <- -(int)(uval & 1)
	xor	edx, edi		;   edx <- (uval >> 1 ^ -(int)(uval & 1))
	mov	[ebx], edx		;   *vals <- edx
	sub	dword [esp + 32], byte 1	;   --nvals;   (the argument slot on the stack is the loop counter)
	jz	.finished		;   if(nvals == 0) /* jump to finish */
	xor	edi, edi		;   uval = 0;
	add	dword [esp + 28], 4	;   ++vals     (the argument slot on the stack is the output cursor)
	jmp	.val_loop		; }

.finished:
	mov	[ebp + 16], esi		; br->consumed_words = cwords;
	mov	[ebp + 20], ecx		; br->consumed_bits = cbits;
	mov	eax, 1			; return true
.end:
	; common epilogue; eax already holds the return value (1 from .finished, 0 from a failed read)
	add	esp, 4			; drop 'ucbits'
	pop	edi
	pop	esi
	pop	ebx
	pop	ebp
	ret

; NOTE(review): this bare word is not inside any macro or conditional; it
; presumably assembles as a label named `end` (NASM may warn about a label
; without a trailing colon).  Confirm against the build before removing it.
end

; For ELF builds, emit an empty .note.GNU-stack section.  This is the
; conventional marker telling the GNU toolchain that the object does not need
; an executable stack.  OBJ_FORMAT_elf is not defined in this file; presumably
; it is supplied by the build system or nasm.h -- confirm there.
%ifdef OBJ_FORMAT_elf
	section .note.GNU-stack noalloc
%endif