1; LzmaDecOpt.asm -- ASM version of LzmaDec_DecodeReal_3() function
2; 2018-02-06: Igor Pavlov : Public domain
3;
4; 3 - is the code compatibility version of LzmaDec_DecodeReal_*()
5; function for check at link time.
6; That code is tightly coupled with LzmaDec_TryDummy()
7; and with other functions in the LzmaDec.c file.
8; CLzmaDec structure, (probs) array layout, input and output of
9; LzmaDec_DecodeReal_*() must be equal in both versions (C / ASM).
10
11ifndef x64
12; x64=1
13; .err <x64_IS_REQUIRED>
14endif
15
16include 7zAsm.asm
17
18MY_ASM_START
19
20_TEXT$LZMADECOPT SEGMENT ALIGN(64) 'CODE'
21
; MY_ALIGN num - emits an "align num" directive at the point of use.
22MY_ALIGN macro num:req
23        align  num
24endm
25
; MY_ALIGN_16 - pads the current location up to a 16-byte boundary.
26MY_ALIGN_16 macro
27        align  16
28endm
29
; MY_ALIGN_32 - pads the current location up to a 32-byte boundary.
30MY_ALIGN_32 macro
31        align  32
32endm
33
; MY_ALIGN_64 - pads the current location up to a 64-byte boundary.
34MY_ALIGN_64 macro
35        align  64
36endm
37
38
39; _LZMA_SIZE_OPT  equ 1
40
41; _LZMA_PROB32 equ 1
42
; Probability cell access.
; With _LZMA_PROB32 defined each probability is a dword (PSHIFT = 2),
; otherwise it is a word (PSHIFT = 1).
;   PLOAD  dest, mem - loads the probability at [mem] into dest
;                      (zero-extended with movzx in the 16-bit variant).
;   PSTORE src, mem  - stores src to [mem]; the 16-bit variant stores the
;                      register whose name is src with "_W" appended
;                      (e.g. x2 -> x2_W).
43ifdef _LZMA_PROB32
44        PSHIFT  equ 2
45        PLOAD macro dest, mem
46                mov     dest, dword ptr [mem]
47        endm
48        PSTORE  macro src, mem
49                mov     dword ptr [mem], src
50        endm
51else
52        PSHIFT  equ 1
53        PLOAD macro dest, mem
54                movzx   dest, word ptr [mem]
55        endm
56        PSTORE macro src, mem
57                mov     word ptr [mem], @CatStr(src, _W)
58        endm
59endif
60
; Byte multipliers derived from PSHIFT:
;   PMULT      - size in bytes of one probability cell
;   PMULT_HALF - half of PMULT
;   PMULT_2    - twice PMULT
61PMULT           equ (1 SHL PSHIFT)
62PMULT_HALF      equ (1 SHL (PSHIFT - 1))
63PMULT_2         equ (1 SHL (PSHIFT + 1))
64
65
66;       x0      range
67;       x1      pbPos / (prob) TREE
68;       x2      probBranch / prm (MATCHED) / pbPos / cnt
69;       x3      sym
70;====== r4 ===  RSP
71;       x5      cod
72;       x6      t1 NORM_CALC / probs_state / dist
73;       x7      t0 NORM_CALC / prob2 IF_BIT_1
74;       x8      state
75;       x9      match (MATCHED) / sym2 / dist2 / lpMask_reg
76;       x10     kBitModelTotal_reg
77;       r11     probs
78;       x12     offs (MATCHED) / dic / len_temp
79;       x13     processedPos
80;       x14     bit (MATCHED) / dicPos
81;       r15     buf
82
83
; Register role aliases.
; The x<N> / r<N> / x<N>_L / x<N>_W names are not defined in this file.
; NOTE(review): presumably they are the 32-bit, 64-bit, 8-bit and 16-bit views
; of register N provided by 7zAsm.asm - confirm.
; Several aliases name the same register (for example x9 is match, sym2,
; dist2 and lpMask_reg), so those values cannot be live at the same time.
84cod     equ x5
85cod_L   equ x5_L
86range   equ x0
87state   equ x8
88state_R equ r8
89buf     equ r15
90processedPos equ x13
91kBitModelTotal_reg equ x10
92
93probBranch   equ x2
94probBranch_R equ r2
95probBranch_W equ x2_W
96
97pbPos   equ x1
98pbPos_R equ r1
99
100cnt     equ x2
101cnt_R   equ r2
102
103lpMask_reg equ x9
104dicPos  equ r14
105
106sym     equ x3
107sym_R   equ r3
108sym_L   equ x3_L
109
110probs   equ r11
111dic     equ r12
112
113t0      equ x7
114t0_W    equ x7_W
115t0_R    equ r7
116
117prob2   equ t0
118prob2_W equ t0_W
119
120t1      equ x6
121t1_R    equ r6
122
123probs_state     equ t1
124probs_state_R   equ t1_R
125
126prm     equ r2
127match   equ x9
128match_R equ r9
129offs    equ x12
130offs_R  equ r12
131bit     equ x14
132bit_R   equ r14
133
134sym2    equ x9
135sym2_R  equ r9
136
137len_temp equ x12
138
139dist    equ sym
140dist2   equ x9
141
142
143
; Range-coder constants.
;   kBitModelTotal  = 1 SHL 11 : upper target used when a 0 bit is decoded
;   kBitModelOffset = 31       : lower target used by PUP when a 1 bit is decoded
;   kNumMoveBits    = 5        : shift applied to the probability adjustment
;   kTopValue       = 1 SHL 24 : NORM refills the coder when range is below this
144kNumBitModelTotalBits   equ 11
145kBitModelTotal          equ (1 SHL kNumBitModelTotalBits)
146kNumMoveBits            equ 5
147kBitModelOffset         equ ((1 SHL kNumMoveBits) - 1)
148kTopValue               equ (1 SHL 24)
149
; NORM_2 - unconditional range-coder refill.
; Shifts cod and range left by 8 bits, writes the byte at [buf] into cod_L
; and advances buf by one.
; NOTE(review): cod_L is x5_L, defined outside this file - presumably the low
; byte of cod, which the preceding shl has just cleared.
150NORM_2 macro
151        ; movzx   t0, BYTE PTR [buf]
152        shl     cod, 8
153        mov     cod_L, BYTE PTR [buf]
154        shl     range, 8
155        ; or      cod, t0
156        inc     buf
157endm
158
159
; NORM - refill the range coder (NORM_2) only when range < kTopValue.
; Clobbers flags. May modify cod, range and buf (through NORM_2).
; The skip target is a macro-LOCAL label instead of an anonymous @@ label:
; an @@ emitted by a macro expansion becomes the nearest anonymous label for
; the surrounding code, so a caller's own @F / @B reference placed around a
; NORM expansion would silently bind to it. The generated instructions are
; the same as before.
160NORM macro
        LOCAL   norm_done
161        cmp     range, kTopValue
162        jae     SHORT norm_done
163        NORM_2
164norm_done:
165endm
166
167
168; ---------- Branch MACROS ----------
169
; UPDATE_0 - probability update for a decoded 0 bit (branch style).
; In:   probBranch = current probability (as loaded by CMP_COD)
;       kBitModelTotal_reg = kBitModelTotal
; Does: probBranch += (kBitModelTotal - probBranch) >> kNumMoveBits
;       and stores it at [probOffset + probsArray + probDisp * PMULT].
; range is not touched here: CMP_COD already left the bound in it.
; Clobbers prob2 (t0) and flags.
170UPDATE_0 macro probsArray:req, probOffset:req, probDisp:req
171        mov     prob2, kBitModelTotal_reg
172        sub     prob2, probBranch
173        shr     prob2, kNumMoveBits
174        add     probBranch, prob2
175        PSTORE  probBranch, probOffset * 1 + probsArray + probDisp * PMULT
176endm
177
178
; UPDATE_1 - coder and probability update for a decoded 1 bit (branch style).
; In:   prob2 = range before the split, range = bound, probBranch = probability
;       (the state CMP_COD leaves behind)
; Does: range = prob2 - bound;  cod -= bound;
;       stores probBranch - (probBranch >> kNumMoveBits) at
;       [probOffset + probsArray + probDisp * PMULT].
; On exit probBranch holds (probability >> kNumMoveBits) and prob2 the new
; probability. Clobbers flags.
179UPDATE_1 macro probsArray:req, probOffset:req, probDisp:req
180        sub     prob2, range
181        sub     cod, range
182        mov     range, prob2
183        mov     prob2, probBranch
184        shr     probBranch, kNumMoveBits
185        sub     prob2, probBranch
186        PSTORE  prob2, probOffset * 1 + probsArray + probDisp * PMULT
187endm
188
189
; CMP_COD - first half of a branch-style bit decode.
; Loads the probability at [probOffset + probsArray + probDisp * PMULT] into
; probBranch, normalizes, then:
;       prob2 = range
;       range = (range >> kNumBitModelTotalBits) * probBranch   ; the bound
;       flags = cmp cod, bound
; Below (CF = 1) means the bit is 0; above-or-equal means the bit is 1.
; The caller finishes with UPDATE_0 or UPDATE_1.
190CMP_COD macro probsArray:req, probOffset:req, probDisp:req
191        PLOAD   probBranch, probOffset * 1 + probsArray + probDisp * PMULT
192        NORM
193        mov     prob2, range
194        shr     range, kNumBitModelTotalBits
195        imul    range, probBranch
196        cmp     cod, range
197endm
198
199
; IF_BIT_1_NOUP - runs CMP_COD and jumps to toLabel when the bit is 1
; (cod >= bound). No probability update is done on either path; the code at
; toLabel and on the fall-through path must apply UPDATE_1 / UPDATE_0.
200IF_BIT_1_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
201        CMP_COD probsArray, probOffset, probDisp
202        jae     toLabel
203endm
204
205
; IF_BIT_1 - decodes one bit with CMP_COD.
;   bit == 1: jumps to toLabel with nothing updated yet (the target is
;             expected to run UPDATE_1 for the same probability).
;   bit == 0: falls through after UPDATE_0 has been applied.
206IF_BIT_1 macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
        CMP_COD probsArray, probOffset, probDisp
        jae     toLabel
208        UPDATE_0 probsArray, probOffset, probDisp
209endm
210
211
; IF_BIT_0_NOUP - runs CMP_COD and jumps to toLabel when the bit is 0
; (cod < bound). No probability update is done on either path.
212IF_BIT_0_NOUP macro probsArray:req, probOffset:req, probDisp:req, toLabel:req
213        CMP_COD probsArray, probOffset, probDisp
214        jb      toLabel
215endm
216
217
218; ---------- CMOV MACROS ----------
219
; NORM_CALC - first half of a branch-free (cmov style) bit decode.
; In:   prob = probability of a 0 bit
; Out:  range = bound = (range >> kNumBitModelTotalBits) * prob
;       t0    = old range - bound      (the range to use if the bit is 1)
;       t1    = old cod                (the cod to restore if the bit is 0)
;       cod   = old cod - bound
;       CF    = 1 when old cod < bound, i.e. the bit is 0
; The users below select with "cmovae range, t0" / "cmovb cod, t1" and keep
; CF alive until PUP_SUB's sbb, so only flag-preserving instructions
; (mov, movzx, cmov, lea) may be placed in between.
220NORM_CALC macro prob:req
221        NORM
222        mov     t0, range
223        shr     range, kNumBitModelTotalBits
224        imul    range, prob
225        sub     t0, range
226        mov     t1, cod
227        sub     cod, range
228endm
229
230
; PUP - probability update for the cmov-style decoders.
; In:   prob = old probability
;       t0   = target: kBitModelTotal after a 0 bit, kBitModelOffset after a 1 bit
; Does: stores prob + ((t0 - prob) sar kNumMoveBits) at [probPtr].
; The shift is arithmetic because (t0 - prob) is negative or zero for the
; 1-bit target. Clobbers t0 and flags.
231PUP macro prob:req, probPtr:req
232        sub     t0, prob
233       ; only sar works for both 16/32 bit prob modes
234        sar     t0, kNumMoveBits
235        add     t0, prob
236        PSTORE  t0, probPtr
237endm
238
239
; PUP_SUB - folds the decoded bit into sym, then updates the probability.
;       sym = sym - symSub - CF
; CF must still be the carry produced by NORM_CALC (1 = bit 0), so with
; symSub = -1 this yields sym + 1 for a 1 bit and sym for a 0 bit.
; The sbb comes first because PUP destroys the flags.
240PUP_SUB macro prob:req, probPtr:req, symSub:req
241        sbb     sym, symSub
242        PUP prob, probPtr
243endm
244
245
; PUP_COD - second half of a cmov-style bit decode (after NORM_CALC and the
; caller's "cmovae range, t0").
;       t0  = CF ? kBitModelTotal : kBitModelOffset    (update target for PUP)
;       cod = CF ? t1 (saved cod) : cod
;       t1  = sym                                      (sym before the sbb)
; and then PUP_SUB. t1 is reloaded with sym so that callers can name it in
; probPtr. All instructions before the sbb in PUP_SUB preserve CF.
246PUP_COD macro prob:req, probPtr:req, symSub:req
247        mov     t0, kBitModelOffset
248        cmovb   cod, t1
249        mov     t1, sym
250        cmovb   t0, kBitModelTotal_reg
251        PUP_SUB prob, probPtr, symSub
252endm
253
254
; BIT_0 - first step of a bit-tree decode rooted at probs.
; Loads prob = probs[1] and speculatively probNext = probs[2]; when the bit
; is 1 probNext is replaced by probs[3].
; Out:  sym = 2 + bit (2 - (-1) - CF), probNext = probs[sym],
;       probs[1] updated.
; "mov sym, 2" is a plain mov, so CF from NORM_CALC survives until the sbb.
; Clobbers t0, t1, flags.
255BIT_0 macro prob:req, probNext:req
256        PLOAD   prob, probs + 1 * PMULT
257        PLOAD   probNext, probs + 1 * PMULT_2
258
259        NORM_CALC prob
260
261        cmovae  range, t0
262        PLOAD   t0, probs + 1 * PMULT_2 + PMULT
263        cmovae  probNext, t0
264        mov     t0, kBitModelOffset
265        cmovb   cod, t1
266        cmovb   t0, kBitModelTotal_reg
267        mov     sym, 2
268        PUP_SUB prob, probs + 1 * PMULT, 0 - 1
269endm
270
271
; BIT_1 - middle step of a bit-tree decode.
; In:   sym = current node index m, prob = probs[m] (already loaded)
; Loads probNext = probs[2m] speculatively, doubles sym, decodes the bit and,
; when it is 1, replaces probNext with probs[2m + 1].
; Out:  sym = 2m + bit, probNext = probs[sym], probs[m] updated.
; PUP_COD sets t1 = 2m, so "probs + t1_R * PMULT_HALF" is the address of
; probs[m].
272BIT_1 macro prob:req, probNext:req
273        PLOAD   probNext, probs + sym_R * PMULT_2
274        add     sym, sym
275
276        NORM_CALC prob
277
278        cmovae  range, t0
279        PLOAD   t0, probs + sym_R * PMULT + PMULT
280        cmovae  probNext, t0
281        PUP_COD prob, probs + t1_R * PMULT_HALF, 0 - 1
282endm
283
284
; BIT_2 - last step of a bit-tree decode; no next probability is fetched.
; In:   sym = node index m, prob = probs[m]
; Out:  sym = 2m - symSub - CF, i.e. (2m + bit) - (symSub + 1);
;       probs[m] updated.
; Callers choose symSub to strip the tree's leading 1 bit and to add any
; base value in the same instruction.
285BIT_2 macro prob:req, symSub:req
286        add     sym, sym
287
288        NORM_CALC prob
289
290        cmovae  range, t0
291        PUP_COD prob, probs + t1_R * PMULT_HALF, symSub
292endm
293
294
295; ---------- MATCHED LITERAL ----------
296
; LITM_0 - first bit of a "matched" literal (a literal decoded with the help
; of the match byte).
; In:   match = match byte (zero-extended), probs = literal probability base
; Does: offs  = 256 * PMULT
;       match <<= PSHIFT + 1, which lines bit 7 of the byte up with offs
;       bit   = offs & match
;       x1    = probability at probs + 256*PMULT + bit + 1*PMULT (prm = its address)
; After the decode: offs = bit when the decoded bit is 1, otherwise
; offs with "bit" cleared (offs xor bit); match is doubled and copied to bit
; for the next step.
; Out:  sym = 2 + decoded bit (0 - (-3) - CF).
; "mov sym, 0" must stay a mov: xor would destroy CF needed by the sbb.
297LITM_0 macro
298        mov     offs, 256 * PMULT
299        shl     match, (PSHIFT + 1)
300        mov     bit, offs
301        and     bit, match
302        PLOAD   x1, probs + 256 * PMULT + bit_R * 1 + 1 * PMULT
303        lea     prm, [probs + 256 * PMULT + bit_R * 1 + 1 * PMULT]
304        ; lea     prm, [probs + 256 * PMULT + 1 * PMULT]
305        ; add     prm, bit_R
306        xor     offs, bit
307        add     match, match
308
309        NORM_CALC x1
310
311        cmovae  offs, bit
312        mov     bit, match
313        cmovae  range, t0
314        mov     t0, kBitModelOffset
315        cmovb   cod, t1
316        cmovb   t0, kBitModelTotal_reg
317        mov     sym, 0
318        PUP_SUB x1, prm, -2-1
319endm
320
321
; LITM - middle bit of a matched literal.
; In:   sym = node index m, bit = shifted match byte, offs = current mask
; Does: bit &= offs;  prm = probs + offs + bit;  x1 = [prm + m * PMULT]
;       then decodes the bit. offs becomes "bit" for a decoded 1, or
;       offs xor bit for a decoded 0; match is doubled and copied to bit.
; Out:  sym = 2m + decoded bit; the probability at prm + m * PMULT is updated
;       (PUP_COD leaves t1 = 2m, hence the PMULT_HALF scale).
322LITM macro
323        and     bit, offs
324        lea     prm, [probs + offs_R * 1]
325        add     prm, bit_R
326        PLOAD   x1, prm + sym_R * PMULT
327        xor     offs, bit
328        add     sym, sym
329        add     match, match
330
331        NORM_CALC x1
332
333        cmovae  offs, bit
334        mov     bit, match
335        cmovae  range, t0
336        PUP_COD x1, prm + t1_R * PMULT_HALF, - 1
337endm
338
339
; LITM_2 - last bit of a matched literal. Same probability addressing as
; LITM, but offs / match are not advanced because no further bit follows.
; Out:  sym = 2m + decoded bit - 256 (symSub = 256 - 1 strips the leading 1).
340LITM_2 macro
341        and     bit, offs
342        lea     prm, [probs + offs_R * 1]
343        add     prm, bit_R
344        PLOAD   x1, prm + sym_R * PMULT
345        add     sym, sym
346
347        NORM_CALC x1
348
349        cmovae  range, t0
350        PUP_COD x1, prm + t1_R * PMULT_HALF, 256 - 1
351endm
352
353
354; ---------- REVERSE BITS ----------
355
; REV_0 - first bit of the reverse bit-tree decode rooted at probs.
; In:   prob   = probs[1], already loaded by the caller
;       sym2_R = probs + 2 * PMULT, set up by the caller
; Loads probNext = [sym2_R] speculatively; for a decoded 1 bit both probNext
; and sym2_R move to probs[3].
; Out:  sym2_R = address of the next node, probNext = its probability,
;       probs[1] updated. sym is not modified. Clobbers t0, t1, flags.
356REV_0 macro prob:req, probNext:req
357        ; PLOAD   prob, probs + 1 * PMULT
358        ; lea     sym2_R, [probs + 2 * PMULT]
359        ; PLOAD   probNext, probs + 2 * PMULT
360        PLOAD   probNext, sym2_R
361
362        NORM_CALC prob
363
364        cmovae  range, t0
365        PLOAD   t0, probs + 3 * PMULT
366        cmovae  probNext, t0
367        cmovb   cod, t1
368        mov     t0, kBitModelOffset
369        cmovb   t0, kBitModelTotal_reg
370        lea     t1_R, [probs + 3 * PMULT]
371        cmovae  sym2_R, t1_R
372        PUP prob, probs + 1 * PMULT
373endm
374
375
; REV_1 - middle bit of the reverse bit-tree decode.
; In:   sym2_R = address of the current node, prob = its probability,
;       step   = node-index increment for this level
; Does: sym2_R += step * PMULT and loads probNext from there (the 0-bit child);
;       for a decoded 1 bit sym2_R advances by another step * PMULT and
;       probNext is taken from that address instead.
; t1_R always ends as (entry sym2_R + 2 * step * PMULT), so
; "t1_R - step * PMULT_2" is the address of the probability just decoded.
376REV_1 macro prob:req, probNext:req, step:req
377        add     sym2_R, step * PMULT
378        PLOAD   probNext, sym2_R
379
380        NORM_CALC prob
381
382        cmovae  range, t0
383        PLOAD   t0, sym2_R + step * PMULT
384        cmovae  probNext, t0
385        cmovb   cod, t1
386        mov     t0, kBitModelOffset
387        cmovb   t0, kBitModelTotal_reg
388        lea     t1_R, [sym2_R + step * PMULT]
389        cmovae  sym2_R, t1_R
390        PUP prob, t1_R - step * PMULT_2
391endm
392
393
; REV_2 - last bit of the reverse bit-tree decode.
; Converts sym2_R from an address back into a node index
; ((sym2_R - probs) >> PSHIFT), ORs that index into sym, decodes the bit and
; subtracts step from sym when the bit is 0.
; The probability at probs + index * PMULT (prob, preloaded) is updated.
; Clobbers t0, t1, flags.
394REV_2 macro prob:req, step:req
395        sub     sym2_R, probs
396        shr     sym2, PSHIFT
397        or      sym, sym2
398
399        NORM_CALC prob
400
401        cmovae  range, t0
402        lea     t0, [sym - step]
403        cmovb   sym, t0
404        cmovb   cod, t1
405        mov     t0, kBitModelOffset
406        cmovb   t0, kBitModelTotal_reg
407        PUP prob, probs + sym2_R * PMULT
408endm
409
410
; REV_1_VAR - one bit of a reverse bit-tree decode with a run-time step.
; In:   sym_R  = address of the current probability
;       sym2_R = current step in bytes
; Does: loads prob from [sym_R]; keeps that address in probs (so probs no
;       longer holds the table base - the caller reloads it afterwards);
;       sym_R += step for a 0 bit, sym_R += 2 * step for a 1 bit;
;       step is doubled; the probability is updated at the saved address.
; Clobbers t0, t1, flags.
411REV_1_VAR macro prob:req
412        PLOAD   prob, sym_R
413        mov     probs, sym_R
414        add     sym_R, sym2_R
415
416        NORM_CALC prob
417
418        cmovae  range, t0
419        lea     t0_R, [sym_R + sym2_R]
420        cmovae  sym_R, t0_R
421        mov     t0, kBitModelOffset
422        cmovb   cod, t1
423        ; mov     t1, kBitModelTotal
424        ; cmovb   t0, t1
425        cmovb   t0, kBitModelTotal_reg
426        add     sym2, sym2
427        PUP prob, probs
428endm
429
430
431
432
; LIT_PROBS - selects the literal probability table and completes the
; pending IsMatch "bit 0" update.
; In:   sym = previous output byte, probs = table base,
;       probs_state_R / pbPos_R as prepared by IsMatchBranch_Pre,
;       probBranch = IsMatch probability (from the preceding CMP_COD),
;       lpMaskParam = literal position mask (register or memory operand)
; Does: sym   = ((processedPos << 8) + sym) & lpMaskParam
;       probs += Literal * PMULT + ((3 * sym) << lc2)   ; lc2 = lc + PSHIFT
;       UPDATE_0 on the IsMatch probability at probs_state_R + pbPos_R
;       processedPos += 1
; pbPos_R is added into probs_state_R first because x1 (= pbPos) is reused
; for the shift count. Clobbers t0, x1, sym, flags.
433LIT_PROBS macro lpMaskParam:req
434        ; prob += (UInt32)3 * ((((processedPos << 8) + dic[(dicPos == 0 ? dicBufSize : dicPos) - 1]) & lpMask) << lc);
435        mov     t0, processedPos
436        shl     t0, 8
437        add     sym, t0
438        and     sym, lpMaskParam
439        add     probs_state_R, pbPos_R
440        mov     x1, LOC lc2
441        lea     sym, dword ptr[sym_R + 2 * sym_R]
442        add     probs, Literal * PMULT
443        shl     sym, x1_L
444        add     probs, sym_R
445        UPDATE_0 probs_state_R, 0, IsMatch
446        inc     processedPos
447endm
448
449
450
; Model sizes and the layout of the probability array.
; All offsets below are in probability units; code multiplies them by PMULT
; to get byte offsets. They are relative to a base that lies kStartOffset
; entries into the array (SpecPos = -kStartOffset is the first table), which
; is why kAlign is required to be exactly 0.
; NOTE(review): the probs_1664 field name suggests this biased base is
; supplied by the C side - confirm in LzmaDec.c.
451kNumPosBitsMax          equ 4
452kNumPosStatesMax        equ (1 SHL kNumPosBitsMax)
453
454kLenNumLowBits          equ 3
455kLenNumLowSymbols       equ (1 SHL kLenNumLowBits)
456kLenNumHighBits         equ 8
457kLenNumHighSymbols      equ (1 SHL kLenNumHighBits)
458kNumLenProbs            equ (2 * kLenNumLowSymbols * kNumPosStatesMax + kLenNumHighSymbols)
459
; Offsets inside one length coder (LenCoder / RepLenCoder).
460LenLow                  equ 0
461LenChoice               equ LenLow
462LenChoice2              equ (LenLow + kLenNumLowSymbols)
463LenHigh                 equ (LenLow + 2 * kLenNumLowSymbols * kNumPosStatesMax)
464
465kNumStates              equ 12
466kNumStates2             equ 16
467kNumLitStates           equ 7
468
469kStartPosModelIndex     equ 4
470kEndPosModelIndex       equ 14
471kNumFullDistances       equ (1 SHL (kEndPosModelIndex SHR 1))
472
473kNumPosSlotBits         equ 6
474kNumLenToPosStates      equ 4
475
476kNumAlignBits           equ 4
477kAlignTableSize         equ (1 SHL kNumAlignBits)
478
479kMatchMinLen            equ 2
480kMatchSpecLenStart      equ (kMatchMinLen + kLenNumLowSymbols * 2 + kLenNumHighSymbols)
481
; Table offsets, each one starting where the previous table ends.
482kStartOffset    equ 1664
483SpecPos         equ (-kStartOffset)
484IsRep0Long      equ (SpecPos + kNumFullDistances)
485RepLenCoder     equ (IsRep0Long + (kNumStates2 SHL kNumPosBitsMax))
486LenCoder        equ (RepLenCoder + kNumLenProbs)
487IsMatch         equ (LenCoder + kNumLenProbs)
488kAlign          equ (IsMatch + (kNumStates2 SHL kNumPosBitsMax))
489IsRep           equ (kAlign + kAlignTableSize)
490IsRepG0         equ (IsRep + kNumStates)
491IsRepG1         equ (IsRepG0 + kNumStates)
492IsRepG2         equ (IsRepG1 + kNumStates)
493PosSlot         equ (IsRepG2 + kNumStates)
494Literal         equ (PosSlot + (kNumLenToPosStates SHL kNumPosSlotBits))
495NUM_BASE_PROBS  equ (Literal + kStartOffset)
496
; Assembly-time checks: stop the build if the layout arithmetic above no
; longer produces kAlign = 0 and NUM_BASE_PROBS = 1984.
497if kAlign ne 0
498  .err <Stop_Compiling_Bad_LZMA_kAlign>
499endif
500
501if NUM_BASE_PROBS ne 1984
502  .err <Stop_Compiling_Bad_LZMA_PROBS>
503endif
504
505
506PTR_FIELD equ dq ?
507
; CLzmaDec_Asm - layout of the decoder state object passed in PARAM_lzma and
; accessed through GLOB / GLOB_2. Per the note at the top of this file it
; must match the CLzmaDec structure used by the C code.
; Fields with the _Spec suffix are copied into registers or the local frame
; on entry; the procedure writes dicPos, buf, range, code, state,
; processedPos, remainLen and rep0..rep3 back before returning.
508CLzmaDec_Asm struct
509        lc      db ?
510        lp      db ?
511        pb      db ?
512        _pad_   db ?
513        dicSize dd ?
514
515        probs_Spec      PTR_FIELD
516        probs_1664      PTR_FIELD
517        dic_Spec        PTR_FIELD
518        dicBufSize      PTR_FIELD
519        dicPos_Spec     PTR_FIELD
520        buf_Spec        PTR_FIELD
521
522        range_Spec      dd ?
523        code_Spec       dd ?
524        processedPos_Spec  dd ?
525        checkDicSize    dd ?
526        rep0    dd ?
527        rep1    dd ?
528        rep2    dd ?
529        rep3    dd ?
530        state_Spec      dd ?
531        remainLen dd ?
532CLzmaDec_Asm ends
533
534
; CLzmaDec_Asm_Loc - layout of the procedure's local frame, addressed through
; LOC (based on RSP) and LOC_0 (based on r0).
;   Old_RSP      caller's RSP, restored just before the epilogue
;   lzmaPtr      saved PARAM_lzma
;   dicBufSize, probs_Spec, dic_Spec, checkDicSize
;                copies of the matching decoder-state values
;                (probs_Spec is loaded from the probs_1664 field)
;   limit        absolute output limit (dic + PARAM_limit)
;   bufLimit     input limit (PARAM_bufLimit)
;   lc2          lc + PSHIFT
;   lpMask       (0x100 << lp) - (0x100 >> lc)
;   pbMask       (1 << pb) - 1
;   remainLen    match length still to be copied; written back on exit
;   dicPos_Spec  spill slot for dicPos
;   rep0..rep3   the four most recent match distances
; The first field is spelled Old_RSP, exactly as the code references it
; ("LOC_0 Old_RSP" / "LOC Old_RSP"). It was declared as OLD_RSP, which only
; assembles when identifier case is folded.
535CLzmaDec_Asm_Loc struct
536        Old_RSP    PTR_FIELD
537        lzmaPtr    PTR_FIELD
538        _pad0_     PTR_FIELD
539        _pad1_     PTR_FIELD
540        _pad2_     PTR_FIELD
541        dicBufSize PTR_FIELD
542        probs_Spec PTR_FIELD
543        dic_Spec   PTR_FIELD
544
545        limit      PTR_FIELD
546        bufLimit   PTR_FIELD
547        lc2       dd ?
548        lpMask    dd ?
549        pbMask    dd ?
550        checkDicSize   dd ?
551
552        _pad_     dd ?
553        remainLen dd ?
554        dicPos_Spec     PTR_FIELD
555        rep0      dd ?
556        rep1      dd ?
557        rep2      dd ?
558        rep3      dd ?
559CLzmaDec_Asm_Loc ends
560
561
; Field accessors (text macros completed by the field name that follows them):
;   GLOB_2 - CLzmaDec_Asm field addressed through sym_R (used during setup)
;   GLOB   - CLzmaDec_Asm field addressed through r1    (used on exit)
;   LOC_0  - CLzmaDec_Asm_Loc field addressed through r0 (used during setup)
;   LOC    - CLzmaDec_Asm_Loc field addressed through RSP
562GLOB_2  equ [sym_R].CLzmaDec_Asm.
563GLOB    equ [r1].CLzmaDec_Asm.
564LOC_0   equ [r0].CLzmaDec_Asm_Loc.
565LOC     equ [RSP].CLzmaDec_Asm_Loc.
566
567
; COPY_VAR name - copies field "name" from the decoder state (GLOB_2, via
; sym_R) into the same-named field of the local frame (LOC_0, via r0).
; Clobbers t0.
568COPY_VAR macro name
569        mov     t0, GLOB_2 name
570        mov     LOC_0 name, t0
571endm
572
573
; RESTORE_VAR name - copies field "name" from the local frame (LOC, via RSP)
; back into the decoder state (GLOB, via r1). Clobbers t0.
574RESTORE_VAR macro name
575        mov     t0, LOC name
576        mov     GLOB name, t0
577endm
578
579
580
; IsMatchBranch_Pre - prepares the operands for the next IsMatch bit.
; Out:  pbPos         = (processedPos & pbMask) << (kLenNumLowBits + 1 + PSHIFT)
;       probs_state_R = probs + state_R
; state is kept pre-multiplied by PMULT, so both values are byte offsets.
; NOTE: as coded, the shifted term is the position state and state is added
; unshifted, unlike the formula in the comment below.
; The "reg" parameter is not used.
581IsMatchBranch_Pre macro reg
582        ; prob = probs + IsMatch + (state << kNumPosBitsMax) + posState;
583        mov     pbPos, LOC pbMask
584        and     pbPos, processedPos
585        shl     pbPos, (kLenNumLowBits + 1 + PSHIFT)
586        lea     probs_state_R, [probs + state_R]
587endm
588
589
; IsMatchBranch - IsMatchBranch_Pre followed by the IsMatch bit decode:
; jumps to IsMatch_label on a 1 bit, falls through (with UPDATE_0 applied)
; on a 0 bit. The "reg" parameter is not used. Only a commented-out
; reference to this macro is visible in this file.
590IsMatchBranch macro reg
591        IsMatchBranch_Pre
592        IF_BIT_1 probs_state_R, pbPos_R, IsMatch, IsMatch_label
593endm
594
595
; CheckLimits - leaves the decode loop through fin_OK when the input pointer
; has reached bufLimit or the output pointer has reached limit (unsigned
; compares). The "reg" parameter is not used.
596CheckLimits macro reg
597        cmp     buf, LOC bufLimit
598        jae     fin_OK
599        cmp     dicPos, LOC limit
600        jae     fin_OK
601endm
602
603
604
605; RSP is (16x + 8) bytes aligned in WIN64-x64
606; LocalSize equ ((((SIZEOF CLzmaDec_Asm_Loc) + 7) / 16 * 16) + 8)
607
608PARAM_lzma      equ REG_PARAM_0
609PARAM_limit     equ REG_PARAM_1
610PARAM_bufLimit  equ REG_PARAM_2
611
612; MY_ALIGN_64
; LzmaDec_DecodeReal_3 - entry point, three register parameters:
;   PARAM_lzma     pointer to the decoder state (CLzmaDec_Asm layout)
;   PARAM_limit    output limit as an offset into dic; made absolute below
;   PARAM_bufLimit input limit, compared against buf by CheckLimits
; The result is returned in x0 (set from sym at label fin: 0 = OK, 1 = error).
; NOTE(review): MY_PROC, MY_PUSH_PRESERVED_REGS and REG_PARAM_n are defined
; outside this file (7zAsm.asm is included above) - their exact expansion is
; not visible here.
613MY_PROC LzmaDec_DecodeReal_3, 3
614MY_PUSH_PRESERVED_REGS
615
        ; Carve the local frame below RSP, aligned down to 128 bytes, and keep
        ; the caller's RSP inside it.
616        lea     r0, [RSP - (SIZEOF CLzmaDec_Asm_Loc)]
617        and     r0, -128
618        mov     r5, RSP
619        mov     RSP, r0
620        mov     LOC_0 Old_RSP, r5
621        mov     LOC_0 lzmaPtr, PARAM_lzma
622
623        mov     LOC_0 remainLen, 0  ; remainLen must be ZERO
624
625        mov     LOC_0 bufLimit, PARAM_bufLimit
626        mov     sym_R, PARAM_lzma  ;  CLzmaDec_Asm pointer for GLOB_2
627        mov     dic, GLOB_2 dic_Spec
628        add     PARAM_limit, dic
629        mov     LOC_0 limit, PARAM_limit
630
631        COPY_VAR(rep0)
632        COPY_VAR(rep1)
633        COPY_VAR(rep2)
634        COPY_VAR(rep3)
635
        ; dicPos is kept as an absolute pointer (dic + position) while decoding.
636        mov     dicPos, GLOB_2 dicPos_Spec
637        add     dicPos, dic
638        mov     LOC_0 dicPos_Spec, dicPos
639        mov     LOC_0 dic_Spec, dic
640
        ; pbMask = (1 << pb) - 1
641        mov     x1_L, GLOB_2 pb
642        mov     t0, 1
643        shl     t0, x1_L
644        dec     t0
645        mov     LOC_0 pbMask, t0
646
647        ; unsigned pbMask = ((unsigned)1 << (p->prop.pb)) - 1;
648        ; unsigned lc = p->prop.lc;
649        ; unsigned lpMask = ((unsigned)0x100 << p->prop.lp) - ((unsigned)0x100 >> lc);
650
        ; lc2 = lc + PSHIFT (shift count used by LIT_PROBS);
        ; lpMask is kept both in the frame and in lpMask_reg.
651        mov     x1_L, GLOB_2 lc
652        mov     x2, 100h
653        mov     t0, x2
654        shr     x2, x1_L
655        ; inc     x1
656        add     x1_L, PSHIFT
657        mov     LOC_0 lc2, x1
658        mov     x1_L, GLOB_2 lp
659        shl     t0, x1_L
660        sub     t0, x2
661        mov     LOC_0 lpMask, t0
662        mov     lpMask_reg, t0
663
664        ; mov     probs, GLOB_2 probs_Spec
665        ; add     probs, kStartOffset SHL PSHIFT
666        mov     probs, GLOB_2 probs_1664
667        mov     LOC_0 probs_Spec, probs
668
669        mov     t0_R, GLOB_2 dicBufSize
670        mov     LOC_0 dicBufSize, t0_R
671
672        mov     x1, GLOB_2 checkDicSize
673        mov     LOC_0 checkDicSize, x1
674
675        mov     processedPos, GLOB_2 processedPos_Spec
676
        ; state is held pre-multiplied by PMULT so it can be used directly as
        ; a byte offset into the probability tables.
677        mov     state, GLOB_2 state_Spec
678        shl     state, PSHIFT
679
680        mov     buf,   GLOB_2 buf_Spec
681        mov     range, GLOB_2 range_Spec
682        mov     cod,   GLOB_2 code_Spec
683        mov     kBitModelTotal_reg, kBitModelTotal
        ; sym = previous output byte; stays 0 when nothing has been decoded yet.
684        xor     sym, sym
685
686        ; if (processedPos != 0 || checkDicSize != 0)
687        or      x1, processedPos
688        jz      @f
689
        ; t0_R still holds dicBufSize: read the byte before dicPos, or the
        ; last byte of the buffer when dicPos is at the start of dic.
690        add     t0_R, dic
691        cmp     dicPos, dic
692        cmovnz  t0_R, dicPos
693        movzx   sym, byte ptr[t0_R - 1]
694
695@@:
        ; Enter the main loop at the IsMatch test that corresponds to the
        ; saved state: state < 4, state < kNumLitStates, or a match state.
696        IsMatchBranch_Pre
697        cmp     state, 4 * PMULT
698        jb      lit_end
699        cmp     state, kNumLitStates * PMULT
700        jb      lit_matched_end
701        jmp     lz_end
702
703
704
705
706; ---------- LITERAL ----------
; Plain literal. lit_start is reached from lit_end when the IsMatch bit is 0
; and resets state to 0; lit_start_2 is entered from lit_matched_end with
; state and lpMask_reg already prepared by that code.
; The eight bits are decoded through the bit tree (BIT_0, six BIT_1, BIT_2);
; BIT_2 subtracts 256 - 1 so that sym ends as the byte value.
707MY_ALIGN_64
708lit_start:
709        xor     state, state
710lit_start_2:
711        LIT_PROBS lpMask_reg
712
713    ifdef _LZMA_SIZE_OPT
714
715        PLOAD   x1, probs + 1 * PMULT
716        mov     sym, 1
717MY_ALIGN_16
718lit_loop:
719        BIT_1   x1, x2
720        mov     x1, x2
721        cmp     sym, 127
722        jbe     lit_loop
723
724    else
725
726        BIT_0   x1, x2
727        BIT_1   x2, x1
728        BIT_1   x1, x2
729        BIT_1   x2, x1
730        BIT_1   x1, x2
731        BIT_1   x2, x1
732        BIT_1   x1, x2
733
734    endif
735
736        BIT_2   x2, 256 - 1
737
        ; probs was advanced into the literal table; reload the base, prepare
        ; the next IsMatch operands and emit the decoded byte.
738        ; mov     dic, LOC dic_Spec
739        mov     probs, LOC probs_Spec
740        IsMatchBranch_Pre
741        mov     byte ptr[dicPos], sym_L
742        inc     dicPos
743
744        CheckLimits
745lit_end:
        ; IsMatch bit: 0 -> another literal, 1 -> fall through to IsMatch_label.
746        IF_BIT_0_NOUP probs_state_R, pbPos_R, IsMatch, lit_start
747
748        ; jmp     IsMatch_label
749
750; ---------- MATCHES ----------
751; MY_ALIGN_32
; Reached with the IsMatch bit decoded as 1 but not yet applied, hence the
; UPDATE_1 first. The IsRep bit then selects a repeated distance
; (IsRep_label) or a new match (fall through).
752IsMatch_label:
753        UPDATE_1 probs_state_R, pbPos_R, IsMatch
754        IF_BIT_1 probs_state_R, 0, IsRep, IsRep_label
755
        ; New match: use LenCoder and add kNumStates to state. After the
        ; length is decoded, state >= kNumStates means "decode a distance".
756        add     probs, LenCoder * PMULT
757        add     state, kNumStates * PMULT
758
759; ---------- LEN DECODE ----------
; In:  probs = base of the length coder in use (LenCoder or RepLenCoder).
; Out: sym   = match length with kMatchMinLen already included.
; len_temp is the value the final BIT_2 subtracts; it is chosen per length
; range (low / mid / high) so that the tree result maps straight to the length.
760len_decode:
761        mov     len_temp, 8 - 1 - kMatchMinLen
762        IF_BIT_0_NOUP probs, 0, 0, len_mid_0
763        UPDATE_1 probs, 0, 0
764        add     probs, (1 SHL (kLenNumLowBits + PSHIFT))
765        mov     len_temp, -1 - kMatchMinLen
766        IF_BIT_0_NOUP probs, 0, 0, len_mid_0
767        UPDATE_1 probs, 0, 0
768        add     probs, LenHigh * PMULT - (1 SHL (kLenNumLowBits + PSHIFT))
769        mov     sym, 1
770        PLOAD   x1, probs + 1 * PMULT
771
        ; High lengths: the first six tree levels run in a loop, the last two
        ; are shared with the low / mid path at len_mid_2.
772MY_ALIGN_32
773len8_loop:
774        BIT_1   x1, x2
775        mov     x1, x2
776        cmp     sym, 64
777        jb      len8_loop
778
779        mov     len_temp, (kLenNumHighSymbols - kLenNumLowSymbols * 2) - 1 - kMatchMinLen
780        jmp     len_mid_2
781
782MY_ALIGN_32
783len_mid_0:
        ; Low / mid lengths: a 3-bit tree selected by the position state.
784        UPDATE_0 probs, 0, 0
785        add     probs, pbPos_R
786        BIT_0   x2, x1
787len_mid_2:
788        BIT_1   x1, x2
789        BIT_2   x2, len_temp
790        mov     probs, LOC probs_Spec
        ; Rep matches set state to 8 or 11 (below kNumStates) and need no
        ; distance decode.
791        cmp     state, kNumStates * PMULT
792        jb      copy_match
793
794
795; ---------- DECODE DISTANCE ----------
796        ; probs + PosSlot + ((len < kNumLenToPosStates ? len : kNumLenToPosStates - 1) << kNumPosSlotBits);
797
        ; sym holds len + kMatchMinLen, so the clamp uses 3 + kMatchMinLen and
        ; the constant added to probs removes the kMatchMinLen bias again.
798        mov     t0, 3 + kMatchMinLen
799        cmp     sym, 3 + kMatchMinLen
800        cmovb   t0, sym
801        add     probs, PosSlot * PMULT - (kMatchMinLen SHL (kNumPosSlotBits + PSHIFT))
802        shl     t0, (kNumPosSlotBits + PSHIFT)
803        add     probs, t0_R
804
805        ; sym = Len
806        ; mov     LOC remainLen, sym
        ; The length is parked in len_temp while sym is used for the distance.
807        mov     len_temp, sym
808
809    ifdef _LZMA_SIZE_OPT
810
811        PLOAD   x1, probs + 1 * PMULT
812        mov     sym, 1
813MY_ALIGN_16
814slot_loop:
815        BIT_1   x1, x2
816        mov     x1, x2
817        cmp     sym, 32
818        jb      slot_loop
819
820    else
821
822        BIT_0   x1, x2
823        BIT_1   x2, x1
824        BIT_1   x1, x2
825        BIT_1   x2, x1
826        BIT_1   x1, x2
827
828    endif
829
        ; x1 = 32 + (posSlot >> 1), captured before the last slot bit;
        ; after BIT_2, sym = posSlot (0..63).
830        mov     x1, sym
831        BIT_2   x2, 64-1
832
833        and     sym, 3
834        mov     probs, LOC probs_Spec
        ; Slots below kEndPosModelIndex are handled at short_dist.
835        cmp     x1, 32 + kEndPosModelIndex / 2
836        jb      short_dist
837
838        ;  unsigned numDirectBits = (unsigned)(((distance >> 1) - 1));
        ; x1 = number of direct bits excluding the kNumAlignBits align bits.
839        sub     x1, (32 + 1 + kNumAlignBits)
840        ;  distance = (2 | (distance & 1));
841        or      sym, 2
        ; Preload for the align-bit decode at direct_end: x2 = probs[1]
        ; (kAlign = 0, so this is the align table) and sym2_R -> probs[2].
842        PLOAD   x2, probs + 1 * PMULT
843        shl     sym, kNumAlignBits + 1
844        lea     sym2_R, [probs + 2 * PMULT]
845
846        jmp     direct_norm
847        ; lea     t1, [sym_R + (1 SHL kNumAlignBits)]
848        ; cmp     range, kTopValue
849        ; jb      direct_norm
850
851; ---------- DIRECT DISTANCE ----------
; Decodes x1 fixed-probability bits. Each iteration halves range and
; subtracts it from cod; if the result is negative (sign flag) cod is
; restored and the bit is 0, otherwise sym takes t1 = sym + (1 SHL
; kNumAlignBits), i.e. the bit is 1. sym is doubled between iterations.
; Entry is at direct_norm, which prepares t1 and normalizes first.
852MY_ALIGN_32
853direct_loop:
854        shr     range, 1
855        mov     t0, cod
856        sub     cod, range
857        cmovs   cod, t0
858        cmovns  sym, t1
859
860        comment ~
861        sub     cod, range
862        mov     x2, cod
863        sar     x2, 31
864        lea     sym, dword ptr [r2 + sym_R * 2 + 1]
865        and     x2, range
866        add     cod, x2
867        ~
868        dec     x1
869        je      direct_end
870
871        add     sym, sym
872direct_norm:
873        lea     t1, [sym_R + (1 SHL kNumAlignBits)]
874        cmp     range, kTopValue
875        jae     near ptr direct_loop
876        ; we align for 32 here with "near ptr" command above
877        NORM_2
878        jmp     direct_loop
879
880MY_ALIGN_32
881direct_end:
882        ;  prob =  + kAlign;
883        ;  distance <<= kNumAlignBits;
        ; The kNumAlignBits low bits are decoded with the reverse bit tree
        ; (x2 and sym2_R were preloaded before the direct-bit loop).
884        REV_0   x2, x1
885        REV_1   x1, x2, 2
886        REV_1   x2, x1, 4
887        REV_2   x1, 8
888
; In: sym = decoded distance, len_temp = match length.
; A distance outside the valid window goes to end_of_payload, which tells
; the end marker (0FFFFFFFFh) apart from corrupt data.
889decode_dist_end:
890
891        ; if (distance >= (checkDicSize == 0 ? processedPos: checkDicSize))
892
893        mov     t0, LOC checkDicSize
894        test    t0, t0
895        cmove   t0, processedPos
896        cmp     sym, t0
897        jae     end_of_payload
898
899        ; rep3 = rep2;
900        ; rep2 = rep1;
901        ; rep1 = rep0;
902        ; rep0 = distance + 1;
903
904        inc     sym
905        mov     t0, LOC rep0
906        mov     t1, LOC rep1
907        mov     x1, LOC rep2
908        mov     LOC rep0, sym
909        ; mov     sym, LOC remainLen
        ; sym goes back to holding the match length for copy_match.
910        mov     sym, len_temp
911        mov     LOC rep1, t0
912        mov     LOC rep2, t1
913        mov     LOC rep3, x1
914
915        ; state = (state < kNumStates + kNumLitStates) ? kNumLitStates : kNumLitStates + 3;
        ; mov does not alter flags, so cmovae still sees the cmp result.
916        cmp     state, (kNumStates + kNumLitStates) * PMULT
917        mov     state, kNumLitStates * PMULT
918        mov     t0, (kNumLitStates + 3) * PMULT
919        cmovae  state, t0
920
921
922; ---------- COPY MATCH ----------
; In:  sym = match length, LOC rep0 = distance, dicPos = output pointer.
; Copies min(length, limit - dicPos) bytes; the remainder is stored in
; LOC remainLen. The last byte of the run is written at lz_end_match.
923copy_match:
924
925        ; len += kMatchMinLen;
926        ; add     sym, kMatchMinLen
927
928        ; if ((rem = limit - dicPos) == 0)
929        ; {
930        ;   p->dicPos = dicPos;
931        ;   return SZ_ERROR_DATA;
932        ; }
933        mov     cnt_R, LOC limit
934        sub     cnt_R, dicPos
935        jz      fin_ERROR
936
937        ; curLen = ((rem < len) ? (unsigned)rem : len);
938        cmp     cnt_R, sym_R
939        ; cmovae  cnt_R, sym_R ; 64-bit
940        cmovae  cnt, sym ; 32-bit
941
942        mov     dic, LOC dic_Spec
943        mov     x1, LOC rep0
944
        ; t0_R = old dicPos (turned into the source position below);
        ; dicPos advances to the end of the copied run.
945        mov     t0_R, dicPos
946        add     dicPos, cnt_R
947        ; processedPos += curLen;
948        add     processedPos, cnt
949        ; len -= curLen;
950        sub     sym, cnt
951        mov     LOC remainLen, sym
952
953        sub     t0_R, dic
954
955        ; pos = dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0);
956        sub     t0_R, r1
957        jae     @f
958
        ; The source wrapped: r1 becomes the number of bytes left before the
        ; end of the buffer; a longer copy must use copy_match_cross.
959        mov     r1, LOC dicBufSize
960        add     t0_R, r1
961        sub     r1, t0_R
962        cmp     cnt_R, r1
963        ja      copy_match_cross
964@@:
965        ; if (curLen <= dicBufSize - pos)
966
967; ---------- COPY MATCH FAST ----------
968        ; Byte *dest = dic + dicPos;
969        ; mov     r1, dic
970        ; ptrdiff_t src = (ptrdiff_t)pos - (ptrdiff_t)dicPos;
971        ; sub   t0_R, dicPos
972        ; dicPos += curLen;
973
974        ; const Byte *lim = dest + curLen;
        ; First source byte goes into sym; t0_R becomes the source end and
        ; cnt_R the negated length, so the loop indexes upward toward zero.
975        add     t0_R, dic
976        movzx   sym, byte ptr[t0_R]
977        add     t0_R, cnt_R
978        neg     cnt_R
979        ; lea     r1, [dicPos - 1]
980copy_common:
981        dec     dicPos
982        ; cmp   LOC rep0, 1
983        ; je    rep0Label
984
985        ; t0_R - src_lim
986        ; r1 - dest_lim - 1
987        ; cnt_R - (-cnt)
988
989        IsMatchBranch_Pre
990        inc     cnt_R
991        jz      copy_end
        ; Each iteration stores the byte loaded by the previous one.
992MY_ALIGN_16
993@@:
994        mov     byte ptr[cnt_R * 1 + dicPos], sym_L
995        movzx   sym, byte ptr[cnt_R * 1 + t0_R]
996        inc     cnt_R
997        jnz     @b
998
999copy_end:
1000lz_end_match:
        ; Store the final byte; sym keeps it as the "previous byte".
1001        mov     byte ptr[dicPos], sym_L
1002        inc     dicPos
1003
1004        ; IsMatchBranch_Pre
1005        CheckLimits
1006lz_end:
        ; IsMatch bit after a match: 1 -> another match, 0 -> matched literal
        ; (the code that follows this test).
1007        IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
1008
1009
1010
1011; ---------- LITERAL MATCHED ----------
; Literal that follows a match; it is decoded with the byte at distance rep0
; (the match byte) as context. This code is entered by falling through from
; the IsMatch test at lz_end.
1012
1013        LIT_PROBS LOC lpMask
1014
1015        ; matchByte = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1016        mov     x1, LOC rep0
1017        ; mov     dic, LOC dic_Spec
        ; dicPos shares its register with "bit", so it is spilled to the frame
        ; for the duration of the LITM sequence.
1018        mov     LOC dicPos_Spec, dicPos
1019
1020        ; state -= (state < 10) ? 3 : 6;
1021        lea     t0, [state_R - 6 * PMULT]
1022        sub     state, 3 * PMULT
1023        cmp     state, 7 * PMULT
1024        cmovae  state, t0
1025
1026        sub     dicPos, dic
1027        sub     dicPos, r1
1028        jae     @f
1029        add     dicPos, LOC dicBufSize
1030@@:
1031        comment ~
1032        xor     t0, t0
1033        sub     dicPos, r1
1034        cmovb   t0_R, LOC dicBufSize
1035        ~
1036
1037        movzx   match, byte ptr[dic + dicPos * 1]
1038
1039    ifdef _LZMA_SIZE_OPT
1040
1041        mov     offs, 256 * PMULT
1042        shl     match, (PSHIFT + 1)
1043        mov     bit, match
1044        mov     sym, 1
1045MY_ALIGN_16
1046litm_loop:
1047        LITM
1048        cmp     sym, 256
1049        jb      litm_loop
1050        sub     sym, 256
1051
1052    else
1053
1054        LITM_0
1055        LITM
1056        LITM
1057        LITM
1058        LITM
1059        LITM
1060        LITM
1061        LITM_2
1062
1063    endif
1064
1065        mov     probs, LOC probs_Spec
1066        IsMatchBranch_Pre
1067        ; mov     dic, LOC dic_Spec
        ; Restore the output pointer spilled above and emit the literal.
1068        mov     dicPos, LOC dicPos_Spec
1069        mov     byte ptr[dicPos], sym_L
1070        inc     dicPos
1071
1072        CheckLimits
1073lit_matched_end:
1074        IF_BIT_1_NOUP probs_state_R, pbPos_R, IsMatch, IsMatch_label
1075        ; IsMatchBranch
        ; Bit 0: continue as a plain literal. lpMask_reg shares x9 with match,
        ; so it is reloaded, and state is lowered by 3 before joining lit_start_2.
1076        mov     lpMask_reg, LOC lpMask
1077        sub     state, 3 * PMULT
1078        jmp     lit_start_2
1079
1080
1081
1082; ---------- REP 0 LITERAL ----------
; "Short rep": a single byte copied from distance rep0. Reached from the
; IsRep0Long test with the bit decoded as 0 and not yet applied.
; The byte is loaded into sym and written by the shared code at lz_end_match.
1083MY_ALIGN_32
1084IsRep0Short_label:
1085        UPDATE_0 probs_state_R, pbPos_R, IsRep0Long
1086
1087        ; dic[dicPos] = dic[dicPos - rep0 + (dicPos < rep0 ? dicBufSize : 0)];
1088        mov     dic, LOC dic_Spec
1089        mov     t0_R, dicPos
1090        mov     probBranch, LOC rep0
1091        sub     t0_R, dic
1092
        ; Undo the "add probs, RepLenCoder * PMULT" done at IsRep_label.
1093        sub     probs, RepLenCoder * PMULT
1094        inc     processedPos
1095        ; state = state < kNumLitStates ? 9 : 11;
        ; state is 8 or 11 here (set at IsRep_label), so OR-ing in 1 gives 9 or 11.
1096        or      state, 1 * PMULT
1097        IsMatchBranch_Pre
1098
1099        sub     t0_R, probBranch_R
1100        jae     @f
1101        add     t0_R, LOC dicBufSize
1102@@:
1103        movzx   sym, byte ptr[dic + t0_R * 1]
1104        jmp     lz_end_match
1105
1106
; Repeated-distance match. The IsRepG0 / IsRepG1 / IsRepG2 bits select which
; of rep0..rep3 is used; the chosen distance is moved to rep0, the others
; shift down, and control goes to len_decode with probs at RepLenCoder.
; Each label is entered with its selecting bit decoded as 1 but not yet
; applied, hence the leading UPDATE_1.
1107MY_ALIGN_32
1108IsRep_label:
1109        UPDATE_1 probs_state_R, 0, IsRep
1110
1111        ; The (checkDicSize == 0 && processedPos == 0) case was checked before in LzmaDec.c with kBadRepCode.
1112        ; So we don't check it here.
1113
1114        ; mov     t0, processedPos
1115        ; or      t0, LOC checkDicSize
1116        ; jz      fin_ERROR_2
1117
1118        ; state = state < kNumLitStates ? 8 : 11;
1119        cmp     state, kNumLitStates * PMULT
1120        mov     state, 8 * PMULT
1121        mov     probBranch, 11 * PMULT
1122        cmovae  state, probBranch
1123
1124        ; prob = probs + RepLenCoder;
1125        add     probs, RepLenCoder * PMULT
1126
        ; probs_state_R still refers to the state the match started from; it
        ; is not recomputed after the state change above.
1127        IF_BIT_1 probs_state_R, 0, IsRepG0, IsRepG0_label
1128        IF_BIT_0_NOUP probs_state_R, pbPos_R, IsRep0Long, IsRep0Short_label
1129        UPDATE_1 probs_state_R, pbPos_R, IsRep0Long
1130        jmp     len_decode
1131
1132MY_ALIGN_32
1133IsRepG0_label:
1134        UPDATE_1 probs_state_R, 0, IsRepG0
        ; dist = rep1, rep1 = old rep0 (dist2 keeps old rep0).
1135        mov     dist2, LOC rep0
1136        mov     dist, LOC rep1
1137        mov     LOC rep1, dist2
1138
1139        IF_BIT_1 probs_state_R, 0, IsRepG1, IsRepG1_label
1140        mov     LOC rep0, dist
1141        jmp     len_decode
1142
1143; MY_ALIGN_32
1144IsRepG1_label:
1145        UPDATE_1 probs_state_R, 0, IsRepG1
        ; dist2 = rep2, rep2 = old rep1 (dist keeps old rep1).
1146        mov     dist2, LOC rep2
1147        mov     LOC rep2, dist
1148
1149        IF_BIT_1 probs_state_R, 0, IsRepG2, IsRepG2_label
1150        mov     LOC rep0, dist2
1151        jmp     len_decode
1152
1153; MY_ALIGN_32
1154IsRepG2_label:
1155        UPDATE_1 probs_state_R, 0, IsRepG2
        ; rep0 = old rep3, rep3 = old rep2.
1156        mov     dist, LOC rep3
1157        mov     LOC rep3, dist2
1158        mov     LOC rep0, dist
1159        jmp     len_decode
1160
1161
1162
1163; ---------- SPEC SHORT DISTANCE ----------
; Position slots below kEndPosModelIndex.
; In:  x1 = 32 + (posSlot >> 1), sym = posSlot & 3.
; When x1 - 33 is zero or negative the slot itself is the distance.
; Otherwise x1 becomes the number of extra bits, which are decoded through
; the SpecPos tables with a reverse bit tree (REV_1_VAR).
1164
1165MY_ALIGN_32
1166short_dist:
1167        sub     x1, 32 + 1
1168        jbe     decode_dist_end
1169        or      sym, 2
1170        shl     sym, x1_L
        ; sym_R becomes the address of the first probability to use;
        ; sym2 is the step in bytes and doubles on every bit.
1171        lea     sym_R, [probs + sym_R * PMULT + SpecPos * PMULT + 1 * PMULT]
1172        mov     sym2, PMULT ; step
1173MY_ALIGN_32
1174spec_loop:
1175        REV_1_VAR x2
1176        dec     x1
1177        jnz     spec_loop
1178
        ; REV_1_VAR overwrote probs; reload it and convert sym_R from an
        ; address back into the distance value.
1179        mov     probs, LOC probs_Spec
1180        sub     sym, sym2
1181        sub     sym, SpecPos * PMULT
1182        sub     sym_R, probs
1183        shr     sym, PSHIFT
1184
1185        jmp     decode_dist_end
1186
1187
1188; ---------- COPY MATCH CROSS ----------
; Match copy whose source run wraps past the end of the buffer.
; The first loop copies from the source position up to dicBufSize; the rest,
; which starts at the beginning of dic, is done by the shared loop at
; copy_common.
1189copy_match_cross:
1190        ; t0_R - src pos
1191        ; r1 - len to dicBufSize
1192        ; cnt_R - total copy len
1193
1194        mov     t1_R, t0_R         ; srcPos
1195        mov     t0_R, dic
1196        mov     r1, LOC dicBufSize   ;
1197        neg     cnt_R
1198@@:
1199        movzx   sym, byte ptr[t1_R * 1 + t0_R]
1200        inc     t1_R
1201        mov     byte ptr[cnt_R * 1 + dicPos], sym_L
1202        inc     cnt_R
1203        cmp     t1_R, r1
1204        jne     @b
1205
        ; Continue from dic[0]: sym = first byte, and t0_R = dic - cnt_R is
        ; the source end pointer expected by copy_common (cnt_R is negative).
1206        movzx   sym, byte ptr[t0_R]
1207        sub     t0_R, cnt_R
1208        jmp     copy_common
1209
1210
1211
1212
; Exit paths. sym carries the result: 1 from fin_ERROR, 0 from fin_OK.
; fin_ERROR stores len_temp as remainLen before leaving.
1213fin_ERROR:
1214        mov     LOC remainLen, len_temp
1215; fin_ERROR_2:
1216        mov     sym, 1
1217        jmp     fin
1218
; Reached from decode_dist_end when the distance is outside the window.
; Only the value 0FFFFFFFFh is accepted as the end marker; remainLen is then
; set to kMatchSpecLenStart and the kNumStates added at match start is removed.
1219end_of_payload:
1220        cmp     sym, 0FFFFFFFFh ; -1
1221        jne     fin_ERROR
1222
1223        mov     LOC remainLen, kMatchSpecLenStart
1224        sub     state, kNumStates * PMULT
1225
1226fin_OK:
1227        xor     sym, sym
1228
1229fin:
        ; Normalize once more, then write the working state back to the
        ; decoder object.
1230        NORM
1231
1232        mov     r1, LOC lzmaPtr
1233
        ; dicPos goes back to being an offset from dic; state is un-scaled.
1234        sub     dicPos, LOC dic_Spec
1235        mov     GLOB dicPos_Spec, dicPos
1236        mov     GLOB buf_Spec, buf
1237        mov     GLOB range_Spec, range
1238        mov     GLOB code_Spec, cod
1239        shr     state, PSHIFT
1240        mov     GLOB state_Spec, state
1241        mov     GLOB processedPos_Spec, processedPos
1242
1243        RESTORE_VAR(remainLen)
1244        RESTORE_VAR(rep0)
1245        RESTORE_VAR(rep1)
1246        RESTORE_VAR(rep2)
1247        RESTORE_VAR(rep3)
1248
        ; Result goes into x0; x0 doubled as "range", which is already saved.
1249        mov     x0, sym
1250
1251        mov     RSP, LOC Old_RSP
1252
1253MY_POP_PRESERVED_REGS
1254MY_ENDP
1255
1256_TEXT$LZMADECOPT ENDS
1257
1258end
1259