1/* Intel SIMD MMX implementation of Viterbi ACS butterflies
2   for 256-state (k=9) convolutional code
3   Copyright 2004 Phil Karn, KA9Q
4   This code may be used under the terms of the GNU Lesser General Public License (LGPL)
5
6   void update_viterbi29_blk_mmx(struct v29 *vp,unsigned char *syms,int nbits);
7*/
8
9	# These are offsets into struct v29, defined in viterbi29.h
10	.set DP,512
11	.set OLDMETRICS,516
12	.set NEWMETRICS,520
13	.text
14	.global update_viterbi29_blk_mmx,Mettab29_1,Mettab29_2
15	.type update_viterbi29_blk_mmx,@function
16	.align 16
17
18	# MMX (64-bit SIMD) version
19	# requires Pentium-MMX, Pentium-II or better
20
update_viterbi29_blk_mmx:
	# Runs nbits rounds of 128 add-compare-select butterflies using MMX.
	#
	# Stack arguments (addressed through ebp):
	#    8(%ebp) = vp     pointer to struct v29
	#   12(%ebp) = syms   symbol bytes, two consumed per round
	#   16(%ebp) = nbits  number of rounds to run
	# The syms and nbits argument slots double as loop state and are
	# overwritten while the routine runs.
	#
	# eax on exit: 0 once the loop has finished, -1 when vp is NULL.
	#
	# Register roles inside the loop:
	#   esi -> old metrics     edi -> new metrics     edx -> decisions
	#   eax, ebx = row offsets into Mettab29_1 and Mettab29_2
	pushl %ebp
	movl %esp,%ebp
	pushl %esi
	pushl %edi
	pushl %edx
	pushl %ebx

	movl 8(%ebp),%edx	# edx = vp
	testl %edx,%edx
	jnz  0f
	# The $ is required: a bare -1 is assembled as a memory operand,
	# i.e. a load from absolute address 0xffffffff.
	movl $-1,%eax
	jmp  err
0:	movl OLDMETRICS(%edx),%esi	# esi -> old metrics
	movl NEWMETRICS(%edx),%edi	# edi -> new metrics
	movl DP(%edx),%edx	# edx -> decisions

1:	movl 16(%ebp),%eax	# eax = nbits
	decl %eax
	jl   2f			# went below zero, all rounds done
	movl %eax,16(%ebp)	# write the remaining count back

	movl 12(%ebp),%ebx	# ebx = syms
	movzbl (%ebx),%eax	# eax = first symbol, zero extended
	movzbl 1(%ebx),%ebx	# ebx = second symbol, zero extended
	addl $2,12(%ebp)	# syms += 2 for the next round

	# Scale each symbol to a table row offset: one row is 128 bytes,
	# which is 16 groups of 8 bytes.
	shll $7,%eax
	shll $7,%ebx

	# Each expansion of this macro does 8 butterflies in parallel.
	# GROUP picks the 8-byte slice; the macro uses mm0 through mm7.
	.macro butterfly GROUP
	# Branch metrics: byte-wise sum of the two table entries, plus one,
	# shifted right by one. psrlq shifts the whole 64-bit register, so
	# the mask with fifteens removes the bits that crossed byte lanes
	# and leaves a 4-bit value in each byte.
	movq (Mettab29_1+8*\GROUP)(%eax),%mm3
	movq fifteens,%mm0
	paddb (Mettab29_2+8*\GROUP)(%ebx),%mm3
	paddb ones,%mm3  # emulate pavgb - this may not be necessary
	psrlq $1,%mm3
	pand %mm0,%mm3

	movq (8*\GROUP)(%esi),%mm6	# Incoming path metric, high bit = 0
	movq ((8*\GROUP)+128)(%esi),%mm2 # Incoming path metric, high bit = 1
	movq %mm6,%mm1
	movq %mm2,%mm7

	paddb %mm3,%mm6
	paddb %mm3,%mm2
	pxor  %mm0,%mm3		 # invert branch metric (15 - metric per byte)
	paddb %mm3,%mm7		 # path metric for inverted symbols
	paddb %mm3,%mm1

	# live registers 1 2 6 7
	# Compare mm6 and mm7;  mm1 and mm2
	pxor %mm3,%mm3		# mm3 = 0, reference for the signed compare
	movq %mm6,%mm4
	movq %mm1,%mm5
	psubb %mm7,%mm4		# mm4 = mm6 - mm7
	psubb %mm2,%mm5		# mm5 = mm1 - mm2
	pcmpgtb %mm3,%mm4	# mm4 = first set of decisions (ff = 1 better)
	pcmpgtb %mm3,%mm5	# mm5 = second set of decisions

	# live registers 1 2 4 5 6 7
	# Select survivors: where the decision byte is ff keep mm7 (or mm2),
	# otherwise keep mm6 (or mm1).
	movq %mm4,%mm0
	pand %mm4,%mm7
	movq %mm5,%mm3
	pand %mm5,%mm2
	pandn %mm6,%mm0
	pandn %mm1,%mm3
	por %mm0,%mm7		# mm7 = first set of survivors
	por %mm3,%mm2		# mm2 = second set of survivors

	# live registers 2 4 5 7
	# Interleave the two decision sets (mm4, mm5) and the two survivor
	# metric sets (mm7, mm2) byte by byte, then store 16 bytes of each.
	movq %mm4,%mm3
	movq %mm7,%mm0
	punpckhbw %mm5,%mm4	# high halves of the decisions
	punpcklbw %mm5,%mm3	# low halves of the decisions
	punpcklbw %mm2,%mm7	# low halves of the survivors, stored first
	punpckhbw %mm2,%mm0	# high halves of the survivors, stored second
	movq %mm4,(16*\GROUP+8)(%edx)
	movq %mm3,(16*\GROUP)(%edx)
	movq %mm7,(16*\GROUP)(%edi)
	movq %mm0,(16*\GROUP+8)(%edi)

	.endm

# invoke macro 16 times for a total of 128 butterflies
	butterfly GROUP=0
	butterfly GROUP=1
	butterfly GROUP=2
	butterfly GROUP=3
	butterfly GROUP=4
	butterfly GROUP=5
	butterfly GROUP=6
	butterfly GROUP=7
	butterfly GROUP=8
	butterfly GROUP=9
	butterfly GROUP=10
	butterfly GROUP=11
	butterfly GROUP=12
	butterfly GROUP=13
	butterfly GROUP=14
	butterfly GROUP=15

	addl $256,%edx		# 16 groups * 16 decision bytes were written

	# swap metrics: the buffer just written becomes the old one
	movl %esi,%eax
	movl %edi,%esi
	movl %eax,%edi
	jmp 1b

2:	emms			# leave MMX state before returning
	movl 8(%ebp),%ebx	# ebx = vp
	# stash metric pointers
	movl %esi,OLDMETRICS(%ebx)
	movl %edi,NEWMETRICS(%ebx)
	movl %edx,DP(%ebx)	# stash incremented value of vp->dp
	xorl %eax,%eax		# eax = 0 on the normal path
	# The NULL-vp path joins here with eax = -1; no MMX instruction
	# has run on that path, so it skips the emms above.
err:	popl %ebx
	popl %edx
	popl %edi
	popl %esi
	popl %ebp
	ret
	.size update_viterbi29_blk_mmx,.-update_viterbi29_blk_mmx
154
155	.data
156	.align 8
157fifteens:
158	.byte 15,15,15,15,15,15,15,15
159
160	.align 8
161ones:	.byte 1,1,1,1,1,1,1,1
162