arch/alpha/lib/ev6-divide.S


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259

/*
 * arch/alpha/lib/ev6-divide.S
 *
 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
 *
 * Alpha division..
 */

/*
 * The alpha chip doesn't provide hardware division, so we have to do it
 * by hand.  The compiler expects the functions
 *
 *	__divqu: 64-bit unsigned long divide
 *	__remqu: 64-bit unsigned long remainder
 *	__divq/__remq: signed 64-bit
 *	__divlu/__remlu: unsigned 32-bit
 *	__divl/__reml: signed 32-bit
 *
 * These are not normal C functions: instead of the normal
 * calling sequence, these expect their arguments in registers
 * $24 and $25, and return the result in $27. Register $28 may
 * be clobbered (assembly temporary), anything else must be saved. 
 *
 * In short: painful.
 *
 * This is a rather simple bit-at-a-time algorithm: it's very good
 * at dividing random 64-bit numbers, but the more usual case where
 * the divisor is small is handled better by the DEC algorithm
 * using lookup tables. This uses much less memory, though, and is
 * nicer on the cache.. Besides, I don't know the copyright status
 * of the DEC code.
 */

/*
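 * My temporaries:
 *	$0 - current bit
 *	$1 - shifted divisor
 *	$2 - modulus (divide) / quotient (remainder)
 *	$3 - scratch: modulus - divisor
 *	$4 - scratch: quotient + current bit (divide only)
 *
 *	$23 - return address
 *	$24 - dividend
 *	$25 - divisor
 *
 *	$27 - quotient (divide) / modulus (remainder)
 *	$28 - compare status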
 *
 * Much of the information about 21264 scheduling/coding comes from:
 *	Compiler Writer's Guide for the Alpha 21264
 *	abbreviated as 'CWG' in other comments here
 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
 * Scheduling notation:
 *	E	- either cluster
 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
 * Try not to change the actual algorithm if possible for consistency.
 */

/*
 * "halt" expands to a zero longword.  It is not referenced anywhere in
 * the code below.
 */
#define halt .long 0

/*
 * Select function type and registers
 *
 * Register roles that are the same in both builds:
 *	mask	- the quotient bit currently being worked on
 *	divisor	- working (shifted) copy of the divisor
 *	compare	- result of the most recent unsigned compare
 *	tmp1	- candidate modulus (modulus - divisor)
 *	tmp2	- candidate quotient (quotient + mask), divide build only
 */
#define mask	$0
#define divisor	$1
#define compare $28
#define tmp1	$3
#define tmp2	$4

/*
 * The same source is assembled twice.  With DIV defined it produces the
 * __div* entry points, otherwise the __rem* ones.  In either build the
 * value handed back to the caller is the one kept in $27, so "quotient"
 * and "modulus" swap between $27 and $2.
 *
 * DIV_ONLY(...) / MOD_ONLY(...) expand to their arguments only in the
 * matching build and to nothing in the other one.
 *
 * GETSIGN(x) leaves a value in x whose sign bit says whether the signed
 * result has to be negated:
 *	divide:    $24 xor $25, negative when the operand signs differ
 *	remainder: a copy of $24, i.e. the sign of the dividend
 *
 * STACK is the frame size in bytes.  The divide build stores tmp2 at
 * offset 32 and therefore needs more than the 32 bytes used by the
 * remainder build (48 is presumably chosen to keep 16-byte alignment;
 * TODO confirm).
 */
#ifdef DIV
#define DIV_ONLY(x,y...) x,##y
#define MOD_ONLY(x,y...)
#define func(x) __div##x
#define modulus $2
#define quotient $27
#define GETSIGN(x) xor $24,$25,x
#define STACK 48
#else
#define DIV_ONLY(x,y...)
#define MOD_ONLY(x,y...) x,##y
#define func(x) __rem##x
#define modulus $27
#define quotient $2
#define GETSIGN(x) bis $24,$24,x
#define STACK 32
#endif

/*
 * For 32-bit operations, we need to extend to 64-bit
 *
 * With INTSIZE defined the entry points are func(lu) and func(l), i.e.
 * the 32-bit unsigned and signed routines; otherwise they are func(qu)
 * and func(q), the 64-bit ones.
 *
 * LONGIFY(x)  - zero-extend x from 32 bits (zapnot with byte mask 15
 *		 keeps the low four bytes and clears the rest).
 * SLONGIFY(x) - sign-extend x from 32 bits (addl yields a sign-extended
 *		 longword result).
 * Both expand to nothing in the 64-bit build.
 */
#ifdef INTSIZE
#define ufunction func(lu)
#define sfunction func(l)
#define LONGIFY(x) zapnot x,15,x
#define SLONGIFY(x) addl x,0,x
#else
#define ufunction func(qu)
#define sfunction func(q)
#define LONGIFY(x)
#define SLONGIFY(x)
#endif

/*
 * ufunction - unsigned divide (DIV build) or remainder (otherwise).
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = quotient (DIV build) or remainder
 *
 * $28 is used as the compare temporary and is not preserved.  $0-$3
 * (and $4 in the DIV build) are saved in the frame and restored before
 * returning.  Frame layout, relative to $30 after the subq:
 *	 0: $1    8: $2    16: $0    24: tmp1    32: tmp2 (DIV build only)
 *
 * A zero divisor does not trap: the code branches straight to the
 * register restore at label 9.  At that point quotient is zero and
 * modulus holds the dividend, so the DIV build returns 0 and the
 * remainder build returns the dividend.
 *
 * .set noat is needed because $28 (the assembler temporary) is used
 * explicitly as "compare".
 */
.set noat
.align	4
.globl	ufunction
.ent	ufunction
ufunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0

	/*
	 * Label 7 is a second way in: sfunction branches here (7b) after
	 * allocating a frame of the same STACK size when neither operand
	 * is negative.
	 *
	 * Save the working registers, copy the operands into their working
	 * registers, clear the quotient and start with mask = 1.
	 */
7:	stq	$1, 0($30)		# L :
	bis	$25,$25,divisor		# E :
	stq	$2, 8($30)		# L : L U L U

	bis	$24,$24,modulus		# E :
	stq	$0,16($30)		# L :
	bis	$31,$31,quotient	# E :
	LONGIFY(divisor)		# E : U L L U

	stq	tmp1,24($30)		# L :
	LONGIFY(modulus)		# E :
	bis	$31,1,mask		# E :
	DIV_ONLY(stq tmp2,32($30))	# L : L U U L

	beq	divisor, 9f			/* div by zero */
	/*
	 * In spite of the DIV_ONLY being either a non-instruction
	 * or an actual stq, the addition of the .align directive
	 * below ensures that label 1 is going to be nicely aligned
	 */

	/*
	 * Phase 1: shift divisor and mask left together for as long as the
	 * divisor was still below the modulus.  The compare is taken before
	 * the shift, so the divisor ends up at or above the modulus (or, in
	 * the 64-bit case, with its top bit set).
	 */
	.align	4
#ifdef INTSIZE
	/*
	 * shift divisor left, using 3-bit shifts for
	 * 32-bit divides as we can't overflow. Three-bit
	 * shifts will result in looping three times less
	 * here, but can result in two loops more later.
	 * Thus using a large shift isn't worth it (and
	 * s8add pairs better than a sll..)
	 */
1:	cmpult	divisor,modulus,compare	# E :
	s8addq	divisor,$31,divisor	# E :
	s8addq	mask,$31,mask		# E :
	bne	compare,1b		# U : U L U L
#else
	/*
	 * 64-bit case: shift one bit at a time and leave the loop before
	 * shifting once the divisor has its top bit set (blt sees it as
	 * negative), since another doubling would lose that bit.
	 */
1:	cmpult	divisor,modulus,compare	# E :
	nop				# E :
	nop				# E :
	blt     divisor, 2f		# U : U L U L

	addq	divisor,divisor,divisor	# E :
	addq	mask,mask,mask		# E :
	unop				# E :
	bne	compare,1b		# U : U L U L
#endif

	/* ok, start to go right again.. */
	/*
	 * Phase 2: one pass per bit position, walking mask and divisor back
	 * to the right.  Each pass computes the candidate results
	 * (quotient + mask, modulus - divisor) unconditionally and commits
	 * them with cmovne only when divisor <= modulus.  The loop ends
	 * when mask has been shifted down to zero.
	 */
2:
	/*
	 * Keep things nicely bundled... use a nop instead of not
	 * having an instruction for DIV_ONLY
	 */
#ifdef DIV
	DIV_ONLY(addq quotient,mask,tmp2) # E :
#else
	nop				# E :
#endif
	srl	mask,1,mask		# U :
	cmpule	divisor,modulus,compare	# E :
	subq	modulus,divisor,tmp1	# E :

#ifdef DIV
	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	srl	divisor,1,divisor	# U :
	nop				# E : L U L U

	nop				# E :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L U L
#else
	srl	divisor,1,divisor	# U :
	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
	nop				# E : as part of the cmovne
	bne	mask,2b			# U : U L L U
#endif

	/*
	 * Common exit (also the divide-by-zero target): reload the saved
	 * registers from the frame, release the frame and return through
	 * $23.  The result stays in $27.
	 */
9:	ldq	$1, 0($30)		# L :
	ldq	$2, 8($30)		# L :
	nop				# E :
	nop				# E : U U L L

	ldq	$0,16($30)		# L :
	ldq	tmp1,24($30)		# L :
	nop				# E :
	nop				# E :

#ifdef DIV
	DIV_ONLY(ldq tmp2,32($30))	# L :
#else
	nop				# E :
#endif
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	ufunction

/*
 * Uhh.. Ugly signed division. I'd rather not have it at all, but
 * it's needed in some circumstances. There are different ways to
 * handle this, really. This does:
 * 	-a / b = a / -b = -(a / b)
 *	-a % b = -(a % b)
 *	a % -b = a % b
 * which is probably not the best solution, but at least should
 * have the property that (x/y)*y + (x%y) = x.
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address
 * Out:	$27 = quotient (DIV build) or remainder
 *
 * $24, $25 and $23 are saved in the frame and restored, because the slow
 * path overwrites the operands with their absolute values and reuses $23
 * as the link register for the call to ufunction.  tmp1 is saved as well
 * since it holds the negated result afterwards.  $28 is used as scratch
 * and is not preserved.
 *
 * Frame layout on the slow path, relative to $30:
 *	 0: $24    8: $25    16: $23    24: tmp1
 */
.align 4
.globl	sfunction
.ent	sfunction
sfunction:
	subq	$30,STACK,$30		# E :
	.frame	$30,STACK,$23
	.prologue 0
	/*
	 * Fast path: the OR of both operands is non-negative only when
	 * neither of them is negative.  In that case continue at label 7
	 * inside ufunction, which uses the frame allocated above and
	 * returns directly to our caller.
	 */
	bis	$24,$25,$28		# E :
	SLONGIFY($28)			# E :
	bge	$28,7b			# U :

	/*
	 * Slow path: stash the original operands, return address and tmp1,
	 * then replace each operand with its absolute value (0 - x is
	 * selected only when x is negative).
	 * NOTE(review): the cmovlt tests look at the full 64-bit sign, so
	 * the 32-bit build presumably relies on callers passing
	 * sign-extended operands - confirm.
	 */
	stq	$24,0($30)		# L :
	subq	$31,$24,$28		# E :
	stq	$25,8($30)		# L :
	nop				# E : U L U L

	cmovlt	$24,$28,$24	/* abs($24) */ # E : Latency 2, extra map slot
	nop				# E : as part of the cmov
	stq	$23,16($30)		# L :
	subq	$31,$25,$28		# E : U L U L

	stq	tmp1,24($30)		# L :
	cmovlt	$25,$28,$25	/* abs($25) */ # E : Latency 2, extra map slot
	nop				# E :
	/* Unsigned operation on the magnitudes; it returns here via $23. */
	bsr	$23,ufunction		# L0: L U L U

	/*
	 * Restore the caller's operands, derive the result sign from them
	 * with GETSIGN and, when that sign is negative, replace the
	 * unsigned result in $27 by its negation.
	 */
	ldq	$24,0($30)		# L :
	ldq	$25,8($30)		# L :
	GETSIGN($28)			# E :
	subq	$31,$27,tmp1		# E : U U L L

	SLONGIFY($28)			# E :
	ldq	$23,16($30)		# L :
	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
	nop				# E : U L L U : as part of the cmov

	/* Put tmp1 back, release the frame and return to the caller. */
	ldq	tmp1,24($30)		# L :
	nop				# E : as part of the cmov
	addq	$30,STACK,$30		# E :
	ret	$31,($23),1		# L0 : L U U L
	.end	sfunction