summaryrefslogtreecommitdiff
path: root/arch/x86/lib/memcpy_64.S
blob: f82e884928af6643854f64df5899c42ab6fee9b2 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
/* Copyright 2002 Andi Kleen */

#include <linux/linkage.h>

#include <asm/cpufeature.h>
#include <asm/dwarf2.h>

/*
 * memcpy - Copy a memory block.
 *
 * Input:
 *  rdi destination
 *  rsi source
 *  rdx count
 *
 * Output:
 * rax original destination
 */

/*
 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
 *
 * This gets patched over the unrolled variant (below) via the
 * alternative instructions framework:
 */
	.section .altinstr_replacement, "ax", @progbits
.Lmemcpy_c:
	movq %rdi, %rax

	movl %edx, %ecx
	shrl $3, %ecx
	andl $7, %edx
	rep movsq
	movl %edx, %ecx
	rep movsb
	ret
.Lmemcpy_e:
	.previous

ENTRY(__memcpy)
ENTRY(memcpy)
	CFI_STARTPROC

	/*
	 * Put the number of full 64-byte blocks into %ecx.
	 * Tail portion is handled at the end:
	 */
	movq %rdi, %rax
	movl %edx, %ecx
	shrl   $6, %ecx
	jz .Lhandle_tail

	.p2align 4
.Lloop_64:
	/*
	 * We decrement the loop index here - and the zero-flag is
	 * checked at the end of the loop (instructions inbetween do
	 * not change the zero flag):
	 */
	decl %ecx

	/*
	 * Move in blocks of 4x16 bytes:
	 */
	movq 0*8(%rsi),		%r11
	movq 1*8(%rsi),		%r8
	movq %r11,		0*8(%rdi)
	movq %r8,		1*8(%rdi)

	movq 2*8(%rsi),		%r9
	movq 3*8(%rsi),		%r10
	movq %r9,		2*8(%rdi)
	movq %r10,		3*8(%rdi)

	movq 4*8(%rsi),		%r11
	movq 5*8(%rsi),		%r8
	movq %r11,		4*8(%rdi)
	movq %r8,		5*8(%rdi)

	movq 6*8(%rsi),		%r9
	movq 7*8(%rsi),		%r10
	movq %r9,		6*8(%rdi)
	movq %r10,		7*8(%rdi)

	leaq 64(%rsi), %rsi
	leaq 64(%rdi), %rdi

	jnz  .Lloop_64

.Lhandle_tail:
	movl %edx, %ecx
	andl  $63, %ecx
	shrl   $3, %ecx
	jz   .Lhandle_7

	.p2align 4
.Lloop_8:
	decl %ecx
	movq (%rsi),		%r8
	movq %r8,		(%rdi)
	leaq 8(%rdi),		%rdi
	leaq 8(%rsi),		%rsi
	jnz  .Lloop_8

.Lhandle_7:
	movl %edx, %ecx
	andl $7, %ecx
	jz .Lend

	.p2align 4
.Lloop_1:
	movb (%rsi), %r8b
	movb %r8b, (%rdi)
	incq %rdi
	incq %rsi
	decl %ecx
	jnz .Lloop_1

.Lend:
	ret
	CFI_ENDPROC
ENDPROC(memcpy)
ENDPROC(__memcpy)

	/*
	 * Some CPUs run faster using the string copy instructions.
	 * It is also a lot simpler. Use this when possible:
	 */

	.section .altinstructions, "a"
	.align 8
	.quad memcpy
	.quad .Lmemcpy_c
	.byte X86_FEATURE_REP_GOOD

	/*
	 * Replace only beginning, memcpy is used to apply alternatives,
	 * so it is silly to overwrite itself with nops - reboot is the
	 * only outcome...
	 */
	.byte .Lmemcpy_e - .Lmemcpy_c
	.byte .Lmemcpy_e - .Lmemcpy_c
	.previous