summaryrefslogtreecommitdiff
path: root/Documentation/block/ioprio.rst
blob: 4662e1ff3d81f28f57417ff70ac81f84220df88f (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
===================
Block io priorities
===================


Intro
-----

The io priority feature enables users to io nice processes or process groups,
similar to what has been possible with cpu scheduling for ages. Support for io
priorities is io scheduler dependent and currently supported by bfq and
mq-deadline.

Scheduling classes
------------------

Three generic scheduling classes are implemented for io priorities that
determine how io is served for a process.

IOPRIO_CLASS_RT: This is the realtime io class. This scheduling class is given
higher priority than any other in the system, processes from this class are
given first access to the disk every time. Thus it needs to be used with some
care, one io RT process can starve the entire system. Within the RT class,
there are 8 levels of class data that determine exactly how much time this
process needs the disk for on each service. In the future this might change
to be more directly mappable to performance, by passing in a wanted data
rate instead.

IOPRIO_CLASS_BE: This is the best-effort scheduling class, which is the default
for any process that hasn't set a specific io priority. The class data
determines how much io bandwidth the process will get, it's directly mappable
to the cpu nice levels just more coarsely implemented. 0 is the highest
BE prio level, 7 is the lowest. The mapping between cpu nice level and io
nice level is determined as: io_nice = (cpu_nice + 20) / 5.

IOPRIO_CLASS_IDLE: This is the idle scheduling class, processes running at this
level only get io time when no one else needs the disk. The idle class has no
class data, since it doesn't really apply here.

Tools
-----

See below for a sample ionice tool. Usage::

	# ionice -c<class> -n<level> -p<pid>

If pid isn't given, the current process is assumed. IO priority settings
are inherited on fork, so you can use ionice to start the process at a given
level::

	# ionice -c2 -n0 /bin/ls

will run ls at the best-effort scheduling class at the highest priority.
For a running process, you can give the pid instead::

	# ionice -c1 -n2 -p100

will change pid 100 to run at the realtime scheduling class, at priority 2.

ionice.c tool::

  #include <stdio.h>
  #include <stdlib.h>
  #include <errno.h>
  #include <getopt.h>
  #include <unistd.h>
  #include <sys/ptrace.h>
  #include <asm/unistd.h>

  extern int sys_ioprio_set(int, int, int);
  extern int sys_ioprio_get(int, int);

  #if defined(__i386__)
  #define __NR_ioprio_set		289
  #define __NR_ioprio_get		290
  #elif defined(__ppc__)
  #define __NR_ioprio_set		273
  #define __NR_ioprio_get		274
  #elif defined(__x86_64__)
  #define __NR_ioprio_set		251
  #define __NR_ioprio_get		252
  #else
  #error "Unsupported arch"
  #endif

  static inline int ioprio_set(int which, int who, int ioprio)
  {
	return syscall(__NR_ioprio_set, which, who, ioprio);
  }

  static inline int ioprio_get(int which, int who)
  {
	return syscall(__NR_ioprio_get, which, who);
  }

  enum {
	IOPRIO_CLASS_NONE,
	IOPRIO_CLASS_RT,
	IOPRIO_CLASS_BE,
	IOPRIO_CLASS_IDLE,
  };

  enum {
	IOPRIO_WHO_PROCESS = 1,
	IOPRIO_WHO_PGRP,
	IOPRIO_WHO_USER,
  };

  #define IOPRIO_CLASS_SHIFT	13

  const char *to_prio[] = { "none", "realtime", "best-effort", "idle", };

  int main(int argc, char *argv[])
  {
	int ioprio = 4, set = 0, ioprio_class = IOPRIO_CLASS_BE;
	int c, pid = 0;

	while ((c = getopt(argc, argv, "+n:c:p:")) != EOF) {
		switch (c) {
		case 'n':
			ioprio = strtol(optarg, NULL, 10);
			set = 1;
			break;
		case 'c':
			ioprio_class = strtol(optarg, NULL, 10);
			set = 1;
			break;
		case 'p':
			pid = strtol(optarg, NULL, 10);
			break;
		}
	}

	switch (ioprio_class) {
		case IOPRIO_CLASS_NONE:
			ioprio_class = IOPRIO_CLASS_BE;
			break;
		case IOPRIO_CLASS_RT:
		case IOPRIO_CLASS_BE:
			break;
		case IOPRIO_CLASS_IDLE:
			ioprio = 7;
			break;
		default:
			printf("bad prio class %d\n", ioprio_class);
			return 1;
	}

	if (!set) {
		if (!pid && argv[optind])
			pid = strtol(argv[optind], NULL, 10);

		ioprio = ioprio_get(IOPRIO_WHO_PROCESS, pid);

		printf("pid=%d, %d\n", pid, ioprio);

		if (ioprio == -1)
			perror("ioprio_get");
		else {
			ioprio_class = ioprio >> IOPRIO_CLASS_SHIFT;
			ioprio = ioprio & 0xff;
			printf("%s: prio %d\n", to_prio[ioprio_class], ioprio);
		}
	} else {
		if (ioprio_set(IOPRIO_WHO_PROCESS, pid, ioprio | ioprio_class << IOPRIO_CLASS_SHIFT) == -1) {
			perror("ioprio_set");
			return 1;
		}

		if (argv[optind])
			execvp(argv[optind], &argv[optind]);
	}

	return 0;
  }


March 11 2005, Jens Axboe <jens.axboe@oracle.com>