dm-service-time.c revision 5a0e3ad6af8660be21ca98a971cd00f331318c05
1f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda/*
2f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Copyright (C) 2007-2009 NEC Corporation.  All Rights Reserved.
3f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda *
4f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Module Author: Kiyoshi Ueda
5f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda *
6f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * This file is released under the GPL.
7f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda *
8f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Throughput oriented path selector.
9f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda */
10f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
11f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#include "dm.h"
12f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#include "dm-path-selector.h"
13f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
145a0e3ad6af8660be21ca98a971cd00f331318c05Tejun Heo#include <linux/slab.h>
155a0e3ad6af8660be21ca98a971cd00f331318c05Tejun Heo
16f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#define DM_MSG_PREFIX	"multipath service-time"
17f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#define ST_MIN_IO	1
18f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#define ST_MAX_RELATIVE_THROUGHPUT	100
19f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT	7
20f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#define ST_MAX_INFLIGHT_SIZE	((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
21f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#define ST_VERSION	"0.2.0"
22f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
23f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastruct selector {
24f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct list_head valid_paths;
25f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct list_head failed_paths;
26f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda};
27f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
28f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastruct path_info {
29f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct list_head list;
30f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct dm_path *path;
31f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	unsigned repeat_count;
32f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	unsigned relative_throughput;
33f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	atomic_t in_flight_size;	/* Total size of in-flight I/Os */
34f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda};
35f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
36f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic struct selector *alloc_selector(void)
37f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
38f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
39f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
40f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (s) {
41f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		INIT_LIST_HEAD(&s->valid_paths);
42f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		INIT_LIST_HEAD(&s->failed_paths);
43f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
44f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
45f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return s;
46f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
47f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
48f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_create(struct path_selector *ps, unsigned argc, char **argv)
49f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
50f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = alloc_selector();
51f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
52f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (!s)
53f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return -ENOMEM;
54f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
55f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	ps->context = s;
56f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return 0;
57f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
58f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
59f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic void free_paths(struct list_head *paths)
60f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
61f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi, *next;
62f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
63f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	list_for_each_entry_safe(pi, next, paths, list) {
64f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		list_del(&pi->list);
65f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		kfree(pi);
66f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
67f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
68f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
69f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic void st_destroy(struct path_selector *ps)
70f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
71f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = ps->context;
72f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
73f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	free_paths(&s->valid_paths);
74f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	free_paths(&s->failed_paths);
75f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	kfree(s);
76f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	ps->context = NULL;
77f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
78f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
79f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_status(struct path_selector *ps, struct dm_path *path,
80f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		     status_type_t type, char *result, unsigned maxlen)
81f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
82f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	unsigned sz = 0;
83f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi;
84f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
85f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (!path)
86f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		DMEMIT("0 ");
87f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	else {
88f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		pi = path->pscontext;
89f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
90f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		switch (type) {
91f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		case STATUSTYPE_INFO:
92f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
93f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			       pi->relative_throughput);
94f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			break;
95f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		case STATUSTYPE_TABLE:
96f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			DMEMIT("%u %u ", pi->repeat_count,
97f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			       pi->relative_throughput);
98f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			break;
99f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		}
100f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
101f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
102f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return sz;
103f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
104f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
105f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_add_path(struct path_selector *ps, struct dm_path *path,
106f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		       int argc, char **argv, char **error)
107f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
108f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = ps->context;
109f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi;
110f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	unsigned repeat_count = ST_MIN_IO;
111f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	unsigned relative_throughput = 1;
112f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
113f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/*
114f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * Arguments: [<repeat_count> [<relative_throughput>]]
115f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * 	<repeat_count>: The number of I/Os before switching path.
116f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * 			If not given, default (ST_MIN_IO) is used.
117f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * 	<relative_throughput>: The relative throughput value of
118f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *			the path among all paths in the path-group.
119f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * 			The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
120f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *			If not given, minimum value '1' is used.
121f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *			If '0' is given, the path isn't selected while
122f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * 			other paths having a positive value are
123f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * 			available.
124f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 */
125f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (argc > 2) {
126f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		*error = "service-time ps: incorrect number of arguments";
127f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return -EINVAL;
128f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
129f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
130f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) {
131f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		*error = "service-time ps: invalid repeat count";
132f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return -EINVAL;
133f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
134f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
135f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if ((argc == 2) &&
136f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	    (sscanf(argv[1], "%u", &relative_throughput) != 1 ||
137f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	     relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
138f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		*error = "service-time ps: invalid relative_throughput value";
139f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return -EINVAL;
140f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
141f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
142f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/* allocate the path */
143f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
144f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (!pi) {
145f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		*error = "service-time ps: Error allocating path context";
146f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return -ENOMEM;
147f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
148f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
149f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	pi->path = path;
150f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	pi->repeat_count = repeat_count;
151f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	pi->relative_throughput = relative_throughput;
152f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	atomic_set(&pi->in_flight_size, 0);
153f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
154f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	path->pscontext = pi;
155f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
156f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	list_add_tail(&pi->list, &s->valid_paths);
157f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
158f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return 0;
159f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
160f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
161f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic void st_fail_path(struct path_selector *ps, struct dm_path *path)
162f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
163f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = ps->context;
164f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi = path->pscontext;
165f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
166f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	list_move(&pi->list, &s->failed_paths);
167f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
168f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
169f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
170f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
171f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = ps->context;
172f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi = path->pscontext;
173f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
174f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	list_move_tail(&pi->list, &s->valid_paths);
175f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
176f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return 0;
177f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
178f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
179f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda/*
180f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Compare the estimated service time of 2 paths, pi1 and pi2,
181f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * for the incoming I/O.
182f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda *
183f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Returns:
184f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * < 0 : pi1 is better
185f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * 0   : no difference between pi1 and pi2
186f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * > 0 : pi2 is better
187f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda *
188f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Description:
189f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Basically, the service time is estimated by:
190f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda *     ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
191f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * To reduce the calculation, some optimizations are made.
192f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * (See comments inline)
193f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda */
194f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_compare_load(struct path_info *pi1, struct path_info *pi2,
195f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			   size_t incoming)
196f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
197f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	size_t sz1, sz2, st1, st2;
198f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
199f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	sz1 = atomic_read(&pi1->in_flight_size);
200f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	sz2 = atomic_read(&pi2->in_flight_size);
201f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
202f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/*
203f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * Case 1: Both have same throughput value. Choose less loaded path.
204f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 */
205f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (pi1->relative_throughput == pi2->relative_throughput)
206f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return sz1 - sz2;
207f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
208f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/*
209f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * Case 2a: Both have same load. Choose higher throughput path.
210f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * Case 2b: One path has no throughput value. Choose the other one.
211f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 */
212f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (sz1 == sz2 ||
213f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	    !pi1->relative_throughput || !pi2->relative_throughput)
214f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return pi2->relative_throughput - pi1->relative_throughput;
215f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
216f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/*
217f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * Case 3: Calculate service time. Choose faster path.
218f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         Service time using pi1:
219f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *             st1 = (sz1 + incoming) / pi1->relative_throughput
220f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         Service time using pi2:
221f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *             st2 = (sz2 + incoming) / pi2->relative_throughput
222f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *
223f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         To avoid the division, transform the expression to use
224f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         multiplication.
225f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         Because ->relative_throughput > 0 here, if st1 < st2,
226f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         the expressions below are the same meaning:
227f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *             (sz1 + incoming) / pi1->relative_throughput <
228f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *                 (sz2 + incoming) / pi2->relative_throughput
229f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *             (sz1 + incoming) * pi2->relative_throughput <
230f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *                 (sz2 + incoming) * pi1->relative_throughput
231f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         So use the later one.
232f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 */
233f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	sz1 += incoming;
234f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	sz2 += incoming;
235f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
236f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
237f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		/*
238f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		 * Size may be too big for multiplying pi->relative_throughput
239f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		 * and overflow.
240f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		 * To avoid the overflow and mis-selection, shift down both.
241f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		 */
242f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
243f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
244f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
245f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	st1 = sz1 * pi2->relative_throughput;
246f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	st2 = sz2 * pi1->relative_throughput;
247f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (st1 != st2)
248f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return st1 - st2;
249f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
250f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/*
251f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * Case 4: Service time is equal. Choose higher throughput path.
252f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 */
253f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return pi2->relative_throughput - pi1->relative_throughput;
254f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
255f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
256f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic struct dm_path *st_select_path(struct path_selector *ps,
257f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda				      unsigned *repeat_count, size_t nr_bytes)
258f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
259f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = ps->context;
260f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi = NULL, *best = NULL;
261f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
262f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (list_empty(&s->valid_paths))
263f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return NULL;
264f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
265f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/* Change preferred (first in list) path to evenly balance. */
266f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	list_move_tail(s->valid_paths.next, &s->valid_paths);
267f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
268f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	list_for_each_entry(pi, &s->valid_paths, list)
269f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
270f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			best = pi;
271f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
272f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (!best)
273f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return NULL;
274f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
275f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	*repeat_count = best->repeat_count;
276f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
277f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return best->path;
278f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
279f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
280f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_start_io(struct path_selector *ps, struct dm_path *path,
281f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		       size_t nr_bytes)
282f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
283f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi = path->pscontext;
284f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
285f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	atomic_add(nr_bytes, &pi->in_flight_size);
286f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
287f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return 0;
288f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
289f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
290f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_end_io(struct path_selector *ps, struct dm_path *path,
291f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		     size_t nr_bytes)
292f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
293f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi = path->pscontext;
294f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
295f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	atomic_sub(nr_bytes, &pi->in_flight_size);
296f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
297f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return 0;
298f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
299f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
300f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic struct path_selector_type st_ps = {
301f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.name		= "service-time",
302f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.module		= THIS_MODULE,
303f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.table_args	= 2,
304f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.info_args	= 2,
305f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.create		= st_create,
306f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.destroy	= st_destroy,
307f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.status		= st_status,
308f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.add_path	= st_add_path,
309f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.fail_path	= st_fail_path,
310f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.reinstate_path	= st_reinstate_path,
311f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.select_path	= st_select_path,
312f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.start_io	= st_start_io,
313f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.end_io		= st_end_io,
314f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda};
315f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
316f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int __init dm_st_init(void)
317f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
318f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	int r = dm_register_path_selector(&st_ps);
319f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
320f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (r < 0)
321f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		DMERR("register failed %d", r);
322f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
323f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	DMINFO("version " ST_VERSION " loaded");
324f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
325f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return r;
326f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
327f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
328f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic void __exit dm_st_exit(void)
329f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
330f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	int r = dm_unregister_path_selector(&st_ps);
331f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
332f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (r < 0)
333f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		DMERR("unregister failed %d", r);
334f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
335f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
336f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedamodule_init(dm_st_init);
337f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedamodule_exit(dm_st_exit);
338f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
339f392ba889b019602976082bfe7bf486c2594f85cKiyoshi UedaMODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
340f392ba889b019602976082bfe7bf486c2594f85cKiyoshi UedaMODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
341f392ba889b019602976082bfe7bf486c2594f85cKiyoshi UedaMODULE_LICENSE("GPL");
342