1f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda/*
2f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Copyright (C) 2007-2009 NEC Corporation.  All Rights Reserved.
3f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda *
4f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Module Author: Kiyoshi Ueda
5f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda *
6f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * This file is released under the GPL.
7f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda *
8f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Throughput oriented path selector.
9f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda */
10f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
11f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#include "dm.h"
12f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#include "dm-path-selector.h"
13f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
145a0e3ad6af8660be21ca98a971cd00f331318c05Tejun Heo#include <linux/slab.h>
15056075c76417b112b4924e7b6386fdc6dfc9ac03Paul Gortmaker#include <linux/module.h>
165a0e3ad6af8660be21ca98a971cd00f331318c05Tejun Heo
17f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#define DM_MSG_PREFIX	"multipath service-time"
18f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#define ST_MIN_IO	1
19f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#define ST_MAX_RELATIVE_THROUGHPUT	100
20f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#define ST_MAX_RELATIVE_THROUGHPUT_SHIFT	7
21f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#define ST_MAX_INFLIGHT_SIZE	((size_t)-1 >> ST_MAX_RELATIVE_THROUGHPUT_SHIFT)
22f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda#define ST_VERSION	"0.2.0"
23f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
24f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastruct selector {
25f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct list_head valid_paths;
26f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct list_head failed_paths;
27f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda};
28f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
29f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastruct path_info {
30f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct list_head list;
31f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct dm_path *path;
32f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	unsigned repeat_count;
33f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	unsigned relative_throughput;
34f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	atomic_t in_flight_size;	/* Total size of in-flight I/Os */
35f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda};
36f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
37f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic struct selector *alloc_selector(void)
38f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
39f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = kmalloc(sizeof(*s), GFP_KERNEL);
40f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
41f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (s) {
42f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		INIT_LIST_HEAD(&s->valid_paths);
43f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		INIT_LIST_HEAD(&s->failed_paths);
44f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
45f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
46f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return s;
47f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
48f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
49f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_create(struct path_selector *ps, unsigned argc, char **argv)
50f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
51f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = alloc_selector();
52f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
53f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (!s)
54f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return -ENOMEM;
55f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
56f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	ps->context = s;
57f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return 0;
58f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
59f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
60f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic void free_paths(struct list_head *paths)
61f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
62f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi, *next;
63f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
64f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	list_for_each_entry_safe(pi, next, paths, list) {
65f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		list_del(&pi->list);
66f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		kfree(pi);
67f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
68f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
69f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
70f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic void st_destroy(struct path_selector *ps)
71f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
72f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = ps->context;
73f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
74f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	free_paths(&s->valid_paths);
75f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	free_paths(&s->failed_paths);
76f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	kfree(s);
77f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	ps->context = NULL;
78f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
79f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
80f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_status(struct path_selector *ps, struct dm_path *path,
81f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		     status_type_t type, char *result, unsigned maxlen)
82f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
83f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	unsigned sz = 0;
84f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi;
85f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
86f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (!path)
87f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		DMEMIT("0 ");
88f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	else {
89f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		pi = path->pscontext;
90f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
91f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		switch (type) {
92f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		case STATUSTYPE_INFO:
93f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			DMEMIT("%d %u ", atomic_read(&pi->in_flight_size),
94f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			       pi->relative_throughput);
95f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			break;
96f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		case STATUSTYPE_TABLE:
97f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			DMEMIT("%u %u ", pi->repeat_count,
98f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			       pi->relative_throughput);
99f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			break;
100f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		}
101f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
102f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
103f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return sz;
104f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
105f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
106f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_add_path(struct path_selector *ps, struct dm_path *path,
107f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		       int argc, char **argv, char **error)
108f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
109f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = ps->context;
110f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi;
111f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	unsigned repeat_count = ST_MIN_IO;
112f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	unsigned relative_throughput = 1;
11331998ef19385c944600d9a981b96252f98204beeMikulas Patocka	char dummy;
114f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
115f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/*
116f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * Arguments: [<repeat_count> [<relative_throughput>]]
117f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * 	<repeat_count>: The number of I/Os before switching path.
118f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * 			If not given, default (ST_MIN_IO) is used.
119f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * 	<relative_throughput>: The relative throughput value of
120f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *			the path among all paths in the path-group.
121f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * 			The valid range: 0-<ST_MAX_RELATIVE_THROUGHPUT>
122f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *			If not given, minimum value '1' is used.
123f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *			If '0' is given, the path isn't selected while
124f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * 			other paths having a positive value are
125f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * 			available.
126f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 */
127f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (argc > 2) {
128f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		*error = "service-time ps: incorrect number of arguments";
129f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return -EINVAL;
130f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
131f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
13231998ef19385c944600d9a981b96252f98204beeMikulas Patocka	if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
133f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		*error = "service-time ps: invalid repeat count";
134f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return -EINVAL;
135f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
136f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
137f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if ((argc == 2) &&
13831998ef19385c944600d9a981b96252f98204beeMikulas Patocka	    (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
139f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	     relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
140f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		*error = "service-time ps: invalid relative_throughput value";
141f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return -EINVAL;
142f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
143f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
144f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/* allocate the path */
145f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	pi = kmalloc(sizeof(*pi), GFP_KERNEL);
146f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (!pi) {
147f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		*error = "service-time ps: Error allocating path context";
148f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return -ENOMEM;
149f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
150f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
151f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	pi->path = path;
152f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	pi->repeat_count = repeat_count;
153f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	pi->relative_throughput = relative_throughput;
154f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	atomic_set(&pi->in_flight_size, 0);
155f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
156f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	path->pscontext = pi;
157f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
158f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	list_add_tail(&pi->list, &s->valid_paths);
159f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
160f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return 0;
161f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
162f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
163f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic void st_fail_path(struct path_selector *ps, struct dm_path *path)
164f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
165f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = ps->context;
166f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi = path->pscontext;
167f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
168f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	list_move(&pi->list, &s->failed_paths);
169f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
170f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
171f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_reinstate_path(struct path_selector *ps, struct dm_path *path)
172f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
173f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = ps->context;
174f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi = path->pscontext;
175f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
176f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	list_move_tail(&pi->list, &s->valid_paths);
177f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
178f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return 0;
179f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
180f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
181f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda/*
182f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Compare the estimated service time of 2 paths, pi1 and pi2,
183f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * for the incoming I/O.
184f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda *
185f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Returns:
186f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * < 0 : pi1 is better
187f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * 0   : no difference between pi1 and pi2
188f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * > 0 : pi2 is better
189f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda *
190f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Description:
191f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * Basically, the service time is estimated by:
192f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda *     ('pi->in-flight-size' + 'incoming') / 'pi->relative_throughput'
193f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * To reduce the calculation, some optimizations are made.
194f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda * (See comments inline)
195f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda */
196f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_compare_load(struct path_info *pi1, struct path_info *pi2,
197f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			   size_t incoming)
198f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
199f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	size_t sz1, sz2, st1, st2;
200f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
201f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	sz1 = atomic_read(&pi1->in_flight_size);
202f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	sz2 = atomic_read(&pi2->in_flight_size);
203f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
204f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/*
205f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * Case 1: Both have same throughput value. Choose less loaded path.
206f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 */
207f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (pi1->relative_throughput == pi2->relative_throughput)
208f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return sz1 - sz2;
209f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
210f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/*
211f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * Case 2a: Both have same load. Choose higher throughput path.
212f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * Case 2b: One path has no throughput value. Choose the other one.
213f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 */
214f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (sz1 == sz2 ||
215f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	    !pi1->relative_throughput || !pi2->relative_throughput)
216f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return pi2->relative_throughput - pi1->relative_throughput;
217f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
218f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/*
219f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * Case 3: Calculate service time. Choose faster path.
220f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         Service time using pi1:
221f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *             st1 = (sz1 + incoming) / pi1->relative_throughput
222f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         Service time using pi2:
223f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *             st2 = (sz2 + incoming) / pi2->relative_throughput
224f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *
225f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         To avoid the division, transform the expression to use
226f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         multiplication.
227f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         Because ->relative_throughput > 0 here, if st1 < st2,
228f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         the expressions below are the same meaning:
229f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *             (sz1 + incoming) / pi1->relative_throughput <
230f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *                 (sz2 + incoming) / pi2->relative_throughput
231f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *             (sz1 + incoming) * pi2->relative_throughput <
232f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *                 (sz2 + incoming) * pi1->relative_throughput
233f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 *         So use the later one.
234f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 */
235f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	sz1 += incoming;
236f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	sz2 += incoming;
237f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (unlikely(sz1 >= ST_MAX_INFLIGHT_SIZE ||
238f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		     sz2 >= ST_MAX_INFLIGHT_SIZE)) {
239f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		/*
240f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		 * Size may be too big for multiplying pi->relative_throughput
241f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		 * and overflow.
242f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		 * To avoid the overflow and mis-selection, shift down both.
243f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		 */
244f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		sz1 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
245f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		sz2 >>= ST_MAX_RELATIVE_THROUGHPUT_SHIFT;
246f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	}
247f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	st1 = sz1 * pi2->relative_throughput;
248f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	st2 = sz2 * pi1->relative_throughput;
249f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (st1 != st2)
250f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return st1 - st2;
251f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
252f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/*
253f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 * Case 4: Service time is equal. Choose higher throughput path.
254f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	 */
255f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return pi2->relative_throughput - pi1->relative_throughput;
256f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
257f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
258f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic struct dm_path *st_select_path(struct path_selector *ps,
259f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda				      unsigned *repeat_count, size_t nr_bytes)
260f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
261f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct selector *s = ps->context;
262f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi = NULL, *best = NULL;
263f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
264f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (list_empty(&s->valid_paths))
265f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return NULL;
266f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
267f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	/* Change preferred (first in list) path to evenly balance. */
268f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	list_move_tail(s->valid_paths.next, &s->valid_paths);
269f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
270f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	list_for_each_entry(pi, &s->valid_paths, list)
271f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		if (!best || (st_compare_load(pi, best, nr_bytes) < 0))
272f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda			best = pi;
273f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
274f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (!best)
275f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		return NULL;
276f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
277f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	*repeat_count = best->repeat_count;
278f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
279f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return best->path;
280f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
281f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
282f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_start_io(struct path_selector *ps, struct dm_path *path,
283f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		       size_t nr_bytes)
284f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
285f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi = path->pscontext;
286f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
287f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	atomic_add(nr_bytes, &pi->in_flight_size);
288f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
289f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return 0;
290f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
291f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
292f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int st_end_io(struct path_selector *ps, struct dm_path *path,
293f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		     size_t nr_bytes)
294f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
295f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	struct path_info *pi = path->pscontext;
296f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
297f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	atomic_sub(nr_bytes, &pi->in_flight_size);
298f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
299f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return 0;
300f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
301f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
302f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic struct path_selector_type st_ps = {
303f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.name		= "service-time",
304f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.module		= THIS_MODULE,
305f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.table_args	= 2,
306f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.info_args	= 2,
307f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.create		= st_create,
308f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.destroy	= st_destroy,
309f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.status		= st_status,
310f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.add_path	= st_add_path,
311f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.fail_path	= st_fail_path,
312f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.reinstate_path	= st_reinstate_path,
313f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.select_path	= st_select_path,
314f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.start_io	= st_start_io,
315f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	.end_io		= st_end_io,
316f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda};
317f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
318f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic int __init dm_st_init(void)
319f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
320f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	int r = dm_register_path_selector(&st_ps);
321f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
322f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (r < 0)
323f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		DMERR("register failed %d", r);
324f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
325f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	DMINFO("version " ST_VERSION " loaded");
326f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
327f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	return r;
328f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
329f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
330f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedastatic void __exit dm_st_exit(void)
331f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda{
332f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	int r = dm_unregister_path_selector(&st_ps);
333f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
334f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda	if (r < 0)
335f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda		DMERR("unregister failed %d", r);
336f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda}
337f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
338f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedamodule_init(dm_st_init);
339f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Uedamodule_exit(dm_st_exit);
340f392ba889b019602976082bfe7bf486c2594f85cKiyoshi Ueda
341f392ba889b019602976082bfe7bf486c2594f85cKiyoshi UedaMODULE_DESCRIPTION(DM_NAME " throughput oriented path selector");
342f392ba889b019602976082bfe7bf486c2594f85cKiyoshi UedaMODULE_AUTHOR("Kiyoshi Ueda <k-ueda@ct.jp.nec.com>");
343f392ba889b019602976082bfe7bf486c2594f85cKiyoshi UedaMODULE_LICENSE("GPL");
344