/* tcp_cong.c — revision e905a9edab7f4f14f9213b52234e4a346c690911 */
1/* 2 * Plugable TCP congestion control support and newReno 3 * congestion control. 4 * Based on ideas from I/O scheduler suport and Web100. 5 * 6 * Copyright (C) 2005 Stephen Hemminger <shemminger@osdl.org> 7 */ 8 9#include <linux/module.h> 10#include <linux/mm.h> 11#include <linux/types.h> 12#include <linux/list.h> 13#include <net/tcp.h> 14 15static DEFINE_SPINLOCK(tcp_cong_list_lock); 16static LIST_HEAD(tcp_cong_list); 17 18/* Simple linear search, don't expect many entries! */ 19static struct tcp_congestion_ops *tcp_ca_find(const char *name) 20{ 21 struct tcp_congestion_ops *e; 22 23 list_for_each_entry_rcu(e, &tcp_cong_list, list) { 24 if (strcmp(e->name, name) == 0) 25 return e; 26 } 27 28 return NULL; 29} 30 31/* 32 * Attach new congestion control algorthim to the list 33 * of available options. 34 */ 35int tcp_register_congestion_control(struct tcp_congestion_ops *ca) 36{ 37 int ret = 0; 38 39 /* all algorithms must implement ssthresh and cong_avoid ops */ 40 if (!ca->ssthresh || !ca->cong_avoid) { 41 printk(KERN_ERR "TCP %s does not implement required ops\n", 42 ca->name); 43 return -EINVAL; 44 } 45 46 spin_lock(&tcp_cong_list_lock); 47 if (tcp_ca_find(ca->name)) { 48 printk(KERN_NOTICE "TCP %s already registered\n", ca->name); 49 ret = -EEXIST; 50 } else { 51 list_add_tail_rcu(&ca->list, &tcp_cong_list); 52 printk(KERN_INFO "TCP %s registered\n", ca->name); 53 } 54 spin_unlock(&tcp_cong_list_lock); 55 56 return ret; 57} 58EXPORT_SYMBOL_GPL(tcp_register_congestion_control); 59 60/* 61 * Remove congestion control algorithm, called from 62 * the module's remove function. Module ref counts are used 63 * to ensure that this can't be done till all sockets using 64 * that method are closed. 
65 */ 66void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) 67{ 68 spin_lock(&tcp_cong_list_lock); 69 list_del_rcu(&ca->list); 70 spin_unlock(&tcp_cong_list_lock); 71} 72EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); 73 74/* Assign choice of congestion control. */ 75void tcp_init_congestion_control(struct sock *sk) 76{ 77 struct inet_connection_sock *icsk = inet_csk(sk); 78 struct tcp_congestion_ops *ca; 79 80 if (icsk->icsk_ca_ops != &tcp_init_congestion_ops) 81 return; 82 83 rcu_read_lock(); 84 list_for_each_entry_rcu(ca, &tcp_cong_list, list) { 85 if (try_module_get(ca->owner)) { 86 icsk->icsk_ca_ops = ca; 87 break; 88 } 89 90 } 91 rcu_read_unlock(); 92 93 if (icsk->icsk_ca_ops->init) 94 icsk->icsk_ca_ops->init(sk); 95} 96 97/* Manage refcounts on socket close. */ 98void tcp_cleanup_congestion_control(struct sock *sk) 99{ 100 struct inet_connection_sock *icsk = inet_csk(sk); 101 102 if (icsk->icsk_ca_ops->release) 103 icsk->icsk_ca_ops->release(sk); 104 module_put(icsk->icsk_ca_ops->owner); 105} 106 107/* Used by sysctl to change default congestion control */ 108int tcp_set_default_congestion_control(const char *name) 109{ 110 struct tcp_congestion_ops *ca; 111 int ret = -ENOENT; 112 113 spin_lock(&tcp_cong_list_lock); 114 ca = tcp_ca_find(name); 115#ifdef CONFIG_KMOD 116 if (!ca && capable(CAP_SYS_MODULE)) { 117 spin_unlock(&tcp_cong_list_lock); 118 119 request_module("tcp_%s", name); 120 spin_lock(&tcp_cong_list_lock); 121 ca = tcp_ca_find(name); 122 } 123#endif 124 125 if (ca) { 126 ca->non_restricted = 1; /* default is always allowed */ 127 list_move(&ca->list, &tcp_cong_list); 128 ret = 0; 129 } 130 spin_unlock(&tcp_cong_list_lock); 131 132 return ret; 133} 134 135/* Set default value from kernel configuration at bootup */ 136static int __init tcp_congestion_default(void) 137{ 138 return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); 139} 140late_initcall(tcp_congestion_default); 141 142 143/* Build string with list of 
available congestion control values */ 144void tcp_get_available_congestion_control(char *buf, size_t maxlen) 145{ 146 struct tcp_congestion_ops *ca; 147 size_t offs = 0; 148 149 rcu_read_lock(); 150 list_for_each_entry_rcu(ca, &tcp_cong_list, list) { 151 offs += snprintf(buf + offs, maxlen - offs, 152 "%s%s", 153 offs == 0 ? "" : " ", ca->name); 154 155 } 156 rcu_read_unlock(); 157} 158 159/* Get current default congestion control */ 160void tcp_get_default_congestion_control(char *name) 161{ 162 struct tcp_congestion_ops *ca; 163 /* We will always have reno... */ 164 BUG_ON(list_empty(&tcp_cong_list)); 165 166 rcu_read_lock(); 167 ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); 168 strncpy(name, ca->name, TCP_CA_NAME_MAX); 169 rcu_read_unlock(); 170} 171 172/* Built list of non-restricted congestion control values */ 173void tcp_get_allowed_congestion_control(char *buf, size_t maxlen) 174{ 175 struct tcp_congestion_ops *ca; 176 size_t offs = 0; 177 178 *buf = '\0'; 179 rcu_read_lock(); 180 list_for_each_entry_rcu(ca, &tcp_cong_list, list) { 181 if (!ca->non_restricted) 182 continue; 183 offs += snprintf(buf + offs, maxlen - offs, 184 "%s%s", 185 offs == 0 ? 
"" : " ", ca->name); 186 187 } 188 rcu_read_unlock(); 189} 190 191/* Change list of non-restricted congestion control */ 192int tcp_set_allowed_congestion_control(char *val) 193{ 194 struct tcp_congestion_ops *ca; 195 char *clone, *name; 196 int ret = 0; 197 198 clone = kstrdup(val, GFP_USER); 199 if (!clone) 200 return -ENOMEM; 201 202 spin_lock(&tcp_cong_list_lock); 203 /* pass 1 check for bad entries */ 204 while ((name = strsep(&clone, " ")) && *name) { 205 ca = tcp_ca_find(name); 206 if (!ca) { 207 ret = -ENOENT; 208 goto out; 209 } 210 } 211 212 /* pass 2 clear */ 213 list_for_each_entry_rcu(ca, &tcp_cong_list, list) 214 ca->non_restricted = 0; 215 216 /* pass 3 mark as allowed */ 217 while ((name = strsep(&val, " ")) && *name) { 218 ca = tcp_ca_find(name); 219 WARN_ON(!ca); 220 if (ca) 221 ca->non_restricted = 1; 222 } 223out: 224 spin_unlock(&tcp_cong_list_lock); 225 226 return ret; 227} 228 229 230/* Change congestion control for socket */ 231int tcp_set_congestion_control(struct sock *sk, const char *name) 232{ 233 struct inet_connection_sock *icsk = inet_csk(sk); 234 struct tcp_congestion_ops *ca; 235 int err = 0; 236 237 rcu_read_lock(); 238 ca = tcp_ca_find(name); 239 /* no change asking for existing value */ 240 if (ca == icsk->icsk_ca_ops) 241 goto out; 242 243#ifdef CONFIG_KMOD 244 /* not found attempt to autoload module */ 245 if (!ca && capable(CAP_SYS_MODULE)) { 246 rcu_read_unlock(); 247 request_module("tcp_%s", name); 248 rcu_read_lock(); 249 ca = tcp_ca_find(name); 250 } 251#endif 252 if (!ca) 253 err = -ENOENT; 254 255 else if (!(ca->non_restricted || capable(CAP_NET_ADMIN))) 256 err = -EPERM; 257 258 else if (!try_module_get(ca->owner)) 259 err = -EBUSY; 260 261 else { 262 tcp_cleanup_congestion_control(sk); 263 icsk->icsk_ca_ops = ca; 264 if (icsk->icsk_ca_ops->init) 265 icsk->icsk_ca_ops->init(sk); 266 } 267 out: 268 rcu_read_unlock(); 269 return err; 270} 271 272 273/* 274 * Linear increase during slow start 275 */ 276void 
tcp_slow_start(struct tcp_sock *tp) 277{ 278 if (sysctl_tcp_abc) { 279 /* RFC3465: Slow Start 280 * TCP sender SHOULD increase cwnd by the number of 281 * previously unacknowledged bytes ACKed by each incoming 282 * acknowledgment, provided the increase is not more than L 283 */ 284 if (tp->bytes_acked < tp->mss_cache) 285 return; 286 287 /* We MAY increase by 2 if discovered delayed ack */ 288 if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2*tp->mss_cache) { 289 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 290 tp->snd_cwnd++; 291 } 292 } 293 tp->bytes_acked = 0; 294 295 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 296 tp->snd_cwnd++; 297} 298EXPORT_SYMBOL_GPL(tcp_slow_start); 299 300/* 301 * TCP Reno congestion control 302 * This is special case used for fallback as well. 303 */ 304/* This is Jacobson's slow start and congestion avoidance. 305 * SIGCOMM '88, p. 328. 306 */ 307void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 rtt, u32 in_flight, 308 int flag) 309{ 310 struct tcp_sock *tp = tcp_sk(sk); 311 312 if (!tcp_is_cwnd_limited(sk, in_flight)) 313 return; 314 315 /* In "safe" area, increase. */ 316 if (tp->snd_cwnd <= tp->snd_ssthresh) 317 tcp_slow_start(tp); 318 319 /* In dangerous area, increase slowly. 
*/ 320 else if (sysctl_tcp_abc) { 321 /* RFC3465: Appropriate Byte Count 322 * increase once for each full cwnd acked 323 */ 324 if (tp->bytes_acked >= tp->snd_cwnd*tp->mss_cache) { 325 tp->bytes_acked -= tp->snd_cwnd*tp->mss_cache; 326 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 327 tp->snd_cwnd++; 328 } 329 } else { 330 /* In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd */ 331 if (tp->snd_cwnd_cnt >= tp->snd_cwnd) { 332 if (tp->snd_cwnd < tp->snd_cwnd_clamp) 333 tp->snd_cwnd++; 334 tp->snd_cwnd_cnt = 0; 335 } else 336 tp->snd_cwnd_cnt++; 337 } 338} 339EXPORT_SYMBOL_GPL(tcp_reno_cong_avoid); 340 341/* Slow start threshold is half the congestion window (min 2) */ 342u32 tcp_reno_ssthresh(struct sock *sk) 343{ 344 const struct tcp_sock *tp = tcp_sk(sk); 345 return max(tp->snd_cwnd >> 1U, 2U); 346} 347EXPORT_SYMBOL_GPL(tcp_reno_ssthresh); 348 349/* Lower bound on congestion window with halving. */ 350u32 tcp_reno_min_cwnd(const struct sock *sk) 351{ 352 const struct tcp_sock *tp = tcp_sk(sk); 353 return tp->snd_ssthresh/2; 354} 355EXPORT_SYMBOL_GPL(tcp_reno_min_cwnd); 356 357struct tcp_congestion_ops tcp_reno = { 358 .name = "reno", 359 .non_restricted = 1, 360 .owner = THIS_MODULE, 361 .ssthresh = tcp_reno_ssthresh, 362 .cong_avoid = tcp_reno_cong_avoid, 363 .min_cwnd = tcp_reno_min_cwnd, 364}; 365 366/* Initial congestion control used (until SYN) 367 * really reno under another name so we can tell difference 368 * during tcp_set_default_congestion_control 369 */ 370struct tcp_congestion_ops tcp_init_congestion_ops = { 371 .name = "", 372 .owner = THIS_MODULE, 373 .ssthresh = tcp_reno_ssthresh, 374 .cong_avoid = tcp_reno_cong_avoid, 375 .min_cwnd = tcp_reno_min_cwnd, 376}; 377EXPORT_SYMBOL_GPL(tcp_init_congestion_ops); 378