diff -r -U3 --show-c-function linux-2.4.21/fs/proc/web100.c linux-2.4.21vegas/fs/proc/web100.c --- linux-2.4.21/fs/proc/web100.c 2003-09-26 13:55:08.000000000 -0400 +++ linux-2.4.21vegas/fs/proc/web100.c 2003-10-01 11:44:59.000000000 -0400 @@ -1077,6 +1077,45 @@ static int write_mss(void *buf, struct w } #endif +#ifdef WEB100_PRIVATE +static int write_vegas(void *buf, struct web100stats *stats, struct web100_var *vp) +{ + struct sock *sk = stats->wc_sk; + struct tcp_opt *tp; + __u32 val = *(__u32 *)buf; + + /* Turn Vegas on (val != 0) or off (val == 0) for this connection from the WAD; returns 1 when the stats entry has no socket, 0 otherwise. */ + + if (sk == NULL) + return 1; + tp = &sk->tp_pinfo.af_tcp; + WEB100_VAR_SET(tp, WAD_Vegas, val); + WEB100_VAR_SET(tp,WAD_VegasAlpha,NET100_WAD(tp, WAD_VegasAlpha, sysctl_tcp_vegas_alpha)); + WEB100_VAR_SET(tp,WAD_VegasBeta,NET100_WAD(tp, WAD_VegasBeta, sysctl_tcp_vegas_beta)); + WEB100_VAR_SET(tp,WAD_VegasGamma,NET100_WAD(tp, WAD_VegasGamma, sysctl_tcp_vegas_gamma)); + + if (val == 0) { /* off: stop using Vegas and stop sampling */ + tp->v_do_vegas = 0; + tp->v_doing_vegas_now = 0; + } else { /* vegas on; forget any previously learned base RTT */ + tp->v_do_vegas = 1; + tp->v_baseRTT = 0x7fffffff; + if (tp->ca_state == TCP_CA_Open) { /* enable: start a fresh sampling window right away */ + tp->v_doing_vegas_now = 1; + tp->v_cntRTT = 0; + tp->v_minRTT = 0x7fffffff; + tp->v_beg_snd_nxt = tp->snd_nxt; + /* also init beg_snd_una & beg_snd_cwnd--thd */ + tp->v_beg_snd_una = tp->snd_una; + tp->v_beg_snd_cwnd = tp->snd_cwnd; + } else { + tp->v_doing_vegas_now = 0; + } + } + return 0; +} +#endif + static int write_sndbuf(void *buf, struct web100stats *stats, struct web100_var *vp) { (__u32)(stats->wc_sk->sndbuf) = *(__u32 *)buf; @@ -1331,6 +1370,11 @@ add_var(web100_file_lookup(ino), #name, #ifdef WEB100_PRIVATE ADD_RO_STATSVAR(PROC_CONN_READ, X_FullBursts, WEB100_TYPE_COUNTER32); ADD_RO_STATSVAR(PROC_CONN_READ, X_PushedBursts, WEB100_TYPE_COUNTER32); + ADD_RO_STATSVAR(PROC_CONN_READ, WAD_VegasGTGamma, WEB100_TYPE_COUNTER32); + ADD_RO_STATSVAR(PROC_CONN_READ, WAD_VegasLTAlpha, WEB100_TYPE_COUNTER32); + ADD_RO_STATSVAR(PROC_CONN_READ, 
WAD_VegasGTBeta, WEB100_TYPE_COUNTER32); + ADD_RO_STATSVAR(PROC_CONN_READ, WAD_VegasNoChange, WEB100_TYPE_COUNTER32); + ADD_RO_STATSVAR(PROC_CONN_READ, VegasDiff, WEB100_TYPE_GAUGE32); #endif /* WEB100_PRIVATE */ ADD_RO_STATSVAR(PROC_CONN_READ, X_RcvRTT, WEB100_TYPE_GAUGE32); @@ -1394,5 +1438,11 @@ add_var(web100_file_lookup(ino), #name, ADD_RW_STATSVAR(PROC_CONN_TUNE, WAD_NoAI, WEB100_TYPE_INTEGER); ADD_RW_STATSVAR(PROC_CONN_TUNE, WAD_CwndAdjust, WEB100_TYPE_INTEGER32); ADD_RW_STATSVAR(PROC_CONN_TUNE, WAD_Kaicnt, WEB100_TYPE_INTEGER32); + add_var(web100_file_lookup(PROC_CONN_TUNE), "WAD_Vegas", + WEB100_TYPE_GAUGE32, read_stats, OFFSET_ST(WAD_Vegas), + write_vegas, 0); + ADD_RW_STATSVAR(PROC_CONN_TUNE, WAD_VegasAlpha, WEB100_TYPE_GAUGE32); + ADD_RW_STATSVAR(PROC_CONN_TUNE, WAD_VegasBeta, WEB100_TYPE_GAUGE32); + ADD_RW_STATSVAR(PROC_CONN_TUNE, WAD_VegasGamma, WEB100_TYPE_GAUGE32); #endif /* WEB100_PRIVATE */ } diff -r -U3 --show-c-function linux-2.4.21/include/linux/sysctl.h linux-2.4.21vegas/include/linux/sysctl.h --- linux-2.4.21/include/linux/sysctl.h 2003-08-27 16:14:50.000000000 -0400 +++ linux-2.4.21vegas/include/linux/sysctl.h 2003-09-26 14:25:57.000000000 -0400 @@ -297,6 +297,11 @@ enum NET_TCP_FRTO=92, NET_TCP_LOW_LATENCY=93, NET_IPV4_IPFRAG_SECRET_INTERVAL=94, + NET_TCP_VEGAS=95, + NET_TCP_VEGAS_ALPHA=96, + NET_TCP_VEGAS_BETA=97, + NET_TCP_VEGAS_GAMMA=98, + NET_TCP_VEGAS_TIMRES=99, #ifdef CONFIG_WEB100 NET_IPV4_WEB100_DEFAULT_WSCALE, #endif diff -r -U3 --show-c-function linux-2.4.21/include/net/sock.h linux-2.4.21vegas/include/net/sock.h --- linux-2.4.21/include/net/sock.h 2003-08-27 16:16:13.000000000 -0400 +++ linux-2.4.21vegas/include/net/sock.h 2003-09-05 13:50:09.000000000 -0400 @@ -336,6 +336,16 @@ struct tcp_opt { __u32 snd_cwnd_used; __u32 snd_cwnd_stamp; + /* Vegas variables */ + __u32 v_beg_snd_nxt; /* saves right edge of bytes sent during last RTT */ + __u32 v_beg_snd_una; /* saves the left edge of bytes sent during last RTT */ + __u32 
v_beg_snd_cwnd; /* saves the size of the cwnd */ + __u8 v_do_vegas; /* if true, do vegas for this connection */ + __u8 v_doing_vegas_now;/* if true, do vegas for this RTT */ + __u16 v_cntRTT; /* # of RTT samples taken within last RTT */ + __u32 v_minRTT; /* min of RTT samples within last RTT, stored +1 (usec if tcp_vegas_timres is set, else tcp_time_stamp ticks) */ + __u32 v_baseRTT; /* the min of all Vegas RTT samples seen, stored +1 (same unit as v_minRTT) */ + /* Two commonly used timers in both sender and receiver paths. */ unsigned long timeout; struct timer_list retransmit_timer; /* Resend (no ack) */ diff -r -U3 --show-c-function linux-2.4.21/include/net/tcp.h linux-2.4.21vegas/include/net/tcp.h --- linux-2.4.21/include/net/tcp.h 2003-08-27 16:16:13.000000000 -0400 +++ linux-2.4.21vegas/include/net/tcp.h 2003-10-01 11:43:30.000000000 -0400 @@ -467,6 +467,11 @@ extern int sysctl_tcp_adv_win_scale; extern int sysctl_tcp_tw_reuse; extern int sysctl_tcp_frto; extern int sysctl_tcp_low_latency; +extern int sysctl_tcp_vegas_cong_avoid; +extern int sysctl_tcp_vegas_alpha; +extern int sysctl_tcp_vegas_beta; +extern int sysctl_tcp_vegas_gamma; +extern int sysctl_tcp_vegas_timres; #ifdef CONFIG_WEB100 extern int sysctl_web100_default_wscale; #endif @@ -1126,6 +1131,51 @@ static inline __u32 tcp_recalc_ssthresh( return max(tp->snd_cwnd >> 1U, 2U); } +/* Stop taking Vegas samples for now. */ +#define tcp_vegas_disable(__tp) ((__tp)->v_doing_vegas_now = 0) + +/* Is this TCP connection using Vegas (regardless of whether it is taking + * Vegas measurements at the current time)? 
+ */ +#define tcp_is_vegas(__tp) ((__tp)->v_do_vegas) + +static inline void tcp_vegas_enable(struct tcp_opt *tp) +{ + if (!tcp_is_vegas(tp)) + return; + + /* There are several situations when we must "re-start" Vegas: + * + * o when a connection is established + * o after an RTO + * o after fast recovery + * o when we send a packet and there is no outstanding + * unacknowledged data (restarting an idle connection) + * + * In these circumstances we cannot do a Vegas calculation at the + * end of the first RTT, because any calculation we do is using + * stale info -- both the saved cwnd and congestion feedback are + * stale. + * + * Instead we must wait until the completion of an RTT during + * which we actually receive ACKs. + */ + + /* Begin taking Vegas samples next time we send something. */ + tp->v_doing_vegas_now = 1; + + /* Set the beginning of the next send window. */ + tp->v_beg_snd_nxt = tp->snd_nxt; + /* Also set beg_snd_una and beg_snd_cwnd--thd */ + tp->v_beg_snd_una = tp->snd_una; + tp->v_beg_snd_cwnd = tp->snd_cwnd; + + tp->v_cntRTT = 0; + tp->v_minRTT = 0x7fffffff; +} + +extern void tcp_set_ca_state(struct tcp_opt *tp, u8 ca_state); + /* If cwnd > ssthresh, we may raise ssthresh to be half-way to cwnd. * The exception is rate halving phase, when cwnd is decreasing towards * ssthresh. @@ -1186,7 +1236,7 @@ static inline void tcp_enter_cwr(struct tp->prior_ssthresh = 0; if (tp->ca_state < TCP_CA_CWR) { __tcp_enter_cwr(tp); - tp->ca_state = TCP_CA_CWR; + tcp_set_ca_state(tp, TCP_CA_CWR); } } @@ -1975,4 +2025,23 @@ static inline void tcp_mib_init(void) TCP_ADD_STATS_USER(TcpMaxConn, -1); } +/* Should we be taking Vegas samples right now? */ +#define tcp_vegas_enabled(__tp) ((__tp)->v_doing_vegas_now) + +static inline void tcp_do_vegas(struct tcp_opt *tp, __u8 should_do_vegas) +{ + /* Set up a new TCP connection, depending on whether it should be + * using Vegas or not. 
+ */ + if (should_do_vegas) { + tp->v_do_vegas = 1; + tp->v_baseRTT = 0x7fffffff; + + tcp_vegas_enable(tp); + } else { + tp->v_do_vegas = 0; + tcp_vegas_disable(tp); + } +} + #endif /* _TCP_H */ diff -r -U3 --show-c-function linux-2.4.21/include/net/web100_stats.h linux-2.4.21vegas/include/net/web100_stats.h --- linux-2.4.21/include/net/web100_stats.h 2003-09-26 13:51:00.000000000 -0400 +++ linux-2.4.21vegas/include/net/web100_stats.h 2003-09-29 07:46:10.000000000 -0400 @@ -326,6 +326,15 @@ struct web100directs { INTEGER WAD_NoAI; Integer32 WAD_CwndAdjust; Integer32 WAD_Kaicnt; + Gauge32 WAD_Vegas; + Gauge32 WAD_VegasAlpha; + Gauge32 WAD_VegasBeta; + Gauge32 WAD_VegasGamma; + Gauge32 VegasDiff; + Counter32 WAD_VegasGTGamma; + Counter32 WAD_VegasLTAlpha; + Counter32 WAD_VegasGTBeta; + Counter32 WAD_VegasNoChange; Counter32 X_FullBursts; Counter32 X_PushedBursts; diff -r -U3 --show-c-function linux-2.4.21/net/ipv4/sysctl_net_ipv4.c linux-2.4.21vegas/net/ipv4/sysctl_net_ipv4.c --- linux-2.4.21/net/ipv4/sysctl_net_ipv4.c 2003-08-27 16:13:36.000000000 -0400 +++ linux-2.4.21vegas/net/ipv4/sysctl_net_ipv4.c 2003-09-26 14:35:57.000000000 -0400 @@ -255,6 +255,16 @@ ctl_table ipv4_table[] = { {NET_IPV4_WAD_FLOYD_AIMD, "WAD_FloydAIMD", &sysctl_WAD_FloydAIMD, sizeof(int), 0644, NULL, &proc_dointvec}, #endif /* WEB100_PRIVATE */ + {NET_TCP_VEGAS, "tcp_vegas_cong_avoid", + &sysctl_tcp_vegas_cong_avoid, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_VEGAS_ALPHA, "tcp_vegas_alpha", + &sysctl_tcp_vegas_alpha, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_VEGAS_BETA, "tcp_vegas_beta", + &sysctl_tcp_vegas_beta, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_VEGAS_GAMMA, "tcp_vegas_gamma", + &sysctl_tcp_vegas_gamma, sizeof(int), 0644, NULL, &proc_dointvec}, + {NET_TCP_VEGAS_TIMRES, "tcp_vegas_timres", + &sysctl_tcp_vegas_timres, sizeof(int), 0644, NULL, &proc_dointvec}, {0} }; diff -r -U3 --show-c-function linux-2.4.21/net/ipv4/tcp.c linux-2.4.21vegas/net/ipv4/tcp.c 
--- linux-2.4.21/net/ipv4/tcp.c 2003-09-08 15:07:12.000000000 -0400 +++ linux-2.4.21vegas/net/ipv4/tcp.c 2003-09-05 13:55:28.000000000 -0400 @@ -2171,7 +2171,7 @@ int tcp_disconnect(struct sock *sk, int tp->packets_out = 0; tp->snd_ssthresh = 0x7fffffff; tp->snd_cwnd_cnt = 0; - tp->ca_state = TCP_CA_Open; + tcp_set_ca_state(tp, TCP_CA_Open); tcp_clear_retrans(tp); tcp_delack_init(tp); tp->send_head = NULL; diff -r -U3 --show-c-function linux-2.4.21/net/ipv4/tcp_input.c linux-2.4.21vegas/net/ipv4/tcp_input.c --- linux-2.4.21/net/ipv4/tcp_input.c 2003-09-26 14:02:48.000000000 -0400 +++ linux-2.4.21vegas/net/ipv4/tcp_input.c 2003-12-11 10:32:44.000000000 -0500 @@ -89,6 +89,16 @@ int sysctl_tcp_rfc1337 = 0; int sysctl_tcp_max_orphans = NR_FILE; int sysctl_tcp_frto = 0; +/* Vegas is off by default. */ +int sysctl_tcp_vegas_cong_avoid = 0; +/* Vegas microsecond time resolution is off by default; non-zero makes Vegas use do_gettimeofday() based RTT samples. */ +int sysctl_tcp_vegas_timres = 0; + +/* Default values of the Vegas variables */ +int sysctl_tcp_vegas_alpha = 1; +int sysctl_tcp_vegas_beta = 3; +int sysctl_tcp_vegas_gamma = 1; + #define FLAG_DATA 0x01 /* Incoming frame contained data. */ #define FLAG_WIN_UPDATE 0x02 /* Incoming ACK was a window update. */ #define FLAG_DATA_ACKED 0x04 /* This ACK acknowledged new data. 
*/
@@ -513,6 +523,55 @@ static void tcp_event_data_recv(struct s
 	WEB100_UPDATE_FUNC(tp, web100_update_rcv_nxt(tp));
 }
 
+/* Record a congestion-avoidance state change and keep Vegas sampling in
+ * step with it: on a Vegas connection, entering TCP_CA_Open (re)starts
+ * the sampling window and entering any other state stops sampling.
+ */
+void tcp_set_ca_state(struct tcp_opt *tp, u8 ca_state)
+{
+	if (tcp_is_vegas(tp)) {
+		if (ca_state == TCP_CA_Open)
+			tcp_vegas_enable(tp);
+		else
+			tcp_vegas_disable(tp);
+	}
+	tp->ca_state = ca_state;
+}
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ *   o min-filter RTT samples from within an RTT to get the current
+ *     propagation delay + queuing delay (we are min-filtering to try to
+ *     avoid the effects of delayed ACKs)
+ *   o min-filter RTT samples from a much longer window (forever for now)
+ *     to find the propagation delay (baseRTT)
+ *
+ * tcp_ack() forwards the signed seq_rtt of tcp_clean_rtx_queue(), which is
+ * -1 when no sample was taken; such values are ignored here.
+ */
+static void tcp_vegas_rtt_calc(struct tcp_opt *tp, __u32 rtt)
+{
+	__u32 vrtt;
+
+	/* Without this check -1 + 1 would wrap to 0 and pin both
+	 * v_baseRTT and v_minRTT at zero, a later divisor.
+	 */
+	if ((__s32)rtt < 0)
+		return;
+
+	vrtt = rtt + 1;		/* Never allow zero rtt or baseRTT */
+
+	/* Filter to find propagation delay: */
+	if (vrtt < tp->v_baseRTT)
+		tp->v_baseRTT = vrtt;
+
+	/* Find the min RTT during the last RTT to find
+	 * the current prop. delay + queuing delay:
+	 */
+	tp->v_minRTT = min(tp->v_minRTT, vrtt);
+	tp->v_cntRTT++;
+}
+
 /* Called to compute a smoothed rtt estimate. The data fed to this
  * routine either comes from timestamps, or from segments that were
  * known _not_ to have been retransmitted [see Karn/Partridge
@@ -522,10 +583,18 @@ static void tcp_event_data_recv(struct s
  * To save cycles in the RFC 1323 implementation it was better to break
  * it up into three procedures. 
-- erics */ -static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt) +static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt, + __u32 usrtt) { long m = mrtt; /* RTT */ + if (tcp_vegas_enabled(tp)) { + if (sysctl_tcp_vegas_timres) + tcp_vegas_rtt_calc(tp, usrtt); + else + tcp_vegas_rtt_calc(tp, mrtt); + } + /* The following amusing code comes from Jacobson's * article in SIGCOMM '88. Note that rtt and mdev * are scaled versions of rtt and mean deviation. @@ -757,6 +826,12 @@ static void tcp_init_metrics(struct sock WEB100_VAR_SET(tp, RetranThresh, tp->reordering); } + #ifdef WEB100_PRIVATE + tcp_do_vegas(tp, NET100_WAD(tp, WAD_Vegas, sysctl_tcp_vegas_cong_avoid)); + #else + tcp_do_vegas(tp, sysctl_tcp_vegas_cong_avoid); + #endif + if (dst->rtt == 0) goto reset; @@ -1114,7 +1189,7 @@ void tcp_enter_frto(struct sock *sk) } tcp_sync_left_out(tp); - tp->ca_state = TCP_CA_Open; + tcp_set_ca_state(tp, TCP_CA_Open); tp->frto_highmark = tp->snd_nxt; } @@ -1160,7 +1235,7 @@ void tcp_enter_frto_loss(struct sock *sk tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); - tp->ca_state = TCP_CA_Loss; + tcp_set_ca_state(tp, TCP_CA_Loss); tp->high_seq = tp->frto_highmark; TCP_ECN_queue_cwr(tp); } @@ -1226,7 +1301,7 @@ void tcp_enter_loss(struct sock *sk, int tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering); WEB100_VAR_SET(tp, RetranThresh, tp->reordering); - tp->ca_state = TCP_CA_Loss; + tcp_set_ca_state(tp, TCP_CA_Loss); tp->high_seq = tp->snd_nxt; TCP_ECN_queue_cwr(tp); } @@ -1600,7 +1675,7 @@ static int tcp_try_undo_recovery(struct tcp_moderate_cwnd(tp); return 1; } - tp->ca_state = TCP_CA_Open; + tcp_set_ca_state(tp, TCP_CA_Open); return 0; } @@ -1660,7 +1735,7 @@ static int tcp_try_undo_loss(struct sock tp->retransmits = 0; tp->undo_marker = 0; if (!IsReno(tp)) - tp->ca_state = TCP_CA_Open; + tcp_set_ca_state(tp, TCP_CA_Open); return 1; } return 0; @@ -1679,8 +1754,9 @@ static void 
tcp_try_to_open(struct sock if (tp->retrans_out == 0) tp->retrans_stamp = 0; - if (flag&FLAG_ECE) + if (flag&FLAG_ECE) { tcp_enter_cwr(tp); + } if (tp->ca_state != TCP_CA_CWR) { int state = TCP_CA_Open; @@ -1691,7 +1767,7 @@ static void tcp_try_to_open(struct sock state = TCP_CA_Disorder; if (tp->ca_state != state) { - tp->ca_state = state; + tcp_set_ca_state(tp, state); tp->high_seq = tp->snd_nxt; } tcp_moderate_cwnd(tp); @@ -1766,7 +1842,7 @@ tcp_fastretrans_alert(struct sock *sk, u * is ACKed for CWR bit to reach receiver. */ if (tp->snd_una != tp->high_seq) { tcp_complete_cwr(tp); - tp->ca_state = TCP_CA_Open; + tcp_set_ca_state(tp, TCP_CA_Open); } break; @@ -1777,7 +1853,7 @@ tcp_fastretrans_alert(struct sock *sk, u * catching for all duplicate ACKs. */ IsReno(tp) || tp->snd_una != tp->high_seq) { tp->undo_marker = 0; - tp->ca_state = TCP_CA_Open; + tcp_set_ca_state(tp, TCP_CA_Open); } break; @@ -1851,7 +1927,7 @@ tcp_fastretrans_alert(struct sock *sk, u } tp->snd_cwnd_cnt = 0; - tp->ca_state = TCP_CA_Recovery; + tcp_set_ca_state(tp, TCP_CA_Recovery); WEB100_UPDATE_FUNC(tp, web100_update_congestion(tp, 0)); WEB100_VAR_INC(tp, FastRetran); /* WEB100_XXX */ } @@ -1865,7 +1941,7 @@ tcp_fastretrans_alert(struct sock *sk, u /* Read draft-ietf-tcplw-high-performance before mucking * with this code. 
(Superceeds RFC1323) */ -static void tcp_ack_saw_tstamp(struct sock *sk, int flag) +static void tcp_ack_saw_tstamp(struct sock *sk, u32 seq_usrtt, int flag) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; __u32 seq_rtt; @@ -1887,14 +1963,14 @@ static void tcp_ack_saw_tstamp(struct so */ seq_rtt = tcp_time_stamp - tp->rcv_tsecr; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, seq_usrtt); tcp_set_rto(tp); WEB100_UPDATE_FUNC(tp, web100_update_rtt(tp, seq_rtt)); tp->backoff = 0; tcp_bound_rto(tp); } -static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, int flag) +static void tcp_ack_no_tstamp(struct sock *sk, u32 seq_rtt, u32 seq_usrtt, int flag) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; @@ -1910,7 +1986,7 @@ static void tcp_ack_no_tstamp(struct soc if (flag & FLAG_RETRANS_DATA_ACKED) return; - tcp_rtt_estimator(tp, seq_rtt); + tcp_rtt_estimator(tp, seq_rtt, seq_usrtt); tcp_set_rto(tp); WEB100_UPDATE_FUNC(tp, web100_update_rtt(tp, seq_rtt)); tp->backoff = 0; @@ -1918,20 +1994,22 @@ static void tcp_ack_no_tstamp(struct soc } static __inline__ void -tcp_ack_update_rtt(struct sock *sk, int flag, s32 seq_rtt) +tcp_ack_update_rtt(struct sock *sk, int flag, s32 seq_rtt, u32 seq_usrtt) { struct tcp_opt *tp = &sk->tp_pinfo.af_tcp; /* Note that peer MAY send zero echo. In this case it is ignored. (rfc1323) */ - if (tp->saw_tstamp && tp->rcv_tsecr) - tcp_ack_saw_tstamp(sk, flag); - else if (seq_rtt >= 0) - tcp_ack_no_tstamp(sk, seq_rtt, flag); + if (tp->saw_tstamp && tp->rcv_tsecr) { + tcp_ack_saw_tstamp(sk, seq_usrtt, flag); + } + else if (seq_rtt >= 0) { + tcp_ack_no_tstamp(sk, seq_rtt, seq_usrtt, flag); + } } /* This is Jacobson's slow start and congestion avoidance. * SIGCOMM '88, p. 328. 
*/
-static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
+static void tcp_reno_cong_avoid(struct tcp_opt *tp)
 {
 #ifdef CONFIG_WEB100_STATS
 	if (tp->snd_cwnd > tp->snd_cwnd_clamp) {
@@ -1949,7 +2027,7 @@ static __inline__ void tcp_cong_avoid(st
 			tp->snd_cwnd++;
 		} else {
 			/* Floyd modified slow start */
-			if (NET100_WAD(tp, WAD_sscnt, 0) <
+			if (NET100_WAD(tp, WAD_sscnt, 0) >=
 			    2 * tp->snd_cwnd / max_ssthresh) {
 				tp->snd_cwnd++;
 				WEB100_VAR_SET(tp, WAD_sscnt, 0);
@@ -1996,6 +2074,252 @@ static __inline__ void tcp_cong_avoid(st
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
+/* Once-per-RTT Vegas decision, taken when enough RTT samples were collected.
+ * Compares old_wnd, the number of segments acknowledged during the RTT
+ * that just ended, with the window the base RTT could have sustained and
+ * moves snd_cwnd (and, when leaving slow start, snd_ssthresh) accordingly.
+ * The caller guarantees v_minRTT != 0 and applies the final cwnd bounds.
+ */
+static void tcp_vegas_adjust_cwnd(struct tcp_opt *tp, u32 old_wnd)
+{
+	u32 rtt, target_cwnd, diff;
+	u32 v_alpha, v_beta, v_gamma, step;
+
+	/* Pluck out the RTT we are using for the Vegas calculations.
+	 * This is the min RTT seen during the last RTT. Taking the min
+	 * filters out the effects of delayed ACKs, at the cost of
+	 * noticing congestion a bit later.
+	 */
+	rtt = tp->v_minRTT;
+
+	/* Calculate the cwnd we should have, if we weren't going too fast.
+	 * This is:
+	 *     (actual rate in segments) * baseRTT
+	 *
+	 * NOTE(review): the product is formed in 32 bits and can wrap for
+	 * large windows combined with microsecond RTT values.
+	 */
+	target_cwnd = (old_wnd * tp->v_baseRTT) / rtt;
+
+	/* Calculate the difference between the window we had, and the
+	 * window we would like to have. This quantity is the "Diff" from
+	 * the Arizona Vegas papers.
+	 */
+	diff = old_wnd - target_cwnd;
+
+	WEB100_VAR_SET(tp, VegasDiff, diff);
+
+	if (tp->snd_cwnd < tp->snd_ssthresh) {
+		/* Slow start. */
+#ifdef WEB100_PRIVATE
+		v_gamma = NET100_WAD(tp, WAD_VegasGamma, sysctl_tcp_vegas_gamma);
+#else
+		v_gamma = sysctl_tcp_vegas_gamma;
+#endif
+		if (diff > v_gamma) {
+#ifdef WEB100_PRIVATE
+			WEB100_VAR_INC(tp, WAD_VegasGTGamma);
+#endif
+			/* Going too fast. Time to slow down
+			 * and switch to congestion avoidance.
+			 */
+			tp->snd_ssthresh = 2;
+
+			/* Set cwnd to match the actual rate exactly:
+			 *     cwnd = (actual rate) * baseRTT
+			 * Then we add 1 because the integer truncation
+			 * robs us of full link utilization.
+			 */
+			tp->snd_cwnd = min(tp->snd_cwnd, target_cwnd + 1);
+		}
+		return;
+	}
+
+	/* Congestion avoidance. */
+#ifdef WEB100_PRIVATE
+	if (NET100_WAD(tp, WAD_FloydAIMD, sysctl_WAD_FloydAIMD))
+		web100_update_floyd_aimd(tp);	/* Update AI MD */
+	v_alpha = NET100_WAD(tp, WAD_VegasAlpha, sysctl_tcp_vegas_alpha);
+	v_beta = NET100_WAD(tp, WAD_VegasBeta, sysctl_tcp_vegas_beta);
+	step = NET100_WAD(tp, WAD_AI, 1<<3) >> 3;
+#else
+	v_alpha = sysctl_tcp_vegas_alpha;
+	v_beta = sysctl_tcp_vegas_beta;
+	step = 1;
+#endif
+
+	if (diff > v_beta) {
+		/* The old window was too fast, so we slow down. Saturate
+		 * at the 2 MSS floor instead of letting the unsigned cwnd
+		 * wrap around when step >= cwnd.
+		 */
+#ifdef WEB100_PRIVATE
+		WEB100_VAR_INC(tp, WAD_VegasGTBeta);
+#endif
+		tp->snd_cwnd = (tp->snd_cwnd > step) ?
+			tp->snd_cwnd - step : 2;
+	} else if (diff < v_alpha) {
+		/* We don't have enough extra packets
+		 * in the network, so speed up.
+		 */
+#ifdef WEB100_PRIVATE
+		WEB100_VAR_INC(tp, WAD_VegasLTAlpha);
+#endif
+		tp->snd_cwnd += step;
+	} else {
+		/* Sending just as fast as we should be. */
+#ifdef WEB100_PRIVATE
+		WEB100_VAR_INC(tp, WAD_VegasNoChange);
+#endif
+	}
+}
+
+/* This is based on the congestion detection/avoidance scheme described in
+ *    Lawrence S. Brakmo and Larry L. Peterson.
+ *    "TCP Vegas: End to end congestion avoidance on a global internet."
+ *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ *    October 1995. Available from:
+ *	ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ *   o We do not change the loss detection or recovery mechanisms of
+ *     Linux in any way. Linux already recovers from losses quite well,
+ *     using fine-grained timers, NewReno, and FACK.
+ *   o To avoid the performance penalty imposed by increasing cwnd
+ *     only every-other RTT during slow start, we increase during
+ *     every RTT during slow start, just like Reno.
+ *   o Largely to allow continuous cwnd growth during slow start,
+ *     we use the rate at which ACKs come back as the "actual"
+ *     rate, rather than the rate at which data is sent.
+ *   o To speed convergence to the right rate, we set the cwnd
+ *     to achieve the right ("actual") rate when we exit slow start.
+ *   o To filter out the noise caused by delayed ACKs, we use the
+ *     minimum RTT sample observed during the last RTT to calculate
+ *     the actual rate.
+ *   o When the sender re-starts from idle, it waits until it has
+ *     received ACKs for an entire flight of new data before making
+ *     a cwnd adjustment decision. The original Vegas implementation
+ *     assumed senders never went idle.
+ *
+ * seq_rtt is the RTT sample belonging to this ACK, in the same unit as the
+ * samples given to tcp_vegas_rtt_calc(); tcp_ack() forwards the signed
+ * value from tcp_clean_rtx_queue(), which is -1 when no sample was taken.
+ */
+static void tcp_vegas_cong_avoid(struct tcp_opt *tp, u32 ack, u32 seq_rtt)
+{
+	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
+	 *
+	 * These are so named because they represent the approximate values
+	 * of snd_una and snd_nxt at the beginning of the current RTT. More
+	 * precisely, they represent the amount of data sent during the RTT.
+	 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
+	 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
+	 * bytes of data have been ACKed during the course of the RTT, giving
+	 * an "actual" rate of:
+	 *
+	 *     (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
+	 *
+	 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
+	 * because delayed ACKs can cover more than one segment, so they
+	 * don't line up nicely with the boundaries of RTTs.
+	 *
+	 * Another unfortunate fact of life is that delayed ACKs delay the
+	 * advance of the left edge of our send window, so that the number
+	 * of bytes we send in an RTT is often less than our cwnd will allow.
+	 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
+	 */
+
+	if (after(ack, tp->v_beg_snd_nxt)) {
+		/* Do the Vegas once-per-RTT cwnd adjustment. */
+		u32 old_wnd;
+
+		/* Here old_wnd is essentially the window of data that was
+		 * sent during the previous RTT, and has all
+		 * been acknowledged in the course of the RTT that ended
+		 * with the ACK we just received.
+		 */
+		old_wnd = (tp->v_beg_snd_nxt - tp->v_beg_snd_una) /
+			tp->mss_cache;
+
+		/* Save the extent of the current window so we can use this
+		 * at the end of the next RTT.
+		 */
+		tp->v_beg_snd_una = tp->v_beg_snd_nxt;
+		tp->v_beg_snd_nxt = tp->snd_nxt;
+		tp->v_beg_snd_cwnd = tp->snd_cwnd;
+
+		/* Take into account the current RTT sample too, to
+		 * decrease the impact of delayed acks. This double counts
+		 * this sample since we count it for the next window as well,
+		 * but that's not too awful, since we're taking the min,
+		 * rather than averaging. A negative seq_rtt is "no sample"
+		 * and must not be fed in: -1 + 1 would wrap to an RTT of 0.
+		 */
+		if ((s32)seq_rtt >= 0)
+			tcp_vegas_rtt_calc(tp, seq_rtt);
+
+		/* We do the Vegas calculations only if we got enough RTT
+		 * samples that we can be reasonably sure that we got
+		 * at least one RTT sample that wasn't from a delayed ACK.
+		 * If we only had 2 samples total,
+		 * then that means we're getting only 1 ACK per RTT, which
+		 * means they're almost certainly delayed ACKs.
+		 * If we have 3 samples, we should be OK.
+		 * A zero v_minRTT would be used as a divisor, so it is
+		 * treated like a missing measurement.
+		 */
+		if (tp->v_cntRTT <= 2 || tp->v_minRTT == 0) {
+			/* We don't have enough RTT samples to do the Vegas
+			 * calculation, so we'll behave like Reno.
+			 */
+			if (tp->snd_cwnd > tp->snd_ssthresh)
+				tp->snd_cwnd++;
+		} else {
+			tcp_vegas_adjust_cwnd(tp, old_wnd);
+		}
+
+		/* Wipe the slate clean for the next RTT. */
+		tp->v_cntRTT = 0;
+		tp->v_minRTT = 0x7fffffff;
+	}
+
+	/* The following code is executed for every ack we receive,
+	 * except for the conditions checked in tcp_ack() before the
+	 * call to tcp_cong_avoid(). Mainly this means that we only
+	 * execute this code if the ack actually acked some data.
+	 */
+
+	/* If we are in slow start, increase our cwnd in response to this ACK.
+	 * (If we are not in slow start then we are in congestion avoidance,
+	 * and adjust our congestion window only once per RTT. See the code
+	 * above.)
+	 */
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tp->snd_cwnd++;
+
+	/* to keep cwnd from growing without bound-thd */
+	tp->snd_cwnd = min(tp->snd_cwnd, (__u32)tp->snd_cwnd_clamp);
+
+	/* Make sure that we are never so timid as to reduce our cwnd below
+	 * 2 MSS.
+	 *
+	 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
+	 */
+	if (tp->snd_cwnd < 2)
+		tp->snd_cwnd = 2;
+}
+
+/* Grow cwnd with Vegas while it is taking samples, otherwise with Reno. */
+static inline void tcp_cong_avoid(struct tcp_opt *tp, u32 ack, u32 seq_rtt)
+{
+	if (tcp_vegas_enabled(tp))
+		tcp_vegas_cong_avoid(tp, ack, seq_rtt);
+	else
+		tcp_reno_cong_avoid(tp);
+}
+
 /* Restart timer after forward progress on connection.
  * RFC2988 recommends to restart timer to now+rto.
  */
@@ -2010,14 +2365,20 @@ static __inline__ void tcp_ack_packets_o
 }
 
 /* Remove acknowledged frames from the retransmission queue. 
*/
-static int tcp_clean_rtx_queue(struct sock *sk)
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p, u32 *seq_usrtt)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	struct sk_buff *skb;
 	__u32 now = tcp_time_stamp;
+	struct timeval usnow;
 	int acked = 0;
 	__s32 seq_rtt = -1;
+	/* Sampled once, so usnow is always set whenever it is read below. */
+	int use_usrtt = tcp_vegas_enabled(tp) && sysctl_tcp_vegas_timres;
 
+	if (use_usrtt)
+		do_gettimeofday(&usnow);	/* microsecond-resolution time */
+
 	while((skb=skb_peek(&sk->write_queue)) && (skb != tp->send_head)) {
 		struct tcp_skb_cb *scb = TCP_SKB_CB(skb);
 		__u8 sacked = scb->sacked;
@@ -2062,6 +2423,10 @@ static int tcp_clean_rtx_queue(struct so
 			}
 		} else if (seq_rtt < 0)
 			seq_rtt = now - scb->when;
+		if (use_usrtt)	/* usec since this skb was stamped on transmit */
+			*seq_usrtt = (usnow.tv_sec - skb->stamp.tv_sec) * 1000000 +
+				     (usnow.tv_usec - skb->stamp.tv_usec);
+
 		if(tp->fackets_out)
 			tp->fackets_out--;
 		tp->packets_out--;
@@ -2070,7 +2435,7 @@ static int tcp_clean_rtx_queue(struct so
 	}
 
 	if (acked&FLAG_ACKED) {
-		tcp_ack_update_rtt(sk, acked, seq_rtt);
+		tcp_ack_update_rtt(sk, acked, seq_rtt, *(seq_usrtt));
 		tcp_ack_packets_out(sk, tp);
 	}
 
@@ -2093,6 +2458,7 @@ static int tcp_clean_rtx_queue(struct so
 		}
 	}
 #endif
+	*seq_rtt_p = seq_rtt;	/* tcp_time_stamp based sample, or -1 if none was taken */
 	return acked;
 }
 
@@ -2217,6 +2583,10 @@ static int tcp_ack(struct sock *sk, stru
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	u32 prior_in_flight;
+	s32 seq_rtt;
+	u32 seq_usrtt = 0; /* microsecond resolution rtt */
+	u32 cong_rtt;		/* sample handed to tcp_cong_avoid() */
+	int cwnd_limited;	/* may this ACK grow cwnd at all? */
 	int prior_packets;
 
 	/* If the ack is newer than sent or older than previous acks
@@ -2267,21 +2635,32 @@ static int tcp_ack(struct sock *sk, stru
 	prior_in_flight = tcp_packets_in_flight(tp);
 
 	/* See if we can take anything off of the retransmit queue. */
-	flag |= tcp_clean_rtx_queue(sk);
+	flag |= tcp_clean_rtx_queue(sk, &seq_rtt, &seq_usrtt);
 
 	if (tp->frto_counter)
 		tcp_process_frto(sk, prior_snd_una);
 
+	/* RTT sample handed to congestion avoidance, in the resolution
+	 * selected by tcp_vegas_timres.
+	 */
+	cong_rtt = sysctl_tcp_vegas_timres ? seq_usrtt : (u32)seq_rtt;
+
+	/* Vegas has to see every ACK of new data; Reno only grows the
+	 * window while the sender was actually limited by cwnd.
+	 */
+	cwnd_limited = tcp_vegas_enabled(tp) ||
+		       prior_in_flight >= tp->snd_cwnd;
+
 	if (tcp_ack_is_dubious(tp, flag)) {
 		/* Advanve CWND, if state allows this. */
-		if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd &&
+		if ((flag&FLAG_DATA_ACKED) && cwnd_limited &&
 		    tcp_may_raise_cwnd(tp, flag))
-			tcp_cong_avoid(tp);
+			tcp_cong_avoid(tp, ack, cong_rtt);
 		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
 	} else {
-		if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd)
-			tcp_cong_avoid(tp);
+		if ((flag&FLAG_DATA_ACKED) && cwnd_limited)
+			tcp_cong_avoid(tp, ack, cong_rtt);
 	}
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
 		dst_confirm(sk->dst_cache);
@@ -4205,9 +4602,9 @@ int tcp_rcv_state_process(struct sock *s
 				 * and does not calculate rtt.
 				 * Fix it at least with timestamps.
*/ - if (tp->saw_tstamp && tp->rcv_tsecr && !tp->srtt) - tcp_ack_saw_tstamp(sk, 0); - + if (tp->saw_tstamp && tp->rcv_tsecr && !tp->srtt) { + tcp_ack_saw_tstamp(sk, 0, 0); + } if (tp->tstamp_ok) tp->advmss -= TCPOLEN_TSTAMP_ALIGNED; diff -r -U3 --show-c-function linux-2.4.21/net/ipv4/tcp_minisocks.c linux-2.4.21vegas/net/ipv4/tcp_minisocks.c --- linux-2.4.21/net/ipv4/tcp_minisocks.c 2003-08-27 16:13:37.000000000 -0400 +++ linux-2.4.21vegas/net/ipv4/tcp_minisocks.c 2003-09-05 13:55:27.000000000 -0400 @@ -721,7 +721,7 @@ struct sock *tcp_create_openreq_child(st newtp->frto_counter = 0; newtp->frto_highmark = 0; - newtp->ca_state = TCP_CA_Open; + tcp_set_ca_state(newtp, TCP_CA_Open); tcp_init_xmit_timers(newsk); skb_queue_head_init(&newtp->out_of_order_queue); newtp->send_head = NULL; diff -r -U3 --show-c-function linux-2.4.21/net/ipv4/tcp_output.c linux-2.4.21vegas/net/ipv4/tcp_output.c --- linux-2.4.21/net/ipv4/tcp_output.c 2003-08-27 16:13:37.000000000 -0400 +++ linux-2.4.21vegas/net/ipv4/tcp_output.c 2003-09-26 14:39:30.000000000 -0400 @@ -106,6 +106,8 @@ static void tcp_cwnd_restart(struct tcp_ u32 restart_cwnd = tcp_init_cwnd(tp); u32 cwnd = tp->snd_cwnd; + tcp_vegas_enable(tp); + tp->snd_ssthresh = tcp_current_ssthresh(tp); restart_cwnd = min(restart_cwnd, cwnd); @@ -363,6 +365,10 @@ void tcp_send_skb(struct sock *sk, struc (why = tcp_snd_wait(tp, skb, cur_mss, tp->nonagle)) == WC_SNDLIM_NONE) { /* Send it out now. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_is_vegas(tp)) + if (sysctl_tcp_vegas_timres) + do_gettimeofday(&skb->stamp); + if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) { #ifdef CONFIG_WEB100_STATS if (tp->tcp_stats->wc_vars.X_SBufMode == WC_BUFMODE_WEB100) @@ -397,6 +403,10 @@ void tcp_push_one(struct sock *sk, unsig if ((why = tcp_snd_wait(tp, skb, cur_mss, 1)) == WC_SNDLIM_NONE) { /* Send it out now. 
*/ TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_is_vegas(tp)) + if (sysctl_tcp_vegas_timres) + do_gettimeofday(&skb->stamp); + if (tcp_transmit_skb(sk, skb_clone(skb, sk->allocation)) == 0) { #ifdef CONFIG_WEB100_STATS if (tp->tcp_stats->wc_vars.X_SBufMode == WC_BUFMODE_WEB100) @@ -643,6 +653,10 @@ int tcp_write_xmit(struct sock *sk, int } TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_is_vegas(tp)) + if (sysctl_tcp_vegas_timres) + do_gettimeofday(&skb->stamp); + if (tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC))) { why = WC_SNDLIM_SENDER; break; @@ -897,7 +911,7 @@ void tcp_simple_retransmit(struct sock * tp->snd_ssthresh = tcp_current_ssthresh(tp); tp->prior_ssthresh = 0; tp->undo_marker = 0; - tp->ca_state = TCP_CA_Loss; + tcp_set_ca_state(tp, TCP_CA_Loss); } tcp_xmit_retransmit_queue(sk); } @@ -965,6 +979,9 @@ int tcp_retransmit_skb(struct sock *sk, * is still in somebody's hands, else make a clone. */ TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_is_vegas(tp)) + if (sysctl_tcp_vegas_timres) + do_gettimeofday(&skb->stamp); err = tcp_transmit_skb(sk, (skb_cloned(skb) ? pskb_copy(skb, GFP_ATOMIC): @@ -1151,6 +1168,10 @@ void tcp_send_active_reset(struct sock * TCP_SKB_CB(skb)->seq = tcp_acceptable_seq(sk, tp); TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_is_vegas(tp)) + if (sysctl_tcp_vegas_timres) + do_gettimeofday(&skb->stamp); + if (tcp_transmit_skb(sk, skb)) NET_INC_STATS(TCPAbortFailed); } @@ -1333,6 +1354,10 @@ int tcp_connect(struct sock *sk) /* Send it off. */ TCP_SKB_CB(buff)->when = tcp_time_stamp; + if (tcp_is_vegas(tp)) + if (sysctl_tcp_vegas_timres) + do_gettimeofday(&buff->stamp); + tp->retrans_stamp = TCP_SKB_CB(buff)->when; __skb_queue_tail(&sk->write_queue, buff); tcp_charge_skb(sk, buff); @@ -1466,6 +1491,10 @@ static int tcp_xmit_probe_skb(struct soc TCP_SKB_CB(skb)->seq = urgent ? 
tp->snd_una : tp->snd_una - 1; TCP_SKB_CB(skb)->end_seq = TCP_SKB_CB(skb)->seq; TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_is_vegas(tp)) + if (sysctl_tcp_vegas_timres) + do_gettimeofday(&skb->stamp); + return tcp_transmit_skb(sk, skb); } @@ -1497,6 +1526,10 @@ int tcp_write_wakeup(struct sock *sk) } TCP_SKB_CB(skb)->flags |= TCPCB_FLAG_PSH; TCP_SKB_CB(skb)->when = tcp_time_stamp; + if (tcp_is_vegas(tp)) + if (sysctl_tcp_vegas_timres) + do_gettimeofday(&skb->stamp); + err = tcp_transmit_skb(sk, skb_clone(skb, GFP_ATOMIC)); if (!err) { update_send_head(sk, tp, skb);