From 4aca87515a5083ae0e31ce3177189fd43b6d05ac Mon Sep 17 00:00:00 2001
From: Andreas Baumann
Date: Sat, 3 Jan 2015 13:58:15 +0100
Subject: patch to Vanilla Tomato 1.28

---
 release/src/linux/linux/net/ipv4/tcp_input.c | 311 +++++++++++++++++++++++++--
 1 file changed, 297 insertions(+), 14 deletions(-)

(limited to 'release/src/linux/linux/net/ipv4/tcp_input.c')

diff --git a/release/src/linux/linux/net/ipv4/tcp_input.c b/release/src/linux/linux/net/ipv4/tcp_input.c
index 8c99dd52..243e2991 100644
--- a/release/src/linux/linux/net/ipv4/tcp_input.c
+++ b/release/src/linux/linux/net/ipv4/tcp_input.c
@@ -87,6 +87,16 @@ int sysctl_tcp_stdurg = 0;
 int sysctl_tcp_rfc1337 = 0;
 int sysctl_tcp_max_orphans = NR_FILE;
 
+int sysctl_tcp_vegas_cong_avoid = 0;
+
+/* Default values of the Vegas variables, in fixed-point representation
+ * with V_PARAM_SHIFT bits to the right of the binary point.
+ */
+#define V_PARAM_SHIFT 1
+int sysctl_tcp_vegas_alpha = 1<<V_PARAM_SHIFT;
+int sysctl_tcp_vegas_beta  = 3<<V_PARAM_SHIFT;
+int sysctl_tcp_vegas_gamma = 1<<V_PARAM_SHIFT;
+

[...]

+static void tcp_vegas_init(struct tcp_opt *tp)
+{
+	if (sysctl_tcp_vegas_cong_avoid) {
+		tp->vegas.do_vegas = 1;
+		tp->vegas.baseRTT = 0x7fffffff;
+		tcp_vegas_enable(tp);
+	} else
+		tcp_vegas_disable(tp);
+}
+
+/* Do RTT sampling needed for Vegas.
+ * Basically we:
+ *   o min-filter RTT samples from within an RTT to get the current
+ *     propagation delay + queuing delay (we are min-filtering to try to
+ *     avoid the effects of delayed ACKs)
+ *   o min-filter RTT samples from a much longer window (forever for now)
+ *     to find the propagation delay (baseRTT)
+ */
+static inline void vegas_rtt_calc(struct tcp_opt *tp, __u32 rtt)
+{
+	__u32 vrtt = rtt + 1;	/* Never allow zero rtt or baseRTT */
+
+	/* Filter to find propagation delay: */
+	if (vrtt < tp->vegas.baseRTT)
+		tp->vegas.baseRTT = vrtt;
+
+	/* Find the min RTT during the last RTT to find
+	 * the current prop. delay + queuing delay:
+	 */
+	tp->vegas.minRTT = min(tp->vegas.minRTT, vrtt);
+	tp->vegas.cntRTT++;
+}
+
 /* Called to compute a smoothed rtt estimate. The data fed to this
  * routine either comes from timestamps, or from segments that were
  * known _not_ to have been retransmitted [see Karn/Partridge
@@ -412,6 +458,9 @@ static __inline__ void tcp_rtt_estimator(struct tcp_opt *tp, __u32 mrtt)
 {
 	long m = mrtt;	/* RTT */
 
+	if (tcp_vegas_enabled(tp))
+		vegas_rtt_calc(tp, mrtt);
+
 	/* The following amusing code comes from Jacobson's
 	 * article in SIGCOMM '88. Note that rtt and mdev
 	 * are scaled versions of rtt and mean deviation.
@@ -1013,7 +1062,7 @@ void tcp_enter_loss(struct sock *sk, int how)
 	tcp_sync_left_out(tp);
 
 	tp->reordering = min_t(unsigned int, tp->reordering, sysctl_tcp_reordering);
-	tp->ca_state = TCP_CA_Loss;
+	tcp_set_ca_state(tp, TCP_CA_Loss);
 	tp->high_seq = tp->snd_nxt;
 	TCP_ECN_queue_cwr(tp);
 }
@@ -1375,7 +1424,7 @@ static int tcp_try_undo_recovery(struct sock *sk, struct tcp_opt *tp)
 		tcp_moderate_cwnd(tp);
 		return 1;
 	}
-	tp->ca_state = TCP_CA_Open;
+	tcp_set_ca_state(tp, TCP_CA_Open);
 	return 0;
 }
@@ -1435,7 +1484,7 @@ static int tcp_try_undo_loss(struct sock *sk, struct tcp_opt *tp)
 		tp->retransmits = 0;
 		tp->undo_marker = 0;
 		if (!IsReno(tp))
-			tp->ca_state = TCP_CA_Open;
+			tcp_set_ca_state(tp, TCP_CA_Open);
 		return 1;
 	}
 	return 0;
@@ -1466,7 +1515,7 @@ static void tcp_try_to_open(struct sock *sk, struct tcp_opt *tp, int flag)
 		state = TCP_CA_Disorder;
 
 	if (tp->ca_state != state) {
-		tp->ca_state = state;
+		tcp_set_ca_state(tp, state);
 		tp->high_seq = tp->snd_nxt;
 	}
 	tcp_moderate_cwnd(tp);
@@ -1540,7 +1589,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		 * is ACKed for CWR bit to reach receiver.
 		 */
 		if (tp->snd_una != tp->high_seq) {
 			tcp_complete_cwr(tp);
-			tp->ca_state = TCP_CA_Open;
+			tcp_set_ca_state(tp, TCP_CA_Open);
 		}
 		break;
 
@@ -1551,7 +1600,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		     * catching for all duplicate ACKs. */
 		    IsReno(tp) || tp->snd_una != tp->high_seq) {
 			tp->undo_marker = 0;
-			tp->ca_state = TCP_CA_Open;
+			tcp_set_ca_state(tp, TCP_CA_Open);
 		}
 		break;
 
@@ -1625,7 +1674,7 @@ tcp_fastretrans_alert(struct sock *sk, u32 prior_snd_una,
 		}
 		tp->snd_cwnd_cnt = 0;
-		tp->ca_state = TCP_CA_Recovery;
+		tcp_set_ca_state(tp, TCP_CA_Recovery);
 	}
 
 	if (is_dupack || tcp_head_timedout(sk, tp))
@@ -1696,7 +1745,7 @@ tcp_ack_update_rtt(struct tcp_opt *tp, int flag, s32 seq_rtt)
 /* This is Jacobson's slow start and congestion avoidance.
  * SIGCOMM '88, p. 328.
  */
-static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
+static __inline__ void reno_cong_avoid(struct tcp_opt *tp)
 {
 	if (tp->snd_cwnd <= tp->snd_ssthresh) {
 		/* In "safe" area, increase. */
@@ -1716,6 +1765,236 @@ static __inline__ void tcp_cong_avoid(struct tcp_opt *tp)
 	tp->snd_cwnd_stamp = tcp_time_stamp;
 }
 
+/* This is based on the congestion detection/avoidance scheme described in
+ *    Lawrence S. Brakmo and Larry L. Peterson.
+ *    "TCP Vegas: End to end congestion avoidance on a global internet."
+ *    IEEE Journal on Selected Areas in Communication, 13(8):1465--1480,
+ *    October 1995. Available from:
+ *    ftp://ftp.cs.arizona.edu/xkernel/Papers/jsac.ps
+ *
+ * See http://www.cs.arizona.edu/xkernel/ for their implementation.
+ * The main aspects that distinguish this implementation from the
+ * Arizona Vegas implementation are:
+ *   o We do not change the loss detection or recovery mechanisms of
+ *     Linux in any way. Linux already recovers from losses quite well,
+ *     using fine-grained timers, NewReno, and FACK.
+ *   o To avoid the performance penalty imposed by increasing cwnd
+ *     only every-other RTT during slow start, we increase during
+ *     every RTT during slow start, just like Reno.
+ *   o Largely to allow continuous cwnd growth during slow start,
+ *     we use the rate at which ACKs come back as the "actual"
+ *     rate, rather than the rate at which data is sent.
+ *   o To speed convergence to the right rate, we set the cwnd
+ *     to achieve the right ("actual") rate when we exit slow start.
+ *   o To filter out the noise caused by delayed ACKs, we use the
+ *     minimum RTT sample observed during the last RTT to calculate
+ *     the actual rate.
+ *   o When the sender re-starts from idle, it waits until it has
+ *     received ACKs for an entire flight of new data before making
+ *     a cwnd adjustment decision. The original Vegas implementation
+ *     assumed senders never went idle.
+ */
+static void vegas_cong_avoid(struct tcp_opt *tp, u32 ack, u32 seq_rtt)
+{
+	/* The key players are v_beg_snd_una and v_beg_snd_nxt.
+	 *
+	 * These are so named because they represent the approximate values
+	 * of snd_una and snd_nxt at the beginning of the current RTT. More
+	 * precisely, they represent the amount of data sent during the RTT.
+	 * At the end of the RTT, when we receive an ACK for v_beg_snd_nxt,
+	 * we will calculate that (v_beg_snd_nxt - v_beg_snd_una) outstanding
+	 * bytes of data have been ACKed during the course of the RTT, giving
+	 * an "actual" rate of:
+	 *
+	 *     (v_beg_snd_nxt - v_beg_snd_una) / (rtt duration)
+	 *
+	 * Unfortunately, v_beg_snd_una is not exactly equal to snd_una,
+	 * because delayed ACKs can cover more than one segment, so they
+	 * don't line up nicely with the boundaries of RTTs.
+	 *
+	 * Another unfortunate fact of life is that delayed ACKs delay the
+	 * advance of the left edge of our send window, so that the number
+	 * of bytes we send in an RTT is often less than our cwnd will allow.
+	 * So we keep track of our cwnd separately, in v_beg_snd_cwnd.
+	 */
+
+	if (after(ack, tp->vegas.beg_snd_nxt)) {
+		/* Do the Vegas once-per-RTT cwnd adjustment. */
+		u32 old_wnd, old_snd_cwnd;
+
+
+		/* Here old_wnd is essentially the window of data that was
+		 * sent during the previous RTT, and has all
+		 * been acknowledged in the course of the RTT that ended
+		 * with the ACK we just received. Likewise, old_snd_cwnd
+		 * is the cwnd during the previous RTT.
+		 */
+		old_wnd = (tp->vegas.beg_snd_nxt - tp->vegas.beg_snd_una) /
+			tp->mss_cache;
+		old_snd_cwnd = tp->vegas.beg_snd_cwnd;
+
+		/* Save the extent of the current window so we can use this
+		 * at the end of the next RTT.
+		 */
+		tp->vegas.beg_snd_una  = tp->vegas.beg_snd_nxt;
+		tp->vegas.beg_snd_nxt  = tp->snd_nxt;
+		tp->vegas.beg_snd_cwnd = tp->snd_cwnd;
+
+		/* Take into account the current RTT sample too, to
+		 * decrease the impact of delayed acks. This double counts
+		 * this sample since we count it for the next window as well,
+		 * but that's not too awful, since we're taking the min,
+		 * rather than averaging.
+		 */
+		vegas_rtt_calc(tp, seq_rtt);
+
+		/* We do the Vegas calculations only if we got enough RTT
+		 * samples that we can be reasonably sure that we got
+		 * at least one RTT sample that wasn't from a delayed ACK.
+		 * If we only had 2 samples total,
+		 * then that means we're getting only 1 ACK per RTT, which
+		 * means they're almost certainly delayed ACKs.
+		 * If we have 3 samples, we should be OK.
+		 */
+
+		if (tp->vegas.cntRTT <= 2) {
+			/* We don't have enough RTT samples to do the Vegas
+			 * calculation, so we'll behave like Reno.
+			 */
+			if (tp->snd_cwnd > tp->snd_ssthresh)
+				tp->snd_cwnd++;
+		} else {
+			u32 rtt, target_cwnd, diff;
+
+			/* We have enough RTT samples, so, using the Vegas
+			 * algorithm, we determine if we should increase or
+			 * decrease cwnd, and by how much.
+			 */
+
+			/* Pluck out the RTT we are using for the Vegas
+			 * calculations. This is the min RTT seen during the
+			 * last RTT. Taking the min filters out the effects
+			 * of delayed ACKs, at the cost of noticing congestion
+			 * a bit later.
+			 */
+			rtt = tp->vegas.minRTT;
+
+			/* Calculate the cwnd we should have, if we weren't
+			 * going too fast.
+			 *
+			 * This is:
+			 *     (actual rate in segments) * baseRTT
+			 * We keep it as a fixed point number with
+			 * V_PARAM_SHIFT bits to the right of the binary point.
+			 */
+			target_cwnd = ((old_wnd * tp->vegas.baseRTT)
+				       << V_PARAM_SHIFT) / rtt;
+
+			/* Calculate the difference between the window we had,
+			 * and the window we would like to have. This quantity
+			 * is the "Diff" from the Arizona Vegas papers.
+			 *
+			 * Again, this is a fixed point number with
+			 * V_PARAM_SHIFT bits to the right of the binary
+			 * point.
+			 */
+			diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;
+
+			if (tp->snd_cwnd < tp->snd_ssthresh) {
+				/* Slow start. */
+				if (diff > sysctl_tcp_vegas_gamma) {
+					/* Going too fast. Time to slow down
+					 * and switch to congestion avoidance.
+					 */
+					tp->snd_ssthresh = 2;
+
+					/* Set cwnd to match the actual rate
+					 * exactly:
+					 *     cwnd = (actual rate) * baseRTT
+					 * Then we add 1 because the integer
+					 * truncation robs us of full link
+					 * utilization.
+					 */
+					tp->snd_cwnd = min(tp->snd_cwnd,
+							   (target_cwnd >>
+							    V_PARAM_SHIFT)+1);
+
+				}
+			} else {
+				/* Congestion avoidance.
+				 */
+				u32 next_snd_cwnd;
+
+				/* Figure out where we would like cwnd
+				 * to be.
+				 */
+				if (diff > sysctl_tcp_vegas_beta) {
+					/* The old window was too fast, so
+					 * we slow down.
+					 */
+					next_snd_cwnd = old_snd_cwnd - 1;
+				} else if (diff < sysctl_tcp_vegas_alpha) {
+					/* We don't have enough extra packets
+					 * in the network, so speed up.
+					 */
+					next_snd_cwnd = old_snd_cwnd + 1;
+				} else {
+					/* Sending just as fast as we
+					 * should be.
+					 */
+					next_snd_cwnd = old_snd_cwnd;
+				}
+
+				/* Adjust cwnd upward or downward, toward the
+				 * desired value.
+				 */
+				if (next_snd_cwnd > tp->snd_cwnd)
+					tp->snd_cwnd++;
+				else if (next_snd_cwnd < tp->snd_cwnd)
+					tp->snd_cwnd--;
+			}
+		}
+
+		/* Wipe the slate clean for the next RTT. */
+		tp->vegas.cntRTT = 0;
+		tp->vegas.minRTT = 0x7fffffff;
+	}
+
+	/* The following code is executed for every ack we receive,
+	 * except for conditions checked in should_advance_cwnd()
+	 * before the call to tcp_cong_avoid(). Mainly this means that
+	 * we only execute this code if the ack actually acked some
+	 * data.
+	 */
+
+	/* If we are in slow start, increase our cwnd in response to this ACK.
+	 * (If we are not in slow start then we are in congestion avoidance,
+	 * and adjust our congestion window only once per RTT. See the code
+	 * above.)
+	 */
+	if (tp->snd_cwnd <= tp->snd_ssthresh)
+		tp->snd_cwnd++;
+
+	/* to keep cwnd from growing without bound */
+	tp->snd_cwnd = min_t(u32, tp->snd_cwnd, tp->snd_cwnd_clamp);
+
+	/* Make sure that we are never so timid as to reduce our cwnd below
+	 * 2 MSS.
+	 *
+	 * Going below 2 MSS would risk huge delayed ACKs from our receiver.
+	 */
+	tp->snd_cwnd = max(tp->snd_cwnd, 2U);
+
+	tp->snd_cwnd_stamp = tcp_time_stamp;
+}
+
+static inline void tcp_cong_avoid(struct tcp_opt *tp, u32 ack, u32 seq_rtt)
+{
+	if (tcp_vegas_enabled(tp))
+		vegas_cong_avoid(tp, ack, seq_rtt);
+	else
+		reno_cong_avoid(tp);
+}
+
 /* Restart timer after forward progress on connection.
  * RFC2988 recommends to restart timer to now+rto.
  */
@@ -1730,7 +2009,7 @@ static __inline__ void tcp_ack_packets_out(struct sock *sk, struct tcp_opt *tp)
 }
 
 /* Remove acknowledged frames from the retransmission queue. */
-static int tcp_clean_rtx_queue(struct sock *sk)
+static int tcp_clean_rtx_queue(struct sock *sk, __s32 *seq_rtt_p)
 {
 	struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
 	struct sk_buff *skb;
@@ -1813,6 +2092,7 @@ static int tcp_clean_rtx_queue(struct sock *sk)
 	}
 }
 #endif
+	*seq_rtt_p = seq_rtt;
 	return acked;
 }
 
@@ -1900,6 +2180,7 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	u32 ack_seq = TCP_SKB_CB(skb)->seq;
 	u32 ack = TCP_SKB_CB(skb)->ack_seq;
 	u32 prior_in_flight;
+	s32 seq_rtt;
 	int prior_packets;
 
 	/* If the ack is newer than sent or older than previous acks
@@ -1947,17 +2228,19 @@ static int tcp_ack(struct sock *sk, struct sk_buff *skb, int flag)
 	prior_in_flight = tcp_packets_in_flight(tp);
 
 	/* See if we can take anything off of the retransmit queue. */
-	flag |= tcp_clean_rtx_queue(sk);
+	flag |= tcp_clean_rtx_queue(sk, &seq_rtt);
 
 	if (tcp_ack_is_dubious(tp, flag)) {
 		/* Advanve CWND, if state allows this.
 		 */
-		if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd &&
+		if ((flag&FLAG_DATA_ACKED) &&
+		    (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd) &&
 		    tcp_may_raise_cwnd(tp, flag))
-			tcp_cong_avoid(tp);
+			tcp_cong_avoid(tp, ack, seq_rtt);
 		tcp_fastretrans_alert(sk, prior_snd_una, prior_packets, flag);
 	} else {
-		if ((flag&FLAG_DATA_ACKED) && prior_in_flight >= tp->snd_cwnd)
-			tcp_cong_avoid(tp);
+		if ((flag & FLAG_DATA_ACKED) &&
+		    (tcp_vegas_enabled(tp) || prior_in_flight >= tp->snd_cwnd))
+			tcp_cong_avoid(tp, ack, seq_rtt);
 	}
 
 	if ((flag & FLAG_FORWARD_PROGRESS) || !(flag&FLAG_NOT_DUP))
-- 
cgit v1.2.3-54-g00ecf
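
The RTT sampling the patch adds to tcp_rtt_estimator() is the foundation of everything else: vegas_rtt_calc() min-filters RTT samples twice, once over the connection lifetime (baseRTT, an estimate of pure propagation delay) and once within the current RTT (minRTT, propagation plus queuing delay). Below is a minimal userspace sketch of that double min-filter; the vegas_state struct and the main() driver are illustrative stand-ins for struct tcp_opt and the kernel ACK path, not code from the patch.

#include <stdio.h>
#include <inttypes.h>

/* Illustrative stand-in for the vegas fields of struct tcp_opt. */
struct vegas_state {
	uint32_t baseRTT;	/* lifetime min RTT: propagation delay */
	uint32_t minRTT;	/* min RTT within the current RTT */
	uint32_t cntRTT;	/* samples seen during the current RTT */
};

/* Same double min-filter as the patch's vegas_rtt_calc(). */
static void vegas_rtt_calc(struct vegas_state *v, uint32_t rtt)
{
	uint32_t vrtt = rtt + 1;	/* never allow zero rtt or baseRTT */

	if (vrtt < v->baseRTT)		/* filter for propagation delay */
		v->baseRTT = vrtt;
	if (vrtt < v->minRTT)		/* filter for prop. + queuing delay */
		v->minRTT = vrtt;
	v->cntRTT++;
}

int main(void)
{
	struct vegas_state v = { 0x7fffffff, 0x7fffffff, 0 };
	uint32_t samples[] = { 100, 140, 120, 90, 130 };	/* e.g. ms */

	for (unsigned int i = 0; i < sizeof samples / sizeof samples[0]; i++)
		vegas_rtt_calc(&v, samples[i]);

	/* Prints: baseRTT=91 minRTT=91 cntRTT=5 (samples are offset by +1). */
	printf("baseRTT=%" PRIu32 " minRTT=%" PRIu32 " cntRTT=%" PRIu32 "\n",
	       v.baseRTT, v.minRTT, v.cntRTT);
	return 0;
}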
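
The once-per-RTT decision in vegas_cong_avoid() reduces to comparing "Diff" (the estimated number of extra segments queued in the network) against the alpha and beta thresholds, all in fixed point with V_PARAM_SHIFT fractional bits. The sketch below replays that arithmetic with concrete numbers; vegas_next_cwnd() is a hypothetical helper that isolates the congestion-avoidance branch of the patch.

#include <stdio.h>
#include <inttypes.h>

#define V_PARAM_SHIFT 1					/* 1 fractional bit */
static const uint32_t alpha = 1 << V_PARAM_SHIFT;	/* 1.0 segment  */
static const uint32_t beta  = 3 << V_PARAM_SHIFT;	/* 3.0 segments */

/* Hypothetical helper: one congestion-avoidance decision.
 * old_wnd and old_snd_cwnd are in segments; rtt and baseRTT share any
 * common time unit.
 */
static uint32_t vegas_next_cwnd(uint32_t old_wnd, uint32_t old_snd_cwnd,
				uint32_t baseRTT, uint32_t rtt)
{
	/* cwnd needed to carry the actual rate at baseRTT, as a
	 * fixed-point number: target_cwnd = (old_wnd / rtt) * baseRTT.
	 */
	uint32_t target_cwnd = ((old_wnd * baseRTT) << V_PARAM_SHIFT) / rtt;

	/* "Diff" from the Vegas papers: segments queued in the network
	 * beyond what the path needs, in the same fixed-point form.
	 */
	uint32_t diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;

	if (diff > beta)
		return old_snd_cwnd - 1;	/* too much queued: slow down */
	if (diff < alpha)
		return old_snd_cwnd + 1;	/* too little queued: speed up */
	return old_snd_cwnd;			/* inside the [alpha, beta] band */
}

int main(void)
{
	/* 20 segments per RTT, baseRTT 100, last min RTT 120:
	 * target_cwnd = (20*100<<1)/120 = 33 (16.5 segments),
	 * diff = (20<<1)-33 = 7 (3.5 segments) > beta (3.0),
	 * so the window backs off by one segment.
	 */
	printf("next cwnd = %" PRIu32 "\n", vegas_next_cwnd(20, 20, 100, 120));
	return 0;
}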
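
Slow start uses the same Diff quantity against gamma: the first time an RTT shows more than gamma extra segments queued, the patch pins ssthresh to 2 (forcing congestion avoidance from then on) and snaps cwnd down to the measured rate plus one segment. A worked example of that exit path, again as a standalone sketch rather than kernel code:

#include <stdio.h>
#include <inttypes.h>

#define V_PARAM_SHIFT 1
static const uint32_t gamma_fp = 1 << V_PARAM_SHIFT;	/* 1.0 segment */

int main(void)
{
	/* Hypothetical snapshot taken during slow start. */
	uint32_t snd_cwnd = 32, snd_ssthresh = 0x7fffffff;
	uint32_t old_wnd = 32, baseRTT = 100, rtt = 150;

	uint32_t target_cwnd = ((old_wnd * baseRTT) << V_PARAM_SHIFT) / rtt;
	uint32_t diff = (old_wnd << V_PARAM_SHIFT) - target_cwnd;

	if (diff > gamma_fp) {
		/* Same exit as the patch: leave slow start and set cwnd to
		 * the actual rate, +1 to make up for integer truncation.
		 */
		uint32_t snap = (target_cwnd >> V_PARAM_SHIFT) + 1;

		snd_ssthresh = 2;
		if (snap < snd_cwnd)
			snd_cwnd = snap;
	}

	/* target_cwnd = 42 (21.0 segments), diff = 22 (11.0 segments),
	 * well past gamma, so cwnd snaps from 32 down to 22.
	 */
	printf("cwnd=%" PRIu32 " ssthresh=%" PRIu32 "\n",
	       snd_cwnd, snd_ssthresh);
	return 0;
}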