Friday, July 13, 2012

Improving TCP throughput by localising send buffers


When bulk data is transferred over TCP on a high-speed link, the rate of network buffer allocation and free for the connection is very high. A network buffer is freed only when the TCP data it carries is ACKed, so on a high-speed link a bulk transfer drives a very high allocation/free cycle.
When we free a network buffer (sk_buff), we give it back to the global pool, creating send buffer space on the socket. This causes the application to push more data immediately, which in turn allocates fresh network buffers. If, on reception of an ACK, we instead cache a few network buffers locally with the socket rather than returning them to the global pool, we may avoid allocations from the global pool and hence gain some performance!!
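
In the patch below this becomes two hooks; here is a distilled sketch (simplified from the actual diff, with stats and error handling omitted): sk_wmem_free_skb() tries to park the ACKed sk_buff on the socket before falling back to __kfree_skb(), and sk_stream_alloc_skb() tries the socket-local cache before the global allocator.

/* Free path: on ACK, try to keep the skb with the socket
 * (distilled from the sk_wmem_free_skb() change below). */
if (skb->localize_snd_buf && !sk_add_local_snd_buf_q(sk, skb))
    return;            /* cached locally, no global free */
__kfree_skb(skb);      /* otherwise back to the global pool */

/* Allocation path: socket-local cache first
 * (distilled from the sk_stream_alloc_skb() change below). */
if (tp->localize_send_buf)
    skb = alloc_skb_local_sk(sk);    /* no interrupt disabling needed */
if (!skb)
    skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);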

I used a minimal calculation for the size of the localized send buffer cache so that we do not end up eating the entire system memory. A network buffer is cached locally with the socket only if:

1. the total allocated send buffer size plus the socket's local cache size is less than the send buffer limit for the socket.

2. the socket's local cache size is less than the maximum number of packets in flight at any point in time (the code changes for this need some correction and I'll post them once done); see the distilled check just below.
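
In code, the admission check is sk_add_local_snd_buf_q() in the diff below; distilled, the two conditions read:

/* Cache the skb with the socket only if both limits hold
 * (distilled from sk_add_local_snd_buf_q() below). */
if ((sk->sk_local_snd_buf->len + sk->sk_wmem_queued) < sk->sk_sndbuf &&          /* 1 */
    sk->sk_local_snd_buf->len < (sk->sk_local_snd_buf->limit * skb->truesize)) { /* 2 */
    __sk_add_local_snd_buf_q(sk, skb);
    sk->sk_local_snd_buf->len += skb->truesize;
    return 0;          /* cached; caller must not free */
}
return -ENOBUFS;       /* caller falls back to __kfree_skb() */

Here limit tracks the peak number of packets in flight seen on the connection, updated from the ACK path by tcp_adjust_max_pkt_in_flight() in the diff below.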

ttcp was modified to set a socket option on the TCP socket to localize send buffers, and the results showed a 5-6% improvement in TCP throughput with send buffer localization. 80 MB of data was transmitted over a link with an MTU of 1500. With send buffer localization, at no point in time were more than 25 network buffers (sk_buff) cached locally with the socket. Of a total of 6K allocations, only 1K sk_buffs came from the global pool; the rest were served from the socket's local send buffer cache. The results were obtained with per-CPU slab caches, and system memory usage during the experiment was the same in both cases.
The improvement in TCP throughput can be attributed to the smaller number of allocations from the global pool. Each allocation from the global pool (per-CPU slab caches) has to disable local interrupts on the CPU; allocations from the local send buffer cache need not, which is where the gain comes from. Allocating from localized socket send buffer caches as explained above should bring an overall improvement in system performance for applications like file servers, FTP servers, and HTTP servers on SMP systems.
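
For reference, the ttcp modification amounts to roughly the following userspace sketch. The option values and the tcp_localization_info layout are copied from the patch below (they are not in stock 2.6.33 headers, so userspace has to declare them itself); the function names here are mine, not ttcp's.

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* New socket options introduced by the patch below. */
#define TCP_LOCALIZE_SND_BUF    16
#define TCP_GET_LOCALIZATION    18

/* Mirrors struct tcp_localization_info from the patch. */
struct tcp_localization_info {
    int local_sendbuf_allocations;
    int sendbuf_allocations;
    int len;
};

/* Enable send buffer localization on a TCP socket. */
static int enable_localization(int fd)
{
    int one = 1;
    return setsockopt(fd, IPPROTO_TCP, TCP_LOCALIZE_SND_BUF,
              &one, sizeof(one));
}

/* After the transfer, read back the allocation counters. */
static void print_localization_stats(int fd)
{
    struct tcp_localization_info info;
    socklen_t len = sizeof(info);

    memset(&info, 0, sizeof(info));
    if (getsockopt(fd, IPPROTO_TCP, TCP_GET_LOCALIZATION,
               &info, &len) == 0)
        printf("local allocs %d, global allocs %d, cached bytes %d\n",
               info.local_sendbuf_allocations,
               info.sendbuf_allocations, info.len);
}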
Following are the diffs:


diff -r -p -c linux-2.6.33.3/include/linux/skbuff.h send-buf-new/include/linux/skbuff.h
*** linux-2.6.33.3/include/linux/skbuff.h    2010-04-26 20:18:30.000000000 +0530
--- send-buf-new/include/linux/skbuff.h    2012-06-23 23:38:21.000000000 +0530
*************** struct sk_buff {
*** 384,389 ****
--- 384,390 ----
  #ifdef CONFIG_NETWORK_SECMARK
      __u32            secmark;
  #endif
+     __u32            localize_snd_buf;    /* recycle to socket-local send buffer cache on free? */
      union {
          __u32        mark;
          __u32        dropcount;
*************** extern void consume_skb(struct sk_buff *
*** 431,442 ****
--- 432,446 ----
  extern void           __kfree_skb(struct sk_buff *skb);
  extern struct sk_buff *__alloc_skb(unsigned int size,
                     gfp_t priority, int fclone, int node);
+ extern void __init_skb_local(struct sk_buff *, int);
+
  static inline struct sk_buff *alloc_skb(unsigned int size,
                      gfp_t priority)
  {
      return __alloc_skb(size, priority, 0, -1);
  }
 
+
  static inline struct sk_buff *alloc_skb_fclone(unsigned int size,
                             gfp_t priority)
  {
diff -r -p -c linux-2.6.33.3/include/linux/tcp.h send-buf-new/include/linux/tcp.h
*** linux-2.6.33.3/include/linux/tcp.h    2010-04-26 20:18:30.000000000 +0530
--- send-buf-new/include/linux/tcp.h    2012-06-25 04:07:51.000000000 +0530
*************** enum {
*** 103,108 ****
--- 103,111 ----
  #define TCP_CONGESTION        13    /* Congestion control algorithm */
  #define TCP_MD5SIG        14    /* TCP MD5 Signature (RFC2385) */
  #define TCP_COOKIE_TRANSACTIONS    15    /* TCP Cookie Transactions */
+ #define TCP_LOCALIZE_SND_BUF    16    /* TCP localise send buffer */
+ #define TCP_NO_OPTIMIZATION    17    /* TCP optimization off */
+ #define TCP_GET_LOCALIZATION    18    /* Get localization stats */
 
  /* for TCP_INFO socket option */
  #define TCPI_OPT_TIMESTAMPS    1
*************** struct tcp_sock {
*** 341,346 ****
--- 344,351 ----
      u16    advmss;        /* Advertised MSS            */
      u8    frto_counter;    /* Number of new acks after RTO */
      u8    nonagle;    /* Disable Nagle algorithm?             */
+     u32    localize_send_buf;    /* localise send buffer ?       */
+     u32    no_optimization;    /* no optimization ?       */
 
  /* RTT measurement */
      u32    srtt;        /* smoothed round trip time << 3    */

diff -r -p -c linux-2.6.33.3/include/net/sock.h send-buf-new/include/net/sock.h
*** linux-2.6.33.3/include/net/sock.h    2010-04-26 20:18:30.000000000 +0530
--- send-buf-new/include/net/sock.h    2012-06-25 09:07:26.000000000 +0530
*************** struct sock_common {
*** 150,155 ****
--- 150,170 ----
      struct net         *skc_net;
  #endif
  };
+ struct sk_localize_buf {
+     struct sk_buff *head;    /* head of locally cached skb list */
+     struct sk_buff *tail;    /* tail of locally cached skb list */
+     int len;        /* bytes (truesize) currently cached */
+     int limit;        /* peak packets in flight, see tcp_adjust_max_pkt_in_flight() */
+     int local_sendbuf_allocations;    /* allocations served from this cache */
+     int sendbuf_allocations;    /* allocations that fell back to the global pool */
+ };
+ typedef struct sk_localize_buf sk_localized_send_buffs;
+
+ struct tcp_localization_info {
+     int local_sendbuf_allocations;
+     int sendbuf_allocations;
+     int len;
+ };
 
  /**
    *    struct sock - network layer representation of sockets
*************** struct sock {
*** 256,261 ****
--- 271,277 ----
          int len;
          int limit;
      } sk_backlog;
+     struct sk_localize_buf *sk_local_snd_buf;
      wait_queue_head_t    *sk_sleep;
      struct dst_entry    *sk_dst_cache;
  #ifdef CONFIG_XFRM
*************** static inline __must_check int sk_add_ba
*** 599,604 ****
--- 615,723 ----
      return 0;
  }
 
+ static inline void sk_free_local_snd_buf_q(struct sock *sk) {
+     struct sk_buff *skb, *next;
+
+     if (!sk->sk_local_snd_buf) {
+         return;
+     }
+   
+     skb = sk->sk_local_snd_buf->head;
+     sk->sk_local_snd_buf->head = sk->sk_local_snd_buf->tail = NULL;
+     sk->sk_local_snd_buf->len = 0;
+
+     while (skb) {
+         next = skb->next;
+         __kfree_skb(skb);
+         skb = next;
+     }
+     return;
+ }
+
+
+ static inline struct sk_buff *sk_get_local_snd_buf_q(struct sock *sk) {
+     struct sk_buff *skb = NULL;
+
+     if (!sk->sk_local_snd_buf)
+         return NULL;
+     if ((skb = sk->sk_local_snd_buf->head)) {
+         if (sk->sk_local_snd_buf->tail == sk->sk_local_snd_buf->head) {
+             sk->sk_local_snd_buf->tail = sk->sk_local_snd_buf->head = NULL;
+         } else {
+             sk->sk_local_snd_buf->head = sk->sk_local_snd_buf->head->next;
+         }
+         sk->sk_local_snd_buf->len -= skb->truesize;
+         ++sk->sk_local_snd_buf->local_sendbuf_allocations;
+     }
+     return skb;
+ }
+
+ static inline struct sk_buff *alloc_skb_local_sk(struct sock *sk)
+ {
+     struct sk_buff *skb = NULL;
+
+     skb = sk_get_local_snd_buf_q(sk);
+     if (skb) {
+         printk(KERN_INFO "allocated from local buffer\n");
+         __init_skb_local(skb, 1);      
+     } else {
+         printk(KERN_INFO "could not allocate from local buffer\n");
+     }
+     return skb;
+ }
+
+ static inline void __sk_add_local_snd_buf_q(struct sock *sk, struct sk_buff *skb)
+ {
+     if (!sk->sk_local_snd_buf)
+         return;
+     if (!sk->sk_local_snd_buf->tail) {
+         sk->sk_local_snd_buf->head = sk->sk_local_snd_buf->tail = skb;
+     } else {
+         sk->sk_local_snd_buf->tail->next = skb;
+         sk->sk_local_snd_buf->tail = skb;
+     }
+     skb->next = NULL;
+ }
+
+ /* The per-socket spinlock must be held here. */
+ static inline __must_check int sk_add_local_snd_buf_q(struct sock *sk, struct sk_buff *skb)
+ {
+     if (!sk->sk_local_snd_buf)
+         return -ENOBUFS;
+     if ((sk->sk_local_snd_buf->len + sk->sk_wmem_queued) < sk->sk_sndbuf) {
+         if (sk->sk_local_snd_buf->len < (sk->sk_local_snd_buf->limit * skb->truesize)) {
+             __sk_add_local_snd_buf_q(sk, skb);
+             sk->sk_local_snd_buf->len += skb->truesize;
+             return 0;
+         }
+     }
+     return -ENOBUFS;
+ }
+
+
+ static inline void sk_add_local_snd_buf_q_init(struct sock *sk) {
+
+     if (sk->sk_local_snd_buf) {
+         printk(KERN_INFO "socket localized send buf already initialised\n");
+         return;
+     }
+     if (sk->sk_type == SOCK_STREAM) {
+         sk->sk_local_snd_buf = kmalloc(sizeof (struct sk_localize_buf), GFP_ATOMIC);
+         if (sk->sk_local_snd_buf) {
+             sk->sk_local_snd_buf->len = sk->sk_local_snd_buf->limit = 0;
+             sk->sk_local_snd_buf->local_sendbuf_allocations = 0;
+             sk->sk_local_snd_buf->sendbuf_allocations = 0;
+             sk->sk_local_snd_buf->head = sk->sk_local_snd_buf->tail = NULL;
+         } else {
+             printk(KERN_INFO "socket localized send buf could not be allocated\n");
+             return;
+         }
+     }
+     if (!sk->sk_local_snd_buf) {
+         printk(KERN_INFO "socket localized send buf could not be initialised\n");
+     }
+     return;
+ }
+
  static inline int sk_backlog_rcv(struct sock *sk, struct sk_buff *skb)
  {
      return sk->sk_backlog_rcv(sk, skb);
*************** static inline void sk_wmem_free_skb(stru
*** 905,910 ****
--- 1024,1034 ----
      sock_set_flag(sk, SOCK_QUEUE_SHRUNK);
      sk->sk_wmem_queued -= skb->truesize;
      sk_mem_uncharge(sk, skb->truesize);
+     if (skb->localize_snd_buf) {
+         if (!sk_add_local_snd_buf_q(sk, skb)) {
+             return;
+         }
+     }
      __kfree_skb(skb);
  }
 
diff -r -p -c linux-2.6.33.3/include/net/tcp.h send-buf-new/include/net/tcp.h
*** linux-2.6.33.3/include/net/tcp.h    2010-04-26 20:18:30.000000000 +0530
--- send-buf-new/include/net/tcp.h    2012-06-23 19:27:54.000000000 +0530
*************** extern __u32 cookie_v6_init_sequence(str
*** 468,473 ****
--- 468,474 ----
  extern void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
                        int nonagle);
  extern int tcp_may_send_now(struct sock *sk);
+ extern void tcp_adjust_max_pkt_in_flight(struct sock *sk);
  extern int tcp_retransmit_skb(struct sock *, struct sk_buff *);
  extern void tcp_retransmit_timer(struct sock *sk);
  extern void tcp_xmit_retransmit_queue(struct sock *);

diff -r -p -c linux-2.6.33.3/Module.symvers send-buf-new/Module.symvers
*** linux-2.6.33.3/Module.symvers    2012-06-24 02:53:55.000000000 +0530
--- send-buf-new/Module.symvers    2012-06-25 07:36:58.000000000 +0530
***************
*** 4432,4437 ****
--- 4432,4438 ----
  0x00000000    llc_remove_pack    vmlinux    EXPORT_SYMBOL
  0x00000000    neigh_rand_reach_time    vmlinux    EXPORT_SYMBOL
  0x00000000    dev_forward_skb    vmlinux    EXPORT_SYMBOL_GPL
+ 0x00000000    __init_skb_local    vmlinux    EXPORT_SYMBOL
  0x00000000    sock_common_getsockopt    vmlinux    EXPORT_SYMBOL
  0x00000000    sock_common_setsockopt    vmlinux    EXPORT_SYMBOL
  0x00000000    restore_processor_state    vmlinux    EXPORT_SYMBOL
diff -r -p -c linux-2.6.33.3/net/core/skbuff.c send-buf-new/net/core/skbuff.c
*** linux-2.6.33.3/net/core/skbuff.c    2010-04-26 20:18:30.000000000 +0530
--- send-buf-new/net/core/skbuff.c    2012-06-25 17:06:22.000000000 +0530
*************** nodata:
*** 240,245 ****
--- 240,301 ----
  }
  EXPORT_SYMBOL(__alloc_skb);
 
+ void __init_skb_local(struct sk_buff *skb, int fclone)
+ {
+     struct skb_shared_info *shinfo;
+     int size, truesize;
+
+ #ifdef NET_SKBUFF_DATA_USES_OFFSET
+     size = skb->end;
+ #else
+     size = skb->end - skb->head;
+ #endif
+     truesize = skb->truesize;
+     /*
+      * Only clear those fields we need to clear, not those that we will
+      * actually initialise below. Hence, don't put any more fields after
+      * the tail pointer in struct sk_buff!
+      */
+     memset(skb, 0, offsetof(struct sk_buff, tail));
+     skb->truesize = truesize;
+     atomic_set(&skb->users, 1);
+     /*skb->head = data;*/
+     skb->data = skb->head;
+     skb_reset_tail_pointer(skb);
+     /*skb->end = skb->tail + size;*/
+     kmemcheck_annotate_bitfield(skb, flags1);
+     kmemcheck_annotate_bitfield(skb, flags2);
+ #ifdef NET_SKBUFF_DATA_USES_OFFSET
+     skb->mac_header = ~0U;
+ #endif
+
+     /* make sure we initialize shinfo sequentially */
+     shinfo = skb_shinfo(skb);
+     atomic_set(&shinfo->dataref, 1);
+     shinfo->nr_frags  = 0;
+     shinfo->gso_size = 0;
+     shinfo->gso_segs = 0;
+     shinfo->gso_type = 0;
+     shinfo->ip6_frag_id = 0;
+     shinfo->tx_flags.flags = 0;
+     skb_frag_list_init(skb);
+     memset(&shinfo->hwtstamps, 0, sizeof(shinfo->hwtstamps));
+
+     if (fclone) {
+         struct sk_buff *child = skb + 1;
+         atomic_t *fclone_ref = (atomic_t *) (child + 1);
+
+         kmemcheck_annotate_bitfield(child, flags1);
+         kmemcheck_annotate_bitfield(child, flags2);
+         skb->fclone = SKB_FCLONE_ORIG;
+         atomic_set(fclone_ref, 1);
+
+         child->fclone = SKB_FCLONE_UNAVAILABLE;
+     }
+     return;
+ }
+ EXPORT_SYMBOL(__init_skb_local);
+
 
--- 406,411 ----
diff -r -p -c linux-2.6.33.3/net/core/sock.c send-buf-new/net/core/sock.c
*** linux-2.6.33.3/net/core/sock.c    2010-04-26 20:18:30.000000000 +0530
--- send-buf-new/net/core/sock.c    2012-06-25 08:30:02.000000000 +0530
*************** static void __sk_free(struct sock *sk)
*** 1074,1079 ****
--- 1074,1084 ----
  {
      struct sk_filter *filter;
 
+     if (sk->sk_local_snd_buf) {
+         sk_free_local_snd_buf_q(sk);
+         kfree(sk->sk_local_snd_buf);
+         sk->sk_local_snd_buf = NULL;
+     }
      if (sk->sk_destruct)
          sk->sk_destruct(sk);
 
diff -r -p -c linux-2.6.33.3/net/ipv4/af_inet.c send-buf-new/net/ipv4/af_inet.c
*** linux-2.6.33.3/net/ipv4/af_inet.c    2010-04-26 20:18:30.000000000 +0530
--- send-buf-new/net/ipv4/af_inet.c    2012-06-23 20:40:32.000000000 +0530
*************** lookup_protocol:
*** 368,373 ****
--- 368,374 ----
      inet->inet_id = 0;
 
      sock_init_data(sock, sk);
+     sk->sk_local_snd_buf = NULL;
 
      sk->sk_destruct       = inet_sock_destruct;
      sk->sk_protocol       = protocol;
diff -r -p -c linux-2.6.33.3/net/ipv4/tcp.c send-buf-new/net/ipv4/tcp.c
*** linux-2.6.33.3/net/ipv4/tcp.c    2010-04-26 20:18:30.000000000 +0530
--- send-buf-new/net/ipv4/tcp.c    2012-06-25 17:05:57.000000000 +0530
*************** ssize_t tcp_splice_read(struct socket *s
*** 678,689 ****
 
  struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
  {
!     struct sk_buff *skb;
 
-     /* The TCP header must be at least 32-bit aligned.  */
-     size = ALIGN(size, 4);
 
!     skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
      if (skb) {
          if (sk_wmem_schedule(sk, skb->truesize)) {
              /*
--- 678,712 ----
 
  struct sk_buff *sk_stream_alloc_skb(struct sock *sk, int size, gfp_t gfp)
  {
!     struct sk_buff *skb = NULL;
!     struct tcp_sock *tp = tcp_sk(sk);
!     int oldsize = 0;
 
 
!     if (tp->localize_send_buf) {
!         skb = alloc_skb_local_sk(sk);
!         if (skb) {
! #ifdef NET_SKBUFF_DATA_USES_OFFSET
!             oldsize = skb->end;
! #else
!             oldsize = skb->end - skb->head;
! #endif
!
!         }
!     }
!     if (!skb) {
!         /* The TCP header must be at least 32-bit aligned.  */
!         size = ALIGN(size, 4);
!         skb = alloc_skb_fclone(size + sk->sk_prot->max_header, gfp);
!         if (skb) {
!             if (sk->sk_local_snd_buf)
!                 ++sk->sk_local_snd_buf->sendbuf_allocations;
!         }
!     }
!     if (oldsize) {
!         if (oldsize > sk->sk_prot->max_header)
!             size = oldsize - sk->sk_prot->max_header;
!     }
      if (skb) {
          if (sk_wmem_schedule(sk, skb->truesize)) {
              /*
*************** static inline int select_size(struct soc
*** 882,888 ****
      struct tcp_sock *tp = tcp_sk(sk);
      int tmp = tp->mss_cache;
 
!     if (sk->sk_route_caps & NETIF_F_SG) {
          if (sk_can_gso(sk))
              tmp = 0;
          else {
--- 905,911 ----
      struct tcp_sock *tp = tcp_sk(sk);
      int tmp = tp->mss_cache;
 
!     if ((sk->sk_route_caps & NETIF_F_SG) && !tp->no_optimization) {
          if (sk_can_gso(sk))
              tmp = 0;
          else {
*************** int tcp_sendmsg(struct kiocb *iocb, stru
*** 924,929 ****
--- 947,955 ----
      clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
 
      mss_now = tcp_send_mss(sk, &size_goal, flags);
+     if ((size_goal > mss_now) && tp->no_optimization) {
+         size_goal = mss_now;
+     }
 
      /* Ok commence sending. */
      iovlen = msg->msg_iovlen;
*************** new_segment:
*** 999,1005 ****
                      merge = 1;
                  } else if (i == MAX_SKB_FRAGS ||
                         (!i &&
!                        !(sk->sk_route_caps & NETIF_F_SG))) {
                      /* Need to add new fragment and cannot
                       * do this because interface is non-SG,
                       * or because all the page slots are
--- 1025,1032 ----
                      merge = 1;
                  } else if (i == MAX_SKB_FRAGS ||
                         (!i &&
!                        (!(sk->sk_route_caps & NETIF_F_SG) ||
!                        tp->no_optimization))) {
                      /* Need to add new fragment and cannot
                       * do this because interface is non-SG,
                       * or because all the page slots are
*************** static int do_tcp_setsockopt(struct sock
*** 2274,2279 ****
--- 2301,2331 ----
          }
          break;
 
+     case TCP_LOCALIZE_SND_BUF:
+         if (val) {
+             sk_add_local_snd_buf_q_init(sk);
+             if (!sk->sk_local_snd_buf) {
+                 printk(KERN_INFO "TCP_LOCALIZE_SND_BUF: localized send buf could not be initialised\n");
+                 return -ENOMEM;
+             }
+             tp->localize_send_buf = 1;
+             printk(KERN_INFO "TCP_LOCALIZE_SND_BUF: localized send buf initialised\n");
+         } else {
+             printk(KERN_INFO "TCP_LOCALIZE_SND_BUF: localization disabled, freeing cached buffers\n");
+             tp->localize_send_buf = 0;
+             sk_free_local_snd_buf_q(sk);
+             kfree(sk->sk_local_snd_buf);
+             sk->sk_local_snd_buf = NULL;
+         }
+         break;
+     case TCP_NO_OPTIMIZATION:
+         if (val) {
+             tp->no_optimization = 1;
+         } else {
+             tp->no_optimization = 0;
+         }
+         break;  
      case TCP_KEEPIDLE:
          if (val < 1 || val > MAX_TCP_KEEPIDLE)
              err = -EINVAL;
*************** static int do_tcp_getsockopt(struct sock
*** 2480,2485 ****
--- 2532,2543 ----
          if (!val && ((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN)))
              val = tp->rx_opt.user_mss;
          break;
+     case TCP_LOCALIZE_SND_BUF:
+         val = tp->localize_send_buf;
+         break;  
+     case TCP_NO_OPTIMIZATION:
+         val = tp->no_optimization;
+         break;  
      case TCP_NODELAY:
          val = !!(tp->nonagle&TCP_NAGLE_OFF);
          break;
*************** static int do_tcp_getsockopt(struct sock
*** 2510,2515 ****
--- 2568,2609 ----
      case TCP_WINDOW_CLAMP:
          val = tp->window_clamp;
          break;
+     case TCP_GET_LOCALIZATION: {
+         struct tcp_localization_info info;
+       
+         if (!tp->localize_send_buf) {
+             printk(KERN_INFO "TCP_GET_LOCALIZATION: not flagged to localize sendbuf\n");
+             return -EFAULT;
+         } else {
+             if (!sk->sk_local_snd_buf) {
+                 printk(KERN_INFO "TCP_GET_LOCALIZATION: flagged but localize sendbuf not initialised \n");
+                 return -EFAULT;
+             }
+             printk(KERN_INFO "TCP_GET_LOCALIZATION: flagged to localize sendbuf\n");
+         }
+         if (get_user(len, optlen))
+             return -EFAULT;
+         if (len != sizeof (struct tcp_localization_info)) {
+             printk(KERN_INFO "TCP_GET_LOCALIZATION: length passed is %d and struct length is %d\n", len, sizeof (struct tcp_localization_info));
+             return -EFAULT;
+         }
+         if (sk->sk_local_snd_buf) {
+             info.local_sendbuf_allocations =
+                 sk->sk_local_snd_buf->local_sendbuf_allocations;
+             info.sendbuf_allocations =
+                 sk->sk_local_snd_buf->sendbuf_allocations;
+             info.len = sk->sk_local_snd_buf->len;
+         } else {
+             printk(KERN_INFO "TCP_GET_LOCALIZATION: sendbuf not initialsed size passed is %d and of struct is %d\n", len, sizeof (struct tcp_localization_info));
+             return -EFAULT;
+         }
+         len = min_t(unsigned int, len, sizeof(info));
+         if (put_user(len, optlen))
+             return -EFAULT;
+         if (copy_to_user(optval, &info, len))
+             return -EFAULT;
+         return 0;
+     }
      case TCP_INFO: {
          struct tcp_info info;
 
diff -r -p -c linux-2.6.33.3/net/ipv4/tcp_input.c send-buf-new/net/ipv4/tcp_input.c
*** linux-2.6.33.3/net/ipv4/tcp_input.c    2010-04-26 20:18:30.000000000 +0530
--- send-buf-new/net/ipv4/tcp_input.c    2012-06-24 00:24:55.000000000 +0530
*************** static int tcp_clean_rtx_queue(struct so
*** 3284,3289 ****
--- 3284,3292 ----
              break;
 
          tcp_unlink_write_queue(skb, sk);
+         if (tp->localize_send_buf) {
+             skb->localize_snd_buf = 1;
+         }
          sk_wmem_free_skb(sk, skb);
          tp->scoreboard_skb_hint = NULL;
          if (skb == tp->retransmit_skb_hint)
*************** static int tcp_ack(struct sock *sk, stru
*** 3634,3639 ****
--- 3637,3643 ----
      }
 
      prior_fackets = tp->fackets_out;
+     tcp_adjust_max_pkt_in_flight(sk);
      prior_in_flight = tcp_packets_in_flight(tp);
 
      if (!(flag & FLAG_SLOWPATH) && after(ack, prior_snd_una)) {
diff -r -p -c linux-2.6.33.3/net/ipv4/tcp_ipv4.c send-buf-new/net/ipv4/tcp_ipv4.c
*** linux-2.6.33.3/net/ipv4/tcp_ipv4.c    2010-04-26 20:18:30.000000000 +0530
--- send-buf-new/net/ipv4/tcp_ipv4.c    2012-06-23 20:26:32.000000000 +0530
*************** static int tcp_v4_init_sock(struct sock
*** 1842,1847 ****
--- 1842,1849 ----
 
      icsk->icsk_rto = TCP_TIMEOUT_INIT;
      tp->mdev = TCP_TIMEOUT_INIT;
+     tp->localize_send_buf = 0;
+     tp->no_optimization = 0;
 
      /* So many TCP implementations out there (incorrectly) count the
       * initial SYN frame in their delayed-ACK and congestion control
diff -r -p -c linux-2.6.33.3/net/ipv4/tcp_output.c send-buf-new/net/ipv4/tcp_output.c
*** linux-2.6.33.3/net/ipv4/tcp_output.c    2010-04-26 20:18:30.000000000 +0530
--- send-buf-new/net/ipv4/tcp_output.c    2012-06-23 19:27:54.000000000 +0530
*************** static inline int tcp_snd_wnd_test(struc
*** 1410,1415 ****
--- 1410,1427 ----
      return !after(end_seq, tcp_wnd_end(tp));
  }
 
+ void tcp_adjust_max_pkt_in_flight(struct sock *sk) {
+     struct tcp_sock *tp = tcp_sk(sk);
+     int packets_in_flight;
+
+     packets_in_flight = tcp_packets_in_flight(tp);
+     if (sk->sk_local_snd_buf &&
+         (sk->sk_local_snd_buf->limit < packets_in_flight)) {
+         sk->sk_local_snd_buf->limit = packets_in_flight;
+     }
+     return;
+ }
+
  /* This checks if the data bearing packet SKB (usually tcp_send_head(sk))
   * should be put on the wire right now.  If so, it returns the number of
   * packets allowed by the congestion window.
*************** static unsigned int tcp_snd_test(struct
*** 1422,1427 ****
--- 1434,1440 ----
 
      tcp_init_tso_segs(sk, skb, cur_mss);
 
+     tcp_adjust_max_pkt_in_flight(sk);
      if (!tcp_nagle_test(tp, skb, cur_mss, nonagle))
          return 0;
 

2 comments:

  1. Hi Sameer,
    Are there any plans for a new edition of your book? In one of your earlier blog posts, it was mentioned that it would come out in 2010!!

    Replies
    1. Bharti, thanks for the comment. Book writing along with daily chores of life looks like a dream :)

      I'm ready with the new proposal after millions of swings in last four years. Third chapter is in progress. After three chapters are ready, I'm going to contact publishers. This time it will be latest 3.* series linux kernel and more of networking than transport centric contents.

      Good to see that Linux has penetrated so deep into the minds and souls of its fan followers.

      Let's be patient and look ahead for a new sunrise and hope for the best!!
