1// Copyright (c) 2011 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "net/socket/tcp_client_socket_win.h"
6
7#include <mstcpip.h>
8
9#include "base/basictypes.h"
10#include "base/compiler_specific.h"
11#include "base/memory/memory_debug.h"
12#include "base/metrics/stats_counters.h"
13#include "base/string_util.h"
14#include "base/sys_info.h"
15#include "base/win/object_watcher.h"
16#include "net/base/address_list_net_log_param.h"
17#include "net/base/connection_type_histograms.h"
18#include "net/base/io_buffer.h"
19#include "net/base/ip_endpoint.h"
20#include "net/base/net_errors.h"
21#include "net/base/net_log.h"
22#include "net/base/net_util.h"
23#include "net/base/network_change_notifier.h"
24#include "net/base/sys_addrinfo.h"
25#include "net/base/winsock_init.h"
26#include "net/base/winsock_util.h"
27
28namespace net {
29
30namespace {
31
32int MapConnectError(int os_error) {
33  switch (os_error) {
34    // connect fails with WSAEACCES when Windows Firewall blocks the
35    // connection.
36    case WSAEACCES:
37      return ERR_NETWORK_ACCESS_DENIED;
38    case WSAETIMEDOUT:
39      return ERR_CONNECTION_TIMED_OUT;
40    default: {
41      int net_error = MapSystemError(os_error);
42      if (net_error == ERR_FAILED)
43        return ERR_CONNECTION_FAILED;  // More specific than ERR_FAILED.
44
45      // Give a more specific error when the user is offline.
46      if (net_error == ERR_ADDRESS_UNREACHABLE &&
47          NetworkChangeNotifier::IsOffline()) {
48        return ERR_INTERNET_DISCONNECTED;
49      }
50
51      return net_error;
52    }
53  }
54}
55
56}  // namespace
57
58//-----------------------------------------------------------------------------
59
60// This class encapsulates all the state that has to be preserved as long as
61// there is a network IO operation in progress. If the owner TCPClientSocketWin
62// is destroyed while an operation is in progress, the Core is detached and it
63// lives until the operation completes and the OS doesn't reference any resource
64// declared on this class anymore.
65class TCPClientSocketWin::Core : public base::RefCounted<Core> {
66 public:
67  explicit Core(TCPClientSocketWin* socket);
68
69  // Start watching for the end of a read or write operation.
70  void WatchForRead();
71  void WatchForWrite();
72
73  // The TCPClientSocketWin is going away.
74  void Detach() { socket_ = NULL; }
75
76  // The separate OVERLAPPED variables for asynchronous operation.
77  // |read_overlapped_| is used for both Connect() and Read().
78  // |write_overlapped_| is only used for Write();
79  OVERLAPPED read_overlapped_;
80  OVERLAPPED write_overlapped_;
81
82  // The buffers used in Read() and Write().
83  WSABUF read_buffer_;
84  WSABUF write_buffer_;
85  scoped_refptr<IOBuffer> read_iobuffer_;
86  scoped_refptr<IOBuffer> write_iobuffer_;
87  int write_buffer_length_;
88
89  // Throttle the read size based on our current slow start state.
90  // Returns the throttled read size.
91  int ThrottleReadSize(int size) {
92    if (slow_start_throttle_ < kMaxSlowStartThrottle) {
93      size = std::min(size, slow_start_throttle_);
94      slow_start_throttle_ *= 2;
95    }
96    return size;
97  }
98
99 private:
100  friend class base::RefCounted<Core>;
101
102  class ReadDelegate : public base::win::ObjectWatcher::Delegate {
103   public:
104    explicit ReadDelegate(Core* core) : core_(core) {}
105    virtual ~ReadDelegate() {}
106
107    // base::ObjectWatcher::Delegate methods:
108    virtual void OnObjectSignaled(HANDLE object);
109
110   private:
111    Core* const core_;
112  };
113
114  class WriteDelegate : public base::win::ObjectWatcher::Delegate {
115   public:
116    explicit WriteDelegate(Core* core) : core_(core) {}
117    virtual ~WriteDelegate() {}
118
119    // base::ObjectWatcher::Delegate methods:
120    virtual void OnObjectSignaled(HANDLE object);
121
122   private:
123    Core* const core_;
124  };
125
126  ~Core();
127
128  // The socket that created this object.
129  TCPClientSocketWin* socket_;
130
131  // |reader_| handles the signals from |read_watcher_|.
132  ReadDelegate reader_;
133  // |writer_| handles the signals from |write_watcher_|.
134  WriteDelegate writer_;
135
136  // |read_watcher_| watches for events from Connect() and Read().
137  base::win::ObjectWatcher read_watcher_;
138  // |write_watcher_| watches for events from Write();
139  base::win::ObjectWatcher write_watcher_;
140
141  // When doing reads from the socket, we try to mirror TCP's slow start.
142  // We do this because otherwise the async IO subsystem artifically delays
143  // returning data to the application.
144  static const int kInitialSlowStartThrottle = 1 * 1024;
145  static const int kMaxSlowStartThrottle = 32 * kInitialSlowStartThrottle;
146  int slow_start_throttle_;
147
148  DISALLOW_COPY_AND_ASSIGN(Core);
149};
150
151TCPClientSocketWin::Core::Core(
152    TCPClientSocketWin* socket)
153    : write_buffer_length_(0),
154      socket_(socket),
155      ALLOW_THIS_IN_INITIALIZER_LIST(reader_(this)),
156      ALLOW_THIS_IN_INITIALIZER_LIST(writer_(this)),
157      slow_start_throttle_(kInitialSlowStartThrottle) {
158  memset(&read_overlapped_, 0, sizeof(read_overlapped_));
159  memset(&write_overlapped_, 0, sizeof(write_overlapped_));
160}
161
162TCPClientSocketWin::Core::~Core() {
163  // Make sure the message loop is not watching this object anymore.
164  read_watcher_.StopWatching();
165  write_watcher_.StopWatching();
166
167  WSACloseEvent(read_overlapped_.hEvent);
168  memset(&read_overlapped_, 0xaf, sizeof(read_overlapped_));
169  WSACloseEvent(write_overlapped_.hEvent);
170  memset(&write_overlapped_, 0xaf, sizeof(write_overlapped_));
171}
172
173void TCPClientSocketWin::Core::WatchForRead() {
174  // We grab an extra reference because there is an IO operation in progress.
175  // Balanced in ReadDelegate::OnObjectSignaled().
176  AddRef();
177  read_watcher_.StartWatching(read_overlapped_.hEvent, &reader_);
178}
179
180void TCPClientSocketWin::Core::WatchForWrite() {
181  // We grab an extra reference because there is an IO operation in progress.
182  // Balanced in WriteDelegate::OnObjectSignaled().
183  AddRef();
184  write_watcher_.StartWatching(write_overlapped_.hEvent, &writer_);
185}
186
187void TCPClientSocketWin::Core::ReadDelegate::OnObjectSignaled(
188    HANDLE object) {
189  DCHECK_EQ(object, core_->read_overlapped_.hEvent);
190  if (core_->socket_) {
191    if (core_->socket_->waiting_connect()) {
192      core_->socket_->DidCompleteConnect();
193    } else {
194      core_->socket_->DidCompleteRead();
195    }
196  }
197
198  core_->Release();
199}
200
201void TCPClientSocketWin::Core::WriteDelegate::OnObjectSignaled(
202    HANDLE object) {
203  DCHECK_EQ(object, core_->write_overlapped_.hEvent);
204  if (core_->socket_)
205    core_->socket_->DidCompleteWrite();
206
207  core_->Release();
208}
209
210//-----------------------------------------------------------------------------
211
212TCPClientSocketWin::TCPClientSocketWin(const AddressList& addresses,
213                                       net::NetLog* net_log,
214                                       const net::NetLog::Source& source)
215    : socket_(INVALID_SOCKET),
216      addresses_(addresses),
217      current_ai_(NULL),
218      waiting_read_(false),
219      waiting_write_(false),
220      read_callback_(NULL),
221      write_callback_(NULL),
222      next_connect_state_(CONNECT_STATE_NONE),
223      connect_os_error_(0),
224      net_log_(BoundNetLog::Make(net_log, NetLog::SOURCE_SOCKET)),
225      previously_disconnected_(false) {
226  scoped_refptr<NetLog::EventParameters> params;
227  if (source.is_valid())
228    params = new NetLogSourceParameter("source_dependency", source);
229  net_log_.BeginEvent(NetLog::TYPE_SOCKET_ALIVE, params);
230  EnsureWinsockInit();
231}
232
233TCPClientSocketWin::~TCPClientSocketWin() {
234  Disconnect();
235  net_log_.EndEvent(NetLog::TYPE_SOCKET_ALIVE, NULL);
236}
237
238void TCPClientSocketWin::AdoptSocket(SOCKET socket) {
239  DCHECK_EQ(socket_, INVALID_SOCKET);
240  socket_ = socket;
241  int error = SetupSocket();
242  DCHECK_EQ(0, error);
243  core_ = new Core(this);
244  current_ai_ = addresses_.head();
245  use_history_.set_was_ever_connected();
246}
247
248#ifdef ANDROID
249// TODO(kristianm): handle the case when wait_for_connect is true
250// (sync requests)
251#endif
252int TCPClientSocketWin::Connect(CompletionCallback* callback
253#ifdef ANDROID
254                                , bool wait_for_connect
255#endif
256                               ) {
257  DCHECK(CalledOnValidThread());
258
259  // If already connected, then just return OK.
260  if (socket_ != INVALID_SOCKET)
261    return OK;
262
263  base::StatsCounter connects("tcp.connect");
264  connects.Increment();
265
266  net_log_.BeginEvent(NetLog::TYPE_TCP_CONNECT,
267                      new AddressListNetLogParam(addresses_));
268
269  // We will try to connect to each address in addresses_. Start with the
270  // first one in the list.
271  next_connect_state_ = CONNECT_STATE_CONNECT;
272  current_ai_ = addresses_.head();
273
274  int rv = DoConnectLoop(OK);
275  if (rv == ERR_IO_PENDING) {
276    // Synchronous operation not supported.
277    DCHECK(callback);
278    read_callback_ = callback;
279  } else {
280    LogConnectCompletion(rv);
281  }
282
283  return rv;
284}
285
286int TCPClientSocketWin::DoConnectLoop(int result) {
287  DCHECK_NE(next_connect_state_, CONNECT_STATE_NONE);
288
289  int rv = result;
290  do {
291    ConnectState state = next_connect_state_;
292    next_connect_state_ = CONNECT_STATE_NONE;
293    switch (state) {
294      case CONNECT_STATE_CONNECT:
295        DCHECK_EQ(OK, rv);
296        rv = DoConnect();
297        break;
298      case CONNECT_STATE_CONNECT_COMPLETE:
299        rv = DoConnectComplete(rv);
300        break;
301      default:
302        LOG(DFATAL) << "bad state " << state;
303        rv = ERR_UNEXPECTED;
304        break;
305    }
306  } while (rv != ERR_IO_PENDING && next_connect_state_ != CONNECT_STATE_NONE);
307
308  return rv;
309}
310
311int TCPClientSocketWin::DoConnect() {
312  const struct addrinfo* ai = current_ai_;
313  DCHECK(ai);
314  DCHECK_EQ(0, connect_os_error_);
315
316  if (previously_disconnected_) {
317    use_history_.Reset();
318    previously_disconnected_ = false;
319  }
320
321  net_log_.BeginEvent(NetLog::TYPE_TCP_CONNECT_ATTEMPT,
322                      new NetLogStringParameter(
323                          "address", NetAddressToStringWithPort(current_ai_)));
324
325  next_connect_state_ = CONNECT_STATE_CONNECT_COMPLETE;
326
327  connect_os_error_ = CreateSocket(ai);
328  if (connect_os_error_ != 0)
329    return MapSystemError(connect_os_error_);
330
331  DCHECK(!core_);
332  core_ = new Core(this);
333
334  // WSACreateEvent creates a manual-reset event object.
335  core_->read_overlapped_.hEvent = WSACreateEvent();
336  // WSAEventSelect sets the socket to non-blocking mode as a side effect.
337  // Our connect() and recv() calls require that the socket be non-blocking.
338  WSAEventSelect(socket_, core_->read_overlapped_.hEvent, FD_CONNECT);
339
340  core_->write_overlapped_.hEvent = WSACreateEvent();
341
342  if (!connect(socket_, ai->ai_addr, static_cast<int>(ai->ai_addrlen))) {
343    // Connected without waiting!
344    //
345    // The MSDN page for connect says:
346    //   With a nonblocking socket, the connection attempt cannot be completed
347    //   immediately. In this case, connect will return SOCKET_ERROR, and
348    //   WSAGetLastError will return WSAEWOULDBLOCK.
349    // which implies that for a nonblocking socket, connect never returns 0.
350    // It's not documented whether the event object will be signaled or not
351    // if connect does return 0.  So the code below is essentially dead code
352    // and we don't know if it's correct.
353    NOTREACHED();
354
355    if (ResetEventIfSignaled(core_->read_overlapped_.hEvent))
356      return OK;
357  } else {
358    int os_error = WSAGetLastError();
359    if (os_error != WSAEWOULDBLOCK) {
360      LOG(ERROR) << "connect failed: " << os_error;
361      connect_os_error_ = os_error;
362      return MapConnectError(os_error);
363    }
364  }
365
366  core_->WatchForRead();
367  return ERR_IO_PENDING;
368}
369
370int TCPClientSocketWin::DoConnectComplete(int result) {
371  // Log the end of this attempt (and any OS error it threw).
372  int os_error = connect_os_error_;
373  connect_os_error_ = 0;
374  scoped_refptr<NetLog::EventParameters> params;
375  if (result != OK)
376    params = new NetLogIntegerParameter("os_error", os_error);
377  net_log_.EndEvent(NetLog::TYPE_TCP_CONNECT_ATTEMPT, params);
378
379  if (result == OK) {
380    use_history_.set_was_ever_connected();
381    return OK;  // Done!
382  }
383
384  // Close whatever partially connected socket we currently have.
385  DoDisconnect();
386
387  // Try to fall back to the next address in the list.
388  if (current_ai_->ai_next) {
389    next_connect_state_ = CONNECT_STATE_CONNECT;
390    current_ai_ = current_ai_->ai_next;
391    return OK;
392  }
393
394  // Otherwise there is nothing to fall back to, so give up.
395  return result;
396}
397
398void TCPClientSocketWin::Disconnect() {
399  DoDisconnect();
400  current_ai_ = NULL;
401}
402
// Shuts down and closes |socket_| (a no-op when no socket exists), clears the
// pending read/write flags, detaches and drops |core_|, and records that a
// disconnect has happened. Does not touch |current_ai_|.
void TCPClientSocketWin::DoDisconnect() {
  DCHECK(CalledOnValidThread());

  // Nothing to tear down when there is no socket.
  if (socket_ == INVALID_SOCKET)
    return;

  // Note: don't use CancelIo to cancel pending IO because it doesn't work
  // when there is a Winsock layered service provider.

  // In most socket implementations, closing a socket results in a graceful
  // connection shutdown, but in Winsock we have to call shutdown explicitly.
  // See the MSDN page "Graceful Shutdown, Linger Options, and Socket Closure"
  // at http://msdn.microsoft.com/en-us/library/ms738547.aspx
  shutdown(socket_, SD_SEND);

  // This cancels any pending IO.
  closesocket(socket_);
  socket_ = INVALID_SOCKET;

  if (waiting_connect()) {
    // We closed the socket, so this notification will never come.
    // From MSDN' WSAEventSelect documentation:
    // "Closing a socket with closesocket also cancels the association and
    // selection of network events specified in WSAEventSelect for the socket".
    // Release the reference that Core::WatchForRead() took for the connect,
    // since ReadDelegate::OnObjectSignaled() will not run to release it.
    core_->Release();
  }

  waiting_read_ = false;
  waiting_write_ = false;

  // Detach first so that a Core kept alive by pending IO no longer calls
  // back into this socket, then drop our reference to it.
  core_->Detach();
  core_ = NULL;

  // DoConnect() checks this flag and resets |use_history_| on the next
  // connect attempt.
  previously_disconnected_ = true;
}
438
439bool TCPClientSocketWin::IsConnected() const {
440  DCHECK(CalledOnValidThread());
441
442  if (socket_ == INVALID_SOCKET || waiting_connect())
443    return false;
444
445  // Check if connection is alive.
446  char c;
447  int rv = recv(socket_, &c, 1, MSG_PEEK);
448  if (rv == 0)
449    return false;
450  if (rv == SOCKET_ERROR && WSAGetLastError() != WSAEWOULDBLOCK)
451    return false;
452
453  return true;
454}
455
456bool TCPClientSocketWin::IsConnectedAndIdle() const {
457  DCHECK(CalledOnValidThread());
458
459  if (socket_ == INVALID_SOCKET || waiting_connect())
460    return false;
461
462  // Check if connection is alive and we haven't received any data
463  // unexpectedly.
464  char c;
465  int rv = recv(socket_, &c, 1, MSG_PEEK);
466  if (rv >= 0)
467    return false;
468  if (WSAGetLastError() != WSAEWOULDBLOCK)
469    return false;
470
471  return true;
472}
473
474int TCPClientSocketWin::GetPeerAddress(AddressList* address) const {
475  DCHECK(CalledOnValidThread());
476  DCHECK(address);
477  if (!IsConnected())
478    return ERR_SOCKET_NOT_CONNECTED;
479  address->Copy(current_ai_, false);
480  return OK;
481}
482
483int TCPClientSocketWin::GetLocalAddress(IPEndPoint* address) const {
484  DCHECK(CalledOnValidThread());
485  DCHECK(address);
486  if (!IsConnected())
487    return ERR_SOCKET_NOT_CONNECTED;
488
489  struct sockaddr_storage addr_storage;
490  socklen_t addr_len = sizeof(addr_storage);
491  struct sockaddr* addr = reinterpret_cast<struct sockaddr*>(&addr_storage);
492  if (getsockname(socket_, addr, &addr_len))
493    return MapSystemError(WSAGetLastError());
494  if (!address->FromSockAddr(addr, addr_len))
495    return ERR_FAILED;
496  return OK;
497}
498
// Forwards to |use_history_|.set_subresource_speculation().
// NOTE(review): presumably marks this socket as created speculatively for a
// subresource -- confirm against the |use_history_| type.
void TCPClientSocketWin::SetSubresourceSpeculation() {
  use_history_.set_subresource_speculation();
}
502
// Forwards to |use_history_|.set_omnibox_speculation().
// NOTE(review): presumably marks this socket as created speculatively from
// omnibox input -- confirm against the |use_history_| type.
void TCPClientSocketWin::SetOmniboxSpeculation() {
  use_history_.set_omnibox_speculation();
}
506
// Returns whether |use_history_| reports that data was conveyed. Read(),
// Write() and their completion handlers set that flag whenever a transfer
// moves more than zero bytes.
bool TCPClientSocketWin::WasEverUsed() const {
  return use_history_.was_used_to_convey_data();
}
510
// Reports whether TCP Fast Open is in use; always false in this
// implementation.
bool TCPClientSocketWin::UsingTCPFastOpen() const {
  // Not supported on windows.
  return false;
}
515
516int TCPClientSocketWin::Read(IOBuffer* buf,
517                             int buf_len,
518                             CompletionCallback* callback) {
519  DCHECK(CalledOnValidThread());
520  DCHECK_NE(socket_, INVALID_SOCKET);
521  DCHECK(!waiting_read_);
522  DCHECK(!read_callback_);
523  DCHECK(!core_->read_iobuffer_);
524
525  buf_len = core_->ThrottleReadSize(buf_len);
526
527  core_->read_buffer_.len = buf_len;
528  core_->read_buffer_.buf = buf->data();
529
530  // TODO(wtc): Remove the assertion after enough testing.
531  AssertEventNotSignaled(core_->read_overlapped_.hEvent);
532  DWORD num, flags = 0;
533  int rv = WSARecv(socket_, &core_->read_buffer_, 1, &num, &flags,
534                   &core_->read_overlapped_, NULL);
535  if (rv == 0) {
536    if (ResetEventIfSignaled(core_->read_overlapped_.hEvent)) {
537      // Because of how WSARecv fills memory when used asynchronously, Purify
538      // isn't able to detect that it's been initialized, so it scans for 0xcd
539      // in the buffer and reports UMRs (uninitialized memory reads) for those
540      // individual bytes. We override that in PURIFY builds to avoid the
541      // false error reports.
542      // See bug 5297.
543      base::MemoryDebug::MarkAsInitialized(core_->read_buffer_.buf, num);
544      base::StatsCounter read_bytes("tcp.read_bytes");
545      read_bytes.Add(num);
546      if (num > 0)
547        use_history_.set_was_used_to_convey_data();
548      LogByteTransfer(net_log_, NetLog::TYPE_SOCKET_BYTES_RECEIVED, num,
549                      core_->read_buffer_.buf);
550      return static_cast<int>(num);
551    }
552  } else {
553    int os_error = WSAGetLastError();
554    if (os_error != WSA_IO_PENDING)
555      return MapSystemError(os_error);
556  }
557  core_->WatchForRead();
558  waiting_read_ = true;
559  read_callback_ = callback;
560  core_->read_iobuffer_ = buf;
561  return ERR_IO_PENDING;
562}
563
564int TCPClientSocketWin::Write(IOBuffer* buf,
565                              int buf_len,
566                              CompletionCallback* callback) {
567  DCHECK(CalledOnValidThread());
568  DCHECK_NE(socket_, INVALID_SOCKET);
569  DCHECK(!waiting_write_);
570  DCHECK(!write_callback_);
571  DCHECK_GT(buf_len, 0);
572  DCHECK(!core_->write_iobuffer_);
573
574  base::StatsCounter writes("tcp.writes");
575  writes.Increment();
576
577  core_->write_buffer_.len = buf_len;
578  core_->write_buffer_.buf = buf->data();
579  core_->write_buffer_length_ = buf_len;
580
581  // TODO(wtc): Remove the assertion after enough testing.
582  AssertEventNotSignaled(core_->write_overlapped_.hEvent);
583  DWORD num;
584  int rv = WSASend(socket_, &core_->write_buffer_, 1, &num, 0,
585                   &core_->write_overlapped_, NULL);
586  if (rv == 0) {
587    if (ResetEventIfSignaled(core_->write_overlapped_.hEvent)) {
588      rv = static_cast<int>(num);
589      if (rv > buf_len || rv < 0) {
590        // It seems that some winsock interceptors report that more was written
591        // than was available. Treat this as an error.  http://crbug.com/27870
592        LOG(ERROR) << "Detected broken LSP: Asked to write " << buf_len
593                   << " bytes, but " << rv << " bytes reported.";
594        return ERR_WINSOCK_UNEXPECTED_WRITTEN_BYTES;
595      }
596      base::StatsCounter write_bytes("tcp.write_bytes");
597      write_bytes.Add(rv);
598      if (rv > 0)
599        use_history_.set_was_used_to_convey_data();
600      LogByteTransfer(net_log_, NetLog::TYPE_SOCKET_BYTES_SENT, rv,
601                      core_->write_buffer_.buf);
602      return rv;
603    }
604  } else {
605    int os_error = WSAGetLastError();
606    if (os_error != WSA_IO_PENDING)
607      return MapSystemError(os_error);
608  }
609  core_->WatchForWrite();
610  waiting_write_ = true;
611  write_callback_ = callback;
612  core_->write_iobuffer_ = buf;
613  return ERR_IO_PENDING;
614}
615
616bool TCPClientSocketWin::SetReceiveBufferSize(int32 size) {
617  DCHECK(CalledOnValidThread());
618  int rv = setsockopt(socket_, SOL_SOCKET, SO_RCVBUF,
619                      reinterpret_cast<const char*>(&size), sizeof(size));
620  DCHECK(!rv) << "Could not set socket receive buffer size: " << GetLastError();
621  return rv == 0;
622}
623
624bool TCPClientSocketWin::SetSendBufferSize(int32 size) {
625  DCHECK(CalledOnValidThread());
626  int rv = setsockopt(socket_, SOL_SOCKET, SO_SNDBUF,
627                      reinterpret_cast<const char*>(&size), sizeof(size));
628  DCHECK(!rv) << "Could not set socket send buffer size: " << GetLastError();
629  return rv == 0;
630}
631
632int TCPClientSocketWin::CreateSocket(const struct addrinfo* ai) {
633  socket_ = WSASocket(ai->ai_family, ai->ai_socktype, ai->ai_protocol, NULL, 0,
634                      WSA_FLAG_OVERLAPPED);
635  if (socket_ == INVALID_SOCKET) {
636    int os_error = WSAGetLastError();
637    LOG(ERROR) << "WSASocket failed: " << os_error;
638    return os_error;
639  }
640  return SetupSocket();
641}
642
643int TCPClientSocketWin::SetupSocket() {
644  // Increase the socket buffer sizes from the default sizes for WinXP.  In
645  // performance testing, there is substantial benefit by increasing from 8KB
646  // to 64KB.
647  // See also:
648  //    http://support.microsoft.com/kb/823764/EN-US
649  // On Vista, if we manually set these sizes, Vista turns off its receive
650  // window auto-tuning feature.
651  //    http://blogs.msdn.com/wndp/archive/2006/05/05/Winhec-blog-tcpip-2.aspx
652  // Since Vista's auto-tune is better than any static value we can could set,
653  // only change these on pre-vista machines.
654  int32 major_version, minor_version, fix_version;
655  base::SysInfo::OperatingSystemVersionNumbers(&major_version, &minor_version,
656    &fix_version);
657  if (major_version < 6) {
658    const int32 kSocketBufferSize = 64 * 1024;
659    SetReceiveBufferSize(kSocketBufferSize);
660    SetSendBufferSize(kSocketBufferSize);
661  }
662
663  // Disable Nagle.
664  // The Nagle implementation on windows is governed by RFC 896.  The idea
665  // behind Nagle is to reduce small packets on the network.  When Nagle is
666  // enabled, if a partial packet has been sent, the TCP stack will disallow
667  // further *partial* packets until an ACK has been received from the other
668  // side.  Good applications should always strive to send as much data as
669  // possible and avoid partial-packet sends.  However, in most real world
670  // applications, there are edge cases where this does not happen, and two
671  // partil packets may be sent back to back.  For a browser, it is NEVER
672  // a benefit to delay for an RTT before the second packet is sent.
673  //
674  // As a practical example in Chromium today, consider the case of a small
675  // POST.  I have verified this:
676  //     Client writes 649 bytes of header  (partial packet #1)
677  //     Client writes 50 bytes of POST data (partial packet #2)
678  // In the above example, with Nagle, a RTT delay is inserted between these
679  // two sends due to nagle.  RTTs can easily be 100ms or more.  The best
680  // fix is to make sure that for POSTing data, we write as much data as
681  // possible and minimize partial packets.  We will fix that.  But disabling
682  // Nagle also ensure we don't run into this delay in other edge cases.
683  // See also:
684  //    http://technet.microsoft.com/en-us/library/bb726981.aspx
685  const BOOL kDisableNagle = TRUE;
686  int rv = setsockopt(socket_, IPPROTO_TCP, TCP_NODELAY,
687                      reinterpret_cast<const char*>(&kDisableNagle),
688                      sizeof(kDisableNagle));
689  DCHECK(!rv) << "Could not disable nagle";
690
691  // Enable TCP Keep-Alive to prevent NAT routers from timing out TCP
692  // connections. See http://crbug.com/27400 for details.
693
694  struct tcp_keepalive keepalive_vals = {
695    1, // TCP keep-alive on.
696    45000,  // Wait 45s until sending first TCP keep-alive packet.
697    45000,  // Wait 45s between sending TCP keep-alive packets.
698  };
699  DWORD bytes_returned = 0xABAB;
700  rv = WSAIoctl(socket_, SIO_KEEPALIVE_VALS, &keepalive_vals,
701                sizeof(keepalive_vals), NULL, 0,
702                &bytes_returned, NULL, NULL);
703  DCHECK(!rv) << "Could not enable TCP Keep-Alive for socket: " << socket_
704              << " [error: " << WSAGetLastError() << "].";
705
706  // Disregard any failure in disabling nagle or enabling TCP Keep-Alive.
707  return 0;
708}
709
710void TCPClientSocketWin::LogConnectCompletion(int net_error) {
711  if (net_error == OK)
712    UpdateConnectionTypeHistograms(CONNECTION_ANY);
713
714  if (net_error != OK) {
715    net_log_.EndEventWithNetErrorCode(NetLog::TYPE_TCP_CONNECT, net_error);
716    return;
717  }
718
719  struct sockaddr_storage source_address;
720  socklen_t addrlen = sizeof(source_address);
721  int rv = getsockname(
722      socket_, reinterpret_cast<struct sockaddr*>(&source_address), &addrlen);
723  if (rv != 0) {
724    LOG(ERROR) << "getsockname() [rv: " << rv
725               << "] error: " << WSAGetLastError();
726    NOTREACHED();
727    net_log_.EndEventWithNetErrorCode(NetLog::TYPE_TCP_CONNECT, rv);
728    return;
729  }
730
731  const std::string source_address_str =
732      NetAddressToStringWithPort(
733          reinterpret_cast<const struct sockaddr*>(&source_address),
734          sizeof(source_address));
735  net_log_.EndEvent(NetLog::TYPE_TCP_CONNECT,
736                    make_scoped_refptr(new NetLogStringParameter(
737                        "source address",
738                        source_address_str)));
739}
740
741void TCPClientSocketWin::DoReadCallback(int rv) {
742  DCHECK_NE(rv, ERR_IO_PENDING);
743  DCHECK(read_callback_);
744
745  // since Run may result in Read being called, clear read_callback_ up front.
746  CompletionCallback* c = read_callback_;
747  read_callback_ = NULL;
748  c->Run(rv);
749}
750
751void TCPClientSocketWin::DoWriteCallback(int rv) {
752  DCHECK_NE(rv, ERR_IO_PENDING);
753  DCHECK(write_callback_);
754
755  // since Run may result in Write being called, clear write_callback_ up front.
756  CompletionCallback* c = write_callback_;
757  write_callback_ = NULL;
758  c->Run(rv);
759}
760
761void TCPClientSocketWin::DidCompleteConnect() {
762  DCHECK_EQ(next_connect_state_, CONNECT_STATE_CONNECT_COMPLETE);
763  int result;
764
765  WSANETWORKEVENTS events;
766  int rv = WSAEnumNetworkEvents(socket_, core_->read_overlapped_.hEvent,
767                                &events);
768  int os_error = 0;
769  if (rv == SOCKET_ERROR) {
770    NOTREACHED();
771    os_error = WSAGetLastError();
772    result = MapSystemError(os_error);
773  } else if (events.lNetworkEvents & FD_CONNECT) {
774    os_error = events.iErrorCode[FD_CONNECT_BIT];
775    result = MapConnectError(os_error);
776  } else {
777    NOTREACHED();
778    result = ERR_UNEXPECTED;
779  }
780
781  connect_os_error_ = os_error;
782  rv = DoConnectLoop(result);
783  if (rv != ERR_IO_PENDING) {
784    LogConnectCompletion(rv);
785    DoReadCallback(rv);
786  }
787}
788
789void TCPClientSocketWin::DidCompleteRead() {
790  DCHECK(waiting_read_);
791  DWORD num_bytes, flags;
792  BOOL ok = WSAGetOverlappedResult(socket_, &core_->read_overlapped_,
793                                   &num_bytes, FALSE, &flags);
794  WSAResetEvent(core_->read_overlapped_.hEvent);
795  waiting_read_ = false;
796  core_->read_iobuffer_ = NULL;
797  if (ok) {
798    base::StatsCounter read_bytes("tcp.read_bytes");
799    read_bytes.Add(num_bytes);
800    if (num_bytes > 0)
801      use_history_.set_was_used_to_convey_data();
802    LogByteTransfer(net_log_, NetLog::TYPE_SOCKET_BYTES_RECEIVED, num_bytes,
803                    core_->read_buffer_.buf);
804  }
805  DoReadCallback(ok ? num_bytes : MapSystemError(WSAGetLastError()));
806}
807
808void TCPClientSocketWin::DidCompleteWrite() {
809  DCHECK(waiting_write_);
810
811  DWORD num_bytes, flags;
812  BOOL ok = WSAGetOverlappedResult(socket_, &core_->write_overlapped_,
813                                   &num_bytes, FALSE, &flags);
814  WSAResetEvent(core_->write_overlapped_.hEvent);
815  waiting_write_ = false;
816  int rv;
817  if (!ok) {
818    rv = MapSystemError(WSAGetLastError());
819  } else {
820    rv = static_cast<int>(num_bytes);
821    if (rv > core_->write_buffer_length_ || rv < 0) {
822      // It seems that some winsock interceptors report that more was written
823      // than was available. Treat this as an error.  http://crbug.com/27870
824      LOG(ERROR) << "Detected broken LSP: Asked to write "
825                 << core_->write_buffer_length_ << " bytes, but " << rv
826                 << " bytes reported.";
827      rv = ERR_WINSOCK_UNEXPECTED_WRITTEN_BYTES;
828    } else {
829      base::StatsCounter write_bytes("tcp.write_bytes");
830      write_bytes.Add(num_bytes);
831      if (num_bytes > 0)
832        use_history_.set_was_used_to_convey_data();
833      LogByteTransfer(net_log_, NetLog::TYPE_SOCKET_BYTES_SENT, num_bytes,
834                      core_->write_buffer_.buf);
835    }
836  }
837  core_->write_iobuffer_ = NULL;
838  DoWriteCallback(rv);
839}
840
841}  // namespace net
842