tcp_client_socket_win.cc revision 7b9ca917061470268bf3395c8925d4b9cc52d8e1
1// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "net/socket/tcp_client_socket_win.h"
6
7#include "base/basictypes.h"
8#include "base/compiler_specific.h"
9#include "base/memory_debug.h"
10#include "base/metrics/stats_counters.h"
11#include "base/string_util.h"
12#include "base/sys_info.h"
13#include "net/base/address_list_net_log_param.h"
14#include "net/base/connection_type_histograms.h"
15#include "net/base/io_buffer.h"
16#include "net/base/net_errors.h"
17#include "net/base/net_log.h"
18#include "net/base/net_util.h"
19#include "net/base/network_change_notifier.h"
20#include "net/base/sys_addrinfo.h"
21#include "net/base/winsock_init.h"
22
23namespace net {
24
25namespace {
26
27// Assert that the (manual-reset) event object is not signaled.
28void AssertEventNotSignaled(WSAEVENT hEvent) {
29  DWORD wait_rv = WaitForSingleObject(hEvent, 0);
30  if (wait_rv != WAIT_TIMEOUT) {
31    DWORD err = ERROR_SUCCESS;
32    if (wait_rv == WAIT_FAILED)
33      err = GetLastError();
34    CHECK(false);  // Crash.
35    // This LOG statement is unreachable since we have already crashed, but it
36    // should prevent the compiler from optimizing away the |wait_rv| and
37    // |err| variables so they appear nicely on the stack in crash dumps.
38    VLOG(1) << "wait_rv=" << wait_rv << ", err=" << err;
39  }
40}
41
42// If the (manual-reset) event object is signaled, resets it and returns true.
43// Otherwise, does nothing and returns false.  Called after a Winsock function
44// succeeds synchronously
45//
46// Our testing shows that except in rare cases (when running inside QEMU),
47// the event object is already signaled at this point, so we call this method
48// to avoid a context switch in common cases.  This is just a performance
49// optimization.  The code still works if this function simply returns false.
50bool ResetEventIfSignaled(WSAEVENT hEvent) {
51  // TODO(wtc): Remove the CHECKs after enough testing.
52  DWORD wait_rv = WaitForSingleObject(hEvent, 0);
53  if (wait_rv == WAIT_TIMEOUT)
54    return false;  // The event object is not signaled.
55  CHECK_EQ(WAIT_OBJECT_0, wait_rv);
56  BOOL ok = WSAResetEvent(hEvent);
57  CHECK(ok);
58  return true;
59}
60
61//-----------------------------------------------------------------------------
62
// Translates a Winsock error code into the corresponding net error code.
// Codes without a dedicated mapping are logged and reported as ERR_FAILED.
int MapWinsockError(int os_error) {
  // There are numerous Winsock error codes, but these are the ones we thus far
  // find interesting.
  int net_error = ERR_FAILED;
  switch (os_error) {
    case ERROR_SUCCESS:
      net_error = OK;
      break;
    case WSAEACCES:
      net_error = ERR_ACCESS_DENIED;
      break;
    case WSAENETDOWN:
      net_error = ERR_INTERNET_DISCONNECTED;
      break;
    case WSAETIMEDOUT:
      net_error = ERR_TIMED_OUT;
      break;
    case WSAECONNRESET:
    case WSAENETRESET:  // Related to keep-alive
      net_error = ERR_CONNECTION_RESET;
      break;
    case WSAECONNABORTED:
      net_error = ERR_CONNECTION_ABORTED;
      break;
    case WSAECONNREFUSED:
      net_error = ERR_CONNECTION_REFUSED;
      break;
    case WSAEHOSTUNREACH:
    case WSAENETUNREACH:
      net_error = ERR_ADDRESS_UNREACHABLE;
      break;
    case WSAEADDRNOTAVAIL:
      net_error = ERR_ADDRESS_INVALID;
      break;
    case WSA_IO_INCOMPLETE:
    case WSAEDISCON:
      // WSAEDISCON is returned by WSARecv or WSARecvFrom for message-oriented
      // sockets (where a return value of zero means a zero-byte message) to
      // indicate graceful connection shutdown.  TCP sockets are byte stream
      // oriented, so we should never see this error code for them.
      LOG(DFATAL) << "Unexpected error " << os_error
                  << " mapped to net::ERR_UNEXPECTED";
      net_error = ERR_UNEXPECTED;
      break;
    default:
      LOG(WARNING) << "Unknown error " << os_error
                   << " mapped to net::ERR_FAILED";
      net_error = ERR_FAILED;
      break;
  }
  return net_error;
}
102
103int MapConnectError(int os_error) {
104  switch (os_error) {
105    // connect fails with WSAEACCES when Windows Firewall blocks the
106    // connection.
107    case WSAEACCES:
108      return ERR_NETWORK_ACCESS_DENIED;
109    case WSAETIMEDOUT:
110      return ERR_CONNECTION_TIMED_OUT;
111    default: {
112      int net_error = MapWinsockError(os_error);
113      if (net_error == ERR_FAILED)
114        return ERR_CONNECTION_FAILED;  // More specific than ERR_FAILED.
115
116      // Give a more specific error when the user is offline.
117      if (net_error == ERR_ADDRESS_UNREACHABLE &&
118          NetworkChangeNotifier::IsOffline()) {
119        return ERR_INTERNET_DISCONNECTED;
120      }
121
122      return net_error;
123    }
124  }
125}
126
127}  // namespace
128
129//-----------------------------------------------------------------------------
130
131// This class encapsulates all the state that has to be preserved as long as
132// there is a network IO operation in progress. If the owner TCPClientSocketWin
133// is destroyed while an operation is in progress, the Core is detached and it
134// lives until the operation completes and the OS doesn't reference any resource
135// declared on this class anymore.
136class TCPClientSocketWin::Core : public base::RefCounted<Core> {
137 public:
138  explicit Core(TCPClientSocketWin* socket);
139
140  // Start watching for the end of a read or write operation.
141  void WatchForRead();
142  void WatchForWrite();
143
144  // The TCPClientSocketWin is going away.
145  void Detach() { socket_ = NULL; }
146
147  // The separate OVERLAPPED variables for asynchronous operation.
148  // |read_overlapped_| is used for both Connect() and Read().
149  // |write_overlapped_| is only used for Write();
150  OVERLAPPED read_overlapped_;
151  OVERLAPPED write_overlapped_;
152
153  // The buffers used in Read() and Write().
154  WSABUF read_buffer_;
155  WSABUF write_buffer_;
156  scoped_refptr<IOBuffer> read_iobuffer_;
157  scoped_refptr<IOBuffer> write_iobuffer_;
158  int write_buffer_length_;
159
160  // Throttle the read size based on our current slow start state.
161  // Returns the throttled read size.
162  int ThrottleReadSize(int size) {
163    if (slow_start_throttle_ < kMaxSlowStartThrottle) {
164      size = std::min(size, slow_start_throttle_);
165      slow_start_throttle_ *= 2;
166    }
167    return size;
168  }
169
170 private:
171  friend class base::RefCounted<Core>;
172
173  class ReadDelegate : public base::ObjectWatcher::Delegate {
174   public:
175    explicit ReadDelegate(Core* core) : core_(core) {}
176    virtual ~ReadDelegate() {}
177
178    // base::ObjectWatcher::Delegate methods:
179    virtual void OnObjectSignaled(HANDLE object);
180
181   private:
182    Core* const core_;
183  };
184
185  class WriteDelegate : public base::ObjectWatcher::Delegate {
186   public:
187    explicit WriteDelegate(Core* core) : core_(core) {}
188    virtual ~WriteDelegate() {}
189
190    // base::ObjectWatcher::Delegate methods:
191    virtual void OnObjectSignaled(HANDLE object);
192
193   private:
194    Core* const core_;
195  };
196
197  ~Core();
198
199  // The socket that created this object.
200  TCPClientSocketWin* socket_;
201
202  // |reader_| handles the signals from |read_watcher_|.
203  ReadDelegate reader_;
204  // |writer_| handles the signals from |write_watcher_|.
205  WriteDelegate writer_;
206
207  // |read_watcher_| watches for events from Connect() and Read().
208  base::ObjectWatcher read_watcher_;
209  // |write_watcher_| watches for events from Write();
210  base::ObjectWatcher write_watcher_;
211
212  // When doing reads from the socket, we try to mirror TCP's slow start.
213  // We do this because otherwise the async IO subsystem artifically delays
214  // returning data to the application.
215  static const int kInitialSlowStartThrottle = 1 * 1024;
216  static const int kMaxSlowStartThrottle = 32 * kInitialSlowStartThrottle;
217  int slow_start_throttle_;
218
219  DISALLOW_COPY_AND_ASSIGN(Core);
220};
221
222TCPClientSocketWin::Core::Core(
223    TCPClientSocketWin* socket)
224    : write_buffer_length_(0),
225      socket_(socket),
226      ALLOW_THIS_IN_INITIALIZER_LIST(reader_(this)),
227      ALLOW_THIS_IN_INITIALIZER_LIST(writer_(this)),
228      slow_start_throttle_(kInitialSlowStartThrottle) {
229  memset(&read_overlapped_, 0, sizeof(read_overlapped_));
230  memset(&write_overlapped_, 0, sizeof(write_overlapped_));
231}
232
233TCPClientSocketWin::Core::~Core() {
234  // Make sure the message loop is not watching this object anymore.
235  read_watcher_.StopWatching();
236  write_watcher_.StopWatching();
237
238  WSACloseEvent(read_overlapped_.hEvent);
239  memset(&read_overlapped_, 0, sizeof(read_overlapped_));
240  WSACloseEvent(write_overlapped_.hEvent);
241  memset(&write_overlapped_, 0, sizeof(write_overlapped_));
242}
243
244void TCPClientSocketWin::Core::WatchForRead() {
245  // We grab an extra reference because there is an IO operation in progress.
246  // Balanced in ReadDelegate::OnObjectSignaled().
247  AddRef();
248  read_watcher_.StartWatching(read_overlapped_.hEvent, &reader_);
249}
250
251void TCPClientSocketWin::Core::WatchForWrite() {
252  // We grab an extra reference because there is an IO operation in progress.
253  // Balanced in WriteDelegate::OnObjectSignaled().
254  AddRef();
255  write_watcher_.StartWatching(write_overlapped_.hEvent, &writer_);
256}
257
258void TCPClientSocketWin::Core::ReadDelegate::OnObjectSignaled(
259    HANDLE object) {
260  DCHECK_EQ(object, core_->read_overlapped_.hEvent);
261  if (core_->socket_) {
262    if (core_->socket_->waiting_connect()) {
263      core_->socket_->DidCompleteConnect();
264    } else {
265      core_->socket_->DidCompleteRead();
266    }
267  }
268
269  core_->Release();
270}
271
272void TCPClientSocketWin::Core::WriteDelegate::OnObjectSignaled(
273    HANDLE object) {
274  DCHECK_EQ(object, core_->write_overlapped_.hEvent);
275  if (core_->socket_)
276    core_->socket_->DidCompleteWrite();
277
278  core_->Release();
279}
280
281//-----------------------------------------------------------------------------
282
283TCPClientSocketWin::TCPClientSocketWin(const AddressList& addresses,
284                                       net::NetLog* net_log,
285                                       const net::NetLog::Source& source)
286    : socket_(INVALID_SOCKET),
287      addresses_(addresses),
288      current_ai_(NULL),
289      waiting_read_(false),
290      waiting_write_(false),
291      read_callback_(NULL),
292      write_callback_(NULL),
293      next_connect_state_(CONNECT_STATE_NONE),
294      connect_os_error_(0),
295      net_log_(BoundNetLog::Make(net_log, NetLog::SOURCE_SOCKET)),
296      previously_disconnected_(false) {
297  scoped_refptr<NetLog::EventParameters> params;
298  if (source.is_valid())
299    params = new NetLogSourceParameter("source_dependency", source);
300  net_log_.BeginEvent(NetLog::TYPE_SOCKET_ALIVE, params);
301  EnsureWinsockInit();
302}
303
304TCPClientSocketWin::~TCPClientSocketWin() {
305  Disconnect();
306  net_log_.EndEvent(NetLog::TYPE_SOCKET_ALIVE, NULL);
307}
308
309void TCPClientSocketWin::AdoptSocket(SOCKET socket) {
310  DCHECK_EQ(socket_, INVALID_SOCKET);
311  socket_ = socket;
312  int error = SetupSocket();
313  DCHECK_EQ(0, error);
314  current_ai_ = addresses_.head();
315  use_history_.set_was_ever_connected();
316}
317
318#ifdef ANDROID
319// TODO(kristianm): handle the case when wait_for_connect is true
320// (sync requests)
321#endif
322int TCPClientSocketWin::Connect(CompletionCallback* callback
323#ifdef ANDROID
324                                , bool wait_for_connect
325#endif
326                               ) {
327  DCHECK(CalledOnValidThread());
328
329  // If already connected, then just return OK.
330  if (socket_ != INVALID_SOCKET)
331    return OK;
332
333  static base::StatsCounter connects("tcp.connect");
334  connects.Increment();
335
336  net_log_.BeginEvent(NetLog::TYPE_TCP_CONNECT,
337                      new AddressListNetLogParam(addresses_));
338
339  // We will try to connect to each address in addresses_. Start with the
340  // first one in the list.
341  next_connect_state_ = CONNECT_STATE_CONNECT;
342  current_ai_ = addresses_.head();
343
344  int rv = DoConnectLoop(OK);
345  if (rv == ERR_IO_PENDING) {
346    // Synchronous operation not supported.
347    DCHECK(callback);
348    read_callback_ = callback;
349  } else {
350    LogConnectCompletion(rv);
351  }
352
353  return rv;
354}
355
356int TCPClientSocketWin::DoConnectLoop(int result) {
357  DCHECK_NE(next_connect_state_, CONNECT_STATE_NONE);
358
359  int rv = result;
360  do {
361    ConnectState state = next_connect_state_;
362    next_connect_state_ = CONNECT_STATE_NONE;
363    switch (state) {
364      case CONNECT_STATE_CONNECT:
365        DCHECK_EQ(OK, rv);
366        rv = DoConnect();
367        break;
368      case CONNECT_STATE_CONNECT_COMPLETE:
369        rv = DoConnectComplete(rv);
370        break;
371      default:
372        LOG(DFATAL) << "bad state " << state;
373        rv = ERR_UNEXPECTED;
374        break;
375    }
376  } while (rv != ERR_IO_PENDING && next_connect_state_ != CONNECT_STATE_NONE);
377
378  return rv;
379}
380
381int TCPClientSocketWin::DoConnect() {
382  const struct addrinfo* ai = current_ai_;
383  DCHECK(ai);
384  DCHECK_EQ(0, connect_os_error_);
385
386  if (previously_disconnected_) {
387    use_history_.Reset();
388    previously_disconnected_ = false;
389  }
390
391  net_log_.BeginEvent(NetLog::TYPE_TCP_CONNECT_ATTEMPT,
392                      new NetLogStringParameter(
393                          "address", NetAddressToStringWithPort(current_ai_)));
394
395  next_connect_state_ = CONNECT_STATE_CONNECT_COMPLETE;
396
397  connect_os_error_ = CreateSocket(ai);
398  if (connect_os_error_ != 0)
399    return MapWinsockError(connect_os_error_);
400
401  DCHECK(!core_);
402  core_ = new Core(this);
403
404  // WSACreateEvent creates a manual-reset event object.
405  core_->read_overlapped_.hEvent = WSACreateEvent();
406  // WSAEventSelect sets the socket to non-blocking mode as a side effect.
407  // Our connect() and recv() calls require that the socket be non-blocking.
408  WSAEventSelect(socket_, core_->read_overlapped_.hEvent, FD_CONNECT);
409
410  core_->write_overlapped_.hEvent = WSACreateEvent();
411
412  if (!connect(socket_, ai->ai_addr, static_cast<int>(ai->ai_addrlen))) {
413    // Connected without waiting!
414    //
415    // The MSDN page for connect says:
416    //   With a nonblocking socket, the connection attempt cannot be completed
417    //   immediately. In this case, connect will return SOCKET_ERROR, and
418    //   WSAGetLastError will return WSAEWOULDBLOCK.
419    // which implies that for a nonblocking socket, connect never returns 0.
420    // It's not documented whether the event object will be signaled or not
421    // if connect does return 0.  So the code below is essentially dead code
422    // and we don't know if it's correct.
423    NOTREACHED();
424
425    if (ResetEventIfSignaled(core_->read_overlapped_.hEvent))
426      return OK;
427  } else {
428    int os_error = WSAGetLastError();
429    if (os_error != WSAEWOULDBLOCK) {
430      LOG(ERROR) << "connect failed: " << os_error;
431      connect_os_error_ = os_error;
432      return MapConnectError(os_error);
433    }
434  }
435
436  core_->WatchForRead();
437  return ERR_IO_PENDING;
438}
439
440int TCPClientSocketWin::DoConnectComplete(int result) {
441  // Log the end of this attempt (and any OS error it threw).
442  int os_error = connect_os_error_;
443  connect_os_error_ = 0;
444  scoped_refptr<NetLog::EventParameters> params;
445  if (result != OK)
446    params = new NetLogIntegerParameter("os_error", os_error);
447  net_log_.EndEvent(NetLog::TYPE_TCP_CONNECT_ATTEMPT, params);
448
449  if (result == OK) {
450    use_history_.set_was_ever_connected();
451    return OK;  // Done!
452  }
453
454  // Close whatever partially connected socket we currently have.
455  DoDisconnect();
456
457  // Try to fall back to the next address in the list.
458  if (current_ai_->ai_next) {
459    next_connect_state_ = CONNECT_STATE_CONNECT;
460    current_ai_ = current_ai_->ai_next;
461    return OK;
462  }
463
464  // Otherwise there is nothing to fall back to, so give up.
465  return result;
466}
467
468void TCPClientSocketWin::Disconnect() {
469  DoDisconnect();
470  current_ai_ = NULL;
471}
472
473void TCPClientSocketWin::DoDisconnect() {
474  DCHECK(CalledOnValidThread());
475
476  if (socket_ == INVALID_SOCKET)
477    return;
478
479  // Note: don't use CancelIo to cancel pending IO because it doesn't work
480  // when there is a Winsock layered service provider.
481
482  // In most socket implementations, closing a socket results in a graceful
483  // connection shutdown, but in Winsock we have to call shutdown explicitly.
484  // See the MSDN page "Graceful Shutdown, Linger Options, and Socket Closure"
485  // at http://msdn.microsoft.com/en-us/library/ms738547.aspx
486  shutdown(socket_, SD_SEND);
487
488  // This cancels any pending IO.
489  closesocket(socket_);
490  socket_ = INVALID_SOCKET;
491
492  if (waiting_connect()) {
493    // We closed the socket, so this notification will never come.
494    // From MSDN' WSAEventSelect documentation:
495    // "Closing a socket with closesocket also cancels the association and
496    // selection of network events specified in WSAEventSelect for the socket".
497    core_->Release();
498  }
499
500  waiting_read_ = false;
501  waiting_write_ = false;
502
503  core_->Detach();
504  core_ = NULL;
505
506  previously_disconnected_ = true;
507}
508
509bool TCPClientSocketWin::IsConnected() const {
510  DCHECK(CalledOnValidThread());
511
512  if (socket_ == INVALID_SOCKET || waiting_connect())
513    return false;
514
515  // Check if connection is alive.
516  char c;
517  int rv = recv(socket_, &c, 1, MSG_PEEK);
518  if (rv == 0)
519    return false;
520  if (rv == SOCKET_ERROR && WSAGetLastError() != WSAEWOULDBLOCK)
521    return false;
522
523  return true;
524}
525
526bool TCPClientSocketWin::IsConnectedAndIdle() const {
527  DCHECK(CalledOnValidThread());
528
529  if (socket_ == INVALID_SOCKET || waiting_connect())
530    return false;
531
532  // Check if connection is alive and we haven't received any data
533  // unexpectedly.
534  char c;
535  int rv = recv(socket_, &c, 1, MSG_PEEK);
536  if (rv >= 0)
537    return false;
538  if (WSAGetLastError() != WSAEWOULDBLOCK)
539    return false;
540
541  return true;
542}
543
544int TCPClientSocketWin::GetPeerAddress(AddressList* address) const {
545  DCHECK(CalledOnValidThread());
546  DCHECK(address);
547  if (!IsConnected())
548    return ERR_SOCKET_NOT_CONNECTED;
549  address->Copy(current_ai_, false);
550  return OK;
551}
552
553void TCPClientSocketWin::SetSubresourceSpeculation() {
554  use_history_.set_subresource_speculation();
555}
556
557void TCPClientSocketWin::SetOmniboxSpeculation() {
558  use_history_.set_omnibox_speculation();
559}
560
561bool TCPClientSocketWin::WasEverUsed() const {
562  return use_history_.was_used_to_convey_data();
563}
564
565bool TCPClientSocketWin::UsingTCPFastOpen() const {
566  // Not supported on windows.
567  return false;
568}
569
570int TCPClientSocketWin::Read(IOBuffer* buf,
571                             int buf_len,
572                             CompletionCallback* callback) {
573  DCHECK(CalledOnValidThread());
574  DCHECK_NE(socket_, INVALID_SOCKET);
575  DCHECK(!waiting_read_);
576  DCHECK(!read_callback_);
577  DCHECK(!core_->read_iobuffer_);
578
579  buf_len = core_->ThrottleReadSize(buf_len);
580
581  core_->read_buffer_.len = buf_len;
582  core_->read_buffer_.buf = buf->data();
583
584  // TODO(wtc): Remove the assertion after enough testing.
585  AssertEventNotSignaled(core_->read_overlapped_.hEvent);
586  DWORD num, flags = 0;
587  int rv = WSARecv(socket_, &core_->read_buffer_, 1, &num, &flags,
588                   &core_->read_overlapped_, NULL);
589  if (rv == 0) {
590    if (ResetEventIfSignaled(core_->read_overlapped_.hEvent)) {
591      // Because of how WSARecv fills memory when used asynchronously, Purify
592      // isn't able to detect that it's been initialized, so it scans for 0xcd
593      // in the buffer and reports UMRs (uninitialized memory reads) for those
594      // individual bytes. We override that in PURIFY builds to avoid the
595      // false error reports.
596      // See bug 5297.
597      base::MemoryDebug::MarkAsInitialized(core_->read_buffer_.buf, num);
598      static base::StatsCounter read_bytes("tcp.read_bytes");
599      read_bytes.Add(num);
600      if (num > 0)
601        use_history_.set_was_used_to_convey_data();
602      LogByteTransfer(net_log_, NetLog::TYPE_SOCKET_BYTES_RECEIVED, num,
603                      core_->read_buffer_.buf);
604      return static_cast<int>(num);
605    }
606  } else {
607    int os_error = WSAGetLastError();
608    if (os_error != WSA_IO_PENDING)
609      return MapWinsockError(os_error);
610  }
611  core_->WatchForRead();
612  waiting_read_ = true;
613  read_callback_ = callback;
614  core_->read_iobuffer_ = buf;
615  return ERR_IO_PENDING;
616}
617
618int TCPClientSocketWin::Write(IOBuffer* buf,
619                              int buf_len,
620                              CompletionCallback* callback) {
621  DCHECK(CalledOnValidThread());
622  DCHECK_NE(socket_, INVALID_SOCKET);
623  DCHECK(!waiting_write_);
624  DCHECK(!write_callback_);
625  DCHECK_GT(buf_len, 0);
626  DCHECK(!core_->write_iobuffer_);
627
628  static base::StatsCounter writes("tcp.writes");
629  writes.Increment();
630
631  core_->write_buffer_.len = buf_len;
632  core_->write_buffer_.buf = buf->data();
633  core_->write_buffer_length_ = buf_len;
634
635  // TODO(wtc): Remove the assertion after enough testing.
636  AssertEventNotSignaled(core_->write_overlapped_.hEvent);
637  DWORD num;
638  int rv = WSASend(socket_, &core_->write_buffer_, 1, &num, 0,
639                   &core_->write_overlapped_, NULL);
640  if (rv == 0) {
641    if (ResetEventIfSignaled(core_->write_overlapped_.hEvent)) {
642      rv = static_cast<int>(num);
643      if (rv > buf_len || rv < 0) {
644        // It seems that some winsock interceptors report that more was written
645        // than was available. Treat this as an error.  http://crbug.com/27870
646        LOG(ERROR) << "Detected broken LSP: Asked to write " << buf_len
647                   << " bytes, but " << rv << " bytes reported.";
648        return ERR_WINSOCK_UNEXPECTED_WRITTEN_BYTES;
649      }
650      static base::StatsCounter write_bytes("tcp.write_bytes");
651      write_bytes.Add(rv);
652      if (rv > 0)
653        use_history_.set_was_used_to_convey_data();
654      LogByteTransfer(net_log_, NetLog::TYPE_SOCKET_BYTES_SENT, rv,
655                      core_->write_buffer_.buf);
656      return rv;
657    }
658  } else {
659    int os_error = WSAGetLastError();
660    if (os_error != WSA_IO_PENDING)
661      return MapWinsockError(os_error);
662  }
663  core_->WatchForWrite();
664  waiting_write_ = true;
665  write_callback_ = callback;
666  core_->write_iobuffer_ = buf;
667  return ERR_IO_PENDING;
668}
669
670bool TCPClientSocketWin::SetReceiveBufferSize(int32 size) {
671  DCHECK(CalledOnValidThread());
672  int rv = setsockopt(socket_, SOL_SOCKET, SO_RCVBUF,
673                      reinterpret_cast<const char*>(&size), sizeof(size));
674  DCHECK(!rv) << "Could not set socket receive buffer size: " << GetLastError();
675  return rv == 0;
676}
677
678bool TCPClientSocketWin::SetSendBufferSize(int32 size) {
679  DCHECK(CalledOnValidThread());
680  int rv = setsockopt(socket_, SOL_SOCKET, SO_SNDBUF,
681                      reinterpret_cast<const char*>(&size), sizeof(size));
682  DCHECK(!rv) << "Could not set socket send buffer size: " << GetLastError();
683  return rv == 0;
684}
685
686int TCPClientSocketWin::CreateSocket(const struct addrinfo* ai) {
687  socket_ = WSASocket(ai->ai_family, ai->ai_socktype, ai->ai_protocol, NULL, 0,
688                      WSA_FLAG_OVERLAPPED);
689  if (socket_ == INVALID_SOCKET) {
690    int os_error = WSAGetLastError();
691    LOG(ERROR) << "WSASocket failed: " << os_error;
692    return os_error;
693  }
694  return SetupSocket();
695}
696
697int TCPClientSocketWin::SetupSocket() {
698  // Increase the socket buffer sizes from the default sizes for WinXP.  In
699  // performance testing, there is substantial benefit by increasing from 8KB
700  // to 64KB.
701  // See also:
702  //    http://support.microsoft.com/kb/823764/EN-US
703  // On Vista, if we manually set these sizes, Vista turns off its receive
704  // window auto-tuning feature.
705  //    http://blogs.msdn.com/wndp/archive/2006/05/05/Winhec-blog-tcpip-2.aspx
706  // Since Vista's auto-tune is better than any static value we can could set,
707  // only change these on pre-vista machines.
708  int32 major_version, minor_version, fix_version;
709  base::SysInfo::OperatingSystemVersionNumbers(&major_version, &minor_version,
710    &fix_version);
711  if (major_version < 6) {
712    const int32 kSocketBufferSize = 64 * 1024;
713    SetReceiveBufferSize(kSocketBufferSize);
714    SetSendBufferSize(kSocketBufferSize);
715  }
716
717  // Disable Nagle.
718  // The Nagle implementation on windows is governed by RFC 896.  The idea
719  // behind Nagle is to reduce small packets on the network.  When Nagle is
720  // enabled, if a partial packet has been sent, the TCP stack will disallow
721  // further *partial* packets until an ACK has been received from the other
722  // side.  Good applications should always strive to send as much data as
723  // possible and avoid partial-packet sends.  However, in most real world
724  // applications, there are edge cases where this does not happen, and two
725  // partil packets may be sent back to back.  For a browser, it is NEVER
726  // a benefit to delay for an RTT before the second packet is sent.
727  //
728  // As a practical example in Chromium today, consider the case of a small
729  // POST.  I have verified this:
730  //     Client writes 649 bytes of header  (partial packet #1)
731  //     Client writes 50 bytes of POST data (partial packet #2)
732  // In the above example, with Nagle, a RTT delay is inserted between these
733  // two sends due to nagle.  RTTs can easily be 100ms or more.  The best
734  // fix is to make sure that for POSTing data, we write as much data as
735  // possible and minimize partial packets.  We will fix that.  But disabling
736  // Nagle also ensure we don't run into this delay in other edge cases.
737  // See also:
738  //    http://technet.microsoft.com/en-us/library/bb726981.aspx
739  const BOOL kDisableNagle = TRUE;
740  int rv = setsockopt(socket_, IPPROTO_TCP, TCP_NODELAY,
741      reinterpret_cast<const char*>(&kDisableNagle), sizeof(kDisableNagle));
742  DCHECK(!rv) << "Could not disable nagle";
743
744  // Disregard any failure in disabling nagle.
745  return 0;
746}
747
748void TCPClientSocketWin::LogConnectCompletion(int net_error) {
749  scoped_refptr<NetLog::EventParameters> params;
750  if (net_error != OK)
751    params = new NetLogIntegerParameter("net_error", net_error);
752  net_log_.EndEvent(NetLog::TYPE_TCP_CONNECT, params);
753  if (net_error == OK)
754    UpdateConnectionTypeHistograms(CONNECTION_ANY);
755}
756
757void TCPClientSocketWin::DoReadCallback(int rv) {
758  DCHECK_NE(rv, ERR_IO_PENDING);
759  DCHECK(read_callback_);
760
761  // since Run may result in Read being called, clear read_callback_ up front.
762  CompletionCallback* c = read_callback_;
763  read_callback_ = NULL;
764  c->Run(rv);
765}
766
767void TCPClientSocketWin::DoWriteCallback(int rv) {
768  DCHECK_NE(rv, ERR_IO_PENDING);
769  DCHECK(write_callback_);
770
771  // since Run may result in Write being called, clear write_callback_ up front.
772  CompletionCallback* c = write_callback_;
773  write_callback_ = NULL;
774  c->Run(rv);
775}
776
777void TCPClientSocketWin::DidCompleteConnect() {
778  DCHECK_EQ(next_connect_state_, CONNECT_STATE_CONNECT_COMPLETE);
779  int result;
780
781  WSANETWORKEVENTS events;
782  int rv = WSAEnumNetworkEvents(socket_, core_->read_overlapped_.hEvent,
783                                &events);
784  int os_error = 0;
785  if (rv == SOCKET_ERROR) {
786    NOTREACHED();
787    os_error = WSAGetLastError();
788    result = MapWinsockError(os_error);
789  } else if (events.lNetworkEvents & FD_CONNECT) {
790    os_error = events.iErrorCode[FD_CONNECT_BIT];
791    result = MapConnectError(os_error);
792  } else {
793    NOTREACHED();
794    result = ERR_UNEXPECTED;
795  }
796
797  connect_os_error_ = os_error;
798  rv = DoConnectLoop(result);
799  if (rv != ERR_IO_PENDING) {
800    LogConnectCompletion(rv);
801    DoReadCallback(rv);
802  }
803}
804
805void TCPClientSocketWin::DidCompleteRead() {
806  DCHECK(waiting_read_);
807  DWORD num_bytes, flags;
808  BOOL ok = WSAGetOverlappedResult(socket_, &core_->read_overlapped_,
809                                   &num_bytes, FALSE, &flags);
810  WSAResetEvent(core_->read_overlapped_.hEvent);
811  waiting_read_ = false;
812  core_->read_iobuffer_ = NULL;
813  if (ok) {
814    static base::StatsCounter read_bytes("tcp.read_bytes");
815    read_bytes.Add(num_bytes);
816    if (num_bytes > 0)
817      use_history_.set_was_used_to_convey_data();
818    LogByteTransfer(net_log_, NetLog::TYPE_SOCKET_BYTES_RECEIVED, num_bytes,
819                    core_->read_buffer_.buf);
820  }
821  DoReadCallback(ok ? num_bytes : MapWinsockError(WSAGetLastError()));
822}
823
824void TCPClientSocketWin::DidCompleteWrite() {
825  DCHECK(waiting_write_);
826
827  DWORD num_bytes, flags;
828  BOOL ok = WSAGetOverlappedResult(socket_, &core_->write_overlapped_,
829                                   &num_bytes, FALSE, &flags);
830  WSAResetEvent(core_->write_overlapped_.hEvent);
831  waiting_write_ = false;
832  int rv;
833  if (!ok) {
834    rv = MapWinsockError(WSAGetLastError());
835  } else {
836    rv = static_cast<int>(num_bytes);
837    if (rv > core_->write_buffer_length_ || rv < 0) {
838      // It seems that some winsock interceptors report that more was written
839      // than was available. Treat this as an error.  http://crbug.com/27870
840      LOG(ERROR) << "Detected broken LSP: Asked to write "
841                 << core_->write_buffer_length_ << " bytes, but " << rv
842                 << " bytes reported.";
843      rv = ERR_WINSOCK_UNEXPECTED_WRITTEN_BYTES;
844    } else {
845      static base::StatsCounter write_bytes("tcp.write_bytes");
846      write_bytes.Add(num_bytes);
847      if (num_bytes > 0)
848        use_history_.set_was_used_to_convey_data();
849      LogByteTransfer(net_log_, NetLog::TYPE_SOCKET_BYTES_SENT, num_bytes,
850                      core_->write_buffer_.buf);
851    }
852  }
853  core_->write_iobuffer_ = NULL;
854  DoWriteCallback(rv);
855}
856
857}  // namespace net
858