tcp_client_socket_win.cc revision c407dc5cd9bdc5668497f21b26b09d988ab439de
1804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar// Copyright (c) 2010 The Chromium Authors. All rights reserved.
2804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar// Use of this source code is governed by a BSD-style license that can be
3804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar// found in the LICENSE file.
4804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar
5804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "net/socket/tcp_client_socket_win.h"
6804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar
7804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "base/basictypes.h"
8804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "base/compiler_specific.h"
9804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "base/memory_debug.h"
10804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "base/stats_counters.h"
11804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "base/string_util.h"
12804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "base/sys_info.h"
13804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "net/base/address_list_net_log_param.h"
14804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "net/base/connection_type_histograms.h"
15804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "net/base/io_buffer.h"
16804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "net/base/net_errors.h"
17804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "net/base/net_log.h"
18804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "net/base/net_util.h"
19804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "net/base/sys_addrinfo.h"
20804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar#include "net/base/winsock_init.h"
21804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar
22804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbarnamespace net {
23804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar
24804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbarnamespace {
25804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar
26804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar// If the (manual-reset) event object is signaled, resets it and returns true.
27804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar// Otherwise, does nothing and returns false.  Called after a Winsock function
28804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar// succeeds synchronously
29804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar//
30804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar// Our testing shows that except in rare cases (when running inside QEMU),
31804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar// the event object is already signaled at this point, so we call this method
32804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar// to avoid a context switch in common cases.  This is just a performance
33804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar// optimization.  The code still works if this function simply returns false.
34804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbarbool ResetEventIfSignaled(WSAEVENT hEvent) {
35804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar  // TODO(wtc): Remove the CHECKs after enough testing.
36804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar  DWORD wait_rv = WaitForSingleObject(hEvent, 0);
37804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar  if (wait_rv == WAIT_TIMEOUT)
38804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar    return false;  // The event object is not signaled.
39804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar  CHECK_EQ(WAIT_OBJECT_0, wait_rv);
40804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar  BOOL ok = WSAResetEvent(hEvent);
41804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar  CHECK(ok);
42804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar  return true;
43804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar}
44804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar
45804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar//-----------------------------------------------------------------------------
46804ead0dd1713c3c8a02853fc0a5d898a46889a9Daniel Dunbar
// Translates a Winsock error code into the corresponding net error code.
// Only the Winsock codes found interesting so far are mapped individually;
// every other code is logged and reported as ERR_FAILED.
int MapWinsockError(int os_error) {
  switch (os_error) {
    case ERROR_SUCCESS:
      return OK;

    case WSAEACCES:
      // connect fails with WSAEACCES when Windows Firewall blocks the
      // connection.
      return ERR_ACCESS_DENIED;

    case WSAENETDOWN:
      return ERR_INTERNET_DISCONNECTED;

    case WSAETIMEDOUT:
      return ERR_TIMED_OUT;

    case WSAENETRESET:  // Related to keep-alive
    case WSAECONNRESET:
      return ERR_CONNECTION_RESET;

    case WSAECONNABORTED:
      return ERR_CONNECTION_ABORTED;

    case WSAECONNREFUSED:
      return ERR_CONNECTION_REFUSED;

    case WSAEDISCON:
      // WSARecv and WSARecvFrom return this for message-oriented sockets
      // (where a return value of zero means a zero-byte message) to indicate
      // graceful connection shutdown.  TCP sockets are byte stream oriented,
      // so this code should never be seen here.
      NOTREACHED();
      return ERR_CONNECTION_CLOSED;

    case WSAENETUNREACH:
    case WSAEHOSTUNREACH:
      return ERR_ADDRESS_UNREACHABLE;

    case WSAEADDRNOTAVAIL:
      return ERR_ADDRESS_INVALID;

    case WSA_IO_INCOMPLETE:
      return ERR_UNEXPECTED;

    default:
      LOG(WARNING) << "Unknown error " << os_error
                   << " mapped to net::ERR_FAILED";
      return ERR_FAILED;
  }
}
88
89int MapConnectError(int os_error) {
90  switch (os_error) {
91    case WSAETIMEDOUT:
92      return ERR_CONNECTION_TIMED_OUT;
93    default: {
94      int net_error = MapWinsockError(os_error);
95      if (net_error == ERR_FAILED)
96        return ERR_CONNECTION_FAILED;  // More specific than ERR_FAILED.
97      return net_error;
98    }
99  }
100}
101
102}  // namespace
103
104//-----------------------------------------------------------------------------
105
106// This class encapsulates all the state that has to be preserved as long as
107// there is a network IO operation in progress. If the owner TCPClientSocketWin
108// is destroyed while an operation is in progress, the Core is detached and it
109// lives until the operation completes and the OS doesn't reference any resource
110// declared on this class anymore.
111class TCPClientSocketWin::Core : public base::RefCounted<Core> {
112 public:
113  explicit Core(TCPClientSocketWin* socket);
114
115  // Start watching for the end of a read or write operation.
116  void WatchForRead();
117  void WatchForWrite();
118
119  // The TCPClientSocketWin is going away.
120  void Detach() { socket_ = NULL; }
121
122  // The separate OVERLAPPED variables for asynchronous operation.
123  // |read_overlapped_| is used for both Connect() and Read().
124  // |write_overlapped_| is only used for Write();
125  OVERLAPPED read_overlapped_;
126  OVERLAPPED write_overlapped_;
127
128  // The buffers used in Read() and Write().
129  WSABUF read_buffer_;
130  WSABUF write_buffer_;
131  scoped_refptr<IOBuffer> read_iobuffer_;
132  scoped_refptr<IOBuffer> write_iobuffer_;
133  int write_buffer_length_;
134
135  // Throttle the read size based on our current slow start state.
136  // Returns the throttled read size.
137  int ThrottleReadSize(int size) {
138    if (slow_start_throttle_ < kMaxSlowStartThrottle) {
139      size = std::min(size, slow_start_throttle_);
140      slow_start_throttle_ *= 2;
141    }
142    return size;
143  }
144
145 private:
146  friend class base::RefCounted<Core>;
147
148  class ReadDelegate : public base::ObjectWatcher::Delegate {
149   public:
150    explicit ReadDelegate(Core* core) : core_(core) {}
151    virtual ~ReadDelegate() {}
152
153    // base::ObjectWatcher::Delegate methods:
154    virtual void OnObjectSignaled(HANDLE object);
155
156   private:
157    Core* const core_;
158  };
159
160  class WriteDelegate : public base::ObjectWatcher::Delegate {
161   public:
162    explicit WriteDelegate(Core* core) : core_(core) {}
163    virtual ~WriteDelegate() {}
164
165    // base::ObjectWatcher::Delegate methods:
166    virtual void OnObjectSignaled(HANDLE object);
167
168   private:
169    Core* const core_;
170  };
171
172  ~Core();
173
174  // The socket that created this object.
175  TCPClientSocketWin* socket_;
176
177  // |reader_| handles the signals from |read_watcher_|.
178  ReadDelegate reader_;
179  // |writer_| handles the signals from |write_watcher_|.
180  WriteDelegate writer_;
181
182  // |read_watcher_| watches for events from Connect() and Read().
183  base::ObjectWatcher read_watcher_;
184  // |write_watcher_| watches for events from Write();
185  base::ObjectWatcher write_watcher_;
186
187  // When doing reads from the socket, we try to mirror TCP's slow start.
188  // We do this because otherwise the async IO subsystem artifically delays
189  // returning data to the application.
190  static const int kInitialSlowStartThrottle = 1 * 1024;
191  static const int kMaxSlowStartThrottle = 32 * kInitialSlowStartThrottle;
192  int slow_start_throttle_;
193
194  DISALLOW_COPY_AND_ASSIGN(Core);
195};
196
197TCPClientSocketWin::Core::Core(
198    TCPClientSocketWin* socket)
199    : write_buffer_length_(0),
200      socket_(socket),
201      ALLOW_THIS_IN_INITIALIZER_LIST(reader_(this)),
202      ALLOW_THIS_IN_INITIALIZER_LIST(writer_(this)),
203      slow_start_throttle_(kInitialSlowStartThrottle) {
204  memset(&read_overlapped_, 0, sizeof(read_overlapped_));
205  memset(&write_overlapped_, 0, sizeof(write_overlapped_));
206}
207
208TCPClientSocketWin::Core::~Core() {
209  // Make sure the message loop is not watching this object anymore.
210  read_watcher_.StopWatching();
211  write_watcher_.StopWatching();
212
213  WSACloseEvent(read_overlapped_.hEvent);
214  memset(&read_overlapped_, 0, sizeof(read_overlapped_));
215  WSACloseEvent(write_overlapped_.hEvent);
216  memset(&write_overlapped_, 0, sizeof(write_overlapped_));
217}
218
219void TCPClientSocketWin::Core::WatchForRead() {
220  // We grab an extra reference because there is an IO operation in progress.
221  // Balanced in ReadDelegate::OnObjectSignaled().
222  AddRef();
223  read_watcher_.StartWatching(read_overlapped_.hEvent, &reader_);
224}
225
226void TCPClientSocketWin::Core::WatchForWrite() {
227  // We grab an extra reference because there is an IO operation in progress.
228  // Balanced in WriteDelegate::OnObjectSignaled().
229  AddRef();
230  write_watcher_.StartWatching(write_overlapped_.hEvent, &writer_);
231}
232
233void TCPClientSocketWin::Core::ReadDelegate::OnObjectSignaled(
234    HANDLE object) {
235  DCHECK_EQ(object, core_->read_overlapped_.hEvent);
236  if (core_->socket_) {
237    if (core_->socket_->waiting_connect()) {
238      core_->socket_->DidCompleteConnect();
239    } else {
240      core_->socket_->DidCompleteRead();
241    }
242  }
243
244  core_->Release();
245}
246
247void TCPClientSocketWin::Core::WriteDelegate::OnObjectSignaled(
248    HANDLE object) {
249  DCHECK_EQ(object, core_->write_overlapped_.hEvent);
250  if (core_->socket_)
251    core_->socket_->DidCompleteWrite();
252
253  core_->Release();
254}
255
256//-----------------------------------------------------------------------------
257
258TCPClientSocketWin::TCPClientSocketWin(const AddressList& addresses,
259                                       net::NetLog* net_log)
260    : socket_(INVALID_SOCKET),
261      addresses_(addresses),
262      current_ai_(NULL),
263      waiting_read_(false),
264      waiting_write_(false),
265      read_callback_(NULL),
266      write_callback_(NULL),
267      next_connect_state_(CONNECT_STATE_NONE),
268      connect_os_error_(0),
269      net_log_(BoundNetLog::Make(net_log, NetLog::SOURCE_SOCKET)) {
270  net_log_.BeginEvent(NetLog::TYPE_SOCKET_ALIVE, NULL);
271  EnsureWinsockInit();
272}
273
274TCPClientSocketWin::~TCPClientSocketWin() {
275  Disconnect();
276  net_log_.EndEvent(NetLog::TYPE_SOCKET_ALIVE, NULL);
277}
278
279int TCPClientSocketWin::Connect(CompletionCallback* callback) {
280  DCHECK(CalledOnValidThread());
281
282  // If already connected, then just return OK.
283  if (socket_ != INVALID_SOCKET)
284    return OK;
285
286  static StatsCounter connects("tcp.connect");
287  connects.Increment();
288
289  net_log_.BeginEvent(NetLog::TYPE_TCP_CONNECT,
290                      new AddressListNetLogParam(addresses_));
291
292  // We will try to connect to each address in addresses_. Start with the
293  // first one in the list.
294  next_connect_state_ = CONNECT_STATE_CONNECT;
295  current_ai_ = addresses_.head();
296
297  int rv = DoConnectLoop(OK);
298  if (rv == ERR_IO_PENDING) {
299    // Synchronous operation not supported.
300    DCHECK(callback);
301    read_callback_ = callback;
302  } else {
303    LogConnectCompletion(rv);
304  }
305
306  return rv;
307}
308
309int TCPClientSocketWin::DoConnectLoop(int result) {
310  DCHECK_NE(next_connect_state_, CONNECT_STATE_NONE);
311
312  int rv = result;
313  do {
314    ConnectState state = next_connect_state_;
315    next_connect_state_ = CONNECT_STATE_NONE;
316    switch (state) {
317      case CONNECT_STATE_CONNECT:
318        DCHECK_EQ(OK, rv);
319        rv = DoConnect();
320        break;
321      case CONNECT_STATE_CONNECT_COMPLETE:
322        rv = DoConnectComplete(rv);
323        break;
324      default:
325        LOG(DFATAL) << "bad state";
326        rv = ERR_UNEXPECTED;
327        break;
328    }
329  } while (rv != ERR_IO_PENDING && next_connect_state_ != CONNECT_STATE_NONE);
330
331  return rv;
332}
333
334int TCPClientSocketWin::DoConnect() {
335  const struct addrinfo* ai = current_ai_;
336  DCHECK(ai);
337  DCHECK_EQ(0, connect_os_error_);
338
339  net_log_.BeginEvent(NetLog::TYPE_TCP_CONNECT_ATTEMPT,
340                      new NetLogStringParameter(
341                          "address", NetAddressToStringWithPort(current_ai_)));
342
343  next_connect_state_ = CONNECT_STATE_CONNECT_COMPLETE;
344
345  connect_os_error_ = CreateSocket(ai);
346  if (connect_os_error_ != 0)
347    return MapWinsockError(connect_os_error_);
348
349  DCHECK(!core_);
350  core_ = new Core(this);
351
352  // WSACreateEvent creates a manual-reset event object.
353  core_->read_overlapped_.hEvent = WSACreateEvent();
354  // WSAEventSelect sets the socket to non-blocking mode as a side effect.
355  // Our connect() and recv() calls require that the socket be non-blocking.
356  WSAEventSelect(socket_, core_->read_overlapped_.hEvent, FD_CONNECT);
357
358  core_->write_overlapped_.hEvent = WSACreateEvent();
359
360  if (!connect(socket_, ai->ai_addr, static_cast<int>(ai->ai_addrlen))) {
361    // Connected without waiting!
362    //
363    // The MSDN page for connect says:
364    //   With a nonblocking socket, the connection attempt cannot be completed
365    //   immediately. In this case, connect will return SOCKET_ERROR, and
366    //   WSAGetLastError will return WSAEWOULDBLOCK.
367    // which implies that for a nonblocking socket, connect never returns 0.
368    // It's not documented whether the event object will be signaled or not
369    // if connect does return 0.  So the code below is essentially dead code
370    // and we don't know if it's correct.
371    NOTREACHED();
372
373    if (ResetEventIfSignaled(core_->read_overlapped_.hEvent))
374      return OK;
375  } else {
376    int os_error = WSAGetLastError();
377    if (os_error != WSAEWOULDBLOCK) {
378      LOG(ERROR) << "connect failed: " << os_error;
379      connect_os_error_ = os_error;
380      return MapConnectError(os_error);
381    }
382  }
383
384  core_->WatchForRead();
385  return ERR_IO_PENDING;
386}
387
388int TCPClientSocketWin::DoConnectComplete(int result) {
389  // Log the end of this attempt (and any OS error it threw).
390  int os_error = connect_os_error_;
391  connect_os_error_ = 0;
392  scoped_refptr<NetLog::EventParameters> params;
393  if (result != OK)
394    params = new NetLogIntegerParameter("os_error", os_error);
395  net_log_.EndEvent(NetLog::TYPE_TCP_CONNECT_ATTEMPT, params);
396
397  if (result == OK)
398    return OK;  // Done!
399
400  // Close whatever partially connected socket we currently have.
401  DoDisconnect();
402
403  // Try to fall back to the next address in the list.
404  if (current_ai_->ai_next) {
405    next_connect_state_ = CONNECT_STATE_CONNECT;
406    current_ai_ = current_ai_->ai_next;
407    return OK;
408  }
409
410  // Otherwise there is nothing to fall back to, so give up.
411  return result;
412}
413
414void TCPClientSocketWin::Disconnect() {
415  DoDisconnect();
416  current_ai_ = NULL;
417}
418
419void TCPClientSocketWin::DoDisconnect() {
420  DCHECK(CalledOnValidThread());
421
422  if (socket_ == INVALID_SOCKET)
423    return;
424
425  // Note: don't use CancelIo to cancel pending IO because it doesn't work
426  // when there is a Winsock layered service provider.
427
428  // In most socket implementations, closing a socket results in a graceful
429  // connection shutdown, but in Winsock we have to call shutdown explicitly.
430  // See the MSDN page "Graceful Shutdown, Linger Options, and Socket Closure"
431  // at http://msdn.microsoft.com/en-us/library/ms738547.aspx
432  shutdown(socket_, SD_SEND);
433
434  // This cancels any pending IO.
435  closesocket(socket_);
436  socket_ = INVALID_SOCKET;
437
438  if (waiting_connect()) {
439    // We closed the socket, so this notification will never come.
440    // From MSDN' WSAEventSelect documentation:
441    // "Closing a socket with closesocket also cancels the association and
442    // selection of network events specified in WSAEventSelect for the socket".
443    core_->Release();
444  }
445
446  waiting_read_ = false;
447  waiting_write_ = false;
448
449  core_->Detach();
450  core_ = NULL;
451}
452
453bool TCPClientSocketWin::IsConnected() const {
454  DCHECK(CalledOnValidThread());
455
456  if (socket_ == INVALID_SOCKET || waiting_connect())
457    return false;
458
459  // Check if connection is alive.
460  char c;
461  int rv = recv(socket_, &c, 1, MSG_PEEK);
462  if (rv == 0)
463    return false;
464  if (rv == SOCKET_ERROR && WSAGetLastError() != WSAEWOULDBLOCK)
465    return false;
466
467  return true;
468}
469
470bool TCPClientSocketWin::IsConnectedAndIdle() const {
471  DCHECK(CalledOnValidThread());
472
473  if (socket_ == INVALID_SOCKET || waiting_connect())
474    return false;
475
476  // Check if connection is alive and we haven't received any data
477  // unexpectedly.
478  char c;
479  int rv = recv(socket_, &c, 1, MSG_PEEK);
480  if (rv >= 0)
481    return false;
482  if (WSAGetLastError() != WSAEWOULDBLOCK)
483    return false;
484
485  return true;
486}
487
488int TCPClientSocketWin::GetPeerAddress(AddressList* address) const {
489  DCHECK(CalledOnValidThread());
490  DCHECK(address);
491  if (!current_ai_)
492    return ERR_FAILED;
493  address->Copy(current_ai_, false);
494  return OK;
495}
496
497int TCPClientSocketWin::Read(IOBuffer* buf,
498                             int buf_len,
499                             CompletionCallback* callback) {
500  DCHECK(CalledOnValidThread());
501  DCHECK_NE(socket_, INVALID_SOCKET);
502  DCHECK(!waiting_read_);
503  DCHECK(!read_callback_);
504  DCHECK(!core_->read_iobuffer_);
505
506  buf_len = core_->ThrottleReadSize(buf_len);
507
508  core_->read_buffer_.len = buf_len;
509  core_->read_buffer_.buf = buf->data();
510
511  // TODO(wtc): Remove the CHECK after enough testing.
512  CHECK_EQ(static_cast<DWORD>(WAIT_TIMEOUT),
513           WaitForSingleObject(core_->read_overlapped_.hEvent, 0));
514  DWORD num, flags = 0;
515  int rv = WSARecv(socket_, &core_->read_buffer_, 1, &num, &flags,
516                   &core_->read_overlapped_, NULL);
517  if (rv == 0) {
518    if (ResetEventIfSignaled(core_->read_overlapped_.hEvent)) {
519      // Because of how WSARecv fills memory when used asynchronously, Purify
520      // isn't able to detect that it's been initialized, so it scans for 0xcd
521      // in the buffer and reports UMRs (uninitialized memory reads) for those
522      // individual bytes. We override that in PURIFY builds to avoid the
523      // false error reports.
524      // See bug 5297.
525      base::MemoryDebug::MarkAsInitialized(core_->read_buffer_.buf, num);
526      static StatsCounter read_bytes("tcp.read_bytes");
527      read_bytes.Add(num);
528      net_log_.AddEvent(NetLog::TYPE_SOCKET_BYTES_RECEIVED,
529                        new NetLogIntegerParameter("num_bytes", num));
530      return static_cast<int>(num);
531    }
532  } else {
533    int os_error = WSAGetLastError();
534    if (os_error != WSA_IO_PENDING)
535      return MapWinsockError(os_error);
536  }
537  core_->WatchForRead();
538  waiting_read_ = true;
539  read_callback_ = callback;
540  core_->read_iobuffer_ = buf;
541  return ERR_IO_PENDING;
542}
543
544int TCPClientSocketWin::Write(IOBuffer* buf,
545                              int buf_len,
546                              CompletionCallback* callback) {
547  DCHECK(CalledOnValidThread());
548  DCHECK_NE(socket_, INVALID_SOCKET);
549  DCHECK(!waiting_write_);
550  DCHECK(!write_callback_);
551  DCHECK_GT(buf_len, 0);
552  DCHECK(!core_->write_iobuffer_);
553
554  static StatsCounter reads("tcp.writes");
555  reads.Increment();
556
557  core_->write_buffer_.len = buf_len;
558  core_->write_buffer_.buf = buf->data();
559  core_->write_buffer_length_ = buf_len;
560
561  // TODO(wtc): Remove the CHECK after enough testing.
562  CHECK_EQ(static_cast<DWORD>(WAIT_TIMEOUT),
563           WaitForSingleObject(core_->write_overlapped_.hEvent, 0));
564  DWORD num;
565  int rv = WSASend(socket_, &core_->write_buffer_, 1, &num, 0,
566                   &core_->write_overlapped_, NULL);
567  if (rv == 0) {
568    if (ResetEventIfSignaled(core_->write_overlapped_.hEvent)) {
569      rv = static_cast<int>(num);
570      if (rv > buf_len || rv < 0) {
571        // It seems that some winsock interceptors report that more was written
572        // than was available. Treat this as an error.  http://crbug.com/27870
573        LOG(ERROR) << "Detected broken LSP: Asked to write " << buf_len
574                   << " bytes, but " << rv << " bytes reported.";
575        return ERR_WINSOCK_UNEXPECTED_WRITTEN_BYTES;
576      }
577      static StatsCounter write_bytes("tcp.write_bytes");
578      write_bytes.Add(rv);
579      net_log_.AddEvent(NetLog::TYPE_SOCKET_BYTES_SENT,
580                        new NetLogIntegerParameter("num_bytes", rv));
581      return rv;
582    }
583  } else {
584    int os_error = WSAGetLastError();
585    if (os_error != WSA_IO_PENDING)
586      return MapWinsockError(os_error);
587  }
588  core_->WatchForWrite();
589  waiting_write_ = true;
590  write_callback_ = callback;
591  core_->write_iobuffer_ = buf;
592  return ERR_IO_PENDING;
593}
594
595bool TCPClientSocketWin::SetReceiveBufferSize(int32 size) {
596  DCHECK(CalledOnValidThread());
597  int rv = setsockopt(socket_, SOL_SOCKET, SO_RCVBUF,
598                      reinterpret_cast<const char*>(&size), sizeof(size));
599  DCHECK(!rv) << "Could not set socket receive buffer size: " << GetLastError();
600  return rv == 0;
601}
602
603bool TCPClientSocketWin::SetSendBufferSize(int32 size) {
604  DCHECK(CalledOnValidThread());
605  int rv = setsockopt(socket_, SOL_SOCKET, SO_SNDBUF,
606                      reinterpret_cast<const char*>(&size), sizeof(size));
607  DCHECK(!rv) << "Could not set socket send buffer size: " << GetLastError();
608  return rv == 0;
609}
610
611int TCPClientSocketWin::CreateSocket(const struct addrinfo* ai) {
612  socket_ = WSASocket(ai->ai_family, ai->ai_socktype, ai->ai_protocol, NULL, 0,
613                      WSA_FLAG_OVERLAPPED);
614  if (socket_ == INVALID_SOCKET) {
615    int os_error = WSAGetLastError();
616    LOG(ERROR) << "WSASocket failed: " << os_error;
617    return os_error;
618  }
619
620  // Increase the socket buffer sizes from the default sizes for WinXP.  In
621  // performance testing, there is substantial benefit by increasing from 8KB
622  // to 64KB.
623  // See also:
624  //    http://support.microsoft.com/kb/823764/EN-US
625  // On Vista, if we manually set these sizes, Vista turns off its receive
626  // window auto-tuning feature.
627  //    http://blogs.msdn.com/wndp/archive/2006/05/05/Winhec-blog-tcpip-2.aspx
628  // Since Vista's auto-tune is better than any static value we can could set,
629  // only change these on pre-vista machines.
630  int32 major_version, minor_version, fix_version;
631  base::SysInfo::OperatingSystemVersionNumbers(&major_version, &minor_version,
632    &fix_version);
633  if (major_version < 6) {
634    const int32 kSocketBufferSize = 64 * 1024;
635    SetReceiveBufferSize(kSocketBufferSize);
636    SetSendBufferSize(kSocketBufferSize);
637  }
638
639  // Disable Nagle.
640  // The Nagle implementation on windows is governed by RFC 896.  The idea
641  // behind Nagle is to reduce small packets on the network.  When Nagle is
642  // enabled, if a partial packet has been sent, the TCP stack will disallow
643  // further *partial* packets until an ACK has been received from the other
644  // side.  Good applications should always strive to send as much data as
645  // possible and avoid partial-packet sends.  However, in most real world
646  // applications, there are edge cases where this does not happen, and two
647  // partil packets may be sent back to back.  For a browser, it is NEVER
648  // a benefit to delay for an RTT before the second packet is sent.
649  //
650  // As a practical example in Chromium today, consider the case of a small
651  // POST.  I have verified this:
652  //     Client writes 649 bytes of header  (partial packet #1)
653  //     Client writes 50 bytes of POST data (partial packet #2)
654  // In the above example, with Nagle, a RTT delay is inserted between these
655  // two sends due to nagle.  RTTs can easily be 100ms or more.  The best
656  // fix is to make sure that for POSTing data, we write as much data as
657  // possible and minimize partial packets.  We will fix that.  But disabling
658  // Nagle also ensure we don't run into this delay in other edge cases.
659  // See also:
660  //    http://technet.microsoft.com/en-us/library/bb726981.aspx
661  const BOOL kDisableNagle = TRUE;
662  int rv = setsockopt(socket_, IPPROTO_TCP, TCP_NODELAY,
663      reinterpret_cast<const char*>(&kDisableNagle), sizeof(kDisableNagle));
664  DCHECK(!rv) << "Could not disable nagle";
665
666  // Disregard any failure in disabling nagle.
667  return 0;
668}
669
670void TCPClientSocketWin::LogConnectCompletion(int net_error) {
671  scoped_refptr<NetLog::EventParameters> params;
672  if (net_error != OK)
673    params = new NetLogIntegerParameter("net_error", net_error);
674  net_log_.EndEvent(NetLog::TYPE_TCP_CONNECT, params);
675  if (net_error == OK)
676    UpdateConnectionTypeHistograms(CONNECTION_ANY);
677}
678
679void TCPClientSocketWin::DoReadCallback(int rv) {
680  DCHECK_NE(rv, ERR_IO_PENDING);
681  DCHECK(read_callback_);
682
683  static StatsCounter read_bytes("tcp.read_bytes");
684  read_bytes.Add(rv);
685
686  // since Run may result in Read being called, clear read_callback_ up front.
687  CompletionCallback* c = read_callback_;
688  read_callback_ = NULL;
689  c->Run(rv);
690}
691
692void TCPClientSocketWin::DoWriteCallback(int rv) {
693  DCHECK_NE(rv, ERR_IO_PENDING);
694  DCHECK(write_callback_);
695
696  static StatsCounter write_bytes("tcp.write_bytes");
697  write_bytes.Add(rv);
698
699  // since Run may result in Write being called, clear write_callback_ up front.
700  CompletionCallback* c = write_callback_;
701  write_callback_ = NULL;
702  c->Run(rv);
703}
704
705void TCPClientSocketWin::DidCompleteConnect() {
706  DCHECK_EQ(next_connect_state_, CONNECT_STATE_CONNECT_COMPLETE);
707  int result;
708
709  WSANETWORKEVENTS events;
710  int rv = WSAEnumNetworkEvents(socket_, core_->read_overlapped_.hEvent,
711                                &events);
712  int os_error = 0;
713  if (rv == SOCKET_ERROR) {
714    NOTREACHED();
715    os_error = WSAGetLastError();
716    result = MapWinsockError(os_error);
717  } else if (events.lNetworkEvents & FD_CONNECT) {
718    os_error = events.iErrorCode[FD_CONNECT_BIT];
719    result = MapConnectError(os_error);
720  } else {
721    NOTREACHED();
722    result = ERR_UNEXPECTED;
723  }
724
725  connect_os_error_ = os_error;
726  rv = DoConnectLoop(result);
727  if (rv != ERR_IO_PENDING) {
728    LogConnectCompletion(rv);
729    DoReadCallback(rv);
730  }
731}
732
733void TCPClientSocketWin::DidCompleteRead() {
734  DCHECK(waiting_read_);
735  DWORD num_bytes, flags;
736  BOOL ok = WSAGetOverlappedResult(socket_, &core_->read_overlapped_,
737                                   &num_bytes, FALSE, &flags);
738  WSAResetEvent(core_->read_overlapped_.hEvent);
739  waiting_read_ = false;
740  core_->read_iobuffer_ = NULL;
741  if (ok) {
742    net_log_.AddEvent(NetLog::TYPE_SOCKET_BYTES_RECEIVED,
743                      new NetLogIntegerParameter("num_bytes", num_bytes));
744  }
745  DoReadCallback(ok ? num_bytes : MapWinsockError(WSAGetLastError()));
746}
747
748void TCPClientSocketWin::DidCompleteWrite() {
749  DCHECK(waiting_write_);
750
751  DWORD num_bytes, flags;
752  BOOL ok = WSAGetOverlappedResult(socket_, &core_->write_overlapped_,
753                                   &num_bytes, FALSE, &flags);
754  WSAResetEvent(core_->write_overlapped_.hEvent);
755  waiting_write_ = false;
756  int rv;
757  if (!ok) {
758    rv = MapWinsockError(WSAGetLastError());
759  } else {
760    rv = static_cast<int>(num_bytes);
761    if (rv > core_->write_buffer_length_ || rv < 0) {
762      // It seems that some winsock interceptors report that more was written
763      // than was available. Treat this as an error.  http://crbug.com/27870
764      LOG(ERROR) << "Detected broken LSP: Asked to write "
765                 << core_->write_buffer_length_ << " bytes, but " << rv
766                 << " bytes reported.";
767      rv = ERR_WINSOCK_UNEXPECTED_WRITTEN_BYTES;
768    } else {
769      net_log_.AddEvent(NetLog::TYPE_SOCKET_BYTES_SENT,
770                        new NetLogIntegerParameter("num_bytes", rv));
771    }
772  }
773  core_->write_iobuffer_ = NULL;
774  DoWriteCallback(rv);
775}
776
777}  // namespace net
778