1//
2// Copyright (C) 2013 The Android Open Source Project
3//
4// Licensed under the Apache License, Version 2.0 (the "License");
5// you may not use this file except in compliance with the License.
6// You may obtain a copy of the License at
7//
8//      http://www.apache.org/licenses/LICENSE-2.0
9//
10// Unless required by applicable law or agreed to in writing, software
11// distributed under the License is distributed on an "AS IS" BASIS,
12// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13// See the License for the specific language governing permissions and
14// limitations under the License.
15//
16
17#include "shill/traffic_monitor.h"
18
19#include <base/bind.h>
20#include <base/strings/stringprintf.h>
21#include <netinet/in.h>
22
23#include "shill/device.h"
24#include "shill/device_info.h"
25#include "shill/event_dispatcher.h"
26#include "shill/logging.h"
27#include "shill/socket_info_reader.h"
28
29using base::StringPrintf;
30using std::string;
31using std::vector;
32
33namespace shill {
34
35namespace Logging {
36static auto kModuleLogScope = ScopeLogger::kLink;
37static string ObjectID(Device* d) { return d->link_name(); }
38}
39
40// static
41const uint16_t TrafficMonitor::kDnsPort = 53;
42const int64_t TrafficMonitor::kDnsTimedOutThresholdSeconds = 15;
43const int TrafficMonitor::kMinimumFailedSamplesToTrigger = 2;
44const int64_t TrafficMonitor::kSamplingIntervalMilliseconds = 5000;
45
46TrafficMonitor::TrafficMonitor(const DeviceRefPtr& device,
47                               EventDispatcher* dispatcher)
48    : device_(device),
49      dispatcher_(dispatcher),
50      socket_info_reader_(new SocketInfoReader),
51      accummulated_congested_tx_queues_samples_(0),
52      connection_info_reader_(new ConnectionInfoReader),
53      accummulated_dns_failures_samples_(0) {
54}
55
56TrafficMonitor::~TrafficMonitor() {
57  Stop();
58}
59
60void TrafficMonitor::Start() {
61  SLOG(device_.get(), 2) << __func__;
62  Stop();
63
64  sample_traffic_callback_.Reset(base::Bind(&TrafficMonitor::SampleTraffic,
65                                            base::Unretained(this)));
66  dispatcher_->PostDelayedTask(sample_traffic_callback_.callback(),
67                               kSamplingIntervalMilliseconds);
68}
69
70void TrafficMonitor::Stop() {
71  SLOG(device_.get(), 2) << __func__;
72  sample_traffic_callback_.Cancel();
73  ResetCongestedTxQueuesStats();
74  ResetDnsFailingStats();
75}
76
77void TrafficMonitor::ResetCongestedTxQueuesStats() {
78  accummulated_congested_tx_queues_samples_ = 0;
79}
80
81void TrafficMonitor::ResetCongestedTxQueuesStatsWithLogging() {
82  SLOG(device_.get(), 2) << __func__ << ": Tx-queues decongested";
83  ResetCongestedTxQueuesStats();
84}
85
86void TrafficMonitor::BuildIPPortToTxQueueLength(
87    const vector<SocketInfo>& socket_infos,
88    IPPortToTxQueueLengthMap* tx_queue_lengths) {
89  SLOG(device_.get(), 3) << __func__;
90  string device_ip_address = device_->ipconfig()->properties().address;
91  for (const auto& info : socket_infos) {
92    SLOG(device_.get(), 4) << "SocketInfo(IP="
93                           << info.local_ip_address().ToString()
94                           << ", TX=" << info.transmit_queue_value()
95                           << ", State=" << info.connection_state()
96                           << ", TimerState=" << info.timer_state();
97    if (info.local_ip_address().ToString() != device_ip_address ||
98        info.transmit_queue_value() == 0 ||
99        info.connection_state() != SocketInfo::kConnectionStateEstablished ||
100        (info.timer_state() != SocketInfo::kTimerStateRetransmitTimerPending &&
101         info.timer_state() !=
102            SocketInfo::kTimerStateZeroWindowProbeTimerPending)) {
103      SLOG(device_.get(), 4) << "Connection Filtered.";
104      continue;
105    }
106    SLOG(device_.get(), 3) << "Monitoring connection: TX="
107                           << info.transmit_queue_value()
108                           << " TimerState=" << info.timer_state();
109
110    string local_ip_port =
111        StringPrintf("%s:%d",
112                     info.local_ip_address().ToString().c_str(),
113                     info.local_port());
114    (*tx_queue_lengths)[local_ip_port] = info.transmit_queue_value();
115  }
116}
117
118bool TrafficMonitor::IsCongestedTxQueues() {
119  SLOG(device_.get(), 4) << __func__;
120  vector<SocketInfo> socket_infos;
121  if (!socket_info_reader_->LoadTcpSocketInfo(&socket_infos) ||
122      socket_infos.empty()) {
123    SLOG(device_.get(), 3) << __func__ << ": Empty socket info";
124    ResetCongestedTxQueuesStatsWithLogging();
125    return false;
126  }
127  bool congested_tx_queues = true;
128  IPPortToTxQueueLengthMap curr_tx_queue_lengths;
129  BuildIPPortToTxQueueLength(socket_infos, &curr_tx_queue_lengths);
130  if (curr_tx_queue_lengths.empty()) {
131    SLOG(device_.get(), 3) << __func__ << ": No interesting socket info";
132    ResetCongestedTxQueuesStatsWithLogging();
133  } else {
134    for (const auto& length_entry : old_tx_queue_lengths_) {
135      IPPortToTxQueueLengthMap::iterator curr_tx_queue_it =
136          curr_tx_queue_lengths.find(length_entry.first);
137      if (curr_tx_queue_it == curr_tx_queue_lengths.end() ||
138          curr_tx_queue_it->second < length_entry.second) {
139        congested_tx_queues = false;
140        // TODO(armansito): If we had a false positive earlier, we may
141        // want to correct it here by invoking a "connection back to normal
142        // callback", so that the OutOfCredits property can be set to
143        // false.
144        break;
145      }
146    }
147    if (congested_tx_queues) {
148      ++accummulated_congested_tx_queues_samples_;
149      SLOG(device_.get(), 2) << __func__
150                             << ": Congested tx-queues detected ("
151                             << accummulated_congested_tx_queues_samples_
152                             << ")";
153    }
154  }
155  old_tx_queue_lengths_ = curr_tx_queue_lengths;
156
157  return congested_tx_queues;
158}
159
160void TrafficMonitor::ResetDnsFailingStats() {
161  accummulated_dns_failures_samples_ = 0;
162}
163
164void TrafficMonitor::ResetDnsFailingStatsWithLogging() {
165  SLOG(device_.get(), 2) << __func__ << ": DNS queries restored";
166  ResetDnsFailingStats();
167}
168
169bool TrafficMonitor::IsDnsFailing() {
170  SLOG(device_.get(), 4) << __func__;
171  vector<ConnectionInfo> connection_infos;
172  if (!connection_info_reader_->LoadConnectionInfo(&connection_infos) ||
173      connection_infos.empty()) {
174    SLOG(device_.get(), 3) << __func__ << ": Empty connection info";
175  } else {
176    // The time-to-expire counter is used to determine when a DNS request
177    // has timed out.  This counter is the number of seconds remaining until
178    // the entry is removed from the system IP connection tracker.  The
179    // default time is 30 seconds.  This is too long of a wait.  Instead, we
180    // want to time out at |kDnsTimedOutThresholdSeconds|.  Unfortunately,
181    // we cannot simply look for entries less than
182    // |kDnsTimedOutThresholdSeconds| because we will count the entry
183    // multiple times once its time-to-expire is less than
184    // |kDnsTimedOutThresholdSeconds|.  To ensure that we only count an
185    // entry once, we look for entries in this time window between
186    // |kDnsTimedOutThresholdSeconds| and |kDnsTimedOutLowerThresholdSeconds|.
187    const int64_t kDnsTimedOutLowerThresholdSeconds =
188        kDnsTimedOutThresholdSeconds - kSamplingIntervalMilliseconds / 1000;
189    string device_ip_address = device_->ipconfig()->properties().address;
190    for (const auto& info : connection_infos) {
191      if (info.protocol() != IPPROTO_UDP ||
192          info.time_to_expire_seconds() > kDnsTimedOutThresholdSeconds ||
193          info.time_to_expire_seconds() <= kDnsTimedOutLowerThresholdSeconds ||
194          !info.is_unreplied() ||
195          info.original_source_ip_address().ToString() != device_ip_address ||
196          info.original_destination_port() != kDnsPort)
197        continue;
198
199      ++accummulated_dns_failures_samples_;
200      SLOG(device_.get(), 2) << __func__
201                             << ": DNS failures detected ("
202                             << accummulated_dns_failures_samples_ << ")";
203      return true;
204    }
205  }
206  ResetDnsFailingStatsWithLogging();
207  return false;
208}
209
210void TrafficMonitor::SampleTraffic() {
211  SLOG(device_.get(), 3) << __func__;
212
213  // Schedule the sample callback first, so it is possible for the network
214  // problem callback to stop the traffic monitor.
215  dispatcher_->PostDelayedTask(sample_traffic_callback_.callback(),
216                               kSamplingIntervalMilliseconds);
217
218  if (IsCongestedTxQueues() &&
219      accummulated_congested_tx_queues_samples_ ==
220          kMinimumFailedSamplesToTrigger) {
221    LOG(WARNING) << "Congested tx queues detected, out-of-credits?";
222    network_problem_detected_callback_.Run(kNetworkProblemCongestedTxQueue);
223  } else if (IsDnsFailing() &&
224             accummulated_dns_failures_samples_ ==
225                 kMinimumFailedSamplesToTrigger) {
226    LOG(WARNING) << "DNS queries failing, out-of-credits?";
227    network_problem_detected_callback_.Run(kNetworkProblemDNSFailure);
228  }
229}
230
231}  // namespace shill
232