summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authormef@chromium.org <mef@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-06-07 20:39:38 +0000
committermef@chromium.org <mef@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98>2013-06-07 20:39:38 +0000
commita6c84f4dd8a40702b7f7e9edd8204bbdb4e52f8f (patch)
tree083f1c314e43c2e0ec5d1e743c0cf0bb92325efb
parente9f54210328016e46b618002eb12770b895d1875 (diff)
downloadchromium_src-a6c84f4dd8a40702b7f7e9edd8204bbdb4e52f8f.zip
chromium_src-a6c84f4dd8a40702b7f7e9edd8204bbdb4e52f8f.tar.gz
chromium_src-a6c84f4dd8a40702b7f7e9edd8204bbdb4e52f8f.tar.bz2
Track DNS Server Success and Failure time and use it in next server selection.
BUG=110197 Review URL: https://chromiumcodereview.appspot.com/16357018 git-svn-id: svn://svn.chromium.org/chrome/trunk/src@204915 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r--net/dns/dns_session.cc150
-rw-r--r--net/dns/dns_session.h36
-rw-r--r--net/dns/dns_transaction.cc63
3 files changed, 185 insertions, 64 deletions
diff --git a/net/dns/dns_session.cc b/net/dns/dns_session.cc
index 6d8d0db..2e6d0c9 100644
--- a/net/dns/dns_session.cc
+++ b/net/dns/dns_session.cc
@@ -6,6 +6,7 @@
#include "base/basictypes.h"
#include "base/bind.h"
+#include "base/lazy_instance.h"
#include "base/metrics/histogram.h"
#include "base/metrics/sample_vector.h"
#include "base/rand_util.h"
@@ -32,6 +33,40 @@ const size_t kRTTBucketCount = 100;
const unsigned kRTOPercentile = 99;
} // namespace
+// Runtime statistics of DNS server.
+struct DnsSession::ServerStats {
+ ServerStats(base::TimeDelta rtt_estimate_param, RttBuckets* buckets)
+ : last_failure_count(0), rtt_estimate(rtt_estimate_param) {
+ rtt_histogram.reset(new base::SampleVector(buckets));
+ }
+
+ // Count of consecutive failures after last success.
+ int last_failure_count;
+
+ // Last time when server returned failure or timeout.
+ base::Time last_failure;
+ // Last time when server returned success.
+ base::Time last_success;
+
+ // Estimated RTT using moving average.
+ base::TimeDelta rtt_estimate;
+ // Estimated error in the above.
+ base::TimeDelta rtt_deviation;
+
+ // A histogram of observed RTT .
+ scoped_ptr<base::SampleVector> rtt_histogram;
+
+ DISALLOW_COPY_AND_ASSIGN(ServerStats);
+};
+
+// static
+base::LazyInstance<DnsSession::RttBuckets>::Leaky DnsSession::rtt_buckets_ =
+ LAZY_INSTANCE_INITIALIZER;
+
+DnsSession::RttBuckets::RttBuckets() : base::BucketRanges(kRTTBucketCount + 1) {
+ base::Histogram::InitializeBucketRanges(1, 5000, kRTTBucketCount, this);
+}
+
DnsSession::SocketLease::SocketLease(scoped_refptr<DnsSession> session,
unsigned server_index,
scoped_ptr<DatagramClientSocket> socket)
@@ -49,33 +84,79 @@ DnsSession::DnsSession(const DnsConfig& config,
socket_pool_(socket_pool.Pass()),
rand_callback_(base::Bind(rand_int_callback, 0, kuint16max)),
net_log_(net_log),
- server_index_(0),
- rtt_estimates_(config_.nameservers.size(), config_.timeout),
- rtt_deviations_(config_.nameservers.size()),
- rtt_buckets_(new base::BucketRanges(kRTTBucketCount + 1)) {
+ server_index_(0) {
socket_pool_->Initialize(&config_.nameservers, net_log);
-
- // TODO(mef): This could be done once per process lifetime.
- base::Histogram::InitializeBucketRanges(1, 5000, kRTTBucketCount,
- rtt_buckets_.get());
+ UMA_HISTOGRAM_CUSTOM_COUNTS(
+ "AsyncDNS.ServerCount", config_.nameservers.size(), 0, 10, 10);
for (size_t i = 0; i < config_.nameservers.size(); ++i) {
- rtt_histograms_.push_back(new base::SampleVector(rtt_buckets_.get()));
+ server_stats_.push_back(new ServerStats(config_.timeout,
+ rtt_buckets_.Pointer()));
}
}
-DnsSession::~DnsSession() {}
+DnsSession::~DnsSession() {
+ RecordServerStats();
+}
int DnsSession::NextQueryId() const { return rand_callback_.Run(); }
-int DnsSession::NextFirstServerIndex() {
- int index = server_index_;
+unsigned DnsSession::NextFirstServerIndex() {
+ unsigned index = NextGoodServerIndex(server_index_);
if (config_.rotate)
server_index_ = (server_index_ + 1) % config_.nameservers.size();
return index;
}
+unsigned DnsSession::NextGoodServerIndex(unsigned server_index) {
+ unsigned index = server_index;
+ base::Time oldest_server_failure(base::Time::Now());
+ unsigned oldest_server_failure_index = 0;
+
+ UMA_HISTOGRAM_BOOLEAN("AsyncDNS.ServerIsGood",
+ server_stats_[server_index]->last_failure.is_null());
+
+ do {
+ base::Time cur_server_failure = server_stats_[index]->last_failure;
+ // If number of failures on this server doesn't exceed number of allowed
+ // attempts, return its index.
+ if (server_stats_[server_index]->last_failure_count < config_.attempts) {
+ return index;
+ }
+ // Track oldest failed server.
+ if (cur_server_failure < oldest_server_failure) {
+ oldest_server_failure = cur_server_failure;
+ oldest_server_failure_index = index;
+ }
+ index = (index + 1) % config_.nameservers.size();
+ } while (index != server_index);
+
+ // If we are here it means that there are no successful servers, so we have
+ // to use one that has failed oldest.
+ return oldest_server_failure_index;
+}
+
+void DnsSession::RecordServerFailure(unsigned server_index) {
+ UMA_HISTOGRAM_CUSTOM_COUNTS(
+ "AsyncDNS.ServerFailureIndex", server_index, 0, 10, 10);
+ ++(server_stats_[server_index]->last_failure_count);
+ server_stats_[server_index]->last_failure = base::Time::Now();
+}
+
+void DnsSession::RecordServerSuccess(unsigned server_index) {
+ if (server_stats_[server_index]->last_success.is_null()) {
+ UMA_HISTOGRAM_COUNTS_100("AsyncDNS.ServerFailuresAfterNetworkChange",
+ server_stats_[server_index]->last_failure_count);
+ } else {
+ UMA_HISTOGRAM_COUNTS_100("AsyncDNS.ServerFailuresBeforeSuccess",
+ server_stats_[server_index]->last_failure_count);
+ }
+ server_stats_[server_index]->last_failure_count = 0;
+ server_stats_[server_index]->last_failure = base::Time();
+ server_stats_[server_index]->last_success = base::Time::Now();
+}
+
void DnsSession::RecordRTT(unsigned server_index, base::TimeDelta rtt) {
- DCHECK_LT(server_index, rtt_histograms_.size());
+ DCHECK_LT(server_index, server_stats_.size());
// For measurement, assume it is the first attempt (no backoff).
base::TimeDelta timeout_jacobson = NextTimeoutFromJacobson(server_index, 0);
@@ -90,8 +171,8 @@ void DnsSession::RecordRTT(unsigned server_index, base::TimeDelta rtt) {
// Jacobson/Karels algorithm for TCP.
// Using parameters: alpha = 1/8, delta = 1/4, beta = 4
- base::TimeDelta& estimate = rtt_estimates_[server_index];
- base::TimeDelta& deviation = rtt_deviations_[server_index];
+ base::TimeDelta& estimate = server_stats_[server_index]->rtt_estimate;
+ base::TimeDelta& deviation = server_stats_[server_index]->rtt_deviation;
base::TimeDelta current_error = rtt - estimate;
estimate += current_error / 8; // * alpha
base::TimeDelta abs_error = base::TimeDelta::FromInternalValue(
@@ -99,7 +180,8 @@ void DnsSession::RecordRTT(unsigned server_index, base::TimeDelta rtt) {
deviation += (abs_error - deviation) / 4; // * delta
// Histogram-based method.
- rtt_histograms_[server_index]->Accumulate(rtt.InMilliseconds(), 1);
+ server_stats_[server_index]->rtt_histogram
+ ->Accumulate(rtt.InMilliseconds(), 1);
}
void DnsSession::RecordLostPacket(unsigned server_index, int attempt) {
@@ -111,10 +193,28 @@ void DnsSession::RecordLostPacket(unsigned server_index, int attempt) {
UMA_HISTOGRAM_TIMES("AsyncDNS.TimeoutSpentHistogram", timeout_histogram);
}
+void DnsSession::RecordServerStats() {
+ for (size_t index = 0; index < server_stats_.size(); ++index) {
+ if (server_stats_[index]->last_failure_count) {
+ if (server_stats_[index]->last_success.is_null()) {
+ UMA_HISTOGRAM_COUNTS("AsyncDNS.ServerFailuresWithoutSuccess",
+ server_stats_[index]->last_failure_count);
+ } else {
+ UMA_HISTOGRAM_COUNTS("AsyncDNS.ServerFailuresAfterSuccess",
+ server_stats_[index]->last_failure_count);
+ }
+ }
+ }
+}
+
+
base::TimeDelta DnsSession::NextTimeout(unsigned server_index, int attempt) {
- DCHECK_LT(server_index, rtt_histograms_.size());
+ DCHECK_LT(server_index, server_stats_.size());
base::TimeDelta timeout = config_.timeout;
+ // If this server has not responded successfully, then don't wait too long.
+ if (server_stats_[server_index]->last_success.is_null())
+ return timeout;
// The timeout doubles every full round (each nameserver once).
unsigned num_backoffs = attempt / config_.nameservers.size();
@@ -155,10 +255,10 @@ void DnsSession::FreeSocket(unsigned server_index,
base::TimeDelta DnsSession::NextTimeoutFromJacobson(unsigned server_index,
int attempt) {
- DCHECK_LT(server_index, rtt_estimates_.size());
+ DCHECK_LT(server_index, server_stats_.size());
- base::TimeDelta timeout =
- rtt_estimates_[server_index] + 4 * rtt_deviations_[server_index];
+ base::TimeDelta timeout = server_stats_[server_index]->rtt_estimate +
+ 4 * server_stats_[server_index]->rtt_deviation;
timeout = std::max(timeout, base::TimeDelta::FromMilliseconds(kMinTimeoutMs));
@@ -171,23 +271,25 @@ base::TimeDelta DnsSession::NextTimeoutFromJacobson(unsigned server_index,
base::TimeDelta DnsSession::NextTimeoutFromHistogram(unsigned server_index,
int attempt) {
- DCHECK_LT(server_index, rtt_histograms_.size());
+ DCHECK_LT(server_index, server_stats_.size());
COMPILE_ASSERT(std::numeric_limits<base::HistogramBase::Count>::is_signed,
histogram_base_count_assumed_to_be_signed);
// Use fixed percentile of observed samples.
- const base::SampleVector& samples = *rtt_histograms_[server_index];
+ const base::SampleVector& samples =
+ *server_stats_[server_index]->rtt_histogram;
+
base::HistogramBase::Count total = samples.TotalCount();
base::HistogramBase::Count remaining_count = kRTOPercentile * total / 100;
size_t index = 0;
- while (remaining_count > 0 && index < rtt_buckets_->size()) {
+ while (remaining_count > 0 && index < rtt_buckets_.Get().size()) {
remaining_count -= samples.GetCountAtIndex(index);
++index;
}
base::TimeDelta timeout =
- base::TimeDelta::FromMilliseconds(rtt_buckets_->range(index));
+ base::TimeDelta::FromMilliseconds(rtt_buckets_.Get().range(index));
timeout = std::max(timeout, base::TimeDelta::FromMilliseconds(kMinTimeoutMs));
diff --git a/net/dns/dns_session.h b/net/dns/dns_session.h
index e35ddb0..bced1e6 100644
--- a/net/dns/dns_session.h
+++ b/net/dns/dns_session.h
@@ -7,9 +7,11 @@
#include <vector>
+#include "base/lazy_instance.h"
#include "base/memory/ref_counted.h"
#include "base/memory/scoped_ptr.h"
#include "base/memory/scoped_vector.h"
+#include "base/metrics/bucket_ranges.h"
#include "base/time.h"
#include "net/base/net_export.h"
#include "net/base/rand_callback.h"
@@ -67,7 +69,19 @@ class NET_EXPORT_PRIVATE DnsSession
int NextQueryId() const;
// Return the index of the first configured server to use on first attempt.
- int NextFirstServerIndex();
+ unsigned NextFirstServerIndex();
+
+ // Start with |server_index| and find the index of the next known good server
+ // to use on this attempt. Returns |server_index| if this server has no
+ // recorded failures, or if there are no other servers that have not failed
+ // or have failed longer time ago.
+ unsigned NextGoodServerIndex(unsigned server_index);
+
+ // Record that server failed to respond (due to SRV_FAIL or timeout).
+ void RecordServerFailure(unsigned server_index);
+
+ // Record that server responded successfully.
+ void RecordServerSuccess(unsigned server_index);
// Record how long it took to receive a response from the server.
void RecordRTT(unsigned server_index, base::TimeDelta rtt);
@@ -75,6 +89,9 @@ class NET_EXPORT_PRIVATE DnsSession
// Record suspected loss of a packet for a specific server.
void RecordLostPacket(unsigned server_index, int attempt);
+ // Record server stats before it is destroyed.
+ void RecordServerStats();
+
// Return the timeout for the next query. |attempt| counts from 0 and is used
// for exponential backoff.
base::TimeDelta NextTimeout(unsigned server_index, int attempt);
@@ -111,15 +128,16 @@ class NET_EXPORT_PRIVATE DnsSession
// Current index into |config_.nameservers| to begin resolution with.
int server_index_;
- // Estimated RTT for each name server using moving average.
- std::vector<base::TimeDelta> rtt_estimates_;
- // Estimated error in the above for each name server.
- std::vector<base::TimeDelta> rtt_deviations_;
+ struct ServerStats;
+
+ // Track runtime statistics of each DNS server.
+ ScopedVector<ServerStats> server_stats_;
- // Buckets shared for all |rtt_histograms_|.
- scoped_ptr<base::BucketRanges> rtt_buckets_;
- // A histogram of observed RTT for each name server.
- ScopedVector<base::SampleVector> rtt_histograms_;
+ // Buckets shared for all |ServerStats::rtt_histogram|.
+ struct RttBuckets : public base::BucketRanges {
+ RttBuckets();
+ };
+ static base::LazyInstance<RttBuckets>::Leaky rtt_buckets_;
DISALLOW_COPY_AND_ASSIGN(DnsSession);
};
diff --git a/net/dns/dns_transaction.cc b/net/dns/dns_transaction.cc
index 8001b0a..fef5f26 100644
--- a/net/dns/dns_transaction.cc
+++ b/net/dns/dns_transaction.cc
@@ -73,7 +73,8 @@ Value* NetLogStartCallback(const std::string* hostname,
// matches. Logging is done in the socket and in the outer DnsTransaction.
class DnsAttempt {
public:
- DnsAttempt() : result_(ERR_FAILED) {}
+ explicit DnsAttempt(unsigned server_index)
+ : result_(ERR_FAILED), server_index_(server_index) {}
virtual ~DnsAttempt() {}
// Starts the attempt. Returns ERR_IO_PENDING if cannot complete synchronously
@@ -91,7 +92,7 @@ class DnsAttempt {
virtual const BoundNetLog& GetSocketNetLog() const = 0;
// Returns the index of the destination server within DnsConfig::nameservers.
- virtual unsigned GetServerIndex() const = 0;
+ unsigned server_index() const { return server_index_; }
// Returns a Value representing the received response, along with a reference
// to the NetLog source source of the UDP socket used. The request must have
@@ -124,17 +125,20 @@ class DnsAttempt {
private:
// Result of last operation.
int result_;
+
+ const unsigned server_index_;
};
class DnsUDPAttempt : public DnsAttempt {
public:
- DnsUDPAttempt(scoped_ptr<DnsSession::SocketLease> socket_lease,
+ DnsUDPAttempt(unsigned server_index,
+ scoped_ptr<DnsSession::SocketLease> socket_lease,
scoped_ptr<DnsQuery> query)
- : next_state_(STATE_NONE),
+ : DnsAttempt(server_index),
+ next_state_(STATE_NONE),
received_malformed_response_(false),
socket_lease_(socket_lease.Pass()),
- query_(query.Pass()) {
- }
+ query_(query.Pass()) {}
// DnsAttempt:
virtual int Start(const CompletionCallback& callback) OVERRIDE {
@@ -158,10 +162,6 @@ class DnsUDPAttempt : public DnsAttempt {
return socket_lease_->socket()->NetLog();
}
- virtual unsigned GetServerIndex() const OVERRIDE {
- return socket_lease_->server_index();
- }
-
private:
enum State {
STATE_SEND_QUERY,
@@ -298,13 +298,12 @@ class DnsTCPAttempt : public DnsAttempt {
DnsTCPAttempt(unsigned server_index,
scoped_ptr<StreamSocket> socket,
scoped_ptr<DnsQuery> query)
- : next_state_(STATE_NONE),
- server_index_(server_index),
+ : DnsAttempt(server_index),
+ next_state_(STATE_NONE),
socket_(socket.Pass()),
query_(query.Pass()),
length_buffer_(new IOBufferWithSize(sizeof(uint16))),
- response_length_(0) {
- }
+ response_length_(0) {}
// DnsAttempt:
virtual int Start(const CompletionCallback& callback) OVERRIDE {
@@ -334,10 +333,6 @@ class DnsTCPAttempt : public DnsAttempt {
return socket_->NetLog();
}
- virtual unsigned GetServerIndex() const OVERRIDE {
- return server_index_;
- }
-
private:
enum State {
STATE_CONNECT_COMPLETE,
@@ -497,7 +492,6 @@ class DnsTCPAttempt : public DnsAttempt {
State next_state_;
base::TimeTicks start_time_;
- unsigned server_index_;
scoped_ptr<StreamSocket> socket_;
scoped_ptr<DnsQuery> query_;
scoped_refptr<IOBufferWithSize> length_buffer_;
@@ -685,13 +679,16 @@ class DnsTransactionImpl : public DnsTransaction,
unsigned server_index =
(first_server_index_ + attempt_number) % config.nameservers.size();
+ // Skip over known failed servers.
+ server_index = session_->NextGoodServerIndex(server_index);
scoped_ptr<DnsSession::SocketLease> lease =
session_->AllocateSocket(server_index, net_log_.source());
bool got_socket = !!lease.get();
- DnsUDPAttempt* attempt = new DnsUDPAttempt(lease.Pass(), query.Pass());
+ DnsUDPAttempt* attempt =
+ new DnsUDPAttempt(server_index, lease.Pass(), query.Pass());
attempts_.push_back(attempt);
@@ -718,7 +715,7 @@ class DnsTransactionImpl : public DnsTransaction,
DCHECK(previous_attempt);
DCHECK(!had_tcp_attempt_);
- unsigned server_index = previous_attempt->GetServerIndex();
+ unsigned server_index = previous_attempt->server_index();
scoped_ptr<StreamSocket> socket(
session_->CreateTCPSocket(server_index, net_log_.source()));
@@ -774,7 +771,7 @@ class DnsTransactionImpl : public DnsTransaction,
DCHECK_LT(attempt_number, attempts_.size());
const DnsAttempt* attempt = attempts_[attempt_number];
if (attempt->GetResponse()) {
- session_->RecordRTT(attempt->GetServerIndex(),
+ session_->RecordRTT(attempt->server_index(),
base::TimeTicks::Now() - start);
}
OnAttemptComplete(attempt_number, rv);
@@ -805,15 +802,14 @@ class DnsTransactionImpl : public DnsTransaction,
return;
size_t num_servers = session_->config().nameservers.size();
+ std::vector<int> server_attempts(num_servers);
for (size_t i = 0; i < first_completed; ++i) {
+ unsigned server_index = attempts_[i]->server_index();
+ int server_attempt = server_attempts[server_index]++;
// Don't record lost packet unless attempt is in pending state.
if (!attempts_[i]->is_pending())
continue;
- unsigned server_index = attempts_[i]->GetServerIndex();
- // Servers are rotated in round robin, so server round can be calculated
- // from attempt index.
- size_t server_round = i / num_servers;
- session_->RecordLostPacket(server_index, static_cast<int>(server_round));
+ session_->RecordLostPacket(server_index, server_attempt);
}
}
@@ -841,14 +837,16 @@ class DnsTransactionImpl : public DnsTransaction,
switch (result.rv) {
case OK:
- net_log_.EndEventWithNetErrorCode(
- NetLog::TYPE_DNS_TRANSACTION_QUERY, result.rv);
+ session_->RecordServerSuccess(result.attempt->server_index());
+ net_log_.EndEventWithNetErrorCode(NetLog::TYPE_DNS_TRANSACTION_QUERY,
+ result.rv);
DCHECK(result.attempt);
DCHECK(result.attempt->GetResponse());
return result;
case ERR_NAME_NOT_RESOLVED:
- net_log_.EndEventWithNetErrorCode(
- NetLog::TYPE_DNS_TRANSACTION_QUERY, result.rv);
+ session_->RecordServerSuccess(result.attempt->server_index());
+ net_log_.EndEventWithNetErrorCode(NetLog::TYPE_DNS_TRANSACTION_QUERY,
+ result.rv);
// Try next suffix.
qnames_.pop_front();
if (qnames_.empty()) {
@@ -859,6 +857,8 @@ class DnsTransactionImpl : public DnsTransaction,
break;
case ERR_CONNECTION_REFUSED:
case ERR_DNS_TIMED_OUT:
+ if (result.attempt)
+ session_->RecordServerFailure(result.attempt->server_index());
if (MoreAttemptsAllowed()) {
result = MakeAttempt();
} else {
@@ -873,6 +873,7 @@ class DnsTransactionImpl : public DnsTransaction,
DCHECK(result.attempt);
if (result.attempt != attempts_.back()) {
// This attempt already timed out. Ignore it.
+ session_->RecordServerFailure(result.attempt->server_index());
return AttemptResult(ERR_IO_PENDING, NULL);
}
if (MoreAttemptsAllowed()) {