diff options
author | rtenneti@chromium.org <rtenneti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-05-11 01:48:54 +0000 |
---|---|---|
committer | rtenneti@chromium.org <rtenneti@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2011-05-11 01:48:54 +0000 |
commit | 189163ee560220770ed5a119f6cfb9244c50c39d (patch) | |
tree | c24d6fce3397470234a2e22a9deae0243f83b5a8 /net/base/host_resolver_impl.cc | |
parent | aae2282336aed7a76bc30fa41fc746f13e941a63 (diff) | |
download | chromium_src-189163ee560220770ed5a119f6cfb9244c50c39d.zip chromium_src-189163ee560220770ed5a119f6cfb9244c50c39d.tar.gz chromium_src-189163ee560220770ed5a119f6cfb9244c50c39d.tar.bz2 |
Revert of 83710 which reverted 83641 - DNS Host resolver changes
with retry logic. Fix for bug Chromium cannot recover from a state
when its DNS requests have been dropped.
Whenever we try to resolve the host, we post a delayed task
to check if host resolution (OnLookupComplete) is completed
or not. If it hasn't completed, then we start a new job
to resolve for the same request.
BUG=73327
TEST=dns host resolver tests
R=eroman,jar
Review URL: http://codereview.chromium.org/6976006
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@84908 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'net/base/host_resolver_impl.cc')
-rw-r--r-- | net/base/host_resolver_impl.cc | 217 |
1 files changed, 174 insertions, 43 deletions
diff --git a/net/base/host_resolver_impl.cc b/net/base/host_resolver_impl.cc index 8c9d17f..008fd14 100644 --- a/net/base/host_resolver_impl.cc +++ b/net/base/host_resolver_impl.cc @@ -23,6 +23,7 @@ #include "base/metrics/histogram.h" #include "base/stl_util-inl.h" #include "base/string_util.h" +#include "base/task.h" #include "base/threading/worker_pool.h" #include "base/time.h" #include "base/utf_string_conversions.h" @@ -70,6 +71,7 @@ HostCache* CreateDefaultCache() { } // anonymous namespace +// static HostResolver* CreateSystemHostResolver(size_t max_concurrent_resolves, NetLog* net_log) { // Maximum of 8 concurrent resolver threads. @@ -351,8 +353,10 @@ class HostResolverImpl::Job resolver_(resolver), origin_loop_(base::MessageLoopProxy::CreateForCurrentThread()), resolver_proc_(resolver->effective_resolver_proc()), - error_(OK), - os_error_(0), + unresponsive_delay_(resolver->unresponsive_delay()), + attempt_number_(0), + completed_attempt_number_(0), + completed_attempt_error_(ERR_UNEXPECTED), had_non_speculative_request_(false), net_log_(BoundNetLog::Make(net_log, NetLog::SOURCE_HOST_RESOLVER_IMPL_JOB)) { @@ -381,23 +385,43 @@ class HostResolverImpl::Job void Start() { DCHECK(origin_loop_->BelongsToCurrentThread()); - start_time_ = base::TimeTicks::Now(); + StartLookupAttempt(); + } - // Dispatch the job to a worker thread. - if (!base::WorkerPool::PostTask(FROM_HERE, - NewRunnableMethod(this, &Job::DoLookup), true)) { + void StartLookupAttempt() { + DCHECK(origin_loop_->BelongsToCurrentThread()); + base::TimeTicks start_time = base::TimeTicks::Now(); + ++attempt_number_; + // Dispatch the lookup attempt to a worker thread. + if (!base::WorkerPool::PostTask( + FROM_HERE, + NewRunnableMethod(this, &Job::DoLookup, start_time, + attempt_number_), + true)) { NOTREACHED(); // Since we could be running within Resolve() right now, we can't just // call OnLookupComplete(). Instead we must wait until Resolve() has // returned (IO_PENDING). - error_ = ERR_UNEXPECTED; origin_loop_->PostTask( - FROM_HERE, NewRunnableMethod(this, &Job::OnLookupComplete)); + FROM_HERE, + NewRunnableMethod(this, &Job::OnLookupComplete, AddressList(), + start_time, attempt_number_, ERR_UNEXPECTED, 0)); + return; } + // Post a task to check if we get the results within a given time. + // OnCheckForComplete has the potential for starting a new attempt on a + // different worker thread if none of our outstanding attempts have + // completed yet. + origin_loop_->PostDelayedTask( + FROM_HERE, + NewRunnableMethod(this, &Job::OnCheckForComplete), + unresponsive_delay_.InMilliseconds()); } - // Cancels the current job. + // Cancels the current job. The Job will be orphaned. Any outstanding resolve + // attempts running on worker threads will continue running. Only once all the + // attempts complete will the final reference to this Job be released. void Cancel() { DCHECK(origin_loop_->BelongsToCurrentThread()); net_log_.AddEvent(NetLog::TYPE_CANCELLED, NULL); @@ -424,6 +448,11 @@ class HostResolverImpl::Job return resolver_ == NULL; } + bool was_completed() const { + DCHECK(origin_loop_->BelongsToCurrentThread()); + return completed_attempt_number_ > 0; + } + const Key& key() const { DCHECK(origin_loop_->BelongsToCurrentThread()); return key_; @@ -434,11 +463,6 @@ class HostResolverImpl::Job return id_; } - base::TimeTicks start_time() const { - DCHECK(origin_loop_->BelongsToCurrentThread()); - return start_time_; - } - const RequestsList& requests() const { DCHECK(origin_loop_->BelongsToCurrentThread()); return requests_; @@ -468,39 +492,92 @@ class HostResolverImpl::Job // WARNING: This code runs inside a worker pool. The shutdown code cannot // wait for it to finish, so we must be very careful here about using other // objects (like MessageLoops, Singletons, etc). During shutdown these objects - // may no longer exist. - void DoLookup() { + // may no longer exist. Multiple DoLookups() could be running in parallel, so + // any state inside of |this| must not mutate . + void DoLookup(const base::TimeTicks& start_time, + const uint32 attempt_number) { + AddressList results; + int os_error = 0; // Running on the worker thread - error_ = ResolveAddrInfo(resolver_proc_, - key_.hostname, - key_.address_family, - key_.host_resolver_flags, - &results_, - &os_error_); + int error = ResolveAddrInfo(resolver_proc_, + key_.hostname, + key_.address_family, + key_.host_resolver_flags, + &results, + &os_error); + + origin_loop_->PostTask( + FROM_HERE, + NewRunnableMethod(this, &Job::OnLookupComplete, results, start_time, + attempt_number, error, os_error)); + } + + // Callback to see if DoLookup() has finished or not (runs on origin thread). + void OnCheckForComplete() { + DCHECK(origin_loop_->BelongsToCurrentThread()); + + if (was_cancelled() || was_completed()) + return; + + DCHECK(resolver_); + base::TimeDelta unresponsive_delay = + unresponsive_delay_ * resolver_->retry_factor(); + if (unresponsive_delay >= resolver_->maximum_unresponsive_delay()) + return; - origin_loop_->PostTask(FROM_HERE, - NewRunnableMethod(this, &Job::OnLookupComplete)); + unresponsive_delay_ = unresponsive_delay; + StartLookupAttempt(); } // Callback for when DoLookup() completes (runs on origin thread). - void OnLookupComplete() { + void OnLookupComplete(const AddressList& results, + const base::TimeTicks& start_time, + const uint32 attempt_number, + int error, + const int os_error) { DCHECK(origin_loop_->BelongsToCurrentThread()); - DCHECK(error_ || results_.head()); + DCHECK(error || results.head()); + + bool was_retry_attempt = attempt_number > 1; + + if (!was_cancelled()) { + // If host is already resolved, then record data and return. + if (was_completed()) { + // If this is the first attempt that is finishing later, then record + // data for the first attempt. Won't contaminate with retry attempt's + // data. + if (!was_retry_attempt) + RecordPerformanceHistograms(start_time, error, os_error); + + RecordAttemptHistograms(start_time, attempt_number, error, os_error); + return; + } + + // Copy the results from the first worker thread that resolves the host. + results_ = results; + completed_attempt_number_ = attempt_number; + completed_attempt_error_ = error; + } // Ideally the following code would be part of host_resolver_proc.cc, // however it isn't safe to call NetworkChangeNotifier from worker // threads. So we do it here on the IO thread instead. - if (error_ != OK && NetworkChangeNotifier::IsOffline()) - error_ = ERR_INTERNET_DISCONNECTED; + if (error != OK && NetworkChangeNotifier::IsOffline()) + error = ERR_INTERNET_DISCONNECTED; + + // We will record data for the first attempt. Don't contaminate with retry + // attempt's data. + if (!was_retry_attempt) + RecordPerformanceHistograms(start_time, error, os_error); - RecordPerformanceHistograms(); + RecordAttemptHistograms(start_time, attempt_number, error, os_error); if (was_cancelled()) return; scoped_refptr<NetLog::EventParameters> params; - if (error_ != OK) { - params = new HostResolveFailedParams(error_, os_error_); + if (error != OK) { + params = new HostResolveFailedParams(error, os_error); } else { params = new AddressListNetLogParam(results_); } @@ -512,13 +589,15 @@ class HostResolverImpl::Job DCHECK(!requests_.empty()); // Use the port number of the first request. - if (error_ == OK) + if (error == OK) results_.SetPort(requests_[0]->port()); - resolver_->OnJobComplete(this, error_, os_error_, results_); + resolver_->OnJobComplete(this, error, os_error, results_); } - void RecordPerformanceHistograms() const { + void RecordPerformanceHistograms(const base::TimeTicks& start_time, + const int error, + const int os_error) const { DCHECK(origin_loop_->BelongsToCurrentThread()); enum Category { // Used in HISTOGRAM_ENUMERATION. RESOLVE_SUCCESS, @@ -529,8 +608,8 @@ class HostResolverImpl::Job }; int category = RESOLVE_MAX; // Illegal value for later DCHECK only. - base::TimeDelta duration = base::TimeTicks::Now() - start_time_; - if (error_ == OK) { + base::TimeDelta duration = base::TimeTicks::Now() - start_time; + if (error == OK) { if (had_non_speculative_request_) { category = RESOLVE_SUCCESS; DNS_HISTOGRAM("DNS.ResolveSuccess", duration); @@ -547,7 +626,7 @@ class HostResolverImpl::Job DNS_HISTOGRAM("DNS.ResolveSpeculativeFail", duration); } UMA_HISTOGRAM_CUSTOM_ENUMERATION(kOSErrorsForGetAddrinfoHistogramName, - std::abs(os_error_), + std::abs(os_error), GetAllGetAddrinfoOSErrors()); } DCHECK_LT(category, static_cast<int>(RESOLVE_MAX)); // Be sure it was set. @@ -578,7 +657,47 @@ class HostResolverImpl::Job } } + void RecordAttemptHistograms(const base::TimeTicks& start_time, + const uint32 attempt_number, + const int error, + const int os_error) const { + bool first_attempt_to_complete = + completed_attempt_number_ == attempt_number; + if (first_attempt_to_complete) { + // If this was first attempt to complete, then record the resolution + // status of the attempt. + if (completed_attempt_error_ == OK) { + UMA_HISTOGRAM_ENUMERATION( + "DNS.AttemptFirstSuccess", attempt_number, 100); + } else { + UMA_HISTOGRAM_ENUMERATION( + "DNS.AttemptFirstFailure", attempt_number, 100); + } + } + + if (error == OK) + UMA_HISTOGRAM_ENUMERATION("DNS.AttemptSuccess", attempt_number, 100); + else + UMA_HISTOGRAM_ENUMERATION("DNS.AttemptFailure", attempt_number, 100); + + if (was_cancelled() || !first_attempt_to_complete) { + // Count those attempts which completed after the job was already canceled + // OR after the job was already completed by an earlier attempt (so in + // effect). + UMA_HISTOGRAM_ENUMERATION("DNS.AttemptDiscarded", attempt_number, 100); + + // Record if job is cancelled. + if (was_cancelled()) + UMA_HISTOGRAM_ENUMERATION("DNS.AttemptCancelled", attempt_number, 100); + } + + base::TimeDelta duration = base::TimeTicks::Now() - start_time; + if (error == OK) + DNS_HISTOGRAM("DNS.AttemptSuccessDuration", duration); + else + DNS_HISTOGRAM("DNS.AttemptFailDuration", duration); + } // Immutable. Can be read from either thread, const int id_; @@ -599,9 +718,21 @@ class HostResolverImpl::Job // reference ensures that it remains valid until we are done. scoped_refptr<HostResolverProc> resolver_proc_; - // Assigned on the worker thread, read on the origin thread. - int error_; - int os_error_; + // The amount of time after starting a resolution attempt until deciding to + // retry. + base::TimeDelta unresponsive_delay_; + + // Keeps track of the number of attempts we have made so far to resolve the + // host. Whenever we start an attempt to resolve the host, we increase this + // number. + uint32 attempt_number_; + + // The index of the attempt which finished first (or 0 if the job is still in + // progress). + uint32 completed_attempt_number_; + + // The result (a net error code) from the first attempt to complete. + int completed_attempt_error_; // True if a non-speculative request was ever attached to this job // (regardless of whether or not it was later cancelled. @@ -611,9 +742,6 @@ class HostResolverImpl::Job AddressList results_; - // The time when the job was started. - base::TimeTicks start_time_; - BoundNetLog net_log_; DISALLOW_COPY_AND_ASSIGN(Job); @@ -862,6 +990,9 @@ HostResolverImpl::HostResolverImpl( NetLog* net_log) : cache_(cache), max_jobs_(max_jobs), + unresponsive_delay_(base::TimeDelta::FromMilliseconds(6000)), + retry_factor_(2), + maximum_unresponsive_delay_(base::TimeDelta::FromMilliseconds(60000)), next_request_id_(0), next_job_id_(0), resolver_proc_(resolver_proc), |