diff options
author | sra@chromium.org <sra@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-06-03 23:28:14 +0000 |
---|---|---|
committer | sra@chromium.org <sra@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-06-03 23:28:14 +0000 |
commit | 8cd9696d8eff70fa52198d81672efb2b780a2f40 (patch) | |
tree | d96ab9873dd5f89e96863d19e52b55a532d57a1d | |
parent | 3e6ba015438230f62be8e702f9c95e950c5e727f (diff) | |
download | chromium_src-8cd9696d8eff70fa52198d81672efb2b780a2f40.zip chromium_src-8cd9696d8eff70fa52198d81672efb2b780a2f40.tar.gz chromium_src-8cd9696d8eff70fa52198d81672efb2b780a2f40.tar.bz2 |
New adjustment method for Courgette.
Slower, but better.
1.0.154.59 to 1.0.154.65
Old: 1m19s, 354,997 bytes
New: 4m01s, 279,798 bytes
Timings on Lenovo T61 (T7700 cpu)
BUG=none
TEST=none (existing tests)
Review URL: http://codereview.chromium.org/118031
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@17566 0039d316-1c4b-4281-b951-d872f2087c98
-rw-r--r-- | courgette/adjustment_method.cc | 2 | ||||
-rw-r--r-- | courgette/adjustment_method.h | 11 | ||||
-rw-r--r-- | courgette/adjustment_method_2.cc | 1318 | ||||
-rw-r--r-- | courgette/courgette.gyp | 1 |
4 files changed, 1330 insertions, 2 deletions
diff --git a/courgette/adjustment_method.cc b/courgette/adjustment_method.cc index 48e7cc0..5a2f109 100644 --- a/courgette/adjustment_method.cc +++ b/courgette/adjustment_method.cc @@ -686,7 +686,7 @@ AdjustmentMethod* AdjustmentMethod::MakeNullAdjustmentMethod() { return new NullAdjustmentMethod(); } -AdjustmentMethod* AdjustmentMethod::MakeProductionAdjustmentMethod() { +AdjustmentMethod* AdjustmentMethod::MakeTrieAdjustmentMethod() { return new GraphAdjuster(); } diff --git a/courgette/adjustment_method.h b/courgette/adjustment_method.h index 8ca71e8..0ecf04d 100644 --- a/courgette/adjustment_method.h +++ b/courgette/adjustment_method.h @@ -16,11 +16,20 @@ class AdjustmentMethod { // Factory methods for making adjusters. // Returns the adjustment method used in production. - static AdjustmentMethod* MakeProductionAdjustmentMethod(); + static AdjustmentMethod* MakeProductionAdjustmentMethod() { + return MakeShingleAdjustmentMethod(); + } // Returns and adjustement method that makes no adjustments. static AdjustmentMethod* MakeNullAdjustmentMethod(); + // Returns the original adjustment method. + static AdjustmentMethod* MakeTrieAdjustmentMethod(); + + // Returns the new shingle tiling adjustment method. + static AdjustmentMethod* MakeShingleAdjustmentMethod(); + + // AdjustmentMethod interface: // Adjusts |program| to increase similarity to |model|. |program| can be // changed in any way provided that it still produces the same output when diff --git a/courgette/adjustment_method_2.cc b/courgette/adjustment_method_2.cc new file mode 100644 index 0000000..17868ba --- /dev/null +++ b/courgette/adjustment_method_2.cc @@ -0,0 +1,1318 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "courgette/adjustment_method.h" + +#include <algorithm> +#include <limits> +#include <list> +#include <map> +#include <set> +#include <string> +#include <vector> + +#include <iostream> + +#include "base/basictypes.h" +#include "base/logging.h" +#include "base/string_util.h" + +#include "courgette/assembly_program.h" +#include "courgette/courgette.h" +#include "courgette/encoded_program.h" +#include "courgette/image_info.h" + +/* + +Shingle weighting matching. + +We have a sequence S1 of symbols from alphabet A1={A,B,C,...} called the 'model' +and a second sequence of S2 of symbols from alphabet A2={U,V,W,....} called the +'program'. Each symbol in A1 has a unique numerical name or index. We can +transcribe the sequence S1 to a sequence T1 of indexes of the symbols. We wish +to assign indexes to the symbols in A2 so that when we transcribe S2 into T2, T2 +has long subsequences that occur in T1. This will ensure that the sequence +T1;T2 compresses to be only slightly larger than the compressed T1. + +The algorithm for matching members of S2 with members of S1 is eager - it makes +matches without backtracking, until no more matches can be made. Each variable +(symbol) U,V,... in A2 has a set of candidates from A1, each candidate with a +weight summarizing the evidence for the match. We keep a VariableQueue of +U,V,... sorted by how much the evidence for the best choice outweighs the +evidence for the second choice, i.e. prioritized by how 'clear cut' the best +assignment is. We pick the variable with the most clear-cut candidate, make the +assignment, adjust the evidence and repeat. + +What has not been described so far is how the evidence is gathered and +maintained. We are working under the assumption that S1 and S2 are largely +similar. (A different assumption might be that S1 and S2 are dissimilar except +for many long subsequences.) + +A naive algorithm would consider all pairs (A,U) and for each pair assess the +benefit, or score, the assignment U:=A. The score might count the number of +occurrences of U in S2 which appear in similar contexts to A in S1. + +To distinguish contexts we view S1 and S2 as a sequence of overlapping k-length +substrings or 'shingles'. Two shingles are compatible if the symbols in one +shingle could be matched with the symbols in the other symbol. For example, ABC +is *not* compatible with UVU because it would require conflicting matches A=U +and C=U. ABC is compatible with UVW, UWV, WUV, VUW etc. We can't tell which +until we make an assignment - the compatible shingles form an equivalence class. +After assigning U:=A then only UVW and UWV (equivalently AVW, AWV) are +compatible. As we make assignments the number of equivalence classes of +shingles increases and the number of members of each equivalence class +decreases. The compatibility test becomes more restrictive. + +We gather evidence for the potential assignment U:=A by counting how many +shingles containing U are compatible with shingles containing A. Thus symbols +occurring a large number of times in compatible contexts will be assigned first. + +Finding the 'most clear-cut' assignment by considering all pairs symbols and for +each pair comparing the contexts of each pair of occurrences of the symbols is +computationally infeasible. We get the job done in a reasonable time by +approaching it 'backwards' and making incremental changes as we make +assignments. + +First the shingles are partitioned according to compatibility. In S1=ABCDD and +S2=UVWXX we have a total of 6 shingles, each occuring once. (ABC:1 BCD:1 CDD:1; +UVW:1 VWX: WXX:1) all fit the pattern <V0 V1 V2> or the pattern <V0 V1 V1>. The +first pattern indicates that each position matches a different symbol, the +second pattern indicates that the second symbol is repeated. + + pattern S1 members S2 members + <V0 V1 V2>: {ABC:1, BCD:1}; {UVW:1, VWX:1} + <V0 V1 V1>: {CDD:1} {WXX:1} + +The second pattern appears to have a unique assignment but we don't make the +assignment on such scant evidence. If S1 and S2 do not match exactly, there +will be numerous spurious low-score matches like this. Instead we must see what +assignments are indicated by considering all of the evidence. + +First pattern has 2 x 2 = 4 shingle pairs. For each pair we count the number +of symbol assignments. For ABC:a * UVW:b accumulate min(a,b) to each of + {U:=A, V:=B, W:=C}. +After accumulating over all 2 x 2 pairs: + U: {A:1 B:1} + V: {A:1 B:2 C:1} + W: {B:1 C:2 D:1 } + X: {C:1 D:1} +The second pattern contributes: + W: {C:1} + X: {D:2} +Sum: + U: {A:1 B:1} + V: {A:1 B:2 C:1} + W: {B:1 C:3 D:1} + X: {C:1 D:3} + +From this we decide to assign X:=D (because this assignment has both the largest +difference above the next candidate (X:=C) and this is also the largest +proportionately over the sum of alternatives). + +Lets assume D has numerical 'name' 77. The assignment X:=D sets X to 77 too. +Next we repartition all the shingles containing X or D: + + pattern S1 members S2 members + <V0 V1 V2>: {ABC:1}; {UVW:1} + <V0 V1 77>: {BCD:1}; {VWX:1} + <V0 77 77>: {CDD:1} {WXX:1} +As we repartition, we recalculate the contributions to the scores: + U: {A:1} + V: {B:2} + W: {C:3} +All the remaining assignments are now fixed. + +There is one step in the incremental algorithm that is still infeasibly +expensive: the contributions due to the cross product of large equivalence +classes. We settle for making an approximation by computing the contribution of +the cross product of only the most common shingles. The hope is that the noise +from the long tail of uncounted shingles is well below the scores being used to +pick assignments. The second hope is that as assignment are made, the large +equivalence class will be partitioned into smaller equivalence classes, reducing +the noise over time. + +In the code below the shingles are bigger (Shingle::kWidth = 5). +Class ShinglePattern holds the data for one pattern. + +There is an optimization for this case: + <V0 V1 V1>: {CDD:1} {WXX:1} + +Above we said that we don't make an assignment on this "scant evidence". There +is an exception: if there is only one variable unassigned (more like the <V0 77 +77> pattern) AND there are no occurrences of C and W other than those counted in +this pattern, then there is no competing evidence and we go ahead with the +assignment immediately. This produces slightly better results because these +cases tend to be low-scoring and susceptible to small mistakes made in +low-scoring assignments in the approximation for large equivalence classes. + +*/ + +namespace courgette { +namespace adjustment_method_2 { + +// We have three discretionary information logging levels for algorithm +// development. For now just configure with #defines. +// TODO(sra): make dependent of some configurable setting. +struct LogToCout { + LogToCout() {} + ~LogToCout() { std::cout << std::endl; } + std::ostream& stream() { return std::cout; } +}; +#define LOG_TO_COUT (LogToCout().stream()) +#define NO_LOG DLOG_IF(INFO, false) + +#if 0 // Log to log file. +#define ALOG1 LOG(INFO) +#define ALOG2 LOG(INFO) +#define ALOG3 LOG(INFO) +#elif 0 // Log to stdout. +#define ALOG1 LOG_TO_COUT +#define ALOG2 LOG_TO_COUT +#define ALOG3 LOG_TO_COUT +#else // Log to nowhere. +#define ALOG1 NO_LOG +#define ALOG2 NO_LOG +#define ALOG3 NO_LOG +#endif + +//////////////////////////////////////////////////////////////////////////////// + +class AssignmentCandidates; +class LabelInfoMaker; +class Shingle; +class ShinglePattern; + +// The purpose of adjustment is to assign indexes to Labels of a program 'p' to +// make the sequence of indexes similar to a 'model' program 'm'. Labels +// themselves don't have enough information to do this job, so we work with a +// LabelInfo surrogate for each label. +// +class LabelInfo { + public: + // Just a no-argument constructor and copy constructor. Actual LabelInfo + // objects are allocated in std::pair structs in a std::map. + LabelInfo() + : label_(NULL), is_model_(false), debug_index_(0), refs_(0), + assignment_(NULL), candidates_(NULL) + {} + + ~LabelInfo(); + + AssignmentCandidates* candidates(); + + Label* label_; // The label that this info a surrogate for. + + uint32 is_model_ : 1; // Is the label in the model? + uint32 debug_index_ : 31; // A small number for naming the label in debug + // output. The pair (is_model_, debug_index_) is + // unique. + + uint32 refs_; // Number of times this Label is referenced. + + LabelInfo* assignment_; // Label from other program corresponding to this. + + std::vector<uint32> positions_; // Offsets into the trace of references. + + private: + AssignmentCandidates* candidates_; + + void operator=(const LabelInfo*); // Disallow assignment only. + // Public compiler generated copy constructor is needed to constuct + // std::pair<Label*, LabelInfo> so that fresh LabelInfos can be allocated + // inside a std::map. +}; + +typedef std::vector<LabelInfo*> Trace; + +std::string ToString(const LabelInfo* info) { + std::string s; + StringAppendF(&s, "%c%d", "pm"[info->is_model_], info->debug_index_); + if (info->label_->index_ != Label::kNoIndex) + StringAppendF(&s, " (%d)", info->label_->index_); + + StringAppendF(&s, " #%u", info->refs_); + return s; +} + +// LabelInfoMaker maps labels to their surrogate LabelInfo objects. +class LabelInfoMaker { + public: + LabelInfoMaker() : debug_label_index_gen_(0) {} + + LabelInfo* MakeLabelInfo(Label* label, bool is_model, uint32 position) { + LabelInfo& slot = label_infos_[label]; + if (slot.label_ == NULL) { + slot.label_ = label; + slot.is_model_ = is_model; + slot.debug_index_ = ++debug_label_index_gen_; + } + slot.positions_.push_back(position); + ++slot.refs_; + return &slot; + } + + void ResetDebugLabel() { debug_label_index_gen_ = 0; } + + private: + int debug_label_index_gen_; + + // Note LabelInfo is allocated 'flat' inside map::value_type, so the LabelInfo + // lifetimes are managed by the map. + std::map<Label*, LabelInfo> label_infos_; + + DISALLOW_COPY_AND_ASSIGN(LabelInfoMaker); +}; + +struct OrderLabelInfo { + bool operator()(const LabelInfo* a, const LabelInfo* b) const { + if (a->label_->rva_ < b->label_->rva_) return true; + if (a->label_->rva_ > b->label_->rva_) return false; + if (a == b) return false; + return a->positions_ < b->positions_; // Lexicographic ordering of vector. + } +}; + +// AssignmentCandidates is a priority queue of candidate assignments to +// a single program LabelInfo, |program_info_|. +class AssignmentCandidates { + public: + explicit AssignmentCandidates(LabelInfo* program_info) + : program_info_(program_info) {} + + LabelInfo* program_info() const { return program_info_; } + + bool empty() const { return label_to_score_.empty(); } + + LabelInfo* top_candidate() const { return queue_.begin()->second; } + + void Update(LabelInfo* model_info, int delta_score) { + LOG_ASSERT(delta_score != 0); + int old_score = 0; + int new_score = 0; + LabelToScore::iterator p = label_to_score_.find(model_info); + if (p != label_to_score_.end()) { + old_score = p->second; + new_score = old_score + delta_score; + queue_.erase(ScoreAndLabel(old_score, p->first)); + if (new_score == 0) { + label_to_score_.erase(p); + } else { + p->second = new_score; + queue_.insert(ScoreAndLabel(new_score, model_info)); + } + } else { + new_score = delta_score; + label_to_score_.insert(std::make_pair(model_info, new_score)); + queue_.insert(ScoreAndLabel(new_score, model_info)); + } + LOG_ASSERT(queue_.size() == label_to_score_.size()); + } + + int TopScore() const { + int first_value = 0; + int second_value = 0; + Queue::const_iterator p = queue_.begin(); + if (p != queue_.end()) { + first_value = p->first; + ++p; + if (p != queue_.end()) { + second_value = p->first; + } + } + return first_value - second_value; + } + + bool HasPendingUpdates() { return !pending_updates_.empty(); } + + void AddPendingUpdate(LabelInfo* model_info, int delta_score) { + LOG_ASSERT(delta_score != 0); + pending_updates_[model_info] += delta_score; + } + + void ApplyPendingUpdates() { + // TODO(sra): try to walk |pending_updates_| and |label_to_score_| in + // lockstep. Try to batch updates to |queue_|. + size_t zeroes = 0; + for (LabelToScore::iterator p = pending_updates_.begin(); + p != pending_updates_.end(); + ++p) { + if (p->second != 0) + Update(p->first, p->second); + else + ++zeroes; + } + pending_updates_.clear(); + } + + void Print(int max) { + ALOG1 << "score " << TopScore() << " " << ToString(program_info_) + << " := ?"; + if (!pending_updates_.empty()) + ALOG1 << pending_updates_.size() << " pending"; + int count = 0; + for (Queue::iterator q = queue_.begin(); q != queue_.end(); ++q) { + if (++count > max) break; + ALOG1 << " " << q->first << " " << ToString(q->second); + } + } + + private: + typedef std::map<LabelInfo*, int, OrderLabelInfo> LabelToScore; + typedef std::pair<int, LabelInfo*> ScoreAndLabel; + struct OrderScoreAndLabelByScoreDecreasing { + OrderLabelInfo tie_breaker; + bool operator()(const ScoreAndLabel& a, const ScoreAndLabel& b) const { + if (a.first > b.first) return true; + if (a.first < b.first) return false; + return tie_breaker(a.second, b.second); + } + }; + typedef std::set<ScoreAndLabel, OrderScoreAndLabelByScoreDecreasing> Queue; + + LabelInfo* program_info_; + LabelToScore label_to_score_; + LabelToScore pending_updates_; + Queue queue_; +}; + +AssignmentCandidates* LabelInfo::candidates() { + if (candidates_ == NULL) + candidates_ = new AssignmentCandidates(this); + return candidates_; +} + +LabelInfo::~LabelInfo() { + delete candidates_; +} + +// A Shingle is a short fixed-length string of LabelInfos that actually occurs +// in a Trace. A Shingle may occur many times. We repesent the Shingle by the +// position of one of the occurrences in the Trace. +class Shingle { + public: + static const size_t kWidth = 5; + + struct InterningLess { + bool operator()(const Shingle& a, const Shingle& b) const; + }; + + typedef std::set<Shingle, InterningLess> OwningSet; + + static Shingle* Find(const Trace& trace, size_t position, + OwningSet* owning_set) { + std::pair<OwningSet::iterator, bool> pair = + owning_set->insert(Shingle(trace, position)); + // pair.first is the newly inserted Shingle or the previouly inserted one + // that looks the same according to the comparator. + pair.first->add_position(position); + return &*pair.first; + } + + LabelInfo* at(size_t i) const { return trace_[exemplar_position_ + i]; } + void add_position(size_t position) { positions_.push_back(position); } + size_t position_count() const { return positions_.size(); } + + bool InModel() const { return at(0)->is_model_; } + + ShinglePattern* pattern() const { return pattern_; } + void set_pattern(ShinglePattern* pattern) { pattern_ = pattern; } + + struct PointerLess { + bool operator()(const Shingle* a, const Shingle* b) const { + // Arbitrary but repeatable (memory-address) independent ordering: + return a->exemplar_position_ < b->exemplar_position_; + // return InterningLess()(*a, *b); + } + }; + + private: + Shingle(const Trace& trace, size_t exemplar_position) + : trace_(trace), + exemplar_position_(exemplar_position), + pattern_(NULL) { + } + + const Trace& trace_; // The shingle lives inside trace_. + size_t exemplar_position_; // At this position (and other positions). + std::vector<uint32> positions_; // Includes exemplar_position_. + + ShinglePattern* pattern_; // Pattern changes as LabelInfos are assigned. + + friend std::string ToString(const Shingle* instance); + + // We can't disallow the copy constructor because we use std::set<Shingle> and + // VS2005's implementation of std::set<T>::set() requires T to have a copy + // constructor. + // DISALLOW_COPY_AND_ASSIGN(Shingle); + void operator=(const Shingle&); // Disallow assignment only. +}; + +std::string ToString(const Shingle* instance) { + std::string s; + const char* sep = "<"; + for (size_t i = 0; i < Shingle::kWidth; ++i) { + // StringAppendF(&s, "%s%x ", sep, instance.at(i)->label_->rva_); + s += sep; + s += ToString(instance->at(i)); + sep = ", "; + } + StringAppendF(&s, ">(%u)@{%d}", instance->exemplar_position_, + static_cast<int>(instance->position_count())); + return s; +} + + +bool Shingle::InterningLess::operator()( + const Shingle& a, + const Shingle& b) const { + for (size_t i = 0; i < kWidth; ++i) { + LabelInfo* info_a = a.at(i); + LabelInfo* info_b = b.at(i); + if (info_a->label_->rva_ < info_b->label_->rva_) + return true; + if (info_a->label_->rva_ > info_b->label_->rva_) + return false; + if (info_a->is_model_ < info_b->is_model_) + return true; + if (info_a->is_model_ > info_b->is_model_) + return false; + if (info_a != info_b) { + NOTREACHED(); + } + } + return false; +} + +class ShinglePattern { + public: + enum { kOffsetMask = 7, // Offset lives in low bits. + kFixed = 0, // kind & kVariable == 0 => fixed. + kVariable = 8 // kind & kVariable == 1 => variable. + }; + // sequence[position + (kinds_[i] & kOffsetMask)] gives LabelInfo for position + // i of shingle. Below, second 'A' is duplicate of position 1, second '102' + // is duplicate of position 0. + // + // <102, A, 103, A , 102> + // --> <kFixed+0, kVariable+1, kFixed+2, kVariable+1, kFixed+0> + struct Index { + explicit Index(const Shingle* instance); + uint8 kinds_[Shingle::kWidth]; + uint8 variables_; + uint8 unique_variables_; + uint8 first_variable_index_; + uint32 hash_; + int assigned_indexes_[Shingle::kWidth]; + }; + + // ShinglePattern keeps histograms of member Shingle instances, ordered by + // decreasing number of occurrences. We don't have a pair (occurrence count, + // Shingle instance), so we use a FreqView adapter to make the instance + // pointer look like the pair. + class FreqView { + public: + explicit FreqView(const Shingle* instance) : instance_(instance) {} + size_t count() const { return instance_->position_count(); } + const Shingle* instance() const { return instance_; } + struct Greater { + bool operator()(const FreqView& a, const FreqView& b) const { + if (a.count() > b.count()) return true; + if (a.count() < b.count()) return false; + return resolve_ties(a.instance(), b.instance()); + } + private: + Shingle::PointerLess resolve_ties; + }; + private: + const Shingle* instance_; + }; + + typedef std::set<FreqView, FreqView::Greater> Histogram; + + ShinglePattern() : index_(NULL), model_coverage_(0), program_coverage_(0) {} + + const Index* index_; // Points to the key in the owning map value_type. + Histogram model_histogram_; + Histogram program_histogram_; + int model_coverage_; + int program_coverage_; +}; + +std::string ToString(const ShinglePattern::Index* index) { + std::string s; + if (index == NULL) { + s = "<null>"; + } else { + StringAppendF(&s, "<%d: ", index->variables_); + const char* sep = ""; + for (size_t i = 0; i < Shingle::kWidth; ++i) { + s += sep; + sep = ", "; + uint32 kind = index->kinds_[i]; + int offset = kind & ShinglePattern::kOffsetMask; + if (kind & ShinglePattern::kVariable) + StringAppendF(&s, "V%d", offset); + else + StringAppendF(&s, "%d", index->assigned_indexes_[offset]); + } + StringAppendF(&s, " %x", index->hash_); + s += ">"; + } + return s; +} + +std::string HistogramToString(const ShinglePattern::Histogram& histogram, + size_t snippet_max) { + std::string s; + size_t histogram_size = histogram.size(); + size_t snippet_size = 0; + for (ShinglePattern::Histogram::const_iterator p = histogram.begin(); + p != histogram.end(); + ++p) { + if (++snippet_size > snippet_max && snippet_size != histogram_size) { + s += " ..."; + break; + } + StringAppendF(&s, " %d", p->count()); + } + return s; +} + +std::string HistogramToStringFull(const ShinglePattern::Histogram& histogram, + const char* indent, + size_t snippet_max) { + std::string s; + + size_t histogram_size = histogram.size(); + size_t snippet_size = 0; + for (ShinglePattern::Histogram::const_iterator p = histogram.begin(); + p != histogram.end(); + ++p) { + s += indent; + if (++snippet_size > snippet_max && snippet_size != histogram_size) { + s += "...\n"; + break; + } + StringAppendF(&s, "(%d) ", p->count()); + s += ToString(&(*p->instance())); + s += "\n"; + } + return s; +} + +std::string ToString(const ShinglePattern* pattern, size_t snippet_max = 3) { + std::string s; + if (pattern == NULL) { + s = "<null>"; + } else { + s = "{"; + s += ToString(pattern->index_); + StringAppendF(&s, "; %d(%d):", + static_cast<int>(pattern->model_histogram_.size()), + pattern->model_coverage_); + + s += HistogramToString(pattern->model_histogram_, snippet_max); + StringAppendF(&s, "; %d(%d):", + static_cast<int>(pattern->program_histogram_.size()), + pattern->program_coverage_); + s += HistogramToString(pattern->program_histogram_, snippet_max); + s += "}"; + } + return s; +} + +std::string ShinglePatternToStringFull(const ShinglePattern* pattern, + size_t max) { + std::string s; + s += ToString(pattern->index_); + s += "\n"; + size_t model_size = pattern->model_histogram_.size(); + size_t program_size = pattern->program_histogram_.size(); + StringAppendF(&s, " model shingles %u\n", model_size); + s += HistogramToStringFull(pattern->model_histogram_, " ", max); + StringAppendF(&s, " program shingles %u\n", program_size); + s += HistogramToStringFull(pattern->program_histogram_, " ", max); + return s; +} + +struct ShinglePatternIndexLess { + bool operator()(const ShinglePattern::Index& a, + const ShinglePattern::Index& b) const { + if (a.hash_ < b.hash_) return true; + if (a.hash_ > b.hash_) return false; + + for (size_t i = 0; i < Shingle::kWidth; ++i) { + if (a.kinds_[i] < b.kinds_[i]) return true; + if (a.kinds_[i] > b.kinds_[i]) return false; + if ((a.kinds_[i] & ShinglePattern::kVariable) == 0) { + if (a.assigned_indexes_[i] < b.assigned_indexes_[i]) + return true; + if (a.assigned_indexes_[i] > b.assigned_indexes_[i]) + return false; + } + } + return false; + } +}; + +static uint32 hash_combine(uint32 h, uint32 v) { + h += v; + return (h * (37 + 0x0000d100)) ^ (h >> 13); +} + +ShinglePattern::Index::Index(const Shingle* instance) { + uint32 hash = 0; + variables_ = 0; + unique_variables_ = 0; + first_variable_index_ = 255; + + for (size_t i = 0; i < Shingle::kWidth; ++i) { + LabelInfo* info = instance->at(i); + uint32 kind; + int code = -1; + size_t j = 0; + for ( ; j < i; ++j) { + if (info == instance->at(j)) { // Duplicate LabelInfo + kind = kinds_[j]; + break; + } + } + if (j == i) { // Not found above. + if (info->assignment_) { + code = info->label_->index_; + assigned_indexes_[i] = code; + kind = kFixed + i; + } else { + kind = kVariable + i; + ++unique_variables_; + if (i < first_variable_index_) + first_variable_index_ = i; + } + } + if (kind & kVariable) ++variables_; + hash = hash_combine(hash, code); + hash = hash_combine(hash, kind); + kinds_[i] = kind; + assigned_indexes_[i] = code; + } + hash_ = hash; +} + +struct ShinglePatternLess { + bool operator()(const ShinglePattern& a, const ShinglePattern& b) const { + return index_less(*a.index_, *b.index_); + } + ShinglePatternIndexLess index_less; +}; + +struct ShinglePatternPointerLess { + bool operator()(const ShinglePattern* a, const ShinglePattern* b) const { + return pattern_less(*a, *b); + } + ShinglePatternLess pattern_less; +}; + +template<int (*Scorer)(const ShinglePattern*)> +struct OrderShinglePatternByScoreDescending { + bool operator()(const ShinglePattern* a, const ShinglePattern* b) const { + int score_a = Scorer(a); + int score_b = Scorer(b); + if (score_a > score_b) return true; + if (score_a < score_b) return false; + return break_ties(a, b); + } + ShinglePatternPointerLess break_ties; +}; + +// Returns a score for a 'Single Use' rule. Returns -1 if the rule is not +// applicable. +int SingleUseScore(const ShinglePattern* pattern) { + if (pattern->index_->variables_ != 1) + return -1; + + if (pattern->model_histogram_.size() != 1 || + pattern->program_histogram_.size() != 1) + return -1; + + // Does this pattern account for all uses of the variable? + const ShinglePattern::FreqView& program_freq = + *pattern->program_histogram_.begin(); + const ShinglePattern::FreqView& model_freq = + *pattern->model_histogram_.begin(); + int p1 = program_freq.count(); + int m1 = model_freq.count(); + if (p1 == m1) { + const Shingle* program_instance = program_freq.instance(); + const Shingle* model_instance = model_freq.instance(); + size_t variable_index = pattern->index_->first_variable_index_; + LabelInfo* program_info = program_instance->at(variable_index); + LabelInfo* model_info = model_instance->at(variable_index); + if (!program_info->assignment_) { + if (program_info->refs_ == p1 && model_info->refs_ == m1) { + return p1; + } + } + } + return -1; +} + +// The VariableQueue is a priority queue of unassigned LabelInfos from +// the 'program' (the 'variables') and their AssignmentCandidates. +class VariableQueue { + public: + typedef std::pair<int, LabelInfo*> ScoreAndLabel; + + VariableQueue() {} + + bool empty() const { return queue_.empty(); } + + const ScoreAndLabel& first() const { return *queue_.begin(); } + + // For debugging only. + void Print() const { + for (Queue::const_iterator p = queue_.begin(); p != queue_.end(); ++p) { + AssignmentCandidates* candidates = p->second->candidates(); + candidates->Print(std::numeric_limits<int>::max()); + } + } + + void AddPendingUpdate(LabelInfo* program_info, LabelInfo* model_info, + int delta_score) { + AssignmentCandidates* candidates = program_info->candidates(); + if (!candidates->HasPendingUpdates()) { + pending_update_candidates_.push_back(candidates); + } + candidates->AddPendingUpdate(model_info, delta_score); + } + + void ApplyPendingUpdates() { + for (size_t i = 0; i < pending_update_candidates_.size(); ++i) { + AssignmentCandidates* candidates = pending_update_candidates_[i]; + int old_score = candidates->TopScore(); + queue_.erase(ScoreAndLabel(old_score, candidates->program_info())); + candidates->ApplyPendingUpdates(); + if (!candidates->empty()) { + int new_score = candidates->TopScore(); + queue_.insert(ScoreAndLabel(new_score, candidates->program_info())); + } + } + pending_update_candidates_.clear(); + } + + private: + struct OrderScoreAndLabelByScoreDecreasing { + bool operator()(const ScoreAndLabel& a, const ScoreAndLabel& b) const { + if (a.first > b.first) return true; + if (a.first < b.first) return false; + return OrderLabelInfo()(a.second, b.second); + } + }; + typedef std::set<ScoreAndLabel, OrderScoreAndLabelByScoreDecreasing> Queue; + + Queue queue_; + std::vector<AssignmentCandidates*> pending_update_candidates_; + + DISALLOW_COPY_AND_ASSIGN(VariableQueue); +}; + + +class AssignmentProblem { + public: + AssignmentProblem(const Trace& trace, size_t model_end) + : trace_(trace), + model_end_(model_end) { + ALOG1 << "AssignmentProblem::AssignmentProblem " << model_end << ", " + << trace.size(); + } + + bool Solve() { + if (model_end_ < Shingle::kWidth || + trace_.size() - model_end_ < Shingle::kWidth) { + // Nothing much we can do with such a short problem. + return true; + } + instances_.resize(trace_.size() - Shingle::kWidth + 1, NULL); + AddShingles(0, model_end_); + AddShingles(model_end_, trace_.size()); + InitialClassify(); + AddPatternsNeedingUpdatesToQueues(); + + patterns_needing_updates_.clear(); + while (FindAndAssignBestLeader()) { + NO_LOG << "Updated " << patterns_needing_updates_.size() << " patterns"; + patterns_needing_updates_.clear(); + } + PrintActivePatterns(); + + return true; + } + + private: + typedef std::set<Shingle*, Shingle::PointerLess> ShingleSet; + + typedef std::set<const ShinglePattern*, ShinglePatternPointerLess> + ShinglePatternSet; + + // Patterns are partitioned into the following sets: + + // * Retired patterns (not stored). No shingles exist for this pattern (they + // all now match more specialized patterns). + // * Useless patterns (not stored). There are no 'program' shingles for this + // pattern (they all now match more specialized patterns). + // * Single-use patterns - single_use_pattern_queue_. + // * Other patterns - active_non_single_use_patterns_ / variable_queue_. + + typedef std::set<const ShinglePattern*, + OrderShinglePatternByScoreDescending<&SingleUseScore> > + SingleUsePatternQueue; + + void PrintPatternsHeader() const { + ALOG1 << shingle_instances_.size() << " instances " + << trace_.size() << " trace length " + << patterns_.size() << " shingle indexes " + << single_use_pattern_queue_.size() << " single use patterns " + << active_non_single_use_patterns_.size() << " active patterns"; + } + + void PrintActivePatterns() const { + for (ShinglePatternSet::const_iterator p = + active_non_single_use_patterns_.begin(); + p != active_non_single_use_patterns_.end(); + ++p) { + const ShinglePattern* pattern = *p; + ALOG1 << ToString(pattern, 10); + } + } + + void PrintPatterns() const { + PrintAllPatterns(); + PrintActivePatterns(); + PrintAllShingles(); + } + + void PrintAllPatterns() const { + for (IndexToPattern::const_iterator p = patterns_.begin(); + p != patterns_.end(); + ++p) { + const ShinglePattern& pattern = p->second; + ALOG1 << ToString(&pattern, 10); + } + } + + void PrintAllShingles() const { + for (Shingle::OwningSet::const_iterator p = shingle_instances_.begin(); + p != shingle_instances_.end(); + ++p) { + const Shingle& instance = *p; + ALOG1 << ToString(&instance) << " " << ToString(instance.pattern()); + } + } + + + void AddShingles(size_t begin, size_t end) { + for (size_t i = begin; i + Shingle::kWidth - 1 < end; ++i) { + instances_[i] = Shingle::Find(trace_, i, &shingle_instances_); + } + } + + void Declassify(Shingle* shingle) { + ShinglePattern* pattern = shingle->pattern(); + if (shingle->InModel()) { + pattern->model_histogram_.erase(ShinglePattern::FreqView(shingle)); + pattern->model_coverage_ -= shingle->position_count(); + } else { + pattern->program_histogram_.erase(ShinglePattern::FreqView(shingle)); + pattern->program_coverage_ -= shingle->position_count(); + } + shingle->set_pattern(NULL); + } + + void Reclassify(Shingle* shingle) { + ShinglePattern* pattern = shingle->pattern(); + LOG_ASSERT(pattern == NULL); + + ShinglePattern::Index index(shingle); + if (index.variables_ == 0) + return; + + std::pair<IndexToPattern::iterator, bool> inserted = + patterns_.insert(std::make_pair(index, ShinglePattern())); + + pattern = &inserted.first->second; + pattern->index_ = &inserted.first->first; + shingle->set_pattern(pattern); + patterns_needing_updates_.insert(pattern); + + if (shingle->InModel()) { + pattern->model_histogram_.insert(ShinglePattern::FreqView(shingle)); + pattern->model_coverage_ += shingle->position_count(); + } else { + pattern->program_histogram_.insert(ShinglePattern::FreqView(shingle)); + pattern->program_coverage_ += shingle->position_count(); + } + } + + void InitialClassify() { + for (Shingle::OwningSet::iterator p = shingle_instances_.begin(); + p != shingle_instances_.end(); + ++p) { + Reclassify(&*p); + } + } + + // For the positions in |info|, find the shingles that overlap that position. + void AddAffectedPositions(LabelInfo* info, ShingleSet* affected_shingles) { + const size_t kWidth = Shingle::kWidth; + for (size_t i = 0; i < info->positions_.size(); ++i) { + size_t position = info->positions_[i]; + // Find bounds to the subrange of |trace_| we are in. + size_t start = position < model_end_ ? 0 : model_end_; + size_t end = position < model_end_ ? model_end_ : trace_.size(); + + // Clip [position-kWidth+1, position+1) + size_t low = position > start + kWidth - 1 + ? position - kWidth + 1 + : start; + size_t high = position + kWidth < end ? position + 1 : end - kWidth + 1; + + for (size_t shingle_position = low; + shingle_position < high; + ++shingle_position) { + Shingle* overlapping_shingle = instances_.at(shingle_position); + affected_shingles->insert(overlapping_shingle); + } + } + } + + void RemovePatternsNeedingUpdatesFromQueues() { + for (ShinglePatternSet::iterator p = patterns_needing_updates_.begin(); + p != patterns_needing_updates_.end(); + ++p) { + RemovePatternFromQueues(*p); + } + } + + void AddPatternsNeedingUpdatesToQueues() { + for (ShinglePatternSet::iterator p = patterns_needing_updates_.begin(); + p != patterns_needing_updates_.end(); + ++p) { + AddPatternToQueues(*p); + } + variable_queue_.ApplyPendingUpdates(); + } + + void RemovePatternFromQueues(const ShinglePattern* pattern) { + int single_use_score = SingleUseScore(pattern); + if (single_use_score > 0) { + size_t n = single_use_pattern_queue_.erase(pattern); + LOG_ASSERT(n == 1); + } else if (pattern->program_histogram_.size() == 0 && + pattern->model_histogram_.size() == 0) { + NOTREACHED(); // Should not come back to life. + } else if (pattern->program_histogram_.size() == 0) { + // Useless pattern. + } else { + active_non_single_use_patterns_.erase(pattern); + AddPatternToLabelQueue(pattern, -1); + } + } + + void AddPatternToQueues(const ShinglePattern* pattern) { + int single_use_score = SingleUseScore(pattern); + if (single_use_score > 0) { + single_use_pattern_queue_.insert(pattern); + } else if (pattern->program_histogram_.size() == 0 && + pattern->model_histogram_.size() == 0) { + } else if (pattern->program_histogram_.size() == 0) { + // Useless pattern. + } else { + active_non_single_use_patterns_.insert(pattern); + AddPatternToLabelQueue(pattern, +1); + } + } + + void AddPatternToLabelQueue(const ShinglePattern* pattern, int sign) { + // For each possible assignment in this pattern, update the potential + // contributions to the LabelInfo queues. + size_t model_histogram_size = pattern->model_histogram_.size(); + size_t program_histogram_size = pattern->program_histogram_.size(); + + // We want to find for each symbol (LabelInfo) the maximum contribution that + // could be achieved by making shingle-wise assignments between shingles in + // the model and shingles in the program. + // + // If the shingles in the histograms are independent (no two shingles have a + // symbol in common) then any permutation of the assignments is possible, + // and the maximum contribution can be found by taking the maximum over all + // the pairs. + // + // If the shingles are dependent two things happen. The maximum + // contribution to any given symbol is a sum because the symbol has + // contributions from all the shingles containing it. Second, some + // assignments are blocked by previous incompatible assignments. We want to + // avoid a combinatorial search, so we ignore the blocking. + + const int kUnwieldy = 5; + + typedef std::map<LabelInfo*, int> LabelToScore; + typedef std::map<LabelInfo*, LabelToScore > ScoreSet; + ScoreSet maxima; + + size_t n_model_samples = 0; + for (ShinglePattern::Histogram::const_iterator model_iter = + pattern->model_histogram_.begin(); + model_iter != pattern->model_histogram_.end(); + ++model_iter) { + if (++n_model_samples > kUnwieldy) break; + const ShinglePattern::FreqView& model_freq = *model_iter; + int m1 = model_freq.count(); + const Shingle* model_instance = model_freq.instance(); + + ScoreSet sums; + size_t n_program_samples = 0; + for (ShinglePattern::Histogram::const_iterator program_iter = + pattern->program_histogram_.begin(); + program_iter != pattern->program_histogram_.end(); + ++program_iter) { + if (++n_program_samples > kUnwieldy) break; + const ShinglePattern::FreqView& program_freq = *program_iter; + int p1 = program_freq.count(); + const Shingle* program_instance = program_freq.instance(); + + // int score = p1; // ? weigh all equally?? + int score = std::min(p1, m1); + + for (size_t i = 0; i < Shingle::kWidth; ++i) { + LabelInfo* program_info = program_instance->at(i); + LabelInfo* model_info = model_instance->at(i); + if ((model_info->assignment_ == NULL) != + (program_info->assignment_ == NULL)) { + ALOG1 << "ERROR " << i + << "\n\t" << ToString(pattern, 10) + << "\n\t" << ToString(program_instance) + << "\n\t" << ToString(model_instance); + } + if (!program_info->assignment_ && !model_info->assignment_) { + sums[program_info][model_info] += score; + } + } + + for (ScoreSet::iterator assignee_iterator = sums.begin(); + assignee_iterator != sums.end(); + ++assignee_iterator) { + LabelInfo* program_info = assignee_iterator->first; + for (LabelToScore::iterator p = assignee_iterator->second.begin(); + p != assignee_iterator->second.end(); + ++p) { + LabelInfo* model_info = p->first; + int score = p->second; + int* slot = &maxima[program_info][model_info]; + *slot = std::max(*slot, score); + } + } + } + } + + for (ScoreSet::iterator assignee_iterator = maxima.begin(); + assignee_iterator != maxima.end(); + ++assignee_iterator) { + LabelInfo* program_info = assignee_iterator->first; + for (LabelToScore::iterator p = assignee_iterator->second.begin(); + p != assignee_iterator->second.end(); + ++p) { + LabelInfo* model_info = p->first; + int score = sign * p->second; + variable_queue_.AddPendingUpdate(program_info, model_info, score); + } + } + } + + void AssignOne(LabelInfo* model_info, LabelInfo* program_info) { + LOG_ASSERT(!model_info->assignment_); + LOG_ASSERT(!program_info->assignment_); + LOG_ASSERT(model_info->is_model_); + LOG_ASSERT(!program_info->is_model_); + + ALOG2 << "Assign " << ToString(program_info) + << " := " << ToString(model_info); + + ShingleSet affected_shingles; + AddAffectedPositions(model_info, &affected_shingles); + AddAffectedPositions(program_info, &affected_shingles); + + for (ShingleSet::iterator p = affected_shingles.begin(); + p != affected_shingles.end(); + ++p) { + patterns_needing_updates_.insert((*p)->pattern()); + } + + RemovePatternsNeedingUpdatesFromQueues(); + + for (ShingleSet::iterator p = affected_shingles.begin(); + p != affected_shingles.end(); + ++p) { + Declassify(*p); + } + + program_info->label_->index_ = model_info->label_->index_; + // Mark as assigned + model_info->assignment_ = program_info; + program_info->assignment_ = model_info; + + for (ShingleSet::iterator p = affected_shingles.begin(); + p != affected_shingles.end(); + ++p) { + Reclassify(*p); + } + + AddPatternsNeedingUpdatesToQueues(); + } + + bool AssignFirstVariableOfHistogramHead(const ShinglePattern& pattern) { + const ShinglePattern::FreqView& program_1 = + *pattern.program_histogram_.begin(); + const ShinglePattern::FreqView& model_1 = *pattern.model_histogram_.begin(); + const Shingle* program_instance = program_1.instance(); + const Shingle* model_instance = model_1.instance(); + size_t variable_index = pattern.index_->first_variable_index_; + LabelInfo* program_info = program_instance->at(variable_index); + LabelInfo* model_info = model_instance->at(variable_index); + AssignOne(model_info, program_info); + return true; + } + + bool FindAndAssignBestLeader() { + LOG_ASSERT(patterns_needing_updates_.empty()); + + if (!single_use_pattern_queue_.empty()) { + const ShinglePattern& pattern = **single_use_pattern_queue_.begin(); + return AssignFirstVariableOfHistogramHead(pattern); + } + + if (variable_queue_.empty()) + return false; + + const VariableQueue::ScoreAndLabel best = variable_queue_.first(); + int score = best.first; + LabelInfo* assignee = best.second; + + // TODO(sra): score (best.first) can be zero. A zero score means we are + // blindly picking between two (or more) alternatives which look the same. + // If we exit on the first zero-score we sometimes get 3-4% better total + // compression. This indicates that 'infill' is doing a better job than + // picking blindly. Perhaps we can use an extended region around the + // undistinguished competing alternatives to break the tie. + if (score == 0) { + variable_queue_.Print(); + return false; + } + + AssignmentCandidates* candidates = assignee->candidates(); + if (candidates->empty()) + return false; // Should not happen. + + AssignOne(candidates->top_candidate(), assignee); + return true; + } + + private: + // The trace vector contains the model sequence [0, model_end_) followed by + // the program sequence [model_end_, trace.end()) + const Trace& trace_; + size_t model_end_; + + // |shingle_instances_| is the set of 'interned' shingles. + Shingle::OwningSet shingle_instances_; + + // |instances_| maps from position in |trace_| to Shingle at that position. + std::vector<Shingle*> instances_; + + SingleUsePatternQueue single_use_pattern_queue_; + ShinglePatternSet active_non_single_use_patterns_; + VariableQueue variable_queue_; + + // Transient information: when we make an assignment, we need to recompute + // priority queue information derived from these ShinglePatterns. + ShinglePatternSet patterns_needing_updates_; + + typedef std::map<ShinglePattern::Index, + ShinglePattern, ShinglePatternIndexLess> IndexToPattern; + IndexToPattern patterns_; + + DISALLOW_COPY_AND_ASSIGN(AssignmentProblem); +}; + +class Adjuster : public AdjustmentMethod { + public: + Adjuster() {} + ~Adjuster() {} + + bool Adjust(const AssemblyProgram& model, AssemblyProgram* program) { + LOG(INFO) << "Adjuster::Adjust"; + prog_ = program; + model_ = &model; + return Finish(); + } + + bool Finish() { + prog_->UnassignIndexes(); + Trace abs32_trace_; + Trace rel32_trace_; + CollectTraces(model_, &abs32_trace_, &rel32_trace_, true); + size_t abs32_model_end = abs32_trace_.size(); + size_t rel32_model_end = rel32_trace_.size(); + CollectTraces(prog_, &abs32_trace_, &rel32_trace_, false); + Solve(abs32_trace_, abs32_model_end); + Solve(rel32_trace_, rel32_model_end); + prog_->AssignRemainingIndexes(); + return true; + } + + private: + void CollectTraces(const AssemblyProgram* program, Trace* abs32, Trace* rel32, + bool is_model) { + label_info_maker_.ResetDebugLabel(); + const std::vector<Instruction*>& instructions = program->instructions(); + for (size_t i = 0; i < instructions.size(); ++i) { + Instruction* instruction = instructions.at(i); + if (Label* label = program->InstructionAbs32Label(instruction)) + ReferenceLabel(abs32, label, is_model); + if (Label* label = program->InstructionRel32Label(instruction)) + ReferenceLabel(rel32, label, is_model); + } + // TODO(sra): we could simply append all the labels in index order to + // incorporate some costing for entropy (bigger deltas) that will be + // introduced into the label address table by non-monotonic ordering. This + // would have some knock-on effects to parts of the algorithm that work on + // single-occurrence labels. + } + + void Solve(const Trace& model, size_t model_end) { + AssignmentProblem a(model, model_end); + a.Solve(); + } + + void ReferenceLabel(Trace* trace, Label* label, bool is_model) { + trace->push_back( + label_info_maker_.MakeLabelInfo(label, is_model, trace->size())); + } + + AssemblyProgram* prog_; // Program to be adjusted, owned by caller. + const AssemblyProgram* model_; // Program to be mimicked, owned by caller. + + LabelInfoMaker label_info_maker_; + + private: + DISALLOW_COPY_AND_ASSIGN(Adjuster); +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace adjustment_method_2 + +AdjustmentMethod* AdjustmentMethod::MakeShingleAdjustmentMethod() { + return new adjustment_method_2::Adjuster(); +} + +} // namespace courgette diff --git a/courgette/courgette.gyp b/courgette/courgette.gyp index 03de462..859212d 100644 --- a/courgette/courgette.gyp +++ b/courgette/courgette.gyp @@ -20,6 +20,7 @@ 'msvs_guid': '9A72A362-E617-4205-B9F2-43C6FB280FA1', 'sources': [ 'adjustment_method.cc', + 'adjustment_method_2.cc', 'adjustment_method.h', 'assembly_program.cc', 'assembly_program.h', |