diff options
author | sra@chromium.org <sra@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-05-08 23:00:29 +0000 |
---|---|---|
committer | sra@chromium.org <sra@chromium.org@0039d316-1c4b-4281-b951-d872f2087c98> | 2009-05-08 23:00:29 +0000 |
commit | 04ca1bc65afb76ea30698c25f24599d20119e3d2 (patch) | |
tree | 2bb65e974d478f4d607b83f6a84a56c01f501ac0 /courgette | |
parent | 9adf1dcf3281408560267787eddcc767566b425f (diff) | |
download | chromium_src-04ca1bc65afb76ea30698c25f24599d20119e3d2.zip chromium_src-04ca1bc65afb76ea30698c25f24599d20119e3d2.tar.gz chromium_src-04ca1bc65afb76ea30698c25f24599d20119e3d2.tar.bz2 |
Move Courgette
from src\third_party\courgette
to src\courgette and src\courgette\third_party
Fixed #includes
Added properties to ignore generated files:
C:\c5\src>svn pg svn:ignore courgette
courgette.xcodeproj
courgette.sln
courgette_fuzz.vcproj
courgette_lib.vcproj
courgette_minimal_tool.vcproj
courgette_tool.vcproj
courgette.vcproj
courgette_unittests.vcproj
SConstruct
courgette_fuzz.scons
courgette_lib.scons
courgette_main.scons
courgette_minimal_tool.scons
courgette.scons
courgette_tool.scons
courgette_unittests.scons
Review URL: http://codereview.chromium.org/115062
git-svn-id: svn://svn.chromium.org/chrome/trunk/src@15692 0039d316-1c4b-4281-b951-d872f2087c98
Diffstat (limited to 'courgette')
48 files changed, 8027 insertions, 0 deletions
diff --git a/courgette/adjustment_method.cc b/courgette/adjustment_method.cc new file mode 100644 index 0000000..48e7cc0 --- /dev/null +++ b/courgette/adjustment_method.cc @@ -0,0 +1,703 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "courgette/adjustment_method.h" + +#include <algorithm> +#include <list> +#include <map> +#include <set> +#include <string> +#include <vector> + +#include "base/basictypes.h" +#include "base/logging.h" +#include "base/string_util.h" + +#include "courgette/assembly_program.h" +#include "courgette/courgette.h" +#include "courgette/encoded_program.h" +#include "courgette/image_info.h" + +namespace courgette { + +// We have three discretionary information logging levels for algorithm +// development. For now just configure with #defines. +// TODO(sra): make dependent of some configurable setting. +#define NO_LOG DLOG_IF(INFO, false) +// #define ALOG1 LOG(INFO) +// #define ALOG2 LOG(INFO) +// #define ALOG3 LOG(INFO) +#define ALOG1 NO_LOG +#define ALOG2 NO_LOG +#define ALOG3 NO_LOG + +//////////////////////////////////////////////////////////////////////////////// + +class NullAdjustmentMethod : public AdjustmentMethod { + bool Adjust(const AssemblyProgram& model, AssemblyProgram* program) { + return true; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +// The purpose of adjustment is to assign indexes to Labels of a program 'p' to +// make the sequence of indexes similar to a 'model' program 'm'. Labels +// themselves don't have enough information to do this job, so we work with a +// LabelInfo surrogate for each label. +// +class LabelInfo { + public: + Label* label_; // The label that this info a surrogate for. + + // Information used only in debugging messages. + uint32 is_model_ : 1; // Is the label in the model? + uint32 debug_index_ : 31; // An unique small number for naming the label. + + uint32 refs_; // Number of times this Label is referenced. + + LabelInfo* assignment_; // Label from other program corresponding to this. + + // LabelInfos are in a doubly linked list ordered by address (label_->rva_) so + // we can quickly find Labels adjacent in address order. + LabelInfo* next_addr_; // Label(Info) at next highest address. + LabelInfo* prev_addr_; // Label(Info) at next lowest address. + + std::vector<uint32> positions_; // Offsets into the trace of references. + + // Just a no-argument constructor and copy constructor. Actual LabelInfo + // objects are allocated in std::pair structs in a std::map. + LabelInfo() + : label_(NULL), is_model_(false), debug_index_(0), refs_(0), + assignment_(NULL), + next_addr_(NULL), + prev_addr_(NULL) {} + + private: + void operator=(const LabelInfo*); // Disallow assignment only. + + // Public compiler generated copy constructor is needed to constuct + // std::pair<Label*, LabelInfo> so that fresh LabelInfos can be allocated + // inside a std::map. +}; + +struct OrderLabelInfoByAddressAscending { + bool operator()(const LabelInfo* a, const LabelInfo* b) const { + return a->label_->rva_ < b->label_->rva_; + } +}; + +static std::string ToString(LabelInfo* info) { + std::string s; + StringAppendF(&s, "%c%d", "pm"[info->is_model_], info->debug_index_); + if (info->label_->index_ != Label::kNoIndex) + StringAppendF(&s, " (%d)", info->label_->index_); + + StringAppendF(&s, " #%u", info->refs_); + return s; +} + +// General graph matching is exponential, essentially trying all permutations. +// The exponential algorithm can be made faster by avoiding consideration of +// impossible or unlikely matches. We can make the matching practical by eager +// matching - by looking for likely matches and commiting to them, and using the +// committed assignment as the basis for further matching. +// +// The basic eager graph-matching assignment is based on several ideas: +// +// * The strongest match will be for parts of the program that have not +// changed. If part of a program has not changed, then the number of +// references to a label will be the same, and corresponding pairs of +// adjacent labels will have the same RVA difference. +// +// * Some assignments are 'obvious' if you look at the distribution. Example: +// if both the program and the model have a label that is referred to much +// more often than the next most refered-to label, it is likely the two +// labels correspond. +// +// * If a label from the program corresponds to a label in the model, it is +// likely that the labels near the corresponding labels also match. A +// conservative way of extending the match is to assign only those labels +// which have exactly the same address offset and reference count. +// +// * If two labels correspond, then we can try to match up the references +// before and after the labels in the reference stream. For this to be +// practical, the number of references has to be small, e.g. each label has +// exactly one reference. +// + +// Note: we also tried a completely different approach: random assignment +// followed by simulated annealing. This produced similar results. The results +// were not as good for very small differences because the simulated annealing +// never quite hit the groove. And simulated annealing was several orders of +// magnitude slower. + + +// TRIE node for suffix strings in the label reference sequence. +// +// We dynamically build a trie for both the program and model, growing the trie +// as necessary. The trie node for a (possibly) empty string of label +// references contains the distribution of labels following the string. The +// roots node (for the empty string) thus contains the simple distribution of +// labels within the label reference stream. + +struct Node { + Node(LabelInfo* in_edge, Node* prev) + : in_edge_(in_edge), prev_(prev), count_(0), + in_queue_(false) { + length_ = 1 + (prev_ ? prev_->length_ : 0); + } + LabelInfo* in_edge_; // + Node* prev_; // Node at shorter length. + int count_; // Frequency of this path in Trie. + int length_; + typedef std::map<LabelInfo*, Node*> Edges; + Edges edges_; + std::vector<int> places_; // Indexes into sequence of this item. + std::list<Node*> edges_in_frequency_order; + + bool in_queue_; + bool Extended() const { return edges_.size() > 0; } + + uint32 Weight() const { + return edges_in_frequency_order.front()->count_; + } +}; + +static std::string ToString(Node* node) { + std::vector<std::string> prefix; + for (Node* n = node; n->prev_; n = n->prev_) + prefix.push_back(ToString(n->in_edge_)); + + std::string s; + s += "{"; + const char* sep = ""; + while (!prefix.empty()) { + s += sep; + sep = ","; + s += prefix.back(); + prefix.pop_back(); + } + + s += StringPrintf("%u", node->count_); + s += " @"; + s += Uint64ToString(node->edges_in_frequency_order.size()); + s += "}"; + return s; +} + +typedef std::vector<LabelInfo*> Trace; + +struct OrderNodeByCountDecreasing { + bool operator()(Node* a, Node* b) const { + if (a->count_ != b->count_) + return (a->count_) > (b->count_); + return a->places_.at(0) < b->places_.at(0); // Prefer first occuring. + } +}; + +struct OrderNodeByWeightDecreasing { + bool operator()(Node* a, Node* b) const { + // (Maybe tie-break on total count, followed by lowest assigned node indexes + // in path.) + uint32 a_weight = a->Weight(); + uint32 b_weight = b->Weight(); + if (a_weight != b_weight) + return a_weight > b_weight; + if (a->length_ != b->length_) + return a->length_ > b->length_; // Prefer longer. + return a->places_.at(0) < b->places_.at(0); // Prefer first occuring. + } +}; + +typedef std::set<Node*, OrderNodeByWeightDecreasing> NodeQueue; + +class AssignmentProblem { + public: + AssignmentProblem(const Trace& model, + const Trace& problem) + : m_trace_(model), + p_trace_(problem) { + } + + ~AssignmentProblem() { + for (size_t i = 0; i < all_nodes_.size(); ++i) + delete all_nodes_[i]; + } + + bool Solve() { + m_root_ = MakeRootNode(m_trace_); + p_root_ = MakeRootNode(p_trace_); + AddToQueue(p_root_); + + while (!worklist_.empty()) { + Node* node = *worklist_.begin(); + node->in_queue_ = false; + worklist_.erase(node); + TrySolveNode(node); + } + + ALOG1 << unsolved_.size() << " unsolved items"; + return true; + } + + private: + void AddToQueue(Node* node) { + if (node->length_ >= 10) { + ALOG3 << "Length clipped " << ToString(node->prev_); + return; + } + if (node->in_queue_) { + LOG(ERROR) << "Double add " << ToString(node); + return; + } + // just to be sure data for prioritizing is available + ExtendNode(node, p_trace_); + // SkipCommittedLabels(node); + if (node->edges_in_frequency_order.empty()) + return; + node->in_queue_ = true; + worklist_.insert(node); + } + + void SkipCommittedLabels(Node* node) { + ExtendNode(node, p_trace_); + uint32 skipped = 0; + while (!node->edges_in_frequency_order.empty() && + node->edges_in_frequency_order.front()->in_edge_->assignment_) { + ++skipped; + node->edges_in_frequency_order.pop_front(); + } + if (skipped > 0) + ALOG3 << "Skipped " << skipped << " at " << ToString(node); + } + + void TrySolveNode(Node* p_node) { + Node* front = p_node->edges_in_frequency_order.front(); + if (front->in_edge_->assignment_) { + p_node->edges_in_frequency_order.pop_front(); + AddToQueue(front); + AddToQueue(p_node); + return; + } + + // Compare frequencies of unassigned edges, and either make + // assignment(s) or move node to unsolved list + + Node* m_node = FindModelNode(p_node); + + if (m_node == NULL) { + ALOG1 << "Can't find model node"; + unsolved_.insert(p_node); + return; + } + ExtendNode(m_node, m_trace_); + + // Lets just try greedy + + SkipCommittedLabels(m_node); + if (m_node->edges_in_frequency_order.empty()) { + ALOG3 << "Punting, no elements left in model vs " + << p_node->edges_in_frequency_order.size(); + unsolved_.insert(p_node); + return; + } + Node* m_match = m_node->edges_in_frequency_order.front(); + Node* p_match = p_node->edges_in_frequency_order.front(); + + if (p_match->count_ > 1.1 * m_match->count_ || + m_match->count_ > 1.1 * p_match->count_) { + ALOG2 << "Tricky distribution " + << p_match->count_ << ":" << m_match->count_ << " " + << ToString(p_match) << " vs " << ToString(m_match); + return; + } + + m_node->edges_in_frequency_order.pop_front(); + p_node->edges_in_frequency_order.pop_front(); + + LabelInfo* p_label_info = p_match->in_edge_; + LabelInfo* m_label_info = m_match->in_edge_; + int m_index = p_label_info->label_->index_; + if (m_index != Label::kNoIndex) { + ALOG1 << "Cant use unassigned label from model " << m_index; + unsolved_.insert(p_node); + return; + } + + Assign(p_label_info, m_label_info); + + AddToQueue(p_match); // find matches within new match + AddToQueue(p_node); // and more matches within this node + } + + void Assign(LabelInfo* p_info, LabelInfo* m_info) { + AssignOne(p_info, m_info); + ALOG3 << "Assign " << ToString(p_info) << " := " << ToString(m_info); + // Now consider unassigned adjacent addresses + TryExtendAssignment(p_info, m_info); + } + + void AssignOne(LabelInfo* p_info, LabelInfo* m_info) { + p_info->label_->index_ = m_info->label_->index_; + + // Mark as assigned + m_info->assignment_ = p_info; + p_info->assignment_ = m_info; + } + + void TryExtendAssignment(LabelInfo* p_info, LabelInfo* m_info) { + RVA m_rva_base = m_info->label_->rva_; + RVA p_rva_base = p_info->label_->rva_; + + LabelInfo* m_info_next = m_info->next_addr_; + LabelInfo* p_info_next = p_info->next_addr_; + for ( ; m_info_next && p_info_next; ) { + if (m_info_next->assignment_) + break; + + RVA m_rva = m_info_next->label_->rva_; + RVA p_rva = p_info_next->label_->rva_; + + if (m_rva - m_rva_base != p_rva - p_rva_base) { + // previous label was pointing to something that is different size + break; + } + LabelInfo* m_info_next_next = m_info_next->next_addr_; + LabelInfo* p_info_next_next = p_info_next->next_addr_; + if (m_info_next_next && p_info_next_next) { + RVA m_rva_next = m_info_next_next->label_->rva_; + RVA p_rva_next = p_info_next_next->label_->rva_; + if (m_rva_next - m_rva != p_rva_next - p_rva) { + // Since following labels are no longer in address lockstep, assume + // this address has a difference. + break; + } + } + + // The label has inconsistent numbers of references, it is probably not + // the same thing. + if (m_info_next->refs_ != p_info_next->refs_) { + break; + } + + ALOG3 << " Extending assignment -> " + << ToString(p_info_next) << " := " << ToString(m_info_next); + + AssignOne(p_info_next, m_info_next); + + if (p_info_next->refs_ == m_info_next->refs_ && + p_info_next->refs_ == 1) { + TryExtendSequence(p_info_next->positions_[0], + m_info_next->positions_[0]); + TryExtendSequenceBackwards(p_info_next->positions_[0], + m_info_next->positions_[0]); + } + + p_info_next = p_info_next_next; + m_info_next = m_info_next_next; + } + + LabelInfo* m_info_prev = m_info->prev_addr_; + LabelInfo* p_info_prev = p_info->prev_addr_; + for ( ; m_info_prev && p_info_prev; ) { + if (m_info_prev->assignment_) + break; + + RVA m_rva = m_info_prev->label_->rva_; + RVA p_rva = p_info_prev->label_->rva_; + + if (m_rva - m_rva_base != p_rva - p_rva_base) { + // previous label was pointing to something that is different size + break; + } + LabelInfo* m_info_prev_prev = m_info_prev->prev_addr_; + LabelInfo* p_info_prev_prev = p_info_prev->prev_addr_; + + // The the label has inconsistent numbers of references, it is + // probably not the same thing + if (m_info_prev->refs_ != p_info_prev->refs_) { + break; + } + + AssignOne(p_info_prev, m_info_prev); + ALOG3 << " Extending assignment <- " << ToString(p_info_prev) << " := " + << ToString(m_info_prev); + + p_info_prev = p_info_prev_prev; + m_info_prev = m_info_prev_prev; + } + } + + uint32 TryExtendSequence(uint32 p_pos_start, uint32 m_pos_start) { + uint32 p_pos = p_pos_start + 1; + uint32 m_pos = m_pos_start + 1; + + while (p_pos < p_trace_.size() && m_pos < m_trace_.size()) { + LabelInfo* p_info = p_trace_[p_pos]; + LabelInfo* m_info = m_trace_[m_pos]; + + // To match, either (1) both are assigned or (2) both are unassigned. + if ((p_info->assignment_ == NULL) != (m_info->assignment_ == NULL)) + break; + + // If they are assigned, it needs to be consistent (same index). + if (p_info->assignment_ && m_info->assignment_) { + if (p_info->label_->index_ != m_info->label_->index_) + break; + ++p_pos; + ++m_pos; + continue; + } + + if (p_info->refs_ != m_info->refs_) + break; + + AssignOne(p_info, m_info); + ALOG3 << " Extending assignment seq" + << "[+" << p_pos - p_pos_start << "]" + << " -> " + << ToString(p_info) << " := " << ToString(m_info); + + ++p_pos; + ++m_pos; + } + + return p_pos - p_pos_start; + } + + uint32 TryExtendSequenceBackwards(uint32 p_pos_start, uint32 m_pos_start) { + if (p_pos_start == 0 || m_pos_start == 0) + return 0; + + uint32 p_pos = p_pos_start - 1; + uint32 m_pos = m_pos_start - 1; + + while (p_pos > 0 && m_pos > 0) { + LabelInfo* p_info = p_trace_[p_pos]; + LabelInfo* m_info = m_trace_[m_pos]; + + if ((p_info->assignment_ == NULL) != (m_info->assignment_ == NULL)) + break; + + if (p_info->assignment_ && m_info->assignment_) { + if (p_info->label_->index_ != m_info->label_->index_) + break; + --p_pos; + --m_pos; + continue; + } + + if (p_info->refs_ != m_info->refs_) + break; + + AssignOne(p_info, m_info); + ALOG3 << " Extending assignment seq" + << "[-" << p_pos_start - p_pos << "]" + << " <- " + << ToString(p_info) << " := " << ToString(m_info); + + --p_pos; + --m_pos; + } + + return p_pos - p_pos_start; + } + + Node* FindModelNode(Node* node) { + if (node->prev_ == NULL) + return m_root_; + + Node* m_parent = FindModelNode(node->prev_); + if (m_parent == NULL) { + return NULL; + } + + ExtendNode(m_parent, m_trace_); + + LabelInfo* p_label = node->in_edge_; + LabelInfo* m_label = p_label->assignment_; + if (m_label == NULL) { + ALOG1 << "Expected assigned prefix"; + return NULL; + } + + Node::Edges::iterator e = m_parent->edges_.find(m_label); + if (e == m_parent->edges_.end()) { + ALOG2 << "Expected defined edge in parent"; + return NULL; + } + + return e->second; + } + + Node* MakeRootNode(const Trace& trace) { + Node* node = new Node(NULL, NULL); + all_nodes_.push_back(node); + for (size_t i = 0; i < trace.size(); ++i) { + ++node->count_; + node->places_.push_back(i); + } + return node; + } + + void ExtendNode(Node* node, const Trace& trace) { + // Make sure trie is filled in at this node. + if (node->Extended()) + return; + for (size_t i = 0; i < node->places_.size(); ++i) { + uint32 index = node->places_.at(i); + if (index < trace.size()) { + LabelInfo* item = trace.at(index); + Node*& slot = node->edges_[item]; + if (slot == NULL) { + slot = new Node(item, node); + all_nodes_.push_back(slot); + node->edges_in_frequency_order.push_back(slot); + } + slot->places_.push_back(index + 1); + ++slot->count_; + } + } + node->edges_in_frequency_order.sort(OrderNodeByCountDecreasing()); + } + + const Trace& m_trace_; + const Trace& p_trace_; + Node* m_root_; + Node* p_root_; + + NodeQueue worklist_; + NodeQueue unsolved_; + + std::vector<Node*> all_nodes_; + + DISALLOW_COPY_AND_ASSIGN(AssignmentProblem); +}; + +class GraphAdjuster : public AdjustmentMethod { + public: + GraphAdjuster() {} + ~GraphAdjuster() {} + + bool Adjust(const AssemblyProgram& model, AssemblyProgram* program) { + LOG(INFO) << "GraphAdjuster::Adjust"; + prog_ = program; + model_ = &model; + debug_label_index_gen_ = 0; + return Finish(); + } + + bool Finish() { + prog_->UnassignIndexes(); + CollectTraces(model_, &model_abs32_, &model_rel32_, true); + CollectTraces(prog_, &prog_abs32_, &prog_rel32_, false); + Solve(model_abs32_, prog_abs32_); + Solve(model_rel32_, prog_rel32_); + prog_->AssignRemainingIndexes(); + return true; + } + + private: + + void CollectTraces(const AssemblyProgram* program, Trace* abs32, Trace* rel32, + bool is_model) { + const std::vector<Instruction*>& instructions = program->instructions(); + for (size_t i = 0; i < instructions.size(); ++i) { + Instruction* instruction = instructions.at(i); + if (Label* label = program->InstructionAbs32Label(instruction)) + ReferenceLabel(abs32, label, is_model); + if (Label* label = program->InstructionRel32Label(instruction)) + ReferenceLabel(rel32, label, is_model); + } + // TODO(sra): we could simply append all the labels in index order to + // incorporate some costing for entropy (bigger deltas) that will be + // introduced into the label address table by non-monotonic ordering. This + // would have some knock-on effects to parts of the algorithm that work on + // single-occurence labels. + } + + void Solve(const Trace& model, const Trace& problem) { + LinkLabelInfos(model); + LinkLabelInfos(problem); + AssignmentProblem a(model, problem); + a.Solve(); + } + + void LinkLabelInfos(const Trace& trace) { + typedef std::set<LabelInfo*, OrderLabelInfoByAddressAscending> Ordered; + Ordered ordered; + for (Trace::const_iterator p = trace.begin(); p != trace.end(); ++p) + ordered.insert(*p); + LabelInfo* prev = NULL; + for (Ordered::iterator p = ordered.begin(); p != ordered.end(); ++p) { + LabelInfo* curr = *p; + if (prev) prev->next_addr_ = curr; + curr->prev_addr_ = prev; + prev = curr; + + if (curr->positions_.size() != curr->refs_) + NOTREACHED(); + } + } + + void ReferenceLabel(Trace* trace, Label* label, bool is_model) { + trace->push_back(MakeLabelInfo(label, is_model, trace->size())); + } + + LabelInfo* MakeLabelInfo(Label* label, bool is_model, uint32 position) { + LabelInfo& slot = label_infos_[label]; + if (slot.label_ == NULL) { + slot.label_ = label; + slot.is_model_ = is_model; + slot.debug_index_ = ++debug_label_index_gen_; + } + slot.positions_.push_back(position); + ++slot.refs_; + return &slot; + } + + AssemblyProgram* prog_; // Program to be adjusted, owned by caller. + const AssemblyProgram* model_; // Program to be mimicked, owned by caller. + + Trace model_abs32_; + Trace model_rel32_; + Trace prog_abs32_; + Trace prog_rel32_; + + int debug_label_index_gen_; + + // Note LabelInfo is allocated inside map, so the LabelInfo lifetimes are + // managed by the map. + std::map<Label*, LabelInfo> label_infos_; + + private: + DISALLOW_COPY_AND_ASSIGN(GraphAdjuster); +}; + + +//////////////////////////////////////////////////////////////////////////////// + +void AdjustmentMethod::Destroy() { delete this; } + +AdjustmentMethod* AdjustmentMethod::MakeNullAdjustmentMethod() { + return new NullAdjustmentMethod(); +} + +AdjustmentMethod* AdjustmentMethod::MakeProductionAdjustmentMethod() { + return new GraphAdjuster(); +} + +Status Adjust(const AssemblyProgram& model, AssemblyProgram* program) { + AdjustmentMethod* method = AdjustmentMethod::MakeProductionAdjustmentMethod(); + bool ok = method->Adjust(model, program); + method->Destroy(); + if (ok) + return C_OK; + else + return C_ADJUSTMENT_FAILED; +} + +} // namespace courgette diff --git a/courgette/adjustment_method.h b/courgette/adjustment_method.h new file mode 100644 index 0000000..8ca71e8 --- /dev/null +++ b/courgette/adjustment_method.h @@ -0,0 +1,42 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COURGETTE_ADJUSTMENT_METHOD_H_ +#define COURGETTE_ADJUSTMENT_METHOD_H_ + +#include "base/basictypes.h" + +namespace courgette { + +class AssemblyProgram; + +class AdjustmentMethod { + public: + // Factory methods for making adjusters. + + // Returns the adjustment method used in production. + static AdjustmentMethod* MakeProductionAdjustmentMethod(); + + // Returns and adjustement method that makes no adjustments. + static AdjustmentMethod* MakeNullAdjustmentMethod(); + + + // Adjusts |program| to increase similarity to |model|. |program| can be + // changed in any way provided that it still produces the same output when + // assembled. + virtual bool Adjust(const AssemblyProgram& model, + AssemblyProgram* program) = 0; + + // Deletes 'this' adjustment method. + virtual void Destroy(); + + protected: + AdjustmentMethod() {} + virtual ~AdjustmentMethod() {} + + DISALLOW_COPY_AND_ASSIGN(AdjustmentMethod); +}; + +} // namespace courgette +#endif // COURGETTE_ADJUSTMENT_METHOD_H_ diff --git a/courgette/adjustment_method_unittest.cc b/courgette/adjustment_method_unittest.cc new file mode 100644 index 0000000..661b23d --- /dev/null +++ b/courgette/adjustment_method_unittest.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <string> + +#include "base/path_service.h" +#include "base/file_util.h" +#include "base/string_util.h" + +#include "courgette/assembly_program.h" +#include "courgette/courgette.h" +#include "courgette/streams.h" + +#include "testing/gtest/include/gtest/gtest.h" + +class AdjustmentMethodTest : public testing::Test { + public: + void Test1() const; + + private: + void SetUp() { + } + + void TearDown() { + } + + // Returns one of two similar a simple programs. They differ only in the + // label assignment, so that it is possible to make them look identical. + courgette::AssemblyProgram* MakeProgram(int kind) const { + courgette::AssemblyProgram* prog = new courgette::AssemblyProgram(); + prog->set_image_base(0x00400000); + + courgette::Label* labelA = prog->FindOrMakeAbs32Label(0x00410000); + courgette::Label* labelB = prog->FindOrMakeAbs32Label(0x00410004); + + prog->EmitAbs32(labelA); + prog->EmitAbs32(labelA); + prog->EmitAbs32(labelB); + prog->EmitAbs32(labelA); + prog->EmitAbs32(labelA); + prog->EmitAbs32(labelB); + + if (kind == 0) { + labelA->index_ = 0; + labelB->index_ = 1; + } else { + labelA->index_ = 1; + labelB->index_ = 0; + } + prog->AssignRemainingIndexes(); + + return prog; + } + + courgette::AssemblyProgram* MakeProgramA() const { return MakeProgram(0); } + courgette::AssemblyProgram* MakeProgramB() const { return MakeProgram(1); } + + // Returns a string that is the serialized version of |program|. + // Deletes |program|. + std::string Serialize(courgette::AssemblyProgram *program) const { + courgette::EncodedProgram* encoded = NULL; + + const courgette::Status encode_status = Encode(program, &encoded); + EXPECT_EQ(courgette::C_OK, encode_status); + + DeleteAssemblyProgram(program); + + courgette::SinkStreamSet sinks; + const courgette::Status write_status = WriteEncodedProgram(encoded, &sinks); + EXPECT_EQ(courgette::C_OK, write_status); + + DeleteEncodedProgram(encoded); + + courgette::SinkStream sink; + bool can_collect = sinks.CopyTo(&sink); + EXPECT_TRUE(can_collect); + + return std::string(reinterpret_cast<const char *>(sink.Buffer()), + sink.Length()); + } +}; + + +void AdjustmentMethodTest::Test1() const { + courgette::AssemblyProgram* prog1 = MakeProgramA(); + courgette::AssemblyProgram* prog2 = MakeProgramB(); + std::string s1 = Serialize(prog1); + std::string s2 = Serialize(prog2); + + // Don't use EXPECT_EQ because strings are unprintable. + EXPECT_FALSE(s1 == s2); // Unadjusted A and B differ. + + courgette::AssemblyProgram* prog5 = MakeProgramA(); + courgette::AssemblyProgram* prog6 = MakeProgramB(); + courgette::Status can_adjust = Adjust(*prog5, prog6); + EXPECT_EQ(courgette::C_OK, can_adjust); + std::string s5 = Serialize(prog5); + std::string s6 = Serialize(prog6); + + EXPECT_TRUE(s1 == s5); // Adjustment did not change A (prog5) + EXPECT_TRUE(s5 == s6); // Adjustment did change B into A +} + + +TEST_F(AdjustmentMethodTest, All) { + Test1(); +} diff --git a/courgette/assembly_program.cc b/courgette/assembly_program.cc new file mode 100644 index 0000000..e40f38f --- /dev/null +++ b/courgette/assembly_program.cc @@ -0,0 +1,372 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "courgette/assembly_program.h" + +#include <memory.h> +#include <algorithm> +#include <map> +#include <set> +#include <sstream> +#include <vector> + +#include "base/logging.h" + +#include "courgette/courgette.h" +#include "courgette/encoded_program.h" + +namespace courgette { + +// Opcodes of simple assembly language +enum OP { + ORIGIN, // ORIGIN <rva> - set current address for assembly. + MAKERELOCS, // Generates a base relocation table. + DEFBYTE, // DEFBYTE <value> - emit a byte literal. + REL32, // REL32 <label> - emit a rel32 encoded reference to 'label'. + ABS32, // REL32 <label> - emit am abs32 encoded reference to 'label'. + LAST_OP +}; + +// Base class for instructions. Because we have so many instructions we want to +// keep them as small as possible. For this reason we avoid virtual functions. +// +class Instruction { + public: + OP op() const { return static_cast<OP>(op_); } + + protected: + explicit Instruction(OP op) : op_(op), info_(0) {} + Instruction(OP op, unsigned int info) : op_(op), info_(info) {} + + uint32 op_ : 4; // A few bits to store the OP code. + uint32 info_ : 28; // Remaining bits in first word available to subclass. + + private: + DISALLOW_COPY_AND_ASSIGN(Instruction); +}; + +namespace { + +// Sets the current address for the emitting instructions. +class OriginInstruction : public Instruction { + public: + explicit OriginInstruction(RVA rva) : Instruction(ORIGIN, 0), rva_(rva) {} + RVA origin_rva() const { return rva_; } + private: + RVA rva_; +}; + +// Emits an entire base relocation table. +class MakeRelocsInstruction : public Instruction { + public: + MakeRelocsInstruction() : Instruction(MAKERELOCS) {} +}; + +// Emits a single byte. +class ByteInstruction : public Instruction { + public: + explicit ByteInstruction(uint8 value) : Instruction(DEFBYTE, value) {} + uint8 byte_value() const { return info_; } +}; + +// A ABS32 to REL32 instruction emits a reference to a label's address. +class InstructionWithLabel : public Instruction { + public: + InstructionWithLabel(OP op, Label* label) + : Instruction(op, 0), label_(label) { + if (label == NULL) NOTREACHED(); + } + Label* label() const { return label_; } + private: + Label* label_; +}; + +} // namespace + +AssemblyProgram::AssemblyProgram() + : image_base_(0), + byte_instruction_cache_(NULL) { +} + +static void DeleteContainedLabels(const RVAToLabel& labels) { + for (RVAToLabel::const_iterator p = labels.begin(); p != labels.end(); ++p) + delete p->second; +} + +AssemblyProgram::~AssemblyProgram() { + for (size_t i = 0; i < instructions_.size(); ++i) { + Instruction* instruction = instructions_[i]; + if (instruction->op() != DEFBYTE) // Will be in byte_instruction_cache_. + delete instruction; + } + if (byte_instruction_cache_) { + for (size_t i = 0; i < 256; ++i) + delete byte_instruction_cache_[i]; + delete[] byte_instruction_cache_; + } + DeleteContainedLabels(rel32_labels_); + DeleteContainedLabels(abs32_labels_); +} + +void AssemblyProgram::EmitMakeRelocsInstruction() { + Emit(new MakeRelocsInstruction()); +} + +void AssemblyProgram::EmitOriginInstruction(RVA rva) { + Emit(new OriginInstruction(rva)); +} + +void AssemblyProgram::EmitByteInstruction(uint8 byte) { + Emit(GetByteInstruction(byte)); +} + +void AssemblyProgram::EmitRel32(Label* label) { + Emit(new InstructionWithLabel(REL32, label)); +} + +void AssemblyProgram::EmitAbs32(Label* label) { + Emit(new InstructionWithLabel(ABS32, label)); +} + +Label* AssemblyProgram::FindOrMakeAbs32Label(RVA rva) { + return FindLabel(rva, &abs32_labels_); +} + +Label* AssemblyProgram::FindOrMakeRel32Label(RVA rva) { + return FindLabel(rva, &rel32_labels_); +} + +void AssemblyProgram::DefaultAssignIndexes() { + DefaultAssignIndexes(&abs32_labels_); + DefaultAssignIndexes(&rel32_labels_); +} + +void AssemblyProgram::UnassignIndexes() { + UnassignIndexes(&abs32_labels_); + UnassignIndexes(&rel32_labels_); +} + +void AssemblyProgram::AssignRemainingIndexes() { + AssignRemainingIndexes(&abs32_labels_); + AssignRemainingIndexes(&rel32_labels_); +} + +Label* AssemblyProgram::InstructionAbs32Label( + const Instruction* instruction) const { + if (instruction->op() == ABS32) + return static_cast<const InstructionWithLabel*>(instruction)->label(); + return NULL; +} + +Label* AssemblyProgram::InstructionRel32Label( + const Instruction* instruction) const { + if (instruction->op() == REL32) + return static_cast<const InstructionWithLabel*>(instruction)->label(); + return NULL; +} + +Label* AssemblyProgram::FindLabel(RVA rva, RVAToLabel* labels) { + Label*& slot = (*labels)[rva]; + if (slot == 0) { + slot = new Label(rva); + } + return slot; +} + +void AssemblyProgram::UnassignIndexes(RVAToLabel* labels) { + for (RVAToLabel::iterator p = labels->begin(); p != labels->end(); ++p) { + Label* current = p->second; + current->index_ = Label::kNoIndex; + } +} + +// DefaultAssignIndexes takes a set of labels and assigns indexes in increasing +// address order. +// +void AssemblyProgram::DefaultAssignIndexes(RVAToLabel* labels) { + int index = 0; + for (RVAToLabel::iterator p = labels->begin(); p != labels->end(); ++p) { + Label* current = p->second; + if (current->index_ != Label::kNoIndex) + NOTREACHED(); + current->index_ = index; + ++index; + } +} + +// AssignRemainingIndexes assigns indexes to any addresses (labels) that are not +// yet assigned an index. +// +void AssemblyProgram::AssignRemainingIndexes(RVAToLabel* labels) { + // An address table compresses best when each index is associated with an + // address that is slight larger than the previous index. + + // First see which indexes have not been used. The 'available' vector could + // grow even bigger, but the number of addresses is a better starting size + // than empty. + std::vector<bool> available(labels->size(), true); + int used = 0; + + for (RVAToLabel::iterator p = labels->begin(); p != labels->end(); ++p) { + size_t index = p->second->index_; + if (index != Label::kNoIndex) { + while (index >= available.size()) + available.push_back(true); + available.at(index) = false; + ++used; + } + } + + LOG(INFO) << used << " of " << labels->size() << " labels pre-assigned"; + + // Are there any unused labels that happen to be adjacent following a used + // label? + // + int fill_forward_count = 0; + Label* prev = 0; + for (RVAToLabel::iterator p = labels->begin(); p != labels->end(); ++p) { + Label* current = p->second; + if (current->index_ == Label::kNoIndex) { + size_t index = 0; + if (prev && prev->index_ != Label::kNoIndex) + index = prev->index_ + 1; + if (index < available.size() && available.at(index)) { + current->index_ = index; + available.at(index) = false; + ++fill_forward_count; + } + } + prev = current; + } + + // Are there any unused labels that happen to be adjacent preceeding a used + // label? + // + int fill_backward_count = 0; + int backward_refs = 0; + prev = 0; + for (RVAToLabel::reverse_iterator p = labels->rbegin(); + p != labels->rend(); + ++p) { + Label* current = p->second; + if (current->index_ == Label::kNoIndex) { + int prev_index; + if (prev) + prev_index = prev->index_; + else + prev_index = available.size(); + if (prev_index != 0 && + prev_index != Label::kNoIndex && + available.at(prev_index - 1)) { + current->index_ = prev_index - 1; + available.at(current->index_) = false; + ++fill_backward_count; + } + } + prev = current; + } + + // Fill in any remaining indexes + int fill_infill_count = 0; + int index = 0; + for (RVAToLabel::iterator p = labels->begin(); p != labels->end(); ++p) { + Label* current = p->second; + if (current->index_ == Label::kNoIndex) { + while (!available.at(index)) { + ++index; + } + current->index_ = index; + available.at(index) = false; + ++index; + ++fill_infill_count; + } + } + + LOG(INFO) << " fill" + << " forward " << fill_forward_count << " " + << " backward " << fill_backward_count << " " + << " infill " << fill_infill_count; +} + +typedef void (EncodedProgram::*DefineLabelMethod)(int index, RVA value); + +static void DefineLabels(const RVAToLabel& labels, + EncodedProgram* encoded_format, + DefineLabelMethod define_label) { + for (RVAToLabel::const_iterator p = labels.begin(); p != labels.end(); ++p) { + Label* label = p->second; + (encoded_format->*define_label)(label->index_, label->rva_); + } +} + +EncodedProgram* AssemblyProgram::Encode() const { + EncodedProgram* encoded = new EncodedProgram(); + + encoded->set_image_base(image_base_); + DefineLabels(abs32_labels_, encoded, &EncodedProgram::DefineAbs32Label); + DefineLabels(rel32_labels_, encoded, &EncodedProgram::DefineRel32Label); + encoded->EndLabels(); + + for (size_t i = 0; i < instructions_.size(); ++i) { + Instruction* instruction = instructions_[i]; + + switch (instruction->op()) { + case ORIGIN: { + OriginInstruction* org = static_cast<OriginInstruction*>(instruction); + encoded->AddOrigin(org->origin_rva()); + break; + } + case DEFBYTE: { + uint8 b = static_cast<ByteInstruction*>(instruction)->byte_value(); + encoded->AddCopy(1, &b); + break; + } + case REL32: { + Label* label = static_cast<InstructionWithLabel*>(instruction)->label(); + encoded->AddRel32(label->index_); + break; + } + case ABS32: { + Label* label = static_cast<InstructionWithLabel*>(instruction)->label(); + encoded->AddAbs32(label->index_); + break; + } + case MAKERELOCS: { + encoded->AddMakeRelocs(); + break; + } + default: { + NOTREACHED() << "Unknown Insn OP kind"; + } + } + } + + return encoded; +} + +Instruction* AssemblyProgram::GetByteInstruction(uint8 byte) { + if (!byte_instruction_cache_) { + byte_instruction_cache_ = new Instruction*[256]; + for (int i = 0; i < 256; ++i) { + byte_instruction_cache_[i] = new ByteInstruction(static_cast<uint8>(i)); + } + } + + return byte_instruction_cache_[byte]; +} + +//////////////////////////////////////////////////////////////////////////////// + +Status Encode(AssemblyProgram* program, EncodedProgram** output) { + *output = NULL; + EncodedProgram *encoded = program->Encode(); + if (encoded) { + *output = encoded; + return C_OK; + } else { + return C_GENERAL_ERROR; + } +} + +} // namespace courgette diff --git a/courgette/assembly_program.h b/courgette/assembly_program.h new file mode 100644 index 0000000..e726f81 --- /dev/null +++ b/courgette/assembly_program.h @@ -0,0 +1,133 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COURGETTE_ASSEMBLY_PROGRAM_H_ +#define COURGETTE_ASSEMBLY_PROGRAM_H_ + +#include <map> +#include <set> +#include <vector> + +#include "base/basictypes.h" + +#include "courgette/image_info.h" + +namespace courgette { + +class EncodedProgram; +class Instruction; + +// A Label is a symbolic reference to an address. Unlike a conventional +// assembly language, we always know the address. The address will later be +// stored in a table and the Label will be replaced with the index into the +// table. +// +// TODO(sra): Make fields private and add setters and getters. +class Label { + public: + static const int kNoIndex = -1; + Label() : rva_(0), index_(kNoIndex) {} + explicit Label(RVA rva) : rva_(rva), index_(kNoIndex) {} + + RVA rva_; // Address refered to by the label. + int index_; // Index of address in address table, kNoIndex until assigned. +}; + +typedef std::map<RVA, Label*> RVAToLabel; + +// An AssemblyProgram is the result of disassembling an executable file. +// +// * The disassembler creates labels in the AssemblyProgram and emits +// 'Instructions'. +// * The disassembler then calls DefaultAssignIndexes to assign +// addresses to positions in the address tables. +// * [Optional step] +// * At this point the AssemblyProgram can be converted into an +// EncodedProgram and serialized to an output stream. +// * Later, the EncodedProgram can be deserialized and assembled into +// the original file. +// +// The optional step is to modify the AssemblyProgram. One form of modification +// is to assign indexes in such a way as to make the EncodedProgram for this +// AssemblyProgram look more like the EncodedProgram for some other +// AssemblyProgram. The modification process should call UnassignIndexes, do +// its own assignment, and then call AssignRemainingIndexes to ensure all +// indexes are assigned. +// +class AssemblyProgram { + public: + AssemblyProgram(); + ~AssemblyProgram(); + + void set_image_base(uint64 image_base) { image_base_ = image_base; } + + // Instructions will be assembled in the order they are emitted. + + // Generates an entire base relocation table. + void EmitMakeRelocsInstruction(); + + // Following instruction will be assembled at address 'rva'. + void EmitOriginInstruction(RVA rva); + + // Generates a single byte of data or machine instruction. + void EmitByteInstruction(uint8 byte); + + // Generates 4-byte relative reference to address of 'label'. + void EmitRel32(Label* label); + + // Generates 4-byte absolute reference to address of 'label'. + void EmitAbs32(Label* label); + + Label* FindOrMakeAbs32Label(RVA rva); + Label* FindOrMakeRel32Label(RVA rva); + + void DefaultAssignIndexes(); + void UnassignIndexes(); + void AssignRemainingIndexes(); + + EncodedProgram* Encode() const; + + // Accessor for instruction list. + const std::vector<Instruction*>& instructions() const { + return instructions_; + } + + // Returns the label if the instruction contains and absolute address, + // otherwise returns NULL. + Label* InstructionAbs32Label(const Instruction* instruction) const; + + // Returns the label if the instruction contains and rel32 offset, + // otherwise returns NULL. + Label* InstructionRel32Label(const Instruction* instruction) const; + + + private: + void Emit(Instruction* instruction) { instructions_.push_back(instruction); } + + Label* FindLabel(RVA rva, RVAToLabel* labels); + + // Helper methods for the public versions. + static void UnassignIndexes(RVAToLabel* labels); + static void DefaultAssignIndexes(RVAToLabel* labels); + static void AssignRemainingIndexes(RVAToLabel* labels); + + // Sharing instructions that emit a single byte saves a lot of space. + Instruction* GetByteInstruction(uint8 byte); + Instruction** byte_instruction_cache_; + + uint64 image_base_; // Desired or mandated base address of image. + + std::vector<Instruction*> instructions_; // All the instructions in program. + + // These are lookup maps to find the label associated with a given address. + // We have separate label spaces for addresses referenced by rel32 labels and + // abs32 labels. This is somewhat arbitrary. + RVAToLabel rel32_labels_; + RVAToLabel abs32_labels_; + + DISALLOW_COPY_AND_ASSIGN(AssemblyProgram); +}; + +} // namespace courgette +#endif // COURGETTE_ASSEMBLY_PROGRAM_H_ diff --git a/courgette/bsdiff_memory_unittest.cc b/courgette/bsdiff_memory_unittest.cc new file mode 100644 index 0000000..97da726 --- /dev/null +++ b/courgette/bsdiff_memory_unittest.cc @@ -0,0 +1,111 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "courgette/third_party/bsdiff.h" + +#include <string> + +#include "base/file_util.h" +#include "base/path_service.h" +#include "base/string_util.h" + +#include "courgette/courgette.h" +#include "courgette/streams.h" + +#include "testing/gtest/include/gtest/gtest.h" + +class BSDiffMemoryTest : public testing::Test { + public: + std::string FileContents(const char *file_name) const; + void GenerateAndTestPatch(const std::string& a, const std::string& b) const; + + private: + void SetUp() { + PathService::Get(base::DIR_SOURCE_ROOT, &test_dir_); + file_util::AppendToPath(&test_dir_, L"courgette"); + file_util::AppendToPath(&test_dir_, L"testdata"); + } + + std::wstring test_dir_; +}; + +// Reads a test file into a string. +std::string BSDiffMemoryTest::FileContents(const char *file_name) const { + std::wstring file_path = test_dir_; + file_util::AppendToPath(&file_path, UTF8ToWide(file_name)); + std::string file_bytes; + if (!file_util::ReadFileToString(file_path, &file_bytes)) { + EXPECT_TRUE(!"Could not read test data"); + } + return file_bytes; +} + +void BSDiffMemoryTest::GenerateAndTestPatch(const std::string& old_text, + const std::string& new_text) const { + courgette::SourceStream old1; + courgette::SourceStream new1; + old1.Init(old_text.c_str(), old_text.length()); + new1.Init(new_text.c_str(), new_text.length()); + + courgette::SinkStream patch1; + courgette::BSDiffStatus status = CreateBinaryPatch(&old1, &new1, &patch1); + EXPECT_EQ(courgette::OK, status); + + courgette::SourceStream old2; + courgette::SourceStream patch2; + old2.Init(old_text.c_str(), old_text.length()); + patch2.Init(patch1); + + courgette::SinkStream new2; + status = ApplyBinaryPatch(&old2, &patch2, &new2); + EXPECT_EQ(courgette::OK, status); + EXPECT_EQ(new_text.length(), new2.Length()); + EXPECT_EQ(0, memcmp(new_text.c_str(), new2.Buffer(), new_text.length())); +} + +TEST_F(BSDiffMemoryTest, TestEmpty) { + GenerateAndTestPatch("", ""); +} + +TEST_F(BSDiffMemoryTest, TestEmptyVsNonempty) { + GenerateAndTestPatch("", "xxx"); +} + +TEST_F(BSDiffMemoryTest, TestNonemptyVsEmpty) { + GenerateAndTestPatch("xxx", ""); +} + +TEST_F(BSDiffMemoryTest, TestSmallInputsWithSmallChanges) { + std::string file1 = + "I would not, could not, in a box.\n" + "I could not, would not, with a fox.\n" + "I will not eat them with a mouse.\n" + "I will not eat them in a house.\n" + "I will not eat them here or there.\n" + "I will not eat them anywhere.\n" + "I do not eat green eggs and ham.\n" + "I do not like them, Sam-I-am.\n"; + std::string file2 = + "I would not, could not, in a BOX.\n" + "I could not, would not, with a FOX.\n" + "I will not eat them with a MOUSE.\n" + "I will not eat them in a HOUSE.\n" + "I will not eat them in a HOUSE.\n" // Extra line. + "I will not eat them here or THERE.\n" + "I will not eat them ANYWHERE.\n" + "I do not eat green eggs and HAM.\n" + "I do not like them, Sam-I-am.\n"; + GenerateAndTestPatch(file1, file2); +} + +TEST_F(BSDiffMemoryTest, TestIndenticalDlls) { + std::string file1 = FileContents("en-US.dll"); + GenerateAndTestPatch(file1, file1); +} + +TEST_F(BSDiffMemoryTest, TestDifferentExes) { + std::string file1 = FileContents("setup1.exe"); + std::string file2 = FileContents("setup2.exe"); + GenerateAndTestPatch(file1, file2); +} diff --git a/courgette/courgette.gyp b/courgette/courgette.gyp new file mode 100644 index 0000000..03de462 --- /dev/null +++ b/courgette/courgette.gyp @@ -0,0 +1,111 @@ +# Copyright (c) 2009 The Chromium Authors. All rights reserved. +# Use of this source code is governed by a BSD-style license that can be +# found in the LICENSE file. + +{ + 'variables': { + 'chromium_code': 1, + }, + 'includes': [ + '../build/common.gypi', + ], + 'targets': [ + { + 'target_name': 'courgette_lib', + 'type': '<(library)', + 'dependencies': [ + '../base/base.gyp:base', + #'../third_party/lzma_sdk/7z_C.vcproj', + ], + 'msvs_guid': '9A72A362-E617-4205-B9F2-43C6FB280FA1', + 'sources': [ + 'adjustment_method.cc', + 'adjustment_method.h', + 'assembly_program.cc', + 'assembly_program.h', + 'third_party/bsdiff.h', + 'third_party/bsdiff_apply.cc', + 'third_party/bsdiff_create.cc', + 'courgette.h', + 'crc.cc', + 'crc.h', + 'difference_estimator.cc', + 'difference_estimator.h', + 'disassembler.cc', + 'disassembler.h', + 'encoded_program.cc', + 'encoded_program.h', + 'ensemble.cc', + 'ensemble.h', + 'ensemble_apply.cc', + 'ensemble_create.cc', + 'image_info.cc', + 'image_info.h', + 'region.h', + 'simple_delta.cc', + 'simple_delta.h', + 'streams.cc', + 'streams.h', + 'win32_x86_generator.h', + 'win32_x86_patcher.h', + ], + }, + { + 'target_name': 'courgette', + 'type': 'executable', + 'msvs_guid': '4EA8CE12-9C6F-45E5-9D08-720383FE3685', + 'sources': [ + 'courgette_tool.cc', + ], + 'dependencies': [ + 'courgette_lib', + '../base/base.gyp:base', + ], + }, + { + 'target_name': 'courgette_minimal_tool', + 'type': 'executable', + 'msvs_guid': 'EB79415F-2F17-4BDC-AADD-4CA4C2D21B73', + 'sources': [ + 'courgette_minimal_tool.cc', + ], + 'dependencies': [ + 'courgette_lib', + '../base/base.gyp:base', + ], + }, + { + 'target_name': 'courgette_unittests', + 'type': 'executable', + 'msvs_guid': '24309F1A-4035-46F9-A3D8-F47DC4BCC2B8', + 'sources': [ + 'adjustment_method_unittest.cc', + 'bsdiff_memory_unittest.cc', + 'difference_estimator_unittest.cc', + 'encoded_program_unittest.cc', + 'encode_decode_unittest.cc', + 'image_info_unittest.cc', + 'run_all_unittests.cc', + 'streams_unittest.cc', + ], + 'dependencies': [ + 'courgette_lib', + '../base/base.gyp:base', + '../testing/gtest.gyp:gtest', + ], + }, + { + 'target_name': 'courgette_fuzz', + 'type': 'executable', + 'msvs_guid': '57C27529-8CA9-4FC3-9C02-DA05B172F785', + 'sources': [ + 'encoded_program_fuzz_unittest.cc', + ], + 'dependencies': [ + 'courgette_lib', + '../base/base.gyp:base', + '../testing/gtest.gyp:gtest', + ], + }, + ], +} diff --git a/courgette/courgette.h b/courgette/courgette.h new file mode 100644 index 0000000..0ddded5 --- /dev/null +++ b/courgette/courgette.h @@ -0,0 +1,118 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COURGETTE_COURGETTE_H_ +#define COURGETTE_COURGETTE_H_ + +namespace courgette { + +// Status codes for Courgette APIs. +// +// Client code should only rely on the distintion between C_OK and the other +// status codes. +// +enum Status { + C_OK = 1, // Successful operation. + + C_GENERAL_ERROR = 2, // Error other than listed below. + + C_READ_OPEN_ERROR = 3, // Could not open input file for reading. + C_READ_ERROR = 4, // Could not read from opened input file. + + C_WRITE_OPEN_ERROR = 3, // Could not open output file for writing. + C_WRITE_ERROR = 4, // Could not write to opened output file. + + C_BAD_ENSEMBLE_MAGIC = 5, // Ensemble patch has bad magic. + C_BAD_ENSEMBLE_VERSION = 6, // Ensemble patch has wrong version. + C_BAD_ENSEMBLE_HEADER = 7, // Ensemble patch has corrupt header. + C_BAD_ENSEMBLE_CRC = 8, // Ensemble patch has corrupt header. + + C_BAD_TRANSFORM = 12, // Transform mis-specified. + C_BAD_BASE = 13, // Base for transform malformed. + + C_BINARY_DIFF_CRC_ERROR = 14, // Internal diff input doesn't have expected + // CRC. + + // Internal errors. + C_STREAM_ERROR = 20, // Unexpected error from streams.h. + C_STREAM_NOT_CONSUMED = 21, // Stream has extra data, is expected to be + // used up. + C_SERIALIZATION_FAILED = 22, // + C_DESERIALIZATION_FAILED = 23, // + C_INPUT_NOT_RECOGNIZED = 24, // Unrecognized input (not an executable). + C_DISASSEMBLY_FAILED = 25, // + C_ASSEMBLY_FAILED = 26, // + C_ADJUSTMENT_FAILED = 27, // + + +}; + +class SinkStream; +class SinkStreamSet; +class SourceStream; +class SourceStreamSet; + +class AssemblyProgram; +class EncodedProgram; + +// Applies the patch to the bytes in |old| and writes the transformed ensemble +// to |output|. +// Returns C_OK unless something went wrong. +Status ApplyEnsemblePatch(SourceStream* old, SourceStream* patch, + SinkStream* output); + +// Applies the patch in |patch_file_name| to the bytes in |old_file_name| and +// writes the transformed ensemble to |new_file_name|. +// Returns C_OK unless something went wrong. +// This function first validates that the patch file has a proper header, so the +// function can be used to 'try' a patch. +Status ApplyEnsemblePatch(const wchar_t* old_file_name, + const wchar_t* patch_file_name, + const wchar_t* new_file_name); + +// Generates a patch that will transform the bytes in |old| into the bytes in +// |target|. +// Returns C_OK unless something when wrong (unexpected). +Status GenerateEnsemblePatch(SourceStream* old, SourceStream* target, + SinkStream* patch); + +// Parses a Windows 32-bit 'Portable Executable' format file from memory, +// storing the pointer to the AssemblyProgram in |*output|. +// Returns C_OK if successful, otherwise returns an error status and sets +// |*output| to NULL. +Status ParseWin32X86PE(const void* buffer, size_t length, + AssemblyProgram** output); + +// Converts |program| into encoded form, returning it as |*output|. +// Returns C_OK if succeeded, otherwise returns an error status and +// sets |*output| to NULL +Status Encode(AssemblyProgram* program, EncodedProgram** output); + + +// Serializes |encoded| into the stream set. +// Returns C_OK if succeeded, otherwise returns an error status. +Status WriteEncodedProgram(EncodedProgram* encoded, SinkStreamSet* sink); + +// Assembles |encoded|, emitting the bytes into |buffer|. +// Returns C_OK if succeeded, otherwise returns an error status and leaves +// |buffer| in an undefined state. +Status Assemble(EncodedProgram* encoded, SinkStream* buffer); + +// Deserializes program from the stream set. +// Returns C_OK if succeeded, otherwise returns an error status and +// sets |*output| to NULL +Status ReadEncodedProgram(SourceStreamSet* source, EncodedProgram** output); + +// Used to free an AssemblyProgram returned by other APIs. +void DeleteAssemblyProgram(AssemblyProgram* program); + +// Used to free an EncodedProgram returned by other APIs. +void DeleteEncodedProgram(EncodedProgram* encoded); + +// Adjusts |program| to look more like |model|. +// +Status Adjust(const AssemblyProgram& model, AssemblyProgram *program); + +} // namespace courgette +#endif // COURGETTE_COURGETTE_H_ diff --git a/courgette/courgette.sln b/courgette/courgette.sln new file mode 100644 index 0000000..38e1f59 --- /dev/null +++ b/courgette/courgette.sln @@ -0,0 +1,111 @@ +Microsoft Visual Studio Solution File, Format Version 9.00
+# Visual Studio 2005
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "courgette_lib", "courgette_lib.vcproj", "{9A72A362-E617-4205-B9F2-43C6FB280FA1}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "courgette_unittests", "courgette_unittests.vcproj", "{24309F1A-4035-46F9-A3D8-F47DC4BCC2B8}"
+ ProjectSection(ProjectDependencies) = postProject
+ {9A72A362-E617-4205-B9F2-43C6FB280FA1} = {9A72A362-E617-4205-B9F2-43C6FB280FA1}
+ {1832A374-8A74-4F9E-B536-69A699B3E165} = {1832A374-8A74-4F9E-B536-69A699B3E165}
+ {BFE8E2A7-3B3B-43B0-A994-3058B852DB8B} = {BFE8E2A7-3B3B-43B0-A994-3058B852DB8B}
+ {F22022F0-2D3B-5610-4E80-C674A8E01C5A} = {F22022F0-2D3B-5610-4E80-C674A8E01C5A}
+ {8C27D792-2648-4F5E-9ED0-374276327308} = {8C27D792-2648-4F5E-9ED0-374276327308}
+ EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "courgette_fuzz", "courgette_fuzz.vcproj", "{57C27529-8CA9-4FC3-9C02-DA05B172F785}"
+ ProjectSection(ProjectDependencies) = postProject
+ {9A72A362-E617-4205-B9F2-43C6FB280FA1} = {9A72A362-E617-4205-B9F2-43C6FB280FA1}
+ {1832A374-8A74-4F9E-B536-69A699B3E165} = {1832A374-8A74-4F9E-B536-69A699B3E165}
+ {BFE8E2A7-3B3B-43B0-A994-3058B852DB8B} = {BFE8E2A7-3B3B-43B0-A994-3058B852DB8B}
+ {F22022F0-2D3B-5610-4E80-C674A8E01C5A} = {F22022F0-2D3B-5610-4E80-C674A8E01C5A}
+ {8C27D792-2648-4F5E-9ED0-374276327308} = {8C27D792-2648-4F5E-9ED0-374276327308}
+ EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "courgette_minimal_tool", "courgette_minimal_tool.vcproj", "{EB79415F-2F17-4BDC-AADD-4CA4C2D21B73}"
+ ProjectSection(ProjectDependencies) = postProject
+ {9A72A362-E617-4205-B9F2-43C6FB280FA1} = {9A72A362-E617-4205-B9F2-43C6FB280FA1}
+ {1832A374-8A74-4F9E-B536-69A699B3E165} = {1832A374-8A74-4F9E-B536-69A699B3E165}
+ {F22022F0-2D3B-5610-4E80-C674A8E01C5A} = {F22022F0-2D3B-5610-4E80-C674A8E01C5A}
+ {8C27D792-2648-4F5E-9ED0-374276327308} = {8C27D792-2648-4F5E-9ED0-374276327308}
+ EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "courgette", "courgette.vcproj", "{4EA8CE12-9C6F-45E5-9D08-720383FE3685}"
+ ProjectSection(ProjectDependencies) = postProject
+ {9A72A362-E617-4205-B9F2-43C6FB280FA1} = {9A72A362-E617-4205-B9F2-43C6FB280FA1}
+ {1832A374-8A74-4F9E-B536-69A699B3E165} = {1832A374-8A74-4F9E-B536-69A699B3E165}
+ {F22022F0-2D3B-5610-4E80-C674A8E01C5A} = {F22022F0-2D3B-5610-4E80-C674A8E01C5A}
+ {8C27D792-2648-4F5E-9ED0-374276327308} = {8C27D792-2648-4F5E-9ED0-374276327308}
+ EndProjectSection
+EndProject
+Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "dependencies", "dependencies", "{3C050794-ECB2-85EB-D650-18D00BF2A989}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "icuuc", "..\third_party\icu38\icuuc.vcproj", "{8C27D792-2648-4F5E-9ED0-374276327308}"
+ ProjectSection(ProjectDependencies) = postProject
+ {A0D94973-D355-47A5-A1E2-3456F321F010} = {A0D94973-D355-47A5-A1E2-3456F321F010}
+ EndProjectSection
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "icui18n", "..\third_party\icu38\icui18n.vcproj", "{F22022F0-2D3B-5610-4E80-C674A8E01C5A}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "icudata", "..\third_party\icu38\icudata.vcproj", "{A0D94973-D355-47A5-A1E2-3456F321F010}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "base", "..\base\base.vcproj", "{1832A374-8A74-4F9E-B536-69A699B3E165}"
+EndProject
+Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "gtest", "..\testing\gtest.vcproj", "{BFE8E2A7-3B3B-43B0-A994-3058B852DB8B}"
+EndProject
+Global
+ GlobalSection(SolutionConfigurationPlatforms) = preSolution
+ Debug|Win32 = Debug|Win32
+ Release|Win32 = Release|Win32
+ EndGlobalSection
+ GlobalSection(ProjectConfigurationPlatforms) = postSolution
+ {1832A374-8A74-4F9E-B536-69A699B3E165}.Debug|Win32.ActiveCfg = Debug|Win32
+ {1832A374-8A74-4F9E-B536-69A699B3E165}.Debug|Win32.Build.0 = Debug|Win32
+ {1832A374-8A74-4F9E-B536-69A699B3E165}.Release|Win32.ActiveCfg = Release|Win32
+ {1832A374-8A74-4F9E-B536-69A699B3E165}.Release|Win32.Build.0 = Release|Win32
+ {24309F1A-4035-46F9-A3D8-F47DC4BCC2B8}.Debug|Win32.ActiveCfg = Debug|Win32
+ {24309F1A-4035-46F9-A3D8-F47DC4BCC2B8}.Debug|Win32.Build.0 = Debug|Win32
+ {24309F1A-4035-46F9-A3D8-F47DC4BCC2B8}.Release|Win32.ActiveCfg = Release|Win32
+ {24309F1A-4035-46F9-A3D8-F47DC4BCC2B8}.Release|Win32.Build.0 = Release|Win32
+ {4EA8CE12-9C6F-45E5-9D08-720383FE3685}.Debug|Win32.ActiveCfg = Debug|Win32
+ {4EA8CE12-9C6F-45E5-9D08-720383FE3685}.Debug|Win32.Build.0 = Debug|Win32
+ {4EA8CE12-9C6F-45E5-9D08-720383FE3685}.Release|Win32.ActiveCfg = Release|Win32
+ {4EA8CE12-9C6F-45E5-9D08-720383FE3685}.Release|Win32.Build.0 = Release|Win32
+ {57C27529-8CA9-4FC3-9C02-DA05B172F785}.Debug|Win32.ActiveCfg = Debug|Win32
+ {57C27529-8CA9-4FC3-9C02-DA05B172F785}.Debug|Win32.Build.0 = Debug|Win32
+ {57C27529-8CA9-4FC3-9C02-DA05B172F785}.Release|Win32.ActiveCfg = Release|Win32
+ {57C27529-8CA9-4FC3-9C02-DA05B172F785}.Release|Win32.Build.0 = Release|Win32
+ {8C27D792-2648-4F5E-9ED0-374276327308}.Debug|Win32.ActiveCfg = Debug|Win32
+ {8C27D792-2648-4F5E-9ED0-374276327308}.Debug|Win32.Build.0 = Debug|Win32
+ {8C27D792-2648-4F5E-9ED0-374276327308}.Release|Win32.ActiveCfg = Release|Win32
+ {8C27D792-2648-4F5E-9ED0-374276327308}.Release|Win32.Build.0 = Release|Win32
+ {9A72A362-E617-4205-B9F2-43C6FB280FA1}.Debug|Win32.ActiveCfg = Debug|Win32
+ {9A72A362-E617-4205-B9F2-43C6FB280FA1}.Debug|Win32.Build.0 = Debug|Win32
+ {9A72A362-E617-4205-B9F2-43C6FB280FA1}.Release|Win32.ActiveCfg = Release|Win32
+ {9A72A362-E617-4205-B9F2-43C6FB280FA1}.Release|Win32.Build.0 = Release|Win32
+ {A0D94973-D355-47A5-A1E2-3456F321F010}.Debug|Win32.ActiveCfg = Debug|Win32
+ {A0D94973-D355-47A5-A1E2-3456F321F010}.Debug|Win32.Build.0 = Debug|Win32
+ {A0D94973-D355-47A5-A1E2-3456F321F010}.Release|Win32.ActiveCfg = Release|Win32
+ {A0D94973-D355-47A5-A1E2-3456F321F010}.Release|Win32.Build.0 = Release|Win32
+ {BFE8E2A7-3B3B-43B0-A994-3058B852DB8B}.Debug|Win32.ActiveCfg = Debug|Win32
+ {BFE8E2A7-3B3B-43B0-A994-3058B852DB8B}.Debug|Win32.Build.0 = Debug|Win32
+ {BFE8E2A7-3B3B-43B0-A994-3058B852DB8B}.Release|Win32.ActiveCfg = Release|Win32
+ {BFE8E2A7-3B3B-43B0-A994-3058B852DB8B}.Release|Win32.Build.0 = Release|Win32
+ {EB79415F-2F17-4BDC-AADD-4CA4C2D21B73}.Debug|Win32.ActiveCfg = Debug|Win32
+ {EB79415F-2F17-4BDC-AADD-4CA4C2D21B73}.Debug|Win32.Build.0 = Debug|Win32
+ {EB79415F-2F17-4BDC-AADD-4CA4C2D21B73}.Release|Win32.ActiveCfg = Release|Win32
+ {EB79415F-2F17-4BDC-AADD-4CA4C2D21B73}.Release|Win32.Build.0 = Release|Win32
+ {F22022F0-2D3B-5610-4E80-C674A8E01C5A}.Debug|Win32.ActiveCfg = Debug|Win32
+ {F22022F0-2D3B-5610-4E80-C674A8E01C5A}.Debug|Win32.Build.0 = Debug|Win32
+ {F22022F0-2D3B-5610-4E80-C674A8E01C5A}.Release|Win32.ActiveCfg = Release|Win32
+ {F22022F0-2D3B-5610-4E80-C674A8E01C5A}.Release|Win32.Build.0 = Release|Win32
+ EndGlobalSection
+ GlobalSection(SolutionProperties) = preSolution
+ HideSolutionNode = FALSE
+ EndGlobalSection
+ GlobalSection(NestedProjects) = preSolution
+ {8C27D792-2648-4F5E-9ED0-374276327308} = {3C050794-ECB2-85EB-D650-18D00BF2A989}
+ {F22022F0-2D3B-5610-4E80-C674A8E01C5A} = {3C050794-ECB2-85EB-D650-18D00BF2A989}
+ {A0D94973-D355-47A5-A1E2-3456F321F010} = {3C050794-ECB2-85EB-D650-18D00BF2A989}
+ {1832A374-8A74-4F9E-B536-69A699B3E165} = {3C050794-ECB2-85EB-D650-18D00BF2A989}
+ {BFE8E2A7-3B3B-43B0-A994-3058B852DB8B} = {3C050794-ECB2-85EB-D650-18D00BF2A989}
+ EndGlobalSection
+EndGlobal
diff --git a/courgette/courgette_minimal_tool.cc b/courgette/courgette_minimal_tool.cc new file mode 100644 index 0000000..84de068 --- /dev/null +++ b/courgette/courgette_minimal_tool.cc @@ -0,0 +1,50 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// 'courgette_minimal_tool' is not meant to be a serious command-line tool. It +// has the minimum logic to apply a Courgette patch to a file. The main purpose +// is to monitor the code size of the patcher. + +#include <string> + +#include "base/file_util.h" + +#include "courgette/third_party/bsdiff.h" +#include "courgette/courgette.h" +#include "courgette/streams.h" + +void PrintHelp() { + fprintf(stderr, + "Usage:\n" + " courgette_minimal_tool <old-file-input> <patch-file-input>" + " <new-file-output>\n" + "\n"); +} + +void UsageProblem(const char* message) { + fprintf(stderr, "%s\n", message); + PrintHelp(); + exit(1); +} + +void Problem(const char* message) { + fprintf(stderr, "%s\n", message); + exit(1); +} + +int wmain(int argc, const wchar_t* argv[]) { + if (argc != 4) + UsageProblem("bad args"); + + courgette::Status status = + courgette::ApplyEnsemblePatch(argv[1], argv[2], argv[3]); + + if (status != courgette::C_OK) { + if (status == courgette::C_READ_OPEN_ERROR) Problem("Can't open file."); + if (status == courgette::C_WRITE_OPEN_ERROR) Problem("Can't open file."); + if (status == courgette::C_READ_ERROR) Problem("Can't read from file."); + if (status == courgette::C_WRITE_ERROR) Problem("Can't write to file."); + Problem("patch failed."); + } +} diff --git a/courgette/courgette_tool.cc b/courgette/courgette_tool.cc new file mode 100644 index 0000000..c07f2da --- /dev/null +++ b/courgette/courgette_tool.cc @@ -0,0 +1,404 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <vector> +#include <string> + +#include "base/at_exit.h" +#include "base/basictypes.h" +#include "base/command_line.h" +#include "base/file_util.h" +#include "base/logging.h" +#include "base/string_util.h" + +#include "courgette/third_party/bsdiff.h" +#include "courgette/courgette.h" +#include "courgette/streams.h" + + +void PrintHelp() { + fprintf(stderr, + "Usage:\n" + " courgette -dis <executable_file> <binary_assembly_file>\n" + " courgette -asm <binary_assembly_file> <executable_file>\n" + " courgette -disadj <executable_file> <reference> <binary_assembly_file>\n" + " courgette -gen <v1> <v2> <patch>\n" + " courgette -apply <v1> <patch> <v2>\n" + "\n"); +} + +void UsageProblem(const char* message) { + fprintf(stderr, "%s", message); + fprintf(stderr, "\n"); + PrintHelp(); + exit(1); +} + +void Problem(const char* format, ...) { + va_list args; + va_start(args, format); + vfprintf(stderr, format, args); + fprintf(stderr, "\n"); + va_end(args); + exit(1); +} + +std::string ReadOrFail(const std::wstring& file_name, const char* kind) { + FilePath file_path(file_name); + std::string buffer; + if (!file_util::ReadFileToString(file_path, &buffer)) + Problem("Can't read %s file.", kind); + return buffer; +} + +void WriteSinkToFile(const courgette::SinkStream *sink, + const std::wstring& output_file) { + FilePath output_path(output_file); + int count = + file_util::WriteFile(output_path, + reinterpret_cast<const char*>(sink->Buffer()), + sink->Length()); + if (count == -1) + Problem("Cant write output."); + if (count != sink->Length()) + Problem("Incomplete write."); +} + +void Disassemble(const std::wstring& input_file, + const std::wstring& output_file) { + std::string buffer = ReadOrFail(input_file, "input"); + + courgette::AssemblyProgram* program = NULL; + const courgette::Status parse_status = + courgette::ParseWin32X86PE(buffer.c_str(), buffer.length(), &program); + + if (parse_status != courgette::C_OK) + Problem("Can't parse input."); + + courgette::EncodedProgram* encoded = NULL; + const courgette::Status encode_status = Encode(program, &encoded); + + courgette::DeleteAssemblyProgram(program); + + if (encode_status != courgette::C_OK) + Problem("Can't encode program."); + + courgette::SinkStreamSet sinks; + + const courgette::Status write_status = + courgette::WriteEncodedProgram(encoded, &sinks); + if (write_status != courgette::C_OK) + Problem("Can't serialize encoded program."); + + courgette::DeleteEncodedProgram(encoded); + + courgette::SinkStream sink; + sinks.CopyTo(&sink); + + WriteSinkToFile(&sink, output_file); +} + +void DisassembleAndAdjust(const std::wstring& program_file, + const std::wstring& model_file, + const std::wstring& output_file) { + std::string program_buffer = ReadOrFail(program_file, "program"); + std::string model_buffer = ReadOrFail(model_file, "reference"); + + courgette::AssemblyProgram* program = NULL; + const courgette::Status parse_program_status = + courgette::ParseWin32X86PE(program_buffer.c_str(), + program_buffer.length(), + &program); + if (parse_program_status != courgette::C_OK) + Problem("Can't parse program input."); + + courgette::AssemblyProgram* model = NULL; + const courgette::Status parse_model_status = + courgette::ParseWin32X86PE(model_buffer.c_str(), + model_buffer.length(), + &model); + if (parse_model_status != courgette::C_OK) + Problem("Can't parse model input."); + + const courgette::Status adjust_status = Adjust(*model, program); + if (adjust_status != courgette::C_OK) + Problem("Can't adjust program."); + + courgette::EncodedProgram* encoded = NULL; + const courgette::Status encode_status = Encode(program, &encoded); + + courgette::DeleteAssemblyProgram(program); + + if (encode_status != courgette::C_OK) + Problem("Can't encode program."); + + courgette::SinkStreamSet sinks; + + const courgette::Status write_status = + courgette::WriteEncodedProgram(encoded, &sinks); + if (write_status != courgette::C_OK) + Problem("Can't serialize encoded program."); + + courgette::DeleteEncodedProgram(encoded); + + courgette::SinkStream sink; + sinks.CopyTo(&sink); + + WriteSinkToFile(&sink, output_file); +} + +// Diffs two executable files, write a set of files for the diff, one file per +// stream of the EncodedProgram format. Each file is the bsdiff between the +// original file's stream and the new file's stream. This is completely +// uninteresting to users, but it is handy for seeing how much each which +// streams are contributing to the final file size. Adjustment is optional. +void DisassembleAdjustDiff(const std::wstring& model_file, + const std::wstring& program_file, + const std::wstring& output_file_root, + bool adjust) { + std::string model_buffer = ReadOrFail(model_file, "'old'"); + std::string program_buffer = ReadOrFail(program_file, "'new'"); + + courgette::AssemblyProgram* model = NULL; + const courgette::Status parse_model_status = + courgette::ParseWin32X86PE(model_buffer.c_str(), + model_buffer.length(), + &model); + if (parse_model_status != courgette::C_OK) + Problem("Can't parse model input."); + + courgette::AssemblyProgram* program = NULL; + const courgette::Status parse_program_status = + courgette::ParseWin32X86PE(program_buffer.c_str(), + program_buffer.length(), + &program); + if (parse_program_status != courgette::C_OK) + Problem("Can't parse program input."); + + if (adjust) { + const courgette::Status adjust_status = Adjust(*model, program); + if (adjust_status != courgette::C_OK) + Problem("Can't adjust program."); + } + + courgette::EncodedProgram* encoded_program = NULL; + const courgette::Status encode_program_status = + Encode(program, &encoded_program); + courgette::DeleteAssemblyProgram(program); + if (encode_program_status != courgette::C_OK) + Problem("Can't encode program."); + + courgette::EncodedProgram* encoded_model = NULL; + const courgette::Status encode_model_status = Encode(model, &encoded_model); + courgette::DeleteAssemblyProgram(model); + if (encode_model_status != courgette::C_OK) + Problem("Can't encode model."); + + courgette::SinkStreamSet program_sinks; + const courgette::Status write_program_status = + courgette::WriteEncodedProgram(encoded_program, &program_sinks); + if (write_program_status != courgette::C_OK) + Problem("Can't serialize encoded program."); + courgette::DeleteEncodedProgram(encoded_program); + + courgette::SinkStreamSet model_sinks; + const courgette::Status write_model_status = + courgette::WriteEncodedProgram(encoded_model, &model_sinks); + if (write_model_status != courgette::C_OK) + Problem("Can't serialize encoded model."); + courgette::DeleteEncodedProgram(encoded_program); + + for (int i = 0; ; ++i) { + courgette::SinkStream* old_stream = model_sinks.stream(i); + courgette::SinkStream* new_stream = program_sinks.stream(i); + if (old_stream == NULL && new_stream == NULL) + break; + + courgette::SourceStream old_source; + courgette::SourceStream new_source; + old_source.Init(*old_stream); + new_source.Init(*new_stream); + courgette::SinkStream patch_stream; + courgette::BSDiffStatus status = + courgette::CreateBinaryPatch(&old_source, &new_source, &patch_stream); + if (status != courgette::OK) Problem("-xxx failed."); + + WriteSinkToFile(&patch_stream, + output_file_root + L"-" + IntToWString(i)); + } +} + +void Assemble(const std::wstring& input_file, + const std::wstring& output_file) { + std::string buffer = ReadOrFail(input_file, "input"); + + courgette::SourceStreamSet sources; + if (!sources.Init(buffer.c_str(), buffer.length())) + Problem("Bad input file."); + + courgette::EncodedProgram* encoded = NULL; + const courgette::Status read_status = ReadEncodedProgram(&sources, &encoded); + if (read_status != courgette::C_OK) + Problem("Bad encoded program."); + + courgette::SinkStream sink; + + const courgette::Status assemble_status = courgette::Assemble(encoded, &sink); + if (assemble_status != courgette::C_OK) + Problem("Can't assemble."); + + WriteSinkToFile(&sink, output_file); +} + +void GenerateEnsemblePatch(const std::wstring& old_file, + const std::wstring& new_file, + const std::wstring& patch_file) { + std::string old_buffer = ReadOrFail(old_file, "'old' input"); + std::string new_buffer = ReadOrFail(new_file, "'new' input"); + + courgette::SourceStream old_stream; + courgette::SourceStream new_stream; + old_stream.Init(old_buffer); + new_stream.Init(new_buffer); + + courgette::SinkStream patch_stream; + courgette::Status status = + courgette::GenerateEnsemblePatch(&old_stream, &new_stream, &patch_stream); + + if (status != courgette::C_OK) Problem("-gen failed."); + + WriteSinkToFile(&patch_stream, patch_file); +} + +void ApplyEnsemblePatch(const std::wstring& old_file, + const std::wstring& patch_file, + const std::wstring& new_file) { + std::string old_buffer = ReadOrFail(old_file, "'old' input"); + std::string patch_buffer = ReadOrFail(patch_file, "'patch' input"); + + courgette::SourceStream old_stream; + courgette::SourceStream patch_stream; + old_stream.Init(old_buffer); + patch_stream.Init(patch_buffer); + courgette::SinkStream new_stream; + courgette::Status status = + courgette::ApplyEnsemblePatch(&old_stream, &patch_stream, &new_stream); + + if (status != courgette::C_OK) Problem("-apply failed."); + + WriteSinkToFile(&new_stream, new_file); +} + +void GenerateBSDiffPatch(const std::wstring& old_file, + const std::wstring& new_file, + const std::wstring& patch_file) { + std::string old_buffer = ReadOrFail(old_file, "'old' input"); + std::string new_buffer = ReadOrFail(new_file, "'new' input"); + + courgette::SourceStream old_stream; + courgette::SourceStream new_stream; + old_stream.Init(old_buffer); + new_stream.Init(new_buffer); + + courgette::SinkStream patch_stream; + courgette::BSDiffStatus status = + courgette::CreateBinaryPatch(&old_stream, &new_stream, &patch_stream); + + if (status != courgette::OK) Problem("-genbsdiff failed."); + + WriteSinkToFile(&patch_stream, patch_file); +} + +void ApplyBSDiffPatch(const std::wstring& old_file, + const std::wstring& patch_file, + const std::wstring& new_file) { + std::string old_buffer = ReadOrFail(old_file, "'old' input"); + std::string patch_buffer = ReadOrFail(patch_file, "'patch' input"); + + courgette::SourceStream old_stream; + courgette::SourceStream patch_stream; + old_stream.Init(old_buffer); + patch_stream.Init(patch_buffer); + + courgette::SinkStream new_stream; + courgette::BSDiffStatus status = + courgette::ApplyBinaryPatch(&old_stream, &patch_stream, &new_stream); + + if (status != courgette::OK) Problem("-applybsdiff failed."); + + WriteSinkToFile(&new_stream, new_file); +} + +int main(int argc, const char* argv[]) { + base::AtExitManager at_exit_manager; + CommandLine::Init(argc, argv); + const CommandLine& command_line = *CommandLine::ForCurrentProcess(); + + bool cmd_dis = command_line.HasSwitch(L"dis"); + bool cmd_asm = command_line.HasSwitch(L"asm"); + bool cmd_disadj = command_line.HasSwitch(L"disadj"); + bool cmd_make_patch = command_line.HasSwitch(L"gen"); + bool cmd_apply_patch = command_line.HasSwitch(L"apply"); + bool cmd_make_bsdiff_patch = command_line.HasSwitch(L"genbsdiff"); + bool cmd_apply_bsdiff_patch = command_line.HasSwitch(L"applybsdiff"); + bool cmd_spread_1_adjusted = command_line.HasSwitch(L"gen1a"); + bool cmd_spread_1_unadjusted = command_line.HasSwitch(L"gen1u"); + + std::vector<std::wstring> values = command_line.GetLooseValues(); + + // '-repeat=N' is for debugging. Running many iterations can reveal leaks and + // bugs in cleanup. + int repeat_count = 1; + std::wstring repeat_switch = command_line.GetSwitchValue(L"repeat"); + if (!repeat_switch.empty()) + if (!StringToInt(repeat_switch, &repeat_count)) + repeat_count = 1; + + if (cmd_dis + cmd_asm + cmd_disadj + cmd_make_patch + cmd_apply_patch + + cmd_make_bsdiff_patch + cmd_apply_bsdiff_patch + + cmd_spread_1_adjusted + cmd_spread_1_unadjusted + != 1) + UsageProblem( + "Must have exactly one of:\n" + " -asm, -dis, -disadj, -gen or -apply, -genbsdiff or -applybsdiff."); + + while (repeat_count-- > 0) { + if (cmd_dis) { + if (values.size() != 2) + UsageProblem("-dis <executable_file> <courgette_file>"); + Disassemble(values[0], values[1]); + } else if (cmd_asm) { + if (values.size() != 2) + UsageProblem("-asm <courgette_file_input> <executable_file_output>"); + Assemble(values[0], values[1]); + } else if (cmd_disadj) { + if (values.size() != 3) + UsageProblem("-disadj <executable_file> <model> <courgette_file>"); + DisassembleAndAdjust(values[0], values[1], values[2]); + } else if (cmd_make_patch) { + if (values.size() != 3) + UsageProblem("-gen <old_file> <new_file> <patch_file>"); + GenerateEnsemblePatch(values[0], values[1], values[2]); + } else if (cmd_apply_patch) { + if (values.size() != 3) + UsageProblem("-apply <old_file> <patch_file> <new_file>"); + ApplyEnsemblePatch(values[0], values[1], values[2]); + } else if (cmd_make_bsdiff_patch) { + if (values.size() != 3) + UsageProblem("-genbsdiff <old_file> <new_file> <patch_file>"); + GenerateBSDiffPatch(values[0], values[1], values[2]); + } else if (cmd_apply_bsdiff_patch) { + if (values.size() != 3) + UsageProblem("-applybsdiff <old_file> <patch_file> <new_file>"); + ApplyBSDiffPatch(values[0], values[1], values[2]); + } else if (cmd_spread_1_adjusted || cmd_spread_1_unadjusted) { + if (values.size() != 3) + UsageProblem("-gen1[au] <old_file> <new_file> <patch_files_root>"); + DisassembleAdjustDiff(values[0], values[1], values[2], + cmd_spread_1_adjusted); + } else { + UsageProblem("No operation specified"); + } + } +} diff --git a/courgette/crc.cc b/courgette/crc.cc new file mode 100644 index 0000000..60c8c93 --- /dev/null +++ b/courgette/crc.cc @@ -0,0 +1,22 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Calculate Crc by calling CRC method in LZMA SDK + +#include "courgette/crc.h" + +extern "C" { +#include "third_party/lzma_sdk/7zCrc.h" +} + +namespace courgette { + +uint32 CalculateCrc(const uint8* buffer, size_t size) { + // CrcGenerateTable(); + uint32 crc = 0xffffffffL; + // crc = ~CrcCalc(buffer, size); + return crc; +} + +} // namespace diff --git a/courgette/crc.h b/courgette/crc.h new file mode 100644 index 0000000..c3662bb --- /dev/null +++ b/courgette/crc.h @@ -0,0 +1,17 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COURGETTE_CRC_H_ +#define COURGETTE_CRC_H_ + +#include "base/basictypes.h" + +namespace courgette { + +// Calculates Crc of the given buffer by calling CRC method in LZMA SDK +// +uint32 CalculateCrc(const uint8* buffer, size_t size); + +} // namespace courgette +#endif // COURGETTE_CRC_H_ diff --git a/courgette/difference_estimator.cc b/courgette/difference_estimator.cc new file mode 100644 index 0000000..e591f9b --- /dev/null +++ b/courgette/difference_estimator.cc @@ -0,0 +1,118 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// We want to measure the similarity of two sequences of bytes as a surrogate +// for measuring how well a second sequence will compress differentially to the +// first sequence. + +#include "courgette/difference_estimator.h" + +#include "base/hash_tables.h" + +namespace courgette { + +// Our difference measure is the number of k-tuples that occur in Subject that +// don't occur in Base. +const int kTupleSize = 4; + +namespace { + +COMPILE_ASSERT(kTupleSize >= 4 && kTupleSize <= 8, kTupleSize_between_4_and_8); + +size_t HashTuple(const uint8* source) { + size_t hash1 = *reinterpret_cast<const uint32*>(source); + size_t hash2 = *reinterpret_cast<const uint32*>(source + kTupleSize - 4); + size_t hash = ((hash1 * 17 + hash2 * 37) + (hash1 >> 17)) ^ (hash2 >> 23); + return hash; +} + +bool RegionsEqual(const Region& a, const Region& b) { + if (a.length() != b.length()) + return false; + return memcmp(a.start(), b.start(), a.length()) == 0; +} + +} // anonymous namepace + +class DifferenceEstimator::Base { + public: + explicit Base(const Region& region) : region_(region) { } + + void Init() { + const uint8* start = region_.start(); + const uint8* end = region_.end() - (kTupleSize - 1); + for (const uint8* p = start; p < end; ++p) { + size_t hash = HashTuple(p); + hashes_.insert(hash); + } + } + + const Region& region() const { return region_; } + + private: + Region region_; + base::hash_set<size_t> hashes_; + + friend class DifferenceEstimator; + DISALLOW_COPY_AND_ASSIGN(Base); +}; + +class DifferenceEstimator::Subject { + public: + explicit Subject(const Region& region) : region_(region) {} + + const Region& region() const { return region_; } + + private: + Region region_; + + DISALLOW_COPY_AND_ASSIGN(Subject); +}; + +DifferenceEstimator::DifferenceEstimator() {} + +DifferenceEstimator::~DifferenceEstimator() { + for (size_t i = 0; i < owned_bases_.size(); ++i) + delete owned_bases_[i]; + for (size_t i = 0; i < owned_subjects_.size(); ++i) + delete owned_subjects_[i]; +} + +DifferenceEstimator::Base* DifferenceEstimator::MakeBase(const Region& region) { + Base* base = new Base(region); + base->Init(); + owned_bases_.push_back(base); + return base; +} + +DifferenceEstimator::Subject* DifferenceEstimator::MakeSubject( + const Region& region) { + Subject* subject = new Subject(region); + owned_subjects_.push_back(subject); + return subject; +} + +size_t DifferenceEstimator::Measure(Base* base, Subject* subject) { + size_t mismatches = 0; + const uint8* start = subject->region().start(); + const uint8* end = subject->region().end() - (kTupleSize - 1); + + const uint8* p = start; + while (p < end) { + size_t hash = HashTuple(p); + if (base->hashes_.find(hash) == base->hashes_.end()) { + ++mismatches; + } + p += 1; + } + + if (mismatches == 0) { + if (RegionsEqual(base->region(), subject->region())) + return 0; + } + ++mismatches; // Guarantee not zero. + return mismatches; +} + +} // namespace diff --git a/courgette/difference_estimator.h b/courgette/difference_estimator.h new file mode 100644 index 0000000..295ff93 --- /dev/null +++ b/courgette/difference_estimator.h @@ -0,0 +1,58 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// A DifferenceEstimator class provides a means for quickly estimating the +// difference between two regions of memory. + +#ifndef COURGETTE_DIFFERENCE_ESTIMATOR_H_ +#define COURGETTE_DIFFERENCE_ESTIMATOR_H_ + +#include <vector> + +#include "courgette/region.h" + +namespace courgette { + +// A DifferenceEstimator simplifies the task of determining which 'Subject' byte +// strings (stored in regions of memory) are good matches to existing 'Base' +// regions. The ultimate measure would be to try full differential compression +// and measure the output size, but an estimate that correlates well with the +// full compression is more efficient. +// +// The measure is asymmetric, if the Subject is a small substring of the Base +// then it should match very well. +// +// The comparison is staged: first make Base and Subject objects for the regions +// and then call 'Measure' to get the estimate. The staging allows multiple +// comparisons to be more efficient by precomputing information used in the +// comparison. +// +class DifferenceEstimator { + public: + DifferenceEstimator(); + ~DifferenceEstimator(); + + class Base; + class Subject; + + // This DifferenceEstimator owns the objects returned by MakeBase and + // MakeSubject. Caller continues to own memory at |region| and must not free + // it until ~DifferenceEstimator has been called. + Base* MakeBase(const Region& region); + Subject* MakeSubject(const Region& region); + + // Returns a value correlated with the size of the bsdiff or xdelta difference + // from |base| to |subject|. Returns zero iff the base and subject regions + // are bytewise identical. + size_t Measure(Base* base, Subject* subject); + + private: + std::vector<Base*> owned_bases_; + std::vector<Subject*> owned_subjects_; + DISALLOW_COPY_AND_ASSIGN(DifferenceEstimator); +}; + +} // namespace + +#endif // COURGETTE_DIFFERENCE_ESTIMATOR_H_ diff --git a/courgette/difference_estimator_unittest.cc b/courgette/difference_estimator_unittest.cc new file mode 100644 index 0000000..913752b --- /dev/null +++ b/courgette/difference_estimator_unittest.cc @@ -0,0 +1,58 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "courgette/difference_estimator.h" + +#include <string> + +#include "testing/gtest/include/gtest/gtest.h" +#include "courgette/region.h" + +using courgette::DifferenceEstimator; +using courgette::Region; + +TEST(DifferenceEstimatorTest, TestSame) { + static const char kString1[] = "Hello world"; + // kString2 is stack allocated to prevent string sharing. + const char kString2[] = "Hello world"; + DifferenceEstimator difference_estimator; + DifferenceEstimator::Base* base = + difference_estimator.MakeBase(Region(kString1, sizeof(kString1))); + DifferenceEstimator::Subject* subject = + difference_estimator.MakeSubject(Region(kString2, sizeof(kString2))); + EXPECT_EQ(0, difference_estimator.Measure(base, subject)); +} + +TEST(DifferenceEstimatorTest, TestDifferent) { + static const char kString1[] = "Hello world"; + static const char kString2[] = "Hello universe"; + DifferenceEstimator difference_estimator; + DifferenceEstimator::Base* base = + difference_estimator.MakeBase(Region(kString1, sizeof(kString1))); + DifferenceEstimator::Subject* subject = + difference_estimator.MakeSubject(Region(kString2, sizeof(kString2))); + EXPECT_EQ(10, difference_estimator.Measure(base, subject)); +} + +TEST(DifferenceEstimatorTest, TestDifferentSuperstring) { + static const char kString1[] = "abcdabcdabcd"; + static const char kString2[] = "abcdabcdabcdabcd"; + DifferenceEstimator difference_estimator; + DifferenceEstimator::Base* base = + difference_estimator.MakeBase(Region(kString1, sizeof(kString1)-1)); + DifferenceEstimator::Subject* subject = + difference_estimator.MakeSubject(Region(kString2, sizeof(kString2)-1)); + EXPECT_EQ(1, difference_estimator.Measure(base, subject)); +} + +TEST(DifferenceEstimatorTest, TestDifferentSubstring) { + static const char kString1[] = "abcdabcdabcdabcd"; + static const char kString2[] = "abcdabcdabcd"; + DifferenceEstimator difference_estimator; + DifferenceEstimator::Base* base = + difference_estimator.MakeBase(Region(kString1, sizeof(kString1)-1)); + DifferenceEstimator::Subject* subject = + difference_estimator.MakeSubject(Region(kString2, sizeof(kString2)-1)); + EXPECT_EQ(1, difference_estimator.Measure(base, subject)); +} diff --git a/courgette/disassembler.cc b/courgette/disassembler.cc new file mode 100644 index 0000000..6395d18 --- /dev/null +++ b/courgette/disassembler.cc @@ -0,0 +1,436 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "courgette/disassembler.h" + +#include <algorithm> +#include <iostream> +#include <string> +#include <vector> + +#include "base/basictypes.h" +#include "base/logging.h" + +#include "courgette/assembly_program.h" +#include "courgette/courgette.h" +#include "courgette/encoded_program.h" +#include "courgette/image_info.h" + +// COURGETTE_HISTOGRAM_TARGETS prints out a histogram of how frequently +// different target addresses are referenced. Purely for debugging. +#define COURGETTE_HISTOGRAM_TARGETS 0 + +namespace courgette { + +class DisassemblerWin32X86 : public Disassembler { + public: + explicit DisassemblerWin32X86(PEInfo* pe_info) + : pe_info_(pe_info), + incomplete_disassembly_(false) { + } + + virtual bool Disassemble(AssemblyProgram* target); + + virtual void Destroy() { delete this; } + + protected: + PEInfo& pe_info() { return *pe_info_; } + + void ParseFile(AssemblyProgram* target); + bool ParseAbs32Relocs(); + void ParseRel32RelocsFromSections(); + void ParseRel32RelocsFromSection(const Section* section); + + void ParseNonSectionFileRegion(uint32 start_file_offset, + uint32 end_file_offset, + AssemblyProgram* program); + void ParseFileRegion(const Section* section, + uint32 start_file_offset, uint32 end_file_offset, + AssemblyProgram* program); + +#if COURGETTE_HISTOGRAM_TARGETS + void HistogramTargets(const char* kind, const std::map<RVA, int>& map); +#endif + + PEInfo* pe_info_; + bool incomplete_disassembly_; // 'true' if can leave out 'uninteresting' bits + + std::vector<RVA> abs32_locations_; + std::vector<RVA> rel32_locations_; + +#if COURGETTE_HISTOGRAM_TARGETS + std::map<RVA, int> abs32_target_rvas_; + std::map<RVA, int> rel32_target_rvas_; +#endif +}; + +bool DisassemblerWin32X86::Disassemble(AssemblyProgram* target) { + if (!pe_info().ok()) + return false; + + target->set_image_base(pe_info().image_base()); + + if (!ParseAbs32Relocs()) + return false; + + ParseRel32RelocsFromSections(); + + ParseFile(target); + + target->DefaultAssignIndexes(); + return true; +} + +static uint32 Read32LittleEndian(const void* address) { + return *reinterpret_cast<const uint32*>(address); +} + +bool DisassemblerWin32X86::ParseAbs32Relocs() { + abs32_locations_.clear(); + if (!pe_info().ParseRelocs(&abs32_locations_)) + return false; + + std::sort(abs32_locations_.begin(), abs32_locations_.end()); + +#if COURGETTE_HISTOGRAM_TARGETS + for (size_t i = 0; i < abs32_locations_.size(); ++i) { + RVA rva = abs32_locations_[i]; + // The 4 bytes at the relocation are a reference to some address. + uint32 target_address = Read32LittleEndian(pe_info().RVAToPointer(rva)); + ++abs32_target_rvas_[target_address - pe_info().image_base()]; + } +#endif + return true; +} + +void DisassemblerWin32X86::ParseRel32RelocsFromSections() { + uint32 file_offset = 0; + while (file_offset < pe_info().length()) { + const Section* section = pe_info().FindNextSection(file_offset); + if (section == NULL) + break; + if (file_offset < section->file_offset_of_raw_data) + file_offset = section->file_offset_of_raw_data; + ParseRel32RelocsFromSection(section); + file_offset += section->size_of_raw_data; + } + std::sort(rel32_locations_.begin(), rel32_locations_.end()); + +#if COURGETTE_HISTOGRAM_TARGETS + LOG(INFO) << "abs32_locations_ " << abs32_locations_.size(); + LOG(INFO) << "rel32_locations_ " << rel32_locations_.size(); + LOG(INFO) << "abs32_target_rvas_ " << abs32_target_rvas_.size(); + LOG(INFO) << "rel32_target_rvas_ " << rel32_target_rvas_.size(); + + int common = 0; + std::map<RVA, int>::iterator abs32_iter = abs32_target_rvas_.begin(); + std::map<RVA, int>::iterator rel32_iter = rel32_target_rvas_.begin(); + while (abs32_iter != abs32_target_rvas_.end() && + rel32_iter != rel32_target_rvas_.end()) { + if (abs32_iter->first < rel32_iter->first) + ++abs32_iter; + else if (rel32_iter->first < abs32_iter->first) + ++rel32_iter; + else { + ++common; + ++abs32_iter; + ++rel32_iter; + } + } + LOG(INFO) << "common " << common; +#endif +} + +void DisassemblerWin32X86::ParseRel32RelocsFromSection(const Section* section) { + // TODO(sra): use characteristic. + bool isCode = strcmp(section->name, ".text") == 0; + if (!isCode) + return; + + uint32 start_file_offset = section->file_offset_of_raw_data; + uint32 end_file_offset = start_file_offset + section->size_of_raw_data; + RVA relocs_start_rva = pe_info().base_relocation_table().address_; + + const uint8* start_pointer = pe_info().FileOffsetToPointer(start_file_offset); + const uint8* end_pointer = pe_info().FileOffsetToPointer(end_file_offset); + + RVA start_rva = pe_info().FileOffsetToRVA(start_file_offset); + RVA end_rva = start_rva + section->virtual_size; + + // Quick way to convert from Pointer to RVA within a single Section is to + // subtract 'pointer_to_rva'. + const uint8* const adjust_pointer_to_rva = start_pointer - start_rva; + + std::vector<RVA>::iterator abs32_pos = abs32_locations_.begin(); + + // Find the rel32 relocations. + const uint8* p = start_pointer; + while (p < end_pointer) { + RVA current_rva = p - adjust_pointer_to_rva; + if (current_rva == relocs_start_rva) { + uint32 relocs_size = pe_info().base_relocation_table().size_; + if (relocs_size) { + p += relocs_size; + continue; + } + } + + //while (abs32_pos != abs32_locations_.end() && *abs32_pos < current_rva) + // ++abs32_pos; + + // Heuristic discovery of rel32 locations in instruction stream: are the + // next few bytes the start of an instruction containing a rel32 + // addressing mode? + const uint8* rel32 = NULL; + + if (p + 5 < end_pointer) { + if (*p == 0xE8 || *p == 0xE9) { // jmp rel32 and call rel32 + rel32 = p + 1; + } + } + if (p + 6 < end_pointer) { + if (*p == 0x0F && (*(p+1) & 0xF0) == 0x80) { // Jcc long form + if (p[1] != 0x8A && p[1] != 0x8B) // JPE/JPO unlikely + rel32 = p + 2; + } + } + if (rel32) { + RVA rel32_rva = rel32 - adjust_pointer_to_rva; + + // Is there an abs32 reloc overlapping the candidate? + while (abs32_pos != abs32_locations_.end() && *abs32_pos < rel32_rva - 3) + ++abs32_pos; + // Now: (*abs32_pos > rel32_rva - 4) i.e. the lowest addressed 4-byte + // region that could overlap rel32_rva. + if (abs32_pos != abs32_locations_.end()) { + if (*abs32_pos < rel32_rva + 4) { + // Beginning of abs32 reloc is before end of rel32 reloc so they + // overlap. Skip four bytes past the abs32 reloc. + p += (*abs32_pos + 4) - current_rva; + continue; + } + } + + RVA target_rva = rel32_rva + 4 + Read32LittleEndian(rel32); + // To be valid, rel32 target must be within image, and within this + // section. + if (pe_info().IsValidRVA(target_rva) && + start_rva <= target_rva && target_rva < end_rva) { + rel32_locations_.push_back(rel32_rva); +#if COURGETTE_HISTOGRAM_TARGETS + ++rel32_target_rvas_[target_rva]; +#endif + p += 4; + continue; + } + } + p += 1; + } +} + +void DisassemblerWin32X86::ParseFile(AssemblyProgram* program) { + // Walk all the bytes in the file, whether or not in a section. + uint32 file_offset = 0; + while (file_offset < pe_info().length()) { + const Section* section = pe_info().FindNextSection(file_offset); + if (section == NULL) { + // No more sections. There should not be extra stuff following last + // section. + // ParseNonSectionFileRegion(file_offset, pe_info().length(), program); + break; + } + if (file_offset < section->file_offset_of_raw_data) { + uint32 section_start_offset = section->file_offset_of_raw_data; + ParseNonSectionFileRegion(file_offset, section_start_offset, program); + file_offset = section_start_offset; + } + uint32 end = file_offset + section->size_of_raw_data; + ParseFileRegion(section, file_offset, end, program); + file_offset = end; + } + +#if COURGETTE_HISTOGRAM_TARGETS + HistogramTargets("abs32 relocs", abs32_target_rvas_); + HistogramTargets("rel32 relocs", rel32_target_rvas_); +#endif +} + +void DisassemblerWin32X86::ParseNonSectionFileRegion( + uint32 start_file_offset, + uint32 end_file_offset, + AssemblyProgram* program) { + if (incomplete_disassembly_) + return; + + const uint8* start = pe_info().FileOffsetToPointer(start_file_offset); + const uint8* end = pe_info().FileOffsetToPointer(end_file_offset); + + const uint8* p = start; + + while (p < end) { + program->EmitByteInstruction(*p); + ++p; + } +} + +void DisassemblerWin32X86::ParseFileRegion( + const Section* section, + uint32 start_file_offset, uint32 end_file_offset, + AssemblyProgram* program) { + RVA relocs_start_rva = pe_info().base_relocation_table().address_; + + const uint8* start_pointer = pe_info().FileOffsetToPointer(start_file_offset); + const uint8* end_pointer = pe_info().FileOffsetToPointer(end_file_offset); + + RVA start_rva = pe_info().FileOffsetToRVA(start_file_offset); + RVA end_rva = start_rva + section->virtual_size; + + // Quick way to convert from Pointer to RVA within a single Section is to + // subtract 'pointer_to_rva'. + const uint8* const adjust_pointer_to_rva = start_pointer - start_rva; + + std::vector<RVA>::iterator rel32_pos = rel32_locations_.begin(); + std::vector<RVA>::iterator abs32_pos = abs32_locations_.begin(); + + program->EmitOriginInstruction(start_rva); + + const uint8* p = start_pointer; + + while (p < end_pointer) { + RVA current_rva = p - adjust_pointer_to_rva; + + // The base relocation table is usually in the .relocs section, but it could + // actually be anywhere. Make sure we skip it because we will regenerate it + // during assembly. + if (current_rva == relocs_start_rva) { + program->EmitMakeRelocsInstruction(); + uint32 relocs_size = pe_info().base_relocation_table().size_; + if (relocs_size) { + p += relocs_size; + continue; + } + } + + while (abs32_pos != abs32_locations_.end() && *abs32_pos < current_rva) + ++abs32_pos; + + if (abs32_pos != abs32_locations_.end() && *abs32_pos == current_rva) { + uint32 target_address = Read32LittleEndian(p); + RVA target_rva = target_address - pe_info().image_base(); + // TODO(sra): target could be Label+offset. It is not clear how to guess + // which it might be. We assume offset==0. + program->EmitAbs32(program->FindOrMakeAbs32Label(target_rva)); + p += 4; + continue; + } + + while (rel32_pos != rel32_locations_.end() && *rel32_pos < current_rva) + ++rel32_pos; + + if (rel32_pos != rel32_locations_.end() && *rel32_pos == current_rva) { + RVA target_rva = current_rva + 4 + Read32LittleEndian(p); + program->EmitRel32(program->FindOrMakeRel32Label(target_rva)); + p += 4; + continue; + } + + if (incomplete_disassembly_) { + if ((abs32_pos == abs32_locations_.end() || end_rva <= *abs32_pos) && + (rel32_pos == rel32_locations_.end() || end_rva <= *rel32_pos) && + (end_rva <= relocs_start_rva || current_rva >= relocs_start_rva)) { + // No more relocs in this section, don't bother encoding bytes. + break; + } + } + + program->EmitByteInstruction(*p); + p += 1; + } +} + +#if COURGETTE_HISTOGRAM_TARGETS +// Histogram is printed to std::cout. It is purely for debugging the algorithm +// and is only enabled manually in 'exploration' builds. I don't want to add +// command-line configuration for this feature because this code has to be +// small, which means compiled-out. +void DisassemblerWin32X86::HistogramTargets(const char* kind, + const std::map<RVA, int>& map) { + int total = 0; + std::map<int, std::vector<RVA> > h; + for (std::map<RVA, int>::const_iterator p = map.begin(); + p != map.end(); + ++p) { + h[p->second].push_back(p->first); + total += p->second; + } + + std::cout << total << " " << kind << " to " + << map.size() << " unique targets" << std::endl; + + std::cout << "indegree: #targets-with-indegree (example)" << std::endl; + const int kFirstN = 15; + bool someSkipped = false; + int index = 0; + for (std::map<int, std::vector<RVA> >::reverse_iterator p = h.rbegin(); + p != h.rend(); + ++p) { + ++index; + if (index <= kFirstN || p->first <= 3) { + if (someSkipped) { + std::cout << "..." << std::endl; + } + size_t count = p->second.size(); + std::cout << std::dec << p->first << ": " << count; + if (count <= 2) { + for (size_t i = 0; i < count; ++i) + std::cout << " " << pe_info().DescribeRVA(p->second[i]); + } + std::cout << std::endl; + someSkipped = false; + } else { + someSkipped = true; + } + } +} +#endif // COURGETTE_HISTOGRAM_TARGETS + +Disassembler* Disassembler::MakeDisassemberWin32X86(PEInfo* pe_info) { + return new DisassemblerWin32X86(pe_info); +} + +//////////////////////////////////////////////////////////////////////////////// + +Status ParseWin32X86PE(const void* buffer, size_t length, + AssemblyProgram** output) { + *output = NULL; + + PEInfo* pe_info = new PEInfo(); + pe_info->Init(buffer, length); + + if (!pe_info->ParseHeader()) { + delete pe_info; + return C_INPUT_NOT_RECOGNIZED; + } + + Disassembler* disassembler = Disassembler::MakeDisassemberWin32X86(pe_info); + AssemblyProgram* program = new AssemblyProgram(); + + if (!disassembler->Disassemble(program)) { + delete program; + disassembler->Destroy(); + delete pe_info; + return C_DISASSEMBLY_FAILED; + } + + disassembler->Destroy(); + delete pe_info; + *output = program; + return C_OK; +} + +void DeleteAssemblyProgram(AssemblyProgram* program) { + delete program; +} + +} // namespace courgette diff --git a/courgette/disassembler.h b/courgette/disassembler.h new file mode 100644 index 0000000..fa7c908 --- /dev/null +++ b/courgette/disassembler.h @@ -0,0 +1,38 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COURGETTE_DISASSEMBLER_H_ +#define COURGETTE_DISASSEMBLER_H_ + +#include "base/basictypes.h" + +namespace courgette { + +class AssemblyProgram; +class PEInfo; + +class Disassembler { + public: + // Factory methods for making disassemblers for various kinds of executables. + // We have only one so far. + + static Disassembler* MakeDisassemberWin32X86(PEInfo* pe_info); + + // Disassembles the item passed to the factory method into the output + // parameter 'program'. + virtual bool Disassemble(AssemblyProgram* program) = 0; + + // Deletes 'this' disassembler. + virtual void Destroy() = 0; + + protected: + Disassembler() {} + virtual ~Disassembler() {} + + private: + DISALLOW_COPY_AND_ASSIGN(Disassembler); +}; + +} // namespace courgette +#endif // COURGETTE_DISASSEMBLER_H_ diff --git a/courgette/encode_decode_unittest.cc b/courgette/encode_decode_unittest.cc new file mode 100644 index 0000000..436479d8 --- /dev/null +++ b/courgette/encode_decode_unittest.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include <string> + +#include "base/path_service.h" +#include "base/file_util.h" +#include "base/string_util.h" + +#include "courgette/courgette.h" +#include "courgette/streams.h" + +#include "testing/gtest/include/gtest/gtest.h" + +class EncodeDecodeTest : public testing::Test { + public: + void TestExe(const char *) const; + + private: + void SetUp() { + PathService::Get(base::DIR_SOURCE_ROOT, &testdata_dir_); + file_util::AppendToPath(&testdata_dir_, L"courgette"); + file_util::AppendToPath(&testdata_dir_, L"testdata"); + } + + void TearDown() { } + + // Returns contents of |file_name| as uninterprested bytes stored in a string. + std::string FileContents(const char* file_name) const; + + std::wstring testdata_dir_; // Full path name of testdata directory +}; + +// Reads a test file into a string. +std::string EncodeDecodeTest::FileContents(const char* file_name) const { + std::wstring file_path = testdata_dir_; + file_util::AppendToPath(&file_path, UTF8ToWide(file_name)); + std::string file_contents; + if (!file_util::ReadFileToString(file_path, &file_contents)) { + EXPECT_TRUE(!"Could not read test data"); + } + return file_contents; +} + +void EncodeDecodeTest::TestExe(const char* file_name) const { + // Test top-level Courgette API for converting an a file to a binary + // assembly representation and back. + std::string file1 = FileContents(file_name); + + const void* original_buffer = file1.c_str(); + size_t original_length = file1.size(); + + courgette::AssemblyProgram* program = NULL; + const courgette::Status parse_status = + courgette::ParseWin32X86PE(original_buffer, original_length, &program); + EXPECT_EQ(courgette::C_OK, parse_status); + + courgette::EncodedProgram* encoded = NULL; + + const courgette::Status encode_status = Encode(program, &encoded); + EXPECT_EQ(courgette::C_OK, encode_status); + + DeleteAssemblyProgram(program); + + courgette::SinkStreamSet sinks; + const courgette::Status write_status = WriteEncodedProgram(encoded, &sinks); + EXPECT_EQ(courgette::C_OK, write_status); + + DeleteEncodedProgram(encoded); + + courgette::SinkStream sink; + bool can_collect = sinks.CopyTo(&sink); + EXPECT_TRUE(can_collect); + + const void* buffer = sink.Buffer(); + size_t length = sink.Length(); + + EXPECT_EQ(971850, length); + + courgette::SourceStreamSet sources; + bool can_get_source_streams = sources.Init(buffer, length); + EXPECT_TRUE(can_get_source_streams); + + courgette::EncodedProgram *encoded2 = NULL; + const courgette::Status read_status = ReadEncodedProgram(&sources, &encoded2); + EXPECT_EQ(courgette::C_OK, read_status); + + courgette::SinkStream assembled; + const courgette::Status assemble_status = Assemble(encoded2, &assembled); + EXPECT_EQ(courgette::C_OK, assemble_status); + + const void* assembled_buffer = assembled.Buffer(); + size_t assembled_length = assembled.Length(); + + EXPECT_EQ(original_length, assembled_length); + EXPECT_EQ(0, memcmp(original_buffer, assembled_buffer, original_length)); + + DeleteEncodedProgram(encoded2); +} + + +TEST_F(EncodeDecodeTest, All) { + TestExe("setup1.exe"); +} diff --git a/courgette/encoded_program.cc b/courgette/encoded_program.cc new file mode 100644 index 0000000..f606f96 --- /dev/null +++ b/courgette/encoded_program.cc @@ -0,0 +1,573 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "courgette/encoded_program.h" + +#include <algorithm> +#include <map> +#include <string> +#include <vector> + +#include "base/logging.h" +#include "base/sys_info.h" + +#include "courgette/courgette.h" +#include "courgette/streams.h" + +namespace courgette { + +// Stream indexes. +const int kStreamMisc = 0; +const int kStreamOps = 1; +const int kStreamBytes = 2; +const int kStreamAbs32Indexes = 3; +const int kStreamRel32Indexes = 4; +const int kStreamAbs32Addresses = 5; +const int kStreamRel32Addresses = 6; +const int kStreamCopyCounts = 7; +const int kStreamOriginAddresses = kStreamMisc; + +const int kStreamLimit = 9; + +// Binary assembly language operations. +enum EncodedProgram::OP { + ORIGIN, // ORIGIN <rva> - set address for subsequent assembly. + COPY, // COPY <count> <bytes> - copy bytes to output. + COPY1, // COPY1 <byte> - same as COPY 1 <byte>. + REL32, // REL32 <index> - emit rel32 encoded reference to address at + // address table offset <index> + ABS32, // ABS32 <index> - emit abs32 encoded reference to address at + // address table offset <index> + MAKE_BASE_RELOCATION_TABLE, // Emit base relocation table blocks. + OP_LAST +}; + + +// Constructor is here rather than in the header. Although the constructor +// appears to do nothing it is fact quite large because of the implict calls to +// field constructors. Ditto for the destructor. +EncodedProgram::EncodedProgram() {} +EncodedProgram::~EncodedProgram() {} + +// Serializes a vector of integral values using Varint32 coding. +template<typename T> +void WriteVector(const std::vector<T>& items, SinkStream* buffer) { + size_t count = items.size(); + buffer->WriteVarint32(count); + for (size_t i = 0; i < count; ++i) { + COMPILE_ASSERT(sizeof(T) <= sizeof(uint32), T_must_fit_in_uint32); + buffer->WriteVarint32(static_cast<uint32>(items[i])); + } +} + +template<typename T> +bool ReadVector(std::vector<T>* items, SourceStream* buffer) { + uint32 count; + if (!buffer->ReadVarint32(&count)) + return false; + + items->clear(); + items->reserve(count); + for (size_t i = 0; i < count; ++i) { + uint32 item; + if (!buffer->ReadVarint32(&item)) + return false; + items->push_back(static_cast<T>(item)); + } + + return true; +} + +// Serializes a vector, using delta coding followed by Varint32 coding. +void WriteU32Delta(const std::vector<uint32>& set, SinkStream* buffer) { + size_t count = set.size(); + buffer->WriteVarint32(count); + uint32 prev = 0; + for (size_t i = 0; i < count; ++i) { + uint32 current = set[i]; + uint32 delta = current - prev; + buffer->WriteVarint32(delta); + prev = current; + } +} + +static bool ReadU32Delta(std::vector<uint32>* set, SourceStream* buffer) { + uint32 count; + + if (!buffer->ReadVarint32(&count)) + return false; + + set->clear(); + set->reserve(count); + uint32 prev = 0; + + for (size_t i = 0; i < count; ++i) { + uint32 delta; + if (!buffer->ReadVarint32(&delta)) + return false; + uint32 current = prev + delta; + set->push_back(current); + prev = current; + } + + return true; +} + +// Write a vector as the byte representation of the contents. +// +// (This only really makes sense for a type T that has sizeof(T)==1, otherwise +// serilized representation is not endian-agnositic. But it is useful to keep +// the possibility of a greater size for experiments comparing Varint32 encoding +// of a vector of larger integrals vs a plain form.) +// +template<typename T> +void WriteVectorU8(const std::vector<T>& items, SinkStream* buffer) { + uint32 count = items.size(); + buffer->WriteVarint32(count); + if (count != 0) { + size_t byte_count = count * sizeof(T); + buffer->Write(static_cast<const void*>(&items[0]), byte_count); + } +} + +template<typename T> +bool ReadVectorU8(std::vector<T>* items, SourceStream* buffer) { + uint32 count; + if (!buffer->ReadVarint32(&count)) + return false; + + items->clear(); + items->resize(count); + if (count != 0) { + size_t byte_count = count * sizeof(T); + return buffer->Read(static_cast<void*>(&((*items)[0])), byte_count); + } + return true; +} + +//////////////////////////////////////////////////////////////////////////////// + +void EncodedProgram::DefineRel32Label(int index, RVA value) { + DefineLabelCommon(&rel32_rva_, index, value); +} + +void EncodedProgram::DefineAbs32Label(int index, RVA value) { + DefineLabelCommon(&abs32_rva_, index, value); +} + +static const RVA kUnassignedRVA = static_cast<RVA>(-1); + +void EncodedProgram::DefineLabelCommon(std::vector<RVA>* rvas, + int index, + RVA rva) { + if (static_cast<int>(rvas->size()) <= index) { + rvas->resize(index + 1, kUnassignedRVA); + } + if ((*rvas)[index] != kUnassignedRVA) { + NOTREACHED() << "DefineLabel double assigned " << index; + } + (*rvas)[index] = rva; +} + +void EncodedProgram::EndLabels() { + FinishLabelsCommon(&abs32_rva_); + FinishLabelsCommon(&rel32_rva_); +} + +void EncodedProgram::FinishLabelsCommon(std::vector<RVA>* rvas) { + // Replace all unassigned slots with the value at the previous index so they + // delta-encode to zero. (There might be better values than zero. The way to + // get that is have the higher level assembly program assign the unassigned + // slots.) + RVA previous = 0; + size_t size = rvas->size(); + for (size_t i = 0; i < size; ++i) { + if ((*rvas)[i] == kUnassignedRVA) + (*rvas)[i] = previous; + else + previous = (*rvas)[i]; + } +} + +void EncodedProgram::AddOrigin(RVA origin) { + ops_.push_back(ORIGIN); + origins_.push_back(origin); +} + +void EncodedProgram::AddCopy(int count, const void* bytes) { + const uint8* source = static_cast<const uint8*>(bytes); + + // Fold adjacent COPY instructions into one. This nearly halves the size of + // an EncodedProgram with only COPY1 instructions since there are approx plain + // 16 bytes per reloc. This has a working-set benefit during decompression. + // For compression of files with large differences this makes a small (4%) + // improvement in size. For files with small differences this degrades the + // compressed size by 1.3% + if (ops_.size() > 0) { + if (ops_.back() == COPY1) { + ops_.back() = COPY; + copy_counts_.push_back(1); + } + if (ops_.back() == COPY) { + copy_counts_.back() += count; + for (int i = 0; i < count; ++i) { + copy_bytes_.push_back(source[i]); + } + return; + } + } + + if (count == 1) { + ops_.push_back(COPY1); + copy_bytes_.push_back(source[0]); + } else { + ops_.push_back(COPY); + copy_counts_.push_back(count); + for (int i = 0; i < count; ++i) { + copy_bytes_.push_back(source[i]); + } + } +} + +void EncodedProgram::AddAbs32(int label_index) { + ops_.push_back(ABS32); + abs32_ix_.push_back(label_index); +} + +void EncodedProgram::AddRel32(int label_index) { + ops_.push_back(REL32); + rel32_ix_.push_back(label_index); +} + +void EncodedProgram::AddMakeRelocs() { + ops_.push_back(MAKE_BASE_RELOCATION_TABLE); +} + +void EncodedProgram::DebuggingSummary() { + LOG(INFO) << "EncodedProgram Summary"; + LOG(INFO) << " image base " << image_base_; + LOG(INFO) << " abs32 rvas " << abs32_rva_.size(); + LOG(INFO) << " rel32 rvas " << rel32_rva_.size(); + LOG(INFO) << " ops " << ops_.size(); + LOG(INFO) << " origins " << origins_.size(); + LOG(INFO) << " copy_counts " << copy_counts_.size(); + LOG(INFO) << " copy_bytes " << copy_bytes_.size(); + LOG(INFO) << " abs32_ix " << abs32_ix_.size(); + LOG(INFO) << " rel32_ix " << rel32_ix_.size(); +} + +//////////////////////////////////////////////////////////////////////////////// + +// For algorithm refinement purposes it is useful to write subsets of the file +// format. This gives us the ability to estimate the entropy of the +// differential compression of the individual streams, which can provide +// invaluable insights. The default, of course, is to include all the streams. +// +enum FieldSelect { + INCLUDE_ABS32_ADDRESSES = 0x0001, + INCLUDE_REL32_ADDRESSES = 0x0002, + INCLUDE_ABS32_INDEXES = 0x0010, + INCLUDE_REL32_INDEXES = 0x0020, + INCLUDE_OPS = 0x0100, + INCLUDE_BYTES = 0x0200, + INCLUDE_COPY_COUNTS = 0x0400, + INCLUDE_MISC = 0x1000 +}; + +static FieldSelect GetFieldSelect() { +#if 1 + // TODO(sra): Use better configuration. + std::wstring s = base::SysInfo::GetEnvVar(L"A_FIELDS"); + if (!s.empty()) { + return static_cast<FieldSelect>(wcstoul(s.c_str(), 0, 0)); + } +#endif + return static_cast<FieldSelect>(~0); +} + +void EncodedProgram::WriteTo(SinkStreamSet* streams) { + FieldSelect select = GetFieldSelect(); + + // The order of fields must be consistent in WriteTo and ReadFrom, regardless + // of the streams used. The code can be configured with all kStreamXXX + // constants the same. + // + // If we change the code to pipeline reading with assembly (to avoid temporary + // storage vectors by consuming operands directly from the stream) then we + // need to read the base address and the random access address tables first, + // the rest can be interleaved. + + if (select & INCLUDE_MISC) { + // TODO(sra): write 64 bits. + streams->stream(kStreamMisc)->WriteVarint32( + static_cast<uint32>(image_base_)); + } + + if (select & INCLUDE_ABS32_ADDRESSES) + WriteU32Delta(abs32_rva_, streams->stream(kStreamAbs32Addresses)); + if (select & INCLUDE_REL32_ADDRESSES) + WriteU32Delta(rel32_rva_, streams->stream(kStreamRel32Addresses)); + if (select & INCLUDE_MISC) + WriteVector(origins_, streams->stream(kStreamOriginAddresses)); + if (select & INCLUDE_OPS) { + streams->stream(kStreamOps)->Reserve(ops_.size() + 5); // 5 for length. + WriteVector(ops_, streams->stream(kStreamOps)); + } + if (select & INCLUDE_COPY_COUNTS) + WriteVector(copy_counts_, streams->stream(kStreamCopyCounts)); + if (select & INCLUDE_BYTES) + WriteVectorU8(copy_bytes_, streams->stream(kStreamBytes)); + if (select & INCLUDE_ABS32_INDEXES) + WriteVector(abs32_ix_, streams->stream(kStreamAbs32Indexes)); + if (select & INCLUDE_REL32_INDEXES) + WriteVector(rel32_ix_, streams->stream(kStreamRel32Indexes)); +} + +bool EncodedProgram::ReadFrom(SourceStreamSet* streams) { + // TODO(sra): read 64 bits. + uint32 temp; + if (!streams->stream(kStreamMisc)->ReadVarint32(&temp)) + return false; + image_base_ = temp; + + if (!ReadU32Delta(&abs32_rva_, streams->stream(kStreamAbs32Addresses))) + return false; + if (!ReadU32Delta(&rel32_rva_, streams->stream(kStreamRel32Addresses))) + return false; + if (!ReadVector(&origins_, streams->stream(kStreamOriginAddresses))) + return false; + if (!ReadVector(&ops_, streams->stream(kStreamOps))) + return false; + if (!ReadVector(©_counts_, streams->stream(kStreamCopyCounts))) + return false; + if (!ReadVectorU8(©_bytes_, streams->stream(kStreamBytes))) + return false; + if (!ReadVector(&abs32_ix_, streams->stream(kStreamAbs32Indexes))) + return false; + if (!ReadVector(&rel32_ix_, streams->stream(kStreamRel32Indexes))) + return false; + + // Check that streams have been completely consumed. + for (int i = 0; i < kStreamLimit; ++i) { + if (streams->stream(i)->Remaining() > 0) + return false; + } + + return true; +} + +// Safe, non-throwing version of std::vector::at(). Returns 'true' for success, +// 'false' for out-of-bounds index error. +template<typename T> +bool VectorAt(const std::vector<T>& v, size_t index, T* output) { + if (index >= v.size()) + return false; + *output = v[index]; + return true; +} + +bool EncodedProgram::AssembleTo(SinkStream* final_buffer) { + // For the most part, the assembly process walks the various tables. + // ix_mumble is the index into the mumble table. + size_t ix_origins = 0; + size_t ix_copy_counts = 0; + size_t ix_copy_bytes = 0; + size_t ix_abs32_ix = 0; + size_t ix_rel32_ix = 0; + + RVA current_rva = 0; + + bool pending_base_relocation_table = false; + SinkStream bytes_following_base_relocation_table; + + SinkStream* output = final_buffer; + + for (size_t ix_ops = 0; ix_ops < ops_.size(); ++ix_ops) { + OP op = ops_[ix_ops]; + + switch (op) { + default: + return false; + + case ORIGIN: { + RVA section_rva; + if (!VectorAt(origins_, ix_origins, §ion_rva)) + return false; + ++ix_origins; + current_rva = section_rva; + break; + } + + case COPY: { + int count; + if (!VectorAt(copy_counts_, ix_copy_counts, &count)) + return false; + ++ix_copy_counts; + for (int i = 0; i < count; ++i) { + uint8 b; + if (!VectorAt(copy_bytes_, ix_copy_bytes, &b)) + return false; + ++ix_copy_bytes; + output->Write(&b, 1); + } + current_rva += count; + break; + } + + case COPY1: { + uint8 b; + if (!VectorAt(copy_bytes_, ix_copy_bytes, &b)) + return false; + ++ix_copy_bytes; + output->Write(&b, 1); + current_rva += 1; + break; + } + + case REL32: { + uint32 index; + if (!VectorAt(rel32_ix_, ix_rel32_ix, &index)) + return false; + ++ix_rel32_ix; + RVA rva; + if (!VectorAt(rel32_rva_, index, &rva)) + return false; + uint32 offset = (rva - (current_rva + 4)); + output->Write(&offset, 4); + current_rva += 4; + break; + } + + case ABS32: { + uint32 index; + if (!VectorAt(abs32_ix_, ix_abs32_ix, &index)) + return false; + ++ix_abs32_ix; + RVA rva; + if (!VectorAt(abs32_rva_, index, &rva)) + return false; + uint32 abs32 = static_cast<uint32>(rva + image_base_); + abs32_relocs_.push_back(current_rva); + output->Write(&abs32, 4); + current_rva += 4; + break; + } + + case MAKE_BASE_RELOCATION_TABLE: { + // We can see the base relocation anywhere, but we only have the + // information to generate it at the very end. So we divert the bytes + // we are generating to a temporary stream. + if (pending_base_relocation_table) // Can't have two base relocation + // tables. + return false; + + pending_base_relocation_table = true; + output = &bytes_following_base_relocation_table; + break; + // There is a potential problem *if* the instruction stream contains + // some REL32 relocations following the base relocation and in the same + // section. We don't know the size of the table, so 'current_rva' will + // be wrong, causing REL32 offsets to be miscalculated. This never + // happens; the base relocation table is usually in a section of its + // own, a data-only section, and following everything else in the + // executable except some padding zero bytes. We could fix this by + // emitting an ORIGIN after the MAKE_BASE_RELOCATION_TABLE. + } + } + } + + if (pending_base_relocation_table) { + GenerateBaseRelocations(final_buffer); + final_buffer->Append(&bytes_following_base_relocation_table); + } + + // Final verification check: did we consume all lists? + if (ix_copy_counts != copy_counts_.size()) + return false; + if (ix_copy_bytes != copy_bytes_.size()) + return false; + if (ix_abs32_ix != abs32_ix_.size()) + return false; + if (ix_rel32_ix != rel32_ix_.size()) + return false; + + return true; +} + + +// RelocBlock has the layout of a block of relocations in the base relocation +// table file format. +// +class RelocBlock { + public: + uint32 page_rva; + uint32 block_size; + uint16 relocs[4096]; // Allow up to one relocation per byte of a 4k page. + + RelocBlock() : page_rva(~0), block_size(8) {} + + void Add(uint16 item) { + relocs[(block_size-8)/2] = item; + block_size += 2; + } + + void Flush(SinkStream* buffer) { + if (block_size != 8) { + if (block_size % 4 != 0) { // Pad to make size multiple of 4 bytes. + Add(0); + } + buffer->Write(this, block_size); + block_size = 8; + } + } +}; + +COMPILE_ASSERT(offsetof(RelocBlock, relocs) == 8, reloc_block_header_size); + +void EncodedProgram::GenerateBaseRelocations(SinkStream* buffer) { + std::sort(abs32_relocs_.begin(), abs32_relocs_.end()); + + RelocBlock block; + + for (size_t i = 0; i < abs32_relocs_.size(); ++i) { + uint32 rva = abs32_relocs_[i]; + uint32 page_rva = rva & ~0xFFF; + if (page_rva != block.page_rva) { + block.Flush(buffer); + block.page_rva = page_rva; + } + block.Add(0x3000 | (rva & 0xFFF)); + } + block.Flush(buffer); +} + +//////////////////////////////////////////////////////////////////////////////// + +Status WriteEncodedProgram(EncodedProgram* encoded, SinkStreamSet* sink) { + encoded->WriteTo(sink); + return C_OK; +} + +Status ReadEncodedProgram(SourceStreamSet* streams, EncodedProgram** output) { + EncodedProgram* encoded = new EncodedProgram(); + if (encoded->ReadFrom(streams)) { + *output = encoded; + return C_OK; + } + delete encoded; + return C_DESERIALIZATION_FAILED; +} + +Status Assemble(EncodedProgram* encoded, SinkStream* buffer) { + bool assembled = encoded->AssembleTo(buffer); + if (assembled) + return C_OK; + return C_ASSEMBLY_FAILED; +} + +void DeleteEncodedProgram(EncodedProgram* encoded) { + delete encoded; +} + +} // end namespace diff --git a/courgette/encoded_program.h b/courgette/encoded_program.h new file mode 100644 index 0000000..df1119c --- /dev/null +++ b/courgette/encoded_program.h @@ -0,0 +1,83 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COURGETTE_ENCODED_PROGRAM_H_ +#define COURGETTE_ENCODED_PROGRAM_H_ + +#include <vector> + +#include "base/basictypes.h" +#include "courgette/image_info.h" + +namespace courgette { + +class SinkStream; +class SinkStreamSet; +class SourceStreamSet; + +// An EncodedProgram is a set of tables that contain a simple 'binary assembly +// language' that can be assembled to produce a sequence of bytes, for example, +// a Windows 32-bit executable. +// +class EncodedProgram { + public: + EncodedProgram(); + ~EncodedProgram(); + + // Generating an EncodedProgram: + // + // (1) The image base can be specified at any time. + void set_image_base(uint64 base) { image_base_ = base; } + + // (2) Address tables and indexes defined first. + void DefineRel32Label(int index, RVA address); + void DefineAbs32Label(int index, RVA address); + void EndLabels(); + + // (3) Add instructions in the order needed to generate bytes of file. + void AddOrigin(RVA rva); + void AddCopy(int count, const void* bytes); + void AddRel32(int label_index); + void AddAbs32(int label_index); + void AddMakeRelocs(); + + // (3) Serialize binary assembly language tables to a set of streams. + void WriteTo(SinkStreamSet *streams); + + // Using an EncodedProgram to generate a byte stream: + // + // (4) Deserializes a fresh EncodedProgram from a set of streams. + bool ReadFrom(SourceStreamSet *streams); + + // (5) Assembles the 'binary assembly language' into final file. + bool AssembleTo(SinkStream *buffer); + + private: + enum OP; // Binary assembly language operations. + + void DebuggingSummary(); + void GenerateBaseRelocations(SinkStream *buffer); + void DefineLabelCommon(std::vector<RVA>*, int, RVA); + void FinishLabelsCommon(std::vector<RVA>* addresses); + + // Binary assembly language tables. + uint64 image_base_; + std::vector<RVA> rel32_rva_; + std::vector<RVA> abs32_rva_; + std::vector<OP> ops_; + std::vector<RVA> origins_; + std::vector<int> copy_counts_; + std::vector<uint8> copy_bytes_; + std::vector<uint32> rel32_ix_; + std::vector<uint32> abs32_ix_; + + // Table of the addresses containing abs32 relocations; computed during + // assembly, used to generate base relocation table. + std::vector<uint32> abs32_relocs_; + + DISALLOW_COPY_AND_ASSIGN(EncodedProgram); +}; + +} // namespace courgette +#endif // COURGETTE_ENCODED_FORMAT_H_ diff --git a/courgette/encoded_program_fuzz_unittest.cc b/courgette/encoded_program_fuzz_unittest.cc new file mode 100644 index 0000000..adc09d9 --- /dev/null +++ b/courgette/encoded_program_fuzz_unittest.cc @@ -0,0 +1,235 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Fuzz testing for EncodedProgram serialized format and assembly. +// +// We would like some assurance that if an EncodedProgram is malformed we will +// not crash. The EncodedProgram could be malformed either due to malicious +// attack to due to an error in patch generation. +// +// We try a lot of arbitrary modifications to the serialized form and make sure +// that the outcome is not a crash. + +#include <string> + +#include "base/path_service.h" +#include "base/file_util.h" +#include "base/string_util.h" +#include "base/test_suite.h" + +#include "courgette/courgette.h" +#include "courgette/streams.h" + +#include "testing/gtest/include/gtest/gtest.h" + +class DecodeFuzzTest : public testing::Test { + public: + void FuzzExe(const char *) const; + + private: + virtual void SetUp() { + PathService::Get(base::DIR_SOURCE_ROOT, &testdata_dir_); + testdata_dir_ = testdata_dir_.Append(L"courgette"); + testdata_dir_ = testdata_dir_.Append(L"testdata"); + } + + virtual void TearDown() { } + + void FuzzByte(const std::string& buffer, const std::string& output, + size_t index) const; + void FuzzBits(const std::string& buffer, const std::string& output, + size_t index, int bits_to_flip) const; + + // Returns true if could assemble, false if rejected. + bool TryAssemble(const std::string& buffer, std::string* output) const; + + // Returns contents of |file_name| as uninterprested bytes stored in a string. + std::string FileContents(const char* file_name) const; + + // Full path name of testdata directory + FilePath testdata_dir_; +}; + +// Reads a test file into a string. +std::string DecodeFuzzTest::FileContents(const char* file_name) const { + FilePath file_path = testdata_dir_.AppendASCII(file_name); + std::string file_contents; + if (!file_util::ReadFileToString(file_path, &file_contents)) { + EXPECT_TRUE(!"Could not read test data"); + } + return file_contents; +} + +// Loads an executable and does fuzz testing in the serialized format. +void DecodeFuzzTest::FuzzExe(const char* file_name) const { + std::string file1 = FileContents(file_name); + + const void* original_buffer = file1.c_str(); + size_t original_length = file1.size(); + + courgette::AssemblyProgram* program = NULL; + const courgette::Status parse_status = + courgette::ParseWin32X86PE(original_buffer, original_length, &program); + EXPECT_EQ(courgette::C_OK, parse_status); + + courgette::EncodedProgram* encoded = NULL; + + const courgette::Status encode_status = Encode(program, &encoded); + EXPECT_EQ(courgette::C_OK, encode_status); + + DeleteAssemblyProgram(program); + + courgette::SinkStreamSet sinks; + const courgette::Status write_status = WriteEncodedProgram(encoded, &sinks); + EXPECT_EQ(courgette::C_OK, write_status); + + DeleteEncodedProgram(encoded); + + courgette::SinkStream sink; + bool can_collect = sinks.CopyTo(&sink); + EXPECT_TRUE(can_collect); + + size_t length = sink.Length(); + + std::string base_buffer(reinterpret_cast<const char*>(sink.Buffer()), length); + std::string base_output; + bool ok = TryAssemble(base_buffer, &base_output); + EXPECT_EQ(true, ok); + + // Now we have a good serialized EncodedProgram in |base_buffer|. Time to + // fuzz. + + // More intense fuzzing on the first part because it contains more control + // information like substeam lengths. + size_t position = 0; + for ( ; position < 100 && position < length; position += 1) { + FuzzByte(base_buffer, base_output, position); + } + // We would love to fuzz every position, but it takes too long. + for ( ; position < length; position += 900) { + FuzzByte(base_buffer, base_output, position); + } +} + +// FuzzByte tries to break the EncodedProgram deserializer and assembler. It +// takes a good serialization of and EncodedProgram, flips some bits, and checks +// that the behaviour is reasonable. It has testing checks for unreasonable +// behaviours. +void DecodeFuzzTest::FuzzByte(const std::string& base_buffer, + const std::string& base_output, + size_t index) const { + printf("Fuzzing position %d\n", static_cast<int>(index)); + + // The following 10 values are a compromize between run time and coverage of + // the 255 'wrong' values at this byte position. + + // 0xFF flips all the bits. + FuzzBits(base_buffer, base_output, index, 0xFF); + // 0x7F flips the most bits without changing Varint32 framing. + FuzzBits(base_buffer, base_output, index, 0x7F); + // These all flip one bit. + FuzzBits(base_buffer, base_output, index, 0x80); + FuzzBits(base_buffer, base_output, index, 0x40); + FuzzBits(base_buffer, base_output, index, 0x20); + FuzzBits(base_buffer, base_output, index, 0x10); + FuzzBits(base_buffer, base_output, index, 0x08); + FuzzBits(base_buffer, base_output, index, 0x04); + FuzzBits(base_buffer, base_output, index, 0x02); + FuzzBits(base_buffer, base_output, index, 0x01); +} + +// FuzzBits tries to break the EncodedProgram deserializer and assembler. It +// takes a good serialization of and EncodedProgram, flips some bits, and checks +// that the behaviour is reasonable. +// +// There are EXPECT calls to check for unreasonable behaviour. These are +// somewhat arbitrary in that the parameters cannot easily be derived from first +// principles. They may need updating as the serialized format evolves. +void DecodeFuzzTest::FuzzBits(const std::string& base_buffer, + const std::string& base_output, + size_t index, int bits_to_flip) const { + std::string modified_buffer = base_buffer; + std::string modified_output; + modified_buffer[index] ^= bits_to_flip; + + bool ok = TryAssemble(modified_buffer, &modified_output); + + if (ok) { + // We normally expect TryAssemble to fail. But sometimes it succeeds. + // What could have happened? We changed one byte in the serialized form: + // + // * If we changed one of the copied bytes, we would see a single byte + // change in the output. + // * If we changed an address table element, all the references to that + // address would be different. + // * If we changed a copy count, we would run out of data in some stream, + // or leave data remaining, so should not be here. + // * If we changed an origin address, it could affect all relocations based + // off that address. If no relocations were based off the address then + // there will be no changes. + // * If we changed an origin address, it could cause some abs32 relocs to + // shift from one page to the next, changing the number and layout of + // blocks in the base relocation table. + + // Generated length could vary slightly due to base relocation table layout. + // In the worst case the number of base relocation blocks doubles, approx + // 12/4096 or 0.3% size of file. + size_t base_length = base_output.length(); + size_t modified_length = modified_output.length(); + ptrdiff_t diff = base_length - modified_length; + if (diff < -200 || diff > 200) { + EXPECT_EQ(base_length, modified_length); + } + + size_t changed_byte_count = 0; + for (size_t i = 0; i < base_length && i < modified_length; ++i) { + changed_byte_count += (base_output[i] != modified_output[i]); + } + + if (index > 60) { // Beyond the origin addresses ... + EXPECT_NE(0, changed_byte_count); // ... we expect some difference. + } + // Currently all changes are smaller than this number: + EXPECT_GE(45000u, changed_byte_count); + } +} + +bool DecodeFuzzTest::TryAssemble(const std::string& buffer, + std::string* output) const { + courgette::EncodedProgram *encoded = NULL; + bool result = false; + + courgette::SourceStreamSet sources; + bool can_get_source_streams = sources.Init(buffer.c_str(), buffer.length()); + if (can_get_source_streams) { + const courgette::Status read_status = + ReadEncodedProgram(&sources, &encoded); + if (read_status == courgette::C_OK) { + courgette::SinkStream assembled; + const courgette::Status assemble_status = Assemble(encoded, &assembled); + + if (assemble_status == courgette::C_OK) { + const void* assembled_buffer = assembled.Buffer(); + size_t assembled_length = assembled.Length(); + + output->clear(); + output->assign(reinterpret_cast<const char*>(assembled_buffer), + assembled_length); + result = true; + } + } + } + + DeleteEncodedProgram(encoded); + + return result; +} + +TEST_F(DecodeFuzzTest, All) { + FuzzExe("setup1.exe"); +} + +int main(int argc, char** argv) { + return TestSuite(argc, argv).Run(); +} diff --git a/courgette/encoded_program_unittest.cc b/courgette/encoded_program_unittest.cc new file mode 100644 index 0000000..aedb838 --- /dev/null +++ b/courgette/encoded_program_unittest.cc @@ -0,0 +1,64 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "courgette/encoded_program.h" + +#include "courgette/streams.h" +#include "testing/gtest/include/gtest/gtest.h" + +TEST(EncodedProgramTest, Test) { + // + // Create a simple program with a few addresses and references and + // check that the bits produced are as expected. + // + courgette::EncodedProgram* program = new courgette::EncodedProgram(); + + uint32 base = 0x00900000; + program->set_image_base(base); + + program->DefineRel32Label(5, 0); // REL32 index 5 == base + 0 + program->DefineAbs32Label(7, 4); // ABS32 index 7 == base + 4 + program->EndLabels(); + + program->AddOrigin(0); // Start at base. + program->AddAbs32(7); + program->AddRel32(5); + + // Serialize and deserialize. + + courgette::SinkStreamSet sinks; + program->WriteTo(&sinks); + + courgette::SinkStream sink; + bool can_collect = sinks.CopyTo(&sink); + EXPECT_TRUE(can_collect); + + const void* buffer = sink.Buffer(); + size_t length = sink.Length(); + + courgette::SourceStreamSet sources; + bool can_get_source_streams = sources.Init(buffer, length); + EXPECT_TRUE(can_get_source_streams); + + courgette::EncodedProgram *encoded2 = new courgette::EncodedProgram(); + bool can_read = encoded2->ReadFrom(&sources); + EXPECT_TRUE(can_read); + + // Finally, try to assemble. + courgette::SinkStream assembled; + bool can_assemble = encoded2->AssembleTo(&assembled); + EXPECT_TRUE(can_assemble); + + const void* assembled_buffer = assembled.Buffer(); + size_t assembled_length = assembled.Length(); + + EXPECT_EQ(8, assembled_length); + + static const uint8 golden[] = { + 0x04, 0x00, 0x90, 0x00, // ABS32 to base + 4 + 0xF8, 0xFF, 0xFF, 0xFF // REL32 from next line to base + 2 + }; + + EXPECT_EQ(0, memcmp(assembled_buffer, golden, 8)); +} diff --git a/courgette/ensemble.cc b/courgette/ensemble.cc new file mode 100644 index 0000000..15bfb3e --- /dev/null +++ b/courgette/ensemble.cc @@ -0,0 +1,102 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// *** File comments + +#include "courgette/ensemble.h" + +#include "base/basictypes.h" +#include "base/logging.h" +#include "base/string_util.h" + +#include "courgette/image_info.h" +#include "courgette/region.h" +#include "courgette/streams.h" +#include "courgette/simple_delta.h" + +namespace courgette { + +Element::Element(Kind kind, Ensemble* ensemble, const Region& region) + : kind_(kind), ensemble_(ensemble), region_(region) { +} + +std::string Element::Name() const { + return ensemble_->name() + "(" + + IntToString(kind()) + "," + + Uint64ToString(offset_in_ensemble()) + "," + + Uint64ToString(region().length()) + ")"; +} + +// A subclass of Element that has a PEInfo. +class ElementWinPE : public Element { + public: + ElementWinPE(Kind kind, Ensemble* ensemble, const Region& region, + PEInfo* info) + : Element(kind, ensemble, region), + pe_info_(info) { + } + + virtual PEInfo* GetPEInfo() const { return pe_info_; } + + protected: + ~ElementWinPE() { delete pe_info_; } + + private: + PEInfo* pe_info_; // Owned by |this|. +}; + +// Scans the Ensemble's region, sniffing out Elements. We assume that the +// elements do not overlap. +Status Ensemble::FindEmbeddedElements() { + size_t length = region_.length(); + const uint8* start = region_.start(); + + size_t position = 0; + while (position < length) { + // Quick test; Windows executables begin with 'MZ'. + if (start[position] == 'M' && + position + 1 < length && start[position + 1] == 'Z') { + courgette::PEInfo *info = new courgette::PEInfo(); + info->Init(start + position, length - position); + if (info->ParseHeader()) { + Region region(start + position, info->length()); + + if (info->has_text_section()) { + Element* element = new ElementWinPE(Element::WIN32_X86_WITH_CODE, + this, region, info); + owned_elements_.push_back(element); + elements_.push_back(element); + position += region.length(); + continue; + } + + // If we had a clever transformation for resource-only executables we + // should identify the suitable elements here: + if (!info->has_text_section() && false) { + Element* element = new ElementWinPE(Element::WIN32_NOCODE, + this, region, info); + owned_elements_.push_back(element); + elements_.push_back(element); + position += region.length(); + continue; + } + } + delete info; + } + + // This is where to add new formats, e.g. Linux executables, Dalvik + // executables etc. + + // No Element found at current position. + ++position; + } + return C_OK; +} + +Ensemble::~Ensemble() { + for (size_t i = 0; i < owned_elements_.size(); ++i) + delete owned_elements_[i]; +} + +} // namespace diff --git a/courgette/ensemble.h b/courgette/ensemble.h new file mode 100644 index 0000000..6a58cb0 --- /dev/null +++ b/courgette/ensemble.h @@ -0,0 +1,257 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// The main idea in Courgette is to do patching *under a tranformation*. The +// input is transformed into a new representation, patching occurs in the new +// repesentation, and then the tranform is reversed to get the patched data. +// +// The idea is applied to pieces (or 'Elements') of the whole (or 'Ensemble'). +// Each of the elements has to go through the same set of steps in lock-step, +// but there may be many different kinds of elements, which have different +// transformation. +// +// This file declares all the main types involved in creating and applying a +// patch with this structure. + +#ifndef COURGETTE_ENSEMBLE_H_ +#define COURGETTE_ENSEMBLE_H_ + +#include <vector> +#include <string> + +#include "base/basictypes.h" + +#include "courgette/courgette.h" +#include "courgette/region.h" +#include "courgette/streams.h" + +namespace courgette { + +// Forward declarations: +class Ensemble; +class PEInfo; + +// An Element is a region of an Ensemble with an identifyable kind. +// +class Element { + public: + enum Kind { WIN32_X86_WITH_CODE, WIN32_NOCODE }; + + virtual ~Element() {} + + Kind kind() const { return kind_; } + const Region& region() const { return region_; } + + // The name is used only for debugging and logging. + virtual std::string Name() const; + + // Returns the byte position of this Element relative to the start of + // containing Ensemble. + size_t offset_in_ensemble() const; + + // Some subclasses of Element might have a PEInfo. + virtual PEInfo* GetPEInfo() const { return NULL; } + + protected: + Element(Kind kind, Ensemble* ensemble, const Region& region); + + private: + Kind kind_; + Ensemble* ensemble_; + Region region_; + + DISALLOW_COPY_AND_ASSIGN(Element); +}; + + +class Ensemble { + public: + Ensemble(const Region& region, const char* name) + : region_(region), name_(name) {} + ~Ensemble(); + + const Region& region() const { return region_; } + const std::string& name() const { return name_; } + + // Scans the region to find Elements within the region(). + Status FindEmbeddedElements(); + + // Returns the elements found by 'FindEmbeddedElements'. + const std::vector<Element*>& elements() const { return elements_; } + + + private: + Region region_; // The memory, owned by caller, containing the + // Ensemble's data. + std::string name_; // A debugging/logging name for the Ensemble. + + std::vector<Element*> elements_; // Embedded elements discovered. + std::vector<Element*> owned_elements_; // For deallocation. + + DISALLOW_COPY_AND_ASSIGN(Ensemble); +}; + +inline size_t Element::offset_in_ensemble() const { + return region().start() - ensemble_->region().start(); +} + +// The 'CourgettePatchFile' is class is a 'namespace' for the constants that +// appear in a Courgette patch file. +struct CourgettePatchFile { + // + // The Courgette patch format interleaves the data for N embedded Elements. + // + // Format of a patch file: + // header: + // magic + // version + // source-checksum + // target-checksum + // multiple-streams: + // stream 0: + // number-of-transformed-elements (N) - varint32 + // transformation-1-method-id + // transformation-2-method-id + // ... + // transformation-1-initial-parameters + // transformation-2-initial-parameters + // ... + // stream 1: + // correction: + // transformation-1-parameters + // transformation-2-parameters + // ... + // stream 2: + // correction: + // transformed-element-1 + // transformed-element-2 + // ... + // stream 3: + // correction: + // base-file + // element-1 + // element-2 + // ... + + static const uint32 kMagic = 'C' | ('o' << 8) | ('u' << 16); + + static const uint32 kVersion = 20090320; + + // Transformation method IDs. + enum TransformationMethodId { + T_COURGETTE_WIN32_X86 = 1, // Windows 32 bit 'Portable Executable' x86. + }; +}; + +// For any transform you would implement both a TransformationPatcher and a +// TransformationPatchGenerator. +// +// TransformationPatcher is the interface which abstracts out the actual +// transformation used on an Element. The patching itself happens outside the +// actions of a TransformationPatcher. There are four steps. +// +// The first step is an Init step. The parameters to the Init step identify the +// element, for example, range of locations within the original ensemble that +// correspond to the element. +// +// PredictTransformParameters, explained below. +// +// The two final steps are 'Transform' - to transform the element into a new +// representation, and to 'Reform' - to transform from the new representation +// back to the original form. +// +// The Transform step takes some parameters. This allows the transform to be +// customized to the particular element, or to receive some assistance in the +// analysis required to perform the transform. The transform parameters might +// be extensive but mostly predicable, so preceeding Transform is a +// PredictTransformParameters step. +// +class TransformationPatcher { + public: + virtual ~TransformationPatcher() {} + + // First step: provides parameters for the patching. This would at a minimum + // identify the element within the ensemble being patched. + virtual Status Init(SourceStream* parameter_stream) = 0; + + // Second step: predicts transform parameters. + virtual Status PredictTransformParameters( + SinkStreamSet* predicted_parameters) = 0; + + // Third step: transforms element from original representation into alternate + // representation. + virtual Status Transform(SourceStreamSet* corrected_parameters, + SinkStreamSet* transformed_element) = 0; + + // Final step: transforms element back from alternate representation into + // original representation. + virtual Status Reform(SourceStreamSet* transformed_element, + SinkStream* reformed_element) = 0; +}; + +// TransformationPatchGenerator is the interface which abstracts out the actual +// transformation used (and adjustment used) when differentially compressing one +// Element from the |new_ensemble| against a corresponding element in the +// |old_ensemble|. +// +// This is not a pure interface. There is a small amount of inheritance +// implementation for the fields and actions common to all +// TransformationPatchGenerators. +// +// When TransformationPatchGenerator is subclassed, there will be a +// corresponding subclass of TransformationPatcher. +// +class TransformationPatchGenerator { + public: + TransformationPatchGenerator(Element* old_element, + Element* new_element, + TransformationPatcher* patcher); + + virtual ~TransformationPatchGenerator(); + + // Returns the TransformationMethodId that identies this transformation. + virtual CourgettePatchFile::TransformationMethodId Kind() = 0; + + // Writes the parameters that will be passed to TransformationPatcher::Init. + virtual Status WriteInitialParameters(SinkStream* parameter_stream) = 0; + + // Predicts the transform parameters for the |old_element|. This must match + // exactly the output that will be produced by the PredictTransformParameters + // method of the corresponding subclass of TransformationPatcher. This method + // is not pure. The default implementation delegates to the patcher to + // guarantee matching output. + virtual Status PredictTransformParameters(SinkStreamSet* prediction); + + // Writes the desired parameters for the transform of the old element from the + // file representation to the alternate representation. + virtual Status CorrectedTransformParameters(SinkStreamSet* parameters) = 0; + + // Writes both |old_element| and |new_element| in the new representation. + // |old_corrected_parameters| will match the |corrected_parameters| passed to + // the Transform method of the corresponding sublcass of + // TransformationPatcher. + // + // The output written to |old_transformed_element| must match exactly the + // output written by the Transform method of the corresponding subclass of + // TransformationPatcher. + virtual Status Transform(SourceStreamSet* old_corrected_parameters, + SinkStreamSet* old_transformed_element, + SinkStreamSet* new_transformed_element) = 0; + + // Transforms the new transformed_element back from the alternate + // representation into the original file format. This must match exactly the + // output that will be produced by the corresponding subclass of + // TransformationPatcher::Reform. This method is not pure. The default + // implementation delegates to the patcher. + virtual Status Reform(SourceStreamSet* transformed_element, + SinkStream* reformed_element); + + protected: + Element* old_element_; + Element* new_element_; + TransformationPatcher* patcher_; +}; + +} // namespace +#endif // COURGETTE_ENSEMBLE_H_ diff --git a/courgette/ensemble_apply.cc b/courgette/ensemble_apply.cc new file mode 100644 index 0000000..5a9edfd4 --- /dev/null +++ b/courgette/ensemble_apply.cc @@ -0,0 +1,412 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// This file contains the code to apply a Courgette patch. + +#include "courgette/ensemble.h" + +#include "base/basictypes.h" +#include "base/file_util.h" +#include "base/logging.h" + +#include "courgette/crc.h" +#include "courgette/image_info.h" +#include "courgette/region.h" +#include "courgette/streams.h" +#include "courgette/simple_delta.h" +#include "courgette/win32_x86_patcher.h" + +namespace courgette { + +// EnsemblePatchApplication is all the logic and data required to apply the +// multi-stage patch. +class EnsemblePatchApplication { + public: + EnsemblePatchApplication(); + ~EnsemblePatchApplication(); + + Status ReadHeader(SourceStream* header_stream); + + Status InitBase(const Region& region); + + Status ValidateBase(); + + Status ReadInitialParameters(SourceStream* initial_parameters); + + Status PredictTransformParameters(SinkStreamSet* predicted_parameters); + + Status SubpatchTransformParameters(SinkStreamSet* prediction, + SourceStream* correction, + SourceStreamSet* corrected_parameters); + + Status TransformUp(SourceStreamSet* parameters, + SinkStreamSet* transformed_elements); + + Status SubpatchTransformedElements(SinkStreamSet* elements, + SourceStream* correction, + SourceStreamSet* corrected_elements); + + Status TransformDown(SourceStreamSet* transformed_elements, + SinkStream* basic_elements); + + Status SubpatchFinalOutput(SourceStream* original, + SourceStream* correction, + SinkStream* corrected_ensemble); + + private: + Status SubpatchStreamSets(SinkStreamSet* predicted_items, + SourceStream* correction, + SourceStreamSet* corrected_items, + SinkStream* corrected_items_storage); + + Region base_region_; // Location of in-memory copy of 'old' version. + + uint32 source_checksum_; + uint32 target_checksum_; + + std::vector<TransformationPatcher*> patchers_; + + SinkStream corrected_parameters_storage_; + SinkStream corrected_elements_storage_; + + DISALLOW_COPY_AND_ASSIGN(EnsemblePatchApplication); +}; + +EnsemblePatchApplication::EnsemblePatchApplication() + : source_checksum_(0), target_checksum_(0) { +} + +EnsemblePatchApplication::~EnsemblePatchApplication() { + for (size_t i = 0; i < patchers_.size(); ++i) { + delete patchers_[i]; + } +} + +Status EnsemblePatchApplication::ReadHeader(SourceStream* header_stream) { + uint32 magic; + if (!header_stream->ReadVarint32(&magic)) + return C_BAD_ENSEMBLE_MAGIC; + + if (magic != CourgettePatchFile::kMagic) + return C_BAD_ENSEMBLE_MAGIC; + + uint32 version; + if (!header_stream->ReadVarint32(&version)) + return C_BAD_ENSEMBLE_VERSION; + + if (version != CourgettePatchFile::kVersion) + return C_BAD_ENSEMBLE_VERSION; + + if (!header_stream->ReadVarint32(&source_checksum_)) + return C_BAD_ENSEMBLE_HEADER; + + if (!header_stream->ReadVarint32(&target_checksum_)) + return C_BAD_ENSEMBLE_HEADER; + + return C_OK; +} + +Status EnsemblePatchApplication::InitBase(const Region& region) { + base_region_.assign(region); + return C_OK; +} + +Status EnsemblePatchApplication::ValidateBase() { + uint32 checksum = CalculateCrc(base_region_.start(), base_region_.length()); + if (source_checksum_ != checksum) + return C_BAD_ENSEMBLE_CRC; + + return C_OK; +} + +Status EnsemblePatchApplication::ReadInitialParameters( + SourceStream* transformation_parameters) { + uint32 number_of_transformations = 0; + if (!transformation_parameters->ReadVarint32(&number_of_transformations)) + return C_BAD_ENSEMBLE_HEADER; + + for (size_t i = 0; i < number_of_transformations; ++i) { + uint32 kind; + if (!transformation_parameters->ReadVarint32(&kind)) + return C_BAD_ENSEMBLE_HEADER; + + if (kind == CourgettePatchFile::T_COURGETTE_WIN32_X86) { + TransformationPatcher* patcher = + new CourgetteWin32X86Patcher(base_region_); + patchers_.push_back(patcher); + } else { + return C_BAD_ENSEMBLE_HEADER; + } + } + + for (size_t i = 0; i < patchers_.size(); ++i) { + Status status = patchers_[i]->Init(transformation_parameters); + if (status != C_OK) + return status; + } + + // All transformation_parameters should have been consumed by the above loop. + if (!transformation_parameters->Empty()) + return C_BAD_ENSEMBLE_HEADER; + + return C_OK; +} + +Status EnsemblePatchApplication::PredictTransformParameters( + SinkStreamSet* all_predicted_parameters) { + for (size_t i = 0; i < patchers_.size(); ++i) { + SinkStreamSet single_predicted_parameters; + Status status = + patchers_[i]->PredictTransformParameters(&single_predicted_parameters); + if (status != C_OK) + return status; + if (!all_predicted_parameters->WriteSet(&single_predicted_parameters)) + return C_STREAM_ERROR; + } + return C_OK; +} + +Status EnsemblePatchApplication::SubpatchTransformParameters( + SinkStreamSet* predicted_parameters, + SourceStream* correction, + SourceStreamSet* corrected_parameters) { + return SubpatchStreamSets(predicted_parameters, + correction, + corrected_parameters, + &corrected_parameters_storage_); +} + +Status EnsemblePatchApplication::TransformUp( + SourceStreamSet* parameters, + SinkStreamSet* transformed_elements) { + for (size_t i = 0; i < patchers_.size(); ++i) { + SourceStreamSet single_parameters; + if (!parameters->ReadSet(&single_parameters)) + return C_STREAM_ERROR; + SinkStreamSet single_transformed_element; + Status status = patchers_[i]->Transform(&single_parameters, + &single_transformed_element); + if (status != C_OK) + return status; + if (!single_parameters.Empty()) + return C_STREAM_NOT_CONSUMED; + if (!transformed_elements->WriteSet(&single_transformed_element)) + return C_STREAM_ERROR; + } + + if (!parameters->Empty()) + return C_STREAM_NOT_CONSUMED; + return C_OK; +} + +Status EnsemblePatchApplication::SubpatchTransformedElements( + SinkStreamSet* predicted_elements, + SourceStream* correction, + SourceStreamSet* corrected_elements) { + return SubpatchStreamSets(predicted_elements, + correction, + corrected_elements, + &corrected_elements_storage_); +} + +Status EnsemblePatchApplication::TransformDown( + SourceStreamSet* transformed_elements, + SinkStream* basic_elements) { + // Construct blob of original input followed by reformed elements. + + // The original input: + basic_elements->Write(base_region_.start(), base_region_.length()); + + for (size_t i = 0; i < patchers_.size(); ++i) { + SourceStreamSet single_corrected_element; + if (!transformed_elements->ReadSet(&single_corrected_element)) + return C_STREAM_ERROR; + Status status = patchers_[i]->Reform(&single_corrected_element, + basic_elements); + if (status != C_OK) + return status; + if (!single_corrected_element.Empty()) + return C_STREAM_NOT_CONSUMED; + } + + if (!transformed_elements->Empty()) + return C_STREAM_NOT_CONSUMED; + + return C_OK; +} + +Status EnsemblePatchApplication::SubpatchFinalOutput( + SourceStream* original, + SourceStream* correction, + SinkStream* corrected_ensemble) { + Status delta_status = ApplySimpleDelta(original, correction, + corrected_ensemble); + if (delta_status != C_OK) + return delta_status; + + if (CalculateCrc(corrected_ensemble->Buffer(), + corrected_ensemble->Length()) != target_checksum_) + return C_BAD_ENSEMBLE_CRC; + + return C_OK; +} + +Status EnsemblePatchApplication::SubpatchStreamSets( + SinkStreamSet* predicted_items, + SourceStream* correction, + SourceStreamSet* corrected_items, + SinkStream* corrected_items_storage) { + SinkStream linearized_predicted_items; + if (!predicted_items->CopyTo(&linearized_predicted_items)) + return C_STREAM_ERROR; + + SourceStream prediction; + prediction.Init(linearized_predicted_items); + + Status status = ApplySimpleDelta(&prediction, + correction, + corrected_items_storage); + if (status != C_OK) + return status; + + if (!corrected_items->Init(corrected_items_storage->Buffer(), + corrected_items_storage->Length())) + return C_STREAM_ERROR; + + return C_OK; +} + +Status ApplyEnsemblePatch(SourceStream* base, + SourceStream* patch, + SinkStream* output) { + Status status; + EnsemblePatchApplication patch_process; + + status = patch_process.ReadHeader(patch); + if (status != C_OK) + return status; + + status = patch_process.InitBase(Region(base->Buffer(), base->Remaining())); + if (status != C_OK) + return status; + + status = patch_process.ValidateBase(); + if (status != C_OK) + return status; + + // The rest of the patch stream is a StreamSet. + SourceStreamSet patch_streams; + patch_streams.Init(patch); + + SourceStream* transformation_descriptions = patch_streams.stream(0); + SourceStream* parameter_correction = patch_streams.stream(1); + SourceStream* transformed_elements_correction = patch_streams.stream(2); + SourceStream* ensemble_correction = patch_streams.stream(3); + + status = patch_process.ReadInitialParameters(transformation_descriptions); + if (status != C_OK) + return status; + + SinkStreamSet predicted_parameters; + status = patch_process.PredictTransformParameters(&predicted_parameters); + if (status != C_OK) + return status; + + SourceStreamSet corrected_parameters; + status = patch_process.SubpatchTransformParameters(&predicted_parameters, + parameter_correction, + &corrected_parameters); + if (status != C_OK) + return status; + + SinkStreamSet transformed_elements; + status = patch_process.TransformUp(&corrected_parameters, + &transformed_elements); + if (status != C_OK) + return status; + + SourceStreamSet corrected_transformed_elements; + status = patch_process.SubpatchTransformedElements( + &transformed_elements, + transformed_elements_correction, + &corrected_transformed_elements); + if (status != C_OK) + return status; + + SinkStream original_ensemble_and_corrected_base_elements; + status = patch_process.TransformDown( + &corrected_transformed_elements, + &original_ensemble_and_corrected_base_elements); + if (status != C_OK) + return status; + + SourceStream final_patch_prediction; + final_patch_prediction.Init(original_ensemble_and_corrected_base_elements); + status = patch_process.SubpatchFinalOutput(&final_patch_prediction, + ensemble_correction, output); + if (status != C_OK) + return status; + + return C_OK; +} + +Status ApplyEnsemblePatch(const wchar_t* old_file_name, + const wchar_t* patch_file_name, + const wchar_t* new_file_name) { + Status status; + + // First read enough of the patch file to validate the header is well-formed. + // A few varint32 numbers should fit in 100. + FilePath patch_file_path(patch_file_name); + const int BIG_ENOUGH_FOR_HEADER = 100; + char buffer[BIG_ENOUGH_FOR_HEADER]; + int read_count = + file_util::ReadFile(patch_file_path, buffer, sizeof(buffer)); + if (read_count < 0) + return C_READ_OPEN_ERROR; + + // 'Dry-run' the first step of the patch process to validate format of header. + SourceStream patch_header_stream; + patch_header_stream.Init(buffer, read_count); + EnsemblePatchApplication patch_process; + status = patch_process.ReadHeader(&patch_header_stream); + if (status != C_OK) + return status; + + // Header smells good so read the whole patch file for real. + std::string patch_file_buffer; + if (!file_util::ReadFileToString(patch_file_path, &patch_file_buffer)) + return C_READ_ERROR; + + // Read the old_file. + FilePath old_file_path(old_file_name); + std::string old_file_buffer; + if (!file_util::ReadFileToString(old_file_path, &old_file_buffer)) + return C_READ_ERROR; + + // Apply patch on streams. + SourceStream old_source_stream; + SourceStream patch_source_stream; + old_source_stream.Init(old_file_buffer); + patch_source_stream.Init(patch_file_buffer); + SinkStream new_sink_stream; + status = ApplyEnsemblePatch(&old_source_stream, &patch_source_stream, + &new_sink_stream); + + // Write the patched data to |new_file_name|. + FilePath new_file_path(new_file_name); + int written = + file_util::WriteFile( + new_file_path, + reinterpret_cast<const char*>(new_sink_stream.Buffer()), + new_sink_stream.Length()); + if (written == -1) + return C_WRITE_OPEN_ERROR; + if (written != new_sink_stream.Length()) + return C_WRITE_ERROR; + + return C_OK; +} + +} // namespace diff --git a/courgette/ensemble_create.cc b/courgette/ensemble_create.cc new file mode 100644 index 0000000..6e64474 --- /dev/null +++ b/courgette/ensemble_create.cc @@ -0,0 +1,382 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// The main idea in Courgette is to do patching *under a tranformation*. The +// input is transformed into a new representation, patching occurs in the new +// repesentation, and then the tranform is reversed to get the patched data. +// +// The idea is applied to pieces (or 'elements') of the whole (or 'ensemble'). +// Each of the elements has to go through the same set of steps in lock-step. + +// This file contains the code to create the patch. + + +#include "courgette/ensemble.h" + +#include <vector> +#include <limits> + +#include "base/basictypes.h" +#include "base/logging.h" +#include "base/time.h" + +#include "courgette/third_party/bsdiff.h" +#include "courgette/crc.h" +#include "courgette/difference_estimator.h" +#include "courgette/image_info.h" +#include "courgette/streams.h" +#include "courgette/region.h" +#include "courgette/simple_delta.h" + +#include "courgette/win32_x86_patcher.h" +#include "courgette/win32_x86_generator.h" + +namespace courgette { + +TransformationPatchGenerator::TransformationPatchGenerator( + Element* old_element, + Element* new_element, + TransformationPatcher* patcher) + : old_element_(old_element), + new_element_(new_element), + patcher_(patcher) { +} + +TransformationPatchGenerator::~TransformationPatchGenerator() { + delete patcher_; +} + +// The default implementation of PredictTransformParameters delegates to the +// patcher. +Status TransformationPatchGenerator::PredictTransformParameters( + SinkStreamSet* prediction) { + return patcher_->PredictTransformParameters(prediction); +} + +// The default implementation of Reform delegates to the patcher. +Status TransformationPatchGenerator::Reform( + SourceStreamSet* transformed_element, + SinkStream* reformed_element) { + return patcher_->Reform(transformed_element, reformed_element); +} + +// Makes a TransformationPatchGenerator of the appropriate variety for the +// Element kind. +TransformationPatchGenerator* MakeGenerator(Element* old_element, + Element* new_element) { + if (new_element->kind() == Element::WIN32_X86_WITH_CODE) { + CourgetteWin32X86PatchGenerator* generator = + new CourgetteWin32X86PatchGenerator( + old_element, + new_element, + new CourgetteWin32X86Patcher(old_element->region())); + return generator; + } else { + LOG(WARNING) << "Unexpected Element::Kind " << old_element->kind(); + return NULL; + } +} + +// FindGenerators finds TransformationPatchGenerators for the elements of +// |new_ensemble|. For each element of |new_ensemble| we find the closest +// matching element from |old_ensemble| and use that as the basis for +// differential compression. The elements have to be the same kind so as to +// support transformation into the same kind of 'new representation'. +// +Status FindGenerators(Ensemble* old_ensemble, Ensemble* new_ensemble, + std::vector<TransformationPatchGenerator*>* generators) { + base::Time start_find_time = base::Time::Now(); + old_ensemble->FindEmbeddedElements(); + new_ensemble->FindEmbeddedElements(); + LOG(INFO) << "done FindEmbeddedElements " + << (base::Time::Now() - start_find_time).InSecondsF(); + + std::vector<Element*> old_elements(old_ensemble->elements()); + std::vector<Element*> new_elements(new_ensemble->elements()); + + LOG(INFO) << "old has " << old_elements.size() << " elements"; + LOG(INFO) << "new has " << new_elements.size() << " elements"; + + DifferenceEstimator difference_estimator; + std::vector<DifferenceEstimator::Base*> bases; + + base::Time start_bases_time = base::Time::Now(); + for (size_t i = 0; i < old_elements.size(); ++i) { + bases.push_back( + difference_estimator.MakeBase(old_elements[i]->region())); + } + LOG(INFO) << "done make bases " + << (base::Time::Now() - start_bases_time).InSecondsF() + << "s"; + + for (size_t new_index = 0; new_index < new_elements.size(); ++new_index) { + Element* new_element = new_elements[new_index]; + DifferenceEstimator::Subject* new_subject = + difference_estimator.MakeSubject(new_element->region()); + + // Search through old elements to find the best match. + // + // TODO(sra): This is O(N x M), i.e. O(N^2) since old_ensemble and + // new_ensemble probably have a very similar structure. We can make the + // search faster by making the comparison provided by DifferenceEstimator + // more nuanced, returning early if the measured difference is greater than + // the current best. This will be most effective if we can arrange that the + // first elements we try to match are likely the 'right' ones. We could + // prioritize elements that are of a similar size or similar position in the + // sequence of elements. + // + Element* best_old_element = NULL; + size_t best_difference = std::numeric_limits<size_t>::max(); + for (size_t old_index = 0; old_index < old_elements.size(); ++old_index) { + Element* old_element = old_elements[old_index]; + // Elements of different kinds are incompatible. + if (old_element->kind() != new_element->kind()) + continue; + + base::Time start_compare = base::Time::Now(); + DifferenceEstimator::Base* old_base = bases[old_index]; + size_t difference = difference_estimator.Measure(old_base, new_subject); + + LOG(INFO) << "Compare " << old_element->Name() + << " to " << new_element->Name() + << " --> " << difference + << " in " << (base::Time::Now() - start_compare).InSecondsF() + << "s"; + if (difference == 0) { + LOG(INFO) << "Skip " << new_element->Name() + << " - identical to " << old_element->Name(); + best_difference = 0; + best_old_element = NULL; + break; + } + if (difference < best_difference) { + best_difference = difference; + best_old_element = old_element; + } + } + + if (best_old_element) { + LOG(INFO) << "Matched " << best_old_element->Name() + << " to " << new_element->Name() + << " --> " << best_difference; + TransformationPatchGenerator* generator = + MakeGenerator(best_old_element, new_element); + if (generator) + generators->push_back(generator); + } + } + + LOG(INFO) << "done FindGenerators " + << "found " << generators->size() << " in " + << (base::Time::Now() - start_find_time).InSecondsF() << "s"; + + return C_OK; +} + +void FreeGenerators(std::vector<TransformationPatchGenerator*>* generators) { + for (size_t i = 0; i < generators->size(); ++i) { + delete (*generators)[i]; + } + generators->clear(); +} + +//////////////////////////////////////////////////////////////////////////////// + +Status GenerateEnsemblePatch(SourceStream* base, + SourceStream* update, + SinkStream* final_patch) { + Region old_region(base->Buffer(), base->Remaining()); + Region new_region(update->Buffer(), update->Remaining()); + Ensemble old_ensemble(old_region, "old"); + Ensemble new_ensemble(new_region, "new"); + std::vector<TransformationPatchGenerator*> generators; + Status generators_status = FindGenerators(&old_ensemble, &new_ensemble, + &generators); + if (generators_status != C_OK) + return generators_status; + + SinkStreamSet patch_streams; + + SinkStream* tranformation_descriptions = patch_streams.stream(0); + SinkStream* parameter_correction = patch_streams.stream(1); + SinkStream* transformed_elements_correction = patch_streams.stream(2); + SinkStream* ensemble_correction = patch_streams.stream(3); + + uint32 number_of_transformations = generators.size(); + tranformation_descriptions->WriteVarint32(number_of_transformations); + + for (size_t i = 0; i < number_of_transformations; ++i) { + CourgettePatchFile::TransformationMethodId kind = generators[i]->Kind(); + tranformation_descriptions->WriteVarint32(kind); + } + + for (size_t i = 0; i < number_of_transformations; ++i) { + Status status = + generators[i]->WriteInitialParameters(tranformation_descriptions); + if (status != C_OK) + return status; + } + + // + // Generate sub-patch for parameters. + // + SinkStreamSet predicted_parameters_sink; + SinkStreamSet corrected_parameters_sink; + + for (size_t i = 0; i < number_of_transformations; ++i) { + SinkStreamSet single_predicted_parameters; + Status status; + status = generators[i]->PredictTransformParameters( + &single_predicted_parameters); + if (status != C_OK) + return status; + if (!predicted_parameters_sink.WriteSet(&single_predicted_parameters)) + return C_STREAM_ERROR; + + SinkStreamSet single_corrected_parameters; + status = generators[i]->CorrectedTransformParameters( + &single_corrected_parameters); + if (status != C_OK) + return status; + if (!corrected_parameters_sink.WriteSet(&single_corrected_parameters)) + return C_STREAM_ERROR; + } + + SinkStream linearized_predicted_parameters; + SinkStream linearized_corrected_parameters; + + if (!predicted_parameters_sink.CopyTo(&linearized_predicted_parameters)) + return C_STREAM_ERROR; + if (!corrected_parameters_sink.CopyTo(&linearized_corrected_parameters)) + return C_STREAM_ERROR; + + SourceStream predicted_parameters_source; + SourceStream corrected_parameters_source; + predicted_parameters_source.Init(linearized_predicted_parameters); + corrected_parameters_source.Init(linearized_corrected_parameters); + + Status delta1_status = GenerateSimpleDelta(&predicted_parameters_source, + &corrected_parameters_source, + parameter_correction); + if (delta1_status != C_OK) + return delta1_status; + + // + // Generate sub-patch for elements. + // + corrected_parameters_source.Init(linearized_corrected_parameters); + SourceStreamSet corrected_parameters_source_set; + if (!corrected_parameters_source_set.Init(&corrected_parameters_source)) + return C_STREAM_ERROR; + + SinkStreamSet predicted_transformed_elements; + SinkStreamSet corrected_transformed_elements; + + for (size_t i = 0; i < number_of_transformations; ++i) { + SourceStreamSet single_parameters; + if (!corrected_parameters_source_set.ReadSet(&single_parameters)) + return C_STREAM_ERROR; + SinkStreamSet single_predicted_transformed_element; + SinkStreamSet single_corrected_transformed_element; + Status status = generators[i]->Transform( + &single_parameters, + &single_predicted_transformed_element, + &single_corrected_transformed_element); + if (status != C_OK) + return status; + if (!single_parameters.Empty()) + return C_STREAM_NOT_CONSUMED; + if (!predicted_transformed_elements.WriteSet( + &single_predicted_transformed_element)) + return C_STREAM_ERROR; + if (!corrected_transformed_elements.WriteSet( + &single_corrected_transformed_element)) + return C_STREAM_ERROR; + } + + if (!corrected_parameters_source_set.Empty()) + return C_STREAM_NOT_CONSUMED; + + SinkStream linearized_predicted_transformed_elements; + SinkStream linearized_corrected_transformed_elements; + + if (!predicted_transformed_elements.CopyTo( + &linearized_predicted_transformed_elements)) + return C_STREAM_ERROR; + if (!corrected_transformed_elements.CopyTo( + &linearized_corrected_transformed_elements)) + return C_STREAM_ERROR; + + SourceStream predicted_transformed_elements_source; + SourceStream corrected_transformed_elements_source; + predicted_transformed_elements_source + .Init(linearized_predicted_transformed_elements); + corrected_transformed_elements_source + .Init(linearized_corrected_transformed_elements); + + Status delta2_status = + GenerateSimpleDelta(&predicted_transformed_elements_source, + &corrected_transformed_elements_source, + transformed_elements_correction); + if (delta2_status != C_OK) + return delta2_status; + + // + // Generate sub-patch for whole enchilada. + // + SinkStream predicted_ensemble; + + predicted_ensemble.Write(base->Buffer(), base->Remaining()); + + SourceStreamSet corrected_transformed_elements_source_set; + corrected_transformed_elements_source + .Init(linearized_corrected_transformed_elements); + if (!corrected_transformed_elements_source_set + .Init(&corrected_transformed_elements_source)) + return C_STREAM_ERROR; + + for (size_t i = 0; i < number_of_transformations; ++i) { + SourceStreamSet single_corrected_transformed_element; + if (!corrected_transformed_elements_source_set.ReadSet( + &single_corrected_transformed_element)) + return C_STREAM_ERROR; + Status status = generators[i]->Reform(&single_corrected_transformed_element, + &predicted_ensemble); + if (status != C_OK) + return status; + if (!single_corrected_transformed_element.Empty()) + return C_STREAM_NOT_CONSUMED; + } + + if (!corrected_transformed_elements_source_set.Empty()) + return C_STREAM_NOT_CONSUMED; + + FreeGenerators(&generators); + + SourceStream predicted_ensemble_source; + predicted_ensemble_source.Init(predicted_ensemble); + Status delta3_status = GenerateSimpleDelta(&predicted_ensemble_source, + update, + ensemble_correction); + if (delta3_status != C_OK) + return delta3_status; + + // + // Final output stream has a header followed by a StreamSet. + // + final_patch->WriteVarint32(CourgettePatchFile::kMagic); + final_patch->WriteVarint32(CourgettePatchFile::kVersion); + + final_patch->WriteVarint32( + CalculateCrc(old_region.start(), old_region.length())); + final_patch->WriteVarint32( + CalculateCrc(new_region.start(), new_region.length())); + + if (!patch_streams.CopyTo(final_patch)) + return C_STREAM_ERROR; + + return C_OK; +} + +} // namespace diff --git a/courgette/image_info.cc b/courgette/image_info.cc new file mode 100644 index 0000000..5651f36 --- /dev/null +++ b/courgette/image_info.cc @@ -0,0 +1,391 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "courgette/image_info.h" + +#include <memory.h> +#include <algorithm> +#include <map> +#include <set> +#include <sstream> +#include <vector> + +#include "base/logging.h" + +namespace courgette { + +std::string SectionName(const Section* section) { + if (section == NULL) + return "<none>"; + char name[9]; + memcpy(name, section->name, 8); + name[8] = '\0'; // Ensure termination. + return name; +} + +PEInfo::PEInfo() + : failure_reason_("uninitialized"), + start_(0), end_(0), length_(0), + is_PE32_plus_(0), file_length_(0), has_text_section_(false) { +} + +void PEInfo::Init(const void* start, size_t length) { + start_ = reinterpret_cast<const uint8*>(start); + length_ = length; + end_ = start_ + length_; + failure_reason_ = "unparsed"; +} + +// DescribeRVA is for debugging only. I would put it under #ifdef DEBUG except +// that during development I'm finding I need to call it when compiled in +// Release mode. Hence: +// TODO(sra): make this compile only for debug mode. +std::string PEInfo::DescribeRVA(RVA rva) const { + const Section* section = RVAToSection(rva); + std::ostringstream s; + s << std::hex << rva; + if (section) { + s << " ("; + s << SectionName(section) << "+" + << std::hex << (rva - section->virtual_address) + << ")"; + } + return s.str(); +} + +const Section* PEInfo::FindNextSection(uint32 fileOffset) const { + const Section* best = 0; + for (int i = 0; i < number_of_sections_; i++) { + const Section* section = §ions_[i]; + if (fileOffset <= section->file_offset_of_raw_data) { + if (best == 0 || + section->file_offset_of_raw_data < best->file_offset_of_raw_data) { + best = section; + } + } + } + return best; +} + +const Section* PEInfo::RVAToSection(RVA rva) const { + for (int i = 0; i < number_of_sections_; i++) { + const Section* section = §ions_[i]; + uint32 offset = rva - section->virtual_address; + if (offset < section->virtual_size) { + return section; + } + } + return NULL; +} + +int PEInfo::RVAToFileOffset(RVA rva) const { + const Section* section = RVAToSection(rva); + if (section) { + uint32 offset = rva - section->virtual_address; + if (offset < section->size_of_raw_data) { + return section->file_offset_of_raw_data + offset; + } else { + return kNoOffset; // In section but not in file (e.g. uninit data). + } + } + + // Small RVA values point into the file header in the loaded image. + // RVA 0 is the module load address which Windows uses as the module handle. + // RVA 2 sometimes occurs, I'm not sure what it is, but it would map into the + // DOS header. + if (rva == 0 || rva == 2) + return rva; + + NOTREACHED(); + return kNoOffset; +} + +const uint8* PEInfo::RVAToPointer(RVA rva) const { + int file_offset = RVAToFileOffset(rva); + if (file_offset == kNoOffset) + return NULL; + else + return start_ + file_offset; +} + +RVA PEInfo::FileOffsetToRVA(uint32 file_offset) const { + for (int i = 0; i < number_of_sections_; i++) { + const Section* section = §ions_[i]; + uint32 offset = file_offset - section->file_offset_of_raw_data; + if (offset < section->size_of_raw_data) { + return section->virtual_address + offset; + } + } + return 0; +} + +//////////////////////////////////////////////////////////////////////////////// + +namespace { + +// Constants and offsets gleaned from WINNT.H and various articles on the +// format of Windows PE executables. + +// This is FIELD_OFFSET(IMAGE_DOS_HEADER, e_lfanew): +const size_t kOffsetOfFileAddressOfNewExeHeader = 0x3c; + +const uint16 kImageNtOptionalHdr32Magic = 0x10b; +const uint16 kImageNtOptionalHdr64Magic = 0x20b; + +const size_t kSizeOfCoffHeader = 20; +const size_t kOffsetOfDataDirectoryFromImageOptionalHeader32 = 96; +const size_t kOffsetOfDataDirectoryFromImageOptionalHeader64 = 112; + +// These helper functions avoid the need for casts in the main code. +inline uint16 ReadU16(const uint8* address, size_t offset) { + return *reinterpret_cast<const uint16*>(address + offset); +} + +inline uint32 ReadU32(const uint8* address, size_t offset) { + return *reinterpret_cast<const uint32*>(address + offset); +} + +inline uint64 ReadU64(const uint8* address, size_t offset) { + return *reinterpret_cast<const uint64*>(address + offset); +} + +} // namespace + +// ParseHeader attempts to match up the buffer with the Windows data +// structures that exist within a Windows 'Portable Executable' format file. +// Returns 'true' if the buffer matches, and 'false' if the data looks +// suspicious. Rather than try to 'map' the buffer to the numerous windows +// structures, we extract the information we need into the courgette::PEInfo +// structure. +// +bool PEInfo::ParseHeader() { + if (length_ < kOffsetOfFileAddressOfNewExeHeader + 4 /*size*/) + return Bad("Too small"); + + // Have 'MZ' magic for a DOS header? + if (start_[0] != 'M' || start_[1] != 'Z') + return Bad("Not MZ"); + + // offset from DOS header to PE header is stored in DOS header. + uint32 offset = ReadU32(start_, kOffsetOfFileAddressOfNewExeHeader); + + const uint8* const pe_header = start_ + offset; + const size_t kMinPEHeaderSize = 4 /*signature*/ + kSizeOfCoffHeader; + if (pe_header <= start_ || pe_header >= end_ - kMinPEHeaderSize) + return Bad("Bad offset to PE header"); + + if (offset % 8 != 0) + return Bad("Misaligned PE header"); + + // The 'PE' header is an IMAGE_NT_HEADERS structure as defined in WINNT.H. + // See http://msdn.microsoft.com/en-us/library/ms680336(VS.85).aspx + // + // The first field of the IMAGE_NT_HEADERS is the signature. + if (!(pe_header[0] == 'P' && + pe_header[1] == 'E' && + pe_header[2] == 0 && + pe_header[3] == 0)) + return Bad("no PE signature"); + + // The second field of the IMAGE_NT_HEADERS is the COFF header. + // The COFF header is also called an IMAGE_FILE_HEADER + // http://msdn.microsoft.com/en-us/library/ms680313(VS.85).aspx + const uint8* const coff_header = pe_header + 4; + machine_type_ = ReadU16(coff_header, 0); + number_of_sections_ = ReadU16(coff_header, 2); + size_of_optional_header_ = ReadU16(coff_header, 16); + + // The rest of the IMAGE_NT_HEADERS is the IMAGE_OPTIONAL_HEADER(32|64) + const uint8* const optional_header = coff_header + kSizeOfCoffHeader; + optional_header_ = optional_header; + + if (optional_header + size_of_optional_header_ >= end_) + return Bad("optional header past end of file"); + + // Check we can read the magic. + if (size_of_optional_header_ < 2) + return Bad("optional header no magic"); + + uint16 magic = ReadU16(optional_header, 0); + + if (magic == kImageNtOptionalHdr32Magic) { + is_PE32_plus_ = false; + offset_of_data_directories_ = + kOffsetOfDataDirectoryFromImageOptionalHeader32; + } else if (magic == kImageNtOptionalHdr64Magic) { + is_PE32_plus_ = true; + offset_of_data_directories_ = + kOffsetOfDataDirectoryFromImageOptionalHeader64; + } else { + return Bad("unrecognized magic"); + } + + // Check that we can read the rest of the the fixed fields. Data directories + // directly follow the fixed fields of the IMAGE_OPTIONAL_HEADER. + if (size_of_optional_header_ < offset_of_data_directories_) + return Bad("optional header too short"); + + // The optional header is either an IMAGE_OPTIONAL_HEADER32 or + // IMAGE_OPTIONAL_HEADER64 + // http://msdn.microsoft.com/en-us/library/ms680339(VS.85).aspx + // + // Copy the fields we care about. + size_of_code_ = ReadU32(optional_header, 4); + size_of_initialized_data_ = ReadU32(optional_header, 8); + size_of_uninitialized_data_ = ReadU32(optional_header, 12); + base_of_code_ = ReadU32(optional_header, 20); + if (is_PE32_plus_) { + base_of_data_ = 0; + image_base_ = ReadU64(optional_header, 24); + } else { + base_of_data_ = ReadU32(optional_header, 24); + image_base_ = ReadU32(optional_header, 28); + } + size_of_image_ = ReadU32(optional_header, 56); + number_of_data_directories_ = + ReadU32(optional_header, (is_PE32_plus_ ? 108 : 92)); + + if (size_of_code_ >= length_ || + size_of_initialized_data_ >= length_ || + size_of_code_ + size_of_initialized_data_ >= length_) { + // This validation fires on some perfectly fine executables. + // return Bad("code or initialized data too big"); + } + + // TODO(sra): we can probably get rid of most of the data directories. + bool b = true; + // 'b &= ...' could be short circuit 'b = b && ...' but it is not necessary + // for correctness and it compiles smaller this way. + b &= ReadDataDirectory(0, &export_table_); + b &= ReadDataDirectory(1, &import_table_); + b &= ReadDataDirectory(2, &resource_table_); + b &= ReadDataDirectory(3, &exception_table_); + b &= ReadDataDirectory(5, &base_relocation_table_); + b &= ReadDataDirectory(11, &bound_import_table_); + b &= ReadDataDirectory(12, &import_address_table_); + b &= ReadDataDirectory(13, &delay_import_descriptor_); + b &= ReadDataDirectory(14, &clr_runtime_header_); + if (!b) { + return Bad("malformed data directory"); + } + + // Sections follow the optional header. + sections_ = + reinterpret_cast<const Section*>(optional_header + + size_of_optional_header_); + file_length_ = 0; + + for (int i = 0; i < number_of_sections_; ++i) { + const Section* section = §ions_[i]; + + // TODO(sra): consider using the 'characteristics' field of the section + // header to see if the section contains instructions. + if (memcmp(section->name, ".text", 6) == 0) + has_text_section_ = true; + + uint32 section_end = + section->file_offset_of_raw_data + section->size_of_raw_data; + if (section_end > file_length_) + file_length_ = section_end; + } + + failure_reason_ = NULL; + return true; +} + +bool PEInfo::ReadDataDirectory(int index, ImageDataDirectory* directory) { + if (index < number_of_data_directories_) { + size_t offset = index * 8 + offset_of_data_directories_; + if (offset >= size_of_optional_header_) + return Bad("number of data directories inconsistent"); + const uint8* data_directory = optional_header_ + offset; + if (data_directory < start_ || data_directory + 8 >= end_) + return Bad("data directory outside image"); + RVA rva = ReadU32(data_directory, 0); + size_t size = ReadU32(data_directory, 4); + if (size > size_of_image_) + return Bad("data directory size too big"); + + // TODO(sra): validate RVA. + directory->address_ = rva; + directory->size_ = size; + return true; + } else { + directory->address_ = 0; + directory->size_ = 0; + return true; + } +} + +bool PEInfo::Bad(const char* reason) { + failure_reason_ = reason; + return false; +} + +//////////////////////////////////////////////////////////////////////////////// + +bool PEInfo::ParseRelocs(std::vector<RVA> *relocs) { + relocs->clear(); + + size_t relocs_size = base_relocation_table_.size_; + if (relocs_size == 0) + return true; + + // The format of the base relocation table is a sequence of variable sized + // IMAGE_BASE_RELOCATION blocks. Search for + // "The format of the base relocation data is somewhat quirky" + // at http://msdn.microsoft.com/en-us/library/ms809762.aspx + + const uint8* start = RVAToPointer(base_relocation_table_.address_); + const uint8* end = start + relocs_size; + + // Make sure entire base relocation table is within the buffer. + if (start < start_ || + start >= end_ || + end <= start_ || + end > end_) { + return Bad(".relocs outside image"); + } + + const uint8* block = start; + + // Walk the variable sized blocks. + while (block + 8 < end) { + RVA page_rva = ReadU32(block, 0); + uint32 size = ReadU32(block, 4); + if (size < 8 || // Size includes header ... + size % 4 != 0) // ... and is word aligned. + return Bad("unreasonable relocs block"); + + const uint8* end_entries = block + size; + + if (end_entries <= block || end_entries <= start_ || end_entries > end_) + return Bad(".relocs block outside image"); + + // Walk through the two-byte entries. + for (const uint8* p = block + 8; p < end_entries; p += 2) { + uint16 entry = ReadU16(p, 0); + int type = entry >> 12; + int offset = entry & 0xFFF; + + RVA rva = page_rva + offset; + if (type == 3) { // IMAGE_REL_BASED_HIGHLOW + relocs->push_back(rva); + } else if (type == 0) { // IMAGE_REL_BASED_ABSOLUTE + // Ignore, used as padding. + } else { + // Does not occur in Windows x86 executables. + return Bad("unknown type of reloc"); + } + } + + block += size; + } + + std::sort(relocs->begin(), relocs->end()); + + return true; +} + +} // namespace courgette diff --git a/courgette/image_info.h b/courgette/image_info.h new file mode 100644 index 0000000..53a0be7 --- /dev/null +++ b/courgette/image_info.h @@ -0,0 +1,199 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COURGETTE_IMAGE_INFO_H_ +#define COURGETTE_IMAGE_INFO_H_ + +#include <string> +#include <vector> + +#include "base/basictypes.h" + +namespace courgette { + +// A Relative Virtual Address is the address in the image file after it is +// loaded into memory relative to the image load address. +typedef uint32 RVA; + +// PE file section header. This struct has the same layout as the +// IMAGE_SECTION_HEADER structure from WINNT.H +// http://msdn.microsoft.com/en-us/library/ms680341(VS.85).aspx +// +#pragma pack(push, 1) // Supported by MSVC and GCC. Ensures no gaps in packing. +struct Section { + char name[8]; + uint32 virtual_size; + uint32 virtual_address; + uint32 size_of_raw_data; + uint32 file_offset_of_raw_data; + uint32 pointer_to_relocations; // Always zero in an image. + uint32 pointer_to_line_numbers; // Always zero in an image. + uint16 number_of_relocations; // Always zero in an image. + uint16 number_of_line_numbers; // Always zero in an image. + uint32 characteristics; +}; +#pragma pack(pop) + +COMPILE_ASSERT(sizeof(Section) == 40, section_is_40_bytes); + +// Returns the name of a section, solving the problem that the name is not +// always properly NUL-terminated. Used only for debugging. +std::string SectionName(const Section* section); + +// ImageDataDirectory has same layout as IMAGE_DATA_DIRECTORY structure from +// WINNT.H +// http://msdn.microsoft.com/en-us/library/ms680305(VS.85).aspx +// +class ImageDataDirectory { + public: + ImageDataDirectory() : address_(0), size_(0) {} + RVA address_; + uint32 size_; +}; + +COMPILE_ASSERT(sizeof(ImageDataDirectory) == 8, + image_data_directory_is_8_bytes); + +// +// PEInfo holds information about a single Windows 'Portable Executable' format +// file in the on-disk format. +// +// Imagine you had concatenated a bunch of 'original' files into one 'big' +// file and read the big file into memory. You could find the executables +// from the original files by calling PEInfo::Init with different addresses. +// If PEInfo::TryParseHeader returns true, then Init was passed the address +// of the first byte of one of the original executables, and PEIinfo::length +// will tell how long the file was. +// +class PEInfo { + public: + PEInfo(); + + // ok() may always be called but returns 'true' only after ParseHeader + // succeeds. + bool ok() const { return failure_reason_ == NULL; } + + // Initialize with buffer. This just sets up the region of memory that + // potentially contains the bytes from an executable file. The caller + // continues to own 'start'. + void Init(const void* start, size_t length); + + // Returns 'true' if the buffer appears to point to a Windows 32 bit + // executable, 'false' otherwise. If ParseHeader() succeeds, other member + // functions may be called. + bool ParseHeader(); + + // Returns 'true' if the base relocation table can be parsed. + // Output is a vector of the RVAs corresponding to locations within executable + // that are listed in the base relocation table. + bool ParseRelocs(std::vector<RVA> *addresses); + + // Returns the length of the image. Valid only if ParseHeader succeeded. + uint32 length() const { return file_length_; } + + bool has_text_section() const { return has_text_section_; } + + uint32 size_of_code() const { return size_of_code_; } + + bool is_32bit() const { return !is_PE32_plus_; } + + // Most addresses are represented as 32-bit RVAs. The one address we can't + // do this with is the image base address. 'image_base' is valid only for + // 32-bit executables. 'image_base_64' is valid for 32- and 64-bit executable. + uint32 image_base() const { return static_cast<uint32>(image_base_); } + uint64 image_base_64() const { return image_base_; } + + const ImageDataDirectory& base_relocation_table() const { + return base_relocation_table_; + } + + bool IsValidRVA(RVA rva) const { return rva < size_of_image_; } + + // Returns description of the RVA, e.g. ".text+0x1243". For debugging only. + std::string DescribeRVA(RVA rva) const; + + // Returns a pointer into the memory copy of the file format. + // FileOffsetToPointer(0) returns a pointer to the start of the file format. + const uint8* FileOffsetToPointer(uint32 offset) const { + return start_ + offset; + } + + // Finds the first section at file_offset or above. + const Section* FindNextSection(uint32 file_offset) const; + // Returns Section containing the relative virtual address, or NULL if none. + const Section* RVAToSection(RVA rva) const; + + // There are 2 'coordinate systems' for reasoning about executables. + // FileOffset - the the offset within a single .EXE or .DLL *file*. + // RVA - relative virtual address (offset within *loaded image*) + // FileOffsetToRVA and RVAToFileOffset convert between these representations. + + RVA FileOffsetToRVA(uint32 offset) const; + + static const int kNoOffset = -1; + // Returns kNoOffset if there is no file offset corresponding to 'rva'. + int RVAToFileOffset(RVA rva) const; + + // Returns same as FileOffsetToPointer(RVAToFileOffset(rva)) except that NULL + // is returned if there is no file offset corresponding to 'rva'. + const uint8* RVAToPointer(RVA rva) const; + + protected: + // + // Fields that are always valid. + // + const char* failure_reason_; + + // + // Basic information that is always valid after Init. + // + const uint8* start_; // In current memory, base for 'file offsets'. + const uint8* end_; // In current memory. + unsigned int length_; // In current memory. + + // + // Information that is valid after successful ParseHeader. + // + bool is_PE32_plus_; // PE32_plus is for 64 bit executables. + uint32 file_length_; + + // Location and size of IMAGE_OPTIONAL_HEADER in the buffer. + const uint8 *optional_header_; + uint16 size_of_optional_header_; + uint16 offset_of_data_directories_; + + uint16 machine_type_; + uint16 number_of_sections_; + const Section *sections_; + bool has_text_section_; + + uint32 size_of_code_; + uint32 size_of_initialized_data_; + uint32 size_of_uninitialized_data_; + RVA base_of_code_; + RVA base_of_data_; + + uint64 image_base_; // range limited to 32 bits for 32 bit executable + uint32 size_of_image_; + int number_of_data_directories_; + + ImageDataDirectory export_table_; + ImageDataDirectory import_table_; + ImageDataDirectory resource_table_; + ImageDataDirectory exception_table_; + ImageDataDirectory base_relocation_table_; + ImageDataDirectory bound_import_table_; + ImageDataDirectory import_address_table_; + ImageDataDirectory delay_import_descriptor_; + ImageDataDirectory clr_runtime_header_; + + private: + bool ReadDataDirectory(int index, ImageDataDirectory* dir); + bool Bad(const char *reason); + + DISALLOW_COPY_AND_ASSIGN(PEInfo); +}; + +} // namespace +#endif // COURGETTE_IMAGE_INFO_H_ diff --git a/courgette/image_info_unittest.cc b/courgette/image_info_unittest.cc new file mode 100644 index 0000000..6d6deb5 --- /dev/null +++ b/courgette/image_info_unittest.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "courgette/image_info.h" + +#include <string> + +#include "base/path_service.h" +#include "base/file_util.h" +#include "base/string_util.h" +#include "testing/gtest/include/gtest/gtest.h" + +class ImageInfoTest : public testing::Test { + public: + + void TestExe() const; + void TestResourceDll() const; + + private: + void SetUp() { + PathService::Get(base::DIR_SOURCE_ROOT, &test_dir_); + file_util::AppendToPath(&test_dir_, L"courgette"); + file_util::AppendToPath(&test_dir_, L"testdata"); + } + + void TearDown() { + } + + void ExpectExecutable(courgette::PEInfo *info) const; + + std::string FileContents(const char *file_name) const; + + std::wstring test_dir_; +}; + +// Reads a test file into a string. +std::string ImageInfoTest::FileContents(const char *file_name) const { + std::wstring file_path = test_dir_; + file_util::AppendToPath(&file_path, UTF8ToWide(file_name)); + std::string file_bytes; + if (!file_util::ReadFileToString(file_path, &file_bytes)) { + EXPECT_TRUE(!"Could not read test data"); + } + return file_bytes; +} + +void ImageInfoTest::ExpectExecutable(courgette::PEInfo *info) const { + EXPECT_TRUE(info->ok()); + EXPECT_TRUE(info->has_text_section()); +} + +void ImageInfoTest::TestExe() const { + std::string file1 = FileContents("setup1.exe"); + + courgette::PEInfo *info = new courgette::PEInfo(); + info->Init(reinterpret_cast<const uint8*>(file1.c_str()), file1.length()); + + bool can_parse_header = info->ParseHeader(); + EXPECT_TRUE(can_parse_header); + + // The executable is the whole file, not 'embedded' with the file + EXPECT_EQ(file1.length(), info->length()); + + ExpectExecutable(info); + EXPECT_EQ(449536, info->size_of_code()); + EXPECT_EQ(SectionName(info->RVAToSection(0x00401234 - 0x00400000)), + std::string(".text")); + + EXPECT_EQ(0, info->RVAToFileOffset(0)); + EXPECT_EQ(1024, info->RVAToFileOffset(4096)); + EXPECT_EQ(46928, info->RVAToFileOffset(50000)); + + std::vector<courgette::RVA> relocs; + bool can_parse_relocs = info->ParseRelocs(&relocs); + EXPECT_TRUE(can_parse_relocs); + + const uint8* p = info->RVAToPointer(0); + EXPECT_EQ(reinterpret_cast<const void*>(file1.c_str()), + reinterpret_cast<const void*>(p)); + EXPECT_EQ('M', p[0]); + EXPECT_EQ('Z', p[1]); +} + +void ImageInfoTest::TestResourceDll() const { + std::string file1 = FileContents("en-US.dll"); + + courgette::PEInfo *info = new courgette::PEInfo(); + info->Init(reinterpret_cast<const uint8*>(file1.c_str()), file1.length()); + + bool can_parse_header = info->ParseHeader(); + EXPECT_TRUE(can_parse_header); + + // The executable is the whole file, not 'embedded' with the file + EXPECT_EQ(file1.length(), info->length()); + + EXPECT_TRUE(info->ok()); + EXPECT_FALSE(info->has_text_section()); + EXPECT_EQ(0u, info->size_of_code()); +} + +TEST_F(ImageInfoTest, All) { + TestExe(); + TestResourceDll(); +} diff --git a/courgette/memory_monitor.cc b/courgette/memory_monitor.cc new file mode 100644 index 0000000..51cb0cc --- /dev/null +++ b/courgette/memory_monitor.cc @@ -0,0 +1,122 @@ +#include <stdio.h> +#include <map> + +#include "base/logging.h" +#include "base/string_util.h" + +bool inH = true; +struct H { + H() { inH = false; tick_ = 0; bw_ = 0; d_bw_ = d_tick_ = 0; m_bw_ = 0; mem_ = high_ = 0;} + ~H() { + inH = true; + int i = 0; + for (M::iterator p = m_.begin(); p != m_.end(); ++p, ++i) { + size_t s = p->first; + LOG(INFO) << StringPrintf("%3d %8u: %8u %8u %8u %8u", i, s, + m_[s], c_[s], h_[s], h_[s] * s); + } + LOG(INFO) << "Peak " << fmt(high_); + } + + std::string fmt(size_t s) { + if (s > 1000000000) return StringPrintf("%.3gG", s/(1000000000.0)); + if (s > 1000000) return StringPrintf("%.3gM", s/(1000000.)); + if (s > 9999) return StringPrintf("%.3gk", s/(1000.)); + return StringPrintf("%d", (int)s); + } + + void tick(size_t w, char sign) { + d_tick_ += 1; + d_bw_ += w; + const size_t T = 4*4*1024; + const size_t M = 4*1024*1024; + bool print = false; + if (d_tick_ >= T) { + tick_ += (d_tick_/T)*T; + d_tick_ %= T; + print = true; + } + if (d_bw_ >= M) { + bw_ += (d_bw_/M) * M; + d_bw_ %= M; + print = true; + } + if (!print) return; + std::string o; + StringAppendF(&o, "%u:", tick_ + d_tick_); + StringAppendF(&o, " (%c%s)", sign, fmt(w).c_str()); + size_t sum = 0; + for (M::iterator p = c_.begin(); p != c_.end(); ++p) { + size_t s = p->first; + size_t n = p->second; + if (n) { + if (s*n >= 64*1024) + if (n == 1) + StringAppendF(&o, " %s", fmt(s).c_str()); + else + StringAppendF(&o, " %s*%u", fmt(s).c_str(), n); + sum += s*n; + } + } + StringAppendF(&o, " = %s", fmt(sum).c_str()); + LOG(INFO) << o; + //printf("%s\n", o.c_str()); + if (sum > 200*1024*1024) { + // __asm int 3; + m_bw_ = sum; + } + } + void add(size_t s, void *p) { + if (!inH) { + inH = true; + mem_ += s; if (mem_ > high_) high_ = mem_; + c_[s] += 1; + m_[s] += 1; + if (c_[s] > h_[s]) h_[s] = c_[s]; + allocs_[p] = s; + inH = false; + tick(s, '+'); + } + } + + void sub(void *p) { + if (!inH) { + inH = true; + size_t s = allocs_[p]; + if (s) { + mem_ -= s; + c_[s] -= 1; + allocs_[p] = 0; + tick(s, '-'); + } + inH = false; + } + } + + typedef std::map<size_t, size_t> M; + M m_; + M c_; + M h_; + + size_t bw_; + size_t d_bw_; + size_t tick_; + size_t d_tick_; + size_t m_bw_; + size_t mem_; + size_t high_; + + std::map<void*, size_t> allocs_; +} _H; + +void* operator new(size_t s) { + //printf("%u\n", s); + void *p = malloc(s); + _H.add(s, p); + return p; +} + +void operator delete(void *p) { + _H.sub(p); + free(p); +} diff --git a/courgette/region.h b/courgette/region.h new file mode 100644 index 0000000..4aafbca --- /dev/null +++ b/courgette/region.h @@ -0,0 +1,62 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#ifndef COURGETTE_REGION_H_ +#define COURGETTE_REGION_H_ + +#include <string> + +#include "base/basictypes.h" + +namespace courgette { + +// A Region is a descriptor for a region of memory. It has a start address and +// a length measured in bytes. The Region object does not own the memory. +// +class Region { + public: + // Default constructor: and empty region. + Region() : start_(NULL), length_(0) {} + + // Usual constructor for regions given a |start| address and |length|. + Region(const void* start, size_t length) + : start_(static_cast<const uint8*>(start)), + length_(length) { + } + + // String constructor. Region is owned by the string, so the string should + // have a lifetime greater than the region. + explicit Region(const std::string& string) + : start_(reinterpret_cast<const uint8*>(string.c_str())), + length_(string.length()) { + } + + // Copy constructor. + Region(const Region& other) : start_(other.start_), length_(other.length_) {} + + // Assignment 'operator' makes |this| region the same as |other|. + Region& assign(const Region& other) { + this->start_ = other.start_; + this->length_ = other.length_; + return *this; + } + + // Returns the starting address of the region. + const uint8* start() const { return start_; } + + // Returns the length of the region. + size_t length() const { return length_; } + + // Returns the address after the last byte of the region. + const uint8* end() const { return start_ + length_; } + + private: + const uint8* start_; + size_t length_; + + void operator=(const Region&); // Disallow assignment operator. +}; + +} // namespace +#endif // COURGETTE_REGION_H_ diff --git a/courgette/run_all_unittests.cc b/courgette/run_all_unittests.cc new file mode 100644 index 0000000..b5fc4a6 --- /dev/null +++ b/courgette/run_all_unittests.cc @@ -0,0 +1,9 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "base/test_suite.h" + +int main(int argc, char** argv) { + return TestSuite(argc, argv).Run(); +} diff --git a/courgette/simple_delta.cc b/courgette/simple_delta.cc new file mode 100644 index 0000000..71038fd --- /dev/null +++ b/courgette/simple_delta.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Implementation of the byte-level differential compression used internally by +// Courgette. + +#include "courgette/simple_delta.h" + +#include "base/basictypes.h" +#include "base/logging.h" + +#include "courgette/third_party/bsdiff.h" + +namespace courgette { + +namespace { + +Status BSDiffStatusToStatus(BSDiffStatus status) { + switch (status) { + case OK: return C_OK; + case CRC_ERROR: return C_BINARY_DIFF_CRC_ERROR; + default: return C_GENERAL_ERROR; + } +} + +} + +Status ApplySimpleDelta(SourceStream* old, SourceStream* delta, + SinkStream* target) { + return BSDiffStatusToStatus(ApplyBinaryPatch(old, delta, target)); +} + +Status GenerateSimpleDelta(SourceStream* old, SourceStream* target, + SinkStream* delta) { + LOG(INFO) << "GenerateSimpleDelta " + << old->Remaining() << " " << target->Remaining(); + return BSDiffStatusToStatus(CreateBinaryPatch(old, target, delta)); +} + +} // namespace courgette diff --git a/courgette/simple_delta.h b/courgette/simple_delta.h new file mode 100644 index 0000000..d35333a --- /dev/null +++ b/courgette/simple_delta.h @@ -0,0 +1,23 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Byte level differential compression algorithm used by Courgette. + +#ifndef COURGETTE_SIMPLE_DELTA_H_ +#define COURGETTE_SIMPLE_DELTA_H_ + +#include "courgette/courgette.h" +#include "courgette/streams.h" + +namespace courgette { + +Status ApplySimpleDelta(SourceStream* old, SourceStream* delta, + SinkStream* target); + +Status GenerateSimpleDelta(SourceStream* old, SourceStream* target, + SinkStream* delta); + +} // namespace courgette + +#endif // COURGETTE_SIMPLE_DELTA_H_ diff --git a/courgette/streams.cc b/courgette/streams.cc new file mode 100644 index 0000000..365a416 --- /dev/null +++ b/courgette/streams.cc @@ -0,0 +1,383 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Streams classes. +// +// These memory-resident streams are used for serializing data into a sequential +// region of memory. +// +// Streams are divided into SourceStreams for reading and SinkStreams for +// writing. Streams are aggregated into Sets which allows several streams to be +// used at once. Example: we can write A1, B1, A2, B2 but achieve the memory +// layout A1 A2 B1 B2 by writing 'A's to one stream and 'B's to another. +// +// The aggregated streams are important to Courgette's compression efficiency, +// we use it to cluster similar kinds of data which helps to generate longer +// common subsequences and repeated sequences. + +#include "courgette/streams.h" + +#include <io.h> +#include <memory.h> + +#include "base/basictypes.h" + +namespace courgette { + +// Update this version number if the serialization format of a StreamSet +// changes. +static const unsigned int kStreamsSerializationFormatVersion = 20090218; + +// +// This is a cut down Varint implementation, implementing only what we use for +// streams. +// +class Varint { + public: + // Maximum lengths of varint encoding of uint32 + static const int kMax32 = 5; + + // Parses a Varint32 encoded value from |source| and stores it in |output|, + // and returns a pointer to the following byte. Returns NULL if a valid + // varint value was not found before |limit|. + static const uint8* Parse32WithLimit(const uint8* source, const uint8* limit, + uint32* output); + + // Writes the Varint32 encoded representation of |value| to buffer + // |destination|. |destination| must have sufficient length to hold kMax32 + // bytes. Returns a pointer to the byte just past the last encoded byte. + static uint8* Encode32(uint8* destination, uint32 value); +}; + +// Parses a Varint32 encoded unsigned number from |source|. The Varint32 +// encoding is a little-endian sequence of bytes containing base-128 digits, +// with the high order bit set to indicate if there are more digits. +// +// For each byte, we mask out the digit and 'or' it into the right place in the +// result. +// +// The digit loop is unrolled for performance. It usually exits after the first +// one or two digits. +const uint8* Varint::Parse32WithLimit(const uint8* source, + const uint8* limit, + uint32* output) { + uint32 digit, result; + if (source >= limit) + return NULL; + digit = *(source++); + result = digit & 127; + if (digit < 128) { + *output = result; + return source; + } + + if (source >= limit) + return NULL; + digit = *(source++); + result |= (digit & 127) << 7; + if (digit < 128) { + *output = result; + return source; + } + + if (source >= limit) + return NULL; + digit = *(source++); + result |= (digit & 127) << 14; + if (digit < 128) { + *output = result; + return source; + } + + if (source >= limit) + return NULL; + digit = *(source++); + result |= (digit & 127) << 21; + if (digit < 128) { + *output = result; + return source; + } + + if (source >= limit) + return NULL; + digit = *(source++); + result |= (digit & 127) << 28; + if (digit < 128) { + *output = result; + return source; + } + + return NULL; // Value is too long to be a Varint32. +} + +// Write the base-128 digits in little-endian order. All except the last digit +// have the high bit set to indicate more digits. +inline uint8* Varint::Encode32(uint8* destination, uint32 value) { + while (value >= 128) { + *(destination++) = value | 128; + value = value >> 7; + } + *(destination++) = value; + return destination; +} + +void SourceStream::Init(const SinkStream& sink) { + Init(sink.Buffer(), sink.Length()); +} + +bool SourceStream::Read(void* destination, size_t count) { + if (current_ + count > end_) + return false; + memcpy(destination, current_, count); + current_ += count; + return true; +} + +bool SourceStream::ReadVarint32(uint32* output_value) { + const uint8* after = Varint::Parse32WithLimit(current_, end_, output_value); + if (!after) + return false; + current_ = after; + return true; +} + +bool SourceStream::ReadVarint32Signed(int32* output_value) { + // Signed numbers are encoded as unsigned numbers so that numbers nearer zero + // have shorter varint encoding. + // 0000xxxx encoded as 000xxxx0. + // 1111xxxx encoded as 000yyyy1 where yyyy is complement of xxxx. + uint32 unsigned_value; + if (!ReadVarint32(&unsigned_value)) + return false; + if (unsigned_value & 1) + *output_value = ~static_cast<int32>(unsigned_value >> 1); + else + *output_value = (unsigned_value >> 1); + return true; +} + +bool SourceStream::ShareSubstream(size_t offset, size_t length, + SourceStream* substream) { + if (offset > Remaining()) + return false; + if (length > Remaining() - offset) + return false; + substream->Init(current_ + offset, length); + return true; +} + +bool SourceStream::ReadSubstream(size_t length, SourceStream* substream) { + if (!ShareSubstream(0, length, substream)) + return false; + current_ += length; + return true; +} + +bool SourceStream::Skip(size_t byte_count) { + if (current_ + byte_count > end_) + return false; + current_ += byte_count; + return true; +} + +void SinkStream::Write(const void* data, size_t byte_count) { + buffer_.append(static_cast<const char*>(data), byte_count); +} + +void SinkStream::WriteVarint32(uint32 value) { + uint8 buffer[Varint::kMax32]; + uint8* end = Varint::Encode32(buffer, value); + Write(buffer, end - buffer); +} + +void SinkStream::WriteVarint32Signed(int32 value) { + // Encode signed numbers so that numbers nearer zero have shorter + // varint encoding. + // 0000xxxx encoded as 000xxxx0. + // 1111xxxx encoded as 000yyyy1 where yyyy is complement of xxxx. + if (value < 0) + WriteVarint32(~value * 2 + 1); + else + WriteVarint32(value * 2); +} + +void SinkStream::Append(SinkStream* other) { + Write(other->buffer_.c_str(), other->buffer_.size()); + other->buffer_.clear(); + other->buffer_.reserve(0); // Non-binding request to reduce storage. +} + +//////////////////////////////////////////////////////////////////////////////// + +SourceStreamSet::SourceStreamSet() + : count_(kMaxStreams) { +} + +SourceStreamSet::~SourceStreamSet() { +} + + +// Initializes from |source|. +// The stream set for N streams is serialized as a header +// <version><N><length1><length2>...<lengthN> +// followed by the stream contents +// <bytes1><bytes2>...<bytesN> +// +bool SourceStreamSet::Init(const void* source, size_t byte_count) { + const uint8* start = static_cast<const uint8*>(source); + const uint8* end = start + byte_count; + + unsigned int version; + const uint8* finger = Varint::Parse32WithLimit(start, end, &version); + if (finger == NULL) + return false; + if (version != kStreamsSerializationFormatVersion) + return false; + + unsigned int count; + finger = Varint::Parse32WithLimit(finger, end, &count); + if (finger == NULL) + return false; + if (count > kMaxStreams) + return false; + + count_ = count; + + unsigned int lengths[kMaxStreams]; + size_t accumulated_length = 0; + + for (size_t i = 0; i < count_; ++i) { + finger = Varint::Parse32WithLimit(finger, end, &lengths[i]); + if (finger == NULL) + return false; + accumulated_length += lengths[i]; + } + + // Remaining bytes should add up to sum of lengths. + if (end - finger != accumulated_length) + return false; + + accumulated_length = finger - start; + for (size_t i = 0; i < count_; ++i) { + stream(i)->Init(start + accumulated_length, lengths[i]); + accumulated_length += lengths[i]; + } + + return true; +} + +bool SourceStreamSet::Init(SourceStream* source) { + // TODO(sra): consume the rest of |source|. + return Init(source->Buffer(), source->Remaining()); +} + +bool SourceStreamSet::ReadSet(SourceStreamSet* set) { + uint32 stream_count = 0; + SourceStream* control_stream = this->stream(0); + if (!control_stream->ReadVarint32(&stream_count)) + return false; + + uint32 lengths[kMaxStreams] = {}; // i.e. all zero. + + for (size_t i = 0; i < stream_count; ++i) { + if (!control_stream->ReadVarint32(&lengths[i])) + return false; + } + + for (size_t i = 0; i < stream_count; ++i) { + if (!this->stream(i)->ReadSubstream(lengths[i], set->stream(i))) + return false; + } + return true; +} + +bool SourceStreamSet::Empty() const { + for (size_t i = 0; i < count_; ++i) { + if (streams_[i].Remaining() != 0) + return false; + } + return true; +} + +//////////////////////////////////////////////////////////////////////////////// + +SinkStreamSet::SinkStreamSet() + : count_(kMaxStreams) { +} + +SinkStreamSet::~SinkStreamSet() { +} + +void SinkStreamSet::Init(size_t stream_index_limit) { + count_ = stream_index_limit; +} + +// The header for a stream set for N streams is serialized as +// <version><N><length1><length2>...<lengthN> +void SinkStreamSet::CopyHeaderTo(SinkStream* header) { + header->WriteVarint32(kStreamsSerializationFormatVersion); + header->WriteVarint32(count_); + for (size_t i = 0; i < count_; ++i) { + header->WriteVarint32(stream(i)->Length()); + } +} + +// Writes |this| to |combined_stream|. See SourceStreamSet::Init for the layout +// of the stream metadata and contents. +bool SinkStreamSet::CopyTo(SinkStream *combined_stream) { + SinkStream header; + CopyHeaderTo(&header); + combined_stream->Append(&header); + for (size_t i = 0; i < count_; ++i) { + combined_stream->Append(stream(i)); + } + return true; +} + +namespace { +bool Write(int file_descriptor, SinkStream* sink) { + size_t length = sink->Length(); + const void *buffer = sink->Buffer(); + int bytes_written = _write(file_descriptor, buffer, length); + return bytes_written == length; +} +} + +bool SinkStreamSet::CopyToFileDescriptor(int file_descriptor) { + SinkStream header; + CopyHeaderTo(&header); + if (!Write(file_descriptor, &header)) + return false; + for (size_t i = 0; i < count_; ++i) { + if (!Write(file_descriptor, stream(i))) + return false; + } + return true; +} + +bool SinkStreamSet::WriteSet(SinkStreamSet* set) { + uint32 lengths[kMaxStreams]; + // 'stream_count' includes all non-empty streams and all empty stream numbered + // lower than a non-empty stream. + size_t stream_count = 0; + for (size_t i = 0; i < kMaxStreams; ++i) { + SinkStream* stream = set->stream(i); + lengths[i] = stream->Length(); + if (lengths[i] > 0) + stream_count = i + 1; + } + + SinkStream* control_stream = this->stream(0); + control_stream->WriteVarint32(stream_count); + for (size_t i = 0; i < stream_count; ++i) { + control_stream->WriteVarint32(lengths[i]); + } + + for (size_t i = 0; i < stream_count; ++i) { + this->stream(i)->Append(set->stream(i)); + } + return true; +} + +} // namespace diff --git a/courgette/streams.h b/courgette/streams.h new file mode 100644 index 0000000..2fb824f --- /dev/null +++ b/courgette/streams.h @@ -0,0 +1,220 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// Streams classes. +// +// These memory-resident streams are used for serialzing data into a sequential +// region of memory. +// Streams are divided into SourceStreams for reading and SinkStreams for +// writing. Streams are aggregated into Sets which allows several streams to be +// used at once. Example: we can write A1, B1, A2, B2 but achive the memory +// layout A1 A2 B1 B2 by writing 'A's to one stream and 'B's to another. +#ifndef COURGETTE_STREAMS_H_ +#define COURGETTE_STREAMS_H_ + +#include <stdio.h> // for FILE* +#include <string> + +#include "base/basictypes.h" + +#include "courgette/region.h" + +namespace courgette { + +class SourceStream; +class SinkStream; + +// Maximum number of streams in a stream set. +static const unsigned int kMaxStreams = 10; + +// A SourceStream allows a region of memory to be scanned by a sequence of Read +// operations. The stream does not own the memory. +class SourceStream { + public: + SourceStream() : start_(NULL), end_(NULL), current_(NULL) {} + + // Initializes the SourceStream to yield the bytes at |pointer|. The caller + // still owns the memory at |pointer| and should free the memory only after + // the last use of the stream. + void Init(const void* pointer, size_t length) { + start_ = static_cast<const uint8*>(pointer); + end_ = start_ + length; + current_ = start_; + } + + // Initializes the SourceStream to yield the bytes in |region|. The caller + // still owns the memory at |region| and should free the memory only after + // the last use of the stream. + void Init(const Region& region) { Init(region.start(), region.length()); } + + // Initializes the SourceStream to yield the bytes in |string|. The caller + // still owns the memory at |string| and should free the memory only after + // the last use of the stream. + void Init(const std::string& string) { Init(string.c_str(), string.size()); } + + // Initializes the SourceStream to yield the bytes written to |sink|. |sink| + // still owns the memory, so needs to outlive |this|. |sink| should not be + // written to after |this| is initialized. + void Init(const SinkStream& sink); + + // Returns number of bytes remaining to be read from stream. + size_t Remaining() const { return end_ - current_; } + + // Returns initial length of stream before any data consumed by reading. + size_t OriginalLength() const { return end_ - start_; } + + const uint8* Buffer() const { return current_; } + bool Empty() const { return current_ == end_; } + + // Copies bytes from stream to memory at |destination|. Returns 'false' if + // insufficient data to satisfy request. + bool Read(void* destination, size_t byte_count); + + // Reads a varint formatted unsigned integer from stream. Returns 'false' if + // the read failed due to insufficient data or malformed Varint32. + bool ReadVarint32(uint32* output_value); + + // Reads a varint formatted signed integer from stream. Returns 'false' if + // the read failed due to insufficient data or malformed Varint32. + bool ReadVarint32Signed(int32* output_value); + + // Initializes |substream| to yield |length| bytes from |this| stream, + // starting at |offset| bytes from the current position. Returns 'false' if + // there are insufficient bytes in |this| stream. + bool ShareSubstream(size_t offset, size_t length, SourceStream* substream); + + // Initializes |substream| to yield |length| bytes from |this| stream, + // starting at the current position. Returns 'false' if there are + // insufficient bytes in |this| stream. + bool ShareSubstream(size_t length, SourceStream* substream) { + return ShareSubstream(0, length, substream); + } + + // Reads |length| bytes from |this| stream. Initializes |substream| to yield + // the bytes. Returns 'false' if there are insufficient bytes in |this| + // stream. + bool ReadSubstream(size_t length, SourceStream* substream); + + // Skips over bytes. Returns 'false' if insufficient data to satisfy request. + bool Skip(size_t byte_count); + + private: + const uint8* start_; // Points to start of buffer. + const uint8* end_; // Points to first location after buffer. + const uint8* current_; // Points into buffer at current read location. + + DISALLOW_COPY_AND_ASSIGN(SourceStream); +}; + +// A SinkStream accumulates writes into a buffer that it owns. The stream is +// initialy in an 'accumulating' state where writes are permitted. Accessing +// the buffer moves the stream into a 'locked' state where no more writes are +// permitted. The stream may also be in a 'retired' state where the buffer +// contents are no longer available. +class SinkStream { + public: + SinkStream() {} + ~SinkStream() {} + + // Appends |byte_count| bytes from |data| to the stream. + void Write(const void* data, size_t byte_count); + + // Appends the 'varint32' encoding of |value| to the stream. + void WriteVarint32(uint32 value); + + // Appends the 'varint32' encoding of |value| to the stream. + void WriteVarint32Signed(int32 value); + + // Contents of |other| are appended to |this| stream. The |other| stream + // becomes retired. + void Append(SinkStream* other); + + // Returns the number of bytes in this SinkStream + size_t Length() const { return buffer_.size(); } + + // Returns a pointer to contiguously allocated Length() bytes in the stream. + // Writing to the stream invalidates the pointer. The SinkStream continues to + // own the memory. + const uint8* Buffer() const { + return reinterpret_cast<const uint8*>(buffer_.c_str()); + } + + // Hints that the stream will grow by an additional |length| bytes. + void Reserve(size_t length) { buffer_.reserve(length + buffer_.length()); } + + private: + std::string buffer_; // Use a string to manage the stream's memory. + + DISALLOW_COPY_AND_ASSIGN(SinkStream); +}; + +// A SourceStreamSet is a set of SourceStreams. +class SourceStreamSet { + public: + SourceStreamSet(); + ~SourceStreamSet(); + + // Initializes the SourceStreamSet with the stream data in memory at |source|. + // The caller continues to own the memory and should not modify or free the + // memory until the SourceStreamSet destructor has been called. + // + // The layout of the streams are as written by SinkStreamSet::CopyTo. + // Init returns 'false' if the layout is inconsistent with |byte_count|. + bool Init(const void* source, size_t byte_count); + + // Initializes |this| from |source|. The caller continues to own the memory + // because it continues to be owned by |source|. + bool Init(SourceStream* source); + + // Returns a pointer to one of the sub-streams. + SourceStream* stream(size_t id) { return id < count_ ? &streams_[id] : NULL; } + + // Initialize |set| from |this|. + bool ReadSet(SourceStreamSet* set); + + // Returns 'true' if all streams are completely consumed. + bool Empty() const; + + private: + size_t count_; + SourceStream streams_[kMaxStreams]; + + DISALLOW_COPY_AND_ASSIGN(SourceStreamSet); +}; + +class SinkStreamSet { + public: + SinkStreamSet(); + ~SinkStreamSet(); + + // Initializes the SinkStreamSet to have |stream_index_limit| streams. Must + // be <= kMaxStreams. If Init is not called the default is has kMaxStream. + void Init(size_t stream_index_limit); + + // Returns a pointer to a substream. + SinkStream* stream(size_t id) { return id < count_ ? &streams_[id] : NULL; } + + // CopyTo serializes the streams in the SinkStreamSet into a single target + // stream or file. The serialized format may be re-read by initializing a + // SourceStreamSet with a buffer containing the data. + bool CopyTo(SinkStream* combined_stream); + bool CopyToFile(FILE* file); + bool CopyToFileDescriptor(int file_descriptor); + + // Writes the streams of |set| into the corresponding streams of |this|. + // Stream zero first has some metadata written to it. |set| becomes retired. + // Partner to SourceStreamSet::ReadSet. + bool WriteSet(SinkStreamSet* set); + + private: + void CopyHeaderTo(SinkStream* stream); + + size_t count_; + SinkStream streams_[kMaxStreams]; + + DISALLOW_COPY_AND_ASSIGN(SinkStreamSet); +}; + +} // namespace +#endif // COURGETTE_STREAMS_H_ diff --git a/courgette/streams_unittest.cc b/courgette/streams_unittest.cc new file mode 100644 index 0000000..dd330ce --- /dev/null +++ b/courgette/streams_unittest.cc @@ -0,0 +1,200 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "courgette/streams.h" + +#include "testing/gtest/include/gtest/gtest.h" + +TEST(StreamsTest, SimpleWriteRead) { + const unsigned int kValue1 = 12345; + courgette::SinkStream sink; + + sink.WriteVarint32(kValue1); + + const uint8* sink_buffer = sink.Buffer(); + size_t length = sink.Length(); + + courgette::SourceStream source; + source.Init(sink_buffer, length); + + unsigned int value; + bool can_read = source.ReadVarint32(&value); + EXPECT_EQ(true, can_read); + EXPECT_EQ(kValue1, value); + EXPECT_EQ(0, source.Remaining()); +} + +TEST(StreamsTest, SimpleWriteRead2) { + courgette::SinkStream sink; + + sink.Write("Hello", 5); + + const uint8* sink_buffer = sink.Buffer(); + size_t sink_length = sink.Length(); + + courgette::SourceStream source; + source.Init(sink_buffer, sink_length); + + char text[10] = {0}; + bool can_read = source.Read(text, 5); + EXPECT_EQ(true, can_read); + EXPECT_EQ(0, memcmp("Hello", text, 5)); + EXPECT_EQ(0, source.Remaining()); +} + +TEST(StreamsTest, StreamSetWriteRead) { + courgette::SinkStreamSet out; + out.Init(4); + + const unsigned int kValue1 = 12345; + + out.stream(3)->WriteVarint32(kValue1); + + courgette::SinkStream collected; + + out.CopyTo(&collected); + + const uint8* collected_buffer = collected.Buffer(); + size_t collected_length = collected.Length(); + + courgette::SourceStreamSet in; + bool can_init = in.Init(collected_buffer, collected_length); + EXPECT_EQ(true, can_init); + + uint32 value; + bool can_read = in.stream(3)->ReadVarint32(&value); + EXPECT_EQ(true, can_read); + EXPECT_EQ(kValue1, value); + EXPECT_EQ(0, in.stream(3)->Remaining()); + EXPECT_EQ(0, in.stream(2)->Remaining()); +} + +TEST(StreamsTest, StreamSetWriteRead2) { + const size_t kNumberOfStreams = 4; + const unsigned int kEnd = ~0U; + + courgette::SinkStreamSet out; + out.Init(kNumberOfStreams); + + static const unsigned int data[] = { + 3, 123, 3, 1000, 0, 100, 2, 100, 0, 999999, + 0, 0, 0, 0, 1, 2, 1, 3, 1, 5, 0, 66, + // varint32 edge case values: + 1, 127, 1, 128, 1, 129, 1, 16383, 1, 16384, + kEnd + }; + + for (size_t i = 0; data[i] != kEnd; i += 2) { + size_t id = data[i]; + size_t datum = data[i + 1]; + out.stream(id)->WriteVarint32(datum); + } + + courgette::SinkStream collected; + + out.CopyTo(&collected); + + courgette::SourceStreamSet in; + bool can_init = in.Init(collected.Buffer(), collected.Length()); + EXPECT_EQ(true, can_init); + + for (size_t i = 0; data[i] != kEnd; i += 2) { + size_t id = data[i]; + size_t datum = data[i + 1]; + uint32 value = 77; + bool can_read = in.stream(id)->ReadVarint32(&value); + EXPECT_EQ(true, can_read); + EXPECT_EQ(datum, value); + } + + for (size_t i = 0; i < kNumberOfStreams; ++i) { + EXPECT_EQ(0, in.stream(i)->Remaining()); + } +} + +TEST(StreamsTest, SignedVarint32) { + courgette::SinkStream out; + + static const int32 data[] = { + 0, 64, 128, 8192, 16384, + 1 << 20, 1 << 21, 1 << 22, + 1 << 27, 1 << 28, + 0x7fffffff, -0x7fffffff + }; + + std::vector<int32> values; + for (size_t i = 0; i < sizeof(data)/sizeof(data[0]); ++i) { + int32 basis = data[i]; + for (int delta = -4; delta <= 4; ++delta) { + out.WriteVarint32Signed(basis + delta); + values.push_back(basis + delta); + out.WriteVarint32Signed(-basis + delta); + values.push_back(-basis + delta); + } + } + + courgette::SourceStream in; + in.Init(out); + + for (size_t i = 0; i < values.size(); ++i) { + int written_value = values[i]; + int32 datum; + bool can_read = in.ReadVarint32Signed(&datum); + EXPECT_EQ(written_value, datum); + } + + EXPECT_EQ(true, in.Empty()); +} + +TEST(StreamsTest, StreamSetReadWrite) { + courgette::SinkStreamSet out; + + { // Local scope for temporary stream sets. + courgette::SinkStreamSet subset1; + subset1.stream(3)->WriteVarint32(30000); + subset1.stream(5)->WriteVarint32(50000); + out.WriteSet(&subset1); + + courgette::SinkStreamSet subset2; + subset2.stream(2)->WriteVarint32(20000); + subset2.stream(6)->WriteVarint32(60000); + out.WriteSet(&subset2); + } + + courgette::SinkStream collected; + out.CopyTo(&collected); + courgette::SourceStreamSet in; + bool can_init_in = in.Init(collected.Buffer(), collected.Length()); + EXPECT_EQ(true, can_init_in); + + courgette::SourceStreamSet subset1; + bool can_read_1 = in.ReadSet(&subset1); + EXPECT_EQ(true, can_read_1); + EXPECT_EQ(false, in.Empty()); + + courgette::SourceStreamSet subset2; + bool can_read_2 = in.ReadSet(&subset2); + EXPECT_EQ(true, can_read_2); + EXPECT_EQ(true, in.Empty()); + + courgette::SourceStreamSet subset3; + bool can_read_3 = in.ReadSet(&subset3); + EXPECT_EQ(false, can_read_3); + + EXPECT_EQ(false, subset1.Empty()); + EXPECT_EQ(false, subset1.Empty()); + + uint32 datum; + EXPECT_EQ(true, subset1.stream(3)->ReadVarint32(&datum)); + EXPECT_EQ(30000, datum); + EXPECT_EQ(true, subset1.stream(5)->ReadVarint32(&datum)); + EXPECT_EQ(50000, datum); + EXPECT_EQ(true, subset1.Empty()); + + EXPECT_EQ(true, subset2.stream(2)->ReadVarint32(&datum)); + EXPECT_EQ(20000, datum); + EXPECT_EQ(true, subset2.stream(6)->ReadVarint32(&datum)); + EXPECT_EQ(60000, datum); + EXPECT_EQ(true, subset2.Empty()); +} diff --git a/courgette/testdata/en-US.dll b/courgette/testdata/en-US.dll Binary files differnew file mode 100644 index 0000000..f1ad46c --- /dev/null +++ b/courgette/testdata/en-US.dll diff --git a/courgette/testdata/setup1.exe b/courgette/testdata/setup1.exe Binary files differnew file mode 100644 index 0000000..da072f5 --- /dev/null +++ b/courgette/testdata/setup1.exe diff --git a/courgette/testdata/setup2.exe b/courgette/testdata/setup2.exe Binary files differnew file mode 100644 index 0000000..577a47f --- /dev/null +++ b/courgette/testdata/setup2.exe diff --git a/courgette/third_party/LICENCE b/courgette/third_party/LICENCE new file mode 100644 index 0000000..c146b5b --- /dev/null +++ b/courgette/third_party/LICENCE @@ -0,0 +1,121 @@ +BSD Protection License +February 2002 + +Preamble +-------- + +The Berkeley Software Distribution ("BSD") license has proven very effective +over the years at allowing for a wide spread of work throughout both +commercial and non-commercial products. For programmers whose primary +intention is to improve the general quality of available software, it is +arguable that there is no better license than the BSD license, as it +permits improvements to be used wherever they will help, without idealogical +or metallic constraint. + +This is of particular value to those who produce reference implementations +of proposed standards: The case of TCP/IP clearly illustrates that freely +and universally available implementations leads the rapid acceptance of +standards -- often even being used instead of a de jure standard (eg, OSI +network models). + +With the rapid proliferation of software licensed under the GNU General +Public License, however, the continued success of this role is called into +question. Given that the inclusion of a few lines of "GPL-tainted" work +into a larger body of work will result in restricted distribution -- and +given that further work will likely build upon the "tainted" portions, +making them difficult to remove at a future date -- there are inevitable +circumstances where authors would, in order to protect their goal of +providing for the widespread usage of their work, wish to guard against +such "GPL-taint". + +In addition, one can imagine that companies which operate by producing and +selling (possibly closed-source) code would wish to protect themselves +against the rise of a GPL-licensed competitor. While under existing +licenses this would mean not releasing their code under any form of open +license, if a license existed under which they could incorporate any +improvements back into their own (commercial) products then they might be +far more willing to provide for non-closed distribution. + +For the above reasons, we put forth this "BSD Protection License": A +license designed to retain the freedom granted by the BSD license to use +licensed works in a wide variety of settings, both non-commercial and +commercial, while protecting the work from having future contributors +restrict that freedom. + +The precise terms and conditions for copying, distribution, and +modification follow. + +BSD PROTECTION LICENSE +TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION, AND MODIFICATION +---------------------------------------------------------------- + +0. Definitions. + a) "Program", below, refers to any program or work distributed under + the terms of this license. + b) A "work based on the Program", below, refers to either the Program + or any derivative work under copyright law. + c) "Modification", below, refers to the act of creating derivative works. + d) "You", below, refers to each licensee. + +1. Scope. + This license governs the copying, distribution, and modification of the + Program. Other activities are outside the scope of this license; The + act of running the Program is not restricted, and the output from the + Program is covered only if its contents constitute a work based on the + Program. + +2. Verbatim copies. + You may copy and distribute verbatim copies of the Program as you + receive it, in any medium, provided that you conspicuously and + appropriately publish on each copy an appropriate copyright notice; keep + intact all the notices that refer to this License and to the absence of + any warranty; and give any other recipients of the Program a copy of this + License along with the Program. + +3. Modification and redistribution under closed license. + You may modify your copy or copies of the Program, and distribute + the resulting derivative works, provided that you meet the + following conditions: + a) The copyright notice and disclaimer on the Program must be reproduced + and included in the source code, documentation, and/or other materials + provided in a manner in which such notices are normally distributed. + b) The derivative work must be clearly identified as such, in order that + it may not be confused with the original work. + c) The license under which the derivative work is distributed must + expressly prohibit the distribution of further derivative works. + +4. Modification and redistribution under open license. + You may modify your copy or copies of the Program, and distribute + the resulting derivative works, provided that you meet the + following conditions: + a) The copyright notice and disclaimer on the Program must be reproduced + and included in the source code, documentation, and/or other materials + provided in a manner in which such notices are normally distributed. + b) You must clearly indicate the nature and date of any changes made + to the Program. The full details need not necessarily be included in + the individual modified files, provided that each modified file is + clearly marked as such and instructions are included on where the + full details of the modifications may be found. + c) You must cause any work that you distribute or publish, that in whole + or in part contains or is derived from the Program or any part + thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + +5. Implied acceptance. + You may not copy or distribute the Program or any derivative works except + as expressly provided under this license. Consequently, any such action + will be taken as implied acceptance of the terms of this license. + +6. NO WARRANTY. + THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, + INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY + AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + THE COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR + REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE FOR ANY DIRECT, + INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + ARISING OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING, BUT + NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR + TORT, EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE + POSSIBILITY OF SUCH DAMAGES. diff --git a/courgette/third_party/README.chromium b/courgette/third_party/README.chromium new file mode 100644 index 0000000..d3037a1 --- /dev/null +++ b/courgette/third_party/README.chromium @@ -0,0 +1,23 @@ +This directory contains an extensively modified version of Colin Percival's
+bsdiff, available in its original form from:
+
+ http://www.daemonology.net/bsdiff/
+
+The basic principles of operation are best understood by reading Colin's
+unpublised paper:
+
+Colin Percival, Naive differences of executable code, http://www.daemonology.net/bsdiff/, 200
+
+The copy on this directory so extensively modified that the binary format is
+incompatible with the original and it cannot be compiled outside the Chromium
+source tree or the Courgette project.
+
+List of changes made to original code:
+ - wrapped functions in 'courgette' namespace
+ - renamed .c files to .cc
+ - added bsdiff.h header file
+ - changed the code to use streams.h from courgette
+ - changed the encoding of numbers to use the 'varint' encoding
+ - reformatted code to be closer to Google coding standards
+ - renamed variables
+ - added comments
diff --git a/courgette/third_party/bsdiff.h b/courgette/third_party/bsdiff.h new file mode 100644 index 0000000..bf7fdec --- /dev/null +++ b/courgette/third_party/bsdiff.h @@ -0,0 +1,87 @@ +/*- + * Copyright 2003,2004 Colin Percival + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted providing that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Changelog: + * 2005-04-26 - Define the header as a C structure, add a CRC32 checksum to + * the header, and make all the types 32-bit. + * --Benjamin Smedberg <benjamin@smedbergs.us> + * 2009-03-31 - Change to use Streams. Move CRC code to crc.{h,cc} + * Changed status to an enum, removed unused status codes. + * --Stephen Adams <sra@chromium.org> + */ + +#ifndef COURGETTE_BSDIFF_H_ +#define COURGETTE_BSDIFF_H_ + +#include "base/basictypes.h" + +namespace courgette { + +enum BSDiffStatus { + OK = 0, + MEM_ERROR = 1, + CRC_ERROR = 2, + READ_ERROR = 3, + UNEXPECTED_ERROR = 4 +}; + +class SourceStream; +class SinkStream; + +// Creates a binary patch. +// +BSDiffStatus CreateBinaryPatch(SourceStream* old_stream, + SourceStream* new_stream, + SinkStream* patch_stream); + +// Applies the given patch file to a given source file. This method validates +// the CRC of the original file stored in the patch file, before applying the +// patch to it. +// +BSDiffStatus ApplyBinaryPatch(SourceStream* old_stream, + SourceStream* patch_stream, + SinkStream* new_stream); + + +// The following declarations are common to the patch-creation and +// patch-application code. + +// The patch stream starts with a MBSPatchHeader. +typedef struct MBSPatchHeader_ { + char tag[8]; // Contains MBS_PATCH_HEADER_TAG + uint32 slen; // Length of the file to be patched. + uint32 scrc32; // CRC32 of the file to be patched. + uint32 dlen; // Length of the result file. + uint32 cblen; // Length of the control block in bytes. + uint32 difflen; // Length of the diff block in bytes. + uint32 extralen; // Length of the extra block in bytes. +} MBSPatchHeader; + +// This is the value for the tag field. Must match length exactly, not counting +// null at end of string. +#define MBS_PATCH_HEADER_TAG "GBSDIF42" + +} // namespace +#endif // COURGETTE_BSDIFF_H_ diff --git a/courgette/third_party/bsdiff_apply.cc b/courgette/third_party/bsdiff_apply.cc new file mode 100644 index 0000000..4b6a011 --- /dev/null +++ b/courgette/third_party/bsdiff_apply.cc @@ -0,0 +1,165 @@ +/*- + * Copyright 2003,2004 Colin Percival + * All rights reserved + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted providing that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + * + * Changelog: + * 2009-03-31 - Change to use Streams. Move CRC code to crc.{h,cc} + * --Stephen Adams <sra@chromium.org> + */ + +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +#include "courgette/third_party/bsdiff.h" + +#include "courgette/crc.h" +#include "courgette/streams.h" + +namespace courgette { + +BSDiffStatus MBS_ReadHeader(SourceStream* stream, MBSPatchHeader* header) { + if (!stream->Read(header->tag, sizeof(header->tag))) return READ_ERROR; + if (!stream->ReadVarint32(&header->slen)) return READ_ERROR; + if (!stream->ReadVarint32(&header->scrc32)) return READ_ERROR; + if (!stream->ReadVarint32(&header->dlen)) return READ_ERROR; + if (!stream->ReadVarint32(&header->cblen)) return READ_ERROR; + if (!stream->ReadVarint32(&header->difflen)) return READ_ERROR; + if (!stream->ReadVarint32(&header->extralen)) return READ_ERROR; + + // The string will have a NUL terminator that we don't use, hence '-1'. + COMPILE_ASSERT(sizeof(MBS_PATCH_HEADER_TAG) - 1 == sizeof(header->tag), + MBS_PATCH_HEADER_TAG_must_match_header_field_size); + if (memcmp(header->tag, MBS_PATCH_HEADER_TAG, 8) != 0) + return UNEXPECTED_ERROR; + + size_t bytes_remaining = stream->Remaining(); + if (header->cblen + + header->difflen + + header->extralen != bytes_remaining) + return UNEXPECTED_ERROR; + + return OK; +} + +BSDiffStatus MBS_ApplyPatch(const MBSPatchHeader *header, + SourceStream* patch_stream, + const uint8* old_start, size_t old_size, + SinkStream* new_stream) { + const uint8* old_end = old_start + old_size; + + SourceStream control_stream; + + const uint8* control_start = patch_stream->Buffer(); + if (!patch_stream->ReadSubstream(header->cblen, &control_stream)) + return READ_ERROR; + if (!patch_stream->Skip(header->difflen + header->extralen)) + return READ_ERROR; + if (!patch_stream->Empty()) + return READ_ERROR; + + const uint8* diff_start = control_start + header->cblen; + const uint8* diff_end = diff_start + header->difflen; + const uint8* extra_start = diff_end; + const uint8* extra_end = extra_start + header->extralen; + + const uint8* old_position = old_start; + const uint8* diff_position = diff_start; + const uint8* extra_position = extra_start; + + new_stream->Reserve(header->dlen); + + while (!control_stream.Empty()) { + uint32 copy_count, extra_count; + int32 seek_adjustment; + if (!control_stream.ReadVarint32(©_count)) + return UNEXPECTED_ERROR; + if (!control_stream.ReadVarint32(&extra_count)) + return UNEXPECTED_ERROR; + if (!control_stream.ReadVarint32Signed(&seek_adjustment)) + return UNEXPECTED_ERROR; + +#ifdef DEBUG_bsmedberg + printf("Applying block: copy: %-8u extra: %-8u seek: %+i\n", + copy_count, extra_count, seek_adjustment); +#endif + // Byte-wise arithmetically add bytes from old file to bytes from the diff + // block. + if (copy_count > static_cast<size_t>(old_end - old_position)) + return UNEXPECTED_ERROR; + if (copy_count > static_cast<size_t>(diff_end - diff_position)) + return UNEXPECTED_ERROR; + + // Add together bytes from the 'old' file and the 'diff' stream. + for (size_t i = 0; i < copy_count; ++i) { + uint8 byte = old_position[i] + diff_position[i]; + new_stream->Write(&byte, 1); + } + old_position += copy_count; + diff_position += copy_count; + + // Copy bytes from the extra block. + if (extra_count > static_cast<size_t>(extra_end - extra_position)) + return UNEXPECTED_ERROR; + + new_stream->Write(extra_position, extra_count); + extra_position += extra_count; + + // "seek" forwards (or backwards) in oldfile. + if (old_position + seek_adjustment < old_start || + old_position + seek_adjustment > old_end) + return UNEXPECTED_ERROR; + + old_position += seek_adjustment; + } + + if (diff_position != diff_end) + return UNEXPECTED_ERROR; + if (extra_position != extra_end) + return UNEXPECTED_ERROR; + + return OK; +} + +BSDiffStatus ApplyBinaryPatch(SourceStream* old_stream, + SourceStream* patch_stream, + SinkStream* new_stream) { + MBSPatchHeader header; + BSDiffStatus ret = MBS_ReadHeader(patch_stream, &header); + if (ret != OK) return ret; + + const uint8* old_start = old_stream->Buffer(); + size_t old_size = old_stream->Remaining(); + + if (old_size != header.slen) return UNEXPECTED_ERROR; + + if (CalculateCrc(old_start, old_size) != header.scrc32) + return CRC_ERROR; + + MBS_ApplyPatch(&header, patch_stream, old_start, old_size, new_stream); + + return OK; +} + +} // namespace diff --git a/courgette/third_party/bsdiff_create.cc b/courgette/third_party/bsdiff_create.cc new file mode 100644 index 0000000..d59dc21 --- /dev/null +++ b/courgette/third_party/bsdiff_create.cc @@ -0,0 +1,432 @@ +/* + bsdiff.c -- Binary patch generator. + + Copyright 2003 Colin Percival + + For the terms under which this work may be distributed, please see + the adjoining file "LICENSE". + + ChangeLog: + 2005-05-05 - Use the modified header struct from bspatch.h; use 32-bit + values throughout. + --Benjamin Smedberg <benjamin@smedbergs.us> + 2005-05-18 - Use the same CRC algorithm as bzip2, and leverage the CRC table + provided by libbz2. + --Darin Fisher <darin@meer.net> + 2007-11-14 - Changed to use Crc from Lzma library instead of Bzip library + --Rahul Kuchhal + 2009-03-31 - Change to use Streams. Added lots of comments. + --Stephen Adams <sra@chromium.org> +*/ + +#include "courgette/third_party/bsdiff.h" + +#include <stdlib.h> +#include <algorithm> + +#include "base/logging.h" +#include "base/scoped_ptr.h" +#include "base/string_util.h" +#include "base/time.h" + +#include "courgette/crc.h" +#include "courgette/streams.h" + +namespace courgette { + +// ------------------------------------------------------------------------ +// +// The following code is taken verbatim from 'bsdiff.c'. Please keep all the +// code formatting and variable names. The only changes from the original are +// replacing tabs with spaces, indentation, and using 'const'. +// +// The code appears to be a rewritten version of the suffix array algorithm +// presented in "Faster Suffix Sorting" by N. Jesper Larsson and Kunihiko +// Sadakane, special-cased for bytes. + +static void +split(int *I,int *V,int start,int len,int h) +{ + int i,j,k,x,tmp,jj,kk; + + if(len<16) { + for(k=start;k<start+len;k+=j) { + j=1;x=V[I[k]+h]; + for(i=1;k+i<start+len;i++) { + if(V[I[k+i]+h]<x) { + x=V[I[k+i]+h]; + j=0; + }; + if(V[I[k+i]+h]==x) { + tmp=I[k+j];I[k+j]=I[k+i];I[k+i]=tmp; + j++; + }; + }; + for(i=0;i<j;i++) V[I[k+i]]=k+j-1; + if(j==1) I[k]=-1; + }; + return; + }; + + x=V[I[start+len/2]+h]; + jj=0;kk=0; + for(i=start;i<start+len;i++) { + if(V[I[i]+h]<x) jj++; + if(V[I[i]+h]==x) kk++; + }; + jj+=start;kk+=jj; + + i=start;j=0;k=0; + while(i<jj) { + if(V[I[i]+h]<x) { + i++; + } else if(V[I[i]+h]==x) { + tmp=I[i];I[i]=I[jj+j];I[jj+j]=tmp; + j++; + } else { + tmp=I[i];I[i]=I[kk+k];I[kk+k]=tmp; + k++; + }; + }; + + while(jj+j<kk) { + if(V[I[jj+j]+h]==x) { + j++; + } else { + tmp=I[jj+j];I[jj+j]=I[kk+k];I[kk+k]=tmp; + k++; + }; + }; + + if(jj>start) split(I,V,start,jj-start,h); + + for(i=0;i<kk-jj;i++) V[I[jj+i]]=kk-1; + if(jj==kk-1) I[jj]=-1; + + if(start+len>kk) split(I,V,kk,start+len-kk,h); +} + +static void +qsufsort(int *I,int *V,const unsigned char *old,int oldsize) +{ + int buckets[256]; + int i,h,len; + + for(i=0;i<256;i++) buckets[i]=0; + for(i=0;i<oldsize;i++) buckets[old[i]]++; + for(i=1;i<256;i++) buckets[i]+=buckets[i-1]; + for(i=255;i>0;i--) buckets[i]=buckets[i-1]; + buckets[0]=0; + + for(i=0;i<oldsize;i++) I[++buckets[old[i]]]=i; + I[0]=oldsize; + for(i=0;i<oldsize;i++) V[i]=buckets[old[i]]; + V[oldsize]=0; + for(i=1;i<256;i++) if(buckets[i]==buckets[i-1]+1) I[buckets[i]]=-1; + I[0]=-1; + + for(h=1;I[0]!=-(oldsize+1);h+=h) { + len=0; + for(i=0;i<oldsize+1;) { + if(I[i]<0) { + len-=I[i]; + i-=I[i]; + } else { + if(len) I[i-len]=-len; + len=V[I[i]]+1-i; + split(I,V,i,len,h); + i+=len; + len=0; + }; + }; + if(len) I[i-len]=-len; + }; + + for(i=0;i<oldsize+1;i++) I[V[i]]=i; +} + +static int +matchlen(const unsigned char *old,int oldsize,const unsigned char *newbuf,int newsize) +{ + int i; + + for(i=0;(i<oldsize)&&(i<newsize);i++) + if(old[i]!=newbuf[i]) break; + + return i; +} + +static int +search(int *I,const unsigned char *old,int oldsize, + const unsigned char *newbuf,int newsize,int st,int en,int *pos) +{ + int x,y; + + if(en-st<2) { + x=matchlen(old+I[st],oldsize-I[st],newbuf,newsize); + y=matchlen(old+I[en],oldsize-I[en],newbuf,newsize); + + if(x>y) { + *pos=I[st]; + return x; + } else { + *pos=I[en]; + return y; + } + } + + x=st+(en-st)/2; + if(memcmp(old+I[x],newbuf,std::min(oldsize-I[x],newsize))<0) { + return search(I,old,oldsize,newbuf,newsize,x,en,pos); + } else { + return search(I,old,oldsize,newbuf,newsize,st,x,pos); + } +} + +// End of 'verbatim' code. +// ------------------------------------------------------------------------ + +static void WriteHeader(SinkStream* stream, MBSPatchHeader* header) { + stream->Write(header->tag, sizeof(header->tag)); + stream->WriteVarint32(header->slen); + stream->WriteVarint32(header->scrc32); + stream->WriteVarint32(header->dlen); + stream->WriteVarint32(header->cblen); + stream->WriteVarint32(header->difflen); + stream->WriteVarint32(header->extralen); +} + +BSDiffStatus CreateBinaryPatch(SourceStream* old_stream, + SourceStream* new_stream, + SinkStream* patch_stream) +{ + base::Time start_bsdiff_time = base::Time::Now(); + LOG(INFO) << "Start bsdiff"; + size_t initial_patch_stream_length = patch_stream->Length(); + + const uint8* old = old_stream->Buffer(); + const int oldsize = old_stream->Remaining(); + + scoped_array<int> I(new(std::nothrow) int[oldsize + 1]); + scoped_array<int> V(new(std::nothrow) int[oldsize + 1]); + if (I == NULL || V == NULL) + return MEM_ERROR; + + base::Time q_start_time = base::Time::Now(); + qsufsort(I.get(), V.get(), old, oldsize); + LOG(INFO) << " done qsufsort " + << (base::Time::Now() - q_start_time).InSecondsF(); + V.reset(); + + const uint8* newbuf = new_stream->Buffer(); + const int newsize = new_stream->Remaining(); + + // Allocate newsize+1 bytes instead of newsize bytes to ensure that we never + // try to malloc(0) and get a NULL pointer. + + scoped_array<uint8> diff_bytes_array(new(std::nothrow) uint8[newsize + 1]); + scoped_array<uint8> extra_bytes_array(new(std::nothrow) uint8[newsize + 1]); + if (diff_bytes_array == NULL || extra_bytes_array == NULL) + return MEM_ERROR; + + uint8* diff_bytes = diff_bytes_array.get(); + uint8* extra_bytes = extra_bytes_array.get(); + int control_length = 0; + int diff_bytes_length = 0; + int diff_bytes_nonzero = 0; + int extra_bytes_length = 0; + int eblen = 0; + + SinkStream control_stream; + + // The patch format is a sequence of triples <copy,extra,seek> where 'copy' is + // the number of bytes to copy from the old file (possibly with mistakes), + // 'extra' is the number of bytes to copy from a stream of fresh bytes, and + // 'seek' is an offset to move to the position to copy for the next triple. + // + // The invariant at the top of this loop is that we are committed to emitting + // a triple for the part of |newbuf| surrounding a 'seed' match near + // |lastscan|. We are searching for a second match that will be the 'seed' of + // the next triple. As we scan through |newbuf|, one of four things can + // happen at the current position |scan|: + // + // 1. We find a nice match that appears to be consistent with the current + // seed. Continue scanning. It is likely that this match will become + // part of the 'copy'. + // + // 2. We find match which does much better than extending the current seed + // old match. Emit a triple for the current seed and take this match as + // the new seed for a new triple. By 'much better' we remove 8 mismatched + // bytes by taking the new seed. + // + // 3. There is not a good match. Continue scanning. These bytes will likely + // become part of the 'extra'. + // + // 4. There is no match because we reached the end of the input, |newbuf|. + + // This is how the loop advances through the bytes of |newbuf|: + // + // ...012345678901234567890123456789... + // ssssssssss Seed at |lastscan| + // xxyyyxxyyxy |scan| forward, cases (3)(x) & (1)(y) + // mmmmmmmm New match will start new seed case (2). + // fffffffffffffff |lenf| = scan forward from |lastscan| + // bbbb |lenb| = scan back from new seed |scan|. + // ddddddddddddddd Emit diff bytes for the 'copy'. + // xx Emit extra bytes. + // ssssssssssss |lastscan = scan - lenb| is new seed. + // x Cases (1) and (3) .... + + + int lastscan = 0, lastpos = 0, lastoffset = 0; + + int scan = 0; + int match_length = 0; + + while (scan < newsize) { + int pos = 0; + int oldscore = 0; // Count of how many bytes of the current match at |scan| + // extend the match at |lastscan|. + + scan += match_length; + for (int scsc = scan; scan < newsize; ++scan) { + match_length = search(I.get(), old, oldsize, + newbuf + scan, newsize - scan, + 0, oldsize, &pos); + + for ( ; scsc < scan + match_length ; scsc++) + if ((scsc + lastoffset < oldsize) && + (old[scsc + lastoffset] == newbuf[scsc])) + oldscore++; + + if ((match_length == oldscore) && (match_length != 0)) + break; // Good continuing match, case (1) + if (match_length > oldscore + 8) + break; // New seed match, case (2) + + if ((scan + lastoffset < oldsize) && + (old[scan + lastoffset] == newbuf[scan])) + oldscore--; + // Case (3) continues in this loop until we fall out of the loop (4). + } + + if ((match_length != oldscore) || (scan == newsize)) { // Cases (2) and (4) + // This next chunk of code finds the boundary between the bytes to be + // copied as part of the current triple, and the bytes to be copied as + // part of the next triple. The |lastscan| match is extended forwards as + // far as possible provided doing to does not add too many mistakes. The + // |scan| match is extended backwards in a similar way. + + // Extend the current match (if any) backwards. |lenb| is the maximal + // extension for which less than half the byte positions in the extension + // are wrong. + int lenb = 0; + if (scan < newsize) { // i.e. not case (4); there is a match to extend. + int score = 0, Sb = 0; + for (int i = 1; (scan >= lastscan + i) && (pos >= i); i++) { + if (old[pos - i] == newbuf[scan - i]) score++; + if (score*2 - i > Sb*2 - lenb) { Sb = score; lenb = i; } + } + } + + // Extend the lastscan match forward; |lenf| is the maximal extension for + // which less than half of the byte positions in entire lastscan match are + // wrong. There is a subtle point here: |lastscan| points to before the + // seed match by |lenb| bytes from the previous iteration. This is why + // the loop measures the total number of mistakes in the the match, not + // just the from the match. + int lenf = 0; + { + int score = 0, Sf = 0; + for (int i = 0; (lastscan + i < scan) && (lastpos + i < oldsize); ) { + if (old[lastpos + i] == newbuf[lastscan + i]) score++; + i++; + if (score*2 - i > Sf*2 - lenf) { Sf = score; lenf = i; } + } + } + + // If the extended scans overlap, pick a position in the overlap region + // that maximizes the exact matching bytes. + if (lastscan + lenf > scan - lenb) { + int overlap = (lastscan + lenf) - (scan - lenb); + int score = 0; + int Ss = 0, lens = 0; + for (int i = 0; i < overlap; i++) { + if (newbuf[lastscan + lenf - overlap + i] == + old[lastpos + lenf - overlap + i]) score++; + if (newbuf[scan - lenb + i] == old[pos - lenb + i]) score--; + if (score > Ss) { Ss = score; lens = i + 1; } + } + + lenf += lens - overlap; + lenb -= lens; + }; + + for (int i = 0; i < lenf; i++) { + uint8 diff_byte = newbuf[lastscan + i] - old[lastpos + i]; + diff_bytes[diff_bytes_length + i] = diff_byte; + if (diff_byte) + ++diff_bytes_nonzero; + } + int gap = (scan - lenb) - (lastscan + lenf); + for (int i = 0; i < gap; i++) + extra_bytes[extra_bytes_length + i] = newbuf[lastscan + lenf + i]; + + diff_bytes_length += lenf; + extra_bytes_length += gap; + + uint32 copy_count = lenf; + uint32 extra_count = gap; + int32 seek_adjustment = ((pos - lenb) - (lastpos + lenf)); + + control_stream.WriteVarint32(copy_count); + control_stream.WriteVarint32(extra_count); + control_stream.WriteVarint32Signed(seek_adjustment); + ++control_length; +#ifdef DEBUG_bsmedberg + LOG(INFO) << StringPrintf( + "Writing a block: copy: %-8u extra: %-8u seek: %+-9d", + copy_count, extra_count, seek_adjustment); +#endif + + lastscan = scan - lenb; // Include the backward extension in seed. + lastpos = pos - lenb; // ditto. + lastoffset = lastpos - lastscan; + } + } + + I.reset(); + + MBSPatchHeader header; + // The string will have a null terminator that we don't use, hence '-1'. + COMPILE_ASSERT(sizeof(MBS_PATCH_HEADER_TAG) - 1 == sizeof(header.tag), + MBS_PATCH_HEADER_TAG_must_match_header_field_size); + memcpy(header.tag, MBS_PATCH_HEADER_TAG, sizeof(header.tag)); + header.slen = oldsize; + header.scrc32 = CalculateCrc(old, oldsize); + header.dlen = newsize; + header.cblen = control_stream.Length(); + header.difflen = diff_bytes_length; + header.extralen = extra_bytes_length; + + WriteHeader(patch_stream, &header); + + patch_stream->Append(&control_stream); + patch_stream->Write(diff_bytes, diff_bytes_length); + patch_stream->Write(extra_bytes, extra_bytes_length); + + LOG(INFO) << "Control tuples: " << control_length + << " copy bytes: " << diff_bytes_length + << " mistakes: " << diff_bytes_nonzero + << " extra bytes: " << extra_bytes_length; + + LOG(INFO) << "Uncompressed bsdiff patch size " + << patch_stream->Length() - initial_patch_stream_length; + + LOG(INFO) << "End bsdiff " + << (base::Time::Now() - start_bsdiff_time).InSecondsF(); + + return OK; +} + +} // namespace diff --git a/courgette/win32_x86_generator.h b/courgette/win32_x86_generator.h new file mode 100644 index 0000000..3aabe9f --- /dev/null +++ b/courgette/win32_x86_generator.h @@ -0,0 +1,126 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// This is the transformation and adjustment for Windows X86 executables. + +#ifndef COURGETTE_WIN32_X86_GENERATOR_H_ +#define COURGETTE_WIN32_X86_GENERATOR_H_ + +#include "base/scoped_ptr.h" + +#include "courgette/ensemble.h" + +namespace courgette { + +class CourgetteWin32X86PatchGenerator : public TransformationPatchGenerator { + public: + CourgetteWin32X86PatchGenerator(Element* old_element, + Element* new_element, + CourgetteWin32X86Patcher* patcher) + : TransformationPatchGenerator(old_element, new_element, patcher) { + } + + CourgettePatchFile::TransformationMethodId Kind() { + return CourgettePatchFile::T_COURGETTE_WIN32_X86; + } + + Status WriteInitialParameters(SinkStream* parameter_stream) { + parameter_stream->WriteVarint32(old_element_->offset_in_ensemble()); + parameter_stream->WriteVarint32(old_element_->region().length()); + return C_OK; + // TODO(sra): Initialize |patcher_| with these parameters. + } + + Status PredictTransformParameters(SinkStreamSet* prediction) { + return TransformationPatchGenerator::PredictTransformParameters(prediction); + } + + Status CorrectedTransformParameters(SinkStreamSet* parameters) { + // No code needed to write an 'empty' parameter set. + return C_OK; + } + + // The format of a transformed_element is a serialized EncodedProgram. We + // first disassemble the original old and new Elements into AssemblyPrograms. + // Then we adjust the new AssemblyProgram to make it as much like the old one + // as possible, before converting the AssemblyPrograms to EncodedPrograms and + // serializing them. + Status Transform(SourceStreamSet* corrected_parameters, + SinkStreamSet* old_transformed_element, + SinkStreamSet* new_transformed_element) { + // Don't expect any corrected parameters. + if (!corrected_parameters->Empty()) + return C_GENERAL_ERROR; + + // Generate old version of program using |corrected_parameters|. + // TODO(sra): refactor to use same code from patcher_. + AssemblyProgram* old_program = NULL; + Status old_parse_status = + ParseWin32X86PE(old_element_->region().start(), + old_element_->region().length(), + &old_program); + if (old_parse_status != C_OK) + return old_parse_status; + + AssemblyProgram* new_program = NULL; + Status new_parse_status = + ParseWin32X86PE(new_element_->region().start(), + new_element_->region().length(), + &new_program); + if (new_parse_status != C_OK) { + DeleteAssemblyProgram(old_program); + return new_parse_status; + } + + EncodedProgram* old_encoded = NULL; + Status old_encode_status = Encode(old_program, &old_encoded); + if (old_encode_status != C_OK) { + DeleteAssemblyProgram(old_program); + return old_encode_status; + } + + Status old_write_status = + WriteEncodedProgram(old_encoded, old_transformed_element); + DeleteEncodedProgram(old_encoded); + if (old_write_status != C_OK) { + DeleteAssemblyProgram(old_program); + return old_write_status; + } + + Status adjust_status = Adjust(*old_program, new_program); + DeleteAssemblyProgram(old_program); + if (adjust_status != C_OK) { + DeleteAssemblyProgram(new_program); + return adjust_status; + } + + EncodedProgram* new_encoded = NULL; + Status new_encode_status = Encode(new_program, &new_encoded); + DeleteAssemblyProgram(new_program); + if (new_encode_status != C_OK) + return new_encode_status; + + Status new_write_status = + WriteEncodedProgram(new_encoded, new_transformed_element); + DeleteEncodedProgram(new_encoded); + if (new_write_status != C_OK) + return new_write_status; + + return C_OK; + } + + Status Reform(SourceStreamSet* transformed_element, + SinkStream* reformed_element) { + return TransformationPatchGenerator::Reform(transformed_element, + reformed_element); + } + + private: + ~CourgetteWin32X86PatchGenerator() { } + + DISALLOW_COPY_AND_ASSIGN(CourgetteWin32X86PatchGenerator); +}; + +} // namespace courgette +#endif // COURGETTE_WIN32_X86_GENERATOR_H_ diff --git a/courgette/win32_x86_patcher.h b/courgette/win32_x86_patcher.h new file mode 100644 index 0000000..9ab53d9 --- /dev/null +++ b/courgette/win32_x86_patcher.h @@ -0,0 +1,95 @@ +// Copyright (c) 2009 The Chromium Authors. All rights reserved. +// Use of this source code is governed by a BSD-style license that can be +// found in the LICENSE file. + +// This is the transformation for Windows X86 executables. + +#ifndef COURGETTE_WIN32_X86_PATCHER_H_ +#define COURGETTE_WIN32_X86_PATCHER_H_ + +#include "courgette/ensemble.h" + +namespace courgette { + +// CourgetteWin32X86Patcher is a TransformationPatcher for Windows 32-bit +// executables. +// +class CourgetteWin32X86Patcher : public TransformationPatcher { + public: + explicit CourgetteWin32X86Patcher(const Region& region) + : ensemble_region_(region) { + } + + Status Init(SourceStream* parameter_stream) { + if (!parameter_stream->ReadVarint32(&base_offset_)) + return C_BAD_TRANSFORM; + if (!parameter_stream->ReadVarint32(&base_length_)) + return C_BAD_TRANSFORM; + + if (base_offset_ > ensemble_region_.length()) + return C_BAD_TRANSFORM; + if (base_length_ > ensemble_region_.length() - base_offset_) + return C_BAD_TRANSFORM; + + return C_OK; + } + + Status PredictTransformParameters(SinkStreamSet* predicted_parameters) { + // No code needed to write an 'empty' predicted parameter set. + return C_OK; + } + + Status Transform(SourceStreamSet* corrected_parameters, + SinkStreamSet* transformed_element) { + Status status; + if (!corrected_parameters->Empty()) + return C_GENERAL_ERROR; // Don't expect any corrected parameters. + + AssemblyProgram* program = NULL; + status = ParseWin32X86PE(ensemble_region_.start() + base_offset_, + base_length_, + &program); + if (status != C_OK) + return status; + + EncodedProgram* encoded = NULL; + status = Encode(program, &encoded); + DeleteAssemblyProgram(program); + if (status != C_OK) + return status; + + status = WriteEncodedProgram(encoded, transformed_element); + DeleteEncodedProgram(encoded); + if (status != C_OK) + return status; + + return status; + } + + Status Reform(SourceStreamSet* transformed_element, + SinkStream* reformed_element) { + Status status; + EncodedProgram* encoded_program = NULL; + status = ReadEncodedProgram(transformed_element, &encoded_program); + if (status != C_OK) + return status; + + status = Assemble(encoded_program, reformed_element); + DeleteEncodedProgram(encoded_program); + if (status != C_OK) + return status; + + return C_OK; + } + + private: + Region ensemble_region_; + + uint32 base_offset_; + uint32 base_length_; + + DISALLOW_COPY_AND_ASSIGN(CourgetteWin32X86Patcher); +}; + +} // namespace +#endif // COURGETTE_WIN32_X86_PATCHER_H_ |