summaryrefslogtreecommitdiffstats
path: root/courgette/assembly_program.h
blob: 65147a8c04d5137b30f39b424f78ad080a256529 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
// Copyright (c) 2011 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef COURGETTE_ASSEMBLY_PROGRAM_H_
#define COURGETTE_ASSEMBLY_PROGRAM_H_

#include <stddef.h>
#include <stdint.h>

#include <map>
#include <set>
#include <vector>

#include "base/macros.h"
#include "base/memory/free_deleter.h"
#include "base/memory/scoped_ptr.h"
#include "courgette/courgette.h"
#include "courgette/image_utils.h"
#include "courgette/label_manager.h"
#include "courgette/memory_allocator.h"

namespace courgette {

class EncodedProgram;

// Opcodes of simple assembly language
enum OP {
  ORIGIN,         // ORIGIN <rva> - set current address for assembly.
  MAKEPERELOCS,   // Generates a base relocation table.
  MAKEELFRELOCS,  // Generates a base relocation table.
  DEFBYTE,        // DEFBYTE <value> - emit a byte literal.
  REL32,          // REL32 <label> - emit a rel32 encoded reference to 'label'.
  ABS32,          // ABS32 <label> - emit an abs32 encoded reference to 'label'.
  REL32ARM,       // REL32ARM <c_op> <label> - arm-specific rel32 reference
  MAKEELFARMRELOCS,  // Generates a base relocation table.
  DEFBYTES,       // Emits any number of byte literals
  ABS64,          // ABS64 <label> - emit an abs64 encoded reference to 'label'.
  LAST_OP
};

// Base class for instructions.  Because we have so many instructions we want to
// keep them as small as possible.  For this reason we avoid virtual functions.
class Instruction {
 public:
  OP op() const { return static_cast<OP>(op_); }

 protected:
  explicit Instruction(OP op) : op_(op), info_(0) {}
  Instruction(OP op, unsigned int info) : op_(op), info_(info) {}

  uint32_t op_ : 4;     // A few bits to store the OP code.
  uint32_t info_ : 28;  // Remaining bits in first word available to subclass.

 private:
  DISALLOW_COPY_AND_ASSIGN(Instruction);
};

typedef NoThrowBuffer<Instruction*> InstructionVector;

// An AssemblyProgram is the result of disassembling an executable file.
//
// * The disassembler creates labels in the AssemblyProgram and emits
//   'Instructions'.
// * The disassembler then calls DefaultAssignIndexes to assign
//   addresses to positions in the address tables.
// * [Optional step]
// * At this point the AssemblyProgram can be converted into an
//   EncodedProgram and serialized to an output stream.
// * Later, the EncodedProgram can be deserialized and assembled into
//   the original file.
//
// The optional step is to modify the AssemblyProgram.  One form of modification
// is to assign indexes in such a way as to make the EncodedProgram for this
// AssemblyProgram look more like the EncodedProgram for some other
// AssemblyProgram.  The modification process should call UnassignIndexes, do
// its own assignment, and then call AssignRemainingIndexes to ensure all
// indexes are assigned.
//
class AssemblyProgram {
 public:
  explicit AssemblyProgram(ExecutableType kind);
  ~AssemblyProgram();

  ExecutableType kind() const { return kind_; }

  void set_image_base(uint64_t image_base) { image_base_ = image_base; }

  // Instructions will be assembled in the order they are emitted.

  // Generates an entire base relocation table.
  CheckBool EmitPeRelocsInstruction() WARN_UNUSED_RESULT;

  // Generates an ELF style relocation table for X86.
  CheckBool EmitElfRelocationInstruction() WARN_UNUSED_RESULT;

  // Generates an ELF style relocation table for ARM.
  CheckBool EmitElfARMRelocationInstruction() WARN_UNUSED_RESULT;

  // Following instruction will be assembled at address 'rva'.
  CheckBool EmitOriginInstruction(RVA rva) WARN_UNUSED_RESULT;

  // Generates a single byte of data or machine instruction.
  CheckBool EmitByteInstruction(uint8_t byte) WARN_UNUSED_RESULT;

  // Generates multiple bytes of data or machine instructions.
  CheckBool EmitBytesInstruction(const uint8_t* value,
                                 size_t len) WARN_UNUSED_RESULT;

  // Generates 4-byte relative reference to address of 'label'.
  CheckBool EmitRel32(Label* label) WARN_UNUSED_RESULT;

  // Generates 4-byte relative reference to address of 'label' for
  // ARM.
  CheckBool EmitRel32ARM(uint16_t op,
                         Label* label,
                         const uint8_t* arm_op,
                         uint16_t op_size) WARN_UNUSED_RESULT;

  // Generates 4-byte absolute reference to address of 'label'.
  CheckBool EmitAbs32(Label* label) WARN_UNUSED_RESULT;

  // Generates 8-byte absolute reference to address of 'label'.
  CheckBool EmitAbs64(Label* label) WARN_UNUSED_RESULT;

  // Looks up a label or creates a new one.  Might return NULL.
  Label* FindOrMakeAbs32Label(RVA rva);

  // Looks up a label or creates a new one.  Might return NULL.
  Label* FindOrMakeRel32Label(RVA rva);

  void DefaultAssignIndexes();
  void UnassignIndexes();
  void AssignRemainingIndexes();

  scoped_ptr<EncodedProgram> Encode() const;

  // Accessor for instruction list.
  const InstructionVector& instructions() const {
    return instructions_;
  }

  // Returns the label if the instruction contains an absolute 32-bit address,
  // otherwise returns NULL.
  Label* InstructionAbs32Label(const Instruction* instruction) const;

  // Returns the label if the instruction contains an absolute 64-bit address,
  // otherwise returns NULL.
  Label* InstructionAbs64Label(const Instruction* instruction) const;

  // Returns the label if the instruction contains a rel32 offset,
  // otherwise returns NULL.
  Label* InstructionRel32Label(const Instruction* instruction) const;

  // Removes underused Labels. Thresholds used (may be 0, i.e., no trimming) is
  // dependent on architecture. Returns true on success, and false otherwise.
  CheckBool TrimLabels();

 private:
  using ScopedInstruction =
      scoped_ptr<Instruction, UncheckedDeleter<Instruction>>;

  ExecutableType kind_;

  CheckBool Emit(ScopedInstruction instruction) WARN_UNUSED_RESULT;
  CheckBool EmitShared(Instruction* instruction) WARN_UNUSED_RESULT;

  static const int kLabelLowerLimit;

  // Looks up a label or creates a new one.  Might return NULL.
  Label* FindLabel(RVA rva, RVAToLabel* labels);

  // Helper methods for the public versions.
  static void UnassignIndexes(RVAToLabel* labels);
  static void DefaultAssignIndexes(RVAToLabel* labels);
  static void AssignRemainingIndexes(RVAToLabel* labels);

  // Sharing instructions that emit a single byte saves a lot of space.
  Instruction* GetByteInstruction(uint8_t byte);
  scoped_ptr<Instruction* [], base::FreeDeleter> byte_instruction_cache_;

  uint64_t image_base_;  // Desired or mandated base address of image.

  InstructionVector instructions_;  // All the instructions in program.

  // These are lookup maps to find the label associated with a given address.
  // We have separate label spaces for addresses referenced by rel32 labels and
  // abs32 labels.  This is somewhat arbitrary.
  RVAToLabel rel32_labels_;
  RVAToLabel abs32_labels_;

  DISALLOW_COPY_AND_ASSIGN(AssemblyProgram);
};

// Converts |program| into encoded form, returning it as |*output|.
// Returns C_OK if succeeded, otherwise returns an error status and sets
// |*output| to null.
Status Encode(const AssemblyProgram& program,
              scoped_ptr<EncodedProgram>* output);

}  // namespace courgette

#endif  // COURGETTE_ASSEMBLY_PROGRAM_H_