sdch/open-vcdiff/src/addrcache.cc


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331

// Copyright 2007 Google Inc.
// Author: Lincoln Smith
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Implementation of the Address Cache and Address Encoding
// algorithms described in sections 5.1 - 5.4 of RFC 3284 -
// The VCDIFF Generic Differencing and Compression Data Format.
// The RFC text can be found at http://www.faqs.org/rfcs/rfc3284.html
//
// Assumptions:
//   * The VCDAddress type is large enough to hold any offset within
//     the source and target windows.  The limit (for int32_t) is 2^31-1 bytes.
//     The source (dictionary) should not approach this size limit;
//     to compress a target file that is larger than
//     INT_MAX - (dictionary size) bytes, the encoder must
//     break it up into multiple target windows.

#include <config.h>
#include "addrcache.h"
#include "logging.h"
#include "varint_bigendian.h"
#include "vcdiff_defs.h"  // RESULT_ERROR

namespace open_vcdiff {

// The constructor does not initialize near_addresses_ and same_addresses_.
// Therefore, Init() must be called before any other method can be used.
//
// Arguments:
//   near_cache_size: Size of the NEAR cache (number of 4-byte integers)
//   same_cache_size: Size of the SAME cache (number of blocks of
//                                            256 4-byte integers per block)
//     Because the mode is expressed as a byte value,
//     near_cache_size + same_cache_size should not exceed 254.
//
VCDiffAddressCache::VCDiffAddressCache(int near_cache_size,
                                       int same_cache_size)
    : near_cache_size_(near_cache_size),
      same_cache_size_(same_cache_size),
      next_slot_(0) { }

VCDiffAddressCache::VCDiffAddressCache()
    : near_cache_size_(kDefaultNearCacheSize),
      same_cache_size_(kDefaultSameCacheSize),
      next_slot_(0) { }

// Sets up data structures needed to call other methods.  Operations that may
// fail at runtime (for example, validating the provided near_cache_size_ and
// same_cache_size_ parameters against their maximum allowed values) are
// confined to this routine in order to guarantee that the class constructor
// will never fail.  Other methods (except the destructor) cannot be invoked
// until this method has been called successfully.  After the object has been
// initialized and used, Init() can be called again to reset it to its initial
// state.
//
// Return value: "true" if initialization succeeded, "false" if it failed.
//     No other method except the destructor may be invoked if this function
//     returns false.  The caller is responsible for checking the return value
//     and providing an exit path in case of error.
//
bool VCDiffAddressCache::Init() {
  // The mode is expressed as a byte value, so there is only room for 256 modes,
  // including the two non-cached modes (SELF and HERE).  Do not allow a larger
  // number of modes to be defined.  We do a separate sanity check for
  // near_cache_size_ and same_cache_size_ because adding them together can
  // cause an integer overflow if each is set to, say, INT_MAX.
  if ((near_cache_size_ > (VCD_MAX_MODES - 2)) || (near_cache_size_ < 0)) {
    LOG(ERROR) << "Near cache size " << near_cache_size_ << " is invalid"
               << LOG_ENDL;
    return false;
  }
  if ((same_cache_size_ > (VCD_MAX_MODES - 2)) || (same_cache_size_ < 0)) {
    LOG(ERROR) << "Same cache size " << same_cache_size_ << " is invalid"
               << LOG_ENDL;
    return false;
  }
  if ((near_cache_size_ + same_cache_size_) > VCD_MAX_MODES - 2) {
    LOG(ERROR) << "Using near cache size " << near_cache_size_
               << " and same cache size " << same_cache_size_
               << " would exceed maximum number of COPY modes ("
               << VCD_MAX_MODES << ")" << LOG_ENDL;
    return false;
  }
  if (near_cache_size_ > 0) {
    near_addresses_.assign(near_cache_size_, 0);
  }
  if (same_cache_size_ > 0) {
    same_addresses_.assign(same_cache_size_ * 256, 0);
  }
  next_slot_ = 0;  // in case Init() is called a second time to reinit
  return true;
}

// This method will be called whenever an address is calculated for an
// encoded or decoded COPY instruction, and will update the contents
// of the SAME and NEAR caches.  It is vital that the use of
// UpdateCache (called cache_update in the RFC examples) exactly match
// the RFC standard, and that the same caching logic be used in the
// decoder as in the encoder, in order for the decoded addresses to
// match.
//
// Argument:
//   address: This must be a valid address between 0 and
//       (source window size + target window size).  It is assumed that
//       these bounds have been checked before calling UpdateCache.
//
void VCDiffAddressCache::UpdateCache(VCDAddress address) {
  if (near_cache_size_ > 0) {
    near_addresses_[next_slot_] = address;
    next_slot_ = (next_slot_ + 1) % near_cache_size_;
  }
  if (same_cache_size_ > 0) {
    same_addresses_[address % (same_cache_size_ * 256)] = address;
  }
}

// Determines the address mode that yields the most compact encoding
// of the given address value, writes the encoded address into the
// address stream, and returns the mode used.  The most compact encoding
// is found by looking for the numerically lowest encoded address.
// The Init() function must already have been called.
//
// Arguments:
//   address: The address to be encoded.  Must be a non-negative integer
//       between 0 and (here_address - 1).
//   here_address: The current location in the target data (i.e., the
//       position just after the last encoded value.)  Must be non-negative.
//   encoded_addr: Points to an VCDAddress that will be replaced
//       with the encoded representation of address.
//       If WriteAddressAsVarintForMode returns true when passed
//       the return value, then encoded_addr should be written
//       into the delta file as a variable-length integer (Varint);
//       otherwise, it should be written as a byte (unsigned char).
//
// Return value: A mode value between 0 and 255.  The mode will tell
//       how to interpret the next value in the address stream.
//       The values 0 and 1 correspond to SELF and HERE addressing.
//
// The function is guaranteed to succeed unless the conditions on the arguments
// have not been met, in which case a LOG(DFATAL) message will be produced,
// 0 will be returned, and *encoded_addr will be replaced with 0.
//
unsigned char VCDiffAddressCache::EncodeAddress(VCDAddress address,
                                                VCDAddress here_address,
                                                VCDAddress* encoded_addr) {
  if (address < 0) {
    LOG(DFATAL) << "EncodeAddress was passed a negative address: "
                << address << LOG_ENDL;
    *encoded_addr = 0;
    return 0;
  }
  if (address >= here_address) {
    LOG(DFATAL) << "EncodeAddress was called with address (" << address
                << ") < here_address (" << here_address << ")" << LOG_ENDL;
    *encoded_addr = 0;
    return 0;
  }
  // Try using the SAME cache.  This method, if available, always
  // results in the smallest encoding and takes priority over other modes.
  if (same_cache_size() > 0) {
    const VCDAddress same_cache_pos =
        address % (same_cache_size() * 256);
    if (SameAddress(same_cache_pos) == address) {
      // This is the only mode for which an single byte will be written
      // to the address stream instead of a variable-length integer.
      UpdateCache(address);
      *encoded_addr = same_cache_pos % 256;
      return FirstSameMode() + (same_cache_pos / 256);  // SAME mode
    }
  }

  // Try SELF mode
  unsigned char best_mode = VCD_SELF_MODE;
  VCDAddress best_encoded_address = address;

  // Try HERE mode
  {
    const VCDAddress here_encoded_address = here_address - address;
    if (here_encoded_address < best_encoded_address) {
      best_mode = VCD_HERE_MODE;
      best_encoded_address = here_encoded_address;
    }
  }

  // Try using the NEAR cache
  for (int i = 0; i < near_cache_size(); ++i) {
    const VCDAddress near_encoded_address = address - NearAddress(i);
    if ((near_encoded_address >= 0) &&
        (near_encoded_address < best_encoded_address)) {
      best_mode = FirstNearMode() + i;
      best_encoded_address = near_encoded_address;
    }
  }

  UpdateCache(address);
  *encoded_addr = best_encoded_address;
  return best_mode;
}

// Increments *byte_pointer and returns the byte it pointed to before the
// increment.  The caller must check bounds to ensure that *byte_pointer
// points to a valid address in memory.
static unsigned char ParseByte(const char** byte_pointer) {
  unsigned char byte_value = static_cast<unsigned char>(**byte_pointer);
  ++(*byte_pointer);
  return byte_value;
}

// Checks the given decoded address for validity.  Returns true if the
// address is valid; otherwise, prints an error message to the log and
// returns false.
static bool IsDecodedAddressValid(VCDAddress decoded_address,
                                  VCDAddress here_address) {
  if (decoded_address < 0) {
    LOG(ERROR) << "Decoded address " << decoded_address << " is invalid"
               << LOG_ENDL;
    return false;
  } else if (decoded_address >= here_address) {
    LOG(ERROR) << "Decoded address (" << decoded_address
               << ") is beyond location in target file (" << here_address
               << ")" << LOG_ENDL;
    return false;
  }
  return true;
}

// Interprets the next value in the address_stream using the provided mode,
// which may need to access the SAME or NEAR address cache.  Returns the
// decoded address.
// The Init() function must already have been called.
//
// Arguments:
//   here_address: The current location in the source + target data (i.e., the
//       location into which the COPY instruction will copy.)  By definition,
//       all addresses between 0 and (here_address - 1) are valid, and
//       any other address is invalid.
//   mode: A byte value between 0 and (near_cache_size_ + same_cache_size_ + 1)
//       which tells how to interpret the next value in the address stream.
//       The values 0 and 1 correspond to SELF and HERE addressing.
//       The validity of "mode" should already have been checked before
//       calling this function.
//   address_stream: Points to a pointer holding the position
//       in the "Addresses section for COPYs" part of the input data.
//       That section must already have been uncompressed
//       using a secondary decompressor (if necessary.)
//       This is an IN/OUT argument; the value of *address_stream will be
//       incremented by the size of an integer, or (if the SAME cache
//       was used) by the size of a byte (1).
//   address_stream_end: Points to the position just after the end of
//       the address stream buffer.  All addresses between *address_stream
//       and address_stream_end should contain valid address data.
//
// Return value: If the input conditions were met, and the address section
//     of the input data contains properly encoded addresses that match
//     the instructions section, then an integer between 0 and here_address - 1
//     will be returned, representing the address from which data should
//     be copied from the source or target window into the output stream.
//     If an invalid address value is found in address_stream, then
//     RESULT_ERROR will be returned.  If the limit address_stream_end
//     is reached before the address can be decoded, then
//     RESULT_END_OF_DATA will be returned.  If more streamed data
//     is expected, this means that the consumer should block and wait
//     for more data before continuing to decode.  If no more data is expected,
//     this return value signals an error condition.
//
VCDAddress VCDiffAddressCache::DecodeAddress(VCDAddress here_address,
                                             unsigned char mode,
                                             const char** address_stream,
                                             const char* address_stream_end) {
  if (here_address < 0) {
    LOG(DFATAL) << "DecodeAddress was passed a negative value"
                   " for here_address: " << here_address << LOG_ENDL;
    return RESULT_ERROR;
  }
  const char* new_address_pos = *address_stream;
  if (new_address_pos >= address_stream_end) {
    return RESULT_END_OF_DATA;
  }
  VCDAddress decoded_address;
  if (IsSameMode(mode)) {
    // SAME mode expects a byte value as the encoded address
    unsigned char encoded_address = ParseByte(&new_address_pos);
    decoded_address = DecodeSameAddress(mode, encoded_address);
  } else {
    // All modes except SAME mode expect a VarintBE as the encoded address
    int32_t encoded_address = VarintBE<int32_t>::Parse(address_stream_end,
                                                       &new_address_pos);
    switch (encoded_address) {
      case RESULT_ERROR:
        LOG(ERROR) << "Found invalid variable-length integer "
                      "as encoded address value" << LOG_ENDL;
        return RESULT_ERROR;
      case RESULT_END_OF_DATA:
        return RESULT_END_OF_DATA;
      default:
        break;
    }
    if (IsSelfMode(mode)) {
      decoded_address = DecodeSelfAddress(encoded_address);
    } else if (IsHereMode(mode)) {
      decoded_address = DecodeHereAddress(encoded_address, here_address);
    } else if (IsNearMode(mode)) {
      decoded_address = DecodeNearAddress(mode, encoded_address);
    } else {
      LOG(DFATAL) << "Invalid mode value (" << static_cast<int>(mode)
                  << ") passed to DecodeAddress; maximum mode value = "
                  << static_cast<int>(LastMode()) << LOG_ENDL;
      return RESULT_ERROR;
    }
  }
  // Check for an out-of-bounds address (corrupt/malicious data)
  if (!IsDecodedAddressValid(decoded_address, here_address)) {
    return RESULT_ERROR;
  }
  *address_stream = new_address_pos;
  UpdateCache(decoded_address);
  return decoded_address;
}

}  // namespace open_vcdiff