• Home
  • History
  • Annotate
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006, 2008 Google Inc.
2 // Authors: Chandra Chereddi, Lincoln Smith
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 //      http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15 
16 #include <config.h>
17 #include "vcdiffengine.h"
18 #include <stdint.h>  // uint32_t
19 #include <string.h>  // memcpy
20 #include "blockhash.h"
21 #include "codetablewriter_interface.h"
22 #include "logging.h"
23 #include "rolling_hash.h"
24 
25 namespace open_vcdiff {
26 
VCDiffEngine(const char * dictionary,size_t dictionary_size)27 VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size)
28     // If dictionary_size == 0, then dictionary could be NULL.  Guard against
29     // using a NULL value.
30     : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""),
31       dictionary_size_(dictionary_size),
32       hashed_dictionary_(NULL) {
33   if (dictionary_size > 0) {
34     memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size);
35   }
36 }
37 
~VCDiffEngine()38 VCDiffEngine::~VCDiffEngine() {
39   delete hashed_dictionary_;
40   if (dictionary_size_ > 0) {
41     delete[] dictionary_;
42   }
43 }
44 
Init()45 bool VCDiffEngine::Init() {
46   if (hashed_dictionary_) {
47     VCD_DFATAL << "Init() called twice for same VCDiffEngine object"
48                << VCD_ENDL;
49     return false;
50   }
51   hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_,
52                                                        dictionary_size());
53   if (!hashed_dictionary_) {
54     VCD_DFATAL << "Creation of dictionary hash failed" << VCD_ENDL;
55     return false;
56   }
57   RollingHash<BlockHash::kBlockSize>::Init();
58   return true;
59 }
60 
61 // This helper function tries to find an appropriate match within
62 // hashed_dictionary_ for the block starting at the current target position.
63 // If target_hash is not NULL, this function will also look for a match
64 // within the previously encoded target data.
65 //
66 // If a match is found, this function will generate an ADD instruction
67 // for all unencoded data that precedes the match,
68 // and a COPY instruction for the match itself; then it returns
69 // the number of bytes processed by both instructions,
70 // which is guaranteed to be > 0.
71 // If no appropriate match is found, the function returns 0.
72 //
73 // The first four parameters are input parameters which are passed
74 // directly to BlockHash::FindBestMatch; please see that function
75 // for a description of their allowable values.
76 template<bool look_for_target_matches>
EncodeCopyForBestMatch(uint32_t hash_value,const char * target_candidate_start,const char * unencoded_target_start,size_t unencoded_target_size,const BlockHash * target_hash,CodeTableWriterInterface * coder) const77 inline size_t VCDiffEngine::EncodeCopyForBestMatch(
78     uint32_t hash_value,
79     const char* target_candidate_start,
80     const char* unencoded_target_start,
81     size_t unencoded_target_size,
82     const BlockHash* target_hash,
83     CodeTableWriterInterface* coder) const {
84   // When FindBestMatch() comes up with a match for a candidate block,
85   // it will populate best_match with the size, source offset,
86   // and target offset of the match.
87   BlockHash::Match best_match;
88 
89   // First look for a match in the dictionary.
90   hashed_dictionary_->FindBestMatch(hash_value,
91                                     target_candidate_start,
92                                     unencoded_target_start,
93                                     unencoded_target_size,
94                                     &best_match);
95   // If target matching is enabled, then see if there is a better match
96   // within the target data that has been encoded so far.
97   if (look_for_target_matches) {
98     target_hash->FindBestMatch(hash_value,
99                                target_candidate_start,
100                                unencoded_target_start,
101                                unencoded_target_size,
102                                &best_match);
103   }
104   if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) {
105     return 0;
106   }
107   if (best_match.target_offset() > 0) {
108     // Create an ADD instruction to encode all target bytes
109     // from the end of the last COPY match, if any, up to
110     // the beginning of this COPY match.
111     coder->Add(unencoded_target_start, best_match.target_offset());
112   }
113   coder->Copy(best_match.source_offset(), best_match.size());
114   return best_match.target_offset()  // ADD size
115        + best_match.size();          // + COPY size
116 }
117 
118 // Once the encoder loop has finished checking for matches in the target data,
119 // this function creates an ADD instruction to encode all target bytes
120 // from the end of the last COPY match, if any, through the end of
121 // the target data.  In the worst case, if no matches were found at all,
122 // this function will create one big ADD instruction
123 // for the entire buffer of target data.
AddUnmatchedRemainder(const char * unencoded_target_start,size_t unencoded_target_size,CodeTableWriterInterface * coder) const124 inline void VCDiffEngine::AddUnmatchedRemainder(
125     const char* unencoded_target_start,
126     size_t unencoded_target_size,
127     CodeTableWriterInterface* coder) const {
128   if (unencoded_target_size > 0) {
129     coder->Add(unencoded_target_start, unencoded_target_size);
130   }
131 }
132 
133 // This helper function tells the coder to finish the encoding and write
134 // the results into the output string "diff".
FinishEncoding(size_t target_size,OutputStringInterface * diff,CodeTableWriterInterface * coder) const135 inline void VCDiffEngine::FinishEncoding(
136     size_t target_size,
137     OutputStringInterface* diff,
138     CodeTableWriterInterface* coder) const {
139   if (target_size != static_cast<size_t>(coder->target_length())) {
140     VCD_DFATAL << "Internal error in VCDiffEngine::Encode: "
141                   "original target size (" << target_size
142                << ") does not match number of bytes processed ("
143                << coder->target_length() << ")" << VCD_ENDL;
144   }
145   coder->Output(diff);
146 }
147 
148 template<bool look_for_target_matches>
EncodeInternal(const char * target_data,size_t target_size,OutputStringInterface * diff,CodeTableWriterInterface * coder) const149 void VCDiffEngine::EncodeInternal(const char* target_data,
150                                   size_t target_size,
151                                   OutputStringInterface* diff,
152                                   CodeTableWriterInterface* coder) const {
153   if (!hashed_dictionary_) {
154     VCD_DFATAL << "Internal error: VCDiffEngine::Encode() "
155                   "called before VCDiffEngine::Init()" << VCD_ENDL;
156     return;
157   }
158   if (target_size == 0) {
159     return;  // Do nothing for empty target
160   }
161   // Special case for really small input
162   if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) {
163     AddUnmatchedRemainder(target_data, target_size, coder);
164     FinishEncoding(target_size, diff, coder);
165     return;
166   }
167   RollingHash<BlockHash::kBlockSize> hasher;
168   BlockHash* target_hash = NULL;
169   if (look_for_target_matches) {
170     // Check matches against previously encoded target data
171     // in this same target window, as well as against the dictionary
172     target_hash = BlockHash::CreateTargetHash(target_data,
173                                               target_size,
174                                               dictionary_size());
175     if (!target_hash) {
176       VCD_DFATAL << "Instantiation of target hash failed" << VCD_ENDL;
177       return;
178     }
179   }
180   const char* const target_end = target_data + target_size;
181   const char* const start_of_last_block = target_end - BlockHash::kBlockSize;
182   // Offset of next bytes in string to ADD if NOT copied (i.e., not found in
183   // dictionary)
184   const char* next_encode = target_data;
185   // candidate_pos points to the start of the kBlockSize-byte block that may
186   // begin a match with the dictionary or previously encoded target data.
187   const char* candidate_pos = target_data;
188   uint32_t hash_value = hasher.Hash(candidate_pos);
189   while (1) {
190     const size_t bytes_encoded =
191         EncodeCopyForBestMatch<look_for_target_matches>(
192             hash_value,
193             candidate_pos,
194             next_encode,
195             (target_end - next_encode),
196             target_hash,
197             coder);
198     if (bytes_encoded > 0) {
199       next_encode += bytes_encoded;  // Advance past COPYed data
200       candidate_pos = next_encode;
201       if (candidate_pos > start_of_last_block) {
202         break;  // Reached end of target data
203       }
204       // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash
205       // can't be used to calculate the hash value at its new position.
206       hash_value = hasher.Hash(candidate_pos);
207       if (look_for_target_matches) {
208         // Update the target hash for the ADDed and COPYed data
209         target_hash->AddAllBlocksThroughIndex(
210             static_cast<int>(next_encode - target_data));
211       }
212     } else {
213       // No match, or match is too small to be worth a COPY instruction.
214       // Move to the next position in the target data.
215       if ((candidate_pos + 1) > start_of_last_block) {
216         break;  // Reached end of target data
217       }
218       if (look_for_target_matches) {
219         target_hash->AddOneIndexHash(
220             static_cast<int>(candidate_pos - target_data),
221             hash_value);
222       }
223       hash_value = hasher.UpdateHash(hash_value,
224                                      candidate_pos[0],
225                                      candidate_pos[BlockHash::kBlockSize]);
226       ++candidate_pos;
227     }
228   }
229   AddUnmatchedRemainder(next_encode, target_end - next_encode, coder);
230   FinishEncoding(target_size, diff, coder);
231   delete target_hash;
232 }
233 
Encode(const char * target_data,size_t target_size,bool look_for_target_matches,OutputStringInterface * diff,CodeTableWriterInterface * coder) const234 void VCDiffEngine::Encode(const char* target_data,
235                           size_t target_size,
236                           bool look_for_target_matches,
237                           OutputStringInterface* diff,
238                           CodeTableWriterInterface* coder) const {
239   if (look_for_target_matches) {
240     EncodeInternal<true>(target_data, target_size, diff, coder);
241   } else {
242     EncodeInternal<false>(target_data, target_size, diff, coder);
243   }
244 }
245 
246 }  // namespace open_vcdiff
247