1 // Copyright 2006, 2008 Google Inc.
2 // Authors: Chandra Chereddi, Lincoln Smith
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 //
8 // http://www.apache.org/licenses/LICENSE-2.0
9 //
10 // Unless required by applicable law or agreed to in writing, software
11 // distributed under the License is distributed on an "AS IS" BASIS,
12 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 // See the License for the specific language governing permissions and
14 // limitations under the License.
15
16 #include <config.h>
17 #include "vcdiffengine.h"
18 #include <stdint.h> // uint32_t
19 #include <string.h> // memcpy
20 #include "blockhash.h"
21 #include "codetablewriter_interface.h"
22 #include "logging.h"
23 #include "rolling_hash.h"
24
25 namespace open_vcdiff {
26
VCDiffEngine(const char * dictionary,size_t dictionary_size)27 VCDiffEngine::VCDiffEngine(const char* dictionary, size_t dictionary_size)
28 // If dictionary_size == 0, then dictionary could be NULL. Guard against
29 // using a NULL value.
30 : dictionary_((dictionary_size > 0) ? new char[dictionary_size] : ""),
31 dictionary_size_(dictionary_size),
32 hashed_dictionary_(NULL) {
33 if (dictionary_size > 0) {
34 memcpy(const_cast<char*>(dictionary_), dictionary, dictionary_size);
35 }
36 }
37
~VCDiffEngine()38 VCDiffEngine::~VCDiffEngine() {
39 delete hashed_dictionary_;
40 if (dictionary_size_ > 0) {
41 delete[] dictionary_;
42 }
43 }
44
Init()45 bool VCDiffEngine::Init() {
46 if (hashed_dictionary_) {
47 VCD_DFATAL << "Init() called twice for same VCDiffEngine object"
48 << VCD_ENDL;
49 return false;
50 }
51 hashed_dictionary_ = BlockHash::CreateDictionaryHash(dictionary_,
52 dictionary_size());
53 if (!hashed_dictionary_) {
54 VCD_DFATAL << "Creation of dictionary hash failed" << VCD_ENDL;
55 return false;
56 }
57 RollingHash<BlockHash::kBlockSize>::Init();
58 return true;
59 }
60
61 // This helper function tries to find an appropriate match within
62 // hashed_dictionary_ for the block starting at the current target position.
63 // If target_hash is not NULL, this function will also look for a match
64 // within the previously encoded target data.
65 //
66 // If a match is found, this function will generate an ADD instruction
67 // for all unencoded data that precedes the match,
68 // and a COPY instruction for the match itself; then it returns
69 // the number of bytes processed by both instructions,
70 // which is guaranteed to be > 0.
71 // If no appropriate match is found, the function returns 0.
72 //
73 // The first four parameters are input parameters which are passed
74 // directly to BlockHash::FindBestMatch; please see that function
75 // for a description of their allowable values.
76 template<bool look_for_target_matches>
EncodeCopyForBestMatch(uint32_t hash_value,const char * target_candidate_start,const char * unencoded_target_start,size_t unencoded_target_size,const BlockHash * target_hash,CodeTableWriterInterface * coder) const77 inline size_t VCDiffEngine::EncodeCopyForBestMatch(
78 uint32_t hash_value,
79 const char* target_candidate_start,
80 const char* unencoded_target_start,
81 size_t unencoded_target_size,
82 const BlockHash* target_hash,
83 CodeTableWriterInterface* coder) const {
84 // When FindBestMatch() comes up with a match for a candidate block,
85 // it will populate best_match with the size, source offset,
86 // and target offset of the match.
87 BlockHash::Match best_match;
88
89 // First look for a match in the dictionary.
90 hashed_dictionary_->FindBestMatch(hash_value,
91 target_candidate_start,
92 unencoded_target_start,
93 unencoded_target_size,
94 &best_match);
95 // If target matching is enabled, then see if there is a better match
96 // within the target data that has been encoded so far.
97 if (look_for_target_matches) {
98 target_hash->FindBestMatch(hash_value,
99 target_candidate_start,
100 unencoded_target_start,
101 unencoded_target_size,
102 &best_match);
103 }
104 if (!ShouldGenerateCopyInstructionForMatchOfSize(best_match.size())) {
105 return 0;
106 }
107 if (best_match.target_offset() > 0) {
108 // Create an ADD instruction to encode all target bytes
109 // from the end of the last COPY match, if any, up to
110 // the beginning of this COPY match.
111 coder->Add(unencoded_target_start, best_match.target_offset());
112 }
113 coder->Copy(best_match.source_offset(), best_match.size());
114 return best_match.target_offset() // ADD size
115 + best_match.size(); // + COPY size
116 }
117
118 // Once the encoder loop has finished checking for matches in the target data,
119 // this function creates an ADD instruction to encode all target bytes
120 // from the end of the last COPY match, if any, through the end of
121 // the target data. In the worst case, if no matches were found at all,
122 // this function will create one big ADD instruction
123 // for the entire buffer of target data.
AddUnmatchedRemainder(const char * unencoded_target_start,size_t unencoded_target_size,CodeTableWriterInterface * coder) const124 inline void VCDiffEngine::AddUnmatchedRemainder(
125 const char* unencoded_target_start,
126 size_t unencoded_target_size,
127 CodeTableWriterInterface* coder) const {
128 if (unencoded_target_size > 0) {
129 coder->Add(unencoded_target_start, unencoded_target_size);
130 }
131 }
132
133 // This helper function tells the coder to finish the encoding and write
134 // the results into the output string "diff".
FinishEncoding(size_t target_size,OutputStringInterface * diff,CodeTableWriterInterface * coder) const135 inline void VCDiffEngine::FinishEncoding(
136 size_t target_size,
137 OutputStringInterface* diff,
138 CodeTableWriterInterface* coder) const {
139 if (target_size != static_cast<size_t>(coder->target_length())) {
140 VCD_DFATAL << "Internal error in VCDiffEngine::Encode: "
141 "original target size (" << target_size
142 << ") does not match number of bytes processed ("
143 << coder->target_length() << ")" << VCD_ENDL;
144 }
145 coder->Output(diff);
146 }
147
148 template<bool look_for_target_matches>
EncodeInternal(const char * target_data,size_t target_size,OutputStringInterface * diff,CodeTableWriterInterface * coder) const149 void VCDiffEngine::EncodeInternal(const char* target_data,
150 size_t target_size,
151 OutputStringInterface* diff,
152 CodeTableWriterInterface* coder) const {
153 if (!hashed_dictionary_) {
154 VCD_DFATAL << "Internal error: VCDiffEngine::Encode() "
155 "called before VCDiffEngine::Init()" << VCD_ENDL;
156 return;
157 }
158 if (target_size == 0) {
159 return; // Do nothing for empty target
160 }
161 // Special case for really small input
162 if (target_size < static_cast<size_t>(BlockHash::kBlockSize)) {
163 AddUnmatchedRemainder(target_data, target_size, coder);
164 FinishEncoding(target_size, diff, coder);
165 return;
166 }
167 RollingHash<BlockHash::kBlockSize> hasher;
168 BlockHash* target_hash = NULL;
169 if (look_for_target_matches) {
170 // Check matches against previously encoded target data
171 // in this same target window, as well as against the dictionary
172 target_hash = BlockHash::CreateTargetHash(target_data,
173 target_size,
174 dictionary_size());
175 if (!target_hash) {
176 VCD_DFATAL << "Instantiation of target hash failed" << VCD_ENDL;
177 return;
178 }
179 }
180 const char* const target_end = target_data + target_size;
181 const char* const start_of_last_block = target_end - BlockHash::kBlockSize;
182 // Offset of next bytes in string to ADD if NOT copied (i.e., not found in
183 // dictionary)
184 const char* next_encode = target_data;
185 // candidate_pos points to the start of the kBlockSize-byte block that may
186 // begin a match with the dictionary or previously encoded target data.
187 const char* candidate_pos = target_data;
188 uint32_t hash_value = hasher.Hash(candidate_pos);
189 while (1) {
190 const size_t bytes_encoded =
191 EncodeCopyForBestMatch<look_for_target_matches>(
192 hash_value,
193 candidate_pos,
194 next_encode,
195 (target_end - next_encode),
196 target_hash,
197 coder);
198 if (bytes_encoded > 0) {
199 next_encode += bytes_encoded; // Advance past COPYed data
200 candidate_pos = next_encode;
201 if (candidate_pos > start_of_last_block) {
202 break; // Reached end of target data
203 }
204 // candidate_pos has jumped ahead by bytes_encoded bytes, so UpdateHash
205 // can't be used to calculate the hash value at its new position.
206 hash_value = hasher.Hash(candidate_pos);
207 if (look_for_target_matches) {
208 // Update the target hash for the ADDed and COPYed data
209 target_hash->AddAllBlocksThroughIndex(
210 static_cast<int>(next_encode - target_data));
211 }
212 } else {
213 // No match, or match is too small to be worth a COPY instruction.
214 // Move to the next position in the target data.
215 if ((candidate_pos + 1) > start_of_last_block) {
216 break; // Reached end of target data
217 }
218 if (look_for_target_matches) {
219 target_hash->AddOneIndexHash(
220 static_cast<int>(candidate_pos - target_data),
221 hash_value);
222 }
223 hash_value = hasher.UpdateHash(hash_value,
224 candidate_pos[0],
225 candidate_pos[BlockHash::kBlockSize]);
226 ++candidate_pos;
227 }
228 }
229 AddUnmatchedRemainder(next_encode, target_end - next_encode, coder);
230 FinishEncoding(target_size, diff, coder);
231 delete target_hash;
232 }
233
Encode(const char * target_data,size_t target_size,bool look_for_target_matches,OutputStringInterface * diff,CodeTableWriterInterface * coder) const234 void VCDiffEngine::Encode(const char* target_data,
235 size_t target_size,
236 bool look_for_target_matches,
237 OutputStringInterface* diff,
238 CodeTableWriterInterface* coder) const {
239 if (look_for_target_matches) {
240 EncodeInternal<true>(target_data, target_size, diff, coder);
241 } else {
242 EncodeInternal<false>(target_data, target_size, diff, coder);
243 }
244 }
245
246 } // namespace open_vcdiff
247