1 // Copyright 2017 The Chromium OS Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "puffin/src/puffin_stream.h"
6 
7 #include <algorithm>
8 #include <memory>
9 #include <string>
10 #include <utility>
11 #include <vector>
12 
13 #include "puffin/src/bit_reader.h"
14 #include "puffin/src/bit_writer.h"
15 #include "puffin/src/include/puffin/common.h"
16 #include "puffin/src/include/puffin/huffer.h"
17 #include "puffin/src/include/puffin/puffer.h"
18 #include "puffin/src/include/puffin/stream.h"
19 #include "puffin/src/logging.h"
20 #include "puffin/src/puff_reader.h"
21 #include "puffin/src/puff_writer.h"
22 
23 using std::shared_ptr;
24 using std::unique_ptr;
25 using std::vector;
26 
27 namespace puffin {
28 
29 namespace {
30 
CheckArgsIntegrity(uint64_t puff_size,const vector<BitExtent> & deflates,const vector<ByteExtent> & puffs)31 bool CheckArgsIntegrity(uint64_t puff_size,
32                         const vector<BitExtent>& deflates,
33                         const vector<ByteExtent>& puffs) {
34   TEST_AND_RETURN_FALSE(puffs.size() == deflates.size());
35   // Check if the |puff_size| is actually greater than the last byte of the last
36   // puff in |puffs|.
37   if (!puffs.empty()) {
38     TEST_AND_RETURN_FALSE(puff_size >=
39                           puffs.back().offset + puffs.back().length);
40   }
41 
42   // Check to make sure |puffs| and |deflates| are sorted and non-overlapping.
43   auto is_overlap = [](const auto& a, const auto& b) {
44     return (a.offset + a.length) > b.offset;
45   };
46   TEST_AND_RETURN_FALSE(deflates.end() == std::adjacent_find(deflates.begin(),
47                                                              deflates.end(),
48                                                              is_overlap));
49   TEST_AND_RETURN_FALSE(puffs.end() == std::adjacent_find(puffs.begin(),
50                                                           puffs.end(),
51                                                           is_overlap));
52   return true;
53 }
54 
55 }  // namespace
56 
CreateForPuff(UniqueStreamPtr stream,shared_ptr<Puffer> puffer,uint64_t puff_size,const vector<BitExtent> & deflates,const vector<ByteExtent> & puffs,size_t max_cache_size)57 UniqueStreamPtr PuffinStream::CreateForPuff(UniqueStreamPtr stream,
58                                             shared_ptr<Puffer> puffer,
59                                             uint64_t puff_size,
60                                             const vector<BitExtent>& deflates,
61                                             const vector<ByteExtent>& puffs,
62                                             size_t max_cache_size) {
63   TEST_AND_RETURN_VALUE(CheckArgsIntegrity(puff_size, deflates, puffs),
64                         nullptr);
65   TEST_AND_RETURN_VALUE(stream->Seek(0), nullptr);
66 
67   UniqueStreamPtr puffin_stream(new PuffinStream(std::move(stream), puffer,
68                                                  nullptr, puff_size, deflates,
69                                                  puffs, max_cache_size));
70   TEST_AND_RETURN_VALUE(puffin_stream->Seek(0), nullptr);
71   return puffin_stream;
72 }
73 
CreateForHuff(UniqueStreamPtr stream,shared_ptr<Huffer> huffer,uint64_t puff_size,const vector<BitExtent> & deflates,const vector<ByteExtent> & puffs)74 UniqueStreamPtr PuffinStream::CreateForHuff(UniqueStreamPtr stream,
75                                             shared_ptr<Huffer> huffer,
76                                             uint64_t puff_size,
77                                             const vector<BitExtent>& deflates,
78                                             const vector<ByteExtent>& puffs) {
79   TEST_AND_RETURN_VALUE(CheckArgsIntegrity(puff_size, deflates, puffs),
80                         nullptr);
81   TEST_AND_RETURN_VALUE(stream->Seek(0), nullptr);
82 
83   UniqueStreamPtr puffin_stream(new PuffinStream(
84       std::move(stream), nullptr, huffer, puff_size, deflates, puffs, 0));
85   TEST_AND_RETURN_VALUE(puffin_stream->Seek(0), nullptr);
86   return puffin_stream;
87 }
88 
PuffinStream(UniqueStreamPtr stream,shared_ptr<Puffer> puffer,shared_ptr<Huffer> huffer,uint64_t puff_size,const vector<BitExtent> & deflates,const vector<ByteExtent> & puffs,size_t max_cache_size)89 PuffinStream::PuffinStream(UniqueStreamPtr stream,
90                            shared_ptr<Puffer> puffer,
91                            shared_ptr<Huffer> huffer,
92                            uint64_t puff_size,
93                            const vector<BitExtent>& deflates,
94                            const vector<ByteExtent>& puffs,
95                            size_t max_cache_size)
96     : stream_(std::move(stream)),
97       puffer_(puffer),
98       huffer_(huffer),
99       puff_stream_size_(puff_size),
100       deflates_(deflates),
101       puffs_(puffs),
102       puff_pos_(0),
103       skip_bytes_(0),
104       deflate_bit_pos_(0),
105       last_byte_(0),
106       extra_byte_(0),
107       is_for_puff_(puffer_ ? true : false),
108       closed_(false),
109       max_cache_size_(max_cache_size),
110       cur_cache_size_(0) {
111   // Building upper bounds for faster seek.
112   upper_bounds_.reserve(puffs.size());
113   for (const auto& puff : puffs) {
114     upper_bounds_.emplace_back(puff.offset + puff.length);
115   }
116   upper_bounds_.emplace_back(puff_stream_size_ + 1);
117 
118   // We can pass the size of the deflate stream too, but it is not necessary
119   // yet. We cannot get the size of stream from itself, because we might be
120   // writing into it and its size is not defined yet.
121   uint64_t deflate_stream_size = puff_stream_size_;
122   if (!puffs.empty()) {
123     deflate_stream_size =
124         ((deflates.back().offset + deflates.back().length) / 8) +
125         puff_stream_size_ - (puffs.back().offset + puffs.back().length);
126   }
127 
128   deflates_.emplace_back(deflate_stream_size * 8, 0);
129   puffs_.emplace_back(puff_stream_size_, 0);
130 
131   // Look for the largest puff and deflate extents and get proper size buffers.
132   uint64_t max_puff_length = 0;
133   for (const auto& puff : puffs) {
134     max_puff_length = std::max(max_puff_length, puff.length);
135   }
136   puff_buffer_.reset(new Buffer(max_puff_length + 1));
137   if (max_cache_size_ < max_puff_length) {
138     max_cache_size_ = 0;  // It means we are not caching puffs.
139   }
140 
141   uint64_t max_deflate_length = 0;
142   for (const auto& deflate : deflates) {
143     max_deflate_length = std::max(max_deflate_length, deflate.length * 8);
144   }
145   deflate_buffer_.reset(new Buffer(max_deflate_length + 2));
146 }
147 
GetSize(uint64_t * size) const148 bool PuffinStream::GetSize(uint64_t* size) const {
149   *size = puff_stream_size_;
150   return true;
151 }
152 
GetOffset(uint64_t * offset) const153 bool PuffinStream::GetOffset(uint64_t* offset) const {
154   *offset = puff_pos_ + skip_bytes_;
155   return true;
156 }
157 
Seek(uint64_t offset)158 bool PuffinStream::Seek(uint64_t offset) {
159   TEST_AND_RETURN_FALSE(!closed_);
160   if (!is_for_puff_) {
161     // For huffing we should not seek, only seek to zero is accepted.
162     TEST_AND_RETURN_FALSE(offset == 0);
163   }
164 
165   TEST_AND_RETURN_FALSE(offset <= puff_stream_size_);
166 
167   // We are searching first available puff which either includes the |offset| or
168   // it is the next available puff after |offset|.
169   auto next_puff_iter =
170       std::upper_bound(upper_bounds_.begin(), upper_bounds_.end(), offset);
171   TEST_AND_RETURN_FALSE(next_puff_iter != upper_bounds_.end());
172   auto next_puff_idx = std::distance(upper_bounds_.begin(), next_puff_iter);
173   cur_puff_ = std::next(puffs_.begin(), next_puff_idx);
174   cur_deflate_ = std::next(deflates_.begin(), next_puff_idx);
175 
176   if (offset < cur_puff_->offset) {
177     // between two puffs.
178     puff_pos_ = offset;
179     auto back_track_bytes = cur_puff_->offset - puff_pos_;
180     deflate_bit_pos_ = ((cur_deflate_->offset + 7) / 8 - back_track_bytes) * 8;
181     if (cur_puff_ != puffs_.begin()) {
182       auto prev_deflate = std::prev(cur_deflate_);
183       if (deflate_bit_pos_ < prev_deflate->offset + prev_deflate->length) {
184         deflate_bit_pos_ = prev_deflate->offset + prev_deflate->length;
185       }
186     }
187   } else {
188     // Inside a puff.
189     puff_pos_ = cur_puff_->offset;
190     deflate_bit_pos_ = cur_deflate_->offset;
191   }
192   skip_bytes_ = offset - puff_pos_;
193   if (!is_for_puff_ && offset == 0) {
194     TEST_AND_RETURN_FALSE(stream_->Seek(0));
195     TEST_AND_RETURN_FALSE(SetExtraByte());
196   }
197   return true;
198 }
199 
Close()200 bool PuffinStream::Close() {
201   closed_ = true;
202   return stream_->Close();
203 }
204 
Read(void * buffer,size_t count)205 bool PuffinStream::Read(void* buffer, size_t count) {
206   TEST_AND_RETURN_FALSE(!closed_);
207   TEST_AND_RETURN_FALSE(is_for_puff_);
208   if (cur_puff_ == puffs_.end()) {
209     TEST_AND_RETURN_FALSE(count == 0);
210   }
211   auto bytes = static_cast<uint8_t*>(buffer);
212   uint64_t length = count;
213   uint64_t bytes_read = 0;
214   while (bytes_read < length) {
215     if (puff_pos_ < cur_puff_->offset) {
216       // Reading between two deflates. We also read bytes that have at least one
217       // bit of a deflate bit stream. The byte which has both deflate and raw
218       // data will be shifted or masked off the deflate bits and the remaining
219       // value will be saved in the puff stream as an byte integer.
220       uint64_t start_byte = (deflate_bit_pos_ / 8);
221       uint64_t end_byte = (cur_deflate_->offset + 7) / 8;
222       auto bytes_to_read = std::min(length - bytes_read, end_byte - start_byte);
223       TEST_AND_RETURN_FALSE(bytes_to_read >= 1);
224 
225       TEST_AND_RETURN_FALSE(stream_->Seek(start_byte));
226       TEST_AND_RETURN_FALSE(stream_->Read(bytes + bytes_read, bytes_to_read));
227 
228       // If true, we read the first byte of the curret deflate. So we have to
229       // mask out the deflate bits (which are most significant bits.)
230       if ((start_byte + bytes_to_read) * 8 > cur_deflate_->offset) {
231         bytes[bytes_read + bytes_to_read - 1] &=
232             (1 << (cur_deflate_->offset & 7)) - 1;
233       }
234 
235       // If true, we read the last byte of the previous deflate and we have to
236       // shift it out. The least significat bits belongs to the deflate
237       // stream. The order of these last two conditions are important because a
238       // byte can contain a finishing deflate and a starting deflate with some
239       // bits between them so we have to modify correctly. Keep in mind that in
240       // this situation both are modifying the same byte.
241       if (start_byte * 8 < deflate_bit_pos_) {
242         bytes[bytes_read] >>= deflate_bit_pos_ & 7;
243       }
244 
245       // Pass |deflate_bit_pos_| for all the read bytes.
246       deflate_bit_pos_ -= deflate_bit_pos_ & 7;
247       deflate_bit_pos_ += bytes_to_read * 8;
248       if (deflate_bit_pos_ > cur_deflate_->offset) {
249         // In case it reads into the first byte of the current deflate.
250         deflate_bit_pos_ = cur_deflate_->offset;
251       }
252 
253       bytes_read += bytes_to_read;
254       puff_pos_ += bytes_to_read;
255       TEST_AND_RETURN_FALSE(puff_pos_ <= cur_puff_->offset);
256     } else {
257       // Reading the deflate itself. We read all bytes including the first and
258       // last byte (which may partially include a deflate bit). Here we keep the
259       // |puff_pos_| point to the first byte of the puffed stream and
260       // |skip_bytes_| shows how many bytes in the puff we have copied till now.
261       auto start_byte = (cur_deflate_->offset / 8);
262       auto end_byte = (cur_deflate_->offset + cur_deflate_->length + 7) / 8;
263       auto bytes_to_read = end_byte - start_byte;
264       // Puff directly to buffer if it has space.
265       bool puff_directly_into_buffer =
266           max_cache_size_ == 0 && (skip_bytes_ == 0) &&
267           (length - bytes_read >= cur_puff_->length);
268 
269       auto cur_puff_idx = std::distance(puffs_.begin(), cur_puff_);
270       if (max_cache_size_ == 0 ||
271           !GetPuffCache(cur_puff_idx, cur_puff_->length, &puff_buffer_)) {
272         // Did not find the puff buffer in cache. We have to build it.
273         deflate_buffer_->resize(bytes_to_read);
274         TEST_AND_RETURN_FALSE(stream_->Seek(start_byte));
275         TEST_AND_RETURN_FALSE(
276             stream_->Read(deflate_buffer_->data(), bytes_to_read));
277         BufferBitReader bit_reader(deflate_buffer_->data(), bytes_to_read);
278 
279         BufferPuffWriter puff_writer(puff_directly_into_buffer
280                                          ? bytes + bytes_read
281                                          : puff_buffer_->data(),
282                                      cur_puff_->length);
283 
284         // Drop the first unused bits.
285         size_t extra_bits_len = cur_deflate_->offset & 7;
286         TEST_AND_RETURN_FALSE(bit_reader.CacheBits(extra_bits_len));
287         bit_reader.DropBits(extra_bits_len);
288 
289         TEST_AND_RETURN_FALSE(
290             puffer_->PuffDeflate(&bit_reader, &puff_writer, nullptr));
291         TEST_AND_RETURN_FALSE(bytes_to_read == bit_reader.Offset());
292         TEST_AND_RETURN_FALSE(cur_puff_->length == puff_writer.Size());
293       } else {
294         // Just seek to proper location.
295         TEST_AND_RETURN_FALSE(stream_->Seek(start_byte + bytes_to_read));
296       }
297       // Copy from puff buffer to output if needed.
298       auto bytes_to_copy =
299           std::min(length - bytes_read, cur_puff_->length - skip_bytes_);
300       if (!puff_directly_into_buffer) {
301         memcpy(bytes + bytes_read, puff_buffer_->data() + skip_bytes_,
302                bytes_to_copy);
303       }
304 
305       skip_bytes_ += bytes_to_copy;
306       bytes_read += bytes_to_copy;
307 
308       // Move to next puff.
309       if (puff_pos_ + skip_bytes_ == cur_puff_->offset + cur_puff_->length) {
310         puff_pos_ += skip_bytes_;
311         skip_bytes_ = 0;
312         deflate_bit_pos_ = cur_deflate_->offset + cur_deflate_->length;
313         cur_puff_++;
314         cur_deflate_++;
315         if (cur_puff_ == puffs_.end()) {
316           break;
317         }
318       }
319     }
320   }
321 
322   TEST_AND_RETURN_FALSE(bytes_read == length);
323   return true;
324 }
325 
Write(const void * buffer,size_t count)326 bool PuffinStream::Write(const void* buffer, size_t count) {
327   TEST_AND_RETURN_FALSE(!closed_);
328   TEST_AND_RETURN_FALSE(!is_for_puff_);
329   auto bytes = static_cast<const uint8_t*>(buffer);
330   uint64_t length = count;
331   uint64_t bytes_wrote = 0;
332   while (bytes_wrote < length) {
333     if (deflate_bit_pos_ < (cur_deflate_->offset & ~7ull)) {
334       // Between two puffs or before the first puff. We know that we are
335       // starting from the byte boundary because we have already processed the
336       // non-deflate bits of the last byte of the last deflate. Here we don't
337       // process any byte that has deflate bit.
338       TEST_AND_RETURN_FALSE((deflate_bit_pos_ & 7) == 0);
339       auto copy_len =
340           std::min((cur_deflate_->offset / 8) - (deflate_bit_pos_ / 8),
341                    length - bytes_wrote);
342       TEST_AND_RETURN_FALSE(stream_->Write(bytes + bytes_wrote, copy_len));
343       bytes_wrote += copy_len;
344       puff_pos_ += copy_len;
345       deflate_bit_pos_ += copy_len * 8;
346     } else {
347       // We are in a puff. We have to buffer incoming bytes until we reach the
348       // size of the current puff so we can huff :). If the last bit of the
349       // current deflate does not end in a byte boundary, then we have to read
350       // one more byte to fill up the last byte of the deflate stream before
351       // doing anything else.
352 
353       // |deflate_bit_pos_| now should be in the same byte as
354       // |cur_deflate->offset|.
355       if (deflate_bit_pos_ < cur_deflate_->offset) {
356         last_byte_ |= bytes[bytes_wrote++] << (deflate_bit_pos_ & 7);
357         skip_bytes_ = 0;
358         deflate_bit_pos_ = cur_deflate_->offset;
359         puff_pos_++;
360         TEST_AND_RETURN_FALSE(puff_pos_ == cur_puff_->offset);
361       }
362 
363       auto copy_len = std::min(length - bytes_wrote,
364                                cur_puff_->length + extra_byte_ - skip_bytes_);
365       TEST_AND_RETURN_FALSE(puff_buffer_->size() >= skip_bytes_ + copy_len);
366       memcpy(puff_buffer_->data() + skip_bytes_, bytes + bytes_wrote, copy_len);
367       skip_bytes_ += copy_len;
368       bytes_wrote += copy_len;
369 
370       if (skip_bytes_ == cur_puff_->length + extra_byte_) {
371         // |puff_buffer_| is full, now huff into the |deflate_buffer_|.
372         auto start_byte = cur_deflate_->offset / 8;
373         auto end_byte = (cur_deflate_->offset + cur_deflate_->length + 7) / 8;
374         auto bytes_to_write = end_byte - start_byte;
375 
376         deflate_buffer_->resize(bytes_to_write);
377         BufferBitWriter bit_writer(deflate_buffer_->data(), bytes_to_write);
378         BufferPuffReader puff_reader(puff_buffer_->data(), cur_puff_->length);
379 
380         // Write last byte if it has any.
381         TEST_AND_RETURN_FALSE(
382             bit_writer.WriteBits(cur_deflate_->offset & 7, last_byte_));
383         last_byte_ = 0;
384 
385         TEST_AND_RETURN_FALSE(huffer_->HuffDeflate(&puff_reader, &bit_writer));
386         TEST_AND_RETURN_FALSE(bit_writer.Size() == bytes_to_write);
387         TEST_AND_RETURN_FALSE(puff_reader.BytesLeft() == 0);
388 
389         deflate_bit_pos_ = cur_deflate_->offset + cur_deflate_->length;
390         if (extra_byte_ == 1) {
391           deflate_buffer_->data()[bytes_to_write - 1] |=
392               puff_buffer_->data()[cur_puff_->length] << (deflate_bit_pos_ & 7);
393           deflate_bit_pos_ = (deflate_bit_pos_ + 7) & ~7ull;
394         } else if ((deflate_bit_pos_ & 7) != 0) {
395           // This happens if current and next deflate finish and end on the same
396           // byte, then we cannot write into output until we have huffed the
397           // next puff buffer, so untill then we cache it into |last_byte_| and
398           // we won't write it out.
399           last_byte_ = deflate_buffer_->data()[bytes_to_write - 1];
400           bytes_to_write--;
401         }
402 
403         // Write |deflate_buffer_| into output.
404         TEST_AND_RETURN_FALSE(
405             stream_->Write(deflate_buffer_->data(), bytes_to_write));
406 
407         // Move to the next deflate/puff.
408         puff_pos_ += skip_bytes_;
409         skip_bytes_ = 0;
410         cur_puff_++;
411         cur_deflate_++;
412         if (cur_puff_ == puffs_.end()) {
413           break;
414         }
415         // Find if need an extra byte to cache at the end.
416         TEST_AND_RETURN_FALSE(SetExtraByte());
417       }
418     }
419   }
420 
421   TEST_AND_RETURN_FALSE(bytes_wrote == length);
422   return true;
423 }
424 
SetExtraByte()425 bool PuffinStream::SetExtraByte() {
426   TEST_AND_RETURN_FALSE(cur_deflate_ != deflates_.end());
427   if ((cur_deflate_ + 1) == deflates_.end()) {
428     extra_byte_ = 0;
429     return true;
430   }
431   uint64_t end_bit = cur_deflate_->offset + cur_deflate_->length;
432   if ((end_bit & 7) && ((end_bit + 7) & ~7ull) <= (cur_deflate_ + 1)->offset) {
433     extra_byte_ = 1;
434   } else {
435     extra_byte_ = 0;
436   }
437   return true;
438 }
439 
GetPuffCache(int puff_id,uint64_t puff_size,shared_ptr<Buffer> * buffer)440 bool PuffinStream::GetPuffCache(int puff_id,
441                                 uint64_t puff_size,
442                                 shared_ptr<Buffer>* buffer) {
443   bool found = false;
444   // Search for it.
445   std::pair<int, shared_ptr<Buffer>> cache;
446   // TODO(*): Find a faster way of doing this? Maybe change the data structure
447   // that supports faster search.
448   for (auto iter = caches_.begin(); iter != caches_.end(); ++iter) {
449     if (iter->first == puff_id) {
450       cache = std::move(*iter);
451       found = true;
452       // Remove it so later we can add it to the begining of the list.
453       caches_.erase(iter);
454       break;
455     }
456   }
457 
458   // If not found, either create one or get one from the list.
459   if (!found) {
460     // If |caches_| were full, remove last ones in the list (least used), until
461     // we have enough space for the new cache.
462     while (!caches_.empty() && cur_cache_size_ + puff_size > max_cache_size_) {
463       cache = std::move(caches_.back());
464       caches_.pop_back();  // Remove it from the list.
465       cur_cache_size_ -= cache.second->capacity();
466     }
467     // If we have not populated the cache yet, create one.
468     if (!cache.second) {
469       cache.second.reset(new Buffer(puff_size));
470     }
471     cache.second->resize(puff_size);
472 
473     constexpr uint64_t kMaxSizeDifference = 20 * 1024;
474     if (puff_size + kMaxSizeDifference < cache.second->capacity()) {
475       cache.second->shrink_to_fit();
476     }
477 
478     cur_cache_size_ += cache.second->capacity();
479     cache.first = puff_id;
480   }
481 
482   *buffer = cache.second;
483   // By now we have either removed a cache or created new one. Now we have to
484   // insert it in the front of the list so it becomes the most recently used
485   // one.
486   caches_.push_front(std::move(cache));
487   return found;
488 }
489 
490 }  // namespace puffin
491