1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <fcntl.h>
18 #include <stdlib.h>
19 #include <sys/mman.h>
20
21 extern "C" {
22 #include <fec.h>
23 }
24
25 #include "fec_private.h"
26
27 using rs_unique_ptr = std::unique_ptr<void, decltype(&free_rs_char)>;
28
29 /* prints a hexdump of `data' using warn(...) */
dump(const char * name,uint64_t value,const uint8_t * data,size_t size)30 static void dump(const char *name, uint64_t value, const uint8_t *data,
31 size_t size)
32 {
33 const int bytes_per_line = 16;
34 char hex[bytes_per_line * 3 + 1];
35 char prn[bytes_per_line + 1];
36
37 warn("%s (%" PRIu64 ") (%zu bytes):", name ? name : "", value, size);
38
39 if (!data) {
40 warn(" (null)");
41 return;
42 }
43
44 for (size_t n = 0; n < size; n += bytes_per_line) {
45 memset(hex, 0, sizeof(hex));
46 memset(prn, 0, sizeof(prn));
47
48 for (size_t m = 0; m < bytes_per_line; ++m) {
49 if (n + m < size) {
50 sprintf(&hex[m * 3], "%02x ", data[n + m]);
51
52 if (isprint(data[n + m])) {
53 prn[m] = data[n + m];
54 } else {
55 prn[m] = '.';
56 }
57 } else {
58 strcpy(&hex[m * 3], " ");
59 }
60 }
61
62 warn(" %04zu %s %s", n, hex, prn);
63 }
64 }
65
66 /* checks if `offset' is within a corrupted block */
is_erasure(fec_handle * f,uint64_t offset,const uint8_t * data)67 static inline bool is_erasure(fec_handle *f, uint64_t offset,
68 const uint8_t *data)
69 {
70 if (unlikely(offset >= f->data_size)) {
71 return false;
72 }
73
74 /* ideally, we would like to know if a specific byte on this block has
75 been corrupted, but knowing whether any of them is can be useful as
76 well, because often the entire block is corrupted */
77
78 uint64_t n = offset / FEC_BLOCKSIZE;
79
80 return !verity_check_block(f, &f->verity.hash[n * SHA256_DIGEST_LENGTH],
81 data);
82 }
83
84 /* check if `offset' is within a block expected to contain zeros */
is_zero(fec_handle * f,uint64_t offset)85 static inline bool is_zero(fec_handle *f, uint64_t offset)
86 {
87 verity_info *v = &f->verity;
88
89 if (!v->hash || unlikely(offset >= f->data_size)) {
90 return false;
91 }
92
93 uint64_t hash_offset = (offset / FEC_BLOCKSIZE) * SHA256_DIGEST_LENGTH;
94
95 if (unlikely(hash_offset >
96 v->hash_data_blocks * FEC_BLOCKSIZE - SHA256_DIGEST_LENGTH)) {
97 return false;
98 }
99
100 return !memcmp(v->zero_hash, &v->hash[hash_offset], SHA256_DIGEST_LENGTH);
101 }
102
103 /* reads and decodes a single block starting from `offset', returns the number
104 of bytes corrected in `errors' */
__ecc_read(fec_handle * f,void * rs,uint8_t * dest,uint64_t offset,bool use_erasures,uint8_t * ecc_data,size_t * errors)105 static int __ecc_read(fec_handle *f, void *rs, uint8_t *dest, uint64_t offset,
106 bool use_erasures, uint8_t *ecc_data, size_t *errors)
107 {
108 check(offset % FEC_BLOCKSIZE == 0);
109 ecc_info *e = &f->ecc;
110
111 /* reverse interleaving: calculate the RS block that includes the requested
112 offset */
113 uint64_t rsb = offset - (offset / (e->rounds * FEC_BLOCKSIZE)) *
114 e->rounds * FEC_BLOCKSIZE;
115 int data_index = -1;
116 int erasures[e->rsn];
117 int neras = 0;
118
119 /* verity is required to check for erasures */
120 check(!use_erasures || f->verity.hash);
121
122 for (int i = 0; i < e->rsn; ++i) {
123 uint64_t interleaved = fec_ecc_interleave(rsb * e->rsn + i, e->rsn,
124 e->rounds);
125
126 if (interleaved == offset) {
127 data_index = i;
128 }
129
130 /* to improve our chances of correcting IO errors, initialize the
131 buffer to zeros even if we are going to read to it later */
132 uint8_t bbuf[FEC_BLOCKSIZE] = {0};
133
134 if (likely(interleaved < e->start) && !is_zero(f, interleaved)) {
135 /* copy raw data to reconstruct the RS block */
136 if (!raw_pread(f, bbuf, FEC_BLOCKSIZE, interleaved)) {
137 warn("failed to read: %s", strerror(errno));
138
139 /* treat errors as corruption */
140 if (use_erasures && neras <= e->roots) {
141 erasures[neras++] = i;
142 }
143 } else if (use_erasures && neras <= e->roots &&
144 is_erasure(f, interleaved, bbuf)) {
145 erasures[neras++] = i;
146 }
147 }
148
149 for (int j = 0; j < FEC_BLOCKSIZE; ++j) {
150 ecc_data[j * FEC_RSM + i] = bbuf[j];
151 }
152 }
153
154 check(data_index >= 0);
155
156 size_t nerrs = 0;
157 uint8_t copy[FEC_RSM];
158
159 for (int i = 0; i < FEC_BLOCKSIZE; ++i) {
160 /* copy parity data */
161 if (!raw_pread(f, &ecc_data[i * FEC_RSM + e->rsn], e->roots,
162 e->start + (i + rsb) * e->roots)) {
163 error("failed to read ecc data: %s", strerror(errno));
164 return -1;
165 }
166
167 /* for debugging decoding failures, because decode_rs_char can mangle
168 ecc_data */
169 if (unlikely(use_erasures)) {
170 memcpy(copy, &ecc_data[i * FEC_RSM], FEC_RSM);
171 }
172
173 /* decode */
174 int rc = decode_rs_char(rs, &ecc_data[i * FEC_RSM], erasures, neras);
175
176 if (unlikely(rc < 0)) {
177 if (use_erasures) {
178 error("RS block %" PRIu64 ": decoding failed (%d erasures)",
179 rsb, neras);
180 dump("raw RS block", rsb, copy, FEC_RSM);
181 } else if (!f->verity.hash) {
182 warn("RS block %" PRIu64 ": decoding failed", rsb);
183 } else {
184 debug("RS block %" PRIu64 ": decoding failed", rsb);
185 }
186
187 errno = EIO;
188 return -1;
189 } else if (unlikely(rc > 0)) {
190 check(rc <= (use_erasures ? e->roots : e->roots / 2));
191 nerrs += rc;
192 }
193
194 dest[i] = ecc_data[i * FEC_RSM + data_index];
195 }
196
197 if (nerrs) {
198 warn("RS block %" PRIu64 ": corrected %zu errors", rsb, nerrs);
199 *errors += nerrs;
200 }
201
202 return FEC_BLOCKSIZE;
203 }
204
205 /* initializes RS decoder and allocates memory for interleaving */
ecc_init(fec_handle * f,rs_unique_ptr & rs,std::unique_ptr<uint8_t[]> & ecc_data)206 static int ecc_init(fec_handle *f, rs_unique_ptr& rs,
207 std::unique_ptr<uint8_t[]>& ecc_data)
208 {
209 check(f);
210
211 rs.reset(init_rs_char(FEC_PARAMS(f->ecc.roots)));
212
213 if (unlikely(!rs)) {
214 error("failed to initialize RS");
215 errno = ENOMEM;
216 return -1;
217 }
218
219 ecc_data.reset(new (std::nothrow) uint8_t[FEC_RSM * FEC_BLOCKSIZE]);
220
221 if (unlikely(!ecc_data)) {
222 error("failed to allocate ecc buffer");
223 errno = ENOMEM;
224 return -1;
225 }
226
227 return 0;
228 }
229
230 /* reads `count' bytes from `offset' and corrects possible errors without
231 erasure detection, returning the number of corrected bytes in `errors' */
ecc_read(fec_handle * f,uint8_t * dest,size_t count,uint64_t offset,size_t * errors)232 static ssize_t ecc_read(fec_handle *f, uint8_t *dest, size_t count,
233 uint64_t offset, size_t *errors)
234 {
235 check(f);
236 check(dest);
237 check(offset < f->data_size);
238 check(offset + count <= f->data_size);
239 check(errors);
240
241 debug("[%" PRIu64 ", %" PRIu64 ")", offset, offset + count);
242
243 rs_unique_ptr rs(NULL, free_rs_char);
244 std::unique_ptr<uint8_t[]> ecc_data;
245
246 if (ecc_init(f, rs, ecc_data) == -1) {
247 return -1;
248 }
249
250 uint64_t curr = offset / FEC_BLOCKSIZE;
251 size_t coff = (size_t)(offset - curr * FEC_BLOCKSIZE);
252 size_t left = count;
253
254 uint8_t data[FEC_BLOCKSIZE];
255
256 while (left > 0) {
257 /* there's no erasure detection without verity metadata */
258 if (__ecc_read(f, rs.get(), data, curr * FEC_BLOCKSIZE, false,
259 ecc_data.get(), errors) == -1) {
260 return -1;
261 }
262
263 size_t copy = FEC_BLOCKSIZE - coff;
264
265 if (copy > left) {
266 copy = left;
267 }
268
269 memcpy(dest, &data[coff], copy);
270
271 dest += copy;
272 left -= copy;
273 coff = 0;
274 ++curr;
275 }
276
277 return count;
278 }
279
280 /* reads `count' bytes from `offset', corrects possible errors with
281 erasure detection, and verifies the integrity of read data using
282 verity hash tree; returns the number of corrections in `errors' */
verity_read(fec_handle * f,uint8_t * dest,size_t count,uint64_t offset,size_t * errors)283 static ssize_t verity_read(fec_handle *f, uint8_t *dest, size_t count,
284 uint64_t offset, size_t *errors)
285 {
286 check(f);
287 check(dest);
288 check(offset < f->data_size);
289 check(offset + count <= f->data_size);
290 check(f->verity.hash);
291 check(errors);
292
293 debug("[%" PRIu64 ", %" PRIu64 ")", offset, offset + count);
294
295 rs_unique_ptr rs(NULL, free_rs_char);
296 std::unique_ptr<uint8_t[]> ecc_data;
297
298 if (f->ecc.start && ecc_init(f, rs, ecc_data) == -1) {
299 return -1;
300 }
301
302 uint64_t curr = offset / FEC_BLOCKSIZE;
303 size_t coff = (size_t)(offset - curr * FEC_BLOCKSIZE);
304 size_t left = count;
305 uint8_t data[FEC_BLOCKSIZE];
306
307 uint64_t max_hash_block = (f->verity.hash_data_blocks * FEC_BLOCKSIZE -
308 SHA256_DIGEST_LENGTH) / SHA256_DIGEST_LENGTH;
309
310 while (left > 0) {
311 check(curr <= max_hash_block);
312
313 uint8_t *hash = &f->verity.hash[curr * SHA256_DIGEST_LENGTH];
314 uint64_t curr_offset = curr * FEC_BLOCKSIZE;
315
316 bool expect_zeros = is_zero(f, curr_offset);
317
318 /* if we are in read-only mode and expect to read a zero block,
319 skip reading and just return zeros */
320 if (f->mode & O_RDONLY && expect_zeros) {
321 memset(data, 0, FEC_BLOCKSIZE);
322 goto valid;
323 }
324
325 /* copy raw data without error correction */
326 if (!raw_pread(f, data, FEC_BLOCKSIZE, curr_offset)) {
327 error("failed to read: %s", strerror(errno));
328 return -1;
329 }
330
331 if (likely(verity_check_block(f, hash, data))) {
332 goto valid;
333 }
334
335 /* we know the block is supposed to contain zeros, so return zeros
336 instead of trying to correct it */
337 if (expect_zeros) {
338 memset(data, 0, FEC_BLOCKSIZE);
339 goto corrected;
340 }
341
342 if (!f->ecc.start) {
343 /* fatal error without ecc */
344 error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64,
345 offset, offset + count, curr);
346 return -1;
347 } else {
348 debug("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64,
349 offset, offset + count, curr);
350 }
351
352 /* try to correct without erasures first, because checking for
353 erasure locations is slower */
354 if (__ecc_read(f, rs.get(), data, curr_offset, false, ecc_data.get(),
355 errors) == FEC_BLOCKSIZE &&
356 verity_check_block(f, hash, data)) {
357 goto corrected;
358 }
359
360 /* try to correct with erasures */
361 if (__ecc_read(f, rs.get(), data, curr_offset, true, ecc_data.get(),
362 errors) == FEC_BLOCKSIZE &&
363 verity_check_block(f, hash, data)) {
364 goto corrected;
365 }
366
367 error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64
368 " (offset %" PRIu64 ") cannot be recovered",
369 offset, offset + count, curr, curr_offset);
370 dump("decoded block", curr, data, FEC_BLOCKSIZE);
371
372 errno = EIO;
373 return -1;
374
375 corrected:
376 /* update the corrected block to the file if we are in r/w mode */
377 if (f->mode & O_RDWR &&
378 !raw_pwrite(f, data, FEC_BLOCKSIZE, curr_offset)) {
379 error("failed to write: %s", strerror(errno));
380 return -1;
381 }
382
383 valid:
384 size_t copy = FEC_BLOCKSIZE - coff;
385
386 if (copy > left) {
387 copy = left;
388 }
389
390 memcpy(dest, &data[coff], copy);
391
392 dest += copy;
393 left -= copy;
394 coff = 0;
395 ++curr;
396 }
397
398 return count;
399 }
400
401 /* sets the internal file position to `offset' relative to `whence' */
fec_seek(struct fec_handle * f,int64_t offset,int whence)402 int fec_seek(struct fec_handle *f, int64_t offset, int whence)
403 {
404 check(f);
405
406 if (whence == SEEK_SET) {
407 if (offset < 0) {
408 errno = EOVERFLOW;
409 return -1;
410 }
411
412 f->pos = offset;
413 } else if (whence == SEEK_CUR) {
414 if (offset < 0 && f->pos < (uint64_t)-offset) {
415 errno = EOVERFLOW;
416 return -1;
417 } else if (offset > 0 && (uint64_t)offset > UINT64_MAX - f->pos) {
418 errno = EOVERFLOW;
419 return -1;
420 }
421
422 f->pos += offset;
423 } else if (whence == SEEK_END) {
424 if (offset >= 0) {
425 errno = ENXIO;
426 return -1;
427 } else if ((uint64_t)-offset > f->size) {
428 errno = EOVERFLOW;
429 return -1;
430 }
431
432 f->pos = f->size + offset;
433 } else {
434 errno = EINVAL;
435 return -1;
436 }
437
438 return 0;
439 }
440
441 /* reads up to `count' bytes starting from the internal file position using
442 error correction and integrity validation, if available */
fec_read(struct fec_handle * f,void * buf,size_t count)443 ssize_t fec_read(struct fec_handle *f, void *buf, size_t count)
444 {
445 ssize_t rc = fec_pread(f, buf, count, f->pos);
446
447 if (rc > 0) {
448 check(f->pos < UINT64_MAX - rc);
449 f->pos += rc;
450 }
451
452 return rc;
453 }
454
455 /* for a file with size `max', returns the number of bytes we can read starting
456 from `offset', up to `count' bytes */
get_max_count(uint64_t offset,size_t count,uint64_t max)457 static inline size_t get_max_count(uint64_t offset, size_t count, uint64_t max)
458 {
459 if (offset >= max) {
460 return 0;
461 } else if (offset > max - count) {
462 return (size_t)(max - offset);
463 }
464
465 return count;
466 }
467
468 /* reads `count' bytes from `f->fd' starting from `offset', and copies the
469 data to `buf' */
raw_pread(fec_handle * f,void * buf,size_t count,uint64_t offset)470 bool raw_pread(fec_handle *f, void *buf, size_t count, uint64_t offset)
471 {
472 check(f);
473 check(buf);
474
475 uint8_t *p = (uint8_t *)buf;
476 size_t remaining = count;
477
478 while (remaining > 0) {
479 ssize_t n = TEMP_FAILURE_RETRY(pread64(f->fd, p, remaining, offset));
480
481 if (n <= 0) {
482 return false;
483 }
484
485 p += n;
486 remaining -= n;
487 offset += n;
488 }
489
490 return true;
491 }
492
493 /* writes `count' bytes from `buf' to `f->fd' to a file position `offset' */
raw_pwrite(fec_handle * f,const void * buf,size_t count,uint64_t offset)494 bool raw_pwrite(fec_handle *f, const void *buf, size_t count, uint64_t offset)
495 {
496 check(f);
497 check(buf);
498
499 const uint8_t *p = (const uint8_t *)buf;
500 size_t remaining = count;
501
502 while (remaining > 0) {
503 ssize_t n = TEMP_FAILURE_RETRY(pwrite64(f->fd, p, remaining, offset));
504
505 if (n <= 0) {
506 return false;
507 }
508
509 p += n;
510 remaining -= n;
511 offset += n;
512 }
513
514 return true;
515 }
516
517 /* reads up to `count' bytes starting from `offset' using error correction and
518 integrity validation, if available */
fec_pread(struct fec_handle * f,void * buf,size_t count,uint64_t offset)519 ssize_t fec_pread(struct fec_handle *f, void *buf, size_t count,
520 uint64_t offset)
521 {
522 check(f);
523 check(buf);
524
525 if (unlikely(offset > UINT64_MAX - count)) {
526 errno = EOVERFLOW;
527 return -1;
528 }
529
530 if (f->verity.hash) {
531 return process(f, (uint8_t *)buf,
532 get_max_count(offset, count, f->data_size), offset,
533 verity_read);
534 } else if (f->ecc.start) {
535 check(f->ecc.start < f->size);
536
537 count = get_max_count(offset, count, f->data_size);
538 ssize_t rc = process(f, (uint8_t *)buf, count, offset, ecc_read);
539
540 if (rc >= 0) {
541 return rc;
542 }
543
544 /* return raw data if pure ecc read fails; due to interleaving
545 the specific blocks the caller wants may still be fine */
546 } else {
547 count = get_max_count(offset, count, f->size);
548 }
549
550 if (raw_pread(f, buf, count, offset)) {
551 return count;
552 }
553
554 return -1;
555 }
556