1 /* 2 * Copyright (C) 2015 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 #include <fcntl.h> 18 #include <stdlib.h> 19 #include <sys/mman.h> 20 21 extern "C" { 22 #include <fec.h> 23 } 24 25 #include "fec_private.h" 26 27 using rs_unique_ptr = std::unique_ptr<void, decltype(&free_rs_char)>; 28 29 /* prints a hexdump of `data' using warn(...) */ 30 static void dump(const char *name, uint64_t value, const uint8_t *data, 31 size_t size) 32 { 33 const int bytes_per_line = 16; 34 char hex[bytes_per_line * 3 + 1]; 35 char prn[bytes_per_line + 1]; 36 37 warn("%s (%" PRIu64 ") (%zu bytes):", name ? name : "", value, size); 38 39 if (!data) { 40 warn(" (null)"); 41 return; 42 } 43 44 for (size_t n = 0; n < size; n += bytes_per_line) { 45 memset(hex, 0, sizeof(hex)); 46 memset(prn, 0, sizeof(prn)); 47 48 for (size_t m = 0; m < bytes_per_line; ++m) { 49 if (n + m < size) { 50 ptrdiff_t offset = &hex[m * 3] - hex; 51 snprintf(hex + offset, sizeof(hex) - offset, "%02x ", 52 data[n + m]); 53 54 if (isprint(data[n + m])) { 55 prn[m] = data[n + m]; 56 } else { 57 prn[m] = '.'; 58 } 59 } else { 60 strcpy(&hex[m * 3], " "); 61 } 62 } 63 64 warn(" %04zu %s %s", n, hex, prn); 65 } 66 } 67 68 /* checks if `offset' is within a corrupted block */ 69 static inline bool is_erasure(fec_handle *f, uint64_t offset, 70 const uint8_t *data) 71 { 72 if (unlikely(offset >= f->data_size)) { 73 return false; 74 } 75 76 /* ideally, we would like to know if a specific byte on this block has 77 been corrupted, but knowing whether any of them is can be useful as 78 well, because often the entire block is corrupted */ 79 80 uint64_t n = offset / FEC_BLOCKSIZE; 81 82 return !f->hashtree().check_block_hash_with_index(n, data); 83 } 84 85 /* check if `offset' is within a block expected to contain zeros */ 86 static inline bool is_zero(fec_handle *f, uint64_t offset) 87 { 88 auto hashtree = f->hashtree(); 89 90 if (hashtree.hash_data.empty() || unlikely(offset >= f->data_size)) { 91 return false; 92 } 93 94 uint64_t hash_offset = (offset / FEC_BLOCKSIZE) * SHA256_DIGEST_LENGTH; 95 96 if (unlikely(hash_offset > 97 hashtree.hash_data.size() - SHA256_DIGEST_LENGTH)) { 98 return false; 99 } 100 101 return !memcmp(hashtree.zero_hash.data(), &hashtree.hash_data[hash_offset], 102 SHA256_DIGEST_LENGTH); 103 } 104 105 /* reads and decodes a single block starting from `offset', returns the number 106 of bytes corrected in `errors' */ 107 static int __ecc_read(fec_handle *f, void *rs, uint8_t *dest, uint64_t offset, 108 bool use_erasures, uint8_t *ecc_data, size_t *errors) 109 { 110 check(offset % FEC_BLOCKSIZE == 0); 111 ecc_info *e = &f->ecc; 112 113 /* reverse interleaving: calculate the RS block that includes the requested 114 offset */ 115 uint64_t rsb = offset - (offset / (e->rounds * FEC_BLOCKSIZE)) * 116 e->rounds * FEC_BLOCKSIZE; 117 int data_index = -1; 118 int erasures[e->rsn]; 119 int neras = 0; 120 121 /* verity is required to check for erasures */ 122 check(!use_erasures || !f->hashtree().hash_data.empty()); 123 124 for (int i = 0; i < e->rsn; ++i) { 125 uint64_t interleaved = fec_ecc_interleave(rsb * e->rsn + i, e->rsn, 126 e->rounds); 127 128 if (interleaved == offset) { 129 data_index = i; 130 } 131 132 /* to improve our chances of correcting IO errors, initialize the 133 buffer to zeros even if we are going to read to it later */ 134 uint8_t bbuf[FEC_BLOCKSIZE] = {0}; 135 136 if (likely(interleaved < e->start) && !is_zero(f, interleaved)) { 137 /* copy raw data to reconstruct the RS block */ 138 if (!raw_pread(f->fd, bbuf, FEC_BLOCKSIZE, interleaved)) { 139 warn("failed to read: %s", strerror(errno)); 140 141 /* treat errors as corruption */ 142 if (use_erasures && neras <= e->roots) { 143 erasures[neras++] = i; 144 } 145 } else if (use_erasures && neras <= e->roots && 146 is_erasure(f, interleaved, bbuf)) { 147 erasures[neras++] = i; 148 } 149 } 150 151 for (int j = 0; j < FEC_BLOCKSIZE; ++j) { 152 ecc_data[j * FEC_RSM + i] = bbuf[j]; 153 } 154 } 155 156 check(data_index >= 0); 157 158 size_t nerrs = 0; 159 uint8_t copy[FEC_RSM]; 160 161 for (int i = 0; i < FEC_BLOCKSIZE; ++i) { 162 /* copy parity data */ 163 if (!raw_pread(f->fd, &ecc_data[i * FEC_RSM + e->rsn], e->roots, 164 e->start + (i + rsb) * e->roots)) { 165 error("failed to read ecc data: %s", strerror(errno)); 166 return -1; 167 } 168 169 /* for debugging decoding failures, because decode_rs_char can mangle 170 ecc_data */ 171 if (unlikely(use_erasures)) { 172 memcpy(copy, &ecc_data[i * FEC_RSM], FEC_RSM); 173 } 174 175 /* decode */ 176 int rc = decode_rs_char(rs, &ecc_data[i * FEC_RSM], erasures, neras); 177 178 if (unlikely(rc < 0)) { 179 if (use_erasures) { 180 error("RS block %" PRIu64 ": decoding failed (%d erasures)", 181 rsb, neras); 182 dump("raw RS block", rsb, copy, FEC_RSM); 183 } else if (f->hashtree().hash_data.empty()) { 184 warn("RS block %" PRIu64 ": decoding failed", rsb); 185 } else { 186 debug("RS block %" PRIu64 ": decoding failed", rsb); 187 } 188 189 errno = EIO; 190 return -1; 191 } else if (unlikely(rc > 0)) { 192 check(rc <= (use_erasures ? e->roots : e->roots / 2)); 193 nerrs += rc; 194 } 195 196 dest[i] = ecc_data[i * FEC_RSM + data_index]; 197 } 198 199 if (nerrs) { 200 warn("RS block %" PRIu64 ": corrected %zu errors", rsb, nerrs); 201 *errors += nerrs; 202 } 203 204 return FEC_BLOCKSIZE; 205 } 206 207 /* initializes RS decoder and allocates memory for interleaving */ 208 static int ecc_init(fec_handle *f, rs_unique_ptr& rs, 209 std::unique_ptr<uint8_t[]>& ecc_data) 210 { 211 check(f); 212 213 rs.reset(init_rs_char(FEC_PARAMS(f->ecc.roots))); 214 215 if (unlikely(!rs)) { 216 error("failed to initialize RS"); 217 errno = ENOMEM; 218 return -1; 219 } 220 221 ecc_data.reset(new (std::nothrow) uint8_t[FEC_RSM * FEC_BLOCKSIZE]); 222 223 if (unlikely(!ecc_data)) { 224 error("failed to allocate ecc buffer"); 225 errno = ENOMEM; 226 return -1; 227 } 228 229 return 0; 230 } 231 232 /* reads `count' bytes from `offset' and corrects possible errors without 233 erasure detection, returning the number of corrected bytes in `errors' */ 234 static ssize_t ecc_read(fec_handle *f, uint8_t *dest, size_t count, 235 uint64_t offset, size_t *errors) 236 { 237 check(f); 238 check(dest); 239 check(offset < f->data_size); 240 check(offset + count <= f->data_size); 241 check(errors); 242 243 debug("[%" PRIu64 ", %" PRIu64 ")", offset, offset + count); 244 245 rs_unique_ptr rs(NULL, free_rs_char); 246 std::unique_ptr<uint8_t[]> ecc_data; 247 248 if (ecc_init(f, rs, ecc_data) == -1) { 249 return -1; 250 } 251 252 uint64_t curr = offset / FEC_BLOCKSIZE; 253 size_t coff = (size_t)(offset - curr * FEC_BLOCKSIZE); 254 size_t left = count; 255 256 uint8_t data[FEC_BLOCKSIZE]; 257 258 while (left > 0) { 259 /* there's no erasure detection without verity metadata */ 260 if (__ecc_read(f, rs.get(), data, curr * FEC_BLOCKSIZE, false, 261 ecc_data.get(), errors) == -1) { 262 return -1; 263 } 264 265 size_t copy = FEC_BLOCKSIZE - coff; 266 267 if (copy > left) { 268 copy = left; 269 } 270 271 memcpy(dest, &data[coff], copy); 272 273 dest += copy; 274 left -= copy; 275 coff = 0; 276 ++curr; 277 } 278 279 return count; 280 } 281 282 /* reads `count' bytes from `offset', corrects possible errors with 283 erasure detection, and verifies the integrity of read data using 284 verity hash tree; returns the number of corrections in `errors' */ 285 static ssize_t verity_read(fec_handle *f, uint8_t *dest, size_t count, 286 uint64_t offset, size_t *errors) 287 { 288 check(f); 289 check(dest); 290 check(offset < f->data_size); 291 check(offset + count <= f->data_size); 292 check(!f->hashtree().hash_data.empty()); 293 check(errors); 294 295 debug("[%" PRIu64 ", %" PRIu64 ")", offset, offset + count); 296 297 rs_unique_ptr rs(NULL, free_rs_char); 298 std::unique_ptr<uint8_t[]> ecc_data; 299 300 if (f->ecc.start && ecc_init(f, rs, ecc_data) == -1) { 301 return -1; 302 } 303 304 uint64_t curr = offset / FEC_BLOCKSIZE; 305 size_t coff = (size_t)(offset - curr * FEC_BLOCKSIZE); 306 size_t left = count; 307 uint8_t data[FEC_BLOCKSIZE]; 308 309 uint64_t max_hash_block = 310 (f->hashtree().hash_data.size() - SHA256_DIGEST_LENGTH) / 311 SHA256_DIGEST_LENGTH; 312 313 while (left > 0) { 314 check(curr <= max_hash_block); 315 uint64_t curr_offset = curr * FEC_BLOCKSIZE; 316 317 bool expect_zeros = is_zero(f, curr_offset); 318 319 /* if we are in read-only mode and expect to read a zero block, 320 skip reading and just return zeros */ 321 if ((f->mode & O_ACCMODE) == O_RDONLY && expect_zeros) { 322 memset(data, 0, FEC_BLOCKSIZE); 323 goto valid; 324 } 325 326 /* copy raw data without error correction */ 327 if (!raw_pread(f->fd, data, FEC_BLOCKSIZE, curr_offset)) { 328 error("failed to read: %s", strerror(errno)); 329 return -1; 330 } 331 332 if (likely(f->hashtree().check_block_hash_with_index(curr, data))) { 333 goto valid; 334 } 335 336 /* we know the block is supposed to contain zeros, so return zeros 337 instead of trying to correct it */ 338 if (expect_zeros) { 339 memset(data, 0, FEC_BLOCKSIZE); 340 goto corrected; 341 } 342 343 if (!f->ecc.start) { 344 /* fatal error without ecc */ 345 error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64, 346 offset, offset + count, curr); 347 return -1; 348 } else { 349 debug("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64, 350 offset, offset + count, curr); 351 } 352 353 /* try to correct without erasures first, because checking for 354 erasure locations is slower */ 355 if (__ecc_read(f, rs.get(), data, curr_offset, false, ecc_data.get(), 356 errors) == FEC_BLOCKSIZE && 357 f->hashtree().check_block_hash_with_index(curr, data)) { 358 goto corrected; 359 } 360 361 /* try to correct with erasures */ 362 if (__ecc_read(f, rs.get(), data, curr_offset, true, ecc_data.get(), 363 errors) == FEC_BLOCKSIZE && 364 f->hashtree().check_block_hash_with_index(curr, data)) { 365 goto corrected; 366 } 367 368 error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64 369 " (offset %" PRIu64 ") cannot be recovered", 370 offset, offset + count, curr, curr_offset); 371 dump("decoded block", curr, data, FEC_BLOCKSIZE); 372 373 errno = EIO; 374 return -1; 375 376 corrected: 377 /* update the corrected block to the file if we are in r/w mode */ 378 if (f->mode & O_RDWR && 379 !raw_pwrite(f->fd, data, FEC_BLOCKSIZE, curr_offset)) { 380 error("failed to write: %s", strerror(errno)); 381 return -1; 382 } 383 384 valid: 385 size_t copy = FEC_BLOCKSIZE - coff; 386 387 if (copy > left) { 388 copy = left; 389 } 390 391 memcpy(dest, &data[coff], copy); 392 393 dest += copy; 394 left -= copy; 395 coff = 0; 396 ++curr; 397 } 398 399 return count; 400 } 401 402 /* sets the internal file position to `offset' relative to `whence' */ 403 int fec_seek(struct fec_handle *f, int64_t offset, int whence) 404 { 405 check(f); 406 407 if (whence == SEEK_SET) { 408 if (offset < 0) { 409 errno = EOVERFLOW; 410 return -1; 411 } 412 413 f->pos = offset; 414 } else if (whence == SEEK_CUR) { 415 if (offset < 0 && f->pos < (uint64_t)-offset) { 416 errno = EOVERFLOW; 417 return -1; 418 } else if (offset > 0 && (uint64_t)offset > UINT64_MAX - f->pos) { 419 errno = EOVERFLOW; 420 return -1; 421 } 422 423 f->pos += offset; 424 } else if (whence == SEEK_END) { 425 if (offset >= 0) { 426 errno = ENXIO; 427 return -1; 428 } else if ((uint64_t)-offset > f->size) { 429 errno = EOVERFLOW; 430 return -1; 431 } 432 433 f->pos = f->size + offset; 434 } else { 435 errno = EINVAL; 436 return -1; 437 } 438 439 return 0; 440 } 441 442 /* reads up to `count' bytes starting from the internal file position using 443 error correction and integrity validation, if available */ 444 ssize_t fec_read(struct fec_handle *f, void *buf, size_t count) 445 { 446 ssize_t rc = fec_pread(f, buf, count, f->pos); 447 448 if (rc > 0) { 449 check(f->pos < UINT64_MAX - rc); 450 f->pos += rc; 451 } 452 453 return rc; 454 } 455 456 /* for a file with size `max', returns the number of bytes we can read starting 457 from `offset', up to `count' bytes */ 458 static inline size_t get_max_count(uint64_t offset, size_t count, uint64_t max) 459 { 460 if (offset >= max) { 461 return 0; 462 } else if (offset > max - count) { 463 return (size_t)(max - offset); 464 } 465 466 return count; 467 } 468 469 /* reads `count' bytes from `f->fd' starting from `offset', and copies the 470 data to `buf' */ 471 bool raw_pread(int fd, void *buf, size_t count, uint64_t offset) { 472 check(buf); 473 474 uint8_t *p = (uint8_t *)buf; 475 size_t remaining = count; 476 477 while (remaining > 0) { 478 ssize_t n = TEMP_FAILURE_RETRY(pread64(fd, p, remaining, offset)); 479 480 if (n <= 0) { 481 return false; 482 } 483 484 p += n; 485 remaining -= n; 486 offset += n; 487 } 488 489 return true; 490 } 491 492 /* writes `count' bytes from `buf' to `f->fd' to a file position `offset' */ 493 bool raw_pwrite(int fd, const void *buf, size_t count, uint64_t offset) { 494 check(buf); 495 496 const uint8_t *p = (const uint8_t *)buf; 497 size_t remaining = count; 498 499 while (remaining > 0) { 500 ssize_t n = TEMP_FAILURE_RETRY(pwrite64(fd, p, remaining, offset)); 501 502 if (n <= 0) { 503 return false; 504 } 505 506 p += n; 507 remaining -= n; 508 offset += n; 509 } 510 511 return true; 512 } 513 514 /* reads up to `count' bytes starting from `offset' using error correction and 515 integrity validation, if available */ 516 ssize_t fec_pread(struct fec_handle *f, void *buf, size_t count, 517 uint64_t offset) 518 { 519 check(f); 520 check(buf); 521 522 if (unlikely(offset > UINT64_MAX - count)) { 523 errno = EOVERFLOW; 524 return -1; 525 } 526 527 if (!f->hashtree().hash_data.empty()) { 528 return process(f, (uint8_t *)buf, 529 get_max_count(offset, count, f->data_size), offset, 530 verity_read); 531 } else if (f->ecc.start) { 532 check(f->ecc.start < f->size); 533 534 count = get_max_count(offset, count, f->data_size); 535 ssize_t rc = process(f, (uint8_t *)buf, count, offset, ecc_read); 536 537 if (rc >= 0) { 538 return rc; 539 } 540 541 /* return raw data if pure ecc read fails; due to interleaving 542 the specific blocks the caller wants may still be fine */ 543 } else { 544 count = get_max_count(offset, count, f->size); 545 } 546 547 if (raw_pread(f->fd, buf, count, offset)) { 548 return count; 549 } 550 551 return -1; 552 } 553