libfec/fec_read.cpp - platform/system/extras - Git at Google

 /*
  * Copyright (C) 2015 The Android Open Source Project
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 #include <fcntl.h>
 #include <stdlib.h>
 #include <sys/mman.h>

 extern "C" {
     #include <fec.h>
 }

 #include "fec_private.h"

 using rs_unique_ptr = std::unique_ptr<void, decltype(&free_rs_char)>;

 /* prints a hexdump of `data' using warn(...) */
 static void dump(const char *name, uint64_t value, const uint8_t *data,
         size_t size)
 {
     const int bytes_per_line = 16;
     char hex[bytes_per_line * 3 + 1];
     char prn[bytes_per_line + 1];

     warn("%s (%" PRIu64 ") (%zu bytes):", name ? name : "", value, size);

     if (!data) {
         warn("    (null)");
         return;
     }

     for (size_t n = 0; n < size; n += bytes_per_line) {
         memset(hex, 0, sizeof(hex));
         memset(prn, 0, sizeof(prn));

         for (size_t m = 0; m < bytes_per_line; ++m) {
             if (n + m < size) {
                 ptrdiff_t offset = &hex[m * 3] - hex;
                 snprintf(hex + offset, sizeof(hex) - offset, "%02x ",
                          data[n + m]);

                 if (isprint(data[n + m])) {
                     prn[m] = data[n + m];
                 } else {
                     prn[m] = '.';
                 }
             } else {
                 strcpy(&hex[m * 3], "   ");
             }
         }

         warn("    %04zu   %s  %s", n, hex, prn);
     }
 }

 /* checks if `offset' is within a corrupted block */
 static inline bool is_erasure(fec_handle *f, uint64_t offset,
         const uint8_t *data)
 {
     if (unlikely(offset >= f->data_size)) {
         return false;
     }

     /* ideally, we would like to know if a specific byte on this block has
        been corrupted, but knowing whether any of them is can be useful as
        well, because often the entire block is corrupted */

     uint64_t n = offset / FEC_BLOCKSIZE;

     return !f->hashtree().check_block_hash_with_index(n, data);
 }

 /* check if `offset' is within a block expected to contain zeros */
 static inline bool is_zero(fec_handle *f, uint64_t offset)
 {
     auto hashtree = f->hashtree();

     if (hashtree.hash_data.empty() || unlikely(offset >= f->data_size)) {
         return false;
     }

     uint64_t hash_offset = (offset / FEC_BLOCKSIZE) * SHA256_DIGEST_LENGTH;

     if (unlikely(hash_offset >
                  hashtree.hash_data.size() - SHA256_DIGEST_LENGTH)) {
         return false;
     }

     return !memcmp(hashtree.zero_hash.data(), &hashtree.hash_data[hash_offset],
                    SHA256_DIGEST_LENGTH);
 }

 /* reads and decodes a single block starting from `offset', returns the number
    of bytes corrected in `errors' */
 static int __ecc_read(fec_handle *f, void *rs, uint8_t *dest, uint64_t offset,
         bool use_erasures, uint8_t *ecc_data, size_t *errors)
 {
     check(offset % FEC_BLOCKSIZE == 0);
     ecc_info *e = &f->ecc;

     /* reverse interleaving: calculate the RS block that includes the requested
        offset */
     uint64_t rsb = offset - (offset / (e->rounds * FEC_BLOCKSIZE)) *
                         e->rounds * FEC_BLOCKSIZE;
     int data_index = -1;
     int erasures[e->rsn];
     int neras = 0;

     /* verity is required to check for erasures */
     check(!use_erasures || !f->hashtree().hash_data.empty());

     for (int i = 0; i < e->rsn; ++i) {
         uint64_t interleaved = fec_ecc_interleave(rsb * e->rsn + i, e->rsn,
                                     e->rounds);

         if (interleaved == offset) {
             data_index = i;
         }

         /* to improve our chances of correcting IO errors, initialize the
            buffer to zeros even if we are going to read to it later */
         uint8_t bbuf[FEC_BLOCKSIZE] = {0};

         if (likely(interleaved < e->start) && !is_zero(f, interleaved)) {
             /* copy raw data to reconstruct the RS block */
             if (!raw_pread(f->fd, bbuf, FEC_BLOCKSIZE, interleaved)) {
                 warn("failed to read: %s", strerror(errno));

                 /* treat errors as corruption */
                 if (use_erasures && neras <= e->roots) {
                     erasures[neras++] = i;
                 }
             } else if (use_erasures && neras <= e->roots &&
                        is_erasure(f, interleaved, bbuf)) {
                 erasures[neras++] = i;
             }
         }

         for (int j = 0; j < FEC_BLOCKSIZE; ++j) {
             ecc_data[j * FEC_RSM + i] = bbuf[j];
         }
     }

     check(data_index >= 0);

     size_t nerrs = 0;
     uint8_t copy[FEC_RSM];

     for (int i = 0; i < FEC_BLOCKSIZE; ++i) {
         /* copy parity data */
         if (!raw_pread(f->fd, &ecc_data[i * FEC_RSM + e->rsn], e->roots,
                        e->start + (i + rsb) * e->roots)) {
             error("failed to read ecc data: %s", strerror(errno));
             return -1;
         }

         /* for debugging decoding failures, because decode_rs_char can mangle
            ecc_data */
         if (unlikely(use_erasures)) {
             memcpy(copy, &ecc_data[i * FEC_RSM], FEC_RSM);
         }

         /* decode */
         int rc = decode_rs_char(rs, &ecc_data[i * FEC_RSM], erasures, neras);

         if (unlikely(rc < 0)) {
             if (use_erasures) {
                 error("RS block %" PRIu64 ": decoding failed (%d erasures)",
                     rsb, neras);
                 dump("raw RS block", rsb, copy, FEC_RSM);
             } else if (f->hashtree().hash_data.empty()) {
                 warn("RS block %" PRIu64 ": decoding failed", rsb);
             } else {
                 debug("RS block %" PRIu64 ": decoding failed", rsb);
             }

             errno = EIO;
             return -1;
         } else if (unlikely(rc > 0)) {
             check(rc <= (use_erasures ? e->roots : e->roots / 2));
             nerrs += rc;
         }

         dest[i] = ecc_data[i * FEC_RSM + data_index];
     }

     if (nerrs) {
         warn("RS block %" PRIu64 ": corrected %zu errors", rsb, nerrs);
         *errors += nerrs;
     }

     return FEC_BLOCKSIZE;
 }

 /* initializes RS decoder and allocates memory for interleaving */
 static int ecc_init(fec_handle *f, rs_unique_ptr& rs,
         std::unique_ptr<uint8_t[]>& ecc_data)
 {
     check(f);

     rs.reset(init_rs_char(FEC_PARAMS(f->ecc.roots)));

     if (unlikely(!rs)) {
         error("failed to initialize RS");
         errno = ENOMEM;
         return -1;
     }

     ecc_data.reset(new (std::nothrow) uint8_t[FEC_RSM * FEC_BLOCKSIZE]);

     if (unlikely(!ecc_data)) {
         error("failed to allocate ecc buffer");
         errno = ENOMEM;
         return -1;
     }

     return 0;
 }

 /* reads `count' bytes from `offset' and corrects possible errors without
    erasure detection, returning the number of corrected bytes in `errors' */
 static ssize_t ecc_read(fec_handle *f, uint8_t *dest, size_t count,
         uint64_t offset, size_t *errors)
 {
     check(f);
     check(dest);
     check(offset < f->data_size);
     check(offset + count <= f->data_size);
     check(errors);

     debug("[%" PRIu64 ", %" PRIu64 ")", offset, offset + count);

     rs_unique_ptr rs(NULL, free_rs_char);
     std::unique_ptr<uint8_t[]> ecc_data;

     if (ecc_init(f, rs, ecc_data) == -1) {
         return -1;
     }

     uint64_t curr = offset / FEC_BLOCKSIZE;
     size_t coff = (size_t)(offset - curr * FEC_BLOCKSIZE);
     size_t left = count;

     uint8_t data[FEC_BLOCKSIZE];

     while (left > 0) {
         /* there's no erasure detection without verity metadata */
         if (__ecc_read(f, rs.get(), data, curr * FEC_BLOCKSIZE, false,
                 ecc_data.get(), errors) == -1) {
             return -1;
         }

         size_t copy = FEC_BLOCKSIZE - coff;

         if (copy > left) {
             copy = left;
         }

         memcpy(dest, &data[coff], copy);

         dest += copy;
         left -= copy;
         coff = 0;
         ++curr;
     }

     return count;
 }

 /* reads `count' bytes from `offset', corrects possible errors with
    erasure detection, and verifies the integrity of read data using
    verity hash tree; returns the number of corrections in `errors' */
 static ssize_t verity_read(fec_handle *f, uint8_t *dest, size_t count,
         uint64_t offset, size_t *errors)
 {
     check(f);
     check(dest);
     check(offset < f->data_size);
     check(offset + count <= f->data_size);
     check(!f->hashtree().hash_data.empty());
     check(errors);

     debug("[%" PRIu64 ", %" PRIu64 ")", offset, offset + count);

     rs_unique_ptr rs(NULL, free_rs_char);
     std::unique_ptr<uint8_t[]> ecc_data;

     if (f->ecc.start && ecc_init(f, rs, ecc_data) == -1) {
         return -1;
     }

     uint64_t curr = offset / FEC_BLOCKSIZE;
     size_t coff = (size_t)(offset - curr * FEC_BLOCKSIZE);
     size_t left = count;
     uint8_t data[FEC_BLOCKSIZE];

     uint64_t max_hash_block =
         (f->hashtree().hash_data.size() - SHA256_DIGEST_LENGTH) /
         SHA256_DIGEST_LENGTH;

     while (left > 0) {
         check(curr <= max_hash_block);
         uint64_t curr_offset = curr * FEC_BLOCKSIZE;

         bool expect_zeros = is_zero(f, curr_offset);

         /* if we are in read-only mode and expect to read a zero block,
            skip reading and just return zeros */
         if ((f->mode & O_ACCMODE) == O_RDONLY && expect_zeros) {
             memset(data, 0, FEC_BLOCKSIZE);
             goto valid;
         }

         /* copy raw data without error correction */
         if (!raw_pread(f->fd, data, FEC_BLOCKSIZE, curr_offset)) {
             if (errno == EIO) {
                 warn("I/O error encounter when reading, attempting to recover using fec");
             } else {
                 error("failed to read: %s", strerror(errno));
                 return -1;
             }
         }

         if (likely(f->hashtree().check_block_hash_with_index(curr, data))) {
             goto valid;
         }

         /* we know the block is supposed to contain zeros, so return zeros
            instead of trying to correct it */
         if (expect_zeros) {
             memset(data, 0, FEC_BLOCKSIZE);
             goto corrected;
         }

         if (!f->ecc.start) {
             /* fatal error without ecc */
             error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64,
                 offset, offset + count, curr);
             return -1;
         } else {
             debug("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64,
                 offset, offset + count, curr);
         }

         /* try to correct without erasures first, because checking for
            erasure locations is slower */
         if (__ecc_read(f, rs.get(), data, curr_offset, false, ecc_data.get(),
                        errors) == FEC_BLOCKSIZE &&
             f->hashtree().check_block_hash_with_index(curr, data)) {
             goto corrected;
         }

         /* try to correct with erasures */
         if (__ecc_read(f, rs.get(), data, curr_offset, true, ecc_data.get(),
                        errors) == FEC_BLOCKSIZE &&
             f->hashtree().check_block_hash_with_index(curr, data)) {
             goto corrected;
         }

         error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64
             " (offset %" PRIu64 ") cannot be recovered",
             offset, offset + count, curr, curr_offset);
         dump("decoded block", curr, data, FEC_BLOCKSIZE);

         errno = EIO;
         return -1;

 corrected:
         /* update the corrected block to the file if we are in r/w mode */
         if (f->mode & O_RDWR &&
             !raw_pwrite(f->fd, data, FEC_BLOCKSIZE, curr_offset)) {
             error("failed to write: %s", strerror(errno));
             return -1;
         }

 valid:
         size_t copy = FEC_BLOCKSIZE - coff;

         if (copy > left) {
             copy = left;
         }

         memcpy(dest, &data[coff], copy);

         dest += copy;
         left -= copy;
         coff = 0;
         ++curr;
     }

     return count;
 }

 /* sets the internal file position to `offset' relative to `whence' */
 int fec_seek(struct fec_handle *f, int64_t offset, int whence)
 {
     check(f);

     if (whence == SEEK_SET) {
         if (offset < 0) {
             errno = EOVERFLOW;
             return -1;
         }

         f->pos = offset;
     } else if (whence == SEEK_CUR) {
         if (offset < 0 && f->pos < (uint64_t)-offset) {
             errno = EOVERFLOW;
             return -1;
         } else if (offset > 0 && (uint64_t)offset > UINT64_MAX - f->pos) {
             errno = EOVERFLOW;
             return -1;
         }

         f->pos += offset;
     } else if (whence == SEEK_END) {
         if (offset >= 0) {
             errno = ENXIO;
             return -1;
         } else if ((uint64_t)-offset > f->size) {
             errno = EOVERFLOW;
             return -1;
         }

         f->pos = f->size + offset;
     } else {
         errno = EINVAL;
         return -1;
     }

     return 0;
 }

 /* reads up to `count' bytes starting from the internal file position using
    error correction and integrity validation, if available */
 ssize_t fec_read(struct fec_handle *f, void *buf, size_t count)
 {
     ssize_t rc = fec_pread(f, buf, count, f->pos);

     if (rc > 0) {
         check(f->pos < UINT64_MAX - rc);
         f->pos += rc;
     }

     return rc;
 }

 /* for a file with size `max', returns the number of bytes we can read starting
    from `offset', up to `count' bytes */
 static inline size_t get_max_count(uint64_t offset, size_t count, uint64_t max)
 {
     if (offset >= max) {
         return 0;
     } else if (offset > max - count) {
         return (size_t)(max - offset);
     }

     return count;
 }

 /* reads `count' bytes from `f->fd' starting from `offset', and copies the
    data to `buf' */
 bool raw_pread(int fd, void *buf, size_t count, uint64_t offset) {
     check(buf);

     uint8_t *p = (uint8_t *)buf;
     size_t remaining = count;

     while (remaining > 0) {
         ssize_t n = TEMP_FAILURE_RETRY(pread64(fd, p, remaining, offset));

         if (n <= 0) {
             return false;
         }

         p += n;
         remaining -= n;
         offset += n;
     }

     return true;
 }

 /* writes `count' bytes from `buf' to `f->fd' to a file position `offset' */
 bool raw_pwrite(int fd, const void *buf, size_t count, uint64_t offset) {
     check(buf);

     const uint8_t *p = (const uint8_t *)buf;
     size_t remaining = count;

     while (remaining > 0) {
         ssize_t n = TEMP_FAILURE_RETRY(pwrite64(fd, p, remaining, offset));

         if (n <= 0) {
             return false;
         }

         p += n;
         remaining -= n;
         offset += n;
     }

     return true;
 }

 /* reads up to `count' bytes starting from `offset' using error correction and
    integrity validation, if available */
 ssize_t fec_pread(struct fec_handle *f, void *buf, size_t count,
         uint64_t offset)
 {
     check(f);
     check(buf);

     if (unlikely(offset > UINT64_MAX - count)) {
         errno = EOVERFLOW;
         return -1;
     }

     if (!f->hashtree().hash_data.empty()) {
         return process(f, (uint8_t *)buf,
                        get_max_count(offset, count, f->data_size), offset,
                        verity_read);
     } else if (f->ecc.start) {
         check(f->ecc.start < f->size);

         count = get_max_count(offset, count, f->data_size);
         ssize_t rc = process(f, (uint8_t *)buf, count, offset, ecc_read);

         if (rc >= 0) {
             return rc;
         }

         /* return raw data if pure ecc read fails; due to interleaving
            the specific blocks the caller wants may still be fine */
     } else {
         count = get_max_count(offset, count, f->size);
     }

     if (raw_pread(f->fd, buf, count, offset)) {
         return count;
     }

     return -1;
 }
	/*
	* Copyright (C) 2015 The Android Open Source Project
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	#include <fcntl.h>
	#include <stdlib.h>
	#include <sys/mman.h>

	extern "C" {
	#include <fec.h>
	}

	#include "fec_private.h"

	using rs_unique_ptr = std::unique_ptr<void, decltype(&free_rs_char)>;

	/* prints a hexdump of `data' using warn(...) */
	static void dump(const char name, uint64_t value, const uint8_t data,
	size_t size)
	{
	const int bytes_per_line = 16;
	char hex[bytes_per_line * 3 + 1];
	char prn[bytes_per_line + 1];

	warn("%s (%" PRIu64 ") (%zu bytes):", name ? name : "", value, size);

	if (!data) {
	warn(" (null)");
	return;
	}

	for (size_t n = 0; n < size; n += bytes_per_line) {
	memset(hex, 0, sizeof(hex));
	memset(prn, 0, sizeof(prn));

	for (size_t m = 0; m < bytes_per_line; ++m) {
	if (n + m < size) {
	ptrdiff_t offset = &hex[m * 3] - hex;
	snprintf(hex + offset, sizeof(hex) - offset, "%02x ",
	data[n + m]);

	if (isprint(data[n + m])) {
	prn[m] = data[n + m];
	} else {
	prn[m] = '.';
	}
	} else {
	strcpy(&hex[m * 3], " ");
	}
	}

	warn(" %04zu %s %s", n, hex, prn);
	}
	}

	/* checks if `offset' is within a corrupted block */
	static inline bool is_erasure(fec_handle *f, uint64_t offset,
	const uint8_t *data)
	{
	if (unlikely(offset >= f->data_size)) {
	return false;
	}

	/* ideally, we would like to know if a specific byte on this block has
	been corrupted, but knowing whether any of them is can be useful as
	well, because often the entire block is corrupted */

	uint64_t n = offset / FEC_BLOCKSIZE;

	return !f->hashtree().check_block_hash_with_index(n, data);
	}

	/* check if `offset' is within a block expected to contain zeros */
	static inline bool is_zero(fec_handle *f, uint64_t offset)
	{
	auto hashtree = f->hashtree();

	if (hashtree.hash_data.empty() \|\| unlikely(offset >= f->data_size)) {
	return false;
	}

	uint64_t hash_offset = (offset / FEC_BLOCKSIZE) * SHA256_DIGEST_LENGTH;

	if (unlikely(hash_offset >
	hashtree.hash_data.size() - SHA256_DIGEST_LENGTH)) {
	return false;
	}

	return !memcmp(hashtree.zero_hash.data(), &hashtree.hash_data[hash_offset],
	SHA256_DIGEST_LENGTH);
	}

	/* reads and decodes a single block starting from `offset', returns the number
	of bytes corrected in `errors' */
	static int __ecc_read(fec_handle f, void rs, uint8_t *dest, uint64_t offset,
	bool use_erasures, uint8_t ecc_data, size_t errors)
	{
	check(offset % FEC_BLOCKSIZE == 0);
	ecc_info *e = &f->ecc;

	/* reverse interleaving: calculate the RS block that includes the requested
	offset */
	uint64_t rsb = offset - (offset / (e->rounds * FEC_BLOCKSIZE)) *
	e->rounds * FEC_BLOCKSIZE;
	int data_index = -1;
	int erasures[e->rsn];
	int neras = 0;

	/* verity is required to check for erasures */
	check(!use_erasures \|\| !f->hashtree().hash_data.empty());

	for (int i = 0; i < e->rsn; ++i) {
	uint64_t interleaved = fec_ecc_interleave(rsb * e->rsn + i, e->rsn,
	e->rounds);

	if (interleaved == offset) {
	data_index = i;
	}

	/* to improve our chances of correcting IO errors, initialize the
	buffer to zeros even if we are going to read to it later */
	uint8_t bbuf[FEC_BLOCKSIZE] = {0};

	if (likely(interleaved < e->start) && !is_zero(f, interleaved)) {
	/* copy raw data to reconstruct the RS block */
	if (!raw_pread(f->fd, bbuf, FEC_BLOCKSIZE, interleaved)) {
	warn("failed to read: %s", strerror(errno));

	/* treat errors as corruption */
	if (use_erasures && neras <= e->roots) {
	erasures[neras++] = i;
	}
	} else if (use_erasures && neras <= e->roots &&
	is_erasure(f, interleaved, bbuf)) {
	erasures[neras++] = i;
	}
	}

	for (int j = 0; j < FEC_BLOCKSIZE; ++j) {
	ecc_data[j * FEC_RSM + i] = bbuf[j];
	}
	}

	check(data_index >= 0);

	size_t nerrs = 0;
	uint8_t copy[FEC_RSM];

	for (int i = 0; i < FEC_BLOCKSIZE; ++i) {
	/* copy parity data */
	if (!raw_pread(f->fd, &ecc_data[i * FEC_RSM + e->rsn], e->roots,
	e->start + (i + rsb) * e->roots)) {
	error("failed to read ecc data: %s", strerror(errno));
	return -1;
	}

	/* for debugging decoding failures, because decode_rs_char can mangle
	ecc_data */
	if (unlikely(use_erasures)) {
	memcpy(copy, &ecc_data[i * FEC_RSM], FEC_RSM);
	}

	/* decode */
	int rc = decode_rs_char(rs, &ecc_data[i * FEC_RSM], erasures, neras);

	if (unlikely(rc < 0)) {
	if (use_erasures) {
	error("RS block %" PRIu64 ": decoding failed (%d erasures)",
	rsb, neras);
	dump("raw RS block", rsb, copy, FEC_RSM);
	} else if (f->hashtree().hash_data.empty()) {
	warn("RS block %" PRIu64 ": decoding failed", rsb);
	} else {
	debug("RS block %" PRIu64 ": decoding failed", rsb);
	}

	errno = EIO;
	return -1;
	} else if (unlikely(rc > 0)) {
	check(rc <= (use_erasures ? e->roots : e->roots / 2));
	nerrs += rc;
	}

	dest[i] = ecc_data[i * FEC_RSM + data_index];
	}

	if (nerrs) {
	warn("RS block %" PRIu64 ": corrected %zu errors", rsb, nerrs);
	*errors += nerrs;
	}

	return FEC_BLOCKSIZE;
	}

	/* initializes RS decoder and allocates memory for interleaving */
	static int ecc_init(fec_handle *f, rs_unique_ptr& rs,
	std::unique_ptr<uint8_t[]>& ecc_data)
	{
	check(f);

	rs.reset(init_rs_char(FEC_PARAMS(f->ecc.roots)));

	if (unlikely(!rs)) {
	error("failed to initialize RS");
	errno = ENOMEM;
	return -1;
	}

	ecc_data.reset(new (std::nothrow) uint8_t[FEC_RSM * FEC_BLOCKSIZE]);

	if (unlikely(!ecc_data)) {
	error("failed to allocate ecc buffer");
	errno = ENOMEM;
	return -1;
	}

	return 0;
	}

	/* reads `count' bytes from `offset' and corrects possible errors without
	erasure detection, returning the number of corrected bytes in `errors' */
	static ssize_t ecc_read(fec_handle f, uint8_t dest, size_t count,
	uint64_t offset, size_t *errors)
	{
	check(f);
	check(dest);
	check(offset < f->data_size);
	check(offset + count <= f->data_size);
	check(errors);

	debug("[%" PRIu64 ", %" PRIu64 ")", offset, offset + count);

	rs_unique_ptr rs(NULL, free_rs_char);
	std::unique_ptr<uint8_t[]> ecc_data;

	if (ecc_init(f, rs, ecc_data) == -1) {
	return -1;
	}

	uint64_t curr = offset / FEC_BLOCKSIZE;
	size_t coff = (size_t)(offset - curr * FEC_BLOCKSIZE);
	size_t left = count;

	uint8_t data[FEC_BLOCKSIZE];

	while (left > 0) {
	/* there's no erasure detection without verity metadata */
	if (__ecc_read(f, rs.get(), data, curr * FEC_BLOCKSIZE, false,
	ecc_data.get(), errors) == -1) {
	return -1;
	}

	size_t copy = FEC_BLOCKSIZE - coff;

	if (copy > left) {
	copy = left;
	}

	memcpy(dest, &data[coff], copy);

	dest += copy;
	left -= copy;
	coff = 0;
	++curr;
	}

	return count;
	}

	/* reads `count' bytes from `offset', corrects possible errors with
	erasure detection, and verifies the integrity of read data using
	verity hash tree; returns the number of corrections in `errors' */
	static ssize_t verity_read(fec_handle f, uint8_t dest, size_t count,
	uint64_t offset, size_t *errors)
	{
	check(f);
	check(dest);
	check(offset < f->data_size);
	check(offset + count <= f->data_size);
	check(!f->hashtree().hash_data.empty());
	check(errors);

	debug("[%" PRIu64 ", %" PRIu64 ")", offset, offset + count);

	rs_unique_ptr rs(NULL, free_rs_char);
	std::unique_ptr<uint8_t[]> ecc_data;

	if (f->ecc.start && ecc_init(f, rs, ecc_data) == -1) {
	return -1;
	}

	uint64_t curr = offset / FEC_BLOCKSIZE;
	size_t coff = (size_t)(offset - curr * FEC_BLOCKSIZE);
	size_t left = count;
	uint8_t data[FEC_BLOCKSIZE];

	uint64_t max_hash_block =
	(f->hashtree().hash_data.size() - SHA256_DIGEST_LENGTH) /
	SHA256_DIGEST_LENGTH;

	while (left > 0) {
	check(curr <= max_hash_block);
	uint64_t curr_offset = curr * FEC_BLOCKSIZE;

	bool expect_zeros = is_zero(f, curr_offset);

	/* if we are in read-only mode and expect to read a zero block,
	skip reading and just return zeros */
	if ((f->mode & O_ACCMODE) == O_RDONLY && expect_zeros) {
	memset(data, 0, FEC_BLOCKSIZE);
	goto valid;
	}

	/* copy raw data without error correction */
	if (!raw_pread(f->fd, data, FEC_BLOCKSIZE, curr_offset)) {
	if (errno == EIO) {
	warn("I/O error encounter when reading, attempting to recover using fec");
	} else {
	error("failed to read: %s", strerror(errno));
	return -1;
	}
	}

	if (likely(f->hashtree().check_block_hash_with_index(curr, data))) {
	goto valid;
	}

	/* we know the block is supposed to contain zeros, so return zeros
	instead of trying to correct it */
	if (expect_zeros) {
	memset(data, 0, FEC_BLOCKSIZE);
	goto corrected;
	}

	if (!f->ecc.start) {
	/* fatal error without ecc */
	error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64,
	offset, offset + count, curr);
	return -1;
	} else {
	debug("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64,
	offset, offset + count, curr);
	}

	/* try to correct without erasures first, because checking for
	erasure locations is slower */
	if (__ecc_read(f, rs.get(), data, curr_offset, false, ecc_data.get(),
	errors) == FEC_BLOCKSIZE &&
	f->hashtree().check_block_hash_with_index(curr, data)) {
	goto corrected;
	}

	/* try to correct with erasures */
	if (__ecc_read(f, rs.get(), data, curr_offset, true, ecc_data.get(),
	errors) == FEC_BLOCKSIZE &&
	f->hashtree().check_block_hash_with_index(curr, data)) {
	goto corrected;
	}

	error("[%" PRIu64 ", %" PRIu64 "): corrupted block %" PRIu64
	" (offset %" PRIu64 ") cannot be recovered",
	offset, offset + count, curr, curr_offset);
	dump("decoded block", curr, data, FEC_BLOCKSIZE);

	errno = EIO;
	return -1;

	corrected:
	/* update the corrected block to the file if we are in r/w mode */
	if (f->mode & O_RDWR &&
	!raw_pwrite(f->fd, data, FEC_BLOCKSIZE, curr_offset)) {
	error("failed to write: %s", strerror(errno));
	return -1;
	}

	valid:
	size_t copy = FEC_BLOCKSIZE - coff;

	if (copy > left) {
	copy = left;
	}

	memcpy(dest, &data[coff], copy);

	dest += copy;
	left -= copy;
	coff = 0;
	++curr;
	}

	return count;
	}

	/* sets the internal file position to `offset' relative to `whence' */
	int fec_seek(struct fec_handle *f, int64_t offset, int whence)
	{
	check(f);

	if (whence == SEEK_SET) {
	if (offset < 0) {
	errno = EOVERFLOW;
	return -1;
	}

	f->pos = offset;
	} else if (whence == SEEK_CUR) {
	if (offset < 0 && f->pos < (uint64_t)-offset) {
	errno = EOVERFLOW;
	return -1;
	} else if (offset > 0 && (uint64_t)offset > UINT64_MAX - f->pos) {
	errno = EOVERFLOW;
	return -1;
	}

	f->pos += offset;
	} else if (whence == SEEK_END) {
	if (offset >= 0) {
	errno = ENXIO;
	return -1;
	} else if ((uint64_t)-offset > f->size) {
	errno = EOVERFLOW;
	return -1;
	}

	f->pos = f->size + offset;
	} else {
	errno = EINVAL;
	return -1;
	}

	return 0;
	}

	/* reads up to `count' bytes starting from the internal file position using
	error correction and integrity validation, if available */
	ssize_t fec_read(struct fec_handle f, void buf, size_t count)
	{
	ssize_t rc = fec_pread(f, buf, count, f->pos);

	if (rc > 0) {
	check(f->pos < UINT64_MAX - rc);
	f->pos += rc;
	}

	return rc;
	}

	/* for a file with size `max', returns the number of bytes we can read starting
	from `offset', up to `count' bytes */
	static inline size_t get_max_count(uint64_t offset, size_t count, uint64_t max)
	{
	if (offset >= max) {
	return 0;
	} else if (offset > max - count) {
	return (size_t)(max - offset);
	}

	return count;
	}

	/* reads `count' bytes from `f->fd' starting from `offset', and copies the
	data to `buf' */
	bool raw_pread(int fd, void *buf, size_t count, uint64_t offset) {
	check(buf);

	uint8_t p = (uint8_t )buf;
	size_t remaining = count;

	while (remaining > 0) {
	ssize_t n = TEMP_FAILURE_RETRY(pread64(fd, p, remaining, offset));

	if (n <= 0) {
	return false;
	}

	p += n;
	remaining -= n;
	offset += n;
	}

	return true;
	}

	/* writes `count' bytes from `buf' to `f->fd' to a file position `offset' */
	bool raw_pwrite(int fd, const void *buf, size_t count, uint64_t offset) {
	check(buf);

	const uint8_t p = (const uint8_t )buf;
	size_t remaining = count;

	while (remaining > 0) {
	ssize_t n = TEMP_FAILURE_RETRY(pwrite64(fd, p, remaining, offset));

	if (n <= 0) {
	return false;
	}

	p += n;
	remaining -= n;
	offset += n;
	}

	return true;
	}

	/* reads up to `count' bytes starting from `offset' using error correction and
	integrity validation, if available */
	ssize_t fec_pread(struct fec_handle f, void buf, size_t count,
	uint64_t offset)
	{
	check(f);
	check(buf);

	if (unlikely(offset > UINT64_MAX - count)) {
	errno = EOVERFLOW;
	return -1;
	}

	if (!f->hashtree().hash_data.empty()) {
	return process(f, (uint8_t *)buf,
	get_max_count(offset, count, f->data_size), offset,
	verity_read);
	} else if (f->ecc.start) {
	check(f->ecc.start < f->size);

	count = get_max_count(offset, count, f->data_size);
	ssize_t rc = process(f, (uint8_t *)buf, count, offset, ecc_read);

	if (rc >= 0) {
	return rc;
	}

	/* return raw data if pure ecc read fails; due to interleaving
	the specific blocks the caller wants may still be fine */
	} else {
	count = get_max_count(offset, count, f->size);
	}

	if (raw_pread(f->fd, buf, count, offset)) {
	return count;
	}

	return -1;
	}