| // Copyright (c) 2015 Sandstorm Development Group, Inc. and contributors |
| // Licensed under the MIT License: |
| // |
| // Permission is hereby granted, free of charge, to any person obtaining a copy |
| // of this software and associated documentation files (the "Software"), to deal |
| // in the Software without restriction, including without limitation the rights |
| // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| // copies of the Software, and to permit persons to whom the Software is |
| // furnished to do so, subject to the following conditions: |
| // |
| // The above copyright notice and this permission notice shall be included in |
| // all copies or substantial portions of the Software. |
| // |
| // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| // THE SOFTWARE. |
| |
| #if !_WIN32 |
| |
| #ifndef _GNU_SOURCE |
| #define _GNU_SOURCE |
| #endif |
| |
| #include "filesystem.h" |
| #include "debug.h" |
| #include <sys/types.h> |
| #include <sys/stat.h> |
| #include <sys/ioctl.h> |
| #include <fcntl.h> |
| #include <unistd.h> |
| #include <stdio.h> |
| #include <sys/mman.h> |
| #include <errno.h> |
| #include <dirent.h> |
| #include <stdlib.h> |
| #include "vector.h" |
| #include "miniposix.h" |
| #include <algorithm> |
| |
| #if __linux__ |
| #include <syscall.h> |
| #include <linux/fs.h> |
| #include <sys/sendfile.h> |
| #endif |
| |
| namespace kj { |
| namespace { |
| |
| #define HIDDEN_PREFIX ".kj-tmp." |
| // Prefix for temp files which should be hidden when listing a directory. |
| // |
| // If you change this, make sure to update the unit test. |
| |
| #ifdef O_CLOEXEC |
| #define MAYBE_O_CLOEXEC O_CLOEXEC |
| #else |
| #define MAYBE_O_CLOEXEC 0 |
| #endif |
| |
| #ifdef O_DIRECTORY |
| #define MAYBE_O_DIRECTORY O_DIRECTORY |
| #else |
| #define MAYBE_O_DIRECTORY 0 |
| #endif |
| |
| #if __APPLE__ |
| // Mac OSX defines SEEK_HOLE, but it doesn't work. ("Inappropriate ioctl for device", it says.) |
| #undef SEEK_HOLE |
| #endif |
| |
| #if __BIONIC__ |
// Bionic has no DTTOIF function, so pretend d_type is unavailable.
| #undef DT_UNKNOWN |
| #endif |
| |
| static void setCloexec(int fd) KJ_UNUSED; |
| static void setCloexec(int fd) { |
| // Set the O_CLOEXEC flag on the given fd. |
| // |
| // We try to avoid the need to call this by taking advantage of syscall flags that set it |
| // atomically on new file descriptors. Unfortunately some platforms do not support such syscalls. |
| |
| #ifdef FIOCLEX |
| // Yay, we can set the flag in one call. |
| KJ_SYSCALL_HANDLE_ERRORS(ioctl(fd, FIOCLEX)) { |
| case EINVAL: |
| case EOPNOTSUPP: |
| break; |
| default: |
| KJ_FAIL_SYSCALL("ioctl(fd, FIOCLEX)", error) { break; } |
| break; |
| } else { |
| // success |
| return; |
| } |
| #endif |
| |
| // Sadness, we must resort to read/modify/write. |
| // |
| // (On many platforms, FD_CLOEXEC is the only flag modifiable via F_SETFD and therefore we could |
| // skip the read... but it seems dangerous to assume that's true of all platforms, and anyway |
| // most platforms support FIOCLEX.) |
| int flags; |
| KJ_SYSCALL(flags = fcntl(fd, F_GETFD)); |
| if (!(flags & FD_CLOEXEC)) { |
| KJ_SYSCALL(fcntl(fd, F_SETFD, flags | FD_CLOEXEC)); |
| } |
| } |
| |
| static Date toKjDate(struct timespec tv) { |
| return tv.tv_sec * SECONDS + tv.tv_nsec * NANOSECONDS + UNIX_EPOCH; |
| } |
| |
| static FsNode::Type modeToType(mode_t mode) { |
| switch (mode & S_IFMT) { |
| case S_IFREG : return FsNode::Type::FILE; |
| case S_IFDIR : return FsNode::Type::DIRECTORY; |
| case S_IFLNK : return FsNode::Type::SYMLINK; |
| case S_IFBLK : return FsNode::Type::BLOCK_DEVICE; |
| case S_IFCHR : return FsNode::Type::CHARACTER_DEVICE; |
| case S_IFIFO : return FsNode::Type::NAMED_PIPE; |
| case S_IFSOCK: return FsNode::Type::SOCKET; |
| default: return FsNode::Type::OTHER; |
| } |
| } |
| |
| static FsNode::Metadata statToMetadata(struct stat& stats) { |
| // Probably st_ino and st_dev are usually under 32 bits, so mix by rotating st_dev left 32 bits |
| // and XOR. |
| uint64_t d = stats.st_dev; |
| uint64_t hash = ((d << 32) | (d >> 32)) ^ stats.st_ino; |
| |
| return FsNode::Metadata { |
| modeToType(stats.st_mode), |
| implicitCast<uint64_t>(stats.st_size), |
| implicitCast<uint64_t>(stats.st_blocks * 512u), |
| #if __APPLE__ |
| toKjDate(stats.st_mtimespec), |
| #else |
| toKjDate(stats.st_mtim), |
| #endif |
| implicitCast<uint>(stats.st_nlink), |
| hash |
| }; |
| } |
| |
| static bool rmrf(int fd, StringPtr path); |
| |
| static void rmrfChildrenAndClose(int fd) { |
| // Assumes fd is seeked to beginning. |
| |
| DIR* dir = fdopendir(fd); |
| if (dir == nullptr) { |
| close(fd); |
| KJ_FAIL_SYSCALL("fdopendir", errno); |
| }; |
| KJ_DEFER(closedir(dir)); |
| |
| for (;;) { |
| errno = 0; |
| struct dirent* entry = readdir(dir); |
| if (entry == nullptr) { |
| int error = errno; |
| if (error == 0) { |
| break; |
| } else { |
| KJ_FAIL_SYSCALL("readdir", error); |
| } |
| } |
| |
| if (entry->d_name[0] == '.' && |
| (entry->d_name[1] == '\0' || |
| (entry->d_name[1] == '.' && |
| entry->d_name[2] == '\0'))) { |
| // ignore . and .. |
| } else { |
| #ifdef DT_UNKNOWN // d_type is not available on all platforms. |
| if (entry->d_type == DT_DIR) { |
| int subdirFd; |
| KJ_SYSCALL(subdirFd = openat( |
| fd, entry->d_name, O_RDONLY | MAYBE_O_DIRECTORY | MAYBE_O_CLOEXEC)); |
| rmrfChildrenAndClose(subdirFd); |
| KJ_SYSCALL(unlinkat(fd, entry->d_name, AT_REMOVEDIR)); |
| } else if (entry->d_type != DT_UNKNOWN) { |
| KJ_SYSCALL(unlinkat(fd, entry->d_name, 0)); |
| } else { |
| #endif |
| KJ_ASSERT(rmrf(fd, entry->d_name)); |
| #ifdef DT_UNKNOWN |
| } |
| #endif |
| } |
| } |
| } |
| |
| static bool rmrf(int fd, StringPtr path) { |
| struct stat stats; |
| KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, path.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { |
| case ENOENT: |
| case ENOTDIR: |
| // Doesn't exist. |
| return false; |
| default: |
| KJ_FAIL_SYSCALL("lstat(path)", error, path) { return false; } |
| } |
| |
| if (S_ISDIR(stats.st_mode)) { |
| int subdirFd; |
| KJ_SYSCALL(subdirFd = openat( |
| fd, path.cStr(), O_RDONLY | MAYBE_O_DIRECTORY | MAYBE_O_CLOEXEC)) { return false; } |
| rmrfChildrenAndClose(subdirFd); |
| KJ_SYSCALL(unlinkat(fd, path.cStr(), AT_REMOVEDIR)) { return false; } |
| } else { |
| KJ_SYSCALL(unlinkat(fd, path.cStr(), 0)) { return false; } |
| } |
| |
| return true; |
| } |
| |
struct MmapRange {
  // A page-aligned starting offset plus the byte count measured from that offset.
  uint64_t offset;
  uint64_t size;
};

static MmapRange getMmapRange(uint64_t offset, uint64_t size) {
  // Translates a caller-requested (offset, size) into values acceptable to mmap(), whose
  // mappings must begin on a page boundary.
  //
  // The start is moved back to the preceding page boundary and the size grows by the same
  // amount. The end of the mapping is deliberately *not* rounded up to a page boundary: mmap()
  // does not need that, and doing so causes trouble on some systems (notably Cygwin).

#ifndef _SC_PAGESIZE
#define _SC_PAGESIZE _SC_PAGE_SIZE
#endif
  static const uint64_t pageSize = sysconf(_SC_PAGESIZE);

  // How far `offset` lies past the preceding page boundary.
  const uint64_t misalignment = offset & (pageSize - 1);

  MmapRange result;
  result.offset = offset - misalignment;
  result.size = size + misalignment;
  return result;
}
| |
| class MmapDisposer: public ArrayDisposer { |
| protected: |
| void disposeImpl(void* firstElement, size_t elementSize, size_t elementCount, |
| size_t capacity, void (*destroyElement)(void*)) const { |
| auto range = getMmapRange(reinterpret_cast<uintptr_t>(firstElement), |
| elementSize * elementCount); |
| KJ_SYSCALL(munmap(reinterpret_cast<byte*>(range.offset), range.size)) { break; } |
| } |
| }; |
| |
| constexpr MmapDisposer mmapDisposer = MmapDisposer(); |
| |
| class DiskHandle { |
| // We need to implement each of ReadableFile, AppendableFile, File, ReadableDirectory, and |
| // Directory for disk handles. There is a lot of implementation overlap between these, especially |
| // stat(), sync(), etc. We can't have everything inherit from a common DiskFsNode that implements |
| // these because then we get diamond inheritance which means we need to make all our inheritance |
| // virtual which means downcasting requires RTTI which violates our goal of supporting compiling |
| // with no RTTI. So instead we have the DiskHandle class which implements all the methods without |
| // inheriting anything, and then we have DiskFile, DiskDirectory, etc. hold this and delegate to |
| // it. Ugly, but works. |
| |
| public: |
| DiskHandle(AutoCloseFd&& fd): fd(kj::mv(fd)) {} |
| |
| // OsHandle ------------------------------------------------------------------ |
| |
| AutoCloseFd clone() const { |
| int fd2; |
| #ifdef F_DUPFD_CLOEXEC |
| KJ_SYSCALL_HANDLE_ERRORS(fd2 = fcntl(fd, F_DUPFD_CLOEXEC, 3)) { |
| case EINVAL: |
| case EOPNOTSUPP: |
| // fall back |
| break; |
| default: |
| KJ_FAIL_SYSCALL("fnctl(fd, F_DUPFD_CLOEXEC, 3)", error) { break; } |
| break; |
| } else { |
| return AutoCloseFd(fd2); |
| } |
| #endif |
| |
| KJ_SYSCALL(fd2 = ::dup(fd)); |
| AutoCloseFd result(fd2); |
| setCloexec(result); |
| return result; |
| } |
| |
| int getFd() const { |
| return fd.get(); |
| } |
| |
| // FsNode -------------------------------------------------------------------- |
| |
| FsNode::Metadata stat() const { |
| struct stat stats; |
| KJ_SYSCALL(::fstat(fd, &stats)); |
| return statToMetadata(stats); |
| } |
| |
| void sync() const { |
| #if __APPLE__ |
| // For whatever reason, fsync() on OSX only flushes kernel buffers. It does not flush hardware |
| // disk buffers. This makes it not very useful. But OSX documents fcntl F_FULLFSYNC which does |
| // the right thing. Why they don't just make fsync() do the right thing, I do not know. |
| KJ_SYSCALL(fcntl(fd, F_FULLFSYNC)); |
| #else |
| KJ_SYSCALL(fsync(fd)); |
| #endif |
| } |
| |
| void datasync() const { |
| // The presence of the _POSIX_SYNCHRONIZED_IO define is supposed to tell us that fdatasync() |
| // exists. But Apple defines this yet doesn't offer fdatasync(). Thanks, Apple. |
| #if _POSIX_SYNCHRONIZED_IO && !__APPLE__ |
| KJ_SYSCALL(fdatasync(fd)); |
| #else |
| this->sync(); |
| #endif |
| } |
| |
| // ReadableFile -------------------------------------------------------------- |
| |
| size_t read(uint64_t offset, ArrayPtr<byte> buffer) const { |
| // pread() probably never returns short reads unless it hits EOF. Unfortunately, though, per |
| // spec we are not allowed to assume this. |
| |
| size_t total = 0; |
| while (buffer.size() > 0) { |
| ssize_t n; |
| KJ_SYSCALL(n = pread(fd, buffer.begin(), buffer.size(), offset)); |
| if (n == 0) break; |
| total += n; |
| offset += n; |
| buffer = buffer.slice(n, buffer.size()); |
| } |
| return total; |
| } |
| |
| Array<const byte> mmap(uint64_t offset, uint64_t size) const { |
| if (size == 0) return nullptr; // zero-length mmap() returns EINVAL, so avoid it |
| auto range = getMmapRange(offset, size); |
| const void* mapping = ::mmap(NULL, range.size, PROT_READ, MAP_SHARED, fd, range.offset); |
| if (mapping == MAP_FAILED) { |
| KJ_FAIL_SYSCALL("mmap", errno); |
| } |
| return Array<const byte>(reinterpret_cast<const byte*>(mapping) + (offset - range.offset), |
| size, mmapDisposer); |
| } |
| |
| Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const { |
| if (size == 0) return nullptr; // zero-length mmap() returns EINVAL, so avoid it |
| auto range = getMmapRange(offset, size); |
| void* mapping = ::mmap(NULL, range.size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, range.offset); |
| if (mapping == MAP_FAILED) { |
| KJ_FAIL_SYSCALL("mmap", errno); |
| } |
| return Array<byte>(reinterpret_cast<byte*>(mapping) + (offset - range.offset), |
| size, mmapDisposer); |
| } |
| |
| // File ---------------------------------------------------------------------- |
| |
| void write(uint64_t offset, ArrayPtr<const byte> data) const { |
| // pwrite() probably never returns short writes unless there's no space left on disk. |
| // Unfortunately, though, per spec we are not allowed to assume this. |
| |
| while (data.size() > 0) { |
| ssize_t n; |
| KJ_SYSCALL(n = pwrite(fd, data.begin(), data.size(), offset)); |
| KJ_ASSERT(n > 0, "pwrite() returned zero?"); |
| offset += n; |
| data = data.slice(n, data.size()); |
| } |
| } |
| |
| void zero(uint64_t offset, uint64_t size) const { |
| // If FALLOC_FL_PUNCH_HOLE is defined, use it to efficiently zero the area. |
| // |
| // A fallocate() wrapper was only added to Android's Bionic C library as of API level 21, |
| // but FALLOC_FL_PUNCH_HOLE is apparently defined in the headers before that, so we'll |
| // have to explicitly test for that case. |
| #if defined(FALLOC_FL_PUNCH_HOLE) && !(__ANDROID__ && __BIONIC__ && __ANDROID_API__ < 21) |
| KJ_SYSCALL_HANDLE_ERRORS( |
| fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, size)) { |
| case EOPNOTSUPP: |
| // fall back to below |
| break; |
| default: |
| KJ_FAIL_SYSCALL("fallocate(FALLOC_FL_PUNCH_HOLE)", error) { return; } |
| } else { |
| return; |
| } |
| #endif |
| |
| static const byte ZEROS[4096] = { 0 }; |
| |
| #if __APPLE__ || __CYGWIN__ || (defined(__ANDROID__) && __ANDROID_API__ < 24) |
| // Mac & Cygwin & Android API levels 23 and lower doesn't have pwritev(). |
| while (size > sizeof(ZEROS)) { |
| write(offset, ZEROS); |
| size -= sizeof(ZEROS); |
| offset += sizeof(ZEROS); |
| } |
| write(offset, kj::arrayPtr(ZEROS, size)); |
| #else |
| // Use a 4k buffer of zeros amplified by iov to write zeros with as few syscalls as possible. |
| size_t count = (size + sizeof(ZEROS) - 1) / sizeof(ZEROS); |
| const size_t iovmax = miniposix::iovMax(); |
| KJ_STACK_ARRAY(struct iovec, iov, kj::min(iovmax, count), 16, 256); |
| |
| for (auto& item: iov) { |
| item.iov_base = const_cast<byte*>(ZEROS); |
| item.iov_len = sizeof(ZEROS); |
| } |
| |
| while (size > 0) { |
| size_t iovCount; |
| if (size >= iov.size() * sizeof(ZEROS)) { |
| iovCount = iov.size(); |
| } else { |
| iovCount = size / sizeof(ZEROS); |
| size_t rem = size % sizeof(ZEROS); |
| if (rem > 0) { |
| iov[iovCount++].iov_len = rem; |
| } |
| } |
| |
| ssize_t n; |
| KJ_SYSCALL(n = pwritev(fd, iov.begin(), count, offset)); |
| KJ_ASSERT(n > 0, "pwrite() returned zero?"); |
| |
| offset += n; |
| size -= n; |
| } |
| #endif |
| } |
| |
| void truncate(uint64_t size) const { |
| KJ_SYSCALL(ftruncate(fd, size)); |
| } |
| |
| class WritableFileMappingImpl final: public WritableFileMapping { |
| public: |
| WritableFileMappingImpl(Array<byte> bytes): bytes(kj::mv(bytes)) {} |
| |
| ArrayPtr<byte> get() const override { |
| // const_cast OK because WritableFileMapping does indeed provide a writable view despite |
| // being const itself. |
| return arrayPtr(const_cast<byte*>(bytes.begin()), bytes.size()); |
| } |
| |
| void changed(ArrayPtr<byte> slice) const override { |
| KJ_REQUIRE(slice.begin() >= bytes.begin() && slice.end() <= bytes.end(), |
| "byte range is not part of this mapping"); |
| if (slice.size() == 0) return; |
| |
| // msync() requires page-alignment, apparently, so use getMmapRange() to accomplish that. |
| auto range = getMmapRange(reinterpret_cast<uintptr_t>(slice.begin()), slice.size()); |
| KJ_SYSCALL(msync(reinterpret_cast<void*>(range.offset), range.size, MS_ASYNC)); |
| } |
| |
| void sync(ArrayPtr<byte> slice) const override { |
| KJ_REQUIRE(slice.begin() >= bytes.begin() && slice.end() <= bytes.end(), |
| "byte range is not part of this mapping"); |
| if (slice.size() == 0) return; |
| |
| // msync() requires page-alignment, apparently, so use getMmapRange() to accomplish that. |
| auto range = getMmapRange(reinterpret_cast<uintptr_t>(slice.begin()), slice.size()); |
| KJ_SYSCALL(msync(reinterpret_cast<void*>(range.offset), range.size, MS_SYNC)); |
| } |
| |
| private: |
| Array<byte> bytes; |
| }; |
| |
| Own<const WritableFileMapping> mmapWritable(uint64_t offset, uint64_t size) const { |
| if (size == 0) { |
| // zero-length mmap() returns EINVAL, so avoid it |
| return heap<WritableFileMappingImpl>(nullptr); |
| } |
| auto range = getMmapRange(offset, size); |
| void* mapping = ::mmap(NULL, range.size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, range.offset); |
| if (mapping == MAP_FAILED) { |
| KJ_FAIL_SYSCALL("mmap", errno); |
| } |
| auto array = Array<byte>(reinterpret_cast<byte*>(mapping) + (offset - range.offset), |
| size, mmapDisposer); |
| return heap<WritableFileMappingImpl>(kj::mv(array)); |
| } |
| |
| size_t copyChunk(uint64_t offset, int fromFd, uint64_t fromOffset, uint64_t size) const { |
| // Copies a range of bytes from `fromFd` to this file in the most efficient way possible for |
| // the OS. Only returns less than `size` if EOF. Does not account for holes. |
| |
| #if __linux__ |
| { |
| KJ_SYSCALL(lseek(fd, offset, SEEK_SET)); |
| off_t fromPos = fromOffset; |
| off_t end = fromOffset + size; |
| while (fromPos < end) { |
| ssize_t n; |
| KJ_SYSCALL_HANDLE_ERRORS(n = sendfile(fd, fromFd, &fromPos, end - fromPos)) { |
| case EINVAL: |
| case ENOSYS: |
| goto sendfileNotAvailable; |
| default: |
| KJ_FAIL_SYSCALL("sendfile", error) { return fromPos - fromOffset; } |
| } |
| if (n == 0) break; |
| } |
| return fromPos - fromOffset; |
| } |
| |
| sendfileNotAvailable: |
| #endif |
| uint64_t total = 0; |
| while (size > 0) { |
| byte buffer[4096]; |
| ssize_t n; |
| KJ_SYSCALL(n = pread(fromFd, buffer, kj::min(sizeof(buffer), size), fromOffset)); |
| if (n == 0) break; |
| write(offset, arrayPtr(buffer, n)); |
| fromOffset += n; |
| offset += n; |
| total += n; |
| size -= n; |
| } |
| return total; |
| } |
| |
| kj::Maybe<size_t> copy(uint64_t offset, const ReadableFile& from, |
| uint64_t fromOffset, uint64_t size) const { |
| KJ_IF_MAYBE(otherFd, from.getFd()) { |
| #ifdef FICLONE |
| if (offset == 0 && fromOffset == 0 && size == kj::maxValue && stat().size == 0) { |
| if (ioctl(fd, FICLONE, *otherFd) >= 0) { |
| return stat().size; |
| } |
| } else if (size > 0) { // src_length = 0 has special meaning for the syscall, so avoid. |
| struct file_clone_range range; |
| memset(&range, 0, sizeof(range)); |
| range.src_fd = *otherFd; |
| range.dest_offset = offset; |
| range.src_offset = fromOffset; |
| range.src_length = size == kj::maxValue ? 0 : size; |
| if (ioctl(fd, FICLONERANGE, &range) >= 0) { |
| // TODO(someday): What does FICLONERANGE actually do if the range goes past EOF? The docs |
| // don't say. Maybe it only copies the parts that exist. Maybe it punches holes for the |
| // rest. Where does the destination file's EOF marker end up? Who knows? |
| return kj::min(from.stat().size - fromOffset, size); |
| } |
| } else { |
| // size == 0 |
| return size_t(0); |
| } |
| |
| // ioctl failed. Almost all failures documented for these are of the form "the operation is |
| // not supported for the filesystem(s) specified", so fall back to other approaches. |
| #endif |
| |
| off_t toPos = offset; |
| off_t fromPos = fromOffset; |
| off_t end = size == kj::maxValue ? off_t(kj::maxValue) : off_t(fromOffset + size); |
| |
| for (;;) { |
| // Handle data. |
| { |
| // Find out how much data there is before the next hole. |
| off_t nextHole; |
| #ifdef SEEK_HOLE |
| KJ_SYSCALL_HANDLE_ERRORS(nextHole = lseek(*otherFd, fromPos, SEEK_HOLE)) { |
| case EINVAL: |
| // SEEK_HOLE probably not supported. Assume no holes. |
| nextHole = end; |
| break; |
| case ENXIO: |
| // Past EOF. Stop here. |
| return fromPos - fromOffset; |
| default: |
| KJ_FAIL_SYSCALL("lseek(fd, pos, SEEK_HOLE)", error) { return fromPos - fromOffset; } |
| } |
| #else |
| // SEEK_HOLE not supported. Assume no holes. |
| nextHole = end; |
| #endif |
| |
| // Copy the next chunk of data. |
| off_t copyTo = kj::min(end, nextHole); |
| size_t amount = copyTo - fromPos; |
| if (amount > 0) { |
| size_t n = copyChunk(toPos, *otherFd, fromPos, amount); |
| fromPos += n; |
| toPos += n; |
| |
| if (n < amount) { |
| return fromPos - fromOffset; |
| } |
| } |
| |
| if (fromPos == end) { |
| return fromPos - fromOffset; |
| } |
| } |
| |
| #ifdef SEEK_HOLE |
| // Handle hole. |
| { |
| // Find out how much hole there is before the next data. |
| off_t nextData; |
| KJ_SYSCALL_HANDLE_ERRORS(nextData = lseek(*otherFd, fromPos, SEEK_DATA)) { |
| case EINVAL: |
| // SEEK_DATA probably not supported. But we should only have gotten here if we |
| // were expecting a hole. |
| KJ_FAIL_ASSERT("can't determine hole size; SEEK_DATA not supported"); |
| break; |
| case ENXIO: |
| // No more data. Set to EOF. |
| KJ_SYSCALL(nextData = lseek(*otherFd, 0, SEEK_END)); |
| if (nextData > end) { |
| end = nextData; |
| } |
| break; |
| default: |
| KJ_FAIL_SYSCALL("lseek(fd, pos, SEEK_HOLE)", error) { return fromPos - fromOffset; } |
| } |
| |
| // Write zeros. |
| off_t zeroTo = kj::min(end, nextData); |
| off_t amount = zeroTo - fromPos; |
| if (amount > 0) { |
| zero(toPos, amount); |
| toPos += amount; |
| fromPos = zeroTo; |
| } |
| |
| if (fromPos == end) { |
| return fromPos - fromOffset; |
| } |
| } |
| #endif |
| } |
| } |
| |
| // Indicates caller should call File::copy() default implementation. |
| return nullptr; |
| } |
| |
| // ReadableDirectory --------------------------------------------------------- |
| |
| template <typename Func> |
| auto list(bool needTypes, Func&& func) const |
| -> Array<Decay<decltype(func(instance<StringPtr>(), instance<FsNode::Type>()))>> { |
| // Seek to start of directory. |
| KJ_SYSCALL(lseek(fd, 0, SEEK_SET)); |
| |
| // Unfortunately, fdopendir() takes ownership of the file descriptor. Therefore we need to |
| // make a duplicate. |
| int duped; |
| KJ_SYSCALL(duped = dup(fd)); |
| DIR* dir = fdopendir(duped); |
| if (dir == nullptr) { |
| close(duped); |
| KJ_FAIL_SYSCALL("fdopendir", errno); |
| } |
| |
| KJ_DEFER(closedir(dir)); |
| typedef Decay<decltype(func(instance<StringPtr>(), instance<FsNode::Type>()))> Entry; |
| kj::Vector<Entry> entries; |
| |
| for (;;) { |
| errno = 0; |
| struct dirent* entry = readdir(dir); |
| if (entry == nullptr) { |
| int error = errno; |
| if (error == 0) { |
| break; |
| } else { |
| KJ_FAIL_SYSCALL("readdir", error); |
| } |
| } |
| |
| kj::StringPtr name = entry->d_name; |
| if (name != "." && name != ".." && !name.startsWith(HIDDEN_PREFIX)) { |
| #ifdef DT_UNKNOWN // d_type is not available on all platforms. |
| if (entry->d_type != DT_UNKNOWN) { |
| entries.add(func(name, modeToType(DTTOIF(entry->d_type)))); |
| } else { |
| #endif |
| if (needTypes) { |
| // Unknown type. Fall back to stat. |
| struct stat stats; |
| KJ_SYSCALL(fstatat(fd, name.cStr(), &stats, AT_SYMLINK_NOFOLLOW)); |
| entries.add(func(name, modeToType(stats.st_mode))); |
| } else { |
| entries.add(func(name, FsNode::Type::OTHER)); |
| } |
| #ifdef DT_UNKNOWN |
| } |
| #endif |
| } |
| } |
| |
| auto result = entries.releaseAsArray(); |
| std::sort(result.begin(), result.end()); |
| return result; |
| } |
| |
| Array<String> listNames() const { |
| return list(false, [](StringPtr name, FsNode::Type type) { return heapString(name); }); |
| } |
| |
| Array<ReadableDirectory::Entry> listEntries() const { |
| return list(true, [](StringPtr name, FsNode::Type type) { |
| return ReadableDirectory::Entry { type, heapString(name), }; |
| }); |
| } |
| |
| bool exists(PathPtr path) const { |
| KJ_SYSCALL_HANDLE_ERRORS(faccessat(fd, path.toString().cStr(), F_OK, 0)) { |
| case ENOENT: |
| case ENOTDIR: |
| return false; |
| default: |
| KJ_FAIL_SYSCALL("faccessat(fd, path)", error, path) { return false; } |
| } |
| return true; |
| } |
| |
| Maybe<FsNode::Metadata> tryLstat(PathPtr path) const { |
| struct stat stats; |
| KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, path.toString().cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { |
| case ENOENT: |
| case ENOTDIR: |
| return nullptr; |
| default: |
| KJ_FAIL_SYSCALL("faccessat(fd, path)", error, path) { return nullptr; } |
| } |
| return statToMetadata(stats); |
| } |
| |
| Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const { |
| int newFd; |
| KJ_SYSCALL_HANDLE_ERRORS(newFd = openat( |
| fd, path.toString().cStr(), O_RDONLY | MAYBE_O_CLOEXEC)) { |
| case ENOENT: |
| case ENOTDIR: |
| return nullptr; |
| default: |
| KJ_FAIL_SYSCALL("openat(fd, path, O_RDONLY)", error, path) { return nullptr; } |
| } |
| |
| kj::AutoCloseFd result(newFd); |
| #ifndef O_CLOEXEC |
| setCloexec(result); |
| #endif |
| |
| return newDiskReadableFile(kj::mv(result)); |
| } |
| |
| Maybe<AutoCloseFd> tryOpenSubdirInternal(PathPtr path) const { |
| int newFd; |
| KJ_SYSCALL_HANDLE_ERRORS(newFd = openat( |
| fd, path.toString().cStr(), O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)) { |
| case ENOENT: |
| return nullptr; |
| case ENOTDIR: |
| // Could mean that a parent is not a directory, which we treat as "doesn't exist". |
| // Could also mean that the specified file is not a directory, which should throw. |
| // Check using exists(). |
| if (!exists(path)) { |
| return nullptr; |
| } |
| KJ_FALLTHROUGH; |
| default: |
| KJ_FAIL_SYSCALL("openat(fd, path, O_DIRECTORY)", error, path) { return nullptr; } |
| } |
| |
| kj::AutoCloseFd result(newFd); |
| #ifndef O_CLOEXEC |
| setCloexec(result); |
| #endif |
| |
| return kj::mv(result); |
| } |
| |
| Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const { |
| return tryOpenSubdirInternal(path).map(newDiskReadableDirectory); |
| } |
| |
| Maybe<String> tryReadlink(PathPtr path) const { |
| size_t trySize = 256; |
| for (;;) { |
| KJ_STACK_ARRAY(char, buf, trySize, 256, 4096); |
| ssize_t n = readlinkat(fd, path.toString().cStr(), buf.begin(), buf.size()); |
| if (n < 0) { |
| int error = errno; |
| switch (error) { |
| case EINTR: |
| continue; |
| case ENOENT: |
| case ENOTDIR: |
| case EINVAL: // not a link |
| return nullptr; |
| default: |
| KJ_FAIL_SYSCALL("readlinkat(fd, path)", error, path) { return nullptr; } |
| } |
| } |
| |
| if (n >= buf.size()) { |
| // Didn't give it enough space. Better retry with a bigger buffer. |
| trySize *= 2; |
| continue; |
| } |
| |
| return heapString(buf.begin(), n); |
| } |
| } |
| |
| // Directory ----------------------------------------------------------------- |
| |
| bool tryMkdir(PathPtr path, WriteMode mode, bool noThrow) const { |
| // Internal function to make a directory. |
| |
| auto filename = path.toString(); |
| mode_t acl = has(mode, WriteMode::PRIVATE) ? 0700 : 0777; |
| |
| KJ_SYSCALL_HANDLE_ERRORS(mkdirat(fd, filename.cStr(), acl)) { |
| case EEXIST: { |
| // Apparently this path exists. |
| if (!has(mode, WriteMode::MODIFY)) { |
| // Require exclusive create. |
| return false; |
| } |
| |
| // MODIFY is allowed, so we just need to check whether the existing entry is a directory. |
| struct stat stats; |
| KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd, filename.cStr(), &stats, 0)) { |
| default: |
| // mkdir() says EEXIST but we can't stat it. Maybe it's a dangling link, or maybe |
| // we can't access it for some reason. Assume failure. |
| // |
| // TODO(someday): Maybe we should be creating the directory at the target of the |
| // link? |
| goto failed; |
| } |
| return (stats.st_mode & S_IFMT) == S_IFDIR; |
| } |
| case ENOENT: |
| if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 && |
| tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY | |
| WriteMode::CREATE_PARENT, true)) { |
| // Retry, but make sure we don't try to create the parent again. |
| return tryMkdir(path, mode - WriteMode::CREATE_PARENT, noThrow); |
| } else { |
| goto failed; |
| } |
| default: |
| failed: |
| if (noThrow) { |
| // Caller requested no throwing. |
| return false; |
| } else { |
| KJ_FAIL_SYSCALL("mkdirat(fd, path)", error, path); |
| } |
| } |
| |
| return true; |
| } |
| |
| kj::Maybe<String> createNamedTemporary( |
| PathPtr finalName, WriteMode mode, Function<int(StringPtr)> tryCreate) const { |
| // Create a temporary file which will eventually replace `finalName`. |
| // |
| // Calls `tryCreate` to actually create the temporary, passing in the desired path. tryCreate() |
| // is expected to behave like a syscall, returning a negative value and setting `errno` on |
| // error. tryCreate() MUST fail with EEXIST if the path exists -- this is not checked in |
| // advance, since it needs to be checked atomically. In the case of EEXIST, tryCreate() will |
| // be called again with a new path. |
| // |
| // Returns the temporary path that succeeded. Only returns nullptr if there was an exception |
| // but we're compiled with -fno-exceptions. |
| |
| if (finalName.size() == 0) { |
| KJ_FAIL_REQUIRE("can't replace self") { break; } |
| return nullptr; |
| } |
| |
| static uint counter = 0; |
| static const pid_t pid = getpid(); |
| String pathPrefix; |
| if (finalName.size() > 1) { |
| pathPrefix = kj::str(finalName.parent(), '/'); |
| } |
| auto path = kj::str(pathPrefix, HIDDEN_PREFIX, pid, '.', counter++, '.', |
| finalName.basename()[0], ".partial"); |
| |
| KJ_SYSCALL_HANDLE_ERRORS(tryCreate(path)) { |
| case EEXIST: |
| return createNamedTemporary(finalName, mode, kj::mv(tryCreate)); |
| case ENOENT: |
| if (has(mode, WriteMode::CREATE_PARENT) && finalName.size() > 1 && |
| tryMkdir(finalName.parent(), WriteMode::CREATE | WriteMode::MODIFY | |
| WriteMode::CREATE_PARENT, true)) { |
| // Retry, but make sure we don't try to create the parent again. |
| mode = mode - WriteMode::CREATE_PARENT; |
| return createNamedTemporary(finalName, mode, kj::mv(tryCreate)); |
| } |
| KJ_FALLTHROUGH; |
| default: |
| KJ_FAIL_SYSCALL("create(path)", error, path) { break; } |
| return nullptr; |
| } |
| |
| return kj::mv(path); |
| } |
| |
| bool tryReplaceNode(PathPtr path, WriteMode mode, Function<int(StringPtr)> tryCreate) const { |
| // Replaces the given path with an object created by calling tryCreate(). |
| // |
| // tryCreate() must behave like a syscall which creates the node at the path passed to it, |
| // returning a negative value on error. If the path passed to tryCreate already exists, it |
| // MUST fail with EEXIST. |
| // |
| // When `mode` includes MODIFY, replaceNode() reacts to EEXIST by creating the node in a |
| // temporary location and then rename()ing it into place. |
| |
| if (path.size() == 0) { |
| KJ_FAIL_REQUIRE("can't replace self") { return false; } |
| } |
| |
| auto filename = path.toString(); |
| |
| if (has(mode, WriteMode::CREATE)) { |
| // First try just cerating the node in-place. |
| KJ_SYSCALL_HANDLE_ERRORS(tryCreate(filename)) { |
| case EEXIST: |
| // Target exists. |
| if (has(mode, WriteMode::MODIFY)) { |
| // Fall back to MODIFY path, below. |
| break; |
| } else { |
| return false; |
| } |
| case ENOENT: |
| if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 && |
| tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY | |
| WriteMode::CREATE_PARENT, true)) { |
| // Retry, but make sure we don't try to create the parent again. |
| return tryReplaceNode(path, mode - WriteMode::CREATE_PARENT, kj::mv(tryCreate)); |
| } |
| KJ_FALLTHROUGH; |
| default: |
| KJ_FAIL_SYSCALL("create(path)", error, path) { return false; } |
| } else { |
| // Success. |
| return true; |
| } |
| } |
| |
| // Either we don't have CREATE mode or the target already exists. We need to perform a |
| // replacement instead. |
| |
| KJ_IF_MAYBE(tempPath, createNamedTemporary(path, mode, kj::mv(tryCreate))) { |
| if (tryCommitReplacement(filename, fd, *tempPath, mode)) { |
| return true; |
| } else { |
| KJ_SYSCALL_HANDLE_ERRORS(unlinkat(fd, tempPath->cStr(), 0)) { |
| case ENOENT: |
| // meh |
| break; |
| default: |
| KJ_FAIL_SYSCALL("unlinkat(fd, tempPath, 0)", error, *tempPath); |
| } |
| return false; |
| } |
| } else { |
| // threw, but exceptions are disabled |
| return false; |
| } |
| } |
| |
| Maybe<AutoCloseFd> tryOpenFileInternal(PathPtr path, WriteMode mode, bool append) const { |
| uint flags = O_RDWR | MAYBE_O_CLOEXEC; |
| mode_t acl = 0666; |
| if (has(mode, WriteMode::CREATE)) { |
| flags |= O_CREAT; |
| } |
| if (!has(mode, WriteMode::MODIFY)) { |
| if (!has(mode, WriteMode::CREATE)) { |
| // Neither CREATE nor MODIFY -- impossible to satisfy preconditions. |
| return nullptr; |
| } |
| flags |= O_EXCL; |
| } |
| if (append) { |
| flags |= O_APPEND; |
| } |
| if (has(mode, WriteMode::EXECUTABLE)) { |
| acl = 0777; |
| } |
| if (has(mode, WriteMode::PRIVATE)) { |
| acl &= 0700; |
| } |
| |
| auto filename = path.toString(); |
| |
| int newFd; |
| KJ_SYSCALL_HANDLE_ERRORS(newFd = openat(fd, filename.cStr(), flags, acl)) { |
| case ENOENT: |
| if (has(mode, WriteMode::CREATE)) { |
| // Either: |
| // - The file is a broken symlink. |
| // - A parent directory didn't exist. |
| if (has(mode, WriteMode::CREATE_PARENT) && path.size() > 0 && |
| tryMkdir(path.parent(), WriteMode::CREATE | WriteMode::MODIFY | |
| WriteMode::CREATE_PARENT, true)) { |
| // Retry, but make sure we don't try to create the parent again. |
| return tryOpenFileInternal(path, mode - WriteMode::CREATE_PARENT, append); |
| } |
| |
| // Check for broken link. |
| if (!has(mode, WriteMode::MODIFY) && |
| faccessat(fd, filename.cStr(), F_OK, AT_SYMLINK_NOFOLLOW) >= 0) { |
| // Yep. We treat this as already-exists, which means in CREATE-only mode this is a |
| // simple failure. |
| return nullptr; |
| } |
| |
| KJ_FAIL_REQUIRE("parent is not a directory", path) { return nullptr; } |
| } else { |
| // MODIFY-only mode. ENOENT = doesn't exist = return null. |
| return nullptr; |
| } |
| case ENOTDIR: |
| if (!has(mode, WriteMode::CREATE)) { |
| // MODIFY-only mode. ENOTDIR = parent not a directory = doesn't exist = return null. |
| return nullptr; |
| } |
| goto failed; |
| case EEXIST: |
| if (!has(mode, WriteMode::MODIFY)) { |
| // CREATE-only mode. EEXIST = already exists = return null. |
| return nullptr; |
| } |
| goto failed; |
| default: |
| failed: |
| KJ_FAIL_SYSCALL("openat(fd, path, O_RDWR | ...)", error, path) { return nullptr; } |
| } |
| |
| kj::AutoCloseFd result(newFd); |
| #ifndef O_CLOEXEC |
| setCloexec(result); |
| #endif |
| |
| return kj::mv(result); |
| } |
| |
| bool tryCommitReplacement(StringPtr toPath, int fromDirFd, StringPtr fromPath, WriteMode mode, |
| int* errorReason = nullptr) const { |
| if (has(mode, WriteMode::CREATE) && has(mode, WriteMode::MODIFY)) { |
| // Always clobber. Try it. |
| KJ_SYSCALL_HANDLE_ERRORS(renameat(fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr())) { |
| case EISDIR: |
| case ENOTDIR: |
| case ENOTEMPTY: |
| case EEXIST: |
| // Failed because target exists and due to the various weird quirks of rename(), it |
| // can't remove it for us. On Linux we can try an exchange instead. On others we have |
| // to move the target out of the way. |
| break; |
| default: |
| if (errorReason == nullptr) { |
| KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) { return false; } |
| } else { |
| *errorReason = error; |
| return false; |
| } |
| } else { |
| return true; |
| } |
| } |
| |
| #if __linux__ && defined(RENAME_EXCHANGE) |
| // Try to use Linux's renameat2() to atomically check preconditions and apply. |
| |
| if (has(mode, WriteMode::MODIFY)) { |
| // Use an exchange to implement modification. |
| // |
| // We reach this branch when performing a MODIFY-only, or when performing a CREATE | MODIFY |
| // in which we determined above that there's a node of a different type blocking the |
| // exchange. |
| |
| KJ_SYSCALL_HANDLE_ERRORS(syscall(SYS_renameat2, |
| fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr(), RENAME_EXCHANGE)) { |
| case ENOSYS: // Syscall not supported by kernel. |
| case EINVAL: // Maybe we screwed up, or maybe the syscall is not supported by the |
| // filesystem. Unfortunately, there's no way to tell, so assume the latter. |
| // ZFS in particular apparently produces EINVAL. |
| break; // fall back to traditional means |
| case ENOENT: |
| // Presumably because the target path doesn't exist. |
| if (has(mode, WriteMode::CREATE)) { |
| KJ_FAIL_ASSERT("rename(tmp, path) claimed path exists but " |
| "renameat2(fromPath, toPath, EXCHANGE) said it doest; concurrent modification?", |
| fromPath, toPath) { return false; } |
| } else { |
| // Assume target doesn't exist. |
| return false; |
| } |
| default: |
| if (errorReason == nullptr) { |
| KJ_FAIL_SYSCALL("renameat2(fromPath, toPath, EXCHANGE)", error, fromPath, toPath) { |
| return false; |
| } |
| } else { |
| *errorReason = error; |
| return false; |
| } |
| } else { |
| // Successful swap! Delete swapped-out content. |
| rmrf(fromDirFd, fromPath); |
| return true; |
| } |
| } else if (has(mode, WriteMode::CREATE)) { |
| KJ_SYSCALL_HANDLE_ERRORS(syscall(SYS_renameat2, |
| fromDirFd, fromPath.cStr(), fd.get(), toPath.cStr(), RENAME_NOREPLACE)) { |
| case ENOSYS: // Syscall not supported by kernel. |
| case EINVAL: // Maybe we screwed up, or maybe the syscall is not supported by the |
| // filesystem. Unfortunately, there's no way to tell, so assume the latter. |
| // ZFS in particular apparently produces EINVAL. |
| break; // fall back to traditional means |
| case EEXIST: |
| return false; |
| default: |
| if (errorReason == nullptr) { |
| KJ_FAIL_SYSCALL("renameat2(fromPath, toPath, NOREPLACE)", error, fromPath, toPath) { |
| return false; |
| } |
| } else { |
| *errorReason = error; |
| return false; |
| } |
| } else { |
| return true; |
| } |
| } |
| #endif |
| |
| // We're unable to do what we wanted atomically. :( |
| |
| if (has(mode, WriteMode::CREATE) && has(mode, WriteMode::MODIFY)) { |
| // We failed to atomically delete the target previously. So now we need to do two calls in |
| // rapid succession to move the old file away then move the new one into place. |
| |
| // Find out what kind of file exists at the target path. |
| struct stat stats; |
| KJ_SYSCALL(fstatat(fd, toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { return false; } |
| |
| // Create a temporary location to move the existing object to. Note that rename() allows a |
| // non-directory to replace a non-directory, and allows a directory to replace an empty |
| // directory. So we have to create the right type. |
| Path toPathParsed = Path::parse(toPath); |
| String away; |
| KJ_IF_MAYBE(awayPath, createNamedTemporary(toPathParsed, WriteMode::CREATE, |
| [&](StringPtr candidatePath) { |
| if (S_ISDIR(stats.st_mode)) { |
| return mkdirat(fd, candidatePath.cStr(), 0700); |
| } else { |
| #if __APPLE__ || __FreeBSD__ |
| // - No mknodat() on OSX, gotta open() a file, ugh. |
| // - On a modern FreeBSD, mknodat() is reserved strictly for device nodes, |
| // you cannot create a regular file using it (EINVAL). |
| int newFd = openat(fd, candidatePath.cStr(), |
| O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, 0700); |
| if (newFd >= 0) close(newFd); |
| return newFd; |
| #else |
| return mknodat(fd, candidatePath.cStr(), S_IFREG | 0600, dev_t()); |
| #endif |
| } |
| })) { |
| away = kj::mv(*awayPath); |
| } else { |
| // Already threw. |
| return false; |
| } |
| |
| // OK, now move the target object to replace the thing we just created. |
| KJ_SYSCALL(renameat(fd, toPath.cStr(), fd, away.cStr())) { |
| // Something went wrong. Remove the thing we just created. |
| unlinkat(fd, away.cStr(), S_ISDIR(stats.st_mode) ? AT_REMOVEDIR : 0); |
| return false; |
| } |
| |
| // Now move the source object to the target location. |
| KJ_SYSCALL_HANDLE_ERRORS(renameat(fromDirFd, fromPath.cStr(), fd, toPath.cStr())) { |
| default: |
| // Try to put things back where they were. If this fails, though, then we have little |
| // choice but to leave things broken. |
| KJ_SYSCALL_HANDLE_ERRORS(renameat(fd, away.cStr(), fd, toPath.cStr())) { |
| default: break; |
| } |
| |
| if (errorReason == nullptr) { |
| KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) { |
| return false; |
| } |
| } else { |
| *errorReason = error; |
| return false; |
| } |
| } |
| |
| // OK, success. Delete the old content. |
| rmrf(fd, away); |
| return true; |
| } else { |
| // Only one of CREATE or MODIFY is specified, so we need to verify non-atomically that the |
| // corresponding precondition (must-not-exist or must-exist, respectively) is held. |
| if (has(mode, WriteMode::CREATE)) { |
| struct stat stats; |
| KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd.get(), toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { |
| case ENOENT: |
| case ENOTDIR: |
| break; // doesn't exist; continue |
| default: |
| KJ_FAIL_SYSCALL("fstatat(fd, toPath)", error, toPath) { return false; } |
| } else { |
| return false; // already exists; fail |
| } |
| } else if (has(mode, WriteMode::MODIFY)) { |
| struct stat stats; |
| KJ_SYSCALL_HANDLE_ERRORS(fstatat(fd.get(), toPath.cStr(), &stats, AT_SYMLINK_NOFOLLOW)) { |
| case ENOENT: |
| case ENOTDIR: |
| return false; // doesn't exist; fail |
| default: |
| KJ_FAIL_SYSCALL("fstatat(fd, toPath)", error, toPath) { return false; } |
| } else { |
| // already exists; continue |
| } |
| } else { |
| // Neither CREATE nor MODIFY. |
| return false; |
| } |
| |
| // Start over in create-and-modify mode. |
| return tryCommitReplacement(toPath, fromDirFd, fromPath, |
| WriteMode::CREATE | WriteMode::MODIFY, |
| errorReason); |
| } |
| } |
| |
| template <typename T> |
| class ReplacerImpl final: public Directory::Replacer<T> { |
| public: |
| ReplacerImpl(Own<const T>&& object, const DiskHandle& handle, |
| String&& tempPath, String&& path, WriteMode mode) |
| : Directory::Replacer<T>(mode), |
| object(kj::mv(object)), handle(handle), |
| tempPath(kj::mv(tempPath)), path(kj::mv(path)) {} |
| |
| ~ReplacerImpl() noexcept(false) { |
| if (!committed) { |
| rmrf(handle.fd, tempPath); |
| } |
| } |
| |
| const T& get() override { |
| return *object; |
| } |
| |
| bool tryCommit() override { |
| KJ_ASSERT(!committed, "already committed") { return false; } |
| return committed = handle.tryCommitReplacement(path, handle.fd, tempPath, |
| Directory::Replacer<T>::mode); |
| } |
| |
| private: |
| Own<const T> object; |
| const DiskHandle& handle; |
| String tempPath; |
| String path; |
| bool committed = false; // true if *successfully* committed (in which case tempPath is gone) |
| }; |
| |
| template <typename T> |
| class BrokenReplacer final: public Directory::Replacer<T> { |
| // For recovery path when exceptions are disabled. |
| |
| public: |
| BrokenReplacer(Own<const T> inner) |
| : Directory::Replacer<T>(WriteMode::CREATE | WriteMode::MODIFY), |
| inner(kj::mv(inner)) {} |
| |
| const T& get() override { return *inner; } |
| bool tryCommit() override { return false; } |
| |
| private: |
| Own<const T> inner; |
| }; |
| |
| Maybe<Own<const File>> tryOpenFile(PathPtr path, WriteMode mode) const { |
| return tryOpenFileInternal(path, mode, false).map(newDiskFile); |
| } |
| |
| Own<Directory::Replacer<File>> replaceFile(PathPtr path, WriteMode mode) const { |
| mode_t acl = 0666; |
| if (has(mode, WriteMode::EXECUTABLE)) { |
| acl = 0777; |
| } |
| if (has(mode, WriteMode::PRIVATE)) { |
| acl &= 0700; |
| } |
| |
| int newFd_; |
| KJ_IF_MAYBE(temp, createNamedTemporary(path, mode, |
| [&](StringPtr candidatePath) { |
| return newFd_ = openat(fd, candidatePath.cStr(), |
| O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, acl); |
| })) { |
| AutoCloseFd newFd(newFd_); |
| #ifndef O_CLOEXEC |
| setCloexec(newFd); |
| #endif |
| return heap<ReplacerImpl<File>>(newDiskFile(kj::mv(newFd)), *this, kj::mv(*temp), |
| path.toString(), mode); |
| } else { |
| // threw, but exceptions are disabled |
| return heap<BrokenReplacer<File>>(newInMemoryFile(nullClock())); |
| } |
| } |
| |
| Own<const File> createTemporary() const { |
| int newFd_; |
| |
| #if __linux__ && defined(O_TMPFILE) |
| // Use syscall() to work around glibc bug with O_TMPFILE: |
| // https://sourceware.org/bugzilla/show_bug.cgi?id=17523 |
| KJ_SYSCALL_HANDLE_ERRORS(newFd_ = syscall( |
| SYS_openat, fd.get(), ".", O_RDWR | O_TMPFILE, 0700)) { |
| case EOPNOTSUPP: |
| case EINVAL: |
| case EISDIR: |
| // Maybe not supported by this kernel / filesystem. Fall back to below. |
| break; |
| default: |
| KJ_FAIL_SYSCALL("open(O_TMPFILE)", error) { break; } |
| break; |
| } else { |
| AutoCloseFd newFd(newFd_); |
| #ifndef O_CLOEXEC |
| setCloexec(newFd); |
| #endif |
| return newDiskFile(kj::mv(newFd)); |
| } |
| #endif |
| |
| KJ_IF_MAYBE(temp, createNamedTemporary(Path("unnamed"), WriteMode::CREATE, |
| [&](StringPtr path) { |
| return newFd_ = openat(fd, path.cStr(), O_RDWR | O_CREAT | O_EXCL | MAYBE_O_CLOEXEC, 0600); |
| })) { |
| AutoCloseFd newFd(newFd_); |
| #ifndef O_CLOEXEC |
| setCloexec(newFd); |
| #endif |
| auto result = newDiskFile(kj::mv(newFd)); |
| KJ_SYSCALL(unlinkat(fd, temp->cStr(), 0)) { break; } |
| return kj::mv(result); |
| } else { |
| // threw, but exceptions are disabled |
| return newInMemoryFile(nullClock()); |
| } |
| } |
| |
| Maybe<Own<AppendableFile>> tryAppendFile(PathPtr path, WriteMode mode) const { |
| return tryOpenFileInternal(path, mode, true).map(newDiskAppendableFile); |
| } |
| |
| Maybe<Own<const Directory>> tryOpenSubdir(PathPtr path, WriteMode mode) const { |
| // Must create before open. |
| if (has(mode, WriteMode::CREATE)) { |
| if (!tryMkdir(path, mode, false)) return nullptr; |
| } |
| |
| return tryOpenSubdirInternal(path).map(newDiskDirectory); |
| } |
| |
| Own<Directory::Replacer<Directory>> replaceSubdir(PathPtr path, WriteMode mode) const { |
| mode_t acl = has(mode, WriteMode::PRIVATE) ? 0700 : 0777; |
| |
| KJ_IF_MAYBE(temp, createNamedTemporary(path, mode, |
| [&](StringPtr candidatePath) { |
| return mkdirat(fd, candidatePath.cStr(), acl); |
| })) { |
| int subdirFd_; |
| KJ_SYSCALL_HANDLE_ERRORS(subdirFd_ = openat( |
| fd, temp->cStr(), O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)) { |
| default: |
| KJ_FAIL_SYSCALL("open(just-created-temporary)", error); |
| return heap<BrokenReplacer<Directory>>(newInMemoryDirectory(nullClock())); |
| } |
| |
| AutoCloseFd subdirFd(subdirFd_); |
| #ifndef O_CLOEXEC |
| setCloexec(subdirFd); |
| #endif |
| return heap<ReplacerImpl<Directory>>( |
| newDiskDirectory(kj::mv(subdirFd)), *this, kj::mv(*temp), path.toString(), mode); |
| } else { |
| // threw, but exceptions are disabled |
| return heap<BrokenReplacer<Directory>>(newInMemoryDirectory(nullClock())); |
| } |
| } |
| |
| bool trySymlink(PathPtr linkpath, StringPtr content, WriteMode mode) const { |
| return tryReplaceNode(linkpath, mode, [&](StringPtr candidatePath) { |
| return symlinkat(content.cStr(), fd, candidatePath.cStr()); |
| }); |
| } |
| |
| bool tryTransfer(PathPtr toPath, WriteMode toMode, |
| const Directory& fromDirectory, PathPtr fromPath, |
| TransferMode mode, const Directory& self) const { |
| KJ_REQUIRE(toPath.size() > 0, "can't replace self") { return false; } |
| |
| if (mode == TransferMode::LINK) { |
| KJ_IF_MAYBE(fromFd, fromDirectory.getFd()) { |
| // Other is a disk directory, so we can hopefully do an efficient move/link. |
| return tryReplaceNode(toPath, toMode, [&](StringPtr candidatePath) { |
| return linkat(*fromFd, fromPath.toString().cStr(), fd, candidatePath.cStr(), 0); |
| }); |
| }; |
| } else if (mode == TransferMode::MOVE) { |
| KJ_IF_MAYBE(fromFd, fromDirectory.getFd()) { |
| KJ_ASSERT(mode == TransferMode::MOVE); |
| |
| int error = 0; |
| if (tryCommitReplacement(toPath.toString(), *fromFd, fromPath.toString(), toMode, |
| &error)) { |
| return true; |
| } else switch (error) { |
| case 0: |
| // Plain old WriteMode precondition failure. |
| return false; |
| case EXDEV: |
| // Can't move between devices. Fall back to default implementation, which does |
| // copy/delete. |
| break; |
| case ENOENT: |
| // Either the destination directory doesn't exist or the source path doesn't exist. |
| // Unfortunately we don't really know. If CREATE_PARENT was provided, try creating |
| // the parent directory. Otherwise, we don't actually need to distinguish between |
| // these two errors; just return false. |
| if (has(toMode, WriteMode::CREATE) && has(toMode, WriteMode::CREATE_PARENT) && |
| toPath.size() > 0 && tryMkdir(toPath.parent(), |
| WriteMode::CREATE | WriteMode::MODIFY | WriteMode::CREATE_PARENT, true)) { |
| // Retry, but make sure we don't try to create the parent again. |
| return tryTransfer(toPath, toMode - WriteMode::CREATE_PARENT, |
| fromDirectory, fromPath, mode, self); |
| } |
| return false; |
| default: |
| KJ_FAIL_SYSCALL("rename(fromPath, toPath)", error, fromPath, toPath) { |
| return false; |
| } |
| } |
| } |
| } |
| |
| // OK, we can't do anything efficient using the OS. Fall back to default implementation. |
| return self.Directory::tryTransfer(toPath, toMode, fromDirectory, fromPath, mode); |
| } |
| |
| bool tryRemove(PathPtr path) const { |
| return rmrf(fd, path.toString()); |
| } |
| |
| protected: |
| AutoCloseFd fd; |
| }; |
| |
// Defines the FsNode overrides shared by the disk-backed node classes below. Each override
// forwards to the DiskHandle base class. `classname` must be the class the macro is expanded in;
// cloneFsNode() uses it to heap-allocate a clone built from DiskHandle::clone().
#define FSNODE_METHODS(classname)                     \
  Maybe<int> getFd() const override {                 \
    return DiskHandle::getFd();                       \
  }                                                   \
  Own<const FsNode> cloneFsNode() const override {    \
    return heap<classname>(DiskHandle::clone());      \
  }                                                   \
  Metadata stat() const override {                    \
    return DiskHandle::stat();                        \
  }                                                   \
  void sync() const override {                        \
    DiskHandle::sync();                               \
  }                                                   \
  void datasync() const override {                    \
    DiskHandle::datasync();                           \
  }
| |
| class DiskReadableFile final: public ReadableFile, public DiskHandle { |
| public: |
| DiskReadableFile(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {} |
| |
| FSNODE_METHODS(DiskReadableFile); |
| |
| size_t read(uint64_t offset, ArrayPtr<byte> buffer) const override { |
| return DiskHandle::read(offset, buffer); |
| } |
| Array<const byte> mmap(uint64_t offset, uint64_t size) const override { |
| return DiskHandle::mmap(offset, size); |
| } |
| Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const override { |
| return DiskHandle::mmapPrivate(offset, size); |
| } |
| }; |
| |
| class DiskAppendableFile final: public AppendableFile, public DiskHandle, public FdOutputStream { |
| public: |
| DiskAppendableFile(AutoCloseFd&& fd) |
| : DiskHandle(kj::mv(fd)), |
| FdOutputStream(DiskHandle::fd.get()) {} |
| |
| FSNODE_METHODS(DiskAppendableFile); |
| |
| void write(const void* buffer, size_t size) override { |
| FdOutputStream::write(buffer, size); |
| } |
| void write(ArrayPtr<const ArrayPtr<const byte>> pieces) override { |
| FdOutputStream::write(pieces); |
| } |
| }; |
| |
| class DiskFile final: public File, public DiskHandle { |
| public: |
| DiskFile(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {} |
| |
| FSNODE_METHODS(DiskFile); |
| |
| size_t read(uint64_t offset, ArrayPtr<byte> buffer) const override { |
| return DiskHandle::read(offset, buffer); |
| } |
| Array<const byte> mmap(uint64_t offset, uint64_t size) const override { |
| return DiskHandle::mmap(offset, size); |
| } |
| Array<byte> mmapPrivate(uint64_t offset, uint64_t size) const override { |
| return DiskHandle::mmapPrivate(offset, size); |
| } |
| |
| void write(uint64_t offset, ArrayPtr<const byte> data) const override { |
| DiskHandle::write(offset, data); |
| } |
| void zero(uint64_t offset, uint64_t size) const override { |
| DiskHandle::zero(offset, size); |
| } |
| void truncate(uint64_t size) const override { |
| DiskHandle::truncate(size); |
| } |
| Own<const WritableFileMapping> mmapWritable(uint64_t offset, uint64_t size) const override { |
| return DiskHandle::mmapWritable(offset, size); |
| } |
| size_t copy(uint64_t offset, const ReadableFile& from, |
| uint64_t fromOffset, uint64_t size) const override { |
| KJ_IF_MAYBE(result, DiskHandle::copy(offset, from, fromOffset, size)) { |
| return *result; |
| } else { |
| return File::copy(offset, from, fromOffset, size); |
| } |
| } |
| }; |
| |
| class DiskReadableDirectory final: public ReadableDirectory, public DiskHandle { |
| public: |
| DiskReadableDirectory(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {} |
| |
| FSNODE_METHODS(DiskReadableDirectory); |
| |
| Array<String> listNames() const override { return DiskHandle::listNames(); } |
| Array<Entry> listEntries() const override { return DiskHandle::listEntries(); } |
| bool exists(PathPtr path) const override { return DiskHandle::exists(path); } |
| Maybe<FsNode::Metadata> tryLstat(PathPtr path) const override { |
| return DiskHandle::tryLstat(path); |
| } |
| Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const override { |
| return DiskHandle::tryOpenFile(path); |
| } |
| Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const override { |
| return DiskHandle::tryOpenSubdir(path); |
| } |
| Maybe<String> tryReadlink(PathPtr path) const override { return DiskHandle::tryReadlink(path); } |
| }; |
| |
| class DiskDirectory final: public Directory, public DiskHandle { |
| public: |
| DiskDirectory(AutoCloseFd&& fd): DiskHandle(kj::mv(fd)) {} |
| |
| FSNODE_METHODS(DiskDirectory); |
| |
| Array<String> listNames() const override { return DiskHandle::listNames(); } |
| Array<Entry> listEntries() const override { return DiskHandle::listEntries(); } |
| bool exists(PathPtr path) const override { return DiskHandle::exists(path); } |
| Maybe<FsNode::Metadata> tryLstat(PathPtr path) const override { |
| return DiskHandle::tryLstat(path); |
| } |
| Maybe<Own<const ReadableFile>> tryOpenFile(PathPtr path) const override { |
| return DiskHandle::tryOpenFile(path); |
| } |
| Maybe<Own<const ReadableDirectory>> tryOpenSubdir(PathPtr path) const override { |
| return DiskHandle::tryOpenSubdir(path); |
| } |
| Maybe<String> tryReadlink(PathPtr path) const override { return DiskHandle::tryReadlink(path); } |
| |
| Maybe<Own<const File>> tryOpenFile(PathPtr path, WriteMode mode) const override { |
| return DiskHandle::tryOpenFile(path, mode); |
| } |
| Own<Replacer<File>> replaceFile(PathPtr path, WriteMode mode) const override { |
| return DiskHandle::replaceFile(path, mode); |
| } |
| Own<const File> createTemporary() const override { |
| return DiskHandle::createTemporary(); |
| } |
| Maybe<Own<AppendableFile>> tryAppendFile(PathPtr path, WriteMode mode) const override { |
| return DiskHandle::tryAppendFile(path, mode); |
| } |
| Maybe<Own<const Directory>> tryOpenSubdir(PathPtr path, WriteMode mode) const override { |
| return DiskHandle::tryOpenSubdir(path, mode); |
| } |
| Own<Replacer<Directory>> replaceSubdir(PathPtr path, WriteMode mode) const override { |
| return DiskHandle::replaceSubdir(path, mode); |
| } |
| bool trySymlink(PathPtr linkpath, StringPtr content, WriteMode mode) const override { |
| return DiskHandle::trySymlink(linkpath, content, mode); |
| } |
| bool tryTransfer(PathPtr toPath, WriteMode toMode, |
| const Directory& fromDirectory, PathPtr fromPath, |
| TransferMode mode) const override { |
| return DiskHandle::tryTransfer(toPath, toMode, fromDirectory, fromPath, mode, *this); |
| } |
| // tryTransferTo() not implemented because we have nothing special we can do. |
| bool tryRemove(PathPtr path) const override { |
| return DiskHandle::tryRemove(path); |
| } |
| }; |
| |
| class DiskFilesystem final: public Filesystem { |
| public: |
| DiskFilesystem() |
| : root(openDir("/")), |
| current(openDir(".")), |
| currentPath(computeCurrentPath()) {} |
| |
| const Directory& getRoot() const override { |
| return root; |
| } |
| |
| const Directory& getCurrent() const override { |
| return current; |
| } |
| |
| PathPtr getCurrentPath() const override { |
| return currentPath; |
| } |
| |
| private: |
| DiskDirectory root; |
| DiskDirectory current; |
| Path currentPath; |
| |
| static AutoCloseFd openDir(const char* dir) { |
| int newFd; |
| KJ_SYSCALL(newFd = open(dir, O_RDONLY | MAYBE_O_CLOEXEC | MAYBE_O_DIRECTORY)); |
| AutoCloseFd result(newFd); |
| #ifndef O_CLOEXEC |
| setCloexec(result); |
| #endif |
| return result; |
| } |
| |
| static Path computeCurrentPath() { |
| // If env var PWD is set and points to the current directory, use it. This captures the current |
| // path according to the user's shell, which may differ from the kernel's idea in the presence |
| // of symlinks. |
| const char* pwd = getenv("PWD"); |
| if (pwd != nullptr) { |
| Path result = nullptr; |
| struct stat pwdStat, dotStat; |
| KJ_IF_MAYBE(e, kj::runCatchingExceptions([&]() { |
| KJ_ASSERT(pwd[0] == '/') { return; } |
| result = Path::parse(pwd + 1); |
| KJ_SYSCALL(lstat(result.toString(true).cStr(), &pwdStat), result) { return; } |
| KJ_SYSCALL(lstat(".", &dotStat)) { return; } |
| })) { |
| // failed, give up on PWD |
| KJ_LOG(WARNING, "PWD environment variable seems invalid", pwd, *e); |
| } else { |
| if (pwdStat.st_ino == dotStat.st_ino && |
| pwdStat.st_dev == dotStat.st_dev) { |
| return kj::mv(result); |
| } else { |
| KJ_LOG(WARNING, "PWD environment variable doesn't match current directory", pwd); |
| } |
| } |
| } |
| |
| size_t size = 256; |
| retry: |
| KJ_STACK_ARRAY(char, buf, size, 256, 4096); |
| if (getcwd(buf.begin(), size) == nullptr) { |
| int error = errno; |
| if (error == ENAMETOOLONG) { |
| size *= 2; |
| goto retry; |
| } else { |
| KJ_FAIL_SYSCALL("getcwd()", error); |
| } |
| } |
| |
| StringPtr path = buf.begin(); |
| |
| // On Linux, the path will start with "(unreachable)" if the working directory is not a subdir |
| // of the root directory, which is possible via chroot() or mount namespaces. |
| KJ_ASSERT(!path.startsWith("(unreachable)"), |
| "working directory is not reachable from root", path); |
| KJ_ASSERT(path.startsWith("/"), "current directory is not absolute", path); |
| |
| return Path::parse(path.slice(1)); |
| } |
| }; |
| |
| } // namespace |
| |
| Own<ReadableFile> newDiskReadableFile(kj::AutoCloseFd fd) { |
| return heap<DiskReadableFile>(kj::mv(fd)); |
| } |
| Own<AppendableFile> newDiskAppendableFile(kj::AutoCloseFd fd) { |
| return heap<DiskAppendableFile>(kj::mv(fd)); |
| } |
| Own<File> newDiskFile(kj::AutoCloseFd fd) { |
| return heap<DiskFile>(kj::mv(fd)); |
| } |
| Own<ReadableDirectory> newDiskReadableDirectory(kj::AutoCloseFd fd) { |
| return heap<DiskReadableDirectory>(kj::mv(fd)); |
| } |
| Own<Directory> newDiskDirectory(kj::AutoCloseFd fd) { |
| return heap<DiskDirectory>(kj::mv(fd)); |
| } |
| |
| Own<Filesystem> newDiskFilesystem() { |
| return heap<DiskFilesystem>(); |
| } |
| |
| } // namespace kj |
| |
| #endif // !_WIN32 |