// SPDX-License-Identifier: GPL-2.0
#include "ublksrv_tgt.h"
#include "qcow2_format.h"
#include "qcow2.h"
#define HEADER_SIZE 512
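/* sentinel for a virtual offset that can't be mapped to any host cluster */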
#define QCOW2_UNMAPPED (u64)(-1)
static int qcow2_init_tgt(struct ublksrv_dev *dev, int type, int argc, char
*argv[])
{
struct ublksrv_tgt_info *tgt = &dev->tgt;
const struct ublksrv_ctrl_dev_info *info =
ublksrv_ctrl_get_dev_info(ublksrv_get_ctrl_dev(dev));
static const struct option lo_longopts[] = {
{ "file", 1, NULL, 'f' },
{ NULL }
};
int jbuf_size;
char *jbuf;
int fd, opt, ret;
void *header_buf;
QCowHeader *header;
char *file = NULL;
struct ublksrv_tgt_base_json tgt_json = {
.type = type,
};
struct ublk_params p = {
.types = UBLK_PARAM_TYPE_BASIC,
.basic = {
//.attrs = UBLK_ATTR_READ_ONLY,
.logical_bs_shift = 9,
.physical_bs_shift = 12,
.io_opt_shift = 12,
.io_min_shift = 9,
.max_sectors = info->max_io_buf_bytes >> 9,
},
};
Qcow2State *qs;
/* qcow2 doesn't support user copy yet */
if (info->flags & UBLK_F_USER_COPY)
return -EINVAL;
	//a queue depth up to 1024 is enough for qcow2, and it lets us store
	//the tag & l1 entry index in a single u32 variable
if (info->queue_depth > QCOW2_MAX_QUEUE_DEPTH)
return -EINVAL;
//qcow2 target doesn't support MQ yet
if (info->nr_hw_queues > 1)
return -EINVAL;
strcpy(tgt_json.name, "qcow2");
if (type != UBLKSRV_TGT_TYPE_QCOW2)
return -EINVAL;
while ((opt = getopt_long(argc, argv, "-:f:",
lo_longopts, NULL)) != -1) {
switch (opt) {
case 'f':
file = strdup(optarg);
break;
}
}
if (!file)
return -EINVAL;
if (posix_memalign((void **)&header_buf, 512, HEADER_SIZE))
return -EINVAL;
header = (QCowHeader *)header_buf;
fd = open(file, O_RDWR);
if (fd < 0) {
ublk_err( "%s backing file %s can't be opened\n",
__func__, file);
return -EINVAL;
}
if (fcntl(fd, F_SETFL, O_DIRECT))
ublk_err( "%s direct io on file %s isn't supported\n",
__func__, file);
ret = read(fd, header_buf, HEADER_SIZE);
if (ret != HEADER_SIZE) {
ublk_err( "%s: return backing file %s %d %d\n",
__func__, file, HEADER_SIZE, ret);
return -EINVAL;
}
if (be64_to_cpu(header->nb_snapshots) != 0) {
ublk_err( "%s: not support snapshots\n", __func__);
return -EINVAL;
}
tgt_json.dev_size = tgt->dev_size = be64_to_cpu(header->size);
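	/* dev_sectors is the qcow2 virtual size; chunk_sectors matches the
	 * cluster size so a single IO shouldn't cross a cluster boundary */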
	p.basic.dev_sectors = tgt->dev_size >> 9;
	p.basic.chunk_sectors = 1 << (be32_to_cpu(header->cluster_bits) - 9);
tgt->tgt_ring_depth = info->queue_depth * 4;
tgt->extra_ios = QCOW2_PARA::META_MAX_TAGS;
tgt->iowq_max_workers[0] = 1;
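	/* the backing file is the target's only fd; the rw fast path submits
	 * against it as fixed file index 1 */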
tgt->nr_fds = 1;
tgt->fds[1] = fd;
tgt->tgt_data = qs = make_qcow2state(file, dev);
ublksrv_tgt_set_io_data_size(tgt);
jbuf = ublksrv_tgt_realloc_json_buf(dev, &jbuf_size);
ublk_json_write_dev_info(dev, &jbuf, &jbuf_size);
ublk_json_write_target_base(dev, &jbuf, &jbuf_size, &tgt_json);
ublk_json_write_params(dev, &jbuf, &jbuf_size, &p);
ublk_json_write_tgt_str(dev, &jbuf, &jbuf_size,
"backing_file", file);
ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
"version", qs->header.get_version());
ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
"cluster_bits", qs->header.get_cluster_bits());
ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
"header_length", qs->header.get_header_length());
ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
"l1_size", qs->header.get_l1_size());
ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
"refcount_table_clusters",
qs->header.get_refcount_table_clusters());
ublk_json_write_tgt_ulong(dev, &jbuf, &jbuf_size,
"refcount_order", qs->header.get_refcount_order());
qs->header.dump_ext();
return 0;
}
static int qcow2_recovery_tgt(struct ublksrv_dev *dev, int type)
{
const struct ublksrv_ctrl_dev *cdev = ublksrv_get_ctrl_dev(dev);
const char *jbuf = ublksrv_ctrl_get_recovery_jbuf(cdev);
const struct ublksrv_ctrl_dev_info *info =
ublksrv_ctrl_get_dev_info(cdev);
struct ublksrv_tgt_info *tgt = &dev->tgt;
int fd, ret;
char file[PATH_MAX];
struct ublk_params p;
int tgt_depth;
ublk_assert(jbuf);
ublk_assert(info->state == UBLK_S_DEV_QUIESCED);
ublk_assert(type == UBLKSRV_TGT_TYPE_QCOW2);
/* qcow2 doesn't support user copy yet */
if (info->flags & UBLK_F_USER_COPY)
return -EINVAL;
ret = ublksrv_json_read_target_str_info(jbuf, PATH_MAX, "backing_file", file);
if (ret < 0) {
ublk_err( "%s: backing file can't be retrieved from jbuf %d\n",
__func__, ret);
return ret;
}
ret = ublksrv_json_read_params(&p, jbuf);
if (ret) {
ublk_err( "%s: read ublk params failed %d\n",
__func__, ret);
return ret;
}
fd = open(file, O_RDWR);
if (fd < 0) {
ublk_err( "%s: backing file %s can't be opened\n",
__func__, file);
return fd;
}
if (fcntl(fd, F_SETFL, O_DIRECT))
ublk_err( "%s direct io on file %s isn't supported\n",
__func__, file);
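	/* the target ring must cover the metadata IO tags or twice the queue
	 * depth, whichever is larger */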
tgt_depth = QCOW2_PARA::META_MAX_TAGS > info->queue_depth * 2 ?
QCOW2_PARA::META_MAX_TAGS : info->queue_depth * 2;
tgt->dev_size = p.basic.dev_sectors << 9;
tgt->extra_ios = QCOW2_PARA::META_MAX_TAGS;
tgt->tgt_ring_depth = tgt_depth;
tgt->iowq_max_workers[0] = 1;
tgt->nr_fds = 1;
tgt->fds[1] = fd;
tgt->tgt_data = make_qcow2state(file, dev);
ublksrv_tgt_set_io_data_size(tgt);
return 0;
}
static void qcow2_usage_for_add(void)
{
printf(" qcow2: -f backing_file\n");
}
/* todo: flush meta dirty data */
static inline int qcow2_queue_tgt_fsync(const struct ublksrv_queue *q,
unsigned io_op, int tag, u32 len, u64 offset)
{
int fd = q->dev->tgt.fds[1];
struct io_uring_sqe *sqe = io_uring_get_sqe(q->ring_ptr);
if (!sqe) {
ublk_err("%s: tag %d offset %lx op %d, no sqe\n",
__func__, tag, offset, io_op);
return -ENOMEM;
}
	io_uring_prep_sync_file_range(sqe, fd, len, offset,
IORING_FSYNC_DATASYNC);
sqe->user_data = build_user_data(tag, io_op, 0, 1);
qcow2_io_log("%s: queue io op %d(%llu %llx %llx)"
" (qid %d tag %u, cmd_op %u target: %d, user_data %llx)\n",
__func__, io_op, sqe->off, sqe->len, sqe->addr,
q->q_id, tag, io_op, 1, sqe->user_data);
return 1;
}
static inline int qcow2_queue_tgt_zero_cluster(const Qcow2State *qs,
const struct ublksrv_queue *q, int tag, u64 offset)
{
int mode = FALLOC_FL_ZERO_RANGE;
int fd = q->dev->tgt.fds[1];
struct io_uring_sqe *sqe = io_uring_get_sqe(q->ring_ptr);
if (!sqe) {
ublk_err("%s: tag %d offset %lx op %d, no sqe for zeroing\n",
__func__, tag, offset, IORING_OP_FALLOCATE);
return -ENOMEM;
}
io_uring_prep_fallocate(sqe, fd, mode, offset,
(1ULL << qs->header.cluster_bits));
sqe->user_data = build_user_data(tag,
IORING_OP_FALLOCATE, 0, 1);
qcow2_io_log("%s: queue io op %d(%llx %llx %llx)"
" (qid %d tag %u, target: %d, user_data %llx)\n",
__func__, IORING_OP_FALLOCATE, offset,
sqe->len, sqe->addr, q->q_id, tag, 1, sqe->user_data);
return 1;
}
static inline int qcow2_queue_tgt_rw_fast(const struct ublksrv_queue *q,
unsigned io_op, int tag, u64 offset,
const struct ublksrv_io_desc *iod)
{
struct io_uring_sqe *sqe = io_uring_get_sqe(q->ring_ptr);
if (!sqe) {
ublk_err("%s: tag %d offset %lx op %d, no sqe for rw\n",
__func__, tag, offset, io_op);
return -ENOMEM;
}
io_uring_prep_rw(io_op, sqe, 1, (void *)iod->addr,
iod->nr_sectors << 9, offset);
sqe->flags = IOSQE_FIXED_FILE;
sqe->user_data = build_user_data(tag, io_op, 0, 1);
qcow2_io_log("%s: queue io op %d(%llu %llx %llx)"
" (qid %d tag %u, cmd_op %u target: %d, user_data %llx)\n",
__func__, io_op, sqe->off, sqe->len, sqe->addr,
q->q_id, tag, io_op, 1, sqe->user_data);
return 1;
}
static inline int qcow2_queue_tgt_rw(const struct ublksrv_queue *q, unsigned io_op,
int tag, u64 offset, const struct ublksrv_io_desc *iod,
u32 *expected_op)
{
Qcow2State *qs = queue_to_qcow2state(q);
u64 cluster_start = offset & ~((1ULL << qs->header.cluster_bits) - 1);
Qcow2ClusterState *cs = qs->cluster_allocator.
get_cluster_state(cluster_start);
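	/* no tracked state means the cluster allocation has already completed */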
u8 cs_state = (cs == nullptr ? QCOW2_ALLOC_DONE : cs->get_state());
if (cs_state >= QCOW2_ALLOC_ZEROED) {
*expected_op = io_op;
return qcow2_queue_tgt_rw_fast(q, io_op, tag, offset, iod);
}
if (io_op == IORING_OP_WRITE) {
if (cs_state == QCOW2_ALLOC_ZEROING) {
cs->add_waiter(tag);
throw MetaUpdateException();
}
if (cs_state == QCOW2_ALLOC_STARTED) {
int ret = qcow2_queue_tgt_zero_cluster(qs, q, tag,
cluster_start);
if (ret >= 0)
cs->set_state(QCOW2_ALLOC_ZEROING);
*expected_op = IORING_OP_FALLOCATE;
return ret;
}
return 0;
} else {
memset((void *)iod->addr, 0,
iod->nr_sectors << 9);
return 0;
}
}
/* return how many sqes queued */
static int qcow2_queue_tgt_io(const struct ublksrv_queue *q, unsigned io_op,
int tag, u64 offset, u32 *exp_op,
const struct ublksrv_io_desc *iod)
{
int ret;
//we don't support discard yet
if (io_op == IORING_OP_FALLOCATE)
return -ENOTSUP;
if (io_op == IORING_OP_FSYNC) {
ret = qcow2_queue_tgt_fsync(q, io_op, tag,
iod->nr_sectors << 9, offset);
*exp_op = io_op;
} else
ret = qcow2_queue_tgt_rw(q, io_op, tag, offset, iod, exp_op);
return ret;
}
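/* an L2 entry of zero (unallocated) or one with bit 0 set (the qcow2
 * "all zeroes" flag) reads back as zeroes */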
static inline bool l2_entry_read_as_zero(u64 entry)
{
if (!entry || (entry & 0x1))
return true;
return false;
}
static co_io_job __qcow2_handle_io_async(const struct ublksrv_queue *q,
const struct ublk_io_data *data, int tag)
{
struct ublk_io_tgt *io = __ublk_get_io_tgt_data(data);
Qcow2State *qs = queue_to_qcow2state(q);
const struct ublksrv_io_desc *iod = data->iod;
unsigned long start = iod->start_sector << 9;
u64 mapped_start;
qcow2_io_ctx_t ioc(tag, q->q_id);
const struct io_uring_cqe *cqe;
int ret = 0;
unsigned int op = ublksrv_get_op(iod);
bool wait;
qcow2_io_log("%s: tag %d, ublk op %x virt %llx/%u\n",
__func__, tag, op, start, (iod->nr_sectors << 9));
qcow2_assert((start + (unsigned long)(iod->nr_sectors << 9)) <=
qs->get_dev_size());
again:
try {
mapped_start = qs->cluster_map.map_cluster(ioc, start,
op == UBLK_IO_OP_WRITE);
wait = false;
} catch (MetaIoException &meta_error) {
wait = true;
} catch (MetaUpdateException &meta_update_error) {
wait = true;
}
if (wait) {
co_await__suspend_always(tag);
cqe = io->tgt_io_cqe;
io->tgt_io_cqe = NULL;
ret = qcow2_meta_io_done(q, cqe);
if (ret == -EAGAIN)
goto again;
if (ret < 0)
goto exit;
}
qcow2_io_log("%s: tag %d, ublk op %x virt %llx/%u to host %llx\n",
__func__, tag, op, start, (iod->nr_sectors << 9),
mapped_start);
	if (mapped_start == QCOW2_UNMAPPED) {
ublk_err("%s: tag %d virt %lx op %d, unsupported format\n",
__func__, tag, start, op);
ret = -EIO;
	} else if (!mapped_start) {
		// the range isn't mapped to any host cluster: a read is served
		// as zeroes, anything else means the mapping failed
if ((op == UBLK_IO_OP_READ) &&
l2_entry_read_as_zero(mapped_start)) {
ret = iod->nr_sectors << 9;
memset((void *)iod->addr, 0, ret);
} else {
ublk_err("%s: tag %d virt %lx op %d map failed\n",
__func__, tag, start, op);
ret = -EIO;
}
} else {
unsigned io_op = ublksrv_convert_cmd_op(iod);
unsigned exp_op;
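		/* clear bit 63 (the qcow2 COPIED flag) to get the plain host offset */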
mapped_start &= ((1ULL << 63) - 1);
qcow2_assert(mapped_start + (iod->nr_sectors << 9) <=
qs->cluster_allocator.max_physical_size);
queue_io:
		//the only exception thrown here comes from waiting on a cluster
		//that is still being zeroed
try {
ret = qcow2_queue_tgt_io(q, io_op, tag, mapped_start,
&exp_op, iod);
wait = false;
} catch (MetaUpdateException &meta_error) {
wait = true;
}
if (wait) {
co_await__suspend_always(tag);
goto queue_io;
}
if (ret > 0) {
u64 cluster_start = mapped_start &
~((1ULL << qs->header.cluster_bits) - 1);
co_await__suspend_always(tag);
cqe = io->tgt_io_cqe;
ret = cqe->res;
if (ret == -EAGAIN) {
qcow2_log("%s zeroing cluster IO eagain\n",
__func__);
//submit this write IO again
if (user_data_to_op(cqe->user_data) == io_op)
goto queue_io;
//if the cluster zeroing IO isn't done, retry
if (qs->cluster_allocator.
alloc_cluster_reset(cluster_start))
goto queue_io;
}
qcow2_io_log("%s: io done, tag %d res %d user_data %llx\n",
__func__, tag, ret,
cqe->user_data);
if (exp_op != io_op) {
if (user_data_to_op(cqe->user_data) == IORING_OP_FALLOCATE)
qs->cluster_allocator.alloc_cluster_zeroed(q,
tag, cluster_start);
goto queue_io;
}
} else if (ret == 0) {
ret = iod->nr_sectors << 9;
}
}
exit:
if (ret < 0)
ublk_err("%s io failed(%d %lx %u) ret %d\n", __func__,
op, start, iod->nr_sectors, ret);
qcow2_io_log("%s tag %d io complete(%d %llx %lu) ret %d\n", __func__,
tag, op, start, iod->nr_sectors, ret);
ublksrv_complete_io(q, tag, ret);
}
static int qcow2_handle_io_async(const struct ublksrv_queue *q,
const struct ublk_io_data *data)
{
struct ublk_io_tgt *io = __ublk_get_io_tgt_data(data);
io->co = __qcow2_handle_io_async(q, data, data->tag);
return 0;
}
static void qcow2_deinit_tgt(const struct ublksrv_dev *dev)
{
Qcow2State *qs = dev_to_qcow2state(dev);
//now all io slots are available, just use the zero tag
qcow2_io_ctx_t ioc(0, 0);
qs->dump_meta();
delete qs;
}
static void qcow2_tgt_io_done(const struct ublksrv_queue *q,
const struct ublk_io_data *data, const struct io_uring_cqe *cqe)
{
unsigned tag = user_data_to_tag(cqe->user_data);
qcow2_io_log("%s: res %d qid %u tag %u, cmd_op %u\n",
__func__, cqe->res, q->q_id,
user_data_to_tag(cqe->user_data),
user_data_to_op(cqe->user_data));
//special tag is ignored, so far it is used in sending
//fsync during flushing meta
if (tag != 0xffff) {
struct ublk_io_tgt *io = __ublk_get_io_tgt_data(data);
io->tgt_io_cqe = cqe;
io->co.resume();
}
}
static void qcow2_handle_io_bg(const struct ublksrv_queue *q, int nr_queued_io)
{
Qcow2State *qs = queue_to_qcow2state(q);
ublk_dbg(UBLK_DBG_QCOW2_FLUSH | UBLK_DBG_QCOW2_META,
"%s %d, queued io %d\n", __func__, __LINE__, nr_queued_io);
qs->kill_slices(q);
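	/* keep flushing while the queue is idle and dirty meta slices remain */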
again:
qs->meta_flushing.run_flush(q, nr_queued_io);
if (!nr_queued_io && !qs->meta_flushing.is_flushing()) {
if (qs->has_dirty_slice())
goto again;
}
}
static void qcow2_idle(const struct ublksrv_queue *q, bool enter)
{
Qcow2State *qs = queue_to_qcow2state(q);
if (!enter)
return;
qs->shrink_cache();
}
static int qcow2_init_queue(const struct ublksrv_queue *q,
void **queue_data_ptr)
{
Qcow2State *qs = dev_to_qcow2state(q->dev);
*queue_data_ptr = (void *)qs;
return 0;
}
struct ublksrv_tgt_type qcow2_tgt_type = {
.handle_io_async = qcow2_handle_io_async,
.tgt_io_done = qcow2_tgt_io_done,
.handle_io_background = qcow2_handle_io_bg,
.usage_for_add = qcow2_usage_for_add,
.init_tgt = qcow2_init_tgt,
.deinit_tgt = qcow2_deinit_tgt,
.idle_fn = qcow2_idle,
.type = UBLKSRV_TGT_TYPE_QCOW2,
.name = "qcow2",
.recovery_tgt = qcow2_recovery_tgt,
.init_queue = qcow2_init_queue,
};
static void tgt_qcow2_init() __attribute__((constructor));
static void tgt_qcow2_init(void)
{
ublksrv_register_tgt_type(&qcow2_tgt_type);
}