|  | // SPDX-License-Identifier: GPL-2.0 | 
|  | /* | 
|  | * Copyright (C) 2010 Kent Overstreet <[email protected]> | 
|  | * | 
|  | * Uses a block device as cache for other block devices; optimized for SSDs. | 
|  | * All allocation is done in buckets, which should match the erase block size | 
|  | * of the device. | 
|  | * | 
|  | * Buckets containing cached data are kept on a heap sorted by priority; | 
|  | * bucket priority is increased on cache hit, and periodically all the buckets | 
|  | * on the heap have their priority scaled down. This currently is just used as | 
|  | * an LRU but in the future should allow for more intelligent heuristics. | 
|  | * | 
|  | * Buckets have an 8 bit counter; freeing is accomplished by incrementing the | 
|  | * counter. Garbage collection is used to remove stale pointers. | 
|  | * | 
|  | * Indexing is done via a btree; nodes are not necessarily fully sorted, rather | 
|  | * as keys are inserted we only sort the pages that have not yet been written. | 
|  | * When garbage collection is run, we resort the entire node. | 
|  | * | 
|  | * All configuration is done via sysfs; see Documentation/admin-guide/bcache.rst. | 
|  | */ | 
|  |  | 
|  | #include "bcache.h" | 
|  | #include "btree.h" | 
|  | #include "debug.h" | 
|  | #include "extents.h" | 
|  |  | 
|  | #include <linux/slab.h> | 
|  | #include <linux/bitops.h> | 
|  | #include <linux/hash.h> | 
|  | #include <linux/kthread.h> | 
|  | #include <linux/prefetch.h> | 
|  | #include <linux/random.h> | 
|  | #include <linux/rcupdate.h> | 
|  | #include <linux/sched/clock.h> | 
|  | #include <linux/rculist.h> | 
|  | #include <linux/delay.h> | 
|  | #include <trace/events/bcache.h> | 
|  |  | 
|  | /* | 
|  | * Todo: | 
|  | * register_bcache: Return errors out to userspace correctly | 
|  | * | 
|  | * Writeback: don't undirty key until after a cache flush | 
|  | * | 
|  | * Create an iterator for key pointers | 
|  | * | 
|  | * On btree write error, mark bucket such that it won't be freed from the cache | 
|  | * | 
|  | * Journalling: | 
|  | *   Check for bad keys in replay | 
|  | *   Propagate barriers | 
|  | *   Refcount journal entries in journal_replay | 
|  | * | 
|  | * Garbage collection: | 
|  | *   Finish incremental gc | 
|  | *   Gc should free old UUIDs, data for invalid UUIDs | 
|  | * | 
|  | * Provide a way to list backing device UUIDs we have data cached for, and | 
|  | * probably how long it's been since we've seen them, and a way to invalidate | 
|  | * dirty data for devices that will never be attached again | 
|  | * | 
|  | * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so | 
|  | * that based on that and how much dirty data we have we can keep writeback | 
|  | * from being starved | 
|  | * | 
|  | * Add a tracepoint or somesuch to watch for writeback starvation | 
|  | * | 
|  | * When btree depth > 1 and splitting an interior node, we have to make sure | 
|  | * alloc_bucket() cannot fail. This should be true but is not completely | 
|  | * obvious. | 
|  | * | 
|  | * Plugging? | 
|  | * | 
|  | * If data write is less than hard sector size of ssd, round up offset in open | 
|  | * bucket to the next whole sector | 
|  | * | 
|  | * Superblock needs to be fleshed out for multiple cache devices | 
|  | * | 
|  | * Add a sysfs tunable for the number of writeback IOs in flight | 
|  | * | 
|  | * Add a sysfs tunable for the number of open data buckets | 
|  | * | 
|  | * IO tracking: Can we track when one process is doing io on behalf of another? | 
|  | * IO tracking: Don't use just an average, weigh more recent stuff higher | 
|  | * | 
|  | * Test module load/unload | 
|  | */ | 
|  |  | 
|  | #define MAX_NEED_GC		64 | 
|  | #define MAX_SAVE_PRIO		72 | 
|  | #define MAX_GC_TIMES		100 | 
|  | #define MIN_GC_NODES		100 | 
|  | #define GC_SLEEP_MS		100 | 
|  |  | 
|  | #define PTR_DIRTY_BIT		(((uint64_t) 1 << 36)) | 
|  |  | 
|  | #define PTR_HASH(c, k)							\ | 
|  | (((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0)) | 
|  |  | 
|  | static struct workqueue_struct *btree_io_wq; | 
|  |  | 
|  | #define insert_lock(s, b)	((b)->level <= (s)->lock) | 
|  |  | 
|  |  | 
|  | static inline struct bset *write_block(struct btree *b) | 
|  | { | 
|  | return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c->cache); | 
|  | } | 
|  |  | 
|  | static void bch_btree_init_next(struct btree *b) | 
|  | { | 
|  | /* If not a leaf node, always sort */ | 
|  | if (b->level && b->keys.nsets) | 
|  | bch_btree_sort(&b->keys, &b->c->sort); | 
|  | else | 
|  | bch_btree_sort_lazy(&b->keys, &b->c->sort); | 
|  |  | 
|  | if (b->written < btree_blocks(b)) | 
|  | bch_bset_init_next(&b->keys, write_block(b), | 
|  | bset_magic(&b->c->cache->sb)); | 
|  |  | 
|  | } | 
|  |  | 
|  | /* Btree key manipulation */ | 
|  |  | 
|  | void bkey_put(struct cache_set *c, struct bkey *k) | 
|  | { | 
|  | unsigned int i; | 
|  |  | 
|  | for (i = 0; i < KEY_PTRS(k); i++) | 
|  | if (ptr_available(c, k, i)) | 
|  | atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin); | 
|  | } | 
|  |  | 
|  | /* Btree IO */ | 
|  |  | 
|  | static uint64_t btree_csum_set(struct btree *b, struct bset *i) | 
|  | { | 
|  | uint64_t crc = b->key.ptr[0]; | 
|  | void *data = (void *) i + 8, *end = bset_bkey_last(i); | 
|  |  | 
|  | crc = bch_crc64_update(crc, data, end - data); | 
|  | return crc ^ 0xffffffffffffffffULL; | 
|  | } | 
|  |  | 
|  | void bch_btree_node_read_done(struct btree *b) | 
|  | { | 
|  | const char *err = "bad btree header"; | 
|  | struct bset *i = btree_bset_first(b); | 
|  | struct btree_iter *iter; | 
|  |  | 
|  | /* | 
|  | * c->fill_iter can allocate an iterator with more memory space | 
|  | * than static MAX_BSETS. | 
|  | * See the comment arount cache_set->fill_iter. | 
|  | */ | 
|  | iter = mempool_alloc(&b->c->fill_iter, GFP_NOIO); | 
|  | iter->size = b->c->cache->sb.bucket_size / b->c->cache->sb.block_size; | 
|  | iter->used = 0; | 
|  |  | 
|  | #ifdef CONFIG_BCACHE_DEBUG | 
|  | iter->b = &b->keys; | 
|  | #endif | 
|  |  | 
|  | if (!i->seq) | 
|  | goto err; | 
|  |  | 
|  | for (; | 
|  | b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq; | 
|  | i = write_block(b)) { | 
|  | err = "unsupported bset version"; | 
|  | if (i->version > BCACHE_BSET_VERSION) | 
|  | goto err; | 
|  |  | 
|  | err = "bad btree header"; | 
|  | if (b->written + set_blocks(i, block_bytes(b->c->cache)) > | 
|  | btree_blocks(b)) | 
|  | goto err; | 
|  |  | 
|  | err = "bad magic"; | 
|  | if (i->magic != bset_magic(&b->c->cache->sb)) | 
|  | goto err; | 
|  |  | 
|  | err = "bad checksum"; | 
|  | switch (i->version) { | 
|  | case 0: | 
|  | if (i->csum != csum_set(i)) | 
|  | goto err; | 
|  | break; | 
|  | case BCACHE_BSET_VERSION: | 
|  | if (i->csum != btree_csum_set(b, i)) | 
|  | goto err; | 
|  | break; | 
|  | } | 
|  |  | 
|  | err = "empty set"; | 
|  | if (i != b->keys.set[0].data && !i->keys) | 
|  | goto err; | 
|  |  | 
|  | bch_btree_iter_push(iter, i->start, bset_bkey_last(i)); | 
|  |  | 
|  | b->written += set_blocks(i, block_bytes(b->c->cache)); | 
|  | } | 
|  |  | 
|  | err = "corrupted btree"; | 
|  | for (i = write_block(b); | 
|  | bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key); | 
|  | i = ((void *) i) + block_bytes(b->c->cache)) | 
|  | if (i->seq == b->keys.set[0].data->seq) | 
|  | goto err; | 
|  |  | 
|  | bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort); | 
|  |  | 
|  | i = b->keys.set[0].data; | 
|  | err = "short btree key"; | 
|  | if (b->keys.set[0].size && | 
|  | bkey_cmp(&b->key, &b->keys.set[0].end) < 0) | 
|  | goto err; | 
|  |  | 
|  | if (b->written < btree_blocks(b)) | 
|  | bch_bset_init_next(&b->keys, write_block(b), | 
|  | bset_magic(&b->c->cache->sb)); | 
|  | out: | 
|  | mempool_free(iter, &b->c->fill_iter); | 
|  | return; | 
|  | err: | 
|  | set_btree_node_io_error(b); | 
|  | bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys", | 
|  | err, PTR_BUCKET_NR(b->c, &b->key, 0), | 
|  | bset_block_offset(b, i), i->keys); | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | static void btree_node_read_endio(struct bio *bio) | 
|  | { | 
|  | struct closure *cl = bio->bi_private; | 
|  |  | 
|  | closure_put(cl); | 
|  | } | 
|  |  | 
|  | static void bch_btree_node_read(struct btree *b) | 
|  | { | 
|  | uint64_t start_time = local_clock(); | 
|  | struct closure cl; | 
|  | struct bio *bio; | 
|  |  | 
|  | trace_bcache_btree_read(b); | 
|  |  | 
|  | closure_init_stack(&cl); | 
|  |  | 
|  | bio = bch_bbio_alloc(b->c); | 
|  | bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9; | 
|  | bio->bi_end_io	= btree_node_read_endio; | 
|  | bio->bi_private	= &cl; | 
|  | bio->bi_opf = REQ_OP_READ | REQ_META; | 
|  |  | 
|  | bch_bio_map(bio, b->keys.set[0].data); | 
|  |  | 
|  | bch_submit_bbio(bio, b->c, &b->key, 0); | 
|  | closure_sync(&cl); | 
|  |  | 
|  | if (bio->bi_status) | 
|  | set_btree_node_io_error(b); | 
|  |  | 
|  | bch_bbio_free(bio, b->c); | 
|  |  | 
|  | if (btree_node_io_error(b)) | 
|  | goto err; | 
|  |  | 
|  | bch_btree_node_read_done(b); | 
|  | bch_time_stats_update(&b->c->btree_read_time, start_time); | 
|  |  | 
|  | return; | 
|  | err: | 
|  | bch_cache_set_error(b->c, "io error reading bucket %zu", | 
|  | PTR_BUCKET_NR(b->c, &b->key, 0)); | 
|  | } | 
|  |  | 
|  | static void btree_complete_write(struct btree *b, struct btree_write *w) | 
|  | { | 
|  | if (w->prio_blocked && | 
|  | !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked)) | 
|  | wake_up_allocators(b->c); | 
|  |  | 
|  | if (w->journal) { | 
|  | atomic_dec_bug(w->journal); | 
|  | __closure_wake_up(&b->c->journal.wait); | 
|  | } | 
|  |  | 
|  | w->prio_blocked	= 0; | 
|  | w->journal	= NULL; | 
|  | } | 
|  |  | 
|  | static void btree_node_write_unlock(struct closure *cl) | 
|  | { | 
|  | struct btree *b = container_of(cl, struct btree, io); | 
|  |  | 
|  | up(&b->io_mutex); | 
|  | } | 
|  |  | 
|  | static void __btree_node_write_done(struct closure *cl) | 
|  | { | 
|  | struct btree *b = container_of(cl, struct btree, io); | 
|  | struct btree_write *w = btree_prev_write(b); | 
|  |  | 
|  | bch_bbio_free(b->bio, b->c); | 
|  | b->bio = NULL; | 
|  | btree_complete_write(b, w); | 
|  |  | 
|  | if (btree_node_dirty(b)) | 
|  | queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); | 
|  |  | 
|  | closure_return_with_destructor(cl, btree_node_write_unlock); | 
|  | } | 
|  |  | 
|  | static void btree_node_write_done(struct closure *cl) | 
|  | { | 
|  | struct btree *b = container_of(cl, struct btree, io); | 
|  |  | 
|  | bio_free_pages(b->bio); | 
|  | __btree_node_write_done(cl); | 
|  | } | 
|  |  | 
|  | static void btree_node_write_endio(struct bio *bio) | 
|  | { | 
|  | struct closure *cl = bio->bi_private; | 
|  | struct btree *b = container_of(cl, struct btree, io); | 
|  |  | 
|  | if (bio->bi_status) | 
|  | set_btree_node_io_error(b); | 
|  |  | 
|  | bch_bbio_count_io_errors(b->c, bio, bio->bi_status, "writing btree"); | 
|  | closure_put(cl); | 
|  | } | 
|  |  | 
|  | static void do_btree_node_write(struct btree *b) | 
|  | { | 
|  | struct closure *cl = &b->io; | 
|  | struct bset *i = btree_bset_last(b); | 
|  | BKEY_PADDED(key) k; | 
|  |  | 
|  | i->version	= BCACHE_BSET_VERSION; | 
|  | i->csum		= btree_csum_set(b, i); | 
|  |  | 
|  | BUG_ON(b->bio); | 
|  | b->bio = bch_bbio_alloc(b->c); | 
|  |  | 
|  | b->bio->bi_end_io	= btree_node_write_endio; | 
|  | b->bio->bi_private	= cl; | 
|  | b->bio->bi_iter.bi_size	= roundup(set_bytes(i), block_bytes(b->c->cache)); | 
|  | b->bio->bi_opf		= REQ_OP_WRITE | REQ_META | REQ_FUA; | 
|  | bch_bio_map(b->bio, i); | 
|  |  | 
|  | /* | 
|  | * If we're appending to a leaf node, we don't technically need FUA - | 
|  | * this write just needs to be persisted before the next journal write, | 
|  | * which will be marked FLUSH|FUA. | 
|  | * | 
|  | * Similarly if we're writing a new btree root - the pointer is going to | 
|  | * be in the next journal entry. | 
|  | * | 
|  | * But if we're writing a new btree node (that isn't a root) or | 
|  | * appending to a non leaf btree node, we need either FUA or a flush | 
|  | * when we write the parent with the new pointer. FUA is cheaper than a | 
|  | * flush, and writes appending to leaf nodes aren't blocking anything so | 
|  | * just make all btree node writes FUA to keep things sane. | 
|  | */ | 
|  |  | 
|  | bkey_copy(&k.key, &b->key); | 
|  | SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + | 
|  | bset_sector_offset(&b->keys, i)); | 
|  |  | 
|  | if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) { | 
|  | struct bio_vec *bv; | 
|  | void *addr = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1)); | 
|  | struct bvec_iter_all iter_all; | 
|  |  | 
|  | bio_for_each_segment_all(bv, b->bio, iter_all) { | 
|  | memcpy(page_address(bv->bv_page), addr, PAGE_SIZE); | 
|  | addr += PAGE_SIZE; | 
|  | } | 
|  |  | 
|  | bch_submit_bbio(b->bio, b->c, &k.key, 0); | 
|  |  | 
|  | continue_at(cl, btree_node_write_done, NULL); | 
|  | } else { | 
|  | /* | 
|  | * No problem for multipage bvec since the bio is | 
|  | * just allocated | 
|  | */ | 
|  | b->bio->bi_vcnt = 0; | 
|  | bch_bio_map(b->bio, i); | 
|  |  | 
|  | bch_submit_bbio(b->bio, b->c, &k.key, 0); | 
|  |  | 
|  | closure_sync(cl); | 
|  | continue_at_nobarrier(cl, __btree_node_write_done, NULL); | 
|  | } | 
|  | } | 
|  |  | 
|  | void __bch_btree_node_write(struct btree *b, struct closure *parent) | 
|  | { | 
|  | struct bset *i = btree_bset_last(b); | 
|  |  | 
|  | lockdep_assert_held(&b->write_lock); | 
|  |  | 
|  | trace_bcache_btree_write(b); | 
|  |  | 
|  | BUG_ON(current->bio_list); | 
|  | BUG_ON(b->written >= btree_blocks(b)); | 
|  | BUG_ON(b->written && !i->keys); | 
|  | BUG_ON(btree_bset_first(b)->seq != i->seq); | 
|  | bch_check_keys(&b->keys, "writing"); | 
|  |  | 
|  | cancel_delayed_work(&b->work); | 
|  |  | 
|  | /* If caller isn't waiting for write, parent refcount is cache set */ | 
|  | down(&b->io_mutex); | 
|  | closure_init(&b->io, parent ?: &b->c->cl); | 
|  |  | 
|  | clear_bit(BTREE_NODE_dirty,	 &b->flags); | 
|  | change_bit(BTREE_NODE_write_idx, &b->flags); | 
|  |  | 
|  | do_btree_node_write(b); | 
|  |  | 
|  | atomic_long_add(set_blocks(i, block_bytes(b->c->cache)) * b->c->cache->sb.block_size, | 
|  | &PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written); | 
|  |  | 
|  | b->written += set_blocks(i, block_bytes(b->c->cache)); | 
|  | } | 
|  |  | 
|  | void bch_btree_node_write(struct btree *b, struct closure *parent) | 
|  | { | 
|  | unsigned int nsets = b->keys.nsets; | 
|  |  | 
|  | lockdep_assert_held(&b->lock); | 
|  |  | 
|  | __bch_btree_node_write(b, parent); | 
|  |  | 
|  | /* | 
|  | * do verify if there was more than one set initially (i.e. we did a | 
|  | * sort) and we sorted down to a single set: | 
|  | */ | 
|  | if (nsets && !b->keys.nsets) | 
|  | bch_btree_verify(b); | 
|  |  | 
|  | bch_btree_init_next(b); | 
|  | } | 
|  |  | 
|  | static void bch_btree_node_write_sync(struct btree *b) | 
|  | { | 
|  | struct closure cl; | 
|  |  | 
|  | closure_init_stack(&cl); | 
|  |  | 
|  | mutex_lock(&b->write_lock); | 
|  | bch_btree_node_write(b, &cl); | 
|  | mutex_unlock(&b->write_lock); | 
|  |  | 
|  | closure_sync(&cl); | 
|  | } | 
|  |  | 
|  | static void btree_node_write_work(struct work_struct *w) | 
|  | { | 
|  | struct btree *b = container_of(to_delayed_work(w), struct btree, work); | 
|  |  | 
|  | mutex_lock(&b->write_lock); | 
|  | if (btree_node_dirty(b)) | 
|  | __bch_btree_node_write(b, NULL); | 
|  | mutex_unlock(&b->write_lock); | 
|  | } | 
|  |  | 
|  | static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref) | 
|  | { | 
|  | struct bset *i = btree_bset_last(b); | 
|  | struct btree_write *w = btree_current_write(b); | 
|  |  | 
|  | lockdep_assert_held(&b->write_lock); | 
|  |  | 
|  | BUG_ON(!b->written); | 
|  | BUG_ON(!i->keys); | 
|  |  | 
|  | if (!btree_node_dirty(b)) | 
|  | queue_delayed_work(btree_io_wq, &b->work, 30 * HZ); | 
|  |  | 
|  | set_btree_node_dirty(b); | 
|  |  | 
|  | /* | 
|  | * w->journal is always the oldest journal pin of all bkeys | 
|  | * in the leaf node, to make sure the oldest jset seq won't | 
|  | * be increased before this btree node is flushed. | 
|  | */ | 
|  | if (journal_ref) { | 
|  | if (w->journal && | 
|  | journal_pin_cmp(b->c, w->journal, journal_ref)) { | 
|  | atomic_dec_bug(w->journal); | 
|  | w->journal = NULL; | 
|  | } | 
|  |  | 
|  | if (!w->journal) { | 
|  | w->journal = journal_ref; | 
|  | atomic_inc(w->journal); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* Force write if set is too big */ | 
|  | if (set_bytes(i) > PAGE_SIZE - 48 && | 
|  | !current->bio_list) | 
|  | bch_btree_node_write(b, NULL); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * Btree in memory cache - allocation/freeing | 
|  | * mca -> memory cache | 
|  | */ | 
|  |  | 
|  | #define mca_reserve(c)	(((!IS_ERR_OR_NULL(c->root) && c->root->level) \ | 
|  | ? c->root->level : 1) * 8 + 16) | 
|  | #define mca_can_free(c)						\ | 
|  | max_t(int, 0, c->btree_cache_used - mca_reserve(c)) | 
|  |  | 
|  | static void mca_data_free(struct btree *b) | 
|  | { | 
|  | BUG_ON(b->io_mutex.count != 1); | 
|  |  | 
|  | bch_btree_keys_free(&b->keys); | 
|  |  | 
|  | b->c->btree_cache_used--; | 
|  | list_move(&b->list, &b->c->btree_cache_freed); | 
|  | } | 
|  |  | 
|  | static void mca_bucket_free(struct btree *b) | 
|  | { | 
|  | BUG_ON(btree_node_dirty(b)); | 
|  |  | 
|  | b->key.ptr[0] = 0; | 
|  | hlist_del_init_rcu(&b->hash); | 
|  | list_move(&b->list, &b->c->btree_cache_freeable); | 
|  | } | 
|  |  | 
|  | static unsigned int btree_order(struct bkey *k) | 
|  | { | 
|  | return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); | 
|  | } | 
|  |  | 
|  | static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) | 
|  | { | 
|  | if (!bch_btree_keys_alloc(&b->keys, | 
|  | max_t(unsigned int, | 
|  | ilog2(b->c->btree_pages), | 
|  | btree_order(k)), | 
|  | gfp)) { | 
|  | b->c->btree_cache_used++; | 
|  | list_move(&b->list, &b->c->btree_cache); | 
|  | } else { | 
|  | list_move(&b->list, &b->c->btree_cache_freed); | 
|  | } | 
|  | } | 
|  |  | 
|  | static struct btree *mca_bucket_alloc(struct cache_set *c, | 
|  | struct bkey *k, gfp_t gfp) | 
|  | { | 
|  | /* | 
|  | * kzalloc() is necessary here for initialization, | 
|  | * see code comments in bch_btree_keys_init(). | 
|  | */ | 
|  | struct btree *b = kzalloc(sizeof(struct btree), gfp); | 
|  |  | 
|  | if (!b) | 
|  | return NULL; | 
|  |  | 
|  | init_rwsem(&b->lock); | 
|  | lockdep_set_novalidate_class(&b->lock); | 
|  | mutex_init(&b->write_lock); | 
|  | lockdep_set_novalidate_class(&b->write_lock); | 
|  | INIT_LIST_HEAD(&b->list); | 
|  | INIT_DELAYED_WORK(&b->work, btree_node_write_work); | 
|  | b->c = c; | 
|  | sema_init(&b->io_mutex, 1); | 
|  |  | 
|  | mca_data_alloc(b, k, gfp); | 
|  | return b; | 
|  | } | 
|  |  | 
|  | static int mca_reap(struct btree *b, unsigned int min_order, bool flush) | 
|  | { | 
|  | struct closure cl; | 
|  |  | 
|  | closure_init_stack(&cl); | 
|  | lockdep_assert_held(&b->c->bucket_lock); | 
|  |  | 
|  | if (!down_write_trylock(&b->lock)) | 
|  | return -ENOMEM; | 
|  |  | 
|  | BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data); | 
|  |  | 
|  | if (b->keys.page_order < min_order) | 
|  | goto out_unlock; | 
|  |  | 
|  | if (!flush) { | 
|  | if (btree_node_dirty(b)) | 
|  | goto out_unlock; | 
|  |  | 
|  | if (down_trylock(&b->io_mutex)) | 
|  | goto out_unlock; | 
|  | up(&b->io_mutex); | 
|  | } | 
|  |  | 
|  | retry: | 
|  | /* | 
|  | * BTREE_NODE_dirty might be cleared in btree_flush_btree() by | 
|  | * __bch_btree_node_write(). To avoid an extra flush, acquire | 
|  | * b->write_lock before checking BTREE_NODE_dirty bit. | 
|  | */ | 
|  | mutex_lock(&b->write_lock); | 
|  | /* | 
|  | * If this btree node is selected in btree_flush_write() by journal | 
|  | * code, delay and retry until the node is flushed by journal code | 
|  | * and BTREE_NODE_journal_flush bit cleared by btree_flush_write(). | 
|  | */ | 
|  | if (btree_node_journal_flush(b)) { | 
|  | pr_debug("bnode %p is flushing by journal, retry\n", b); | 
|  | mutex_unlock(&b->write_lock); | 
|  | udelay(1); | 
|  | goto retry; | 
|  | } | 
|  |  | 
|  | if (btree_node_dirty(b)) | 
|  | __bch_btree_node_write(b, &cl); | 
|  | mutex_unlock(&b->write_lock); | 
|  |  | 
|  | closure_sync(&cl); | 
|  |  | 
|  | /* wait for any in flight btree write */ | 
|  | down(&b->io_mutex); | 
|  | up(&b->io_mutex); | 
|  |  | 
|  | return 0; | 
|  | out_unlock: | 
|  | rw_unlock(true, b); | 
|  | return -ENOMEM; | 
|  | } | 
|  |  | 
|  | static unsigned long bch_mca_scan(struct shrinker *shrink, | 
|  | struct shrink_control *sc) | 
|  | { | 
|  | struct cache_set *c = container_of(shrink, struct cache_set, shrink); | 
|  | struct btree *b, *t; | 
|  | unsigned long i, nr = sc->nr_to_scan; | 
|  | unsigned long freed = 0; | 
|  | unsigned int btree_cache_used; | 
|  |  | 
|  | if (c->shrinker_disabled) | 
|  | return SHRINK_STOP; | 
|  |  | 
|  | if (c->btree_cache_alloc_lock) | 
|  | return SHRINK_STOP; | 
|  |  | 
|  | /* Return -1 if we can't do anything right now */ | 
|  | if (sc->gfp_mask & __GFP_IO) | 
|  | mutex_lock(&c->bucket_lock); | 
|  | else if (!mutex_trylock(&c->bucket_lock)) | 
|  | return -1; | 
|  |  | 
|  | /* | 
|  | * It's _really_ critical that we don't free too many btree nodes - we | 
|  | * have to always leave ourselves a reserve. The reserve is how we | 
|  | * guarantee that allocating memory for a new btree node can always | 
|  | * succeed, so that inserting keys into the btree can always succeed and | 
|  | * IO can always make forward progress: | 
|  | */ | 
|  | nr /= c->btree_pages; | 
|  | if (nr == 0) | 
|  | nr = 1; | 
|  | nr = min_t(unsigned long, nr, mca_can_free(c)); | 
|  |  | 
|  | i = 0; | 
|  | btree_cache_used = c->btree_cache_used; | 
|  | list_for_each_entry_safe_reverse(b, t, &c->btree_cache_freeable, list) { | 
|  | if (nr <= 0) | 
|  | goto out; | 
|  |  | 
|  | if (!mca_reap(b, 0, false)) { | 
|  | mca_data_free(b); | 
|  | rw_unlock(true, b); | 
|  | freed++; | 
|  | } | 
|  | nr--; | 
|  | i++; | 
|  | } | 
|  |  | 
|  | list_for_each_entry_safe_reverse(b, t, &c->btree_cache, list) { | 
|  | if (nr <= 0 || i >= btree_cache_used) | 
|  | goto out; | 
|  |  | 
|  | if (!mca_reap(b, 0, false)) { | 
|  | mca_bucket_free(b); | 
|  | mca_data_free(b); | 
|  | rw_unlock(true, b); | 
|  | freed++; | 
|  | } | 
|  |  | 
|  | nr--; | 
|  | i++; | 
|  | } | 
|  | out: | 
|  | mutex_unlock(&c->bucket_lock); | 
|  | return freed * c->btree_pages; | 
|  | } | 
|  |  | 
|  | static unsigned long bch_mca_count(struct shrinker *shrink, | 
|  | struct shrink_control *sc) | 
|  | { | 
|  | struct cache_set *c = container_of(shrink, struct cache_set, shrink); | 
|  |  | 
|  | if (c->shrinker_disabled) | 
|  | return 0; | 
|  |  | 
|  | if (c->btree_cache_alloc_lock) | 
|  | return 0; | 
|  |  | 
|  | return mca_can_free(c) * c->btree_pages; | 
|  | } | 
|  |  | 
|  | void bch_btree_cache_free(struct cache_set *c) | 
|  | { | 
|  | struct btree *b; | 
|  | struct closure cl; | 
|  |  | 
|  | closure_init_stack(&cl); | 
|  |  | 
|  | if (c->shrink.list.next) | 
|  | unregister_shrinker(&c->shrink); | 
|  |  | 
|  | mutex_lock(&c->bucket_lock); | 
|  |  | 
|  | #ifdef CONFIG_BCACHE_DEBUG | 
|  | if (c->verify_data) | 
|  | list_move(&c->verify_data->list, &c->btree_cache); | 
|  |  | 
|  | free_pages((unsigned long) c->verify_ondisk, ilog2(meta_bucket_pages(&c->cache->sb))); | 
|  | #endif | 
|  |  | 
|  | list_splice(&c->btree_cache_freeable, | 
|  | &c->btree_cache); | 
|  |  | 
|  | while (!list_empty(&c->btree_cache)) { | 
|  | b = list_first_entry(&c->btree_cache, struct btree, list); | 
|  |  | 
|  | /* | 
|  | * This function is called by cache_set_free(), no I/O | 
|  | * request on cache now, it is unnecessary to acquire | 
|  | * b->write_lock before clearing BTREE_NODE_dirty anymore. | 
|  | */ | 
|  | if (btree_node_dirty(b)) { | 
|  | btree_complete_write(b, btree_current_write(b)); | 
|  | clear_bit(BTREE_NODE_dirty, &b->flags); | 
|  | } | 
|  | mca_data_free(b); | 
|  | } | 
|  |  | 
|  | while (!list_empty(&c->btree_cache_freed)) { | 
|  | b = list_first_entry(&c->btree_cache_freed, | 
|  | struct btree, list); | 
|  | list_del(&b->list); | 
|  | cancel_delayed_work_sync(&b->work); | 
|  | kfree(b); | 
|  | } | 
|  |  | 
|  | mutex_unlock(&c->bucket_lock); | 
|  | } | 
|  |  | 
|  | int bch_btree_cache_alloc(struct cache_set *c) | 
|  | { | 
|  | unsigned int i; | 
|  |  | 
|  | for (i = 0; i < mca_reserve(c); i++) | 
|  | if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL)) | 
|  | return -ENOMEM; | 
|  |  | 
|  | list_splice_init(&c->btree_cache, | 
|  | &c->btree_cache_freeable); | 
|  |  | 
|  | #ifdef CONFIG_BCACHE_DEBUG | 
|  | mutex_init(&c->verify_lock); | 
|  |  | 
|  | c->verify_ondisk = (void *) | 
|  | __get_free_pages(GFP_KERNEL|__GFP_COMP, | 
|  | ilog2(meta_bucket_pages(&c->cache->sb))); | 
|  | if (!c->verify_ondisk) { | 
|  | /* | 
|  | * Don't worry about the mca_rereserve buckets | 
|  | * allocated in previous for-loop, they will be | 
|  | * handled properly in bch_cache_set_unregister(). | 
|  | */ | 
|  | return -ENOMEM; | 
|  | } | 
|  |  | 
|  | c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); | 
|  |  | 
|  | if (c->verify_data && | 
|  | c->verify_data->keys.set->data) | 
|  | list_del_init(&c->verify_data->list); | 
|  | else | 
|  | c->verify_data = NULL; | 
|  | #endif | 
|  |  | 
|  | c->shrink.count_objects = bch_mca_count; | 
|  | c->shrink.scan_objects = bch_mca_scan; | 
|  | c->shrink.seeks = 4; | 
|  | c->shrink.batch = c->btree_pages * 2; | 
|  |  | 
|  | if (register_shrinker(&c->shrink)) | 
|  | pr_warn("bcache: %s: could not register shrinker\n", | 
|  | __func__); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | /* Btree in memory cache - hash table */ | 
|  |  | 
|  | static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k) | 
|  | { | 
|  | return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; | 
|  | } | 
|  |  | 
|  | static struct btree *mca_find(struct cache_set *c, struct bkey *k) | 
|  | { | 
|  | struct btree *b; | 
|  |  | 
|  | rcu_read_lock(); | 
|  | hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) | 
|  | if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) | 
|  | goto out; | 
|  | b = NULL; | 
|  | out: | 
|  | rcu_read_unlock(); | 
|  | return b; | 
|  | } | 
|  |  | 
|  | static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op) | 
|  | { | 
|  | spin_lock(&c->btree_cannibalize_lock); | 
|  | if (likely(c->btree_cache_alloc_lock == NULL)) { | 
|  | c->btree_cache_alloc_lock = current; | 
|  | } else if (c->btree_cache_alloc_lock != current) { | 
|  | if (op) | 
|  | prepare_to_wait(&c->btree_cache_wait, &op->wait, | 
|  | TASK_UNINTERRUPTIBLE); | 
|  | spin_unlock(&c->btree_cannibalize_lock); | 
|  | return -EINTR; | 
|  | } | 
|  | spin_unlock(&c->btree_cannibalize_lock); | 
|  |  | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op, | 
|  | struct bkey *k) | 
|  | { | 
|  | struct btree *b; | 
|  |  | 
|  | trace_bcache_btree_cache_cannibalize(c); | 
|  |  | 
|  | if (mca_cannibalize_lock(c, op)) | 
|  | return ERR_PTR(-EINTR); | 
|  |  | 
|  | list_for_each_entry_reverse(b, &c->btree_cache, list) | 
|  | if (!mca_reap(b, btree_order(k), false)) | 
|  | return b; | 
|  |  | 
|  | list_for_each_entry_reverse(b, &c->btree_cache, list) | 
|  | if (!mca_reap(b, btree_order(k), true)) | 
|  | return b; | 
|  |  | 
|  | WARN(1, "btree cache cannibalize failed\n"); | 
|  | return ERR_PTR(-ENOMEM); | 
|  | } | 
|  |  | 
|  | /* | 
|  | * We can only have one thread cannibalizing other cached btree nodes at a time, | 
|  | * or we'll deadlock. We use an open coded mutex to ensure that, which a | 
|  | * cannibalize_bucket() will take. This means every time we unlock the root of | 
|  | * the btree, we need to release this lock if we have it held. | 
|  | */ | 
|  | static void bch_cannibalize_unlock(struct cache_set *c) | 
|  | { | 
|  | spin_lock(&c->btree_cannibalize_lock); | 
|  | if (c->btree_cache_alloc_lock == current) { | 
|  | c->btree_cache_alloc_lock = NULL; | 
|  | wake_up(&c->btree_cache_wait); | 
|  | } | 
|  | spin_unlock(&c->btree_cannibalize_lock); | 
|  | } | 
|  |  | 
|  | static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op, | 
|  | struct bkey *k, int level) | 
|  | { | 
|  | struct btree *b; | 
|  |  | 
|  | BUG_ON(current->bio_list); | 
|  |  | 
|  | lockdep_assert_held(&c->bucket_lock); | 
|  |  | 
|  | if (mca_find(c, k)) | 
|  | return NULL; | 
|  |  | 
|  | /* btree_free() doesn't free memory; it sticks the node on the end of | 
|  | * the list. Check if there's any freed nodes there: | 
|  | */ | 
|  | list_for_each_entry(b, &c->btree_cache_freeable, list) | 
|  | if (!mca_reap(b, btree_order(k), false)) | 
|  | goto out; | 
|  |  | 
|  | /* We never free struct btree itself, just the memory that holds the on | 
|  | * disk node. Check the freed list before allocating a new one: | 
|  | */ | 
|  | list_for_each_entry(b, &c->btree_cache_freed, list) | 
|  | if (!mca_reap(b, 0, false)) { | 
|  | mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); | 
|  | if (!b->keys.set[0].data) | 
|  | goto err; | 
|  | else | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); | 
|  | if (!b) | 
|  | goto err; | 
|  |  | 
|  | BUG_ON(!down_write_trylock(&b->lock)); | 
|  | if (!b->keys.set->data) | 
|  | goto err; | 
|  | out: | 
|  | BUG_ON(b->io_mutex.count != 1); | 
|  |  | 
|  | bkey_copy(&b->key, k); | 
|  | list_move(&b->list, &c->btree_cache); | 
|  | hlist_del_init_rcu(&b->hash); | 
|  | hlist_add_head_rcu(&b->hash, mca_hash(c, k)); | 
|  |  | 
|  | lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); | 
|  | b->parent	= (void *) ~0UL; | 
|  | b->flags	= 0; | 
|  | b->written	= 0; | 
|  | b->level	= level; | 
|  |  | 
|  | if (!b->level) | 
|  | bch_btree_keys_init(&b->keys, &bch_extent_keys_ops, | 
|  | &b->c->expensive_debug_checks); | 
|  | else | 
|  | bch_btree_keys_init(&b->keys, &bch_btree_keys_ops, | 
|  | &b->c->expensive_debug_checks); | 
|  |  | 
|  | return b; | 
|  | err: | 
|  | if (b) | 
|  | rw_unlock(true, b); | 
|  |  | 
|  | b = mca_cannibalize(c, op, k); | 
|  | if (!IS_ERR(b)) | 
|  | goto out; | 
|  |  | 
|  | return b; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * bch_btree_node_get - find a btree node in the cache and lock it, reading it | 
|  | * in from disk if necessary. | 
|  | * | 
|  | * If IO is necessary and running under submit_bio_noacct, returns -EAGAIN. | 
|  | * | 
|  | * The btree node will have either a read or a write lock held, depending on | 
|  | * level and op->lock. | 
|  | */ | 
|  | struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op, | 
|  | struct bkey *k, int level, bool write, | 
|  | struct btree *parent) | 
|  | { | 
|  | int i = 0; | 
|  | struct btree *b; | 
|  |  | 
|  | BUG_ON(level < 0); | 
|  | retry: | 
|  | b = mca_find(c, k); | 
|  |  | 
|  | if (!b) { | 
|  | if (current->bio_list) | 
|  | return ERR_PTR(-EAGAIN); | 
|  |  | 
|  | mutex_lock(&c->bucket_lock); | 
|  | b = mca_alloc(c, op, k, level); | 
|  | mutex_unlock(&c->bucket_lock); | 
|  |  | 
|  | if (!b) | 
|  | goto retry; | 
|  | if (IS_ERR(b)) | 
|  | return b; | 
|  |  | 
|  | bch_btree_node_read(b); | 
|  |  | 
|  | if (!write) | 
|  | downgrade_write(&b->lock); | 
|  | } else { | 
|  | rw_lock(write, b, level); | 
|  | if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { | 
|  | rw_unlock(write, b); | 
|  | goto retry; | 
|  | } | 
|  | BUG_ON(b->level != level); | 
|  | } | 
|  |  | 
|  | if (btree_node_io_error(b)) { | 
|  | rw_unlock(write, b); | 
|  | return ERR_PTR(-EIO); | 
|  | } | 
|  |  | 
|  | BUG_ON(!b->written); | 
|  |  | 
|  | b->parent = parent; | 
|  |  | 
|  | for (; i <= b->keys.nsets && b->keys.set[i].size; i++) { | 
|  | prefetch(b->keys.set[i].tree); | 
|  | prefetch(b->keys.set[i].data); | 
|  | } | 
|  |  | 
|  | for (; i <= b->keys.nsets; i++) | 
|  | prefetch(b->keys.set[i].data); | 
|  |  | 
|  | return b; | 
|  | } | 
|  |  | 
|  | static void btree_node_prefetch(struct btree *parent, struct bkey *k) | 
|  | { | 
|  | struct btree *b; | 
|  |  | 
|  | mutex_lock(&parent->c->bucket_lock); | 
|  | b = mca_alloc(parent->c, NULL, k, parent->level - 1); | 
|  | mutex_unlock(&parent->c->bucket_lock); | 
|  |  | 
|  | if (!IS_ERR_OR_NULL(b)) { | 
|  | b->parent = parent; | 
|  | bch_btree_node_read(b); | 
|  | rw_unlock(true, b); | 
|  | } | 
|  | } | 
|  |  | 
|  | /* Btree alloc */ | 
|  |  | 
|  | static void btree_node_free(struct btree *b) | 
|  | { | 
|  | trace_bcache_btree_node_free(b); | 
|  |  | 
|  | BUG_ON(b == b->c->root); | 
|  |  | 
|  | retry: | 
|  | mutex_lock(&b->write_lock); | 
|  | /* | 
|  | * If the btree node is selected and flushing in btree_flush_write(), | 
|  | * delay and retry until the BTREE_NODE_journal_flush bit cleared, | 
|  | * then it is safe to free the btree node here. Otherwise this btree | 
|  | * node will be in race condition. | 
|  | */ | 
|  | if (btree_node_journal_flush(b)) { | 
|  | mutex_unlock(&b->write_lock); | 
|  | pr_debug("bnode %p journal_flush set, retry\n", b); | 
|  | udelay(1); | 
|  | goto retry; | 
|  | } | 
|  |  | 
|  | if (btree_node_dirty(b)) { | 
|  | btree_complete_write(b, btree_current_write(b)); | 
|  | clear_bit(BTREE_NODE_dirty, &b->flags); | 
|  | } | 
|  |  | 
|  | mutex_unlock(&b->write_lock); | 
|  |  | 
|  | cancel_delayed_work(&b->work); | 
|  |  | 
|  | mutex_lock(&b->c->bucket_lock); | 
|  | bch_bucket_free(b->c, &b->key); | 
|  | mca_bucket_free(b); | 
|  | mutex_unlock(&b->c->bucket_lock); | 
|  | } | 
|  |  | 
|  | struct btree *__bch_btree_node_alloc(struct cache_set *c, struct btree_op *op, | 
|  | int level, bool wait, | 
|  | struct btree *parent) | 
|  | { | 
|  | BKEY_PADDED(key) k; | 
|  | struct btree *b = ERR_PTR(-EAGAIN); | 
|  |  | 
|  | mutex_lock(&c->bucket_lock); | 
|  | retry: | 
|  | if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, wait)) | 
|  | goto err; | 
|  |  | 
|  | bkey_put(c, &k.key); | 
|  | SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); | 
|  |  | 
|  | b = mca_alloc(c, op, &k.key, level); | 
|  | if (IS_ERR(b)) | 
|  | goto err_free; | 
|  |  | 
|  | if (!b) { | 
|  | cache_bug(c, | 
|  | "Tried to allocate bucket that was in btree cache"); | 
|  | goto retry; | 
|  | } | 
|  |  | 
|  | b->parent = parent; | 
|  | bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->cache->sb)); | 
|  |  | 
|  | mutex_unlock(&c->bucket_lock); | 
|  |  | 
|  | trace_bcache_btree_node_alloc(b); | 
|  | return b; | 
|  | err_free: | 
|  | bch_bucket_free(c, &k.key); | 
|  | err: | 
|  | mutex_unlock(&c->bucket_lock); | 
|  |  | 
|  | trace_bcache_btree_node_alloc_fail(c); | 
|  | return b; | 
|  | } | 
|  |  | 
|  | static struct btree *bch_btree_node_alloc(struct cache_set *c, | 
|  | struct btree_op *op, int level, | 
|  | struct btree *parent) | 
|  | { | 
|  | return __bch_btree_node_alloc(c, op, level, op != NULL, parent); | 
|  | } | 
|  |  | 
|  | static struct btree *btree_node_alloc_replacement(struct btree *b, | 
|  | struct btree_op *op) | 
|  | { | 
|  | struct btree *n = bch_btree_node_alloc(b->c, op, b->level, b->parent); | 
|  |  | 
|  | if (!IS_ERR_OR_NULL(n)) { | 
|  | mutex_lock(&n->write_lock); | 
|  | bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort); | 
|  | bkey_copy_key(&n->key, &b->key); | 
|  | mutex_unlock(&n->write_lock); | 
|  | } | 
|  |  | 
|  | return n; | 
|  | } | 
|  |  | 
|  | static void make_btree_freeing_key(struct btree *b, struct bkey *k) | 
|  | { | 
|  | unsigned int i; | 
|  |  | 
|  | mutex_lock(&b->c->bucket_lock); | 
|  |  | 
|  | atomic_inc(&b->c->prio_blocked); | 
|  |  | 
|  | bkey_copy(k, &b->key); | 
|  | bkey_copy_key(k, &ZERO_KEY); | 
|  |  | 
|  | for (i = 0; i < KEY_PTRS(k); i++) | 
|  | SET_PTR_GEN(k, i, | 
|  | bch_inc_gen(PTR_CACHE(b->c, &b->key, i), | 
|  | PTR_BUCKET(b->c, &b->key, i))); | 
|  |  | 
|  | mutex_unlock(&b->c->bucket_lock); | 
|  | } | 
|  |  | 
|  | static int btree_check_reserve(struct btree *b, struct btree_op *op) | 
|  | { | 
|  | struct cache_set *c = b->c; | 
|  | struct cache *ca = c->cache; | 
|  | unsigned int reserve = (c->root->level - b->level) * 2 + 1; | 
|  |  | 
|  | mutex_lock(&c->bucket_lock); | 
|  |  | 
|  | if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) { | 
|  | if (op) | 
|  | prepare_to_wait(&c->btree_cache_wait, &op->wait, | 
|  | TASK_UNINTERRUPTIBLE); | 
|  | mutex_unlock(&c->bucket_lock); | 
|  | return -EINTR; | 
|  | } | 
|  |  | 
|  | mutex_unlock(&c->bucket_lock); | 
|  |  | 
|  | return mca_cannibalize_lock(b->c, op); | 
|  | } | 
|  |  | 
|  | /* Garbage collection */ | 
|  |  | 
|  | static uint8_t __bch_btree_mark_key(struct cache_set *c, int level, | 
|  | struct bkey *k) | 
|  | { | 
|  | uint8_t stale = 0; | 
|  | unsigned int i; | 
|  | struct bucket *g; | 
|  |  | 
|  | /* | 
|  | * ptr_invalid() can't return true for the keys that mark btree nodes as | 
|  | * freed, but since ptr_bad() returns true we'll never actually use them | 
|  | * for anything and thus we don't want mark their pointers here | 
|  | */ | 
|  | if (!bkey_cmp(k, &ZERO_KEY)) | 
|  | return stale; | 
|  |  | 
|  | for (i = 0; i < KEY_PTRS(k); i++) { | 
|  | if (!ptr_available(c, k, i)) | 
|  | continue; | 
|  |  | 
|  | g = PTR_BUCKET(c, k, i); | 
|  |  | 
|  | if (gen_after(g->last_gc, PTR_GEN(k, i))) | 
|  | g->last_gc = PTR_GEN(k, i); | 
|  |  | 
|  | if (ptr_stale(c, k, i)) { | 
|  | stale = max(stale, ptr_stale(c, k, i)); | 
|  | continue; | 
|  | } | 
|  |  | 
|  | cache_bug_on(GC_MARK(g) && | 
|  | (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), | 
|  | c, "inconsistent ptrs: mark = %llu, level = %i", | 
|  | GC_MARK(g), level); | 
|  |  | 
|  | if (level) | 
|  | SET_GC_MARK(g, GC_MARK_METADATA); | 
|  | else if (KEY_DIRTY(k)) | 
|  | SET_GC_MARK(g, GC_MARK_DIRTY); | 
|  | else if (!GC_MARK(g)) | 
|  | SET_GC_MARK(g, GC_MARK_RECLAIMABLE); | 
|  |  | 
|  | /* guard against overflow */ | 
|  | SET_GC_SECTORS_USED(g, min_t(unsigned int, | 
|  | GC_SECTORS_USED(g) + KEY_SIZE(k), | 
|  | MAX_GC_SECTORS_USED)); | 
|  |  | 
|  | BUG_ON(!GC_SECTORS_USED(g)); | 
|  | } | 
|  |  | 
|  | return stale; | 
|  | } | 
|  |  | 
|  | #define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k) | 
|  |  | 
|  | void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k) | 
|  | { | 
|  | unsigned int i; | 
|  |  | 
|  | for (i = 0; i < KEY_PTRS(k); i++) | 
|  | if (ptr_available(c, k, i) && | 
|  | !ptr_stale(c, k, i)) { | 
|  | struct bucket *b = PTR_BUCKET(c, k, i); | 
|  |  | 
|  | b->gen = PTR_GEN(k, i); | 
|  |  | 
|  | if (level && bkey_cmp(k, &ZERO_KEY)) | 
|  | b->prio = BTREE_PRIO; | 
|  | else if (!level && b->prio == BTREE_PRIO) | 
|  | b->prio = INITIAL_PRIO; | 
|  | } | 
|  |  | 
|  | __bch_btree_mark_key(c, level, k); | 
|  | } | 
|  |  | 
|  | void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats) | 
|  | { | 
|  | stats->in_use = (c->nbuckets - c->avail_nbuckets) * 100 / c->nbuckets; | 
|  | } | 
|  |  | 
|  | static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc) | 
|  | { | 
|  | uint8_t stale = 0; | 
|  | unsigned int keys = 0, good_keys = 0; | 
|  | struct bkey *k; | 
|  | struct btree_iter iter; | 
|  | struct bset_tree *t; | 
|  |  | 
|  | gc->nodes++; | 
|  |  | 
|  | for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) { | 
|  | stale = max(stale, btree_mark_key(b, k)); | 
|  | keys++; | 
|  |  | 
|  | if (bch_ptr_bad(&b->keys, k)) | 
|  | continue; | 
|  |  | 
|  | gc->key_bytes += bkey_u64s(k); | 
|  | gc->nkeys++; | 
|  | good_keys++; | 
|  |  | 
|  | gc->data += KEY_SIZE(k); | 
|  | } | 
|  |  | 
|  | for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++) | 
|  | btree_bug_on(t->size && | 
|  | bset_written(&b->keys, t) && | 
|  | bkey_cmp(&b->key, &t->end) < 0, | 
|  | b, "found short btree key in gc"); | 
|  |  | 
|  | if (b->c->gc_always_rewrite) | 
|  | return true; | 
|  |  | 
|  | if (stale > 10) | 
|  | return true; | 
|  |  | 
|  | if ((keys - good_keys) * 2 > keys) | 
|  | return true; | 
|  |  | 
|  | return false; | 
|  | } | 
|  |  | 
|  | #define GC_MERGE_NODES	4U | 
|  |  | 
|  | struct gc_merge_info { | 
|  | struct btree	*b; | 
|  | unsigned int	keys; | 
|  | }; | 
|  |  | 
|  | static int bch_btree_insert_node(struct btree *b, struct btree_op *op, | 
|  | struct keylist *insert_keys, | 
|  | atomic_t *journal_ref, | 
|  | struct bkey *replace_key); | 
|  |  | 
|  | static int btree_gc_coalesce(struct btree *b, struct btree_op *op, | 
|  | struct gc_stat *gc, struct gc_merge_info *r) | 
|  | { | 
|  | unsigned int i, nodes = 0, keys = 0, blocks; | 
|  | struct btree *new_nodes[GC_MERGE_NODES]; | 
|  | struct keylist keylist; | 
|  | struct closure cl; | 
|  | struct bkey *k; | 
|  |  | 
|  | bch_keylist_init(&keylist); | 
|  |  | 
|  | if (btree_check_reserve(b, NULL)) | 
|  | return 0; | 
|  |  | 
|  | memset(new_nodes, 0, sizeof(new_nodes)); | 
|  | closure_init_stack(&cl); | 
|  |  | 
|  | while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b)) | 
|  | keys += r[nodes++].keys; | 
|  |  | 
|  | blocks = btree_default_blocks(b->c) * 2 / 3; | 
|  |  | 
|  | if (nodes < 2 || | 
|  | __set_blocks(b->keys.set[0].data, keys, | 
|  | block_bytes(b->c->cache)) > blocks * (nodes - 1)) | 
|  | return 0; | 
|  |  | 
|  | for (i = 0; i < nodes; i++) { | 
|  | new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL); | 
|  | if (IS_ERR_OR_NULL(new_nodes[i])) | 
|  | goto out_nocoalesce; | 
|  | } | 
|  |  | 
|  | /* | 
|  | * We have to check the reserve here, after we've allocated our new | 
|  | * nodes, to make sure the insert below will succeed - we also check | 
|  | * before as an optimization to potentially avoid a bunch of expensive | 
|  | * allocs/sorts | 
|  | */ | 
|  | if (btree_check_reserve(b, NULL)) | 
|  | goto out_nocoalesce; | 
|  |  | 
|  | for (i = 0; i < nodes; i++) | 
|  | mutex_lock(&new_nodes[i]->write_lock); | 
|  |  | 
|  | for (i = nodes - 1; i > 0; --i) { | 
|  | struct bset *n1 = btree_bset_first(new_nodes[i]); | 
|  | struct bset *n2 = btree_bset_first(new_nodes[i - 1]); | 
|  | struct bkey *k, *last = NULL; | 
|  |  | 
|  | keys = 0; | 
|  |  | 
|  | if (i > 1) { | 
|  | for (k = n2->start; | 
|  | k < bset_bkey_last(n2); | 
|  | k = bkey_next(k)) { | 
|  | if (__set_blocks(n1, n1->keys + keys + | 
|  | bkey_u64s(k), | 
|  | block_bytes(b->c->cache)) > blocks) | 
|  | break; | 
|  |  | 
|  | last = k; | 
|  | keys += bkey_u64s(k); | 
|  | } | 
|  | } else { | 
|  | /* | 
|  | * Last node we're not getting rid of - we're getting | 
|  | * rid of the node at r[0]. Have to try and fit all of | 
|  | * the remaining keys into this node; we can't ensure | 
|  | * they will always fit due to rounding and variable | 
|  | * length keys (shouldn't be possible in practice, | 
|  | * though) | 
|  | */ | 
|  | if (__set_blocks(n1, n1->keys + n2->keys, | 
|  | block_bytes(b->c->cache)) > | 
|  | btree_blocks(new_nodes[i])) | 
|  | goto out_unlock_nocoalesce; | 
|  |  | 
|  | keys = n2->keys; | 
|  | /* Take the key of the node we're getting rid of */ | 
|  | last = &r->b->key; | 
|  | } | 
|  |  | 
|  | BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c->cache)) > | 
|  | btree_blocks(new_nodes[i])); | 
|  |  | 
|  | if (last) | 
|  | bkey_copy_key(&new_nodes[i]->key, last); | 
|  |  | 
|  | memcpy(bset_bkey_last(n1), | 
|  | n2->start, | 
|  | (void *) bset_bkey_idx(n2, keys) - (void *) n2->start); | 
|  |  | 
|  | n1->keys += keys; | 
|  | r[i].keys = n1->keys; | 
|  |  | 
|  | memmove(n2->start, | 
|  | bset_bkey_idx(n2, keys), | 
|  | (void *) bset_bkey_last(n2) - | 
|  | (void *) bset_bkey_idx(n2, keys)); | 
|  |  | 
|  | n2->keys -= keys; | 
|  |  | 
|  | if (__bch_keylist_realloc(&keylist, | 
|  | bkey_u64s(&new_nodes[i]->key))) | 
|  | goto out_unlock_nocoalesce; | 
|  |  | 
|  | bch_btree_node_write(new_nodes[i], &cl); | 
|  | bch_keylist_add(&keylist, &new_nodes[i]->key); | 
|  | } | 
|  |  | 
|  | for (i = 0; i < nodes; i++) | 
|  | mutex_unlock(&new_nodes[i]->write_lock); | 
|  |  | 
|  | closure_sync(&cl); | 
|  |  | 
|  | /* We emptied out this node */ | 
|  | BUG_ON(btree_bset_first(new_nodes[0])->keys); | 
|  | btree_node_free(new_nodes[0]); | 
|  | rw_unlock(true, new_nodes[0]); | 
|  | new_nodes[0] = NULL; | 
|  |  | 
|  | for (i = 0; i < nodes; i++) { | 
|  | if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key))) | 
|  | goto out_nocoalesce; | 
|  |  | 
|  | make_btree_freeing_key(r[i].b, keylist.top); | 
|  | bch_keylist_push(&keylist); | 
|  | } | 
|  |  | 
|  | bch_btree_insert_node(b, op, &keylist, NULL, NULL); | 
|  | BUG_ON(!bch_keylist_empty(&keylist)); | 
|  |  | 
|  | for (i = 0; i < nodes; i++) { | 
|  | btree_node_free(r[i].b); | 
|  | rw_unlock(true, r[i].b); | 
|  |  | 
|  | r[i].b = new_nodes[i]; | 
|  | } | 
|  |  | 
|  | memmove(r, r + 1, sizeof(r[0]) * (nodes - 1)); | 
|  | r[nodes - 1].b = ERR_PTR(-EINTR); | 
|  |  | 
|  | trace_bcache_btree_gc_coalesce(nodes); | 
|  | gc->nodes--; | 
|  |  | 
|  | bch_keylist_free(&keylist); | 
|  |  | 
|  | /* Invalidated our iterator */ | 
|  | return -EINTR; | 
|  |  | 
|  | out_unlock_nocoalesce: | 
|  | for (i = 0; i < nodes; i++) | 
|  | mutex_unlock(&new_nodes[i]->write_lock); | 
|  |  | 
|  | out_nocoalesce: | 
|  | closure_sync(&cl); | 
|  |  | 
|  | while ((k = bch_keylist_pop(&keylist))) | 
|  | if (!bkey_cmp(k, &ZERO_KEY)) | 
|  | atomic_dec(&b->c->prio_blocked); | 
|  | bch_keylist_free(&keylist); | 
|  |  | 
|  | for (i = 0; i < nodes; i++) | 
|  | if (!IS_ERR_OR_NULL(new_nodes[i])) { | 
|  | btree_node_free(new_nodes[i]); | 
|  | rw_unlock(true, new_nodes[i]); | 
|  | } | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op, | 
|  | struct btree *replace) | 
|  | { | 
|  | struct keylist keys; | 
|  | struct btree *n; | 
|  |  | 
|  | if (btree_check_reserve(b, NULL)) | 
|  | return 0; | 
|  |  | 
|  | n = btree_node_alloc_replacement(replace, NULL); | 
|  |  | 
|  | /* recheck reserve after allocating replacement node */ | 
|  | if (btree_check_reserve(b, NULL)) { | 
|  | btree_node_free(n); | 
|  | rw_unlock(true, n); | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | bch_btree_node_write_sync(n); | 
|  |  | 
|  | bch_keylist_init(&keys); | 
|  | bch_keylist_add(&keys, &n->key); | 
|  |  | 
|  | make_btree_freeing_key(replace, keys.top); | 
|  | bch_keylist_push(&keys); | 
|  |  | 
|  | bch_btree_insert_node(b, op, &keys, NULL, NULL); | 
|  | BUG_ON(!bch_keylist_empty(&keys)); | 
|  |  | 
|  | btree_node_free(replace); | 
|  | rw_unlock(true, n); | 
|  |  | 
|  | /* Invalidated our iterator */ | 
|  | return -EINTR; | 
|  | } | 
|  |  | 
|  | static unsigned int btree_gc_count_keys(struct btree *b) | 
|  | { | 
|  | struct bkey *k; | 
|  | struct btree_iter iter; | 
|  | unsigned int ret = 0; | 
|  |  | 
|  | for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad) | 
|  | ret += bkey_u64s(k); | 
|  |  | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | static size_t btree_gc_min_nodes(struct cache_set *c) | 
|  | { | 
|  | size_t min_nodes; | 
|  |  | 
|  | /* | 
|  | * Since incremental GC would stop 100ms when front | 
|  | * side I/O comes, so when there are many btree nodes, | 
|  | * if GC only processes constant (100) nodes each time, | 
|  | * GC would last a long time, and the front side I/Os | 
|  | * would run out of the buckets (since no new bucket | 
|  | * can be allocated during GC), and be blocked again. | 
|  | * So GC should not process constant nodes, but varied | 
|  | * nodes according to the number of btree nodes, which | 
|  | * realized by dividing GC into constant(100) times, | 
|  | * so when there are many btree nodes, GC can process | 
|  | * more nodes each time, otherwise, GC will process less | 
|  | * nodes each time (but no less than MIN_GC_NODES) | 
|  | */ | 
|  | min_nodes = c->gc_stats.nodes / MAX_GC_TIMES; | 
|  | if (min_nodes < MIN_GC_NODES) | 
|  | min_nodes = MIN_GC_NODES; | 
|  |  | 
|  | return min_nodes; | 
|  | } | 
|  |  | 
|  |  | 
|  | static int btree_gc_recurse(struct btree *b, struct btree_op *op, | 
|  | struct closure *writes, struct gc_stat *gc) | 
|  | { | 
|  | int ret = 0; | 
|  | bool should_rewrite; | 
|  | struct bkey *k; | 
|  | struct btree_iter iter; | 
|  | struct gc_merge_info r[GC_MERGE_NODES]; | 
|  | struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1; | 
|  |  | 
|  | bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done); | 
|  |  | 
|  | for (i = r; i < r + ARRAY_SIZE(r); i++) | 
|  | i->b = ERR_PTR(-EINTR); | 
|  |  | 
|  | while (1) { | 
|  | k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad); | 
|  | if (k) { | 
|  | r->b = bch_btree_node_get(b->c, op, k, b->level - 1, | 
|  | true, b); | 
|  | if (IS_ERR(r->b)) { | 
|  | ret = PTR_ERR(r->b); | 
|  | break; | 
|  | } | 
|  |  | 
|  | r->keys = btree_gc_count_keys(r->b); | 
|  |  | 
|  | ret = btree_gc_coalesce(b, op, gc, r); | 
|  | if (ret) | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (!last->b) | 
|  | break; | 
|  |  | 
|  | if (!IS_ERR(last->b)) { | 
|  | should_rewrite = btree_gc_mark_node(last->b, gc); | 
|  | if (should_rewrite) { | 
|  | ret = btree_gc_rewrite_node(b, op, last->b); | 
|  | if (ret) | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (last->b->level) { | 
|  | ret = btree_gc_recurse(last->b, op, writes, gc); | 
|  | if (ret) | 
|  | break; | 
|  | } | 
|  |  | 
|  | bkey_copy_key(&b->c->gc_done, &last->b->key); | 
|  |  | 
|  | /* | 
|  | * Must flush leaf nodes before gc ends, since replace | 
|  | * operations aren't journalled | 
|  | */ | 
|  | mutex_lock(&last->b->write_lock); | 
|  | if (btree_node_dirty(last->b)) | 
|  | bch_btree_node_write(last->b, writes); | 
|  | mutex_unlock(&last->b->write_lock); | 
|  | rw_unlock(true, last->b); | 
|  | } | 
|  |  | 
|  | memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1)); | 
|  | r->b = NULL; | 
|  |  | 
|  | if (atomic_read(&b->c->search_inflight) && | 
|  | gc->nodes >= gc->nodes_pre + btree_gc_min_nodes(b->c)) { | 
|  | gc->nodes_pre =  gc->nodes; | 
|  | ret = -EAGAIN; | 
|  | break; | 
|  | } | 
|  |  | 
|  | if (need_resched()) { | 
|  | ret = -EAGAIN; | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | for (i = r; i < r + ARRAY_SIZE(r); i++) | 
|  | if (!IS_ERR_OR_NULL(i->b)) { | 
|  | mutex_lock(&i->b->write_lock); | 
|  | if (btree_node_dirty(i->b)) | 
|  | bch_btree_node_write(i->b, writes); | 
|  | mutex_unlock(&i->b->write_lock); | 
|  | rw_unlock(true, i->b); | 
|  | } | 
|  |  | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | static int bch_btree_gc_root(struct btree *b, struct btree_op *op, | 
|  | struct closure *writes, struct gc_stat *gc) | 
|  | { | 
|  | struct btree *n = NULL; | 
|  | int ret = 0; | 
|  | bool should_rewrite; | 
|  |  | 
|  | should_rewrite = btree_gc_mark_node(b, gc); | 
|  | if (should_rewrite) { | 
|  | n = btree_node_alloc_replacement(b, NULL); | 
|  |  | 
|  | if (!IS_ERR_OR_NULL(n)) { | 
|  | bch_btree_node_write_sync(n); | 
|  |  | 
|  | bch_btree_set_root(n); | 
|  | btree_node_free(b); | 
|  | rw_unlock(true, n); | 
|  |  | 
|  | return -EINTR; | 
|  | } | 
|  | } | 
|  |  | 
|  | __bch_btree_mark_key(b->c, b->level + 1, &b->key); | 
|  |  | 
|  | if (b->level) { | 
|  | ret = btree_gc_recurse(b, op, writes, gc); | 
|  | if (ret) | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | bkey_copy_key(&b->c->gc_done, &b->key); | 
|  |  | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | static void btree_gc_start(struct cache_set *c) | 
|  | { | 
|  | struct cache *ca; | 
|  | struct bucket *b; | 
|  |  | 
|  | if (!c->gc_mark_valid) | 
|  | return; | 
|  |  | 
|  | mutex_lock(&c->bucket_lock); | 
|  |  | 
|  | c->gc_mark_valid = 0; | 
|  | c->gc_done = ZERO_KEY; | 
|  |  | 
|  | ca = c->cache; | 
|  | for_each_bucket(b, ca) { | 
|  | b->last_gc = b->gen; | 
|  | if (!atomic_read(&b->pin)) { | 
|  | SET_GC_MARK(b, 0); | 
|  | SET_GC_SECTORS_USED(b, 0); | 
|  | } | 
|  | } | 
|  |  | 
|  | mutex_unlock(&c->bucket_lock); | 
|  | } | 
|  |  | 
|  | static void bch_btree_gc_finish(struct cache_set *c) | 
|  | { | 
|  | struct bucket *b; | 
|  | struct cache *ca; | 
|  | unsigned int i, j; | 
|  | uint64_t *k; | 
|  |  | 
|  | mutex_lock(&c->bucket_lock); | 
|  |  | 
|  | set_gc_sectors(c); | 
|  | c->gc_mark_valid = 1; | 
|  | c->need_gc	= 0; | 
|  |  | 
|  | for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) | 
|  | SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), | 
|  | GC_MARK_METADATA); | 
|  |  | 
|  | /* don't reclaim buckets to which writeback keys point */ | 
|  | rcu_read_lock(); | 
|  | for (i = 0; i < c->devices_max_used; i++) { | 
|  | struct bcache_device *d = c->devices[i]; | 
|  | struct cached_dev *dc; | 
|  | struct keybuf_key *w, *n; | 
|  |  | 
|  | if (!d || UUID_FLASH_ONLY(&c->uuids[i])) | 
|  | continue; | 
|  | dc = container_of(d, struct cached_dev, disk); | 
|  |  | 
|  | spin_lock(&dc->writeback_keys.lock); | 
|  | rbtree_postorder_for_each_entry_safe(w, n, | 
|  | &dc->writeback_keys.keys, node) | 
|  | for (j = 0; j < KEY_PTRS(&w->key); j++) | 
|  | SET_GC_MARK(PTR_BUCKET(c, &w->key, j), | 
|  | GC_MARK_DIRTY); | 
|  | spin_unlock(&dc->writeback_keys.lock); | 
|  | } | 
|  | rcu_read_unlock(); | 
|  |  | 
|  | c->avail_nbuckets = 0; | 
|  |  | 
|  | ca = c->cache; | 
|  | ca->invalidate_needs_gc = 0; | 
|  |  | 
|  | for (k = ca->sb.d; k < ca->sb.d + ca->sb.keys; k++) | 
|  | SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); | 
|  |  | 
|  | for (k = ca->prio_buckets; | 
|  | k < ca->prio_buckets + prio_buckets(ca) * 2; k++) | 
|  | SET_GC_MARK(ca->buckets + *k, GC_MARK_METADATA); | 
|  |  | 
|  | for_each_bucket(b, ca) { | 
|  | c->need_gc	= max(c->need_gc, bucket_gc_gen(b)); | 
|  |  | 
|  | if (atomic_read(&b->pin)) | 
|  | continue; | 
|  |  | 
|  | BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b)); | 
|  |  | 
|  | if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE) | 
|  | c->avail_nbuckets++; | 
|  | } | 
|  |  | 
|  | mutex_unlock(&c->bucket_lock); | 
|  | } | 
|  |  | 
|  | static void bch_btree_gc(struct cache_set *c) | 
|  | { | 
|  | int ret; | 
|  | struct gc_stat stats; | 
|  | struct closure writes; | 
|  | struct btree_op op; | 
|  | uint64_t start_time = local_clock(); | 
|  |  | 
|  | trace_bcache_gc_start(c); | 
|  |  | 
|  | memset(&stats, 0, sizeof(struct gc_stat)); | 
|  | closure_init_stack(&writes); | 
|  | bch_btree_op_init(&op, SHRT_MAX); | 
|  |  | 
|  | btree_gc_start(c); | 
|  |  | 
|  | /* if CACHE_SET_IO_DISABLE set, gc thread should stop too */ | 
|  | do { | 
|  | ret = bcache_btree_root(gc_root, c, &op, &writes, &stats); | 
|  | closure_sync(&writes); | 
|  | cond_resched(); | 
|  |  | 
|  | if (ret == -EAGAIN) | 
|  | schedule_timeout_interruptible(msecs_to_jiffies | 
|  | (GC_SLEEP_MS)); | 
|  | else if (ret) | 
|  | pr_warn("gc failed!\n"); | 
|  | } while (ret && !test_bit(CACHE_SET_IO_DISABLE, &c->flags)); | 
|  |  | 
|  | bch_btree_gc_finish(c); | 
|  | wake_up_allocators(c); | 
|  |  | 
|  | bch_time_stats_update(&c->btree_gc_time, start_time); | 
|  |  | 
|  | stats.key_bytes *= sizeof(uint64_t); | 
|  | stats.data	<<= 9; | 
|  | bch_update_bucket_in_use(c, &stats); | 
|  | memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); | 
|  |  | 
|  | trace_bcache_gc_end(c); | 
|  |  | 
|  | bch_moving_gc(c); | 
|  | } | 
|  |  | 
|  | static bool gc_should_run(struct cache_set *c) | 
|  | { | 
|  | struct cache *ca = c->cache; | 
|  |  | 
|  | if (ca->invalidate_needs_gc) | 
|  | return true; | 
|  |  | 
|  | if (atomic_read(&c->sectors_to_gc) < 0) | 
|  | return true; | 
|  |  | 
|  | return false; | 
|  | } | 
|  |  | 
|  | static int bch_gc_thread(void *arg) | 
|  | { | 
|  | struct cache_set *c = arg; | 
|  |  | 
|  | while (1) { | 
|  | wait_event_interruptible(c->gc_wait, | 
|  | kthread_should_stop() || | 
|  | test_bit(CACHE_SET_IO_DISABLE, &c->flags) || | 
|  | gc_should_run(c)); | 
|  |  | 
|  | if (kthread_should_stop() || | 
|  | test_bit(CACHE_SET_IO_DISABLE, &c->flags)) | 
|  | break; | 
|  |  | 
|  | set_gc_sectors(c); | 
|  | bch_btree_gc(c); | 
|  | } | 
|  |  | 
|  | wait_for_kthread_stop(); | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | int bch_gc_thread_start(struct cache_set *c) | 
|  | { | 
|  | c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc"); | 
|  | return PTR_ERR_OR_ZERO(c->gc_thread); | 
|  | } | 
|  |  | 
|  | /* Initial partial gc */ | 
|  |  | 
|  | static int bch_btree_check_recurse(struct btree *b, struct btree_op *op) | 
|  | { | 
|  | int ret = 0; | 
|  | struct bkey *k, *p = NULL; | 
|  | struct btree_iter iter; | 
|  |  | 
|  | for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) | 
|  | bch_initial_mark_key(b->c, b->level, k); | 
|  |  | 
|  | bch_initial_mark_key(b->c, b->level + 1, &b->key); | 
|  |  | 
|  | if (b->level) { | 
|  | bch_btree_iter_init(&b->keys, &iter, NULL); | 
|  |  | 
|  | do { | 
|  | k = bch_btree_iter_next_filter(&iter, &b->keys, | 
|  | bch_ptr_bad); | 
|  | if (k) { | 
|  | btree_node_prefetch(b, k); | 
|  | /* | 
|  | * initiallize c->gc_stats.nodes | 
|  | * for incremental GC | 
|  | */ | 
|  | b->c->gc_stats.nodes++; | 
|  | } | 
|  |  | 
|  | if (p) | 
|  | ret = bcache_btree(check_recurse, p, b, op); | 
|  |  | 
|  | p = k; | 
|  | } while (p && !ret); | 
|  | } | 
|  |  | 
|  | return ret; | 
|  | } | 
|  |  | 
|  |  | 
|  | static int bch_btree_check_thread(void *arg) | 
|  | { | 
|  | int ret; | 
|  | struct btree_check_info *info = arg; | 
|  | struct btree_check_state *check_state = info->state; | 
|  | struct cache_set *c = check_state->c; | 
|  | struct btree_iter iter; | 
|  | struct bkey *k, *p; | 
|  | int cur_idx, prev_idx, skip_nr; | 
|  |  | 
|  | k = p = NULL; | 
|  | cur_idx = prev_idx = 0; | 
|  | ret = 0; | 
|  |  | 
|  | /* root node keys are checked before thread created */ | 
|  | bch_btree_iter_init(&c->root->keys, &iter, NULL); | 
|  | k = bch_btree_iter_next_filter(&iter, &c->root->keys, bch_ptr_bad); | 
|  | BUG_ON(!k); | 
|  |  | 
|  | p = k; | 
|  | while (k) { | 
|  | /* | 
|  | * Fetch a root node key index, skip the keys which | 
|  | * should be fetched by other threads, then check the | 
|  | * sub-tree indexed by the fetched key. | 
|  | */ | 
|  | spin_lock(&check_state->idx_lock); | 
|  | cur_idx = check_state->key_idx; | 
|  | check_state->key_idx++; | 
|  | spin_unlock(&check_state->idx_lock); | 
|  |  | 
|  | skip_nr = cur_idx - prev_idx; | 
|  |  | 
|  | while (skip_nr) { | 
|  | k = bch_btree_iter_next_filter(&iter, | 
|  | &c->root->keys, | 
|  | bch_ptr_bad); | 
|  | if (k) | 
|  | p = k; | 
|  | else { | 
|  | /* | 
|  | * No more keys to check in root node, | 
|  | * current checking threads are enough, | 
|  | * stop creating more. | 
|  | */ | 
|  | atomic_set(&check_state->enough, 1); | 
|  | /* Update check_state->enough earlier */ | 
|  | smp_mb__after_atomic(); | 
|  | goto out; | 
|  | } | 
|  | skip_nr--; | 
|  | cond_resched(); | 
|  | } | 
|  |  | 
|  | if (p) { | 
|  | struct btree_op op; | 
|  |  | 
|  | btree_node_prefetch(c->root, p); | 
|  | c->gc_stats.nodes++; | 
|  | bch_btree_op_init(&op, 0); | 
|  | ret = bcache_btree(check_recurse, p, c->root, &op); | 
|  | if (ret) | 
|  | goto out; | 
|  | } | 
|  | p = NULL; | 
|  | prev_idx = cur_idx; | 
|  | cond_resched(); | 
|  | } | 
|  |  | 
|  | out: | 
|  | info->result = ret; | 
|  | /* update check_state->started among all CPUs */ | 
|  | smp_mb__before_atomic(); | 
|  | if (atomic_dec_and_test(&check_state->started)) | 
|  | wake_up(&check_state->wait); | 
|  |  | 
|  | return ret; | 
|  | } | 
|  |  | 
|  |  | 
|  |  | 
|  | static int bch_btree_chkthread_nr(void) | 
|  | { | 
|  | int n = num_online_cpus()/2; | 
|  |  | 
|  | if (n == 0) | 
|  | n = 1; | 
|  | else if (n > BCH_BTR_CHKTHREAD_MAX) | 
|  | n = BCH_BTR_CHKTHREAD_MAX; | 
|  |  | 
|  | return n; | 
|  | } | 
|  |  | 
|  | int bch_btree_check(struct cache_set *c) | 
|  | { | 
|  | int ret = 0; | 
|  | int i; | 
|  | struct bkey *k = NULL; | 
|  | struct btree_iter iter; | 
|  | struct btree_check_state *check_state; | 
|  | char name[32]; | 
|  |  | 
|  | /* check and mark root node keys */ | 
|  | for_each_key_filter(&c->root->keys, k, &iter, bch_ptr_invalid) | 
|  | bch_initial_mark_key(c, c->root->level, k); | 
|  |  | 
|  | bch_initial_mark_key(c, c->root->level + 1, &c->root->key); | 
|  |  | 
|  | if (c->root->level == 0) | 
|  | return 0; | 
|  |  | 
|  | check_state = kzalloc(sizeof(struct btree_check_state), GFP_KERNEL); | 
|  | if (!check_state) | 
|  | return -ENOMEM; | 
|  |  | 
|  | check_state->c = c; | 
|  | check_state->total_threads = bch_btree_chkthread_nr(); | 
|  | check_state->key_idx = 0; | 
|  | spin_lock_init(&check_state->idx_lock); | 
|  | atomic_set(&check_state->started, 0); | 
|  | atomic_set(&check_state->enough, 0); | 
|  | init_waitqueue_head(&check_state->wait); | 
|  |  | 
|  | /* | 
|  | * Run multiple threads to check btree nodes in parallel, | 
|  | * if check_state->enough is non-zero, it means current | 
|  | * running check threads are enough, unncessary to create | 
|  | * more. | 
|  | */ | 
|  | for (i = 0; i < check_state->total_threads; i++) { | 
|  | /* fetch latest check_state->enough earlier */ | 
|  | smp_mb__before_atomic(); | 
|  | if (atomic_read(&check_state->enough)) | 
|  | break; | 
|  |  | 
|  | check_state->infos[i].result = 0; | 
|  | check_state->infos[i].state = check_state; | 
|  | snprintf(name, sizeof(name), "bch_btrchk[%u]", i); | 
|  | atomic_inc(&check_state->started); | 
|  |  | 
|  | check_state->infos[i].thread = | 
|  | kthread_run(bch_btree_check_thread, | 
|  | &check_state->infos[i], | 
|  | name); | 
|  | if (IS_ERR(check_state->infos[i].thread)) { | 
|  | pr_err("fails to run thread bch_btrchk[%d]\n", i); | 
|  | for (--i; i >= 0; i--) | 
|  | kthread_stop(check_state->infos[i].thread); | 
|  | ret = -ENOMEM; | 
|  | goto out; | 
|  | } | 
|  | } | 
|  |  | 
|  | wait_event_interruptible(check_state->wait, | 
|  | atomic_read(&check_state->started) == 0 || | 
|  | test_bit(CACHE_SET_IO_DISABLE, &c->flags)); | 
|  |  | 
|  | for (i = 0; i < check_state->total_threads; i++) { | 
|  | if (check_state->infos[i].result) { | 
|  | ret = check_state->infos[i].result; | 
|  | goto out; | 
|  | } | 
|  | } | 
|  |  | 
|  | out: | 
|  | kfree(check_state); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | void bch_initial_gc_finish(struct cache_set *c) | 
|  | { | 
|  | struct cache *ca = c->cache; | 
|  | struct bucket *b; | 
|  |  | 
|  | bch_btree_gc_finish(c); | 
|  |  | 
|  | mutex_lock(&c->bucket_lock); | 
|  |  | 
|  | /* | 
|  | * We need to put some unused buckets directly on the prio freelist in | 
|  | * order to get the allocator thread started - it needs freed buckets in | 
|  | * order to rewrite the prios and gens, and it needs to rewrite prios | 
|  | * and gens in order to free buckets. | 
|  | * | 
|  | * This is only safe for buckets that have no live data in them, which | 
|  | * there should always be some of. | 
|  | */ | 
|  | for_each_bucket(b, ca) { | 
|  | if (fifo_full(&ca->free[RESERVE_PRIO]) && | 
|  | fifo_full(&ca->free[RESERVE_BTREE])) | 
|  | break; | 
|  |  | 
|  | if (bch_can_invalidate_bucket(ca, b) && | 
|  | !GC_MARK(b)) { | 
|  | __bch_invalidate_one_bucket(ca, b); | 
|  | if (!fifo_push(&ca->free[RESERVE_PRIO], | 
|  | b - ca->buckets)) | 
|  | fifo_push(&ca->free[RESERVE_BTREE], | 
|  | b - ca->buckets); | 
|  | } | 
|  | } | 
|  |  | 
|  | mutex_unlock(&c->bucket_lock); | 
|  | } | 
|  |  | 
|  | /* Btree insertion */ | 
|  |  | 
|  | static bool btree_insert_key(struct btree *b, struct bkey *k, | 
|  | struct bkey *replace_key) | 
|  | { | 
|  | unsigned int status; | 
|  |  | 
|  | BUG_ON(bkey_cmp(k, &b->key) > 0); | 
|  |  | 
|  | status = bch_btree_insert_key(&b->keys, k, replace_key); | 
|  | if (status != BTREE_INSERT_STATUS_NO_INSERT) { | 
|  | bch_check_keys(&b->keys, "%u for %s", status, | 
|  | replace_key ? "replace" : "insert"); | 
|  |  | 
|  | trace_bcache_btree_insert_key(b, k, replace_key != NULL, | 
|  | status); | 
|  | return true; | 
|  | } else | 
|  | return false; | 
|  | } | 
|  |  | 
|  | static size_t insert_u64s_remaining(struct btree *b) | 
|  | { | 
|  | long ret = bch_btree_keys_u64s_remaining(&b->keys); | 
|  |  | 
|  | /* | 
|  | * Might land in the middle of an existing extent and have to split it | 
|  | */ | 
|  | if (b->keys.ops->is_extents) | 
|  | ret -= KEY_MAX_U64S; | 
|  |  | 
|  | return max(ret, 0L); | 
|  | } | 
|  |  | 
|  | static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op, | 
|  | struct keylist *insert_keys, | 
|  | struct bkey *replace_key) | 
|  | { | 
|  | bool ret = false; | 
|  | int oldsize = bch_count_data(&b->keys); | 
|  |  | 
|  | while (!bch_keylist_empty(insert_keys)) { | 
|  | struct bkey *k = insert_keys->keys; | 
|  |  | 
|  | if (bkey_u64s(k) > insert_u64s_remaining(b)) | 
|  | break; | 
|  |  | 
|  | if (bkey_cmp(k, &b->key) <= 0) { | 
|  | if (!b->level) | 
|  | bkey_put(b->c, k); | 
|  |  | 
|  | ret |= btree_insert_key(b, k, replace_key); | 
|  | bch_keylist_pop_front(insert_keys); | 
|  | } else if (bkey_cmp(&START_KEY(k), &b->key) < 0) { | 
|  | BKEY_PADDED(key) temp; | 
|  | bkey_copy(&temp.key, insert_keys->keys); | 
|  |  | 
|  | bch_cut_back(&b->key, &temp.key); | 
|  | bch_cut_front(&b->key, insert_keys->keys); | 
|  |  | 
|  | ret |= btree_insert_key(b, &temp.key, replace_key); | 
|  | break; | 
|  | } else { | 
|  | break; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (!ret) | 
|  | op->insert_collision = true; | 
|  |  | 
|  | BUG_ON(!bch_keylist_empty(insert_keys) && b->level); | 
|  |  | 
|  | BUG_ON(bch_count_data(&b->keys) < oldsize); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | static int btree_split(struct btree *b, struct btree_op *op, | 
|  | struct keylist *insert_keys, | 
|  | struct bkey *replace_key) | 
|  | { | 
|  | bool split; | 
|  | struct btree *n1, *n2 = NULL, *n3 = NULL; | 
|  | uint64_t start_time = local_clock(); | 
|  | struct closure cl; | 
|  | struct keylist parent_keys; | 
|  |  | 
|  | closure_init_stack(&cl); | 
|  | bch_keylist_init(&parent_keys); | 
|  |  | 
|  | if (btree_check_reserve(b, op)) { | 
|  | if (!b->level) | 
|  | return -EINTR; | 
|  | else | 
|  | WARN(1, "insufficient reserve for split\n"); | 
|  | } | 
|  |  | 
|  | n1 = btree_node_alloc_replacement(b, op); | 
|  | if (IS_ERR(n1)) | 
|  | goto err; | 
|  |  | 
|  | split = set_blocks(btree_bset_first(n1), | 
|  | block_bytes(n1->c->cache)) > (btree_blocks(b) * 4) / 5; | 
|  |  | 
|  | if (split) { | 
|  | unsigned int keys = 0; | 
|  |  | 
|  | trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys); | 
|  |  | 
|  | n2 = bch_btree_node_alloc(b->c, op, b->level, b->parent); | 
|  | if (IS_ERR(n2)) | 
|  | goto err_free1; | 
|  |  | 
|  | if (!b->parent) { | 
|  | n3 = bch_btree_node_alloc(b->c, op, b->level + 1, NULL); | 
|  | if (IS_ERR(n3)) | 
|  | goto err_free2; | 
|  | } | 
|  |  | 
|  | mutex_lock(&n1->write_lock); | 
|  | mutex_lock(&n2->write_lock); | 
|  |  | 
|  | bch_btree_insert_keys(n1, op, insert_keys, replace_key); | 
|  |  | 
|  | /* | 
|  | * Has to be a linear search because we don't have an auxiliary | 
|  | * search tree yet | 
|  | */ | 
|  |  | 
|  | while (keys < (btree_bset_first(n1)->keys * 3) / 5) | 
|  | keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), | 
|  | keys)); | 
|  |  | 
|  | bkey_copy_key(&n1->key, | 
|  | bset_bkey_idx(btree_bset_first(n1), keys)); | 
|  | keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys)); | 
|  |  | 
|  | btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys; | 
|  | btree_bset_first(n1)->keys = keys; | 
|  |  | 
|  | memcpy(btree_bset_first(n2)->start, | 
|  | bset_bkey_last(btree_bset_first(n1)), | 
|  | btree_bset_first(n2)->keys * sizeof(uint64_t)); | 
|  |  | 
|  | bkey_copy_key(&n2->key, &b->key); | 
|  |  | 
|  | bch_keylist_add(&parent_keys, &n2->key); | 
|  | bch_btree_node_write(n2, &cl); | 
|  | mutex_unlock(&n2->write_lock); | 
|  | rw_unlock(true, n2); | 
|  | } else { | 
|  | trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys); | 
|  |  | 
|  | mutex_lock(&n1->write_lock); | 
|  | bch_btree_insert_keys(n1, op, insert_keys, replace_key); | 
|  | } | 
|  |  | 
|  | bch_keylist_add(&parent_keys, &n1->key); | 
|  | bch_btree_node_write(n1, &cl); | 
|  | mutex_unlock(&n1->write_lock); | 
|  |  | 
|  | if (n3) { | 
|  | /* Depth increases, make a new root */ | 
|  | mutex_lock(&n3->write_lock); | 
|  | bkey_copy_key(&n3->key, &MAX_KEY); | 
|  | bch_btree_insert_keys(n3, op, &parent_keys, NULL); | 
|  | bch_btree_node_write(n3, &cl); | 
|  | mutex_unlock(&n3->write_lock); | 
|  |  | 
|  | closure_sync(&cl); | 
|  | bch_btree_set_root(n3); | 
|  | rw_unlock(true, n3); | 
|  | } else if (!b->parent) { | 
|  | /* Root filled up but didn't need to be split */ | 
|  | closure_sync(&cl); | 
|  | bch_btree_set_root(n1); | 
|  | } else { | 
|  | /* Split a non root node */ | 
|  | closure_sync(&cl); | 
|  | make_btree_freeing_key(b, parent_keys.top); | 
|  | bch_keylist_push(&parent_keys); | 
|  |  | 
|  | bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL); | 
|  | BUG_ON(!bch_keylist_empty(&parent_keys)); | 
|  | } | 
|  |  | 
|  | btree_node_free(b); | 
|  | rw_unlock(true, n1); | 
|  |  | 
|  | bch_time_stats_update(&b->c->btree_split_time, start_time); | 
|  |  | 
|  | return 0; | 
|  | err_free2: | 
|  | bkey_put(b->c, &n2->key); | 
|  | btree_node_free(n2); | 
|  | rw_unlock(true, n2); | 
|  | err_free1: | 
|  | bkey_put(b->c, &n1->key); | 
|  | btree_node_free(n1); | 
|  | rw_unlock(true, n1); | 
|  | err: | 
|  | WARN(1, "bcache: btree split failed (level %u)", b->level); | 
|  |  | 
|  | if (n3 == ERR_PTR(-EAGAIN) || | 
|  | n2 == ERR_PTR(-EAGAIN) || | 
|  | n1 == ERR_PTR(-EAGAIN)) | 
|  | return -EAGAIN; | 
|  |  | 
|  | return -ENOMEM; | 
|  | } | 
|  |  | 
|  | static int bch_btree_insert_node(struct btree *b, struct btree_op *op, | 
|  | struct keylist *insert_keys, | 
|  | atomic_t *journal_ref, | 
|  | struct bkey *replace_key) | 
|  | { | 
|  | struct closure cl; | 
|  |  | 
|  | BUG_ON(b->level && replace_key); | 
|  |  | 
|  | closure_init_stack(&cl); | 
|  |  | 
|  | mutex_lock(&b->write_lock); | 
|  |  | 
|  | if (write_block(b) != btree_bset_last(b) && | 
|  | b->keys.last_set_unwritten) | 
|  | bch_btree_init_next(b); /* just wrote a set */ | 
|  |  | 
|  | if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) { | 
|  | mutex_unlock(&b->write_lock); | 
|  | goto split; | 
|  | } | 
|  |  | 
|  | BUG_ON(write_block(b) != btree_bset_last(b)); | 
|  |  | 
|  | if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) { | 
|  | if (!b->level) | 
|  | bch_btree_leaf_dirty(b, journal_ref); | 
|  | else | 
|  | bch_btree_node_write(b, &cl); | 
|  | } | 
|  |  | 
|  | mutex_unlock(&b->write_lock); | 
|  |  | 
|  | /* wait for btree node write if necessary, after unlock */ | 
|  | closure_sync(&cl); | 
|  |  | 
|  | return 0; | 
|  | split: | 
|  | if (current->bio_list) { | 
|  | op->lock = b->c->root->level + 1; | 
|  | return -EAGAIN; | 
|  | } else if (op->lock <= b->c->root->level) { | 
|  | op->lock = b->c->root->level + 1; | 
|  | return -EINTR; | 
|  | } else { | 
|  | /* Invalidated all iterators */ | 
|  | int ret = btree_split(b, op, insert_keys, replace_key); | 
|  |  | 
|  | if (bch_keylist_empty(insert_keys)) | 
|  | return 0; | 
|  | else if (!ret) | 
|  | return -EINTR; | 
|  | return ret; | 
|  | } | 
|  | } | 
|  |  | 
|  | int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, | 
|  | struct bkey *check_key) | 
|  | { | 
|  | int ret = -EINTR; | 
|  | uint64_t btree_ptr = b->key.ptr[0]; | 
|  | unsigned long seq = b->seq; | 
|  | struct keylist insert; | 
|  | bool upgrade = op->lock == -1; | 
|  |  | 
|  | bch_keylist_init(&insert); | 
|  |  | 
|  | if (upgrade) { | 
|  | rw_unlock(false, b); | 
|  | rw_lock(true, b, b->level); | 
|  |  | 
|  | if (b->key.ptr[0] != btree_ptr || | 
|  | b->seq != seq + 1) { | 
|  | op->lock = b->level; | 
|  | goto out; | 
|  | } | 
|  | } | 
|  |  | 
|  | SET_KEY_PTRS(check_key, 1); | 
|  | get_random_bytes(&check_key->ptr[0], sizeof(uint64_t)); | 
|  |  | 
|  | SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV); | 
|  |  | 
|  | bch_keylist_add(&insert, check_key); | 
|  |  | 
|  | ret = bch_btree_insert_node(b, op, &insert, NULL, NULL); | 
|  |  | 
|  | BUG_ON(!ret && !bch_keylist_empty(&insert)); | 
|  | out: | 
|  | if (upgrade) | 
|  | downgrade_write(&b->lock); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | struct btree_insert_op { | 
|  | struct btree_op	op; | 
|  | struct keylist	*keys; | 
|  | atomic_t	*journal_ref; | 
|  | struct bkey	*replace_key; | 
|  | }; | 
|  |  | 
|  | static int btree_insert_fn(struct btree_op *b_op, struct btree *b) | 
|  | { | 
|  | struct btree_insert_op *op = container_of(b_op, | 
|  | struct btree_insert_op, op); | 
|  |  | 
|  | int ret = bch_btree_insert_node(b, &op->op, op->keys, | 
|  | op->journal_ref, op->replace_key); | 
|  | if (ret && !bch_keylist_empty(op->keys)) | 
|  | return ret; | 
|  | else | 
|  | return MAP_DONE; | 
|  | } | 
|  |  | 
|  | int bch_btree_insert(struct cache_set *c, struct keylist *keys, | 
|  | atomic_t *journal_ref, struct bkey *replace_key) | 
|  | { | 
|  | struct btree_insert_op op; | 
|  | int ret = 0; | 
|  |  | 
|  | BUG_ON(current->bio_list); | 
|  | BUG_ON(bch_keylist_empty(keys)); | 
|  |  | 
|  | bch_btree_op_init(&op.op, 0); | 
|  | op.keys		= keys; | 
|  | op.journal_ref	= journal_ref; | 
|  | op.replace_key	= replace_key; | 
|  |  | 
|  | while (!ret && !bch_keylist_empty(keys)) { | 
|  | op.op.lock = 0; | 
|  | ret = bch_btree_map_leaf_nodes(&op.op, c, | 
|  | &START_KEY(keys->keys), | 
|  | btree_insert_fn); | 
|  | } | 
|  |  | 
|  | if (ret) { | 
|  | struct bkey *k; | 
|  |  | 
|  | pr_err("error %i\n", ret); | 
|  |  | 
|  | while ((k = bch_keylist_pop(keys))) | 
|  | bkey_put(c, k); | 
|  | } else if (op.op.insert_collision) | 
|  | ret = -ESRCH; | 
|  |  | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | void bch_btree_set_root(struct btree *b) | 
|  | { | 
|  | unsigned int i; | 
|  | struct closure cl; | 
|  |  | 
|  | closure_init_stack(&cl); | 
|  |  | 
|  | trace_bcache_btree_set_root(b); | 
|  |  | 
|  | BUG_ON(!b->written); | 
|  |  | 
|  | for (i = 0; i < KEY_PTRS(&b->key); i++) | 
|  | BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); | 
|  |  | 
|  | mutex_lock(&b->c->bucket_lock); | 
|  | list_del_init(&b->list); | 
|  | mutex_unlock(&b->c->bucket_lock); | 
|  |  | 
|  | b->c->root = b; | 
|  |  | 
|  | bch_journal_meta(b->c, &cl); | 
|  | closure_sync(&cl); | 
|  | } | 
|  |  | 
|  | /* Map across nodes or keys */ | 
|  |  | 
|  | static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op, | 
|  | struct bkey *from, | 
|  | btree_map_nodes_fn *fn, int flags) | 
|  | { | 
|  | int ret = MAP_CONTINUE; | 
|  |  | 
|  | if (b->level) { | 
|  | struct bkey *k; | 
|  | struct btree_iter iter; | 
|  |  | 
|  | bch_btree_iter_init(&b->keys, &iter, from); | 
|  |  | 
|  | while ((k = bch_btree_iter_next_filter(&iter, &b->keys, | 
|  | bch_ptr_bad))) { | 
|  | ret = bcache_btree(map_nodes_recurse, k, b, | 
|  | op, from, fn, flags); | 
|  | from = NULL; | 
|  |  | 
|  | if (ret != MAP_CONTINUE) | 
|  | return ret; | 
|  | } | 
|  | } | 
|  |  | 
|  | if (!b->level || flags == MAP_ALL_NODES) | 
|  | ret = fn(op, b); | 
|  |  | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, | 
|  | struct bkey *from, btree_map_nodes_fn *fn, int flags) | 
|  | { | 
|  | return bcache_btree_root(map_nodes_recurse, c, op, from, fn, flags); | 
|  | } | 
|  |  | 
|  | int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op, | 
|  | struct bkey *from, btree_map_keys_fn *fn, | 
|  | int flags) | 
|  | { | 
|  | int ret = MAP_CONTINUE; | 
|  | struct bkey *k; | 
|  | struct btree_iter iter; | 
|  |  | 
|  | bch_btree_iter_init(&b->keys, &iter, from); | 
|  |  | 
|  | while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) { | 
|  | ret = !b->level | 
|  | ? fn(op, b, k) | 
|  | : bcache_btree(map_keys_recurse, k, | 
|  | b, op, from, fn, flags); | 
|  | from = NULL; | 
|  |  | 
|  | if (ret != MAP_CONTINUE) | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | if (!b->level && (flags & MAP_END_KEY)) | 
|  | ret = fn(op, b, &KEY(KEY_INODE(&b->key), | 
|  | KEY_OFFSET(&b->key), 0)); | 
|  |  | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | int bch_btree_map_keys(struct btree_op *op, struct cache_set *c, | 
|  | struct bkey *from, btree_map_keys_fn *fn, int flags) | 
|  | { | 
|  | return bcache_btree_root(map_keys_recurse, c, op, from, fn, flags); | 
|  | } | 
|  |  | 
|  | /* Keybuf code */ | 
|  |  | 
|  | static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r) | 
|  | { | 
|  | /* Overlapping keys compare equal */ | 
|  | if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0) | 
|  | return -1; | 
|  | if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0) | 
|  | return 1; | 
|  | return 0; | 
|  | } | 
|  |  | 
|  | static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l, | 
|  | struct keybuf_key *r) | 
|  | { | 
|  | return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1); | 
|  | } | 
|  |  | 
|  | struct refill { | 
|  | struct btree_op	op; | 
|  | unsigned int	nr_found; | 
|  | struct keybuf	*buf; | 
|  | struct bkey	*end; | 
|  | keybuf_pred_fn	*pred; | 
|  | }; | 
|  |  | 
|  | static int refill_keybuf_fn(struct btree_op *op, struct btree *b, | 
|  | struct bkey *k) | 
|  | { | 
|  | struct refill *refill = container_of(op, struct refill, op); | 
|  | struct keybuf *buf = refill->buf; | 
|  | int ret = MAP_CONTINUE; | 
|  |  | 
|  | if (bkey_cmp(k, refill->end) > 0) { | 
|  | ret = MAP_DONE; | 
|  | goto out; | 
|  | } | 
|  |  | 
|  | if (!KEY_SIZE(k)) /* end key */ | 
|  | goto out; | 
|  |  | 
|  | if (refill->pred(buf, k)) { | 
|  | struct keybuf_key *w; | 
|  |  | 
|  | spin_lock(&buf->lock); | 
|  |  | 
|  | w = array_alloc(&buf->freelist); | 
|  | if (!w) { | 
|  | spin_unlock(&buf->lock); | 
|  | return MAP_DONE; | 
|  | } | 
|  |  | 
|  | w->private = NULL; | 
|  | bkey_copy(&w->key, k); | 
|  |  | 
|  | if (RB_INSERT(&buf->keys, w, node, keybuf_cmp)) | 
|  | array_free(&buf->freelist, w); | 
|  | else | 
|  | refill->nr_found++; | 
|  |  | 
|  | if (array_freelist_empty(&buf->freelist)) | 
|  | ret = MAP_DONE; | 
|  |  | 
|  | spin_unlock(&buf->lock); | 
|  | } | 
|  | out: | 
|  | buf->last_scanned = *k; | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, | 
|  | struct bkey *end, keybuf_pred_fn *pred) | 
|  | { | 
|  | struct bkey start = buf->last_scanned; | 
|  | struct refill refill; | 
|  |  | 
|  | cond_resched(); | 
|  |  | 
|  | bch_btree_op_init(&refill.op, -1); | 
|  | refill.nr_found	= 0; | 
|  | refill.buf	= buf; | 
|  | refill.end	= end; | 
|  | refill.pred	= pred; | 
|  |  | 
|  | bch_btree_map_keys(&refill.op, c, &buf->last_scanned, | 
|  | refill_keybuf_fn, MAP_END_KEY); | 
|  |  | 
|  | trace_bcache_keyscan(refill.nr_found, | 
|  | KEY_INODE(&start), KEY_OFFSET(&start), | 
|  | KEY_INODE(&buf->last_scanned), | 
|  | KEY_OFFSET(&buf->last_scanned)); | 
|  |  | 
|  | spin_lock(&buf->lock); | 
|  |  | 
|  | if (!RB_EMPTY_ROOT(&buf->keys)) { | 
|  | struct keybuf_key *w; | 
|  |  | 
|  | w = RB_FIRST(&buf->keys, struct keybuf_key, node); | 
|  | buf->start	= START_KEY(&w->key); | 
|  |  | 
|  | w = RB_LAST(&buf->keys, struct keybuf_key, node); | 
|  | buf->end	= w->key; | 
|  | } else { | 
|  | buf->start	= MAX_KEY; | 
|  | buf->end	= MAX_KEY; | 
|  | } | 
|  |  | 
|  | spin_unlock(&buf->lock); | 
|  | } | 
|  |  | 
|  | static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) | 
|  | { | 
|  | rb_erase(&w->node, &buf->keys); | 
|  | array_free(&buf->freelist, w); | 
|  | } | 
|  |  | 
|  | void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) | 
|  | { | 
|  | spin_lock(&buf->lock); | 
|  | __bch_keybuf_del(buf, w); | 
|  | spin_unlock(&buf->lock); | 
|  | } | 
|  |  | 
|  | bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, | 
|  | struct bkey *end) | 
|  | { | 
|  | bool ret = false; | 
|  | struct keybuf_key *p, *w, s; | 
|  |  | 
|  | s.key = *start; | 
|  |  | 
|  | if (bkey_cmp(end, &buf->start) <= 0 || | 
|  | bkey_cmp(start, &buf->end) >= 0) | 
|  | return false; | 
|  |  | 
|  | spin_lock(&buf->lock); | 
|  | w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); | 
|  |  | 
|  | while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { | 
|  | p = w; | 
|  | w = RB_NEXT(w, node); | 
|  |  | 
|  | if (p->private) | 
|  | ret = true; | 
|  | else | 
|  | __bch_keybuf_del(buf, p); | 
|  | } | 
|  |  | 
|  | spin_unlock(&buf->lock); | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | struct keybuf_key *bch_keybuf_next(struct keybuf *buf) | 
|  | { | 
|  | struct keybuf_key *w; | 
|  |  | 
|  | spin_lock(&buf->lock); | 
|  |  | 
|  | w = RB_FIRST(&buf->keys, struct keybuf_key, node); | 
|  |  | 
|  | while (w && w->private) | 
|  | w = RB_NEXT(w, node); | 
|  |  | 
|  | if (w) | 
|  | w->private = ERR_PTR(-EINTR); | 
|  |  | 
|  | spin_unlock(&buf->lock); | 
|  | return w; | 
|  | } | 
|  |  | 
|  | struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, | 
|  | struct keybuf *buf, | 
|  | struct bkey *end, | 
|  | keybuf_pred_fn *pred) | 
|  | { | 
|  | struct keybuf_key *ret; | 
|  |  | 
|  | while (1) { | 
|  | ret = bch_keybuf_next(buf); | 
|  | if (ret) | 
|  | break; | 
|  |  | 
|  | if (bkey_cmp(&buf->last_scanned, end) >= 0) { | 
|  | pr_debug("scan finished\n"); | 
|  | break; | 
|  | } | 
|  |  | 
|  | bch_refill_keybuf(c, buf, end, pred); | 
|  | } | 
|  |  | 
|  | return ret; | 
|  | } | 
|  |  | 
|  | void bch_keybuf_init(struct keybuf *buf) | 
|  | { | 
|  | buf->last_scanned	= MAX_KEY; | 
|  | buf->keys		= RB_ROOT; | 
|  |  | 
|  | spin_lock_init(&buf->lock); | 
|  | array_allocator_init(&buf->freelist); | 
|  | } | 
|  |  | 
|  | void bch_btree_exit(void) | 
|  | { | 
|  | if (btree_io_wq) | 
|  | destroy_workqueue(btree_io_wq); | 
|  | } | 
|  |  | 
|  | int __init bch_btree_init(void) | 
|  | { | 
|  | btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0); | 
|  | if (!btree_io_wq) | 
|  | return -ENOMEM; | 
|  |  | 
|  | return 0; | 
|  | } |