summaryrefslogtreecommitdiff
path: root/drivers/md
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/bcache/bcache.h7
-rw-r--r--drivers/md/bcache/bset.c12
-rw-r--r--drivers/md/bcache/btree.c21
-rw-r--r--drivers/md/bcache/features.h6
-rw-r--r--drivers/md/bcache/journal.c4
-rw-r--r--drivers/md/bcache/super.c24
-rw-r--r--drivers/md/bcache/sysfs.c29
-rw-r--r--drivers/md/bcache/writeback.c42
-rw-r--r--drivers/md/bcache/writeback.h4
-rw-r--r--drivers/md/raid10.c4
-rw-r--r--drivers/md/raid5.c2
11 files changed, 138 insertions, 17 deletions
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 1d57f48307e6..848dd4db1659 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -373,6 +373,7 @@ struct cached_dev {
unsigned int partial_stripes_expensive:1;
unsigned int writeback_metadata:1;
unsigned int writeback_running:1;
+ unsigned int writeback_consider_fragment:1;
unsigned char writeback_percent;
unsigned int writeback_delay;
@@ -385,6 +386,9 @@ struct cached_dev {
unsigned int writeback_rate_update_seconds;
unsigned int writeback_rate_i_term_inverse;
unsigned int writeback_rate_p_term_inverse;
+ unsigned int writeback_rate_fp_term_low;
+ unsigned int writeback_rate_fp_term_mid;
+ unsigned int writeback_rate_fp_term_high;
unsigned int writeback_rate_minimum;
enum stop_on_failure stop_when_cache_set_failed;
@@ -1001,6 +1005,7 @@ void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent);
extern struct workqueue_struct *bcache_wq;
extern struct workqueue_struct *bch_journal_wq;
+extern struct workqueue_struct *bch_flush_wq;
extern struct mutex bch_register_lock;
extern struct list_head bch_cache_sets;
@@ -1042,5 +1047,7 @@ void bch_debug_exit(void);
void bch_debug_init(void);
void bch_request_exit(void);
int bch_request_init(void);
+void bch_btree_exit(void);
+int bch_btree_init(void);
#endif /* _BCACHE_H */
diff --git a/drivers/md/bcache/bset.c b/drivers/md/bcache/bset.c
index 67a2c47f4201..94d38e8a59b3 100644
--- a/drivers/md/bcache/bset.c
+++ b/drivers/md/bcache/bset.c
@@ -712,8 +712,10 @@ void bch_bset_build_written_tree(struct btree_keys *b)
for (j = inorder_next(0, t->size);
j;
j = inorder_next(j, t->size)) {
- while (bkey_to_cacheline(t, k) < cacheline)
- prev = k, k = bkey_next(k);
+ while (bkey_to_cacheline(t, k) < cacheline) {
+ prev = k;
+ k = bkey_next(k);
+ }
t->prev[j] = bkey_u64s(prev);
t->tree[j].m = bkey_to_cacheline_offset(t, cacheline++, k);
@@ -901,8 +903,10 @@ unsigned int bch_btree_insert_key(struct btree_keys *b, struct bkey *k,
status = BTREE_INSERT_STATUS_INSERT;
while (m != bset_bkey_last(i) &&
- bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0)
- prev = m, m = bkey_next(m);
+ bkey_cmp(k, b->ops->is_extents ? &START_KEY(m) : m) > 0) {
+ prev = m;
+ m = bkey_next(m);
+ }
/* prev is in the tree, if we merge we're done */
status = BTREE_INSERT_STATUS_BACK_MERGE;
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 910df242c83d..fe6dce125aba 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -99,6 +99,8 @@
#define PTR_HASH(c, k) \
(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
+static struct workqueue_struct *btree_io_wq;
+
#define insert_lock(s, b) ((b)->level <= (s)->lock)
@@ -308,7 +310,7 @@ static void __btree_node_write_done(struct closure *cl)
btree_complete_write(b, w);
if (btree_node_dirty(b))
- schedule_delayed_work(&b->work, 30 * HZ);
+ queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
closure_return_with_destructor(cl, btree_node_write_unlock);
}
@@ -481,7 +483,7 @@ static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
BUG_ON(!i->keys);
if (!btree_node_dirty(b))
- schedule_delayed_work(&b->work, 30 * HZ);
+ queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
set_btree_node_dirty(b);
@@ -2764,3 +2766,18 @@ void bch_keybuf_init(struct keybuf *buf)
spin_lock_init(&buf->lock);
array_allocator_init(&buf->freelist);
}
+
+void bch_btree_exit(void)
+{
+ if (btree_io_wq)
+ destroy_workqueue(btree_io_wq);
+}
+
+int __init bch_btree_init(void)
+{
+ btree_io_wq = alloc_workqueue("bch_btree_io", WQ_MEM_RECLAIM, 0);
+ if (!btree_io_wq)
+ return -ENOMEM;
+
+ return 0;
+}
diff --git a/drivers/md/bcache/features.h b/drivers/md/bcache/features.h
index 84fc2c0f0101..d1c8fd3977fc 100644
--- a/drivers/md/bcache/features.h
+++ b/drivers/md/bcache/features.h
@@ -33,6 +33,8 @@
#define BCH_FEATURE_COMPAT_FUNCS(name, flagname) \
static inline int bch_has_feature_##name(struct cache_sb *sb) \
{ \
+ if (sb->version < BCACHE_SB_VERSION_CDEV_WITH_FEATURES) \
+ return 0; \
return (((sb)->feature_compat & \
BCH##_FEATURE_COMPAT_##flagname) != 0); \
} \
@@ -50,6 +52,8 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \
#define BCH_FEATURE_RO_COMPAT_FUNCS(name, flagname) \
static inline int bch_has_feature_##name(struct cache_sb *sb) \
{ \
+ if (sb->version < BCACHE_SB_VERSION_CDEV_WITH_FEATURES) \
+ return 0; \
return (((sb)->feature_ro_compat & \
BCH##_FEATURE_RO_COMPAT_##flagname) != 0); \
} \
@@ -67,6 +71,8 @@ static inline void bch_clear_feature_##name(struct cache_sb *sb) \
#define BCH_FEATURE_INCOMPAT_FUNCS(name, flagname) \
static inline int bch_has_feature_##name(struct cache_sb *sb) \
{ \
+ if (sb->version < BCACHE_SB_VERSION_CDEV_WITH_FEATURES) \
+ return 0; \
return (((sb)->feature_incompat & \
BCH##_FEATURE_INCOMPAT_##flagname) != 0); \
} \
diff --git a/drivers/md/bcache/journal.c b/drivers/md/bcache/journal.c
index aefbdb7e003b..c6613e817333 100644
--- a/drivers/md/bcache/journal.c
+++ b/drivers/md/bcache/journal.c
@@ -932,8 +932,8 @@ atomic_t *bch_journal(struct cache_set *c,
journal_try_write(c);
} else if (!w->dirty) {
w->dirty = true;
- schedule_delayed_work(&c->journal.work,
- msecs_to_jiffies(c->journal_delay_ms));
+ queue_delayed_work(bch_flush_wq, &c->journal.work,
+ msecs_to_jiffies(c->journal_delay_ms));
spin_unlock(&c->journal.lock);
} else {
spin_unlock(&c->journal.lock);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index 193fe7652329..71691f32959b 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -49,6 +49,7 @@ static int bcache_major;
static DEFINE_IDA(bcache_device_idx);
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;
+struct workqueue_struct *bch_flush_wq;
struct workqueue_struct *bch_journal_wq;
@@ -2517,7 +2518,7 @@ out:
module_put(THIS_MODULE);
}
-static void register_device_aync(struct async_reg_args *args)
+static void register_device_async(struct async_reg_args *args)
{
if (SB_IS_BDEV(args->sb))
INIT_DELAYED_WORK(&args->reg_work, register_bdev_worker);
@@ -2611,7 +2612,7 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
args->sb = sb;
args->sb_disk = sb_disk;
args->bdev = bdev;
- register_device_aync(args);
+ register_device_async(args);
/* No wait and returns to user space */
goto async_done;
}
@@ -2821,6 +2822,9 @@ static void bcache_exit(void)
destroy_workqueue(bcache_wq);
if (bch_journal_wq)
destroy_workqueue(bch_journal_wq);
+ if (bch_flush_wq)
+ destroy_workqueue(bch_flush_wq);
+ bch_btree_exit();
if (bcache_major)
unregister_blkdev(bcache_major, "bcache");
@@ -2876,10 +2880,26 @@ static int __init bcache_init(void)
return bcache_major;
}
+ if (bch_btree_init())
+ goto err;
+
bcache_wq = alloc_workqueue("bcache", WQ_MEM_RECLAIM, 0);
if (!bcache_wq)
goto err;
+ /*
+ * Let's not make this `WQ_MEM_RECLAIM` for the following reasons:
+ *
+ * 1. It used `system_wq` before which also does no memory reclaim.
+ * 2. With `WQ_MEM_RECLAIM` desktop stalls, increased boot times, and
+ * reduced throughput can be observed.
+ *
+ * We still want to use our own queue to not congest the `system_wq`.
+ */
+ bch_flush_wq = alloc_workqueue("bch_flush", 0, 0);
+ if (!bch_flush_wq)
+ goto err;
+
bch_journal_wq = alloc_workqueue("bch_journal", WQ_MEM_RECLAIM, 0);
if (!bch_journal_wq)
goto err;
diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c
index 00a520c03f41..cc89f3156d1a 100644
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -117,10 +117,14 @@ rw_attribute(writeback_running);
rw_attribute(writeback_percent);
rw_attribute(writeback_delay);
rw_attribute(writeback_rate);
+rw_attribute(writeback_consider_fragment);
rw_attribute(writeback_rate_update_seconds);
rw_attribute(writeback_rate_i_term_inverse);
rw_attribute(writeback_rate_p_term_inverse);
+rw_attribute(writeback_rate_fp_term_low);
+rw_attribute(writeback_rate_fp_term_mid);
+rw_attribute(writeback_rate_fp_term_high);
rw_attribute(writeback_rate_minimum);
read_attribute(writeback_rate_debug);
@@ -195,6 +199,7 @@ SHOW(__bch_cached_dev)
var_printf(bypass_torture_test, "%i");
var_printf(writeback_metadata, "%i");
var_printf(writeback_running, "%i");
+ var_printf(writeback_consider_fragment, "%i");
var_print(writeback_delay);
var_print(writeback_percent);
sysfs_hprint(writeback_rate,
@@ -205,6 +210,9 @@ SHOW(__bch_cached_dev)
var_print(writeback_rate_update_seconds);
var_print(writeback_rate_i_term_inverse);
var_print(writeback_rate_p_term_inverse);
+ var_print(writeback_rate_fp_term_low);
+ var_print(writeback_rate_fp_term_mid);
+ var_print(writeback_rate_fp_term_high);
var_print(writeback_rate_minimum);
if (attr == &sysfs_writeback_rate_debug) {
@@ -303,6 +311,7 @@ STORE(__cached_dev)
sysfs_strtoul_bool(bypass_torture_test, dc->bypass_torture_test);
sysfs_strtoul_bool(writeback_metadata, dc->writeback_metadata);
sysfs_strtoul_bool(writeback_running, dc->writeback_running);
+ sysfs_strtoul_bool(writeback_consider_fragment, dc->writeback_consider_fragment);
sysfs_strtoul_clamp(writeback_delay, dc->writeback_delay, 0, UINT_MAX);
sysfs_strtoul_clamp(writeback_percent, dc->writeback_percent,
@@ -331,6 +340,16 @@ STORE(__cached_dev)
sysfs_strtoul_clamp(writeback_rate_p_term_inverse,
dc->writeback_rate_p_term_inverse,
1, UINT_MAX);
+ sysfs_strtoul_clamp(writeback_rate_fp_term_low,
+ dc->writeback_rate_fp_term_low,
+ 1, dc->writeback_rate_fp_term_mid - 1);
+ sysfs_strtoul_clamp(writeback_rate_fp_term_mid,
+ dc->writeback_rate_fp_term_mid,
+ dc->writeback_rate_fp_term_low + 1,
+ dc->writeback_rate_fp_term_high - 1);
+ sysfs_strtoul_clamp(writeback_rate_fp_term_high,
+ dc->writeback_rate_fp_term_high,
+ dc->writeback_rate_fp_term_mid + 1, UINT_MAX);
sysfs_strtoul_clamp(writeback_rate_minimum,
dc->writeback_rate_minimum,
1, UINT_MAX);
@@ -499,9 +518,13 @@ static struct attribute *bch_cached_dev_files[] = {
&sysfs_writeback_delay,
&sysfs_writeback_percent,
&sysfs_writeback_rate,
+ &sysfs_writeback_consider_fragment,
&sysfs_writeback_rate_update_seconds,
&sysfs_writeback_rate_i_term_inverse,
&sysfs_writeback_rate_p_term_inverse,
+ &sysfs_writeback_rate_fp_term_low,
+ &sysfs_writeback_rate_fp_term_mid,
+ &sysfs_writeback_rate_fp_term_high,
&sysfs_writeback_rate_minimum,
&sysfs_writeback_rate_debug,
&sysfs_io_errors,
@@ -1071,8 +1094,10 @@ SHOW(__bch_cache)
--n;
while (cached < p + n &&
- *cached == BTREE_PRIO)
- cached++, n--;
+ *cached == BTREE_PRIO) {
+ cached++;
+ n--;
+ }
for (i = 0; i < n; i++)
sum += INITIAL_PRIO - cached[i];
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index a129e4d2707c..82d4e0880a99 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -88,6 +88,44 @@ static void __update_writeback_rate(struct cached_dev *dc)
int64_t integral_scaled;
uint32_t new_rate;
+ /*
+ * We need to consider the number of dirty buckets as well
+ * when calculating proportional_scaled. Otherwise we might
+ * have an unreasonably small writeback rate in a highly fragmented
+ * situation where very few dirty sectors consume a lot of dirty buckets.
+ * The worst case is when dirty buckets reach cutoff_writeback_sync while
+ * dirty data has not even reached the writeback percent, so the rate
+ * will still be at the minimum value, which will cause writes
+ * to get stuck in a non-writeback mode.
+ */
+ struct cache_set *c = dc->disk.c;
+
+ int64_t dirty_buckets = c->nbuckets - c->avail_nbuckets;
+
+ if (dc->writeback_consider_fragment &&
+ c->gc_stats.in_use > BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW && dirty > 0) {
+ int64_t fragment =
+ div_s64((dirty_buckets * c->cache->sb.bucket_size), dirty);
+ int64_t fp_term;
+ int64_t fps;
+
+ if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID) {
+ fp_term = dc->writeback_rate_fp_term_low *
+ (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW);
+ } else if (c->gc_stats.in_use <= BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH) {
+ fp_term = dc->writeback_rate_fp_term_mid *
+ (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID);
+ } else {
+ fp_term = dc->writeback_rate_fp_term_high *
+ (c->gc_stats.in_use - BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH);
+ }
+ fps = div_s64(dirty, dirty_buckets) * fp_term;
+ if (fragment > 3 && fps > proportional_scaled) {
+ /* Only overwrite the proportional term when fragment > 3 */
+ proportional_scaled = fps;
+ }
+ }
+
if ((error < 0 && dc->writeback_rate_integral > 0) ||
(error > 0 && time_before64(local_clock(),
dc->writeback_rate.next + NSEC_PER_MSEC))) {
@@ -977,6 +1015,7 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_metadata = true;
dc->writeback_running = false;
+ dc->writeback_consider_fragment = true;
dc->writeback_percent = 10;
dc->writeback_delay = 30;
atomic_long_set(&dc->writeback_rate.rate, 1024);
@@ -984,6 +1023,9 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc)
dc->writeback_rate_update_seconds = WRITEBACK_RATE_UPDATE_SECS_DEFAULT;
dc->writeback_rate_p_term_inverse = 40;
+ dc->writeback_rate_fp_term_low = 1;
+ dc->writeback_rate_fp_term_mid = 10;
+ dc->writeback_rate_fp_term_high = 1000;
dc->writeback_rate_i_term_inverse = 10000;
WARN_ON(test_and_clear_bit(BCACHE_DEV_WB_RUNNING, &dc->disk.flags));
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index 3f1230e22de0..02b2f9df73f6 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -16,6 +16,10 @@
#define BCH_AUTO_GC_DIRTY_THRESHOLD 50
+#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_LOW 50
+#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_MID 57
+#define BCH_WRITEBACK_FRAGMENT_THRESHOLD_HIGH 64
+
#define BCH_DIRTY_INIT_THRD_MAX 64
/*
* 14 (16384ths) is chosen here as something that each backing device
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index e1eefbec15d4..a9ae7d113492 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4539,10 +4539,6 @@ read_more:
read_bio->bi_private = r10_bio;
read_bio->bi_end_io = end_reshape_read;
bio_set_op_attrs(read_bio, REQ_OP_READ, 0);
- read_bio->bi_flags &= (~0UL << BIO_RESET_BITS);
- read_bio->bi_status = 0;
- read_bio->bi_vcnt = 0;
- read_bio->bi_iter.bi_size = 0;
r10_bio->master_bio = read_bio;
r10_bio->read_slot = r10_bio->devs[r10_bio->read_slot].devnum;
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a348b2adf2a9..5d57a5bd171f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -7643,7 +7643,7 @@ static int raid5_run(struct mddev *mddev)
}
/* device size must be a multiple of chunk size */
- mddev->dev_sectors &= ~(mddev->chunk_sectors - 1);
+ mddev->dev_sectors &= ~((sector_t)mddev->chunk_sectors - 1);
mddev->resync_max_sectors = mddev->dev_sectors;
if (mddev->degraded > dirty_parity_disks &&