diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 9a670bb2ccfb..bba83011b18b 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -31,8 +31,6 @@ static const unsigned BCH_ALLOC_V1_FIELD_BYTES[] = { #undef x }; -static void bch2_recalc_oldest_io(struct bch_fs *, struct bch_dev *, int); - /* Ratelimiting/PD controllers */ static void pd_controllers_update(struct work_struct *work) @@ -340,9 +338,7 @@ static int bch2_alloc_read_fn(struct bch_fs *c, enum btree_id id, int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) { - struct bch_dev *ca; - unsigned i; - int ret = 0; + int ret; down_read(&c->gc_lock); ret = bch2_btree_and_journal_walk(c, journal_keys, BTREE_ID_ALLOC, @@ -358,22 +354,6 @@ int bch2_alloc_read(struct bch_fs *c, struct journal_keys *journal_keys) bch2_dev_usage_from_buckets(c); percpu_up_write(&c->mark_lock); - mutex_lock(&c->bucket_clock[READ].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, READ); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[READ].lock); - - mutex_lock(&c->bucket_clock[WRITE].lock); - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - bch2_recalc_oldest_io(c, ca, WRITE); - up_read(&ca->bucket_lock); - } - mutex_unlock(&c->bucket_clock[WRITE].lock); - return 0; } @@ -460,114 +440,6 @@ int bch2_alloc_write(struct bch_fs *c, unsigned flags) /* Bucket IO clocks: */ -static void bch2_recalc_oldest_io(struct bch_fs *c, struct bch_dev *ca, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets = bucket_array(ca); - struct bucket *g; - u16 max_last_io = 0; - unsigned i; - - lockdep_assert_held(&c->bucket_clock[rw].lock); - - /* Recalculate max_last_io for this device: */ - for_each_bucket(g, buckets) - max_last_io = max(max_last_io, bucket_last_io(c, g, rw)); - - ca->max_last_bucket_io[rw] = max_last_io; - - /* Recalculate global max_last_io: */ - max_last_io = 0; - - for_each_member_device(ca, c, i) - max_last_io = max(max_last_io, ca->max_last_bucket_io[rw]); - - clock->max_last_io = max_last_io; -} - -static void bch2_rescale_bucket_io_times(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - struct bucket_array *buckets; - struct bch_dev *ca; - struct bucket *g; - unsigned i; - - trace_rescale_prios(c); - - for_each_member_device(ca, c, i) { - down_read(&ca->bucket_lock); - buckets = bucket_array(ca); - - for_each_bucket(g, buckets) - g->io_time[rw] = clock->hand - - bucket_last_io(c, g, rw) / 2; - - bch2_recalc_oldest_io(c, ca, rw); - - up_read(&ca->bucket_lock); - } -} - -static inline u64 bucket_clock_freq(u64 capacity) -{ - return max(capacity >> 10, 2028ULL); -} - -static void bch2_inc_clock_hand(struct io_timer *timer) -{ - struct bucket_clock *clock = container_of(timer, - struct bucket_clock, rescale); - struct bch_fs *c = container_of(clock, - struct bch_fs, bucket_clock[clock->rw]); - struct bch_dev *ca; - u64 capacity; - unsigned i; - - mutex_lock(&clock->lock); - - /* if clock cannot be advanced more, rescale prio */ - if (clock->max_last_io >= U16_MAX - 2) - bch2_rescale_bucket_io_times(c, clock->rw); - - BUG_ON(clock->max_last_io >= U16_MAX - 2); - - for_each_member_device(ca, c, i) - ca->max_last_bucket_io[clock->rw]++; - clock->max_last_io++; - clock->hand++; - - mutex_unlock(&clock->lock); - - capacity = READ_ONCE(c->capacity); - - if (!capacity) - return; - - /* - * we only increment when 0.1% of 
the filesystem capacity has been read - * or written too, this determines if it's time - * - * XXX: we shouldn't really be going off of the capacity of devices in - * RW mode (that will be 0 when we're RO, yet we can still service - * reads) - */ - timer->expire += bucket_clock_freq(capacity); - - bch2_io_timer_add(&c->io_clock[clock->rw], timer); -} - -static void bch2_bucket_clock_init(struct bch_fs *c, int rw) -{ - struct bucket_clock *clock = &c->bucket_clock[rw]; - - clock->hand = 1; - clock->rw = rw; - clock->rescale.fn = bch2_inc_clock_hand; - clock->rescale.expire = bucket_clock_freq(c->capacity); - mutex_init(&clock->lock); -} - int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, size_t bucket_nr, int rw) { @@ -577,7 +449,7 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, struct bucket *g; struct bkey_alloc_buf *a; struct bkey_alloc_unpacked u; - u64 *time; + u64 *time, now; int ret = 0; iter = bch2_trans_get_iter(trans, BTREE_ID_ALLOC, POS(dev, bucket_nr), @@ -599,10 +471,11 @@ int bch2_bucket_io_time_reset(struct btree_trans *trans, unsigned dev, percpu_up_read(&c->mark_lock); time = rw == READ ? &u.read_time : &u.write_time; - if (*time == c->bucket_clock[rw].hand) + now = atomic64_read(&c->io_clock[rw].now); + if (*time == now) goto out; - *time = c->bucket_clock[rw].hand; + *time = now; bch2_alloc_pack(c, a, u); ret = bch2_trans_update(trans, iter, &a->k, 0) ?: @@ -674,23 +547,22 @@ static int wait_buckets_available(struct bch_fs *c, struct bch_dev *ca) return ret; } -static bool bch2_can_invalidate_bucket(struct bch_dev *ca, - size_t bucket, - struct bucket_mark mark) +static bool bch2_can_invalidate_bucket(struct bch_dev *ca, size_t b, + struct bucket_mark m) { u8 gc_gen; - if (!is_available_bucket(mark)) + if (!is_available_bucket(m)) return false; - if (mark.owned_by_allocator) + if (m.owned_by_allocator) return false; if (ca->buckets_nouse && - test_bit(bucket, ca->buckets_nouse)) + test_bit(b, ca->buckets_nouse)) return false; - gc_gen = bucket_gc_gen(ca, bucket); + gc_gen = bucket_gc_gen(bucket(ca, b)); if (gc_gen >= BUCKET_GC_GEN_MAX / 2) ca->inc_gen_needs_gc++; @@ -704,43 +576,33 @@ static bool bch2_can_invalidate_bucket(struct bch_dev *ca, /* * Determines what order we're going to reuse buckets, smallest bucket_key() * first. - * - * - * - We take into account the read prio of the bucket, which gives us an - * indication of how hot the data is -- we scale the prio so that the prio - * farthest from the clock is worth 1/8th of the closest. - * - * - The number of sectors of cached data in the bucket, which gives us an - * indication of the cost in cache misses this eviction will cause. - * - * - If hotness * sectors used compares equal, we pick the bucket with the - * smallest bucket_gc_gen() - since incrementing the same bucket's generation - * number repeatedly forces us to run mark and sweep gc to avoid generation - * number wraparound. 
*/ -static unsigned long bucket_sort_key(struct bch_fs *c, struct bch_dev *ca, - size_t b, struct bucket_mark m) +static unsigned bucket_sort_key(struct bucket *g, struct bucket_mark m, + u64 now, u64 last_seq_ondisk) { - unsigned last_io = bucket_last_io(c, bucket(ca, b), READ); - unsigned max_last_io = ca->max_last_bucket_io[READ]; + unsigned used = bucket_sectors_used(m); - /* - * Time since last read, scaled to [0, 8) where larger value indicates - * more recently read data: - */ - unsigned long hotness = (max_last_io - last_io) * 7 / max_last_io; + if (used) { + /* + * Prefer to keep buckets that have been read more recently, and + * buckets that have more data in them: + */ + u64 last_read = max_t(s64, 0, now - g->io_time[READ]); + u32 last_read_scaled = max_t(u64, U32_MAX, div_u64(last_read, used)); - /* How much we want to keep the data in this bucket: */ - unsigned long data_wantness = - (hotness + 1) * bucket_sectors_used(m); - - unsigned long needs_journal_commit = - bucket_needs_journal_commit(m, c->journal.last_seq_ondisk); - - return (data_wantness << 9) | - (needs_journal_commit << 8) | - (bucket_gc_gen(ca, b) / 16); + return -last_read_scaled; + } else { + /* + * Prefer to use buckets with smaller gc_gen so that we don't + * have to walk the btree and recalculate oldest_gen - but shift + * off the low bits so that buckets will still have equal sort + * keys when there's only a small difference, so that we can + * keep sequential buckets together: + */ + return (bucket_needs_journal_commit(m, last_seq_ondisk) << 4)| + (bucket_gc_gen(g) >> 4); + } } static inline int bucket_alloc_cmp(alloc_heap *h, @@ -763,16 +625,15 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) { struct bucket_array *buckets; struct alloc_heap_entry e = { 0 }; + u64 now, last_seq_ondisk; size_t b, i, nr = 0; - ca->alloc_heap.used = 0; - - mutex_lock(&c->bucket_clock[READ].lock); down_read(&ca->bucket_lock); buckets = bucket_array(ca); - - bch2_recalc_oldest_io(c, ca, READ); + ca->alloc_heap.used = 0; + now = atomic64_read(&c->io_clock[READ].now); + last_seq_ondisk = c->journal.last_seq_ondisk; /* * Find buckets with lowest read priority, by building a maxheap sorted @@ -780,8 +641,9 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) * all buckets have been visited. 
*/ for (b = ca->mi.first_bucket; b < ca->mi.nbuckets; b++) { - struct bucket_mark m = READ_ONCE(buckets->b[b].mark); - unsigned long key = bucket_sort_key(c, ca, b, m); + struct bucket *g = &buckets->b[b]; + struct bucket_mark m = READ_ONCE(g->mark); + unsigned key = bucket_sort_key(g, m, now, last_seq_ondisk); if (!bch2_can_invalidate_bucket(ca, b, m)) continue; @@ -816,7 +678,6 @@ static void find_reclaimable_buckets_lru(struct bch_fs *c, struct bch_dev *ca) } up_read(&ca->bucket_lock); - mutex_unlock(&c->bucket_clock[READ].lock); } static void find_reclaimable_buckets_fifo(struct bch_fs *c, struct bch_dev *ca) @@ -1031,8 +892,8 @@ static int bch2_invalidate_one_bucket2(struct btree_trans *trans, u.data_type = 0; u.dirty_sectors = 0; u.cached_sectors = 0; - u.read_time = c->bucket_clock[READ].hand; - u.write_time = c->bucket_clock[WRITE].hand; + u.read_time = atomic64_read(&c->io_clock[READ].now); + u.write_time = atomic64_read(&c->io_clock[WRITE].now); bch2_alloc_pack(c, &a, u); bch2_trans_update(trans, iter, &a.k, @@ -1542,8 +1403,6 @@ int bch2_dev_allocator_start(struct bch_dev *ca) void bch2_fs_allocator_background_init(struct bch_fs *c) { spin_lock_init(&c->freelist_lock); - bch2_bucket_clock_init(c, READ); - bch2_bucket_clock_init(c, WRITE); c->pd_controllers_update_seconds = 5; INIT_DELAYED_WORK(&c->pd_controllers_update, pd_controllers_update); diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 1abfff5290bc..be164d6108bb 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -10,30 +10,6 @@ struct ec_bucket_buf; -/* There's two of these clocks, one for reads and one for writes: */ -struct bucket_clock { - /* - * "now" in (read/write) IO time - incremented whenever we do X amount - * of reads or writes. - * - * Goes with the bucket read/write prios: when we read or write to a - * bucket we reset the bucket's prio to the current hand; thus hand - - * prio = time since bucket was last read/written. - * - * The units are some amount (bytes/sectors) of data read/written, and - * the units can change on the fly if we need to rescale to fit - * everything in a u16 - your only guarantee is that the units are - * consistent. - */ - u16 hand; - u16 max_last_io; - - int rw; - - struct io_timer rescale; - struct mutex lock; -}; - enum alloc_reserve { RESERVE_BTREE_MOVINGGC = -2, RESERVE_BTREE = -1, diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index bd675b88b354..763cac0efa0c 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -451,9 +451,6 @@ struct bch_dev { size_t fifo_last_bucket; - /* last calculated minimum prio */ - u16 max_last_bucket_io[2]; - size_t inc_gen_needs_gc; size_t inc_gen_really_needs_gc; @@ -693,14 +690,6 @@ struct bch_fs { struct mutex usage_scratch_lock; struct bch_fs_usage_online *usage_scratch; - /* - * When we invalidate buckets, we use both the priority and the amount - * of good data to determine which buckets to reuse first - to weight - * those together consistently we keep track of the smallest nonzero - * priority of any bucket. 
- */ - struct bucket_clock bucket_clock[2]; - struct io_clock io_clock[2]; /* JOURNAL SEQ BLACKLIST */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index b6c7e57b6bcd..5dab5bfd228a 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -1143,8 +1143,8 @@ struct bch_sb_field_clean { struct bch_sb_field field; __le32 flags; - __le16 read_clock; - __le16 write_clock; + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; __le64 journal_seq; union { @@ -1511,7 +1511,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(blacklist, 3) \ x(blacklist_v2, 4) \ x(usage, 5) \ - x(data_usage, 6) + x(data_usage, 6) \ + x(clock, 7) enum { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, @@ -1559,6 +1560,13 @@ struct jset_entry_data_usage { struct bch_replicas_entry r; } __attribute__((packed)); +struct jset_entry_clock { + struct jset_entry entry; + __u8 rw; + __u8 pad[7]; + __le64 time; +} __attribute__((packed)); + /* * On disk format for a journal entry: * seq is monotonically increasing; every journal entry has its own unique @@ -1581,8 +1589,8 @@ struct jset { __u8 encrypted_start[0]; - __le16 read_clock; - __le16 write_clock; + __le16 _read_clock; /* no longer used */ + __le16 _write_clock; /* Sequence number of oldest dirty journal entry */ __le64 last_seq; diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 9e123736a125..5ea9bae09d59 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -1489,7 +1489,7 @@ static int bch2_gc_thread(void *arg) { struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last = atomic_long_read(&clock->now); + unsigned long last = atomic64_read(&clock->now); unsigned last_kick = atomic_read(&c->kick_gc); int ret; @@ -1510,7 +1510,7 @@ static int bch2_gc_thread(void *arg) if (c->btree_gc_periodic) { unsigned long next = last + c->capacity / 16; - if (atomic_long_read(&clock->now) >= next) + if (atomic64_read(&clock->now) >= next) break; bch2_io_clock_schedule_timeout(clock, next); @@ -1522,7 +1522,7 @@ static int bch2_gc_thread(void *arg) } __set_current_state(TASK_RUNNING); - last = atomic_long_read(&clock->now); + last = atomic64_read(&clock->now); last_kick = atomic_read(&c->kick_gc); /* diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index 4103ea7e769a..50989d286190 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -58,20 +58,13 @@ static inline struct bucket *bucket(struct bch_dev *ca, size_t b) return __bucket(ca, b, false); } -static inline u16 bucket_last_io(struct bch_fs *c, struct bucket *g, int rw) -{ - return c->bucket_clock[rw].hand - g->io_time[rw]; -} - /* * bucket_gc_gen() returns the difference between the bucket's current gen and * the oldest gen of any pointer into that bucket in the btree. 
*/ -static inline u8 bucket_gc_gen(struct bch_dev *ca, size_t b) +static inline u8 bucket_gc_gen(struct bucket *g) { - struct bucket *g = bucket(ca, b); - return g->mark.gen - g->oldest_gen; } diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 99ab9f48ba9d..b6ea67506cc2 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -37,7 +37,7 @@ struct bucket { const struct bucket_mark mark; }; - u16 io_time[2]; + u64 io_time[2]; u8 oldest_gen; u8 gc_gen; unsigned gen_valid:1; diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 869ba1887757..da91c95e3ffc 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -19,7 +19,7 @@ void bch2_io_timer_add(struct io_clock *clock, struct io_timer *timer) spin_lock(&clock->timer_lock); - if (time_after_eq((unsigned long) atomic_long_read(&clock->now), + if (time_after_eq((unsigned long) atomic64_read(&clock->now), timer->expire)) { spin_unlock(&clock->timer_lock); timer->fn(timer); @@ -146,7 +146,7 @@ static struct io_timer *get_expired_timer(struct io_clock *clock, void __bch2_increment_clock(struct io_clock *clock, unsigned sectors) { struct io_timer *timer; - unsigned long now = atomic_long_add_return(sectors, &clock->now); + unsigned long now = atomic64_add_return(sectors, &clock->now); while ((timer = get_expired_timer(clock, now))) timer->fn(timer); @@ -158,7 +158,7 @@ void bch2_io_timers_to_text(struct printbuf *out, struct io_clock *clock) unsigned i; spin_lock(&clock->timer_lock); - now = atomic_long_read(&clock->now); + now = atomic64_read(&clock->now); for (i = 0; i < clock->timers.used; i++) pr_buf(out, "%ps:\t%li\n", @@ -175,7 +175,7 @@ void bch2_io_clock_exit(struct io_clock *clock) int bch2_io_clock_init(struct io_clock *clock) { - atomic_long_set(&clock->now, 0); + atomic64_set(&clock->now, 0); spin_lock_init(&clock->timer_lock); clock->max_slop = IO_CLOCK_PCPU_SECTORS * num_possible_cpus(); diff --git a/fs/bcachefs/clock_types.h b/fs/bcachefs/clock_types.h index 92c740a47565..5fae0012d808 100644 --- a/fs/bcachefs/clock_types.h +++ b/fs/bcachefs/clock_types.h @@ -26,7 +26,7 @@ struct io_timer { typedef HEAP(struct io_timer *) io_timer_heap; struct io_clock { - atomic_long_t now; + atomic64_t now; u16 __percpu *pcpu_buf; unsigned max_slop; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index ba37c78c01db..379b9ad2c0f9 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -1123,6 +1123,9 @@ int bch2_fs_journal_init(struct journal *j) j->entry_u64s_reserved += BTREE_ID_NR * (JSET_KEYS_U64s + BKEY_BTREE_PTR_U64s_MAX); + j->entry_u64s_reserved += + 2 * (sizeof(struct jset_entry_clock) / sizeof(u64)); + atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 7e726db77881..a82548983dbd 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -426,6 +426,32 @@ static int journal_entry_validate_data_usage(struct bch_fs *c, return ret; } +static int journal_entry_validate_clock(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + int write) +{ + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + unsigned bytes = jset_u64s(le16_to_cpu(entry->u64s)) * sizeof(u64); + int ret = 0; + + if (journal_entry_err_on(bytes != sizeof(*clock), + c, "invalid journal entry clock: bad size")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + + if 
(journal_entry_err_on(clock->rw > 1, + c, "invalid journal entry clock: bad rw")) { + journal_entry_null_range(entry, vstruct_next(entry)); + return ret; + } + +fsck_err: + return ret; +} + struct jset_entry_ops { int (*validate)(struct bch_fs *, struct jset *, struct jset_entry *, int); @@ -1361,8 +1387,8 @@ void bch2_journal_write(struct closure *cl) end = bch2_btree_roots_to_journal_entries(c, jset->start, end); - end = bch2_journal_super_entries_add_common(c, end, - le64_to_cpu(jset->seq)); + bch2_journal_super_entries_add_common(c, &end, + le64_to_cpu(jset->seq)); u64s = (u64 *) end - (u64 *) start; BUG_ON(u64s > j->entry_u64s_reserved); @@ -1371,10 +1397,7 @@ void bch2_journal_write(struct closure *cl) journal_write_compact(jset); - jset->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); - jset->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); jset->magic = cpu_to_le64(jset_magic(c)); - jset->version = c->sb.version < bcachefs_metadata_version_new_versioning ? cpu_to_le32(BCH_JSET_VERSION_OLD) : cpu_to_le32(c->sb.version); diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 8e6e4cd73886..e2472c19beaf 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -298,7 +298,7 @@ static int bch2_copygc_thread(void *arg) { struct bch_fs *c = arg; struct io_clock *clock = &c->io_clock[WRITE]; - unsigned long last, wait; + u64 last, wait; set_freezable(); @@ -306,7 +306,7 @@ static int bch2_copygc_thread(void *arg) if (kthread_wait_freezable(c->copy_gc_enabled)) break; - last = atomic_long_read(&clock->now); + last = atomic64_read(&clock->now); wait = bch2_copygc_wait_amount(c); if (wait > clock->max_slop) { diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index f9a12dd797a5..2263ee41c444 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -169,12 +169,12 @@ static int bch2_rebalance_thread(void *arg) unsigned long start, prev_start; unsigned long prev_run_time, prev_run_cputime; unsigned long cputime, prev_cputime; - unsigned long io_start; + u64 io_start; long throttle; set_freezable(); - io_start = atomic_long_read(&clock->now); + io_start = atomic64_read(&clock->now); p = rebalance_work(c); prev_start = jiffies; prev_cputime = curr_cputime(); @@ -210,7 +210,7 @@ static int bch2_rebalance_thread(void *arg) (20 - w.dev_most_full_percent), 50); - if (atomic_long_read(&clock->now) + clock->max_slop < + if (atomic64_read(&clock->now) + clock->max_slop < r->throttled_until_iotime) { r->throttled_until_cputime = start + throttle; r->state = REBALANCE_THROTTLED; @@ -229,7 +229,7 @@ static int bch2_rebalance_thread(void *arg) max(p.dev_most_full_percent, 1U) / max(w.dev_most_full_percent, 1U)); - io_start = atomic_long_read(&clock->now); + io_start = atomic64_read(&clock->now); p = w; prev_start = start; prev_cputime = cputime; @@ -274,7 +274,7 @@ void bch2_rebalance_work_to_text(struct printbuf *out, struct bch_fs *c) case REBALANCE_THROTTLED: bch2_hprint(&PBUF(h1), (r->throttled_until_iotime - - atomic_long_read(&c->io_clock[WRITE].now)) << 9); + atomic64_read(&c->io_clock[WRITE].now)) << 9); pr_buf(out, "throttled for %lu sec or %s io\n", (r->throttled_until_cputime - jiffies) / HZ, h1); diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index 192c6be20ced..2f62a643c39f 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -17,7 +17,7 @@ struct bch_fs_rebalance { atomic64_t work_unknown_dev; enum rebalance_state state; - unsigned long throttled_until_iotime; + u64 
throttled_until_iotime; unsigned long throttled_until_cputime; struct bch_move_stats move_stats; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index f470e0e233ce..55f7771e11c8 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -847,6 +847,12 @@ static int journal_replay_entry_early(struct bch_fs *c, le64_to_cpu(bl_entry->end) + 1); break; } + case BCH_JSET_ENTRY_clock: { + struct jset_entry_clock *clock = + container_of(entry, struct jset_entry_clock, entry); + + atomic64_set(&c->io_clock[clock->rw].now, clock->time); + } } return ret; @@ -861,9 +867,6 @@ static int journal_replay_early(struct bch_fs *c, int ret; if (clean) { - c->bucket_clock[READ].hand = le16_to_cpu(clean->read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(clean->write_clock); - for (entry = clean->start; entry != vstruct_end(&clean->field); entry = vstruct_next(entry)) { @@ -876,9 +879,6 @@ static int journal_replay_early(struct bch_fs *c, if (i->ignore) continue; - c->bucket_clock[READ].hand = le16_to_cpu(i->j.read_clock); - c->bucket_clock[WRITE].hand = le16_to_cpu(i->j.write_clock); - vstruct_for_each(&i->j, entry) { ret = journal_replay_entry_early(c, entry); if (ret) @@ -942,13 +942,6 @@ static int verify_superblock_clean(struct bch_fs *c, return 0; } - mustfix_fsck_err_on(j->read_clock != clean->read_clock, c, - "superblock read clock %u doesn't match journal %u after clean shutdown", - clean->read_clock, j->read_clock); - mustfix_fsck_err_on(j->write_clock != clean->write_clock, c, - "superblock write clock %u doesn't match journal %u after clean shutdown", - clean->write_clock, j->write_clock); - for (i = 0; i < BTREE_ID_NR; i++) { char buf1[200], buf2[200]; struct bkey_i *k1, *k2; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 61b947313c88..3b082da934fb 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -966,29 +966,25 @@ int bch2_fs_mark_dirty(struct bch_fs *c) return ret; } -static void -entry_init_u64s(struct jset_entry *entry, unsigned u64s) +static struct jset_entry *jset_entry_init(struct jset_entry **end, size_t size) { - memset(entry, 0, u64s * sizeof(u64)); + struct jset_entry *entry = *end; + unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); + memset(entry, 0, u64s * sizeof(u64)); /* * The u64s field counts from the start of data, ignoring the shared * fields. 
*/ entry->u64s = u64s - 1; + + *end = vstruct_next(*end); + return entry; } -static void -entry_init_size(struct jset_entry *entry, size_t size) -{ - unsigned u64s = DIV_ROUND_UP(size, sizeof(u64)); - entry_init_u64s(entry, u64s); -} - -struct jset_entry * -bch2_journal_super_entries_add_common(struct bch_fs *c, - struct jset_entry *entry, - u64 journal_seq) +void bch2_journal_super_entries_add_common(struct bch_fs *c, + struct jset_entry **end, + u64 journal_seq) { unsigned i; @@ -1003,59 +999,59 @@ bch2_journal_super_entries_add_common(struct bch_fs *c, { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_INODES; u->v = cpu_to_le64(c->usage_base->nr_inodes); - - entry = vstruct_next(entry); } { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_KEY_VERSION; u->v = cpu_to_le64(atomic64_read(&c->key_version)); - - entry = vstruct_next(entry); } for (i = 0; i < BCH_REPLICAS_MAX; i++) { struct jset_entry_usage *u = - container_of(entry, struct jset_entry_usage, entry); + container_of(jset_entry_init(end, sizeof(*u)), + struct jset_entry_usage, entry); - entry_init_size(entry, sizeof(*u)); u->entry.type = BCH_JSET_ENTRY_usage; u->entry.btree_id = FS_USAGE_RESERVED; u->entry.level = i; u->v = cpu_to_le64(c->usage_base->persistent_reserved[i]); - - entry = vstruct_next(entry); } for (i = 0; i < c->replicas.nr; i++) { struct bch_replicas_entry *e = cpu_replicas_entry(&c->replicas, i); struct jset_entry_data_usage *u = - container_of(entry, struct jset_entry_data_usage, entry); + container_of(jset_entry_init(end, sizeof(*u) + e->nr_devs), + struct jset_entry_data_usage, entry); - entry_init_size(entry, sizeof(*u) + e->nr_devs); u->entry.type = BCH_JSET_ENTRY_data_usage; u->v = cpu_to_le64(c->usage_base->replicas[i]); unsafe_memcpy(&u->r, e, replicas_entry_bytes(e), "embedded variable length struct"); - - entry = vstruct_next(entry); } percpu_up_read(&c->mark_lock); - return entry; + for (i = 0; i < 2; i++) { + struct jset_entry_clock *clock = + container_of(jset_entry_init(end, sizeof(*clock)), + struct jset_entry_clock, entry); + + clock->entry.type = BCH_JSET_ENTRY_clock; + clock->rw = i; + clock->time = atomic64_read(&c->io_clock[i].now); + } } void bch2_fs_mark_clean(struct bch_fs *c) @@ -1084,15 +1080,13 @@ void bch2_fs_mark_clean(struct bch_fs *c) } sb_clean->flags = 0; - sb_clean->read_clock = cpu_to_le16(c->bucket_clock[READ].hand); - sb_clean->write_clock = cpu_to_le16(c->bucket_clock[WRITE].hand); sb_clean->journal_seq = cpu_to_le64(journal_cur_seq(&c->journal) - 1); /* Trying to catch outstanding bug: */ BUG_ON(le64_to_cpu(sb_clean->journal_seq) > S64_MAX); entry = sb_clean->start; - entry = bch2_journal_super_entries_add_common(c, entry, 0); + bch2_journal_super_entries_add_common(c, &entry, 0); entry = bch2_btree_roots_to_journal_entries(c, entry, entry); BUG_ON((void *) entry > vstruct_end(&sb_clean->field)); diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index 402ae563b3c7..dd8d4ba911f0 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -122,9 +122,8 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct 
bch_member *mi) /* BCH_SB_FIELD_clean: */ -struct jset_entry * -bch2_journal_super_entries_add_common(struct bch_fs *, - struct jset_entry *, u64); +void bch2_journal_super_entries_add_common(struct bch_fs *, + struct jset_entry **, u64); void bch2_sb_clean_renumber(struct bch_sb_field_clean *, int); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index d451a29b517b..5f5893ab9edf 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -181,9 +181,6 @@ static void __bch2_fs_read_only(struct bch_fs *c) bch2_copygc_stop(c); bch2_gc_thread_stop(c); - bch2_io_timer_del(&c->io_clock[READ], &c->bucket_clock[READ].rescale); - bch2_io_timer_del(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); - /* * Flush journal before stopping allocators, because flushing journal * blacklist entries involves allocating new btree nodes: @@ -406,9 +403,6 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); - bch2_io_timer_add(&c->io_clock[READ], &c->bucket_clock[READ].rescale); - bch2_io_timer_add(&c->io_clock[WRITE], &c->bucket_clock[WRITE].rescale); - for_each_rw_member(ca, c, i) { ret = bch2_dev_allocator_start(ca); if (ret) { diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 521b6d8d518f..8fdbeaf9df32 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -705,7 +705,7 @@ static unsigned bucket_last_io_fn(struct bch_fs *c, struct bch_dev *ca, { int rw = (private ? 1 : 0); - return bucket_last_io(c, bucket(ca, b), rw); + return atomic64_read(&c->io_clock[rw].now) - bucket(ca, b)->io_time[rw]; } static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, @@ -718,7 +718,7 @@ static unsigned bucket_sectors_used_fn(struct bch_fs *c, struct bch_dev *ca, static unsigned bucket_oldest_gen_fn(struct bch_fs *c, struct bch_dev *ca, size_t b, void *private) { - return bucket_gc_gen(ca, b); + return bucket_gc_gen(bucket(ca, b)); } static int unsigned_cmp(const void *_l, const void *_r)
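
Note on the on-disk change, for readers following the patch: the u16 read_clock/write_clock fields in struct jset and struct bch_sb_field_clean are retired in favour of a BCH_JSET_ENTRY_clock journal entry that carries the full 64-bit io_clock value, which is why the old u16 rescaling machinery (bch2_inc_clock_hand, bch2_rescale_bucket_io_times, bucket_clock) can be deleted. The sketch below is a minimal standalone model of that round trip, under stated assumptions: the *_model struct and the encode/decode helpers are illustrative only, not bcachefs API, and host-endian integers stand in for the __le64 on-disk field.

	/*
	 * Illustrative sketch only: models how a BCH_JSET_ENTRY_clock entry
	 * carries the 64-bit IO clock, per the structures added in this patch.
	 * The helpers below are hypothetical and not part of bcachefs.
	 */
	#include <stdint.h>
	#include <string.h>
	#include <stdio.h>

	struct jset_entry_clock_model {     /* mirrors struct jset_entry_clock */
		uint8_t  rw;                /* 0 = READ clock, 1 = WRITE clock */
		uint8_t  pad[7];
		uint64_t time;              /* io_clock[rw].now, in sectors */
	};

	/* One entry per clock, as bch2_journal_super_entries_add_common() now emits. */
	static void clock_entries_encode(struct jset_entry_clock_model out[2],
					 const uint64_t now[2])
	{
		for (int rw = 0; rw < 2; rw++) {
			memset(&out[rw], 0, sizeof(out[rw]));
			out[rw].rw   = (uint8_t) rw;
			out[rw].time = now[rw];
		}
	}

	/* On recovery, journal_replay_entry_early() restores io_clock[rw].now. */
	static void clock_entries_decode(const struct jset_entry_clock_model in[2],
					 uint64_t now[2])
	{
		for (int rw = 0; rw < 2; rw++) {
			if (in[rw].rw > 1)  /* journal_entry_validate_clock() rejects this */
				continue;
			now[in[rw].rw] = in[rw].time;
		}
	}

	int main(void)
	{
		uint64_t before[2] = { 123456, 789012 }, after[2] = { 0, 0 };
		struct jset_entry_clock_model e[2];

		clock_entries_encode(e, before);
		clock_entries_decode(e, after);
		printf("read clock %llu, write clock %llu\n",
		       (unsigned long long) after[0],
		       (unsigned long long) after[1]);
		return 0;
	}

Because the clock is now a plain 64-bit sector count that never wraps in practice, bucket io_time[] can record absolute times (hence the u16 -> u64 widening in struct bucket) and consumers simply compute now - io_time, which is what the updated bucket_sort_key() and sysfs bucket_last_io_fn() in this patch do.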