#include "file.h"
|
|
|
|
#include <linux/uio.h>
|
|
#include <linux/sched/mm.h>
|
|
#include <linux/mm.h>
|
|
#include <linux/pagemap.h>
|
|
|
|
#include "bincode.h"
|
|
#include "inode.h"
|
|
#include "log.h"
|
|
#include "metadata.h"
|
|
#include "err.h"
|
|
#include "rs.h"
|
|
#include "block.h"
|
|
#include "crc.h"
|
|
#include "trace.h"
|
|
#include "span.h"
|
|
#include "wq.h"
|
|
#include "bincode.h"
|
|
#include "dir.h"
|
|
#include "inode_compat.h"
|
|
|
|
unsigned ternfs_atime_update_interval_sec = 0;
|
|
|
|
unsigned ternfs_file_io_timeout_sec = 86400;
|
|
|
|
unsigned ternfs_file_io_retry_refresh_span_interval_sec = 60; // 1 minute interval for refreshing spans during IO retries
|
|
|
|
unsigned ternfs_max_write_span_attempts = 5;
|
|
|
|
static struct kmem_cache* ternfs_transient_span_cachep;
|
|
|
|
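
// Editor's note: overview of the write path, as reconstructed from the code below.
// A file being written accumulates data in a "transient span" (enode->file.writing_span),
// page by page. Once the span reaches the maximum span size, or the file is flushed and
// linked, it becomes the "flushing span": compute_span_parameters() decides between inline
// storage and RS-coded blocks, write_blocks() lays the pages out into blocks and computes
// parity and CRCs, and add_span_initiate() asks the shard for block services and uploads
// the blocks. Completion is tracked per block in write_block_done()/write_block_finalize(),
// and the span is certified with ternfs_shard_add_span_certify() once every block has a proof.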
struct ternfs_transient_span {
    // The transient span holds a reference to this. As long as the
    // transient span lives, the enode must live.
    struct ternfs_inode* enode;
    // Offset in the file for this span
    u64 offset;
    // Linear list of pages with the body of the span, used when we're still gathering
    // content for it. We use `index` in the page to track where we're writing to.
    struct list_head pages;
    u32 written; // how much we've written to this span
    atomic_t refcount;

    // These are finalized when we start flushing out a span.
    char failure_domains[TERNFS_MAX_BLOCKS][16]; // failure domains for the current run, so we can blacklist failures
    char blacklisted_failure_domains[TERNFS_MAX_BLACKLIST_LENGTH][16]; // blacklisted failure domains
    struct list_head blocks[TERNFS_MAX_BLOCKS]; // the pages for each block
    u64 block_ids[TERNFS_MAX_BLOCKS]; // the block ids assigned by add span initiate
    spinlock_t lock; // we use this for various modifications
    u64 blocks_proofs[TERNFS_MAX_BLOCKS]; // when completing blocks, one of proof or err is set
    int blocks_errs[TERNFS_MAX_BLOCKS];
    u32 span_crc;
    u32 cell_crcs[TERNFS_MAX_BLOCKS*TERNFS_MAX_STRIPES];
    u32 block_size;
    u8 attempts;
    u8 stripes;
    u8 storage_class;
    u8 parity;
    u8 blacklist_length;
    bool started_flushing;
};
// Just a sanity check: this is two per page right now.
static_assert(sizeof(struct ternfs_transient_span) < (2<<10));

// open_mutex held here
// really want atomic open for this
static int file_open(struct inode* inode, struct file* filp) {
    inode_lock(inode); // for the .status modification below

    struct ternfs_inode* enode = TERNFS_I(inode);

    ternfs_debug("enode=%p status=%d owner=%p", enode, enode->file.status, current->group_leader);
    int err = 0;

    if ((filp->f_mode&FMODE_WRITE) && (enode->file.status == TERNFS_FILE_STATUS_WRITING)) {
        // this is the "common" writing case, we've just created a file to write it.
        // note that we never change the file owner, which might lead to confusing behavior
        // but is probably the only sensible thing to do.
    } else {
        // otherwise, the file must already be there. we're very relaxed in what we allow
        // in f_mode here, and we just fail when operations that can't be done (e.g. writing
        // to files) are attempted. the reason is that some workflows (such as open write +
        // setattr) _will_ work.
        enode->file.status = TERNFS_FILE_STATUS_READING;
        // also, set atime, if requested
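        // Editor's note: atime updates are rate limited by ternfs_atime_update_interval_sec
        // (0 by default, i.e. always attempt an update unless O_NOATIME is set); the getattr
        // further down is a cheap re-check in case another client already bumped atime on
        // this inode.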
        if (!(filp->f_flags&O_NOATIME)) {
            u64 atime_ns = ktime_get_real_ns();
            struct timespec64 atime_ts = ns_to_timespec64(atime_ns);
            u64 diff = atime_ts.tv_sec - min(inode_get_atime_sec(&enode->inode), atime_ts.tv_sec);
            if (diff < ternfs_atime_update_interval_sec) {
                // we don't think we should update
                goto out;
            }

            // https://internal-repo/issues/292
            // we might have cached data and another client updated atime.
            // ternfs_do_getattr is orders of magnitude cheaper than ternfs_shard_set_time,
            // so we might as well refresh and re-check
            int err = ternfs_do_getattr(enode, ATTR_CACHE_NO_TIMEOUT);
            if (err) {
                goto out;
            }
            diff = atime_ts.tv_sec - min(inode_get_atime_sec(&enode->inode), atime_ts.tv_sec);
            if (diff < ternfs_atime_update_interval_sec) {
                // our local time changed and we see we don't need to update
                goto out;
            }

            if ((inode_get_atime_sec(&enode->inode) > atime_ts.tv_sec) ||
                (inode_get_atime_sec(&enode->inode) == atime_ts.tv_sec &&
                 inode_get_atime_nsec(&enode->inode) == atime_ts.tv_nsec
                )
            ) {
                // we don't want atime to go into the past, don't update
                goto out;
            }
            u64 atime = atime_ns | (1ull<<63);
            err = ternfs_shard_set_time((struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, inode->i_ino, 0, atime);
            if (err) {
                goto out;
            }
            // we updated time. we don't need to refresh it now but allow refresh on next stat by getattr_expiry
            smp_store_release(&enode->getattr_expiry, 0);
        }
    }
out:
    inode_unlock(inode);
    return err;
}

static void init_transient_span(void* p) {
    struct ternfs_transient_span* span = (struct ternfs_transient_span*)p;
    INIT_LIST_HEAD(&span->pages);
    int i;
    for (i = 0; i < TERNFS_MAX_BLOCKS; i++) {
        INIT_LIST_HEAD(&span->blocks[i]);
    }
    // we set it to 0, we bug if not 0 when getting from cache
    atomic_set(&span->refcount, 0);
    spin_lock_init(&span->lock);
}

// starts out with refcount = 1
static struct ternfs_transient_span* new_transient_span(struct ternfs_inode* enode, u64 offset) {
    struct ternfs_transient_span* span = kmem_cache_alloc(ternfs_transient_span_cachep, GFP_KERNEL);
    if (span == NULL) { return span; }
    span->enode = enode;
    BUG_ON(atomic_read(&span->refcount) != 0);
    atomic_set(&span->refcount, 1);
    span->offset = offset;
    span->written = 0;
    memset(span->failure_domains, 0, sizeof(span->failure_domains));
    memset(span->blacklisted_failure_domains, 0, sizeof(span->blacklisted_failure_domains));
    memset(span->blocks_proofs, 0, sizeof(span->blocks_proofs));
    memset(span->blocks_errs, 0, sizeof(span->blocks_errs));
    span->span_crc = 0;
    memset(span->cell_crcs, 0, sizeof(span->cell_crcs));
    span->attempts = 0;
    // We set the parity to zero here so that `put_transient_span`
    // is always safe to run, since it traverses the blocks to free
    // the pages, if any.
    span->parity = 0;
    span->blacklist_length = 0;
    span->started_flushing = false;
    return span;
}

static void hold_transient_span(struct ternfs_transient_span* span) {
    atomic_inc(&span->refcount);
}

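// Editor's note on the reference-counting protocol used below: new_transient_span() returns
// with refcount 1, held by enode->file.writing_span (and handed over to the flushing
// machinery in start_flushing()); each in-flight ternfs_write_block() holds an extra
// reference, taken via hold_transient_span() in add_span_initiate() and dropped in
// write_block_finalize(). The last put frees the pages and, if the span had started
// flushing, releases file.flushing_span_sema so waiters in wait_flushed()/ternfs_file_flush()
// can proceed.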
static bool put_transient_span(struct ternfs_transient_span* span) {
    if (atomic_dec_return(&span->refcount) == 0) {
        BUG_ON(spin_is_locked(&span->lock));
        // free pages, adjust OOM score
        int num_pages = 0;
        // We are not necessarily holding the last reference to a page here.
        // We could have errored out before fully sending out the request.
        // In which case network stack could still hold a reference to it.
        // It is however fine to adjust the OOM score here as technically
        // the file system is now responsible for this memory and it's no
        // longer tied to the process lifetime.
#define FREE_PAGES(__pages) \
        while (!list_empty(__pages)) { \
            struct page* victim = lru_to_page(__pages); \
            list_del(&victim->lru); \
            put_page(victim); \
            num_pages++; \
        }
        FREE_PAGES(&span->pages);
        int b;
        for (b = 0; b < ternfs_blocks(span->parity); b++) {
            FREE_PAGES(&span->blocks[b]);
        }
#undef FREE_PAGES
#if (LINUX_VERSION_CODE < KERNEL_VERSION(6,2,0))
        atomic_long_add_return(-num_pages, &span->enode->file.mm->rss_stat.count[MM_FILEPAGES]);
#else
        percpu_counter_add(&span->enode->file.mm->rss_stat[MM_FILEPAGES], -num_pages);
#endif
        if (span->started_flushing) {
            up(&span->enode->file.flushing_span_sema);
        }
        // Free the span itself
        kmem_cache_free(ternfs_transient_span_cachep, span);
        return true;
    }
    return false;
}

static int add_span_initiate(struct ternfs_transient_span* span);

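// Editor's note on the retry path: when some block writes fail, retry_after_block_error()
// calls move_span_and_restart(), which appears to move the partially-initiated span out of
// the file being written and into a scratch transient file (created here under the name
// "move_span"), and then re-runs add_span_initiate() for a fresh attempt. The failure
// domains of the failed blocks are carried over as a blacklist (see add_span_initiate())
// so that different block services get picked, and at most ternfs_max_write_span_attempts
// attempts are made.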
static int move_span_and_restart(struct ternfs_transient_span* span) {
    struct ternfs_inode* enode = span->enode;

    // create new file
    u64 ino;
    u64 cookie;
    const char* name = "move_span";
    int err = ternfs_error_to_linux(ternfs_shard_create_file(
        (struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info,
        ternfs_inode_shard(enode->inode.i_ino), TERNFS_INODE_FILE, name, strlen(name), &ino, &cookie
    ));
    if (err) { goto out_err; }

    // move span
    err = ternfs_error_to_linux(ternfs_shard_move_span(
        (struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info,
        enode->inode.i_ino, span->offset, enode->file.cookie, ino, 0, cookie, span->written
    ));
    if (err) { goto out_err; }

    // go for it
    err = add_span_initiate(span);
    if (err) { goto out_err; }

    return 0;

out_err:
    ternfs_info("moving span failed %d", err);
    return err;
}

static int retry_after_block_error(struct ternfs_transient_span* span) {
    int i;
    int B = ternfs_blocks(span->parity);
    int err = 0;

    for (i = 0; i < B; i++) {
        if (span->blocks_errs[i]) {
            if (err) {
                ternfs_info("dropping err %d, since we already returned %d", span->blocks_errs[i], err);
            } else {
                err = span->blocks_errs[i];
            }
        }
    }
    BUG_ON(err == 0);

    if (span->attempts < ternfs_max_write_span_attempts) {
        ternfs_info("writing span failed with err %d after %d attempts, will try again", err, span->attempts);
        return move_span_and_restart(span);
    } else {
        ternfs_warn("writing span failed with err %d after %d attempts, giving up", err, span->attempts);
        return err;
    }
}

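// Editor's note: a successful block write returns a non-zero "proof" from the block service,
// and the proofs for all blocks are later presented to the shard via
// ternfs_shard_add_span_certify(). blocks_proofs[i] == 0 together with blocks_errs[i] == 0
// means "write still in flight" (see also write_block_done() below).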
static void write_block_finalize(struct ternfs_transient_span* span, int b, u64 proof, int block_write_err) {
    int i;
    int B = ternfs_blocks(span->parity);
    struct ternfs_inode* enode = span->enode;

    ternfs_debug("block=%d proof=%016llx err=%d", b, proof, block_write_err);

    bool finished_writing = true;
    bool any_block_errs = false;
    // Mark the current one as done/errored out, and check if we're done with all block
    // writing requests.
    spin_lock_bh(&span->lock);
    for (i = 0; i < B; i++) {
        BUG_ON(span->block_ids[i] == 0);
        if (i == b) {
            if (block_write_err == 0) {
                span->blocks_proofs[i] = proof;
            } else {
                span->blocks_errs[i] = block_write_err;
            }
        } else if (span->blocks_proofs[i] == 0 && span->blocks_errs[i] == 0) {
            // we're waiting on some other block proof.
            finished_writing = false;
        }
        any_block_errs = any_block_errs || (span->blocks_errs[i] != 0);
    }
    spin_unlock_bh(&span->lock);

    int err = 0;
    if (finished_writing && likely(!any_block_errs)) {
        // TODO just like in the other case, it would be nicer to have this async, since we're in a queue here.
        err = ternfs_error_to_linux(ternfs_shard_add_span_certify(
            (struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info,
            enode->inode.i_ino, enode->file.cookie, span->offset, span->parity,
            span->block_ids, span->blocks_proofs
        ));
    }

    if (finished_writing) {
        span->attempts++;
        if (unlikely(err == 0 && any_block_errs)) {
            // if we failed writing blocks, consider retrying
            err = retry_after_block_error(span);
        }
        atomic_cmpxchg(&enode->file.transient_err, 0, err);
    }
    // waiters will be woken up once the span refcount reaches 0
    put_transient_span(span);
}

static void write_block_done(void* data, struct list_head* pages, u64 block_id, u64 proof, int block_write_err) {
    // We need to be careful with locking here: the writer waits on the
    // flushing semaphore while holding the inode lock.

    // we use 0 to mean "no proof yet"
    if (unlikely(block_write_err == 0 && proof == 0)) {
        block_write_err = -EIO; // we need to get _very_ unlucky for this to happen!
    }

    struct ternfs_transient_span* span = (struct ternfs_transient_span*)data;

    ternfs_debug("block_id=%016llx proof=%016llx err=%d", block_id, proof, block_write_err);

    int B = ternfs_blocks(span->parity);

    // find the index that concerns us now
    int b;
    for (b = 0; b < B; b++) {
        if (span->block_ids[b] == block_id) {
            break;
        }
    }
    BUG_ON(b == B); // we must have found the block
    BUG_ON(span->blocks_proofs[b] != 0); // no proof or err already
    BUG_ON(span->blocks_errs[b] != 0);

    // re-acquire the pages
    list_replace_init(pages, &span->blocks[b]);

    write_block_finalize(span, b, proof, block_write_err);
}

static int add_span_initiate(struct ternfs_transient_span* span) {
    int i;
    struct ternfs_inode* enode = span->enode;
    int B = ternfs_blocks(span->parity);

    int cell_size = span->block_size / span->stripes;

    ternfs_debug("add span initiate");

    // fill in blacklist
    for (i = 0; i < B; i++) {
        if (span->blocks_errs[i]) {
            if (span->blacklist_length == TERNFS_MAX_BLACKLIST_LENGTH) {
                // it's ok to try again, we have a limit on the number of attempts
                ternfs_warn("span initiate blacklist full, dropping other errored out blocks");
                break;
            }
            static_assert(sizeof(span->blacklisted_failure_domains[span->blacklist_length]) == sizeof(span->failure_domains[i]));
            memcpy(span->blacklisted_failure_domains[span->blacklist_length], span->failure_domains[i], sizeof(span->failure_domains[i]));
            span->blacklist_length++;
        }
    }

    // reset stuff (including the bad failure domains themselves)
    memset(span->block_ids, 0, sizeof(span->block_ids));
    memset(span->failure_domains, 0, sizeof(span->failure_domains));
    memset(span->blocks_proofs, 0, sizeof(span->blocks_proofs));
    memset(span->blocks_errs, 0, sizeof(span->blocks_errs));

    struct ternfs_add_span_initiate_block blocks[TERNFS_MAX_BLOCKS];
    int err = ternfs_error_to_linux(ternfs_shard_add_span_initiate(
        (struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info,
        span, enode->inode.i_ino, enode->file.cookie, span->offset, span->written,
        span->span_crc, span->storage_class, span->parity, span->stripes, cell_size, span->cell_crcs,
        span->blacklist_length, span->blacklisted_failure_domains,
        blocks
    ));
    if (unlikely(err)) {
        // this means that we've failed to do the metadata (not the blocks), we can't recover from this
        return err;
    }

    // send all the requests. at this point we never fail, we instead defer
    // failure to the block requests. we intentionally let all the requests
    // reach completion (rather than erroring on the first one) to try to
    // clear out all bad failure domains in one go.
    for (i = 0; i < B; i++) {
        // first record the blocks being written. we need to do this upfront
        // so that we'll mark everything as finished only once we've gone
        // through all the requests.
        struct ternfs_add_span_initiate_block* block = &blocks[i];
        span->block_ids[i] = block->block_id;
        static_assert(sizeof(span->failure_domains[i]) == sizeof(block->failure_domain));
        memcpy(span->failure_domains[i], block->failure_domain, sizeof(block->failure_domain));
    }
    for (i = 0; i < B; i++) { // then start uploading
        struct ternfs_add_span_initiate_block* block = &blocks[i];
        u32 block_crc = 0;
        int s;
        for (s = 0; s < span->stripes; s++) {
            block_crc = ternfs_crc32c_append(block_crc, span->cell_crcs[s*B + i], cell_size);
        }
        struct ternfs_block_service bs = {
            .id = block->block_service_id,
            .ip1 = block->ip1,
            .port1 = block->port1,
            .ip2 = block->ip2,
            .port2 = block->port2,
        };
        hold_transient_span(span); // for the callback when the block is done
        int err = ternfs_write_block(
            &write_block_done,
            span,
            &bs,
            block->block_id,
            block->certificate,
            span->block_size,
            block_crc,
            &span->blocks[i]
        );
        if (err) { // "finish" immediately, we've errored out
            write_block_finalize(span, i, 0, err);
        }
    }

    // Now we wait.
    ternfs_debug("sent all block writing requests");
    return 0;
}

static struct page* alloc_write_page(struct ternfs_inode_file* file) {
    // The zeroing is to ensure that the blocks have trailing zeros,
    // and we also use this to just append zeros to the blocks without
    // copying. The former is definitely not needed, but anyway, simpler
    // for now.
    struct page* p = alloc_page(GFP_KERNEL | __GFP_ZERO);
    if (p == NULL) { return p; }
    // This contributes to the OOM score.
#if (LINUX_VERSION_CODE < KERNEL_VERSION(6,2,0))
    atomic_long_inc_return(&file->mm->rss_stat.count[MM_FILEPAGES]);
#else
    percpu_counter_inc(&file->mm->rss_stat[MM_FILEPAGES]);
#endif
    return p;
}

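// Editor's note: compute_span_parameters() picks the parity, stripe count, block size and
// storage class for a span. A worked example with made-up numbers: say the chosen parity
// gives D = 10 data blocks, the stripe policy targets 1 MiB stripes, PAGE_SIZE is 4096 and
// span_size is 6,000,000 bytes. Then S = min(max(1, 6000000/1048576), 15) = 5,
// block_size = ceil(6000000/10) = 600,000, cell_size = ceil(600000/5) = 120,000 rounded up
// to a page multiple = 122,880 (30 pages), and the final block_size = 122880*5 = 614,400.
// The 144,000 bytes of slack (block_size*D - span_size) are filled with trailing zero pages
// in write_blocks().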
static int compute_span_parameters(
    struct ternfs_policy* block_policy_p,
    struct ternfs_policy* span_policy_p,
    struct ternfs_policy* stripe_policy_p,
    struct ternfs_transient_span* span
) {
    u32 span_size = span->written;
    ternfs_debug("span_size=%u", span_size);

    // Easy case: inline span (or empty, which can happen for empty files)
    if (span_size < 256) {
        span->storage_class = span_size > 0 ? TERNFS_INLINE_STORAGE : TERNFS_EMPTY_STORAGE;
        span->block_size = 0;
        span->stripes = 0;
        span->parity = 0;
        return 0;
    }

    // fetch the bodies of the policies
    struct ternfs_policy_body block_policy;
    ternfs_get_policy_body(block_policy_p, &block_policy);
    struct ternfs_policy_body span_policy;
    ternfs_get_policy_body(span_policy_p, &span_policy);
    struct ternfs_policy_body stripe_policy;
    ternfs_get_policy_body(stripe_policy_p, &stripe_policy);

    // Pick parity
    int num_span_policies = ternfs_span_policy_len(span_policy.body, span_policy.len);
    BUG_ON(num_span_policies == 0);
    int i;
    u8 parity = 0;
    for (i = num_span_policies - 2; i >= 0; i--) {
        u32 max_size;
        u8 this_parity;
        ternfs_span_policy_get(span_policy.body, span_policy.len, i, &max_size, &this_parity);
        if (span_size > max_size) {
            i++;
            ternfs_span_policy_get(span_policy.body, span_policy.len, i, &max_size, &parity);
            break;
        }
    }
    if (parity == 0) {
        i = 0;
        u32 max_size;
        ternfs_span_policy_get(span_policy.body, span_policy.len, i, &max_size, &parity);
        BUG_ON(span_size > max_size);
    }
    if (unlikely(ternfs_data_blocks(parity) > TERNFS_MAX_DATA || ternfs_parity_blocks(parity) > TERNFS_MAX_PARITY)) {
        ternfs_info("got asked to write with parity (%d,%d), which is beyond max blocks (%d,%d)", ternfs_data_blocks(parity), ternfs_parity_blocks(parity), TERNFS_MAX_DATA, TERNFS_MAX_PARITY);
        return -EINVAL;
    }

    // For simplicity, we require all cells to be a multiple of PAGE_SIZE, which means that all
    // blocks/stripes/spans are multiples of PAGE_SIZE. We might also have some extra pages
    // at the end to have things line up correctly. This should only happen for big spans
    // anyway (small spans will just use mirroring).
    u32 target_stripe_size = ternfs_stripe_policy(stripe_policy.body, stripe_policy.len);
    int S;
    if (ternfs_data_blocks(parity) == 1) {
        // If we only have one data block, things are also pretty simple (just mirroring).
        // It doesn't make sense to have stripes in this case.
        S = 1;
    } else {
        // Otherwise compute striping etc.
        // We use target_stripe_size as an upper bound, for now (i.e. the stripe will always be smaller
        // than TargetStripeSize)
        S = min(max((u32)1, span_size / target_stripe_size), (u32)15);
    }
    int D = ternfs_data_blocks(parity);
    int block_size = (span_size + D - 1) / D;
    int cell_size = (block_size + S - 1) / S;
    // Round the cell size up to a multiple of the page size
    cell_size = (cell_size + PAGE_SIZE - 1) & PAGE_MASK;
    block_size = cell_size * S;

    // Pick storage class
    BUG_ON(span_size > TERNFS_MAX_BLOCK_SIZE);
    int num_block_policies = ternfs_block_policy_len(block_policy.body, block_policy.len);
    BUG_ON(num_block_policies == 0);
    u8 storage_class;
    for (i = num_block_policies-1; i >= 0; i--) {
        u32 min_size;
        ternfs_block_policy_get(block_policy.body, block_policy.len, i, &storage_class, &min_size);
        if (block_size > min_size) {
            break;
        }
    }

    span->storage_class = storage_class;
    span->block_size = block_size;
    span->stripes = S;
    span->parity = parity;

    return 0;
}

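// Editor's note: down_trylock() returns 0 when it acquires the semaphore and 1 when it would
// have had to sleep, hence the comparison against 1 in wait_flushed() below.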
// To be called with the inode lock. Acquires the flushing sema.
static int wait_flushed(struct ternfs_inode* enode, bool non_blocking) {
    BUG_ON(!inode_is_locked(&enode->inode));

    // make sure we're free to go
    if (down_trylock(&enode->file.flushing_span_sema) == 1) {
        if (non_blocking) {
            return -EAGAIN;
        }
        down(&enode->file.flushing_span_sema);
    }

    return 0;
}

static int write_blocks(struct ternfs_transient_span* span) {
    int i, j;
    int err = 0;

    struct ternfs_inode* enode = span->enode;

    // transient file is already borked
    err = atomic_read(&enode->file.transient_err);
    if (err) { goto out; }

    int D = ternfs_data_blocks(span->parity);
    int B = ternfs_blocks(span->parity);
    int S = span->stripes;

    ternfs_debug("size=%d, D=%d, B=%d, S=%d", span->written, D, B, S);
    loff_t offset = enode->inode.i_size - span->written;
    trace_eggsfs_span_flush_enter(
        enode->inode.i_ino, offset, span->written, span->parity, span->stripes, span->storage_class
    );
    // Below, remember that the cell size is always a multiple of PAGE_SIZE.
    u32 pages_per_block = span->block_size/PAGE_SIZE;
    u32 data_pages = pages_per_block*D;
    u32 current_data_pages = (span->written + PAGE_SIZE - 1) / PAGE_SIZE;
    ternfs_debug("allocating padding pages_per_block=%u total_pages=%u current_pages=%u", pages_per_block, data_pages, current_data_pages);
    // This can happen if we have trailing zeros because of how we arrange the
    // stripes.
    for (i = current_data_pages; i < data_pages; i++) {
        struct page* zpage = alloc_write_page(&enode->file);
        if (zpage == NULL) {
            err = -ENOMEM;
            // we'd have to back out the pages we allocated so far and i can't be bothered
            goto out;
        }
        list_add_tail(&zpage->lru, &span->pages);
    }
    int cell_size = span->block_size / S;
    ternfs_debug("arranging pages");
    // First, we arrange the data blocks appropriately.
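    // Editor's note on the layout below: data pages are dealt out cell by cell, round-robin
    // across the D data blocks. With cell_size = 2 pages and D = 3 (made-up numbers), the
    // linear page list p0 p1 p2 p3 p4 p5 p6 ... becomes
    //   block 0: p0 p1 | p6 p7   | ...
    //   block 1: p2 p3 | p8 p9   | ...
    //   block 2: p4 p5 | p10 p11 | ...
    // i.e. stripe s of the span is the concatenation of cell s of each data block.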
    {
        int cell_offset = 0;
        int block = 0;
        struct page* page;
        struct page* tmp;
        list_for_each_entry_safe(page, tmp, &span->pages, lru) {
            list_del(&page->lru);
            list_add_tail(&page->lru, &span->blocks[block]);
            cell_offset += PAGE_SIZE;
            if (cell_offset >= cell_size) {
                block++;
                block = block % D;
                cell_offset = 0;
            }
        }
    }
    // Then we allocate the parity blocks
    for (i = D; i < B; i++) {
        for (j = 0; j < pages_per_block; j++) {
            struct page* ppage = alloc_write_page(&enode->file);
            if (ppage == NULL) { err = -ENOMEM; goto out; }
            list_add_tail(&ppage->lru, &span->blocks[i]);
        }
    }
    ternfs_debug("computing parity");
    // Then, we compute the parity blocks and the CRCs for each block.
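    // Editor's note: list_rotate_left() below moves the just-processed page to the tail of
    // each block's list, so on iteration j list_first_entry() yields page j of every block;
    // after pages_per_block iterations each list is back in its original order. The CRC of
    // cell (stripe s, block i) accumulates in cell_crcs[s*B + i].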
    {
        char* pages_bufs[TERNFS_MAX_BLOCKS];
        kernel_fpu_begin();
        // traverse each block, page by page, rotating as we go along
        for (j = 0; j < pages_per_block; j++) {
            // get buffer pointers
            int s = (j*PAGE_SIZE)/cell_size;
            for (i = 0; i < B; i++) {
                pages_bufs[i] = kmap_atomic(list_first_entry(&span->blocks[i], struct page, lru));
            }
            // compute parity
            err = ternfs_compute_parity(span->parity, PAGE_SIZE, (const char**)&pages_bufs[0], &pages_bufs[D]);
            // compute CRCs
            for (i = 0; i < B; i++) {
                span->cell_crcs[s*B + i] = ternfs_crc32c(span->cell_crcs[s*B + i], pages_bufs[i], PAGE_SIZE);
            }
            for (i = 0; i < B; i++) {
                kunmap_atomic(pages_bufs[i]);
                list_rotate_left(&span->blocks[i]);
            }
            // unmap pages before exiting
            if (err) { goto out_err_fpu; }
        }
        // Compute overall span crc
        int s;
        for (s = 0; s < S; s++) {
            for (i = 0; i < D; i++) {
                span->span_crc = ternfs_crc32c_append(span->span_crc, span->cell_crcs[s*B + i], cell_size);
            }
        }
        span->span_crc = ternfs_crc32c_zero_extend(span->span_crc, (int)span->written - (int)(span->block_size*D));
        kernel_fpu_end();
    }
    // Start the first attempt
    // Now we need to init the span (the callback will actually send the requests)
    // TODO would be better to have this async, and free up the queue for other stuff,
    // and also not block in this call which might be in a non-blocking write.
    // TODO Isn't this called on file write, and therefore not taking up time in
    // the queue? I think the comment above is partially outdated.
    err = add_span_initiate(span);

out:
    if (err) { // we've failed before trying to write blocks, terminate immediately
        atomic_cmpxchg(&enode->file.transient_err, 0, err); // store error if we have one
    }
    return err;

out_err_fpu:
    kernel_fpu_end();
    goto out;
}

// To be called with the inode lock.
static int start_flushing(struct ternfs_inode* enode, bool non_blocking) {
    BUG_ON(!inode_is_locked(&enode->inode));

    // Wait for the existing thing to be flushed
    int err = wait_flushed(enode, non_blocking);
    if (err < 0) { return err; }

    // Now turn the writing span into a flushing span
    struct ternfs_transient_span* span = enode->file.writing_span;

    if (span == NULL) {
        up(&enode->file.flushing_span_sema);
        return 0;
    }
    enode->file.writing_span = NULL;
    // We should hold the only reference
    BUG_ON(atomic_read(&span->refcount) != 1);
    // We mark the span as started flushing so we know to wake up waiters when the last reference goes away.
    // At that point the operation is done, in either failure or success.
    span->started_flushing = true;
    err = compute_span_parameters(
        enode->block_policy, enode->span_policy, enode->stripe_policy, span
    );
    if (err) {
        goto out;
    }

    // Here (and in `write_blocks`) we might block even if we're non
    // blocking, since we don't have async metadata requests yet, which is
    // a bit unfortunate.

    if (span->storage_class == TERNFS_INLINE_STORAGE) {
        // this is an easy one, just add the inline span
        struct page* page = list_first_entry(&span->pages, struct page, lru);
        char* data = kmap(page);
        ternfs_debug("adding inline span of length %d", span->written);
        err = ternfs_error_to_linux(ternfs_shard_add_inline_span(
            (struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, enode->file.cookie,
            span->offset, span->written, data, span->written
        ));
        kunmap(page);
    } else if (span->storage_class != TERNFS_EMPTY_STORAGE) {
        // The real deal, we need to write blocks. Note that
        // this guy is tasked with releasing the flushing semaphore
        // if things fail (otherwise we're not done yet).
        err = write_blocks(span);
    }

out:
    if (err) {
        ternfs_info("attempting to flush failed permanently, err=%d ino=%016lx", err, enode->inode.i_ino);
        atomic_cmpxchg(&enode->file.transient_err, 0, err);
    }
    // we don't need this anymore (the requests might though)
    put_transient_span(span);
    return err;
}

// To be called with the inode lock.
//
// This accepts a NULL `from`, in which case zeros will be written.
ssize_t ternfs_file_write_internal(struct ternfs_inode* enode, int flags, loff_t* ppos, struct iov_iter* from, size_t count) {
    BUG_ON(!inode_is_locked(&enode->inode));

    int err;
    if (flags & IOCB_DIRECT) { return -ENOSYS; }

    loff_t ppos_before = *ppos;

    ternfs_debug("enode=%p, ino=%lu, count=%lu, size=%lld, *ppos=%lld status=%d", enode, enode->inode.i_ino, count, enode->inode.i_size, *ppos, enode->file.status);

    if (enode->file.status != TERNFS_FILE_STATUS_WRITING) {
        err = -EROFS;
        goto out_err;
    }

    // permanent error
    err = atomic_read(&enode->file.transient_err);
    if (err) { goto out_err; }

    // check that we're writing at the right offset
    if (*ppos != enode->inode.i_size) {
        err = -EINVAL;
        goto out_err;
    }

    // Precompute the bound for spans
    u32 max_span_size;
    {
        struct ternfs_policy_body span_policy;
        ternfs_get_policy_body(enode->span_policy, &span_policy);
        u8 p;
        ternfs_span_policy_last(span_policy.body, span_policy.len, &max_span_size, &p);
    }
    BUG_ON(max_span_size%PAGE_SIZE != 0); // needed for the "new span" logic paired with the page copying below, we could avoid it

    // Get the span we're currently writing to
    if (enode->file.writing_span == NULL) {
        enode->file.writing_span = new_transient_span(enode, enode->inode.i_size);
        if (enode->file.writing_span == NULL) {
            err = -ENOMEM;
            goto out_err;
        }
    }
    struct ternfs_transient_span* span = enode->file.writing_span;

    // Update mtime
    struct timespec64 mtime;
    ktime_get_real_ts64(&mtime);
    inode_set_mtime_to_ts(&enode->inode, mtime);

    // We now start writing into the span, as much as we can anyway
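    // Editor's note: while gathering data, page->index is reused as the write cursor within
    // the last page of the span, wrapping modulo PAGE_SIZE; index == 0 therefore means the
    // page is either brand new or completely full, which is what triggers allocating a fresh
    // page below.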
    while (span->written < max_span_size && count) {
        // grab the page to write to
        struct page* page = list_empty(&span->pages) ? NULL : list_last_entry(&span->pages, struct page, lru);
        if (page == NULL || page->index == 0) { // we're the first ones to get here, or we need to switch to the next one
            BUG_ON(page != NULL && page->index > PAGE_SIZE);
            page = alloc_write_page(&enode->file);
            if (!page) {
                err = -ENOMEM;
                goto out_err; // we haven't corrupted anything, so no need to make it permanent
            }
            page->index = 0;
            list_add_tail(&page->lru, &span->pages);
        }

        // copy stuff into page
        int ret;
        if (likely(from)) {
            ret = copy_page_from_iter(page, page->index, PAGE_SIZE - page->index, from);
        } else {
            ret = min(count, PAGE_SIZE - page->index);
        }
        if (ret < 0) { err = ret; goto out_err_permanent; }
        ternfs_debug("written %d to page %p", ret, page);
        enode->inode.i_size += ret;
        span->written += ret;
        page->index = (page->index + ret) % PAGE_SIZE;
        *ppos += ret;
        count -= ret;
    }

    if (span->written >= max_span_size) { // we need to start flushing the span
        // this might block even if it says nonblock, which is unfortunate.
        err = start_flushing(enode, !!(flags&IOCB_NOWAIT));
        if (err < 0) { goto out_err_permanent; }
    }

    int written;
out:
    written = *ppos - ppos_before;
    ternfs_debug("written=%d", written);
    return written;

out_err_permanent:
    atomic_cmpxchg(&enode->file.transient_err, 0, err);
out_err:
    ternfs_debug("err=%d", err);
    if (err == -EAGAIN && *ppos > ppos_before) {
        // We can just return what we've already written. In fact, it's important
        // that we do so, otherwise repeated calls might fail because they'd be
        // with the wrong offset.
        goto out;
    }
    return err;
}

ssize_t ternfs_file_write(struct ternfs_inode* enode, int flags, loff_t* ppos, struct iov_iter* from) {
    return ternfs_file_write_internal(enode, flags, ppos, from, iov_iter_count(from));
}

static ssize_t file_write_iter(struct kiocb* iocb, struct iov_iter* from) {
    struct file* file = iocb->ki_filp;
    struct inode* inode = file->f_inode;
    struct ternfs_inode* enode = TERNFS_I(inode);

    if (!inode_trylock(inode)) {
        if (iocb->ki_flags & IOCB_NOWAIT) {
            return -EAGAIN;
        }
        inode_lock(inode);
    }
    ssize_t res = ternfs_file_write(enode, iocb->ki_flags, &iocb->ki_pos, from);
    inode_unlock(inode);

    return res;
}

int ternfs_file_flush(struct ternfs_inode* enode, struct dentry* dentry) {
    inode_lock(&enode->inode);

    int err = 0;

    // Not writing: there's nothing to do, files are immutable
    if (enode->file.status != TERNFS_FILE_STATUS_WRITING) {
        ternfs_debug("status=%d, won't flush", enode->file.status);
        goto out_early;
    }

    // We are in another process, skip
    if (enode->file.owner != current->group_leader) {
        ternfs_debug("owner=%p != group_leader=%p, won't flush", enode->file.owner, current->group_leader);
        goto out_early;
    }

    bool file_is_alive_and_flushing = false;

    // if we've errored out already, just exit
    err = atomic_read(&enode->file.transient_err);
    if (err < 0) { goto out; }

    // If the owner doesn't have an mm anymore, it means that the file
    // is being torn down after the process has been terminated. In that case
    // we shouldn't even link the file.
    if (current->mm == NULL) {
        // abort everything
        atomic_cmpxchg(&enode->file.transient_err, 0, -EIO);
        goto out;
    }

    // OK, in this case we are indeed linking it. we need to do two things: first wait
    // for the span which is being flushed to be flushed, and then flush the current span,
    // if we have one.
    err = start_flushing(enode, false);
    if (err < 0) { goto out; }

    down(&enode->file.flushing_span_sema);
    file_is_alive_and_flushing = true;

    // the requests might have failed
    err = atomic_read(&enode->file.transient_err);
    if (err < 0) { goto out; }

    // now we actually need to link
    ternfs_debug("linking file");
    err = ternfs_error_to_linux(ternfs_shard_link_file(
        (struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino,
        enode->file.cookie, dentry->d_parent->d_inode->i_ino, dentry->d_name.name, dentry->d_name.len,
        &enode->edge_creation_time
    ));
    if (err < 0) { goto out; }

    // finalize mtime
    inode_set_mtime(&enode->inode, enode->edge_creation_time / 1000000000, enode->edge_creation_time % 1000000000);
    inode_set_ctime(&enode->inode, enode->edge_creation_time / 1000000000, enode->edge_creation_time % 1000000000);

    // Switch the file to a normal file
    enode->file.status = TERNFS_FILE_STATUS_READING;
    smp_store_release(&enode->getattr_expiry, 0);

    // expire the directory listing -- we know for a fact that it
    // is wrong, it now contains this file.
    {
        struct dentry* parent = dget_parent(dentry);
        WRITE_ONCE(TERNFS_I(d_inode(parent))->dir.mtime_expiry, 0);
        dput(parent);
    }

out:
    if (err) {
        atomic_cmpxchg(&enode->file.transient_err, 0, err);
    }
    if (!file_is_alive_and_flushing) {
        // The file is dead, we want to make sure that any in-flight flushings
        // are gone. We might enter this function multiple times with a dead
        // file: imagine for instance a process dying with many threads having
        // the reference to the same file. This is why we also release the
        // semaphore afterwards, so that after the first one that cleans up
        // the others will not get stuck. So this section acts as a flushing
        // barrier of sorts. Since the file is dead this should not cause problems
        // (e.g. we should not get genuine flushes due to writes which then conflict
        // with this logic).
        down(&enode->file.flushing_span_sema);
        up(&enode->file.flushing_span_sema);
    }
    // There are cases where we decide not to flush; we still need to free the writing span
    if (enode->file.writing_span != NULL) {
        // if writing_span is not NULL we should be the one holding the last reference to it
        BUG_ON(!put_transient_span(enode->file.writing_span));
        enode->file.writing_span = NULL;
    }
    // after we cleared it above, it should no longer be set
    BUG_ON(enode->file.writing_span != NULL);
    if (enode->file.mm) {
        mmdrop(enode->file.mm);
    }
    enode->file.mm = NULL;
    inode_unlock(&enode->inode);
    return err;

out_early:
    inode_unlock(&enode->inode);
    return err;
}

static int file_flush_internal(struct file* filp, fl_owner_t id) { // can we get a write while this is in progress?
    struct ternfs_inode* enode = TERNFS_I(filp->f_inode);
    struct dentry* dentry = filp->f_path.dentry;
    return ternfs_file_flush(enode, dentry);
}

static ssize_t file_read_iter(struct kiocb* iocb, struct iov_iter* to) {
    struct file* file = iocb->ki_filp;
    struct inode* inode = file->f_inode;
    struct ternfs_inode* enode = TERNFS_I(inode);

    if (unlikely(iocb->ki_flags & IOCB_DIRECT)) { return -ENOSYS; }

    // make sure we have size information
    int err = ternfs_do_getattr(enode, ATTR_CACHE_NORM_TIMEOUT);
    if (err) { return err; }

    return generic_file_read_iter(iocb, to);
}

void ternfs_link_destructor(void* buf) {
    kfree(buf);
}

char* ternfs_read_link(struct ternfs_inode* enode) {
    ternfs_debug("ino=%016lx", enode->inode.i_ino);

    BUG_ON(ternfs_inode_type(enode->inode.i_ino) != TERNFS_INODE_SYMLINK);

    // make sure we have size information
    int err = ternfs_do_getattr(enode, ATTR_CACHE_NORM_TIMEOUT);
    if (err) { return ERR_PTR(err); }

    BUG_ON(enode->inode.i_size > PAGE_SIZE); // for simplicity...
    size_t size = enode->inode.i_size;

    ternfs_debug("size=%lu", size);

    struct ternfs_span* span = ternfs_get_span((struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, &enode->file.spans, 0);
    if (span == NULL) {
        ternfs_debug("got no span, empty file?");
        return "";
    }
    if (IS_ERR(span)) { return ERR_CAST(span); }

    BUG_ON(span->end-span->start != size); // again, for simplicity

    char* buf = kmalloc(size+1, GFP_KERNEL);
    if (buf == NULL) { err = -ENOMEM; goto out_err; }

    if (span->storage_class == TERNFS_INLINE_STORAGE) {
        memcpy(buf, TERNFS_INLINE_SPAN(span)->body, size);
    } else {
        struct page* page;
        struct ternfs_block_span* block_span = TERNFS_BLOCK_SPAN(span);
        LIST_HEAD(pages);
        page = alloc_page(GFP_KERNEL);
        if (page == NULL) { // alloc_page returns NULL on failure, not an ERR_PTR
            err = -ENOMEM;
            goto out_err;
        }
        page->index = 0;
        list_add_tail(&page->lru, &pages);
        LIST_HEAD(extra_pages);
        err = ternfs_span_get_pages(block_span, enode->inode.i_mapping, &pages, 1, &extra_pages);
        if (err) goto out_err;
        list_del(&page->lru);
        put_pages_list(&pages);
        put_pages_list(&extra_pages);
        char* page_buf = kmap(page);
        memcpy(buf, page_buf, size);
        kunmap(page);
    }
    buf[size] = '\0';

    ternfs_debug("link %*pE", (int)size, buf);

    return buf;

out_err:
    ternfs_debug("get_link err=%d", err);
    return ERR_PTR(err);
}

static loff_t file_lseek(struct file *file, loff_t offset, int whence) {
    // When seeking a file we're reading, we just do as normal.
    // When seeking a file we're writing, we only allow seeking forward,
    // which is the same as writing zeros past the end.
    struct inode* inode = file->f_inode;
    struct ternfs_inode* enode = TERNFS_I(inode);

    if (likely(smp_load_acquire(&enode->file.status) == TERNFS_FILE_STATUS_READING)) {
        // make sure we have size information
        int err = ternfs_do_getattr(enode, ATTR_CACHE_NORM_TIMEOUT);
        if (err) { return err; }
        return generic_file_llseek(file, offset, whence);
    }

    inode_lock(inode);

    loff_t ppos = file->f_pos;
    // We only support ppos != i_size when just reading the current position.
    if (ppos != enode->inode.i_size && (offset != 0 || whence != SEEK_CUR)) {
        goto out_err;
    }

    switch (whence) {
    case SEEK_SET:
        if (offset < ppos) { goto out_err; }
        break;
    case SEEK_CUR:
    case SEEK_END:
        if (offset < 0) { goto out_err; }
        offset = ppos + offset;
        break;
    default:
        goto out_err;
    }
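    // Editor's note: e.g. an lseek(fd, 1 << 20, SEEK_CUR) on a file that is being written
    // appends a mebibyte of zeros by calling ternfs_file_write_internal() with from == NULL
    // (in chunks, since each call writes at most up to the current span boundary) and only
    // then returns the new offset; if a zero-write fails, its error is returned in place of
    // the offset.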
    while (ppos < offset) {
        ssize_t written = ternfs_file_write_internal(enode, 0, &ppos, NULL, offset - ppos);
        if (unlikely(written < 0)) {
            offset = written;
            goto out;
        }
        file->f_pos = ppos;
        file->f_version = 0; // what's this for?
    }

out:
    inode_unlock(inode);
    return offset;

out_err:
    offset = -EINVAL;
    goto out;
}

// Files are not visible until they're fully persisted, so fsync is unnecessary.
static int file_fsync(struct file* f, loff_t start, loff_t end, int datasync) {
    return 0;
}

const struct file_operations ternfs_file_operations = {
    .open = file_open,
    .read_iter = file_read_iter,
    .write_iter = file_write_iter,
    .flush = file_flush_internal,
    .llseek = file_lseek,
    .mmap = generic_file_readonly_mmap,
    .fsync = file_fsync,
};

static void process_file_pages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages) {
    struct page* page;
    int err;
    for (;;) {
        page = list_first_entry_or_null(pages, struct page, lru);
        if (page == NULL) {
            break;
        }
        list_del(&page->lru);
        u32 refct = atomic_read(&page->_refcount);
        if (refct != 1) ternfs_warn("page refcount=%d, expected 1 at index %ld, pages=%d", refct, page->index, nr_pages);
        err = add_to_page_cache_lru(page,
            mapping,
            page->index,
            readahead_gfp_mask(mapping));
        if (err != 0) {
            ternfs_debug("failed adding page at index=%ld, err=%d", page->index, err);
        } else {
            unlock_page(page);
            SetPageUptodate(page);
        }
        put_page(page);
    }
}

static int file_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) {
    int err = 0;

    struct inode* inode = file_inode(filp);
    struct ternfs_inode* enode = TERNFS_I(inode);
    loff_t off = page_offset(lru_to_page(pages));
    ternfs_debug("enode=%p, ino=%ld, offset=%lld, nr_pages=%u", enode, inode->i_ino, off, nr_pages);

    struct ternfs_span* span = NULL;
    struct page *page;
    LIST_HEAD(extra_pages);

    if (off >= inode->i_size) { // out of bounds
        list_for_each_entry(page, pages, lru) {
            zero_user_segment(page, 0, PAGE_SIZE);
        }
        goto out;
    }

    span = ternfs_get_span((struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, &enode->file.spans, off);
    if (IS_ERR(span)) {
        ternfs_warn("ternfs_get_span failed at pos %llu", off);
        err = PTR_ERR(span);
        goto out_err;
    }

    if (span->storage_class == TERNFS_INLINE_STORAGE) {
        struct ternfs_inline_span* inline_span = TERNFS_INLINE_SPAN(span);
        page = lru_to_page(pages);
        if (page == NULL) {
            err = -EIO;
            goto out_err;
        }
        list_del(&page->lru);
        // the rest is past the end of the file, just drop it
        put_pages_list(pages);
        // put the page back
        list_add_tail(&page->lru, pages);
        BUG_ON(page_offset(page) != span->start);
        char* dst = kmap_atomic(page);
        memcpy(dst, inline_span->body, inline_span->len);
        kunmap_atomic(dst);
    } else {
        struct ternfs_block_span* block_span = TERNFS_BLOCK_SPAN(span);
        err = ternfs_span_get_pages(block_span, mapping, pages, nr_pages, &extra_pages);
        if (err) {
            ternfs_warn("readahead of %d pages at off=%lld in file %016lx failed with error %d", nr_pages, off, enode->inode.i_ino, err);
            put_pages_list(&extra_pages);
            goto out_err;
        }
    }
out:
    process_file_pages(mapping, pages, nr_pages);
    process_file_pages(mapping, &extra_pages, 0);
    return 0;

out_err:
    put_pages_list(pages);
    return err;
}

static int file_readpage(struct file* filp, struct page* page) {
    struct inode* inode = file_inode(filp);
    struct ternfs_inode* enode = TERNFS_I(inode);
    u64 off = page_offset(page);
    int err = 0;

    struct ternfs_span* span = NULL;

    struct timespec64 start_ts = ns_to_timespec64(ktime_get_real_ns());
    struct timespec64 last_span_refresh_ts = ns_to_timespec64(0);

    if (off >= inode->i_size) { // out of bounds
        zero_user_segment(page, 0, PAGE_SIZE);
        goto out;
    }

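    // Editor's note: on a failed block read we retry from here until
    // ternfs_file_io_timeout_sec (one day by default) has elapsed; every
    // ternfs_file_io_retry_refresh_span_interval_sec (one minute by default) we drop the
    // cached span so the next attempt refetches it, picking up any changes to the span
    // structure or to the block service flags in the meantime.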
retry:
    span = ternfs_get_span((struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, &enode->file.spans, off);
    if (IS_ERR(span)) {
        ternfs_debug("ternfs_get_span failed at pos %llu", off);
        err = PTR_ERR(span);
        goto out;
    }

    if (span->start%PAGE_SIZE != 0) {
        ternfs_warn("span start is not a multiple of the page size %llu", span->start);
        span = NULL;
        err = -EIO;
        goto out;
    }

    u64 span_offset = off - span->start;

    if (span->storage_class == TERNFS_INLINE_STORAGE) {
        struct ternfs_inline_span* inline_span = TERNFS_INLINE_SPAN(span);
        size_t to_copy = inline_span->len - span_offset;
        BUG_ON(to_copy > PAGE_SIZE);
        char* dst = kmap_atomic(page);
        memcpy(dst, inline_span->body + span_offset, to_copy);
        kunmap_atomic(dst);
    } else {
        struct ternfs_block_span* block_span = TERNFS_BLOCK_SPAN(span);
        if (block_span->cell_size%PAGE_SIZE != 0) {
            ternfs_warn("cell size is not a multiple of the page size %u", block_span->cell_size);
            err = -EIO;
            goto out;
        }

        struct page* stripe_page = NULL;

        LIST_HEAD(pages);
        stripe_page = alloc_page(GFP_KERNEL);
        if (!stripe_page) {
            err = -ENOMEM;
            goto out;
        }
        stripe_page->index = page->index;
        list_add_tail(&stripe_page->lru, &pages);
        LIST_HEAD(extra_pages);
        err = ternfs_span_get_pages(block_span, filp->f_mapping, &pages, 1, &extra_pages);
        process_file_pages(filp->f_mapping, &extra_pages, 0);
        if (err) {
            put_pages_list(&pages);
            struct timespec64 end_ts = ns_to_timespec64(ktime_get_real_ns());
            ternfs_warn("reading page %lld in file %016lx failed with error %d", off, enode->inode.i_ino, err);
            if ((end_ts.tv_sec - start_ts.tv_sec) > ternfs_file_io_timeout_sec) {
                ternfs_warn("io timeout while reading page at off=%lld in file %016lx last error %d", off, enode->inode.i_ino, err);
                goto out;
            }
            if ((end_ts.tv_sec - last_span_refresh_ts.tv_sec) > ternfs_file_io_retry_refresh_span_interval_sec) {
                ternfs_info("refreshing span %lld of file %016lx as the span structure or block service flags could have changed in the meantime", off, enode->inode.i_ino);
                ternfs_unlink_span(&enode->file.spans, span);
                last_span_refresh_ts = end_ts;
            }
            goto retry;
        }
        list_del(&stripe_page->lru);

        char* to_ptr = kmap_atomic(page);
        char* from_ptr = kmap_atomic(stripe_page);
        memcpy(to_ptr, from_ptr, PAGE_SIZE);
        kunmap_atomic(to_ptr);
        kunmap_atomic(from_ptr);

        BUG_ON(atomic_read(&stripe_page->_refcount) != 1);
        put_page(stripe_page);
    }

out:
    if (unlikely(err)) {
        SetPageError(page);
    } else {
        SetPageUptodate(page);
    }

    unlock_page(page);
    return err;
}

#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 18, 0))
static void file_readahead(struct readahead_control *rac)
{
    struct file *filp = rac->file;
    struct inode *inode = file_inode(filp);
    struct ternfs_inode *enode = TERNFS_I(inode);
    struct page *page;
    pgoff_t index;
    loff_t off;
    unsigned nr_pages = readahead_count(rac);

    if (nr_pages == 0)
        return;

    // make sure we have size information
    int err = ternfs_do_getattr(enode, ATTR_CACHE_NORM_TIMEOUT);
    if (err) { return; }

    index = readahead_index(rac);
    off = index << PAGE_SHIFT;

    ternfs_debug("enode=%p, ino=%ld, index=%lu, offset=%lld, nr_pages=%u",
        enode, inode->i_ino, index, off, nr_pages);

    struct ternfs_span *span = NULL;
    LIST_HEAD(pages);
    LIST_HEAD(extra_pages);
    unsigned pages_allocated = 0;

    // Check if we're out of bounds
    if (off >= inode->i_size) {
        // Allocate and zero-fill pages for out-of-bounds requests
        unsigned i;
        for (i = 0; i < nr_pages; i++) {
            page = alloc_page(readahead_gfp_mask(rac->mapping));
            if (!page)
                break;

            page->index = index + i;
            zero_user_segment(page, 0, PAGE_SIZE);
            list_add(&page->lru, &pages);
            pages_allocated++;
        }
        goto out_process;
    }

    // Get the span for this offset
    span = ternfs_get_span((struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, &enode->file.spans, off);
    if (IS_ERR(span)) {
        ternfs_warn("ternfs_get_span failed at pos %llu", off);
        err = PTR_ERR(span);
        goto out_err;
    }

    // Allocate pages for readahead
    unsigned i;
    for (i = 0; i < nr_pages; i++) {
        page = alloc_page(readahead_gfp_mask(rac->mapping));
        if (!page)
            break;

        page->index = index + i;
        list_add(&page->lru, &pages);
        pages_allocated++;
    }

    if (pages_allocated == 0)
        return;

    // Process based on span type
    if (span->storage_class == TERNFS_INLINE_STORAGE) {
        struct ternfs_inline_span *inline_span = TERNFS_INLINE_SPAN(span);
        u64 span_start = span->start;

        list_for_each_entry(page, &pages, lru) {
            u64 page_off = page_offset(page);
            if (page_off >= inode->i_size) {
                zero_user_segment(page, 0, PAGE_SIZE);
                continue;
            }

            u64 span_offset = page_off - span_start;
            if (span_offset >= inline_span->len) {
                zero_user_segment(page, 0, PAGE_SIZE);
                continue;
            }

            size_t to_copy = min((size_t)(inline_span->len - span_offset), (size_t)PAGE_SIZE);
            char *dst = kmap_atomic(page);
            memcpy(dst, inline_span->body + span_offset, to_copy);
            if (to_copy < PAGE_SIZE)
                memset(dst + to_copy, 0, PAGE_SIZE - to_copy);
            kunmap_atomic(dst);
        }
    } else {
        struct ternfs_block_span *block_span = TERNFS_BLOCK_SPAN(span);
        err = ternfs_span_get_pages(block_span, rac->mapping, &pages, pages_allocated, &extra_pages);
        if (err) {
            ternfs_warn("readahead of %u pages at off=%lld in file %016lx failed with error %d",
                pages_allocated, off, enode->inode.i_ino, err);
            put_pages_list(&extra_pages);
            goto out_err;
        }
    }

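    // Editor's note: readahead is double-buffered here: data is first fetched into the
    // private `pages` list above, and below it is copied page-by-page into the pages that
    // the readahead_control hands out via readahead_page(); the private pages are then
    // released with put_pages_list().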
out_process:
    // Process the main pages
    list_for_each_entry(page, &pages, lru) {
        struct page* rac_page = readahead_page(rac);
        void* dest = kmap_atomic(rac_page);
        void* src = kmap_atomic(page);
        memcpy(dest, src, PAGE_SIZE);
        kunmap_atomic(src);
        kunmap_atomic(dest);
        SetPageUptodate(rac_page);
        unlock_page(rac_page);
        put_page(rac_page);
    }
    // Process any extra pages
    process_file_pages(rac->mapping, &extra_pages, 0);
    put_pages_list(&pages);
    return;

out_err:
    put_pages_list(&pages);
    return;
}
#endif

#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 18, 0))
static int file_readfolio(struct file *filp, struct folio *folio) {
    BUG_ON(folio_nr_pages(folio) != 1);
    return file_readpage(filp, folio_page(folio, 0));
}
#endif

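// Editor's note: ->readpages was replaced by ->readahead and ->readpage by ->read_folio in
// the kernel versions checked below; both variants are kept so the module builds against
// both older and newer kernels.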
const struct address_space_operations ternfs_mmap_operations = {
#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 18, 0))
    .readpages = file_readpages,
#else
    .readahead = file_readahead,
#endif
#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 19, 0))
    .readpage = file_readpage,
#else
    .read_folio = file_readfolio,
#endif
};

int __init ternfs_file_init(void) {
    ternfs_transient_span_cachep = kmem_cache_create(
        "ternfs_transient_span_cache",
        sizeof(struct ternfs_transient_span),
        0,
        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
        &init_transient_span
    );
    if (!ternfs_transient_span_cachep) { return -ENOMEM; }
    return 0;
}

void __cold ternfs_file_exit(void) {
    ternfs_debug("file exit");
    // TODO: handle case where there still are requests in flight.
    kmem_cache_destroy(ternfs_transient_span_cachep);
}