Files
ternfs-XTXMarkets/kmod/file.c
T
2025-09-05 12:08:23 +00:00

1487 lines
53 KiB
C

#include "file.h"
#include <linux/uio.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include "bincode.h"
#include "inode.h"
#include "log.h"
#include "metadata.h"
#include "err.h"
#include "rs.h"
#include "block.h"
#include "crc.h"
#include "trace.h"
#include "span.h"
#include "wq.h"
#include "bincode.h"
#include "dir.h"
#include "inode_compat.h"
// File-level tunables. NOTE(review): presumably exposed as sysctls/module parameters
// elsewhere -- confirm; only their use within this file is described here.
//
// Minimum age (in seconds) the cached atime must have before file_open pushes a new
// atime to the shard. With 0, the age check in file_open never short-circuits.
unsigned ternfs_atime_update_interval_sec = 0;
// NOTE(review): not referenced in the part of the file reviewed here; by its name an
// overall timeout for file IO, in seconds -- confirm against its users.
unsigned ternfs_file_io_timeout_sec = 86400;
unsigned ternfs_file_io_retry_refresh_span_interval_sec = 60; // 1 minute interval for refreshing spans during IO retries
// Upper bound on span->attempts before retry_after_block_error gives up on a span.
unsigned ternfs_max_write_span_attempts = 5;
// Slab cache all transient spans are allocated from and returned to.
static struct kmem_cache* ternfs_transient_span_cachep;
// A span of a file that is still being written. It first accumulates the pages
// coming from write calls (`pages`), and once flushing starts it carries the
// per-block page lists and the bookkeeping for the block write requests.
// Lifetime is governed by `refcount` (see hold_transient_span/put_transient_span).
struct ternfs_transient_span {
// The transient span holds a reference to this. As long as the
// transient span lives, the enode must live.
struct ternfs_inode* enode;
// Offset in the file for this span
u64 offset;
// Linear list of pages with the body of the span, used when we're still gathering
// content for it. We use `index` in the page to track where we're writing to.
struct list_head pages;
u32 written; // how much we've written to this span
atomic_t refcount;
// These are finalized when we start flushing out a span.
char failure_domains[TERNFS_MAX_BLOCKS][16]; // failure domains for current run so we can blacklist failures
char blacklisted_failure_domains[TERNFS_MAX_BLACKLIST_LENGTH][16]; // blacklisted failure domains
struct list_head blocks[TERNFS_MAX_BLOCKS]; // the pages for each block
u64 block_ids[TERNFS_MAX_BLOCKS]; // the block ids assigned by add span initiate
spinlock_t lock; // we use this for various modifications
u64 blocks_proofs[TERNFS_MAX_BLOCKS]; // when completing blocks, one of proof or err is set.
int blocks_errs[TERNFS_MAX_BLOCKS];
u32 span_crc; // CRC of the whole span body, computed in write_blocks
// Per-cell CRCs; write_blocks and add_span_initiate index this as [stripe*B + block],
// where B is the number of blocks for `parity`.
u32 cell_crcs[TERNFS_MAX_BLOCKS*TERNFS_MAX_STRIPES];
u32 block_size; // size of each block in bytes (0 for inline/empty spans)
u8 attempts; // completed write attempts, bumped in write_block_finalize
u8 stripes; // number of stripes per block (0 for inline/empty spans)
u8 storage_class;
u8 parity; // encoded parity; decoded with ternfs_blocks/ternfs_data_blocks/ternfs_parity_blocks
u8 blacklist_length; // number of valid entries in blacklisted_failure_domains
// Set by start_flushing; when true, dropping the last reference releases
// the inode's flushing semaphore (see put_transient_span).
bool started_flushing;
};
// just a sanity check, this is just two per page rn
static_assert(sizeof(struct ternfs_transient_span) < (2<<10));
// open_mutex held here
// really want atomic open for this
//
// Open handler for regular files.
//
// If the file is opened for writing while it is still in the WRITING state nothing
// needs to happen. Otherwise the file is switched to READING and, unless O_NOATIME
// was given, the atime stored on the shard is refreshed when the cached one is at
// least `ternfs_atime_update_interval_sec` seconds old.
//
// Returns 0 on success, or the error from refreshing the attributes. A failure to
// store the new atime is deliberately not reported: the update is best-effort and
// must not make the open itself fail.
//
// The inode lock is taken on entry and released exactly once, at `out`.
static int file_open(struct inode* inode, struct file* filp) {
    inode_lock(inode); // for the .status modification below
    struct ternfs_inode* enode = TERNFS_I(inode);
    ternfs_debug("enode=%p status=%d owner=%p", enode, enode->file.status, current->group_leader);
    int err = 0;
    if ((filp->f_mode&FMODE_WRITE) && (enode->file.status == TERNFS_FILE_STATUS_WRITING)) {
        // this is the "common" writing case, we've just created a file to write it.
        // note that we never change the file owner, which might lead to confusing behavior
        // but is probably the only sensible thing to do.
    } else {
        // otherwise, the file must be already there, we're very relaxed in what we allow
        // in f_mode here and we just fail when operations that can't be done (e.g. writing
        // to files) are attempted. the reason is that some workflows (such as open write +
        // setattr) _will_ work.
        enode->file.status = TERNFS_FILE_STATUS_READING;
        // also, set atime, if requested
        if (!(filp->f_flags&O_NOATIME)) {
            u64 atime_ns = ktime_get_real_ns();
            struct timespec64 atime_ts = ns_to_timespec64(atime_ns);
            u64 diff = atime_ts.tv_sec - min(inode_get_atime_sec(&enode->inode), atime_ts.tv_sec);
            if (diff < ternfs_atime_update_interval_sec) {
                // we don't think we should update
                goto out;
            }
            // https://internal-repo/issues/292
            // we might have cached data and another client updated atime.
            // ternfs_do_getattr is orders of magnitude cheaper than ternfs_shard_set_time,
            // so we might as well refresh and re-check
            err = ternfs_do_getattr(enode, ATTR_CACHE_NO_TIMEOUT);
            if (err) {
                // The lock is dropped at `out`; unlocking here as well would
                // release it twice.
                goto out;
            }
            diff = atime_ts.tv_sec - min(inode_get_atime_sec(&enode->inode), atime_ts.tv_sec);
            if (diff < ternfs_atime_update_interval_sec) {
                // our local time changed and we see we don't need to update
                goto out;
            }
            if ((inode_get_atime_sec(&enode->inode) > atime_ts.tv_sec) ||
                (inode_get_atime_sec(&enode->inode) == atime_ts.tv_sec &&
                    inode_get_atime_nsec(&enode->inode) == atime_ts.tv_nsec
                )
            ) {
                // we don't want atime to go into the past, don't update
                goto out;
            }
            // The top bit is set on the value sent to the shard.
            // NOTE(review): presumably it flags the field as "present" -- confirm
            // against the ternfs_shard_set_time protocol.
            u64 atime = atime_ns | (1ull<<63);
            int set_time_err = ternfs_shard_set_time((struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, inode->i_ino, 0, atime);
            if (set_time_err) {
                // Best-effort: the open still succeeds if the atime could not be stored.
                goto out;
            }
            // we updated time. we don't need to refresh it now but allow refresh on next stat by getattr_expiry
            smp_store_release(&enode->getattr_expiry, 0);
        }
    }
out:
    inode_unlock(inode);
    return err;
}
// One-time initialisation of a transient span object: empty page lists, an
// unlocked spinlock and a zero reference count.
// NOTE(review): presumably registered as the slab constructor of
// ternfs_transient_span_cachep -- new_transient_span relies on the lists and the
// refcount already being in this state; confirm at the cache creation site.
static void init_transient_span(void* p) {
    struct ternfs_transient_span* span = p;
    int b;

    // we set it to 0, we bug if not 0 when getting from cache
    atomic_set(&span->refcount, 0);
    spin_lock_init(&span->lock);

    INIT_LIST_HEAD(&span->pages);
    for (b = 0; b < TERNFS_MAX_BLOCKS; b++) {
        INIT_LIST_HEAD(&span->blocks[b]);
    }
}
// Allocates a transient span for `enode` starting at file offset `offset`.
// The span starts out with refcount = 1 and nothing written.
// Returns NULL if the allocation fails.
static struct ternfs_transient_span* new_transient_span(struct ternfs_inode* enode, u64 offset) {
    struct ternfs_transient_span* fresh = kmem_cache_alloc(ternfs_transient_span_cachep, GFP_KERNEL);
    if (fresh == NULL) {
        return NULL;
    }

    // Objects coming out of the cache must not be referenced by anybody.
    BUG_ON(atomic_read(&fresh->refcount) != 0);
    atomic_set(&fresh->refcount, 1);

    fresh->enode = enode;
    fresh->offset = offset;
    fresh->written = 0;
    fresh->attempts = 0;
    fresh->span_crc = 0;
    fresh->blacklist_length = 0;
    fresh->started_flushing = false;
    // We set the parity to zero here so that `put_transient_span`
    // is always safe to run, since it traverses the blocks to free
    // the pages, if any.
    fresh->parity = 0;

    memset(fresh->failure_domains, 0, sizeof(fresh->failure_domains));
    memset(fresh->blacklisted_failure_domains, 0, sizeof(fresh->blacklisted_failure_domains));
    memset(fresh->blocks_proofs, 0, sizeof(fresh->blocks_proofs));
    memset(fresh->blocks_errs, 0, sizeof(fresh->blocks_errs));
    memset(fresh->cell_crcs, 0, sizeof(fresh->cell_crcs));

    return fresh;
}
// Takes an additional reference on `span`; paired with put_transient_span.
static void hold_transient_span(struct ternfs_transient_span* span)
{
    atomic_inc(&span->refcount);
}
// Drops one reference on `span`. When the last reference goes away, all pages
// still attached to the span are released, the owning mm's file-page counter is
// adjusted, the flushing semaphore is released if the span had started flushing,
// and the span goes back to the cache.
//
// Returns true if this call freed the span, false otherwise.
static bool put_transient_span(struct ternfs_transient_span* span) {
    if (atomic_dec_return(&span->refcount) != 0) {
        return false;
    }

    BUG_ON(spin_is_locked(&span->lock));

    // We are not necessarily holding the last reference to a page here.
    // We could have errored out before fully sending out the request.
    // In which case network stack could still hold a reference to it.
    // It is however fine to adjust the OOM score here as technically
    // the file system is now responsible for this memory and it's no
    // longer tied to the process lifetime.
    int released = 0;
#define DRAIN_PAGE_LIST(__head) \
    while (!list_empty(__head)) { \
        struct page* pg = lru_to_page(__head); \
        list_del(&pg->lru); \
        put_page(pg); \
        released++; \
    }
    DRAIN_PAGE_LIST(&span->pages);
    int blk;
    for (blk = 0; blk < ternfs_blocks(span->parity); blk++) {
        DRAIN_PAGE_LIST(&span->blocks[blk]);
    }
#undef DRAIN_PAGE_LIST

#if (LINUX_VERSION_CODE < KERNEL_VERSION(6,2,0))
    atomic_long_add_return(-released, &span->enode->file.mm->rss_stat.count[MM_FILEPAGES]);
#else
    percpu_counter_add(&span->enode->file.mm->rss_stat[MM_FILEPAGES], -released);
#endif

    // Whoever waits for the flush to complete is woken up here.
    if (span->started_flushing) {
        up(&span->enode->file.flushing_span_sema);
    }

    kmem_cache_free(ternfs_transient_span_cachep, span);
    return true;
}
static int add_span_initiate(struct ternfs_transient_span* span);

// Retry path after block writes failed: creates a scratch file named "move_span"
// on the same shard, moves the span at `span->offset` out of the file being written
// into that scratch file, then initiates the span again.
//
// Returns 0 if the new attempt was initiated, a negative Linux error otherwise.
static int move_span_and_restart(struct ternfs_transient_span* span) {
    struct ternfs_inode* enode = span->enode;
    struct ternfs_fs_info* fs_info = (struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info;
    const char* name = "move_span";
    u64 scratch_ino;
    u64 scratch_cookie;

    // create new file
    int err = ternfs_error_to_linux(ternfs_shard_create_file(
        fs_info,
        ternfs_inode_shard(enode->inode.i_ino), TERNFS_INODE_FILE, name, strlen(name), &scratch_ino, &scratch_cookie
    ));
    // move span
    if (err == 0) {
        err = ternfs_error_to_linux(ternfs_shard_move_span(
            fs_info,
            enode->inode.i_ino, span->offset, enode->file.cookie, scratch_ino, 0, scratch_cookie, span->written
        ));
    }
    // go for it
    if (err == 0) {
        err = add_span_initiate(span);
    }

    if (err != 0) {
        ternfs_info("moving span failed %d", err);
    }
    return err;
}
// Called once all block writes of an attempt have completed and at least one of
// them failed. Picks the first block error as the one to report (the others are
// only logged) and either restarts the span through move_span_and_restart, or
// gives up once `ternfs_max_write_span_attempts` attempts have been made.
//
// Returns 0 if a new attempt was started, a negative error otherwise.
static int retry_after_block_error(struct ternfs_transient_span* span) {
    int num_blocks = ternfs_blocks(span->parity);
    int first_err = 0;
    int b;

    for (b = 0; b < num_blocks; b++) {
        int block_err = span->blocks_errs[b];
        if (block_err == 0) {
            continue;
        }
        if (first_err == 0) {
            first_err = block_err;
        } else {
            ternfs_info("dropping err %d, since we already returned %d", block_err, first_err);
        }
    }
    // we only get here if some block failed
    BUG_ON(first_err == 0);

    if (span->attempts >= ternfs_max_write_span_attempts) {
        ternfs_warn("writing span failed with err %d after %d attempts, giving up", first_err, span->attempts);
        return first_err;
    }

    ternfs_info("writing span failed with err %d after %d attempts, will try again", first_err, span->attempts);
    return move_span_and_restart(span);
}
// Records the outcome of the write of block `b` of `span` (either `proof` or
// `block_write_err`) and, if this was the last outstanding block of the attempt,
// finishes the attempt:
//  * no block failed: the span is certified on the shard;
//  * some block failed: a retry is considered via retry_after_block_error.
// Any resulting error is stored in enode->file.transient_err, unless an error is
// already stored there. In all cases one reference on the span is dropped.
static void write_block_finalize(struct ternfs_transient_span* span, int b, u64 proof, int block_write_err) {
int i;
int B = ternfs_blocks(span->parity);
struct ternfs_inode* enode = span->enode;
ternfs_debug("block=%d proof=%016llx err=%d", b, proof, block_write_err);
bool finished_writing = true;
bool any_block_errs = false;
// Mark the current one as done/error out, and check if we're done with all block
// writing requests.
// The lock makes "store my result + look at everybody else's" atomic, so exactly
// one caller observes finished_writing == true for a given attempt.
spin_lock_bh(&span->lock);
for (i = 0; i < B; i++) {
BUG_ON(span->block_ids[i] == 0);
if (i == b) {
if (block_write_err == 0) {
span->blocks_proofs[i] = proof;
} else {
span->blocks_errs[i] = block_write_err;
}
} else if (span->blocks_proofs[i] == 0 && span->blocks_errs[i] == 0) {
// we're waiting on some other block proof.
finished_writing = false;
}
any_block_errs = any_block_errs || (span->blocks_errs[i] != 0);
}
spin_unlock_bh(&span->lock);
int err = 0;
if (finished_writing && likely(!any_block_errs)) {
// TODO just like in the other case, would be nicer to have this async, since we're in a queue here.
err = ternfs_error_to_linux(ternfs_shard_add_span_certify(
(struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info,
enode->inode.i_ino, enode->file.cookie, span->offset, span->parity,
span->block_ids, span->blocks_proofs
));
}
if (finished_writing) {
span->attempts++;
if (unlikely(err == 0 && any_block_errs)) {
// if we failed writing blocks, consider retrying
err = retry_after_block_error(span);
}
// Only the first error sticks; with err == 0 this is a no-op.
atomic_cmpxchg(&enode->file.transient_err, 0, err);
}
// waiters will be woken up once span ref count reaches 0
put_transient_span(span);
}
// Completion callback handed to ternfs_write_block. `data` is the transient span,
// `pages` the page list that was lent to the request, `block_id` identifies which
// block completed, and exactly one of `proof` / `block_write_err` carries the result.
//
// We need to be careful with locking here: the writer waits on the
// flushing semaphore while holding the inode lock.
static void write_block_done(void* data, struct list_head* pages, u64 block_id, u64 proof, int block_write_err) {
    struct ternfs_transient_span* span = data;

    // we use 0 to mean "no proof yet", so a successful write with a zero proof
    // has to be turned into a failure
    if (unlikely(proof == 0 && block_write_err == 0)) {
        block_write_err = -EIO; // we need to get _very_ unlucky for this to happen!
    }
    ternfs_debug("block_id=%016llx proof=%016llx err=%d", block_id, proof, block_write_err);

    // find the index that concerns us now
    int num_blocks = ternfs_blocks(span->parity);
    int idx = 0;
    while (idx < num_blocks && span->block_ids[idx] != block_id) {
        idx++;
    }
    BUG_ON(idx == num_blocks); // we found the block
    BUG_ON(span->blocks_proofs[idx] != 0); // no proof or err already
    BUG_ON(span->blocks_errs[idx] != 0);

    // re-acquire the pages
    list_replace_init(pages, &span->blocks[idx]);

    write_block_finalize(span, idx, proof, block_write_err);
}
// Starts one attempt at writing out `span`:
//  1. the failure domains of the blocks that failed in the previous attempt (if any)
//     are appended to the span's blacklist;
//  2. the per-attempt state (block ids, failure domains, proofs, errors) is reset;
//  3. the span is initiated on the shard, which hands back the blocks to write;
//  4. one write request per block is fired; each holds its own span reference and
//     completes through write_block_done / write_block_finalize.
//
// Returns a negative error only if the metadata request (3) fails. Failures to
// send individual blocks are reported through the per-block completion path and
// the function still returns 0.
static int add_span_initiate(struct ternfs_transient_span* span) {
int i;
struct ternfs_inode* enode = span->enode;
int B = ternfs_blocks(span->parity);
int cell_size = span->block_size / span->stripes;
ternfs_debug("add span initiate");
// fill in blacklist
for (i = 0; i < B; i++) {
if (span->blocks_errs[i]) {
if (span->blacklist_length == TERNFS_MAX_BLACKLIST_LENGTH) {
// it's ok to try again, we have limit on number of attempts
ternfs_warn("span initiate blacklist full, dropping other errored out blocks");
break;
}
static_assert(sizeof(span->blacklisted_failure_domains[span->blacklist_length]) == sizeof(span->failure_domains[i]));
memcpy(span->blacklisted_failure_domains[span->blacklist_length], span->failure_domains[i], sizeof(span->failure_domains[i]));
span->blacklist_length++;
}
}
// reset stuff (including bad failure domains themselves)
memset(span->block_ids, 0, sizeof(span->block_ids));
memset(span->failure_domains, 0, sizeof(span->failure_domains));
memset(span->blocks_proofs, 0, sizeof(span->blocks_proofs));
memset(span->blocks_errs, 0, sizeof(span->blocks_errs));
struct ternfs_add_span_initiate_block blocks[TERNFS_MAX_BLOCKS];
int err = ternfs_error_to_linux(ternfs_shard_add_span_initiate(
(struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info,
span, enode->inode.i_ino, enode->file.cookie, span->offset, span->written,
span->span_crc, span->storage_class, span->parity, span->stripes, cell_size, span->cell_crcs,
span->blacklist_length, span->blacklisted_failure_domains,
blocks
));
if (unlikely(err)) {
// this means that we've failed to do the metadata (not the blocks), we can't recover from this
return err;
}
// send all the requests. at this point we never fail, we instead defer
// failure to the block requests. we intentionally let all the requests
// reach completion (rather than erroring on the first one) to try to
// clear out all bad failure domains in one go.
for (i = 0; i < B; i++) {
// first mark the downloading blocks. required to do this upfront
// so that we'll mark everything as finished only once we've gone
// through all the requests.
struct ternfs_add_span_initiate_block* block = &blocks[i];
span->block_ids[i] = block->block_id;
static_assert(sizeof(span->failure_domains[i]) == sizeof(block->failure_domain));
memcpy(span->failure_domains[i], block->failure_domain, sizeof(block->failure_domain));
}
for (i = 0; i < B; i++) { // then start uploading
struct ternfs_add_span_initiate_block* block = &blocks[i];
// The block CRC is obtained by chaining the CRCs of the block's cells, one per stripe.
u32 block_crc = 0;
int s;
for (s = 0; s < span->stripes; s++) {
block_crc = ternfs_crc32c_append(block_crc, span->cell_crcs[s*B + i], cell_size);
}
struct ternfs_block_service bs = {
.id = block->block_service_id,
.ip1 = block->ip1,
.port1 = block->port1,
.ip2 = block->ip2,
.port2 = block->port2,
};
hold_transient_span(span); // for the callback when block is done
// Note: this `err` shadows the outer one on purpose-or-not; it only drives the
// immediate finalization below and never reaches the function's return value.
int err = ternfs_write_block(
&write_block_done,
span,
&bs,
block->block_id,
block->certificate,
span->block_size,
block_crc,
&span->blocks[i]
);
if (err) { // "finish" immediately, we've errored out
write_block_finalize(span, i, 0, err);
}
}
// Now we wait.
ternfs_debug("sent all block writing requests");
return 0;
}
// Allocates one zeroed page for span data and charges it to the file-page counter
// of `file->mm`, so that it contributes to the OOM score.
// Returns NULL if the page allocation fails (nothing is charged in that case).
static struct page* alloc_write_page(struct ternfs_inode_file* file) {
    // The zeroing is to assume that in the blocks we have trailing zeros,
    // and we also use this to just append zeros to the blocks without
    // copying. The former is definitely not needed, but anyway, simpler
    // for now.
    struct page* page = alloc_page(GFP_KERNEL | __GFP_ZERO);
    if (page != NULL) {
#if (LINUX_VERSION_CODE < KERNEL_VERSION(6,2,0))
        atomic_long_inc_return(&file->mm->rss_stat.count[MM_FILEPAGES]);
#else
        percpu_counter_inc(&file->mm->rss_stat[MM_FILEPAGES]);
#endif
    }
    return page;
}
// Derives the layout of `span` from its size (`span->written`) and the three
// policies, and stores it in span->storage_class, ->block_size, ->stripes and
// ->parity.
//
// Spans shorter than 256 bytes become inline (or empty, for size 0) spans with no
// blocks. For everything else the parity is picked from the span policy, the
// number of stripes from the stripe policy, and the storage class from the block
// policy, with cells rounded up to a multiple of PAGE_SIZE.
//
// Returns 0 on success, -EINVAL if the selected parity exceeds the supported
// number of data or parity blocks.
static int compute_span_parameters(
struct ternfs_policy* block_policy_p,
struct ternfs_policy* span_policy_p,
struct ternfs_policy* stripe_policy_p,
struct ternfs_transient_span* span
) {
u32 span_size = span->written;
ternfs_debug("span_size=%u", span_size);
// Easy case: inline span (or empty, can happen for empty files)
if (span_size < 256) {
span->storage_class = span_size > 0 ? TERNFS_INLINE_STORAGE : TERNFS_EMPTY_STORAGE;
span->block_size = 0;
span->stripes = 0;
span->parity = 0;
return 0;
}
// fetch the bodies of the policies
struct ternfs_policy_body block_policy;
ternfs_get_policy_body(block_policy_p, &block_policy);
struct ternfs_policy_body span_policy;
ternfs_get_policy_body(span_policy_p, &span_policy);
struct ternfs_policy_body stripe_policy;
ternfs_get_policy_body(stripe_policy_p, &stripe_policy);
// Pick parity
// Walk the entries from the second-to-last downwards; the first entry whose
// max_size is exceeded by the span means the entry right after it applies.
// NOTE(review): this presumes entries are ordered by increasing max_size -- confirm.
int num_span_policies = ternfs_span_policy_len(span_policy.body, span_policy.len);
BUG_ON(num_span_policies == 0);
int i;
u8 parity = 0;
for (i = num_span_policies - 2; i >= 0; i--) {
u32 max_size;
u8 this_parity;
ternfs_span_policy_get(span_policy.body, span_policy.len, i, &max_size, &this_parity);
if (span_size > max_size) {
i++;
ternfs_span_policy_get(span_policy.body, span_policy.len, i, &max_size, &parity);
break;
}
}
// Nothing selected above (parity still 0): fall back to the first entry, which
// must then be large enough for the span.
if (parity == 0) {
i = 0;
u32 max_size;
ternfs_span_policy_get(span_policy.body, span_policy.len, i, &max_size, &parity);
BUG_ON(span_size > max_size);
}
if (unlikely(ternfs_data_blocks(parity) > TERNFS_MAX_DATA || ternfs_parity_blocks(parity) > TERNFS_MAX_PARITY)) {
ternfs_info("got asked to write with parity (%d,%d), which is beyond max blocks (%d,%d)", ternfs_data_blocks(parity), ternfs_parity_blocks(parity), TERNFS_MAX_DATA, TERNFS_MAX_PARITY);
return -EINVAL;
}
// For simplicity, we have all cells to be a multiple of PAGE_SIZE, which means that all
// blocks/stripes/spans are multiples of PAGE_SIZE. We might also have some extra pages
// at the end to have things to line up correctly. This should only happens for big spans
// anyway (small spans will just use mirroring).
u32 target_stripe_size = ternfs_stripe_policy(stripe_policy.body, stripe_policy.len);
int S;
if (ternfs_data_blocks(parity) == 1) {
// If we only have one data block, things are also pretty simple (just mirroring).
// It doesn't make sense to have stripes in this case.
S = 1;
} else {
// Otherwise compute striping etc.
// We use target_stripe_size as an upper bound, for now (i.e. the stripe will always be smaller
// than TargetStripeSize)
// The result is clamped to [1, 15].
// NOTE(review): divides by target_stripe_size; assumes the policy never yields 0 -- confirm.
S = min(max((u32)1, span_size / target_stripe_size), (u32)15);
}
int D = ternfs_data_blocks(parity);
// ceil(span_size / D) bytes per data block, then ceil(block_size / S) bytes per cell
int block_size = (span_size + D - 1) / D;
int cell_size = (block_size + S - 1) / S;
// Round up to cell to page size
cell_size = (cell_size + PAGE_SIZE - 1) & PAGE_MASK;
block_size = cell_size * S;
// Pick storage class
BUG_ON(span_size > TERNFS_MAX_BLOCK_SIZE);
int num_block_policies = ternfs_block_policy_len(block_policy.body, block_policy.len);
BUG_ON(num_block_policies == 0);
// Scan from the last entry down and stop at the first whose min_size is below
// the block size; if none qualifies, the class of entry 0 is what remains in
// storage_class. The loop runs at least once, so storage_class is always set.
u8 storage_class;
for (i = num_block_policies-1; i >= 0; i--) {
u32 min_size;
ternfs_block_policy_get(block_policy.body, block_policy.len, i, &storage_class, &min_size);
if (block_size > min_size) {
break;
}
}
span->storage_class = storage_class;
span->block_size = block_size;
span->stripes = S;
span->parity = parity;
return 0;
}
// To be called with the inode lock. Acquires the flushing sema.
//
// With `non_blocking` set, returns -EAGAIN instead of sleeping when the semaphore
// is currently taken. Returns 0 once the semaphore is held by the caller.
static int wait_flushed(struct ternfs_inode* enode, bool non_blocking) {
    BUG_ON(!inode_is_locked(&enode->inode));

    // make sure we're free to go
    if (down_trylock(&enode->file.flushing_span_sema) != 1) {
        return 0; // got it without contention
    }
    if (non_blocking) {
        return -EAGAIN;
    }
    down(&enode->file.flushing_span_sema);
    return 0;
}
// Turns the linear page list of `span` into its on-disk block layout and kicks
// off the first write attempt:
//  1. pads the data with zero pages up to D full blocks;
//  2. distributes the data pages over the D data blocks, one cell at a time;
//  3. allocates the pages of the parity blocks;
//  4. computes parity, the per-cell CRCs and the overall span CRC;
//  5. calls add_span_initiate.
//
// Returns 0 if the block writes were initiated, a negative error otherwise; on
// error the error is also recorded in enode->file.transient_err (unless one is
// already recorded). Pages allocated before a failure stay attached to the span
// and are released when the span is freed.
static int write_blocks(struct ternfs_transient_span* span) {
int i, j;
int err = 0;
struct ternfs_inode* enode = span->enode;
// transient file is already borked
err = atomic_read(&enode->file.transient_err);
if (err) { goto out; }
int D = ternfs_data_blocks(span->parity);
int B = ternfs_blocks(span->parity);
int S = span->stripes;
ternfs_debug("size=%d, D=%d, B=%d, S=%d", span->written, D, B, S);
// Only used for tracing.
loff_t offset = enode->inode.i_size - span->written;
trace_eggsfs_span_flush_enter(
enode->inode.i_ino, offset, span->written, span->parity, span->stripes, span->storage_class
);
// Below, remember that cell size is always a multiple of PAGE_SIZE.
u32 pages_per_block = span->block_size/PAGE_SIZE;
u32 data_pages = pages_per_block*D;
u32 current_data_pages = (span->written + PAGE_SIZE - 1) / PAGE_SIZE;
ternfs_debug("allocating padding pages_per_block=%u total_pages=%u current_pages=%u", pages_per_block, data_pages, current_data_pages);
// This can happen if we have trailing zeros because of how we arrange the
// stripes.
for (i = current_data_pages; i < data_pages; i++) {
struct page* zpage = alloc_write_page(&enode->file);
if (zpage == NULL) {
err = -ENOMEM;
// we'd have to back out the pages we allocated so far and i can't be bothered
goto out;
}
list_add_tail(&zpage->lru, &span->pages);
}
int cell_size = span->block_size / S;
ternfs_debug("arranging pages");
// First, we arrange the data blocks appropriately.
// Pages are dealt out cell by cell: cell_size bytes go to block 0, the next
// cell_size bytes to block 1, ..., wrapping around after block D-1.
{
int cell_offset = 0;
int block = 0;
struct page* page;
struct page* tmp;
list_for_each_entry_safe(page, tmp, &span->pages, lru) {
list_del(&page->lru);
list_add_tail(&page->lru, &span->blocks[block]);
cell_offset += PAGE_SIZE;
if (cell_offset >= cell_size) {
block++;
block = block % D;
cell_offset = 0;
}
}
}
// Then we allocate the parity blocks
for (i = D; i < B; i++) {
for (j = 0; j < pages_per_block; j++) {
struct page* ppage = alloc_write_page(&enode->file);
if (ppage == NULL) { err = -ENOMEM; goto out; }
list_add_tail(&ppage->lru, &span->blocks[i]);
}
}
ternfs_debug("computing parity");
// Then, we compute the parity blocks and the CRCs for each block.
{
char* pages_bufs[TERNFS_MAX_BLOCKS];
kernel_fpu_begin();
// traverse each block, page by page, rotating as we go along
// (each list is rotated once per page, so after pages_per_block iterations
// every block list is back in its original order)
for (j = 0; j < pages_per_block; j++) {
// get buffer pointers
// s is the stripe the j-th page of every block belongs to
int s = (j*PAGE_SIZE)/cell_size;
// NOTE(review): up to B atomic mappings are held at once and are released
// below in the order they were taken, not in reverse. That only matters on
// configurations where kmap_atomic actually maps (highmem) -- confirm such
// configurations are not supported.
for (i = 0; i < B; i++) {
pages_bufs[i] = kmap_atomic(list_first_entry(&span->blocks[i], struct page, lru));
}
// compute parity
err = ternfs_compute_parity(span->parity, PAGE_SIZE, (const char**)&pages_bufs[0], &pages_bufs[D]);
// compute CRCs
for (i = 0; i < B; i++) {
span->cell_crcs[s*B + i] = ternfs_crc32c(span->cell_crcs[s*B + i], pages_bufs[i], PAGE_SIZE);
}
for (i = 0; i < B; i++) {
kunmap_atomic(pages_bufs[i]);
list_rotate_left(&span->blocks[i]);
}
// unmap pages before exiting
if (err) { goto out_err_fpu; }
}
// Compute overall span crc
// The span body is the data cells in stripe-major order: for each stripe,
// the cells of data blocks 0..D-1. Parity cells are not part of it.
int s;
for (s = 0; s < S; s++) {
for (i = 0; i < D; i++) {
span->span_crc = ternfs_crc32c_append(span->span_crc, span->cell_crcs[s*B + i], cell_size);
}
}
// The amount passed here is <= 0 (written minus the padded size).
// NOTE(review): presumably a negative amount strips the trailing zero padding
// from the CRC -- confirm against ternfs_crc32c_zero_extend.
span->span_crc = ternfs_crc32c_zero_extend(span->span_crc, (int)span->written - (int)(span->block_size*D));
kernel_fpu_end();
}
// Start the first attempt
// Now we need to init the span (the callback will actually send the requests)
// TODO would be better to have this async, and free up the queue for other stuff,
// and also not block in this call which might be in a non-blocking write.
// TODO Isn't this called on file write, and therefore not taking up time in
// the queue? I think the comment above is partially outdated.
err = add_span_initiate(span);
out:
if (err) { // we've failed before trying to write blocks, terminate immediately
atomic_cmpxchg(&enode->file.transient_err, 0, err); // store error if we have one
}
return err;
out_err_fpu:
// parity computation failed inside the FPU section: leave it before the common exit
kernel_fpu_end();
goto out;
}
// To be called with the inode lock.
//
// Waits for the previously flushing span (if any) to complete, then turns the
// current writing span (if any) into the flushing span and starts writing it out:
// inline spans are added synchronously, block spans go through write_blocks and
// complete asynchronously.
//
// Semaphore protocol: the flushing semaphore is taken here (through wait_flushed).
// If there is no writing span it is released right away; otherwise it is released
// when the last reference to the span is dropped (see put_transient_span), which
// may happen in this function or later, from the block write completions.
//
// Returns 0 on success, -EAGAIN if `non_blocking` is set and a flush is still in
// progress (nothing has been touched in that case), or another negative error,
// which is then also recorded in enode->file.transient_err.
static int start_flushing(struct ternfs_inode* enode, bool non_blocking) {
BUG_ON(!inode_is_locked(&enode->inode));
// Wait for the existing thing to be flushed
int err = wait_flushed(enode, non_blocking);
if (err < 0) { return err; }
// Now turn the writing span into a flushing span
struct ternfs_transient_span* span = enode->file.writing_span;
if (span == NULL) {
up(&enode->file.flushing_span_sema);
return 0;
}
enode->file.writing_span = NULL;
// We should hold the only reference
BUG_ON(atomic_read(&span->refcount) != 1);
// We mark span as started flushing so we know to wake up waiters on last reference going away.
// At that point operation is done in either failure or success.
span->started_flushing = true;
err = compute_span_parameters(
enode->block_policy, enode->span_policy, enode->stripe_policy, span
);
if (err) {
goto out;
}
// Here (and in `write_blocks`) we might block even if we're non
// blocking, since we don't have async metadata requests yet, which is
// a bit unfortunate.
if (span->storage_class == TERNFS_INLINE_STORAGE) {
// this is an easy one, just add the inline span
// (an inline span is shorter than 256 bytes, so it lives entirely in the first page)
struct page* page = list_first_entry(&span->pages, struct page, lru);
char* data = kmap(page);
ternfs_debug("adding inline span of length %d", span->written);
err = ternfs_error_to_linux(ternfs_shard_add_inline_span(
(struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino, enode->file.cookie,
span->offset, span->written, data, span->written
));
kunmap(page);
} else if (span->storage_class != TERNFS_EMPTY_STORAGE) {
// The real deal, we need to write blocks. Note that
// this guy is tasked with releasing the flushing semaphore
// if things fail (otherwise we're not done yet).
err = write_blocks(span);
}
out:
if (err) {
ternfs_info("attempting to flush failed permanently, err=%d ino=%016lx", err, enode->inode.i_ino);
atomic_cmpxchg(&enode->file.transient_err, 0, err);
}
// we don't need this anymore (the requests might though)
put_transient_span(span);
return err;
}
// To be called with the inode lock.
//
// Appends up to `count` bytes to the file being written. Data is copied into the
// pages of the current writing span; once the span reaches the maximum span size
// it is handed off for flushing. At most one span worth of data is consumed per
// call, so the return value may be smaller than `count`.
//
// This accepts a NULL `from`, in which case zeros will be written.
//
// Only appends are supported: `*ppos` must equal the current file size, and the
// file must be in the WRITING state. `*ppos` and the inode size are advanced by
// the number of bytes accepted.
//
// Returns the number of bytes accepted, or a negative error:
//   -ENOSYS  IOCB_DIRECT was requested
//   -EROFS   the file is not being written
//   -EINVAL  `*ppos` is not the end of the file
//   -ENOMEM  a page or the span could not be allocated
//   -EFAULT  no data could be copied from `from`
//   -EAGAIN  non-blocking write and the previous span is still flushing (only
//            when nothing was accepted by this call; otherwise the short count
//            is returned)
// or the error recorded in the file's transient_err / returned by the flush.
ssize_t ternfs_file_write_internal(struct ternfs_inode* enode, int flags, loff_t* ppos, struct iov_iter* from, size_t count) {
    BUG_ON(!inode_is_locked(&enode->inode));
    int err;
    if (flags & IOCB_DIRECT) { return -ENOSYS; }
    loff_t ppos_before = *ppos;
    ternfs_debug("enode=%p, ino=%lu, count=%lu, size=%lld, *ppos=%lld status=%d", enode, enode->inode.i_ino, count, enode->inode.i_size, *ppos, enode->file.status);
    if (enode->file.status != TERNFS_FILE_STATUS_WRITING) {
        err = -EROFS;
        goto out_err;
    }
    // permanent error
    err = atomic_read(&enode->file.transient_err);
    if (err) { goto out_err; }
    // we're writing at the right offset
    if (*ppos != enode->inode.i_size) {
        err = -EINVAL;
        goto out_err;
    }
    // Precompute the bound for spans
    u32 max_span_size;
    {
        struct ternfs_policy_body span_policy;
        ternfs_get_policy_body(enode->span_policy, &span_policy);
        u8 p;
        ternfs_span_policy_last(span_policy.body, span_policy.len, &max_span_size, &p);
    }
    BUG_ON(max_span_size%PAGE_SIZE != 0); // needed for "new span" logic paired with page copying below, we could avoid it
    // Get the span we're currently writing at
    if (enode->file.writing_span == NULL) {
        enode->file.writing_span = new_transient_span(enode, enode->inode.i_size);
        if (enode->file.writing_span == NULL) {
            err = -ENOMEM;
            goto out_err;
        }
    }
    struct ternfs_transient_span* span = enode->file.writing_span;
    // Update mtime
    struct timespec64 mtime;
    ktime_get_real_ts64(&mtime);
    inode_set_mtime_to_ts(&enode->inode, mtime);
    // We now start writing into the span, as much as we can anyway
    while (span->written < max_span_size && count) {
        // grab the page to write to
        // `page->index` is the fill level of the page; it wraps to 0 once the page
        // is full, so 0 on the last page means "full, move on to a new page".
        struct page* page = list_empty(&span->pages) ? NULL : list_last_entry(&span->pages, struct page, lru);
        if (page == NULL || page->index == 0) { // we're the first ones to get here, or we need to switch to the next one
            BUG_ON(page != NULL && page->index > PAGE_SIZE);
            page = alloc_write_page(&enode->file);
            if (!page) {
                err = -ENOMEM;
                goto out_err; // we haven't corrupted anything, so no need to make it permanent
            }
            page->index = 0;
            list_add_tail(&page->lru, &span->pages);
        }
        // copy stuff into page
        int ret;
        if (likely(from)) {
            ret = copy_page_from_iter(page, page->index, PAGE_SIZE - page->index, from);
            if (unlikely(ret == 0)) {
                // Nothing could be copied (faulting or exhausted source). Without
                // this check the loop would never make progress. We make the error
                // permanent because a page we have just appended would be left
                // empty with index 0, which later writes would mistake for a full
                // page, silently inserting zeros into the file.
                err = -EFAULT;
                goto out_err_permanent;
            }
        } else {
            // pages come zeroed, "writing" zeros is just accounting
            ret = min(count, PAGE_SIZE - page->index);
        }
        if (ret < 0) { err = ret; goto out_err_permanent; }
        ternfs_debug("written %d to page %p", ret, page);
        enode->inode.i_size += ret;
        span->written += ret;
        page->index = (page->index + ret) % PAGE_SIZE;
        *ppos += ret;
        count -= ret;
    }
    if (span->written >= max_span_size) { // we need to start flushing the span
        // this might block even if it says nonblock, which is unfortunate.
        err = start_flushing(enode, !!(flags&IOCB_NOWAIT));
        // -EAGAIN only means that the previous span is still in flight: nothing
        // has been touched and the caller is expected to retry, so it must not
        // poison the file. Every other failure has already been recorded in
        // transient_err by start_flushing itself.
        if (err == -EAGAIN) { goto out_err; }
        if (err < 0) { goto out_err_permanent; }
    }
    int written;
out:
    written = *ppos - ppos_before;
    ternfs_debug("written=%d", written);
    return written;
out_err_permanent:
    atomic_cmpxchg(&enode->file.transient_err, 0, err);
out_err:
    ternfs_debug("err=%d", err);
    if (err == -EAGAIN && *ppos > ppos_before) {
        // We can just return what we've already written. In fact, it's important
        // that we do so, otherwise repeated calls might fail because they'd be
        // with the wrong offset.
        goto out;
    }
    return err;
}
// Appends everything remaining in `from` to the file; see
// ternfs_file_write_internal for the contract (inode lock held, append-only).
ssize_t ternfs_file_write(struct ternfs_inode* enode, int flags, loff_t* ppos, struct iov_iter* from) {
    size_t remaining = iov_iter_count(from);
    return ternfs_file_write_internal(enode, flags, ppos, from, remaining);
}
// write_iter handler: takes the inode lock (without sleeping for IOCB_NOWAIT
// requests, which get -EAGAIN if the lock is contended) and appends the data
// at iocb->ki_pos.
static ssize_t file_write_iter(struct kiocb* iocb, struct iov_iter* from) {
    struct inode* inode = iocb->ki_filp->f_inode;

    if (!inode_trylock(inode)) {
        if (iocb->ki_flags & IOCB_NOWAIT) {
            return -EAGAIN;
        }
        inode_lock(inode);
    }

    ssize_t written = ternfs_file_write(TERNFS_I(inode), iocb->ki_flags, &iocb->ki_pos, from);

    inode_unlock(inode);
    return written;
}
int ternfs_file_flush(struct ternfs_inode* enode, struct dentry* dentry) {
inode_lock(&enode->inode);
int err = 0;
// Not writing, there's nothing to do, there's nothing to do, files are immutable
if (enode->file.status != TERNFS_FILE_STATUS_WRITING) {
ternfs_debug("status=%d, won't flush", enode->file.status);
goto out_early;
}
// We are in another process, skip
if (enode->file.owner != current->group_leader) {
ternfs_debug("owner=%p != group_leader=%p, won't flush", enode->file.owner, current->group_leader);
goto out_early;
}
bool file_is_alive_and_flushing = false;
// if we've errored out already, just exit
err = atomic_read(&enode->file.transient_err);
if (err < 0) { goto out; }
// If the owner doesn't have an mm anymore, it means that the file
// is being torn down after the process has been terminated. In that case
// we shouldn't even link the file.
if (current->mm == NULL) {
// abort everything
atomic_cmpxchg(&enode->file.transient_err, 0, -EIO);
goto out;
}
// OK, in this case we are indeed linking it, we need to do two things: first wait
// for the span which is being flushed to be flushed, and then flush the current span,
// if we have one.
err = start_flushing(enode, false);
if (err < 0) { goto out; }
down(&enode->file.flushing_span_sema);
file_is_alive_and_flushing = true;
// the requests might have failed
err = atomic_read(&enode->file.transient_err);
if (err < 0) { goto out; }
// now we actually need to link
ternfs_debug("linking file");
err = ternfs_error_to_linux(ternfs_shard_link_file(
(struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, enode->inode.i_ino,
enode->file.cookie, dentry->d_parent->d_inode->i_ino, dentry->d_name.name, dentry->d_name.len,
&enode->edge_creation_time
));
if (err < 0) { goto out; }
// finalize mtime
inode_set_mtime(&enode->inode, enode->edge_creation_time / 1000000000, enode->edge_creation_time % 1000000000);
inode_set_ctime(&enode->inode, enode->edge_creation_time / 1000000000, enode->edge_creation_time % 1000000000);
// Switch the file to a normal file
enode->file.status = TERNFS_FILE_STATUS_READING;
smp_store_release(&enode->getattr_expiry, 0);
// expire the directory listing -- we know for a fact that it
// is wrong, it now contains this file.
{
struct dentry* parent = dget_parent(dentry);
WRITE_ONCE(TERNFS_I(d_inode(parent))->dir.mtime_expiry, 0);
dput(parent);
}
out:
if (err) {
atomic_cmpxchg(&enode->file.transient_err, 0, err);
}
if (!file_is_alive_and_flushing) {
// The file is dead, we want to make sure that any in-flight flushings
// are gone. We might enter this function multiple times with a dead
// file: imagine for instance a process dying with many threads having
// the reference to the same file. This is why we also release the
// semaphore afterwards, so that after the first one that cleans up
// the others will not get stuck. So this section acts as a flushing
// barrier of sorts. Since the file is dead this should not cause problems
// (e.g. we should not get genuine flushes due to writes which then conflict
// with this logic).
down(&enode->file.flushing_span_sema);
up(&enode->file.flushing_span_sema);
}
// There are cases where we decide not to flush, we still need to free the writing span
if (enode->file.writing_span != NULL) {
// if writing_span is not NULL we should be the one holding last reference to it
BUG_ON(!put_transient_span(enode->file.writing_span));
enode->file.writing_span = NULL;
}
// after we last cleared it should have no longer be set
BUG_ON(enode->file.writing_span != NULL);
if (enode->file.mm) {
mmdrop(enode->file.mm);
}
enode->file.mm = NULL;
inode_unlock(&enode->inode);
return err;
out_early:
inode_unlock(&enode->inode);
return err;
}
// `flush` hook of ternfs_file_operations: hands the inode and dentry behind
// `filp` to ternfs_file_flush() and returns its result. `id` is not used.
// Open question left by the original author: can we get a write while this is
// in progress?
static int file_flush_internal(struct file* filp, fl_owner_t id) {
    return ternfs_file_flush(TERNFS_I(filp->f_inode), filp->f_path.dentry);
}
// `read_iter` hook of ternfs_file_operations.
//
// Direct I/O requests are rejected with -ENOSYS. For everything else we call
// ternfs_do_getattr() first so that size information is available, and then
// delegate to generic_file_read_iter(). A getattr failure is returned as is.
static ssize_t file_read_iter(struct kiocb* iocb, struct iov_iter* to) {
    struct ternfs_inode* enode = TERNFS_I(iocb->ki_filp->f_inode);
    int err;

    if (unlikely(iocb->ki_flags & IOCB_DIRECT)) {
        return -ENOSYS;
    }

    // make sure we have size information
    err = ternfs_do_getattr(enode, ATTR_CACHE_NORM_TIMEOUT);
    if (err != 0) {
        return err;
    }

    return generic_file_read_iter(iocb, to);
}
// Releases `buf` with kfree().
// NOTE(review): presumably used to free buffers returned by ternfs_read_link();
// confirm against the caller, which is not in this part of the file.
void ternfs_link_destructor(void* buf) {
kfree(buf);
}
// Reads the target of a symlink inode and returns it as a NUL-terminated string.
//
// The inode must be a symlink (BUG otherwise), its size must fit in one page
// and must be covered by exactly one span starting at offset 0 (BUG otherwise).
//
// Returns:
//  * a kmalloc'd buffer of i_size+1 bytes on success;
//  * the string literal "" when ternfs_get_span() finds no span at offset 0;
//  * an ERR_PTR() when getattr, the span lookup, an allocation or the block
//    read fails.
//
// NOTE(review): the "" literal is not heap memory. If callers pass the result
// to ternfs_link_destructor() (kfree) that case needs care -- confirm against
// the caller.
char* ternfs_read_link(struct ternfs_inode* enode) {
    ternfs_debug("ino=%016lx", enode->inode.i_ino);
    BUG_ON(ternfs_inode_type(enode->inode.i_ino) != TERNFS_INODE_SYMLINK);

    // make sure we have size information
    int err = ternfs_do_getattr(enode, ATTR_CACHE_NORM_TIMEOUT);
    if (err) { return ERR_PTR(err); }

    BUG_ON(enode->inode.i_size > PAGE_SIZE); // for simplicity...
    size_t size = enode->inode.i_size;
    ternfs_debug("size=%lu", size);

    struct ternfs_span* span = ternfs_get_span((struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, &enode->file.spans, 0);
    if (span == NULL) {
        ternfs_debug("got no span, empty file?");
        return "";
    }
    if (IS_ERR(span)) { return ERR_CAST(span); }
    BUG_ON(span->end-span->start != size); // again, for simplicity

    char* buf = kmalloc(size+1, GFP_KERNEL);
    if (buf == NULL) { err = -ENOMEM; goto out_err; }

    if (span->storage_class == TERNFS_INLINE_STORAGE) {
        memcpy(buf, TERNFS_INLINE_SPAN(span)->body, size);
    } else {
        struct ternfs_block_span* block_span = TERNFS_BLOCK_SPAN(span);
        LIST_HEAD(pages);
        LIST_HEAD(extra_pages);

        // alloc_page() signals failure with NULL, never with an ERR_PTR.
        struct page* page = alloc_page(GFP_KERNEL);
        if (page == NULL) {
            err = -ENOMEM;
            goto out_err;
        }
        page->index = 0;
        list_add_tail(&page->lru, &pages);

        err = ternfs_span_get_pages(block_span, enode->inode.i_mapping, &pages, 1, &extra_pages);
        if (err) {
            // Drop whatever is still on the lists, `page` included.
            put_pages_list(&pages);
            put_pages_list(&extra_pages);
            goto out_err;
        }

        // Keep only our page; we do not cache anything for symlinks.
        list_del(&page->lru);
        put_pages_list(&pages);
        put_pages_list(&extra_pages);

        char* page_buf = kmap(page);
        memcpy(buf, page_buf, size);
        kunmap(page);
        // The page was only a bounce buffer, release our reference.
        put_page(page);
    }

    buf[size] = '\0';
    ternfs_debug("link %*pE", (int)size, buf);
    return buf;

out_err:
    ternfs_debug("get_link err=%d", err);
    kfree(buf); // no-op when the kmalloc itself failed
    return ERR_PTR(err);
}
// `llseek` hook of ternfs_file_operations.
//
// Files in TERNFS_FILE_STATUS_READING are seeked as usual through
// generic_file_llseek(), after making sure size information is available.
//
// For any other status (a file being written) only forward seeks from the end
// of the file are allowed; they are implemented by writing zeros up to the
// target through ternfs_file_write_internal(). The one exception is
// lseek(fd, 0, SEEK_CUR), which just reports the current position.
//
// Returns the resulting offset, -EINVAL for unsupported seeks, or the negative
// error reported by getattr / the zero-fill write.
static loff_t file_lseek(struct file *file, loff_t offset, int whence) {
    struct inode* inode = file->f_inode;
    struct ternfs_inode* enode = TERNFS_I(inode);

    if (likely(smp_load_acquire(&enode->file.status) == TERNFS_FILE_STATUS_READING)) {
        // make sure we have size information
        int err = ternfs_do_getattr(enode, ATTR_CACHE_NORM_TIMEOUT);
        if (err) { return err; }
        return generic_file_llseek(file, offset, whence);
    }

    inode_lock(inode);

    loff_t result = -EINVAL;
    loff_t target;
    loff_t cur = file->f_pos;
    bool query_only = (offset == 0 && whence == SEEK_CUR);

    // We only support a position below i_size when reading the current position.
    if (cur != enode->inode.i_size && !query_only) {
        goto unlock;
    }

    if (whence == SEEK_SET) {
        if (offset < cur) { goto unlock; }
        target = offset;
    } else if (whence == SEEK_CUR || whence == SEEK_END) {
        // The position is at the end of the file here (or offset is 0), so
        // both flavours are relative to `cur`.
        if (offset < 0) { goto unlock; }
        target = cur + offset;
    } else {
        goto unlock;
    }

    // Pad with zeros until we reach the target; `cur` is advanced by the write.
    while (cur < target) {
        ssize_t written = ternfs_file_write_internal(enode, 0, &cur, NULL, target - cur);
        if (unlikely(written < 0)) {
            result = written;
            goto unlock;
        }
        file->f_pos = cur;
        file->f_version = 0; // what's this for?
    }
    result = target;

unlock:
    inode_unlock(inode);
    return result;
}
// `fsync` hook of ternfs_file_operations: does no work and always returns 0;
// all arguments are ignored.
// Files are not visible until they're fully persisted, so fsync is unnecessary.
static int file_fsync(struct file* f, loff_t start, loff_t end, int datasync) {
return 0;
}
// File operations table for TernFS regular files.
// Reads and mmap go through the generic page-cache helpers
// (generic_file_read_iter via file_read_iter, generic_file_readonly_mmap);
// mmap is read-only.
const struct file_operations ternfs_file_operations = {
.open = file_open,
.read_iter = file_read_iter,
.write_iter = file_write_iter,
.flush = file_flush_internal,
.llseek = file_lseek,
.mmap = generic_file_readonly_mmap,
.fsync = file_fsync, // no-op, see file_fsync
};
// Drains `pages`: every page is removed from the list, inserted into the page
// cache of `mapping` at page->index, marked uptodate and unlocked, and then our
// reference is dropped. A page that cannot be inserted is only logged and
// released.
//
// `nr_pages` is only used in the refcount warning message.
static void process_file_pages(struct address_space *mapping, struct list_head *pages, unsigned nr_pages) {
    struct page* page;

    while ((page = list_first_entry_or_null(pages, struct page, lru)) != NULL) {
        list_del(&page->lru);

        // We expect to be the only holder of the page at this point.
        u32 refct = atomic_read(&page->_refcount);
        if (refct != 1) {
            ternfs_warn("page refcount=%d, expected 1 at index %ld, pages=%d", refct, page->index, nr_pages);
        }

        int err = add_to_page_cache_lru(page, mapping, page->index, readahead_gfp_mask(mapping));
        if (err != 0) {
            ternfs_debug("failed adding page at index=%ld, err=%d", page->index, err);
        } else {
            // The page comes back locked from add_to_page_cache_lru(). Mark it
            // uptodate *before* unlocking: anybody waiting on the page lock
            // checks the uptodate flag as soon as it is woken up, and would
            // otherwise see a page that looks like it still needs reading.
            SetPageUptodate(page);
            unlock_page(page);
        }

        // Drop our reference; the page cache holds its own if the insert worked.
        put_page(page);
    }
}
// `readpages` hook (used for kernels before 5.18, see ternfs_mmap_operations).
//
// `pages` holds freshly allocated pages that are not in the page cache yet.
// They are filled from the span containing the offset of lru_to_page(pages),
// and then inserted into the page cache by process_file_pages(), together with
// any extra pages that ternfs_span_get_pages() produced.
//
// Returns 0 on success. On failure all pages are released and a negative
// error is returned.
static int file_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) {
    int err = 0;
    struct inode* inode = file_inode(filp);
    struct ternfs_inode* enode = TERNFS_I(inode);
    loff_t off = page_offset(lru_to_page(pages));
    ternfs_debug("enode=%p, ino=%ld, offset=%lld, nr_pages=%u", enode, inode->i_ino, off, nr_pages);
    struct ternfs_span* span = NULL;
    struct page *page;
    LIST_HEAD(extra_pages);

    if (off >= inode->i_size) { // out of bounds
        list_for_each_entry(page, pages, lru) {
            zero_user_segment(page, 0, PAGE_SIZE);
        }
        goto out;
    }

    span = ternfs_get_span((struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, &enode->file.spans, off);
    if (IS_ERR(span)) {
        ternfs_warn("ternfs_get_span_failed at pos %llu", off);
        err = PTR_ERR(span);
        goto out_err;
    }
    if (span == NULL) {
        // ternfs_get_span() can return NULL (see ternfs_read_link); we are
        // within i_size here, so treat it as an I/O error rather than
        // dereferencing it.
        ternfs_warn("no span found at pos %llu", off);
        err = -EIO;
        goto out_err;
    }

    if (span->storage_class == TERNFS_INLINE_STORAGE) {
        struct ternfs_inline_span* inline_span = TERNFS_INLINE_SPAN(span);
        // lru_to_page() is the list tail, it can't be NULL.
        page = lru_to_page(pages);
        list_del(&page->lru);
        // rest is past end of file, just drop it
        put_pages_list(pages);
        // return page back
        list_add_tail(&page->lru, pages);
        BUG_ON(page_offset(page) != span->start);

        size_t to_copy = min_t(size_t, inline_span->len, PAGE_SIZE);
        char* dst = kmap_atomic(page);
        memcpy(dst, inline_span->body, to_copy);
        // The page is not zeroed when we get it and is about to be marked
        // uptodate (and possibly mmapped): clear everything after the body so
        // that stale memory does not leak out.
        memset(dst + to_copy, 0, PAGE_SIZE - to_copy);
        kunmap_atomic(dst);
    } else {
        struct ternfs_block_span* block_span = TERNFS_BLOCK_SPAN(span);
        err = ternfs_span_get_pages(block_span, mapping, pages, nr_pages, &extra_pages);
        if (err) {
            ternfs_warn("readahead of %d pages at off=%lld in file %016lx failed with error %d", nr_pages, off, enode->inode.i_ino, err);
            put_pages_list(&extra_pages);
            goto out_err;
        }
    }

out:
    process_file_pages(mapping, pages, nr_pages);
    process_file_pages(mapping, &extra_pages, 0);
    return 0;

out_err:
    put_pages_list(pages);
    return err;
}
// Fills a single, locked page cache page with file contents.
//
// Pages at or beyond i_size are zero-filled. Inline spans are copied straight
// from the span body. Block spans are read into a private page through
// ternfs_span_get_pages() and then copied into `page`. Block reads that fail
// are retried until ternfs_file_io_timeout_sec has elapsed; the span is
// dropped and fetched again at most once every
// ternfs_file_io_retry_refresh_span_interval_sec.
//
// The page is always unlocked before returning; it is marked uptodate on
// success and flagged with SetPageError() otherwise. Returns 0 or a negative
// error.
static int file_readpage(struct file* filp, struct page* page) {
    struct inode* inode = file_inode(filp);
    struct ternfs_inode* enode = TERNFS_I(inode);
    u64 off = page_offset(page);
    int err = 0;
    struct ternfs_span* span = NULL;
    struct timespec64 start_ts = ns_to_timespec64(ktime_get_real_ns());
    struct timespec64 last_span_refresh_ts = ns_to_timespec64(0);

    if (off >= inode->i_size) { // out of bounds
        zero_user_segment(page, 0, PAGE_SIZE);
        goto out;
    }

retry:
    // Do not let the error of a previous attempt leak into this one.
    err = 0;
    span = ternfs_get_span((struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, &enode->file.spans, off);
    if (IS_ERR(span)) {
        ternfs_debug("ternfs_get_span_failed at pos %llu", off);
        err = PTR_ERR(span);
        goto out;
    }
    if (span == NULL) {
        // ternfs_get_span() can return NULL (see ternfs_read_link); we are
        // within i_size here, so treat it as an I/O error rather than
        // dereferencing it.
        ternfs_warn("no span found at pos %llu", off);
        err = -EIO;
        goto out;
    }
    if (span->start%PAGE_SIZE != 0) {
        ternfs_warn("span start is not a multiple of page size %llu", span->start);
        span = NULL;
        err = -EIO;
        goto out;
    }

    u64 span_offset = off - span->start;
    if (span->storage_class == TERNFS_INLINE_STORAGE) {
        struct ternfs_inline_span* inline_span = TERNFS_INLINE_SPAN(span);
        size_t to_copy = inline_span->len - span_offset;
        BUG_ON(to_copy > PAGE_SIZE);
        char* dst = kmap_atomic(page);
        memcpy(dst, inline_span->body + span_offset, to_copy);
        // The page is about to be marked uptodate (and may be mmapped), so the
        // part after the inline body must not contain stale memory.
        memset(dst + to_copy, 0, PAGE_SIZE - to_copy);
        kunmap_atomic(dst);
    } else {
        struct ternfs_block_span* block_span = TERNFS_BLOCK_SPAN(span);
        if (block_span->cell_size%PAGE_SIZE != 0) {
            ternfs_warn("cell size not multiple of page size %u", block_span->cell_size);
            err = -EIO;
            goto out;
        }

        // `page` is already in the page cache, so we read into a private page
        // with the same index and copy the contents over afterwards.
        LIST_HEAD(pages);
        LIST_HEAD(extra_pages);
        struct page* stripe_page = alloc_page(GFP_KERNEL);
        if(!stripe_page) {
            err = -ENOMEM;
            goto out;
        }
        stripe_page->index = page->index;
        list_add_tail(&stripe_page->lru, &pages);

        err = ternfs_span_get_pages(block_span, filp->f_mapping, &pages, 1, &extra_pages);
        if (err) {
            // The read failed: nothing we got back can be trusted, so the
            // extra pages must not end up in the page cache as uptodate.
            put_pages_list(&extra_pages);
            put_pages_list(&pages);
            struct timespec64 end_ts = ns_to_timespec64(ktime_get_real_ns());
            ternfs_warn("reading page %lld in file %016lx failed with error %d", off, enode->inode.i_ino, err);
            if ((end_ts.tv_sec - start_ts.tv_sec) > ternfs_file_io_timeout_sec) {
                ternfs_warn("io timeout while reading page at off=%lld in file %016lx last error %d", off, enode->inode.i_ino, err);
                goto out;
            }
            if ((end_ts.tv_sec - last_span_refresh_ts.tv_sec) > ternfs_file_io_retry_refresh_span_interval_sec) {
                ternfs_info("refreshing span %lld of file %016lx as the span structure or block service flags could have changed in the meantime", off, enode->inode.i_ino);
                ternfs_unlink_span(&enode->file.spans, span);
                last_span_refresh_ts = end_ts;
            }
            goto retry;
        }

        // Anything read on top of what we asked for goes into the page cache.
        process_file_pages(filp->f_mapping, &extra_pages, 0);

        list_del(&stripe_page->lru);
        char* to_ptr = kmap_atomic(page);
        char* from_ptr = kmap_atomic(stripe_page);
        memcpy(to_ptr, from_ptr, PAGE_SIZE);
        // Atomic kmaps nest like a stack: unmap in reverse order of mapping.
        kunmap_atomic(from_ptr);
        kunmap_atomic(to_ptr);
        BUG_ON(atomic_read(&stripe_page->_refcount) != 1);
        put_page(stripe_page);
    }

out:
    if (unlikely(err)) {
        SetPageError(page);
    } else {
        SetPageUptodate(page);
    }
    unlock_page(page);
    return err;
}
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 18, 0))
// Removes the page with the given index from `pages` and returns it, or
// returns NULL if the list has no such page. file_readahead() builds its list
// with list_add(), which puts the lowest index at the tail, so we search from
// the tail.
static struct page* file_readahead_take_page(struct list_head* pages, pgoff_t index) {
    struct page* candidate;
    list_for_each_entry_reverse(candidate, pages, lru) {
        if (candidate->index == index) {
            list_del(&candidate->lru);
            return candidate;
        }
    }
    return NULL;
}

// `readahead` hook (kernels >= 5.18).
//
// The data is first read into privately allocated pages, and then copied into
// the page cache pages handed out by readahead_page(). Page cache pages that we
// fill are marked uptodate, unlocked and released. Pages that we do not consume
// (allocation failure, read error, gaps) are left in `rac`.
// NOTE(review): this relies on the readahead core cleaning up pages left in
// `rac` after ->readahead returns -- confirm for the targeted kernel versions.
//
// Errors are only logged: readahead is best effort and has no return value.
static void file_readahead(struct readahead_control *rac)
{
    struct file *filp = rac->file;
    struct inode *inode = file_inode(filp);
    struct ternfs_inode *enode = TERNFS_I(inode);
    struct page *page;
    pgoff_t index;
    loff_t off;
    unsigned i;
    unsigned nr_pages = readahead_count(rac);
    if (nr_pages == 0)
        return;

    // make sure we have size information
    int err = ternfs_do_getattr(enode, ATTR_CACHE_NORM_TIMEOUT);
    if (err) { return; }

    index = readahead_index(rac);
    // Widen before shifting, pgoff_t may be narrower than loff_t.
    off = (loff_t)index << PAGE_SHIFT;
    ternfs_debug("enode=%p, ino=%ld, index=%lu, offset=%lld, nr_pages=%u",
        enode, inode->i_ino, index, off, nr_pages);

    struct ternfs_span *span = NULL;
    LIST_HEAD(pages);
    LIST_HEAD(extra_pages);
    unsigned pages_allocated = 0;

    // Check if we're out of bounds
    if (off >= inode->i_size) {
        // Allocate and zero-fill pages for out-of-bounds requests
        for (i = 0; i < nr_pages; i++) {
            page = alloc_page(readahead_gfp_mask(rac->mapping));
            if (!page)
                break;
            page->index = index + i;
            zero_user_segment(page, 0, PAGE_SIZE);
            list_add(&page->lru, &pages);
            pages_allocated++;
        }
        goto out_process;
    }

    // Get the span for this offset
    span = ternfs_get_span((struct ternfs_fs_info*)enode->inode.i_sb->s_fs_info, &enode->file.spans, off);
    if (IS_ERR(span)) {
        ternfs_warn("ternfs_get_span_failed at pos %llu", off);
        err = PTR_ERR(span);
        goto out_err;
    }
    if (span == NULL) {
        // ternfs_get_span() can return NULL (see ternfs_read_link); we are
        // within i_size here, so give up rather than dereferencing it.
        ternfs_warn("no span found at pos %llu", off);
        err = -EIO;
        goto out_err;
    }

    // Allocate pages for readahead
    for (i = 0; i < nr_pages; i++) {
        page = alloc_page(readahead_gfp_mask(rac->mapping));
        if (!page)
            break;
        page->index = index + i;
        list_add(&page->lru, &pages);
        pages_allocated++;
    }
    if (pages_allocated == 0)
        return;

    // Process based on span type
    if (span->storage_class == TERNFS_INLINE_STORAGE) {
        struct ternfs_inline_span *inline_span = TERNFS_INLINE_SPAN(span);
        u64 span_start = span->start;
        list_for_each_entry(page, &pages, lru) {
            u64 page_off = page_offset(page);
            if (page_off >= inode->i_size) {
                zero_user_segment(page, 0, PAGE_SIZE);
                continue;
            }
            u64 span_offset = page_off - span_start;
            if (span_offset >= inline_span->len) {
                zero_user_segment(page, 0, PAGE_SIZE);
                continue;
            }
            size_t to_copy = min((size_t)(inline_span->len - span_offset), (size_t)PAGE_SIZE);
            char *dst = kmap_atomic(page);
            memcpy(dst, inline_span->body + span_offset, to_copy);
            if (to_copy < PAGE_SIZE)
                memset(dst + to_copy, 0, PAGE_SIZE - to_copy);
            kunmap_atomic(dst);
        }
    } else {
        struct ternfs_block_span *block_span = TERNFS_BLOCK_SPAN(span);
        err = ternfs_span_get_pages(block_span, rac->mapping, &pages, pages_allocated, &extra_pages);
        if (err) {
            ternfs_warn("readahead of %u pages at off=%lld in file %016lx failed with error %d",
                pages_allocated, off, enode->inode.i_ino, err);
            put_pages_list(&extra_pages);
            goto out_err;
        }
    }

out_process:
    // readahead_page() hands out the page cache pages in ascending index order
    // starting at `index`, while our private list is not kept in that order
    // (it is built with list_add(), i.e. highest index first). So pair the two
    // up by page index, never by list position.
    for (i = 0; i < nr_pages; i++) {
        page = file_readahead_take_page(&pages, index + i);
        if (page == NULL) {
            // Nothing was read for this index, leave this and the following
            // page cache pages in `rac`.
            break;
        }
        struct page* rac_page = readahead_page(rac);
        if (rac_page == NULL) {
            put_page(page);
            break;
        }
        void* dest = kmap_atomic(rac_page);
        void* src = kmap_atomic(page);
        memcpy(dest, src, PAGE_SIZE);
        kunmap_atomic(src);
        kunmap_atomic(dest);
        SetPageUptodate(rac_page);
        unlock_page(rac_page);
        put_page(rac_page);
        // Our private copy is no longer needed.
        put_page(page);
    }

    // Process any extra pages
    process_file_pages(rac->mapping, &extra_pages, 0);
    // Whatever we could not hand over is released here.
    put_pages_list(&pages);
    return;

out_err:
    put_pages_list(&pages);
    return;
}
#endif
#if (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 18, 0))
// `read_folio` adapter around file_readpage(). Only folios made of exactly one
// page are supported (BUG otherwise); that page is passed on and the result of
// file_readpage() is returned unchanged.
static int file_readfolio(struct file *filp, struct folio *folio) {
    BUG_ON(folio_nr_pages(folio) != 1);
    struct page* only_page = folio_page(folio, 0);
    return file_readpage(filp, only_page);
}
#endif
// Address space (page cache) operations for TernFS files.
// Which hooks are installed depends on the kernel version:
//  * multi-page reads: `readpages` before 5.18, `readahead` from 5.18 on;
//  * single-page reads: `readpage` before 5.19, `read_folio` from 5.19 on.
const struct address_space_operations ternfs_mmap_operations = {
#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 18, 0))
.readpages = file_readpages,
#else
.readahead = file_readahead,
#endif
#if (LINUX_VERSION_CODE < KERNEL_VERSION(5, 19, 0))
.readpage = file_readpage,
#else
.read_folio = file_readfolio,
#endif
};
// Module init for the file subsystem: creates the slab cache used for
// `struct ternfs_transient_span`, with init_transient_span as constructor.
// Returns 0 on success, -ENOMEM if the cache could not be created.
int __init ternfs_file_init(void) {
    struct kmem_cache* cache = kmem_cache_create(
        "ternfs_transient_span_cache",
        sizeof(struct ternfs_transient_span),
        0,
        SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
        &init_transient_span
    );

    ternfs_transient_span_cachep = cache;
    return cache == NULL ? -ENOMEM : 0;
}
// Module teardown for the file subsystem: destroys the transient span slab
// cache created by ternfs_file_init().
void __cold ternfs_file_exit(void) {
ternfs_debug("file exit");
// TODO: handle case where there still are requests in flight.
kmem_cache_destroy(ternfs_transient_span_cachep);
}