mirror of
https://github.com/XTXMarkets/ternfs.git
synced 2026-05-14 00:00:19 -05:00
1676 lines
64 KiB
C
1676 lines
64 KiB
C
// Copyright 2025 XTX Markets Technologies Limited
|
|
//
|
|
// SPDX-License-Identifier: GPL-2.0-or-later
|
|
|
|
#include <asm-generic/errno.h>
|
|
#include <linux/inet.h>
|
|
#include <net/tcp.h>
|
|
#include <net/sock.h>
|
|
#include <linux/atomic.h>
|
|
#include <linux/workqueue.h>
|
|
#include <linux/completion.h>
|
|
#include <linux/prandom.h>
|
|
|
|
#include "block.h"
|
|
#include "log.h"
|
|
#include "err.h"
|
|
#include "super.h"
|
|
#include "trace.h"
|
|
#include "wq.h"
|
|
#include "file.h"
|
|
#include "net_compat.h"
|
|
|
|
// Static data
|
|
// --------------------------------------------------------------------
|
|
|
|
// Sizes, in bytes, of the fixed headers that start every block service
// request and response.
#define TERNFS_BLOCKS_REQ_HEADER_SIZE (4 + 8 + 1) // protocol + block service id + kind
#define TERNFS_BLOCKS_RESP_HEADER_SIZE (4 + 1) // protocol + kind

// Converts milliseconds to jiffies, rounding down.
// The argument is parenthesised so that compound expressions convert
// correctly: MSECS_TO_JIFFIES(a + b) must scale the whole sum, not just `b`.
#define MSECS_TO_JIFFIES(_ms) (((_ms) * HZ) / 1000)

// Number of CRC bytes (the name suggests one CRC per page -- confirm against
// the users of this constant).
#define PAGE_CRC_BYTES 4
|
|
|
|
int ternfs_fetch_block_timeout_jiffies = MSECS_TO_JIFFIES(60 * 1000);
|
|
int ternfs_write_block_timeout_jiffies = MSECS_TO_JIFFIES(60 * 1000);
|
|
int ternfs_block_service_connect_timeout_jiffies = MSECS_TO_JIFFIES(4 * 1000);
|
|
|
|
static u64 WHICH_BLOCK_IP = 0;
|
|
|
|
static struct kmem_cache* fetch_request_cachep;
|
|
static struct kmem_cache* write_request_cachep;
|
|
|
|
// Each socket hash table has 2^BLOCK_SOCKET_BITS = 256 buckets, and there are
// two such tables. We have ~100 disks per block service, so with one bucket
// per element we can hold ~25k disks comfortably.
#define BLOCK_SOCKET_BITS 8
#define BLOCK_SOCKET_BUCKETS (1 << BLOCK_SOCKET_BITS)
|
|
|
|
static u64 block_socket_key(struct sockaddr_in* addr) {
|
|
return ((__force u64)addr->sin_addr.s_addr << 16) | (__force u64)addr->sin_port;
|
|
}
|
|
|
|
struct block_socket;
|
|
struct block_request;
|
|
|
|
struct block_ops {
|
|
DECLARE_HASHTABLE(sockets, BLOCK_SOCKET_BITS);
|
|
spinlock_t locks[BLOCK_SOCKET_BUCKETS];
|
|
// these two are just to detect when a socket is gone
|
|
wait_queue_head_t wqs[BLOCK_SOCKET_BUCKETS];
|
|
atomic_t len[BLOCK_SOCKET_BUCKETS];
|
|
|
|
int* timeout_jiffies;
|
|
|
|
// functions
|
|
void (*sk_write_space)(struct sock *sk);
|
|
void (*sk_data_ready)(struct sock *sk);
|
|
void (*write_work)(struct work_struct* work);
|
|
int (*write_req)(struct block_socket* socket, struct block_request* req);
|
|
int (*receive_req)(read_descriptor_t* rd_desc, struct sk_buff* skb, unsigned int offset, size_t len);
|
|
};
|
|
|
|
static void block_socket_sk_write_space(struct sock *sk);
|
|
|
|
static void fetch_block_pages_with_crc_sk_data_ready(struct sock *sk);
|
|
static void fetch_block_pages_with_crc_write_work(struct work_struct* work);
|
|
static int fetch_block_pages_with_crc_write_req(struct block_socket* socket, struct block_request* breq);
|
|
static int fetch_block_pages_with_crc_receive_req(read_descriptor_t* rd_desc, struct sk_buff* skb, unsigned int offset, size_t len);
|
|
|
|
static struct block_ops fetch_block_pages_witch_crc_ops = {
|
|
.sk_write_space = block_socket_sk_write_space,
|
|
.sk_data_ready = fetch_block_pages_with_crc_sk_data_ready,
|
|
.write_work = fetch_block_pages_with_crc_write_work,
|
|
.write_req = fetch_block_pages_with_crc_write_req,
|
|
.receive_req = fetch_block_pages_with_crc_receive_req,
|
|
.timeout_jiffies = &ternfs_fetch_block_timeout_jiffies,
|
|
};
|
|
|
|
static void write_block_sk_data_ready(struct sock *sk);
|
|
static void write_block_write_work(struct work_struct* work);
|
|
static int write_block_write_req(struct block_socket* socket, struct block_request* breq);
|
|
static int write_block_receive_req(read_descriptor_t* rd_desc, struct sk_buff* skb, unsigned int offset, size_t len);
|
|
|
|
static struct block_ops write_block_ops = {
|
|
.sk_write_space = block_socket_sk_write_space,
|
|
.sk_data_ready = write_block_sk_data_ready,
|
|
.write_work = write_block_write_work,
|
|
.write_req = write_block_write_req,
|
|
.receive_req = write_block_receive_req,
|
|
.timeout_jiffies = &ternfs_write_block_timeout_jiffies,
|
|
};
|
|
|
|
static void do_timeout_sockets(struct work_struct* w);
|
|
static DECLARE_DELAYED_WORK(timeout_work, do_timeout_sockets);
|
|
|
|
// Utils
|
|
// --------------------------------------------------------------------
|
|
|
|
// Copies at most `len` bytes of `skb`, starting at `offset`, into `dstp`.
// The read window is clamped to the end of the skb. Returns the number of
// bytes actually copied.
static u32 ternfs_skb_copy(void* dstp, struct sk_buff* skb, u32 offset, u32 len) {
    struct skb_seq_state seq;
    char* out = dstp;
    u32 copied = 0;

    skb_prepare_seq_read(skb, offset, min(offset + len, skb->len), &seq);
    while (true) {
        const u8* chunk;
        int chunk_len = skb_seq_read(copied, &chunk, &seq);
        if (chunk_len == 0) {
            // end of the window, nothing left to read
            return copied;
        }
        BUG_ON(chunk_len < 0);
        // The chunk we get back _can_ be larger than what we still want,
        // so never copy more than the remaining room.
        u32 room = len - copied;
        bool truncated = (u32)chunk_len > room;
        u32 take = truncated ? room : (u32)chunk_len;
        memcpy(out + copied, chunk, take);
        copied += take;
        if (truncated) {
            // we stop in the middle of a chunk, so abort the read explicitly
            skb_abort_seq_read(&seq);
            return copied;
        }
    }
    BUG();
}
|
|
|
|
// Generic socket/request infra
|
|
// --------------------------------------------------------------------
|
|
|
|
static void block_ops_init(struct block_ops* ops) {
|
|
int i;
|
|
hash_init(ops->sockets);
|
|
for (i = 0; i < BLOCK_SOCKET_BUCKETS; i++) {
|
|
spin_lock_init(&ops->locks[i]);
|
|
atomic_set(&ops->len[i], 0);
|
|
init_waitqueue_head(&ops->wqs[i]);
|
|
}
|
|
}
|
|
|
|
// Locking principles around this are quite complex as it can be used from
|
|
// multiple contexts (workqueue, socket callbacks and direct calls enqueing more work).
|
|
// Safe disposal is protected via reference counting.
|
|
// While active socket is held in a hash table protected by RCU. This always hold one reference
|
|
// and the removal from hash table only drops the reference after rcu synchronize is done.
|
|
// This makes taking a reference under an rcu lock safe.
|
|
// Because we always remove socket callbacks before removing from hash table and wait for completion of any ongoing
|
|
// work it is also safe to take a refernce from any of the socket_callbacks.
|
|
// Helper functions for erroring out socket or enquing work quarantee that reference is either passed
|
|
// to the work queue or dropped if work is not scheduled. It is unsafe to use socket after calling those functions.
|
|
struct block_socket {
|
|
struct socket* sock;
|
|
struct sockaddr_in addr;
|
|
// We'll check that not more than `block_ops.timeout_jiffies` has passed since this.
|
|
// For this reason, we set this timeout when:
|
|
// * We write anything
|
|
// * We read anything
|
|
// The counters are separate as we can keep writing a lot of small requests while still not getting any response
|
|
// in which case we still need to timeout
|
|
u64 last_write_activity;
|
|
u64 last_read_activity;
|
|
// Error can be set either by write/read issues, error response which requires termination of socket
|
|
// or timeout. If error is already set when enqueing work we wait for sock_wait and then try to get new socket.
|
|
// This is to avoid enqueuing work on already errored out socket and a race with timing out inactive sockets.
|
|
atomic_t err;
|
|
// To store in the hashmap.
|
|
struct hlist_node hnode;
|
|
// Write end. write_work does all the work by peeking at the head, new requests
|
|
// get added to the tail and try to write immediately if they get queued as head
|
|
struct list_head write;
|
|
spinlock_t list_lock;
|
|
// We schedule write work as kernel is unhappy if we write from sk_write_space callback
|
|
// write work is scheduled on ternfs_fast_wq and we yield after at most 10ms of writing
|
|
// we also check if socket needs to be cleaned up due to error timeout and do it from same work
|
|
struct work_struct write_work;
|
|
atomic_t write_work_active;
|
|
atomic_t refcount;
|
|
// There could be multiple cleanup items scheduled, we only want first one to remove socket from hash, wake up waiters, remove requests
|
|
bool terminal;
|
|
// Read end. "sk_data_ready" callback does all the work.
|
|
struct list_head read;
|
|
// used for waiting on connect or waiting for socket do be fully removed
|
|
struct completion sock_wait;
|
|
// Saved callbacks
|
|
void (*saved_state_change)(struct sock *sk);
|
|
void (*saved_data_ready)(struct sock *sk);
|
|
void (*saved_write_space)(struct sock *sk);
|
|
};
|
|
|
|
// Stages a request goes through. Used exclusively to track the time spent in
// each state; the values also index block_request.timings.
enum block_requests_state {
    BR_ST_QUEUEING = 0,
    BR_ST_WRITING,          // 1
    BR_ST_WAITING_RESPONSE, // 2
    BR_ST_READING,          // 3
    BR_ST_MAX               // 4, number of states
};
|
|
|
|
struct block_request {
|
|
// To store the request in the read/write lists of the socket
|
|
// Access should be protected by holding respectively block_socket->list_lock
|
|
// There is no guarantee requests will be removed from write list before it gets removed form read list
|
|
// as response can be quick and handled by a different thread. We only want to schedule complete_work once
|
|
// removed from all lists.
|
|
struct list_head read_list;
|
|
struct list_head write_list;
|
|
|
|
// There is no other reason for these to be atomic appart from preventing write req/read resp race
|
|
atomic_t err;
|
|
atomic_t state;
|
|
atomic64_t timings[BR_ST_MAX];
|
|
|
|
// work that gets scheduled on ternfs_wq to call complete callback on request
|
|
struct work_struct complete_work;
|
|
|
|
// How much is left to write (including the block body if writing)
|
|
u32 left_to_write;
|
|
// How much is left to read (including block body if reading)
|
|
u32 left_to_read;
|
|
};
|
|
|
|
|
|
// This function needs to be called while holding a reference to the socket
|
|
// or holding and RCU lock or from under the socket callback lock.
|
|
inline void block_socket_hold(struct block_socket* socket) {
|
|
int ref_count = atomic_inc_return(&socket->refcount);
|
|
ternfs_debug("socket(%p) %pI4:%d ref_count=%d", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), ref_count);
|
|
BUG_ON(ref_count <= 1); // initial reference is set on init
|
|
}
|
|
|
|
// We log requests which error out or take more than 10s to complete
|
|
static void block_socket_log_req_completion(struct block_socket* socket, struct block_request* req) {
|
|
u64 completed = get_jiffies_64();
|
|
u64 created = (u64)atomic64_read(&req->timings[BR_ST_QUEUEING]);
|
|
u64 total = completed - created;
|
|
int err = atomic_read(&socket->err);
|
|
if (err == 0 && total < MSECS_TO_JIFFIES(10000)) {
|
|
return;
|
|
}
|
|
int state = atomic_read(&req->state);
|
|
if (state != BR_ST_READING) {
|
|
// in case request didn't get to last state set now as end time of last operation
|
|
atomic64_set(&req->timings[state + 1], completed);
|
|
}
|
|
u64 writing_started = max(created, (u64)atomic64_read(&req->timings[BR_ST_WRITING]));
|
|
u64 waiting_started = max(writing_started, (u64)atomic64_read(&req->timings[BR_ST_WAITING_RESPONSE]));
|
|
u64 reading_started = max(waiting_started, (u64)atomic64_read(&req->timings[BR_ST_READING]));
|
|
u64 queued = writing_started - created;
|
|
u64 writing = waiting_started - writing_started;
|
|
u64 waiting = reading_started - waiting_started;
|
|
u64 reading = completed - reading_started;
|
|
|
|
ternfs_warn("dropped block request to %pI4:%d (err=%d) last_state=%d left_to_read=%d left_to_write=%d elapsed=%llums [queued=%llums, writing=%llums, waiting=%llums, reading=%llums]",
|
|
&socket->addr.sin_addr, ntohs(socket->addr.sin_port), err, state, req->left_to_read, req->left_to_write, jiffies64_to_msecs(total),
|
|
jiffies64_to_msecs(queued), jiffies64_to_msecs(writing), jiffies64_to_msecs(waiting), jiffies64_to_msecs(reading));
|
|
}
|
|
|
|
|
|
inline void block_socket_put(struct block_socket* socket) {
|
|
ternfs_debug("socket(%p) %pI4:%d ref_count=%d", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount)-1);
|
|
int ref_count = atomic_dec_return(&socket->refcount);
|
|
BUG_ON(ref_count < 0);
|
|
if (ref_count == 0) {
|
|
// Now complete all the remaining requests with the error.
|
|
// We are the only one remaining, no need to take locks
|
|
int err = atomic_read(&socket->err);
|
|
|
|
struct block_request* req;
|
|
struct block_request* tmp;
|
|
|
|
struct list_head all_reqs;
|
|
INIT_LIST_HEAD(&all_reqs);
|
|
|
|
list_for_each_entry_safe(req, tmp, &socket->read, read_list) {
|
|
if (!list_empty(&req->write_list)) {
|
|
list_del(&req->write_list);
|
|
}
|
|
list_del(&req->read_list);
|
|
list_add(&req->write_list, &all_reqs);
|
|
}
|
|
list_for_each_entry_safe(req, tmp, &socket->write, write_list) {
|
|
list_del(&req->write_list);
|
|
list_add(&req->write_list, &all_reqs);
|
|
}
|
|
|
|
list_for_each_entry_safe(req, tmp, &all_reqs, write_list) {
|
|
atomic_cmpxchg(&req->err, 0, err);
|
|
ternfs_debug("completing request because of a socket winddown");
|
|
list_del(&req->write_list);
|
|
block_socket_log_req_completion(socket, req);
|
|
queue_work(ternfs_wq, &req->complete_work);
|
|
}
|
|
if (socket->sock != NULL) {
|
|
sock_release(socket->sock);
|
|
}
|
|
ternfs_debug("free socket(%p) %pI4:%d", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port));
|
|
kfree(socket);
|
|
}
|
|
}
|
|
|
|
// if cleanup was scheduled ref is passed to workqueue otherwise refcount is reduced
|
|
// it is not safe to use socket after calling this function
|
|
inline void queue_work_or_put(struct block_socket* socket) {
|
|
ternfs_debug("socket(%p) %pI4:%d, ref_count=%d", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount));
|
|
if (!queue_work(ternfs_fast_wq, &socket->write_work)) {
|
|
block_socket_put(socket);
|
|
}
|
|
}
|
|
|
|
// Needs to be called with read lock on socket->sock->sock->sk_callback_lock
|
|
// or from workqueue context (ternfs_fast_wq)
|
|
// Errors socket if not already in error state and schedules cleanup
|
|
// if cleanup was scheduled ref is passed to workqueue otherwise refcount is reduced
|
|
// it is not safe to use socket after calling this function
|
|
inline void error_socket(struct block_socket* socket, int err) {
|
|
ternfs_debug("socket(%p) %pI4:%d, setting error %d", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), err);
|
|
if (atomic_cmpxchg(&socket->err, 0, err) == 0) {
|
|
queue_work_or_put(socket);
|
|
} else {
|
|
block_socket_put(socket);
|
|
}
|
|
}
|
|
|
|
static void block_ops_exit(struct block_ops* ops) {
|
|
ternfs_debug("waiting for all sockets to be done");
|
|
|
|
struct block_socket* sock;
|
|
int bucket;
|
|
rcu_read_lock();
|
|
hash_for_each_rcu(ops->sockets, bucket, sock, hnode) {
|
|
ternfs_debug("scheduling winddown for %d", ntohs(sock->addr.sin_port));
|
|
block_socket_hold(sock);
|
|
error_socket(sock, -ECONNABORTED);
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
// wait for all of them to be freed by work
|
|
for (bucket = 0; bucket < BLOCK_SOCKET_BUCKETS; bucket++) {
|
|
ternfs_debug("waiting for bucket %d (len %d)", bucket, atomic_read(&ops->len[bucket]));
|
|
wait_event(ops->wqs[bucket], atomic_read(&ops->len[bucket]) == 0);
|
|
}
|
|
}
|
|
|
|
|
|
static void block_socket_state_check_locked(struct sock* sk) {
|
|
struct block_socket* socket = sk->sk_user_data;
|
|
ternfs_debug("socket(%p) %pI4:%d refcount %d state check triggered: %d", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount), sk->sk_state);
|
|
|
|
if (sk->sk_state == TCP_ESTABLISHED) { return; } // the only good one
|
|
|
|
// Right now we connect beforehand, so every change is trouble. Just
|
|
// kill the socket.
|
|
ternfs_debug("socket state check triggered: %d, will fail reqs", sk->sk_state);
|
|
// we are protected by callback lock but not holding a reference
|
|
// we need to take it for the error_socket call
|
|
block_socket_hold(socket);
|
|
error_socket(socket, -ECONNABORTED); // TODO we could have nicer errors
|
|
}
|
|
|
|
static void block_socket_sk_state_change(struct sock* sk) {
|
|
void (*saved_state_change)(struct sock *);
|
|
read_lock_bh(&sk->sk_callback_lock);
|
|
struct block_socket* socket = sk->sk_user_data;
|
|
if (socket != NULL) {
|
|
ternfs_debug("socket(%p) %pI4:%d refcount %d state change triggered: %d", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount), sk->sk_state);
|
|
saved_state_change = socket->saved_state_change;
|
|
block_socket_state_check_locked(sk);
|
|
} else {
|
|
saved_state_change = sk->sk_state_change;
|
|
}
|
|
read_unlock_bh(&sk->sk_callback_lock);
|
|
saved_state_change(sk);
|
|
}
|
|
|
|
static void block_socket_connect_sk_state_change(struct sock* sk) {
|
|
void (*saved_state_change)(struct sock *);
|
|
read_lock_bh(&sk->sk_callback_lock);
|
|
struct block_socket* socket = sk->sk_user_data;
|
|
if (socket != NULL) {
|
|
ternfs_debug("socket(%p) %pI4:%d refcount %d connect state change triggered: %d", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount), sk->sk_state);
|
|
if (sk->sk_state == TCP_ESTABLISHED) {
|
|
complete_all(&socket->sock_wait);
|
|
}
|
|
saved_state_change = socket->saved_state_change;
|
|
} else {
|
|
saved_state_change = sk->sk_state_change;
|
|
}
|
|
read_unlock_bh(&sk->sk_callback_lock);
|
|
|
|
saved_state_change(sk);
|
|
}
|
|
|
|
// Returns whether more than `timeout_jiffies` passed between the last
// read/write activity on the socket and `now`. If `set_error` is true and the
// socket timed out, its error is set to -ETIMEDOUT (unless it already has an
// error). If the list lock cannot be taken immediately the socket is
// considered active and false is returned.
inline bool block_socket_check_timeout(struct block_socket* socket, u64 now, u64 timeout_jiffies, bool set_error) {
    ternfs_debug("socket(%p) %pI4:%d refcount %d check timeout triggered", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount));
    if (!spin_trylock_bh(&socket->list_lock)) {
        // somebody holds the lock, so there is activity in progress
        return false;
    }
    u64 last_activity = max(socket->last_read_activity, socket->last_write_activity);
    // the last activity might be later than `now`, do not underflow
    u64 idle = (last_activity < now) ? now - last_activity : 0;
    bool timed_out = idle > timeout_jiffies;
    if (timed_out && set_error) {
        atomic_cmpxchg(&socket->err, 0, -ETIMEDOUT);
    }
    spin_unlock_bh(&socket->list_lock);
    return timed_out;
}
|
|
|
|
// Call periodically to check for inactive ones and schedules work on them to potentially time them out
|
|
static void timeout_sockets(struct block_ops* ops) {
|
|
int bucket;
|
|
struct block_socket* sock;
|
|
u64 now = get_jiffies_64();
|
|
|
|
rcu_read_lock();
|
|
hash_for_each_rcu(ops->sockets, bucket, sock, hnode) {
|
|
if (block_socket_check_timeout(sock, now, *ops->timeout_jiffies, false)) {
|
|
// We can't clean up here as timing out involves removing from hash and needs
|
|
// synchronization through rcu_synchronize
|
|
block_socket_hold(sock);
|
|
queue_work_or_put(sock);
|
|
}
|
|
}
|
|
rcu_read_unlock();
|
|
}
|
|
|
|
// check if socket needs cleanup due to error or timeout
|
|
// returns true if socket was cleaned up or partly cleaned up
|
|
static bool block_socket_cleanup(
|
|
struct block_ops* ops,
|
|
struct block_socket* socket,
|
|
bool check_timeout
|
|
) {
|
|
ternfs_debug("socket(%p) %pI4:%d refcount %d cleanup triggered", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount));
|
|
if (check_timeout) {
|
|
block_socket_check_timeout(socket, get_jiffies_64(), *ops->timeout_jiffies, true);
|
|
}
|
|
int err = atomic_read(&socket->err);
|
|
|
|
if (!err) {
|
|
ternfs_debug("socket(%p) %pI4:%d refcount %d no error, no cleanup needed", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount));
|
|
return false;
|
|
}
|
|
// if already terminal nothing to do
|
|
// no lock needed as this is protecte by write_work_active
|
|
if (socket->terminal) {
|
|
ternfs_debug("socket(%p) %pI4:%d refcount %d already terminal", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount));
|
|
goto cleaned_up;
|
|
}
|
|
|
|
socket->terminal = true;
|
|
ternfs_debug("socket(%p) %pI4:%d refcount %d winding down", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount));
|
|
|
|
// wait for callbacks to complete
|
|
write_lock_bh(&socket->sock->sk->sk_callback_lock);
|
|
ternfs_debug("socket(%p) %pI4:%d refcount %d callbacks locked", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount));
|
|
|
|
// We are killing the socket either due to error or timeout
|
|
// We already have callback lock, no callback could be running except for
|
|
// default linux callback which we don't care about.
|
|
socket->sock->sk->sk_data_ready = socket->saved_data_ready;
|
|
socket->sock->sk->sk_state_change = socket->saved_state_change;
|
|
socket->sock->sk->sk_write_space = socket->saved_write_space;
|
|
socket->sock->sk->sk_user_data = NULL;
|
|
write_unlock_bh(&socket->sock->sk->sk_callback_lock);
|
|
ternfs_debug("socket(%p) %pI4:%d refcount %d callbacks unlocked", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount));
|
|
|
|
|
|
|
|
u64 key = block_socket_key(&socket->addr);
|
|
int bucket = hash_min(key, BLOCK_SOCKET_BITS);
|
|
|
|
|
|
// First, remove socket from hashmap. After we're done with this,
|
|
// we know nobody's going to add new requests to this.
|
|
spin_lock(&ops->locks[bucket]);
|
|
hash_del_rcu(&socket->hnode); // tied to atomic_dec below
|
|
spin_unlock(&ops->locks[bucket]);
|
|
synchronize_rcu();
|
|
ternfs_debug("socket(%p) %pI4:%d refcount %d removed from hashmap", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount));
|
|
|
|
// Release the reference held by hash table
|
|
block_socket_put(socket);
|
|
|
|
// Adjust len, notify waiters
|
|
smp_mb__before_atomic();
|
|
atomic_dec(&ops->len[bucket]);
|
|
wake_up_all(&ops->wqs[bucket]);
|
|
|
|
// wake up waiters on socket removal from list
|
|
complete_all(&socket->sock_wait);
|
|
ternfs_debug("socket(%p) %pI4:%d refcount %d waiters woken up", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount));
|
|
cleaned_up:
|
|
atomic_set(&socket->write_work_active, 0);
|
|
block_socket_put(socket);
|
|
return true;
|
|
}
|
|
|
|
static void block_socket_write_work(
|
|
struct block_ops* ops,
|
|
struct block_socket* socket
|
|
) {
|
|
ternfs_debug("socket(%p) %pI4:%d refcount %d write work triggered", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), atomic_read(&socket->refcount));
|
|
|
|
if (atomic_cmpxchg(&socket->write_work_active, 0, 1) != 0) {
|
|
// already active but we need to requeue in case it is just about to exit
|
|
// and will not pick up new work
|
|
queue_work_or_put(socket);
|
|
return;
|
|
}
|
|
|
|
if (block_socket_cleanup(ops, socket, false)) {
|
|
return;
|
|
}
|
|
|
|
struct block_request* req;
|
|
u64 start = get_jiffies_64();
|
|
spin_lock_bh(&socket->list_lock);
|
|
req = list_first_entry_or_null(&socket->write, struct block_request, write_list);
|
|
socket->last_write_activity = get_jiffies_64();
|
|
spin_unlock_bh(&socket->list_lock);
|
|
|
|
while (req != NULL) {
|
|
if (get_jiffies_64() - start > MSECS_TO_JIFFIES(10)) {
|
|
atomic_set(&socket->write_work_active, 0);
|
|
queue_work_or_put(socket);
|
|
//yield to other sockets
|
|
return;
|
|
}
|
|
// Can't be done already, we just got the req
|
|
BUG_ON(req->left_to_write == 0);
|
|
|
|
if (atomic_read(&req->state) == BR_ST_QUEUEING) {
|
|
atomic_set(&req->state, BR_ST_WRITING);
|
|
atomic64_set(&req->timings[BR_ST_WRITING], get_jiffies_64());
|
|
}
|
|
|
|
int sent = ops->write_req(socket, req);
|
|
if (sent < 0) {
|
|
atomic_set(&socket->write_work_active, 0);
|
|
error_socket(socket, sent);
|
|
return;
|
|
}
|
|
BUG_ON(sent > req->left_to_write);
|
|
req->left_to_write -= sent;
|
|
if (req->left_to_write > 0) {
|
|
// we didn't manage to write out request, wait for more space
|
|
break;
|
|
}
|
|
|
|
// we are done with this requests update timers and get a new one
|
|
atomic64_set(&req->timings[BR_ST_WAITING_RESPONSE], get_jiffies_64());
|
|
// we need to compare as we could be in reading state already
|
|
atomic_cmpxchg(&req->state, BR_ST_WRITING, BR_ST_WAITING_RESPONSE);
|
|
struct block_request* completed_req = req;
|
|
spin_lock_bh(&socket->list_lock);
|
|
list_del(&req->write_list);
|
|
INIT_LIST_HEAD(&req->write_list); // mark it empty for cleanup function
|
|
|
|
// check if this request is first and update read activity time so we don't get timed out
|
|
if (req == list_first_entry_or_null(&socket->read, struct block_request, read_list)) {
|
|
socket->last_read_activity = get_jiffies_64();
|
|
}
|
|
bool scheduledCompletion = list_empty(&completed_req->read_list);
|
|
req = list_first_entry_or_null(&socket->write, struct block_request, write_list);
|
|
socket->last_write_activity = get_jiffies_64();
|
|
spin_unlock_bh(&socket->list_lock);
|
|
if (scheduledCompletion) {
|
|
block_socket_log_req_completion(socket, completed_req);
|
|
queue_work(ternfs_wq, &completed_req->complete_work);
|
|
}
|
|
}
|
|
atomic_set(&socket->write_work_active, 0);
|
|
block_socket_put(socket);
|
|
}
|
|
|
|
static void block_socket_sk_write_space(struct sock* sk) {
|
|
void (*old_write_space)(struct sock *);
|
|
read_lock_bh(&sk->sk_callback_lock);
|
|
struct block_socket* socket = (struct block_socket*)sk->sk_user_data;
|
|
if (socket != NULL) {
|
|
block_socket_hold(socket);
|
|
queue_work_or_put(socket);
|
|
old_write_space = socket->saved_write_space;
|
|
} else {
|
|
old_write_space = sk->sk_write_space;
|
|
}
|
|
read_unlock_bh(&sk->sk_callback_lock);
|
|
old_write_space(sk);
|
|
}
|
|
|
|
// Gets the socket, and acquires a reference to it.
|
|
static struct block_socket* get_block_socket(
|
|
struct block_ops* ops,
|
|
struct sockaddr_in* addr
|
|
) {
|
|
u64 key = block_socket_key(addr);
|
|
int bucket = hash_min(key, BLOCK_SOCKET_BITS);
|
|
|
|
rcu_read_lock();
|
|
struct block_socket* sock;
|
|
hlist_for_each_entry_rcu(sock, &ops->sockets[bucket], hnode) {
|
|
if (block_socket_key(&sock->addr) == key) {
|
|
block_socket_hold(sock);
|
|
rcu_read_unlock();
|
|
return sock;
|
|
}
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
ternfs_debug("socket to %pI4:%d not found, will create", &addr->sin_addr, ntohs(addr->sin_port));
|
|
|
|
sock = kmalloc(sizeof(struct block_socket), GFP_KERNEL);
|
|
int err = 0;
|
|
if (sock == NULL) {
|
|
err = -ENOMEM;
|
|
goto out_err;
|
|
}
|
|
|
|
memcpy(&sock->addr, addr, sizeof(struct sockaddr_in));
|
|
|
|
atomic_set(&sock->err, 0);
|
|
spin_lock_init(&sock->list_lock);
|
|
INIT_LIST_HEAD(&sock->write);
|
|
INIT_WORK(&sock->write_work, ops->write_work);
|
|
atomic_set(&sock->write_work_active, 0);
|
|
atomic_set(&sock->refcount, 1);
|
|
INIT_LIST_HEAD(&sock->read);
|
|
|
|
|
|
sock->terminal = false;
|
|
|
|
err = sock_create_kern(&init_net, PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock->sock);
|
|
if (err != 0) {
|
|
sock->sock = NULL;
|
|
block_socket_put(sock);
|
|
goto out_err;
|
|
}
|
|
|
|
init_completion(&sock->sock_wait);
|
|
|
|
// Put the minimal state_change callback in before connecting because it is
|
|
// needed to finalise the completion when connection becomes established.
|
|
//
|
|
// Note that
|
|
// 1. We save the callbacks just for hygiene -- I'm pretty sure this is actually
|
|
// not needed since we call the original callbacks in the state change callback
|
|
// anyway. Moreover it's nice to be able to remove the custom callbacks
|
|
// when disposing of a socket, since then we know that we're really done with
|
|
// that socket (see)
|
|
// 2. We could use TFO, but we currently don't. It shouldn't really matter anyway
|
|
// since we aggressively reuse connection, and also given that we have our own
|
|
// callbacks which does not play well at connection opening stage.
|
|
sock->saved_state_change = sock->sock->sk->sk_state_change;
|
|
sock->sock->sk->sk_user_data = sock;
|
|
sock->sock->sk->sk_state_change = block_socket_connect_sk_state_change;
|
|
|
|
err = kernel_connect(sock->sock, (struct sockaddr*)&sock->addr, sizeof(sock->addr), O_NONBLOCK);
|
|
if (err < 0) {
|
|
if (unlikely(err != -EINPROGRESS)) {
|
|
ternfs_warn("could not connect to block service at %pI4:%d: %d", &sock->addr.sin_addr, ntohs(sock->addr.sin_port), err);
|
|
write_lock(&sock->sock->sk->sk_callback_lock);
|
|
sock->sock->sk->sk_state_change = sock->saved_state_change;
|
|
sock->sock->sk->sk_user_data = NULL;
|
|
write_unlock(&sock->sock->sk->sk_callback_lock);
|
|
block_socket_put(sock);
|
|
goto out_err;
|
|
} else {
|
|
if (likely(sock->sock->sk->sk_state != TCP_ESTABLISHED)) {
|
|
ternfs_debug("waiting for connection to %pI4:%d to be established", &sock->addr.sin_addr, ntohs(sock->addr.sin_port));
|
|
err = wait_for_completion_timeout(&sock->sock_wait, ternfs_block_service_connect_timeout_jiffies);
|
|
if (err <= 0) {
|
|
ternfs_warn("timed out waiting for connection to %pI4:%d to be established", &sock->addr.sin_addr, ntohs(sock->addr.sin_port));
|
|
// cleanup socket
|
|
write_lock(&sock->sock->sk->sk_callback_lock);
|
|
sock->sock->sk->sk_state_change = sock->saved_state_change;
|
|
sock->sock->sk->sk_user_data = NULL;
|
|
write_unlock(&sock->sock->sk->sk_callback_lock);
|
|
block_socket_put(sock);
|
|
err = -ETIMEDOUT;
|
|
goto out_err;
|
|
}
|
|
// we need to re-init for removal waiters
|
|
reinit_completion(&sock->sock_wait);
|
|
}
|
|
}
|
|
}
|
|
|
|
sock->last_read_activity = sock->last_write_activity = get_jiffies_64();
|
|
|
|
// now insert
|
|
struct block_socket* other_sock;
|
|
|
|
// Important for the callbacks to happen after the connect, see
|
|
// comment about fastopen above.
|
|
//
|
|
// We cannot setup the callbacks before we take the read lock.
|
|
// They might delete the socket itself. We also cannot just
|
|
// add them afterwards, otherwise multiple processes racing
|
|
// to add the same socket might exit this function. So we take
|
|
// the callback locks now, add the socket, and then change the
|
|
// callbacks contextually to adding the socket to the hash
|
|
// map.
|
|
|
|
write_lock(&sock->sock->sk->sk_callback_lock);
|
|
|
|
spin_lock(&ops->locks[bucket]);
|
|
// Take the RCU lock while holding the socket lock so that
|
|
// we know it won't get removed by timeouts or such in the
|
|
// span of time between unlocking the socket lock and acquiring
|
|
// the RCU lock.
|
|
rcu_read_lock();
|
|
// first check if somebody else got there first
|
|
hlist_for_each_entry_rcu(other_sock, &ops->sockets[bucket], hnode) { // first we check if somebody didn't get to it first
|
|
if (block_socket_key(&other_sock->addr) == key) {
|
|
// somebody got here before us
|
|
// we need to restore callbacks before releasing since we already set them up
|
|
sock->sock->sk->sk_state_change = sock->saved_state_change;
|
|
sock->sock->sk->sk_user_data = NULL;
|
|
rcu_read_unlock();
|
|
spin_unlock(&ops->locks[bucket]);
|
|
write_unlock(&sock->sock->sk->sk_callback_lock);
|
|
ternfs_debug("multiple callers tried to get socket to %pI4:%d, dropping one", &other_sock->addr.sin_addr, ntohs(other_sock->addr.sin_port));
|
|
// call again rather than trying to `sock_release` with the
|
|
// RCU read lock held, this might not be safe in atomic context.
|
|
block_socket_put(sock);
|
|
return get_block_socket(ops, addr);
|
|
}
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
block_socket_hold(sock); // we are now sure we'll return it
|
|
|
|
// Put the new callbacks in
|
|
|
|
sock->saved_data_ready = sock->sock->sk->sk_data_ready;
|
|
sock->saved_write_space = sock->sock->sk->sk_write_space;
|
|
sock->sock->sk->sk_data_ready = ops->sk_data_ready;
|
|
sock->sock->sk->sk_state_change = block_socket_sk_state_change;
|
|
sock->sock->sk->sk_write_space = ops->sk_write_space;
|
|
|
|
// Insert the socket into the hash map -- anyone else which
|
|
// will find it will be good to do.
|
|
hlist_add_head_rcu(&sock->hnode, &ops->sockets[bucket]);
|
|
|
|
spin_unlock(&ops->locks[bucket]);
|
|
|
|
write_unlock(&sock->sock->sk->sk_callback_lock);
|
|
|
|
atomic_inc(&ops->len[bucket]);
|
|
|
|
// We are holding RCU, let's verify that we also have a socket.
|
|
BUG_ON(IS_ERR(sock));
|
|
return sock;
|
|
out_err:
|
|
return ERR_PTR(err);
|
|
}
|
|
|
|
static int block_socket_receive_req_locked(
|
|
int (*receive_single_req)(struct block_request* req, struct sk_buff* skb, unsigned int offset, size_t len),
|
|
read_descriptor_t* rd_desc,
|
|
struct sk_buff* skb,
|
|
unsigned int offset,
|
|
size_t len
|
|
) {
|
|
struct block_socket* socket = rd_desc->arg.data;
|
|
|
|
ternfs_debug("socket(%p) %pI4:%d offset=%u len=%lu", socket, &socket->addr.sin_addr, ntohs(socket->addr.sin_port), offset, len);
|
|
|
|
size_t len0 = len;
|
|
int err = atomic_read(&socket->err);
|
|
if (err) {
|
|
ternfs_debug("socket(%p) error %d, dropping data", socket, err);
|
|
return len0;
|
|
}
|
|
|
|
// line up first req
|
|
struct block_request* req;
|
|
spin_lock_bh(&socket->list_lock);
|
|
req = list_first_entry_or_null(&socket->read, struct block_request, read_list);
|
|
socket->last_read_activity = get_jiffies_64();
|
|
spin_unlock_bh(&socket->list_lock);
|
|
|
|
while (len > 0) {
|
|
if (atomic_read(&req->state) != BR_ST_READING) {
|
|
atomic_set(&req->state, BR_ST_READING);
|
|
atomic64_set(&req->timings[BR_ST_READING], get_jiffies_64());
|
|
}
|
|
int consumed = receive_single_req(req, skb, offset, len);
|
|
if (consumed < 0) {
|
|
block_socket_hold(socket);
|
|
error_socket(socket, consumed);
|
|
return len0;
|
|
}
|
|
if (consumed == 0) {
|
|
// we didn't make any progress, stop
|
|
break;
|
|
}
|
|
BUG_ON(consumed > req->left_to_read);
|
|
req->left_to_read -= consumed;
|
|
ternfs_debug("left_to_read=%u consumed=%d", req->left_to_read, consumed);
|
|
len -= consumed;
|
|
offset += consumed;
|
|
if (req->left_to_read == 0 || atomic_read(&req->err)) {
|
|
// this request is done remove it from list and schedule completion
|
|
struct block_request* completed_req = req;
|
|
spin_lock_bh(&socket->list_lock);
|
|
list_del(&req->read_list);
|
|
INIT_LIST_HEAD(&req->read_list);
|
|
|
|
req = list_first_entry_or_null(&socket->read, struct block_request, read_list);
|
|
socket->last_read_activity = get_jiffies_64();
|
|
bool scheduleCompletion = list_empty(&completed_req->write_list);
|
|
spin_unlock_bh(&socket->list_lock);
|
|
ternfs_debug("socket(%p) completed request, scheduling completion %d", socket, scheduleCompletion);
|
|
if (scheduleCompletion) {
|
|
block_socket_log_req_completion(socket, completed_req);
|
|
queue_work(ternfs_wq, &completed_req->complete_work);
|
|
}
|
|
}
|
|
}
|
|
|
|
// We have more data but no requests. This should not happen
|
|
BUG_ON(req == NULL && len > 0);
|
|
return len0 - len;
|
|
}
|
|
|
|
static void block_socket_sk_data_ready(
|
|
struct block_ops* ops,
|
|
struct sock* sk
|
|
) {
|
|
read_lock_bh(&sk->sk_callback_lock);
|
|
if (sk->sk_user_data != NULL) {
|
|
read_descriptor_t rd_desc;
|
|
// Taken from iscsi -- we set count to 1 because we want the network layer to
|
|
// hand us all the skbs that are available.
|
|
rd_desc.arg.data = sk->sk_user_data;
|
|
rd_desc.count = 1;
|
|
// while this is protected by callback lock the callback might want to error a socket
|
|
// and enqueue cleanup work which needs a reference. best to take it here
|
|
tcp_read_sock(sk, &rd_desc, ops->receive_req);
|
|
block_socket_state_check_locked(sk);
|
|
}
|
|
read_unlock_bh(&sk->sk_callback_lock);
|
|
}
|
|
|
|
static struct block_socket* get_blockservice_socket(
|
|
struct block_ops* ops,
|
|
struct ternfs_block_service* bs
|
|
) {
|
|
int i, block_ip;
|
|
struct sockaddr_in addr;
|
|
struct block_socket* sock;
|
|
|
|
// Try both ips (if we have them) before giving up
|
|
block_ip = WHICH_BLOCK_IP++;
|
|
for (i = 0; i < 1 + (bs->port2 != 0); i++, block_ip++) { // we might not have a second address
|
|
addr.sin_family = AF_INET;
|
|
if (bs->port2 == 0 || block_ip&1) {
|
|
addr.sin_addr.s_addr = htonl(bs->ip1);
|
|
addr.sin_port = htons(bs->port1);
|
|
} else {
|
|
addr.sin_addr.s_addr = htonl(bs->ip2);
|
|
addr.sin_port = htons(bs->port2);
|
|
}
|
|
|
|
sock = get_block_socket(ops, &addr);
|
|
if (!IS_ERR(sock)) {
|
|
goto out;
|
|
}
|
|
}
|
|
out:
|
|
return sock;
|
|
}
|
|
|
|
static int block_socket_start_req(
|
|
struct block_ops* ops,
|
|
struct block_request* req,
|
|
struct ternfs_block_service* bs,
|
|
u32 left_to_write,
|
|
u32 left_to_read
|
|
) {
|
|
int err, i;
|
|
struct block_socket* sock;
|
|
|
|
atomic_set(&req->err, 0);
|
|
for (i = 0; i < BR_ST_MAX; ++i) {
|
|
atomic64_set(&req->timings[i], 0);
|
|
}
|
|
atomic_set(&req->state, BR_ST_QUEUEING);
|
|
atomic64_set(&req->timings[BR_ST_QUEUEING], get_jiffies_64());
|
|
|
|
req->left_to_write = left_to_write;
|
|
req->left_to_read = left_to_read;
|
|
|
|
|
|
retry_get_socket:
|
|
// As the very last thing, try to get the socket with increased refcount.
|
|
sock = get_blockservice_socket(ops, bs);
|
|
if (IS_ERR(sock)) {
|
|
err = PTR_ERR(sock);
|
|
goto out_err;
|
|
}
|
|
err = atomic_read(&sock->err);
|
|
|
|
// We have a socket We need to place the request in the queue,
|
|
// and schedule work. We need to put it in both write and read queue
|
|
// to avoid a race where we get response before we move it from write
|
|
// to read queue
|
|
if (err) {
|
|
// current socket is in errored state. Wait for removal. Socket will not get destroyed
|
|
// as hold a reference to it.
|
|
err = wait_for_completion_timeout(&sock->sock_wait, MSECS_TO_JIFFIES(100));
|
|
// we timed out or got awoken in any case drop the reference
|
|
block_socket_put(sock);
|
|
if (err <= 0) {
|
|
ternfs_warn("timed out waiting for socked to bs=%016llx to be cleaned up", bs->id);
|
|
err = -ETIMEDOUT;
|
|
goto out_err;
|
|
}
|
|
goto retry_get_socket;
|
|
}
|
|
|
|
spin_lock_bh(&sock->list_lock);
|
|
if (list_first_entry_or_null(&sock->write, struct block_request, write_list) == NULL) {
|
|
// Prevent timeout if first
|
|
sock->last_write_activity = get_jiffies_64();
|
|
}
|
|
list_add_tail(&req->write_list, &sock->write);
|
|
list_add_tail(&req->read_list, &sock->read);
|
|
spin_unlock_bh(&sock->list_lock);
|
|
|
|
queue_work_or_put(sock);
|
|
|
|
return 0;
|
|
|
|
out_err:
|
|
ternfs_info("couldn't start block request, err=%d", err);
|
|
return err;
|
|
}
|
|
|
|
static int drop_sockets(struct block_ops* ops) {
|
|
int dropped = 0;
|
|
int bucket;
|
|
struct block_socket* sock;
|
|
|
|
rcu_read_lock();
|
|
hash_for_each_rcu(ops->sockets, bucket, sock, hnode) {
|
|
dropped++;
|
|
// We put EAGAIN here so that the caller can restart the
|
|
// request if it wants to (since we didn't really encounter
|
|
// any error). We don't currently do this though.
|
|
block_socket_hold(sock);
|
|
error_socket(sock, -EAGAIN);
|
|
}
|
|
rcu_read_unlock();
|
|
|
|
ternfs_info("dropped %d sockets", dropped);
|
|
|
|
return dropped;
|
|
}
|
|
|
|
// Fetch
|
|
// --------------------------------------------------------------------
|
|
|
|
struct fetch_request {
|
|
struct block_request breq;
|
|
|
|
void (*callback)(void* data, u64 block_id, struct list_head* pages, int err);
|
|
void *data;
|
|
struct list_head pages;
|
|
|
|
// Req info
|
|
u64 file_id;
|
|
u64 block_service_id;
|
|
u64 block_id;
|
|
u32 block_crc;
|
|
u32 offset;
|
|
u32 count;
|
|
u32 bytes_fetched;
|
|
|
|
// When reading blocks with crcs we need to alternate between reading pages
|
|
// and CRCs. next_read_size will be either PAGE_SIZE or PAGE_CRC_BYTES.
|
|
u32 next_read_size;
|
|
|
|
// The cursor inside the current page.
|
|
u16 page_offset;
|
|
// How much we've read of the header.
|
|
u8 header_read;
|
|
static_assert(TERNFS_FETCH_BLOCK_RESP_SIZE == 0);
|
|
union {
|
|
char buf[TERNFS_BLOCKS_RESP_HEADER_SIZE + 2];
|
|
struct {
|
|
__le32 protocol;
|
|
u8 kind;
|
|
__le16 error;
|
|
} __attribute__((packed)) data;
|
|
} header;
|
|
};
|
|
|
|
#define get_fetch_request(req) container_of(req, struct fetch_request, breq)
|
|
|
|
static void fetch_complete(struct block_request* breq) {
|
|
struct fetch_request* req = get_fetch_request(breq);
|
|
// When fetching pages with crc, the pages list needs to be returned in the
|
|
// exact same order as it was received. We need to restore it back if
|
|
// reading didn't finish and not all the items got rotated.
|
|
struct page* first_page = list_first_entry(&req->pages, struct page, lru);
|
|
struct page *first_seen = first_page;
|
|
struct page* last_page = list_last_entry(&req->pages, struct page, lru);
|
|
while(last_page->index < first_page->index || first_page->index == (unsigned long)-1) {
|
|
list_rotate_left(&req->pages);
|
|
ternfs_debug("rotating list left: %d %d, first_page:%ld, last_page:%ld", req->count, req->bytes_fetched, first_page->index, last_page->index);
|
|
first_page = list_first_entry(&req->pages, struct page, lru);
|
|
last_page = list_last_entry(&req->pages, struct page, lru);
|
|
if (first_page == first_seen) {
|
|
// Can only happen when page index is -1.
|
|
// We've gone a full circle and we are still back at page out of range.
|
|
break;
|
|
}
|
|
}
|
|
ternfs_debug("block fetch complete block_id=%016llx err=%d", req->block_id, atomic_read(&req->breq.err));
|
|
req->callback(req->data, req->block_id, &req->pages, atomic_read(&req->breq.err));
|
|
|
|
ternfs_debug("block released socket block_id=%016llx err=%d", req->block_id, atomic_read(&req->breq.err));
|
|
put_pages_list(&req->pages);
|
|
kmem_cache_free(fetch_request_cachep, req);
|
|
}
|
|
|
|
static void fetch_complete_work(struct work_struct* work) {
|
|
fetch_complete(container_of(work, struct block_request, complete_work));
|
|
}
|
|
|
|
static void fetch_request_constructor(void* ptr) {
|
|
struct fetch_request* req = (struct fetch_request*)ptr;
|
|
|
|
INIT_LIST_HEAD(&req->pages);
|
|
INIT_WORK(&req->breq.complete_work, fetch_complete_work);
|
|
}
|
|
|
|
int ternfs_drop_fetch_block_sockets(void) {
|
|
return drop_sockets(&fetch_block_pages_witch_crc_ops);
|
|
}
|
|
|
|
// Returns a negative result if the socket has to be considered corrupted.
|
|
// If a "recoverable" error occurs (e.g. the server just replies with an error)
|
|
// `req->err` will be filled in and receiving can continue. If an unrecoverable
|
|
// error occurs `req->complete.err` will be filled in also, but it's expected that the
|
|
// caller stops operating on this socket.
|
|
static int fetch_receive_single_req(
|
|
struct block_request* breq,
|
|
struct sk_buff* skb,
|
|
unsigned int offset,
|
|
size_t len
|
|
) {
|
|
struct fetch_request* req = get_fetch_request(breq);
|
|
size_t header_read;
|
|
{
|
|
size_t len0 = len;
|
|
|
|
BUG_ON(req->breq.left_to_read == 0);
|
|
|
|
#define HEADER_COPY(buf, count) ({ \
|
|
int read = count > 0 ? ternfs_skb_copy(buf, skb, offset, count) : 0; \
|
|
offset += read; \
|
|
len -= read; \
|
|
req->header_read += read; \
|
|
read; \
|
|
})
|
|
|
|
// Header
|
|
{
|
|
int header_left = TERNFS_BLOCKS_RESP_HEADER_SIZE - (int)req->header_read;
|
|
int read = HEADER_COPY(req->header.buf + req->header_read, header_left);
|
|
header_left -= read;
|
|
if (header_left > 0) { return len0-len; }
|
|
}
|
|
|
|
// Protocol check
|
|
if (unlikely(le32_to_cpu(req->header.data.protocol) != TERNFS_BLOCKS_RESP_PROTOCOL_VERSION)) {
|
|
ternfs_info("bad blocks resp protocol, expected %*pE, got %*pE", 4, &TERNFS_BLOCKS_RESP_PROTOCOL_VERSION, 4, &req->header.data.protocol);
|
|
return ternfs_error_to_linux(TERNFS_ERR_MALFORMED_RESPONSE);
|
|
}
|
|
|
|
// Error check
|
|
if (unlikely(req->header.data.kind == 0)) {
|
|
int error_left = (TERNFS_BLOCKS_RESP_HEADER_SIZE + 2) - (int)req->header_read;
|
|
int read = HEADER_COPY(req->header.buf + req->header_read, error_left);
|
|
error_left -= read;
|
|
if (error_left > 0) { return len0-len; }
|
|
// we got an error "nicely", the socket can carry on living.
|
|
atomic_cmpxchg(&req->breq.err, 0, ternfs_error_to_linux(le16_to_cpu(req->header.data.error)));
|
|
return len0-len;
|
|
}
|
|
|
|
header_read = len0-len;
|
|
|
|
BUG_ON(header_read > req->breq.left_to_read);
|
|
|
|
#undef HEADER_COPY
|
|
}
|
|
|
|
// Actual data copy
|
|
|
|
struct page* page = list_first_entry(&req->pages, struct page, lru);
|
|
BUG_ON(!page);
|
|
char* page_ptr = kmap_atomic(page);
|
|
|
|
struct skb_seq_state seq;
|
|
u32 read_len = min(req->breq.left_to_read - (u32)header_read, (u32)len);
|
|
skb_prepare_seq_read(skb, offset, offset + read_len, &seq);
|
|
u32 block_bytes_read = 0;
|
|
while (block_bytes_read < req->breq.left_to_read) {
|
|
const u8* data;
|
|
u32 avail = skb_seq_read(block_bytes_read, &data, &seq);
|
|
if (avail == 0) { break; }
|
|
avail = min(avail, read_len - block_bytes_read); // avail can exceed the len upper bound
|
|
|
|
while (avail) {
|
|
u32 this_len = min3(avail, (u32)req->next_read_size - req->page_offset, req->breq.left_to_read - block_bytes_read);
|
|
ternfs_debug("read %d bytes of %d", this_len, req->next_read_size);
|
|
if (req->next_read_size == PAGE_SIZE) {
|
|
req->bytes_fetched += this_len;
|
|
memcpy(page_ptr + req->page_offset, data, this_len);
|
|
} else {
|
|
memcpy((u8 *)&page->private+req->page_offset, data, this_len);
|
|
}
|
|
data += this_len;
|
|
req->page_offset += this_len;
|
|
avail -= this_len;
|
|
block_bytes_read += this_len;
|
|
|
|
if (req->page_offset >= req->next_read_size) {
|
|
BUG_ON(req->page_offset > req->next_read_size);
|
|
req->page_offset = 0;
|
|
if (req->next_read_size == PAGE_SIZE) {
|
|
if (req->block_crc > 0) {
|
|
// We are handling BlockWithCrc response, so CRC will follow this page
|
|
req->next_read_size = PAGE_CRC_BYTES;
|
|
// We only want to rotate the list and map new page
|
|
// when we are not expecting CRC after each page. When
|
|
// parsing BlockWithCrc response, the list will be
|
|
// rotated and new page mapped when we finish reading the CRC.
|
|
continue;
|
|
}
|
|
} else {
|
|
// We have finished reading the CRC of the last page
|
|
// the crc itself was stored in page_crc->crc in the read loop.
|
|
req->next_read_size = PAGE_SIZE;
|
|
}
|
|
kunmap_atomic(page_ptr);
|
|
list_rotate_left(&req->pages);
|
|
page = list_first_entry(&req->pages, struct page, lru);
|
|
page_ptr = kmap_atomic(page);
|
|
}
|
|
}
|
|
}
|
|
|
|
// We might have not terminated with avail == 0, but because we're done with this request
|
|
skb_abort_seq_read(&seq);
|
|
|
|
BUG_ON(header_read + block_bytes_read > req->breq.left_to_read);
|
|
|
|
kunmap_atomic(page_ptr);
|
|
|
|
return header_read + block_bytes_read;
|
|
}
|
|
|
|
// --------------------------------------------------------------------
|
|
// Fetch pages with crc
|
|
// --------------------------------------------------------------------
|
|
|
|
static int fetch_block_pages_with_crc_write_req(struct block_socket* socket, struct block_request* breq) {
|
|
struct fetch_request* req = get_fetch_request(breq);
|
|
|
|
char req_msg_buf[TERNFS_BLOCKS_REQ_HEADER_SIZE + TERNFS_FETCH_BLOCK_WITH_CRC_REQ_SIZE];
|
|
BUG_ON(breq->left_to_write > sizeof(req_msg_buf));
|
|
|
|
// we're not done, fill in the req
|
|
char* req_msg = req_msg_buf;
|
|
put_unaligned_le32(TERNFS_BLOCKS_REQ_PROTOCOL_VERSION, req_msg); req_msg += 4;
|
|
put_unaligned_le64(req->block_service_id, req_msg); req_msg += 8;
|
|
*(u8*)req_msg = TERNFS_BLOCKS_FETCH_BLOCK_WITH_CRC; req_msg += 1;
|
|
{
|
|
struct ternfs_bincode_put_ctx ctx = {
|
|
.start = req_msg,
|
|
.cursor = req_msg,
|
|
.end = req_msg_buf + sizeof(req_msg_buf),
|
|
};
|
|
ternfs_fetch_block_with_crc_req_put_start(&ctx, start);
|
|
ternfs_fetch_block_with_crc_req_put_file_id_unused(&ctx, start, req_file_id, 0);
|
|
ternfs_fetch_block_with_crc_req_put_block_id(&ctx, req_file_id, req_block_id, req->block_id);
|
|
ternfs_fetch_block_with_crc_req_put_block_crc_unused(&ctx, req_block_id, req_block_crc, 0);
|
|
ternfs_fetch_block_with_crc_req_put_offset(&ctx, req_block_crc, req_offset, req->offset);
|
|
ternfs_fetch_block_with_crc_req_put_count(&ctx, req_offset, req_count, req->count);
|
|
ternfs_fetch_block_with_crc_req_put_end(ctx, req_count, end);
|
|
BUG_ON(ctx.cursor != ctx.end);
|
|
}
|
|
|
|
// send message
|
|
struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
|
|
ternfs_debug("sending fetch block req to %pI4:%d, bs=%016llx block_id=%016llx", &socket->addr.sin_addr, ntohs(socket->addr.sin_port), req->block_service_id, req->block_id);
|
|
struct kvec iov = {
|
|
.iov_base = req_msg_buf + (sizeof(req_msg_buf) - breq->left_to_write),
|
|
.iov_len = breq->left_to_write,
|
|
};
|
|
int err = kernel_sendmsg(socket->sock, &msg, &iov, 1, iov.iov_len);
|
|
return err == -EAGAIN ? 0 : err;
|
|
}
|
|
|
|
static void fetch_block_pages_with_crc_write_work(struct work_struct* work) {
|
|
block_socket_write_work(&fetch_block_pages_witch_crc_ops, container_of(work, struct block_socket, write_work));
|
|
}
|
|
|
|
// Receive callback for fetch sockets: the generic receive loop, parsing each
// request with `fetch_receive_single_req`.
static int fetch_block_pages_with_crc_receive_req(read_descriptor_t* rd_desc, struct sk_buff* skb, unsigned int offset, size_t len) {
    int consumed = block_socket_receive_req_locked(fetch_receive_single_req, rd_desc, skb, offset, len);

    return consumed;
}
|
|
|
|
static void fetch_block_pages_with_crc_sk_data_ready(struct sock* sk) {
|
|
block_socket_sk_data_ready(&fetch_block_pages_witch_crc_ops, sk);
|
|
}
|
|
|
|
int ternfs_fetch_block_pages_with_crc(
|
|
void (*callback)(void* data, u64 block_id, struct list_head* pages, int err),
|
|
void* data,
|
|
struct ternfs_block_service* bs,
|
|
struct list_head* pages,
|
|
u64 file_id,
|
|
u64 block_id,
|
|
u32 block_crc,
|
|
u32 offset,
|
|
u32 count
|
|
) {
|
|
int err, i;
|
|
|
|
if (count%PAGE_SIZE) {
|
|
// we rely on the pages being filled neatly in the block fetch code
|
|
// (we could change this easily, but not needed for now)
|
|
ternfs_warn("cannot read not page-sized block segment: %u", count);
|
|
return -EIO;
|
|
}
|
|
|
|
// can't read
|
|
if (unlikely(bs->flags & TERNFS_BLOCK_SERVICE_DONT_READ)) {
|
|
ternfs_debug("could not fetch block given block flags %02x", bs->flags);
|
|
return -EIO;
|
|
}
|
|
|
|
struct fetch_request* req = kmem_cache_alloc(fetch_request_cachep, GFP_KERNEL);
|
|
if (!req) { err = -ENOMEM; goto out_err; }
|
|
|
|
req->callback = callback;
|
|
req->data = data;
|
|
req->file_id = file_id;
|
|
req->block_service_id = bs->id;
|
|
req->block_id = block_id;
|
|
req->block_crc = block_crc;
|
|
req->offset = offset;
|
|
req->count = count;
|
|
req->bytes_fetched = 0;
|
|
req->page_offset = 0;
|
|
req->header_read = 0;
|
|
req->next_read_size = PAGE_SIZE;
|
|
|
|
BUG_ON(!list_empty(&req->pages));
|
|
for (i = 0; i < count/PAGE_SIZE; i++) {
|
|
struct page *page = list_first_entry_or_null(pages, struct page, lru);
|
|
BUG_ON(page == NULL);
|
|
list_del(&page->lru);
|
|
page->private = 0;
|
|
list_add_tail(&page->lru, &req->pages);
|
|
}
|
|
BUG_ON(!list_empty(pages));
|
|
err = block_socket_start_req(
|
|
&fetch_block_pages_witch_crc_ops,
|
|
&req->breq,
|
|
bs,
|
|
TERNFS_BLOCKS_REQ_HEADER_SIZE + TERNFS_FETCH_BLOCK_WITH_CRC_REQ_SIZE,
|
|
TERNFS_BLOCKS_RESP_HEADER_SIZE + TERNFS_FETCH_BLOCK_WITH_CRC_RESP_SIZE + count + (count / PAGE_SIZE) * PAGE_CRC_BYTES
|
|
);
|
|
if (err) {
|
|
goto out_err_pages;
|
|
}
|
|
|
|
return 0;
|
|
|
|
out_err_pages:
|
|
// Transfer pages back to the caller.
|
|
for (i = 0; i < count/PAGE_SIZE; i++) {
|
|
struct page *page = list_first_entry_or_null(&req->pages, struct page, lru);
|
|
BUG_ON(page == NULL);
|
|
list_del(&page->lru);
|
|
page->private = 0;
|
|
list_add_tail(&page->lru, pages);
|
|
}
|
|
kmem_cache_free(fetch_request_cachep, req);
|
|
out_err:
|
|
ternfs_info("couldn't start fetch block request, err=%d", err);
|
|
return err;
|
|
}
|
|
|
|
// Write
|
|
// --------------------------------------------------------------------
|
|
|
|
struct write_request {
|
|
struct block_request breq;
|
|
|
|
// Called when request is done
|
|
void (*callback)(void* data, struct list_head* pages, u64 block_id, u64 proof, int err);
|
|
void* data;
|
|
|
|
struct list_head pages;
|
|
|
|
// Req info
|
|
u64 block_service_id;
|
|
u64 block_id;
|
|
u32 crc;
|
|
u32 size;
|
|
u64 certificate;
|
|
|
|
static_assert(TERNFS_WRITE_BLOCK_RESP_SIZE > 2);
|
|
union {
|
|
char buf[TERNFS_BLOCKS_RESP_HEADER_SIZE+TERNFS_WRITE_BLOCK_RESP_SIZE];
|
|
struct {
|
|
__le32 protocol;
|
|
u8 kind;
|
|
union {
|
|
__le16 error;
|
|
__be64 proof;
|
|
};
|
|
} __attribute__((packed)) data;
|
|
} write_resp;
|
|
};
|
|
|
|
#define get_write_request(req) container_of(req, struct write_request, breq)
|
|
|
|
static void write_block_complete(struct block_request* breq) {
|
|
struct write_request* req = get_write_request(breq);
|
|
|
|
// might be that we didn't consume all the pages -- in which case we need
|
|
// to keep rotating until we're there.
|
|
while (list_first_entry(&req->pages, struct page, lru)->private == 0) {
|
|
list_rotate_left(&req->pages);
|
|
}
|
|
req->callback(req->data, &req->pages, req->block_id, be64_to_cpu(req->write_resp.data.proof), atomic_read(&req->breq.err));
|
|
|
|
BUG_ON(!list_empty(&req->pages)); // the callback must acquire this
|
|
|
|
// What if the socket is being read right now? There's probably a race with
|
|
// that and the free here.
|
|
//
|
|
// This function is called in three ways:
|
|
// * `block_socket_receive_req_locked` enqueues the write complete work on ternfs_wq
|
|
// * `block_socket_write_req_locked` enqueues the write complete work on ternfs_wq
|
|
// * `cleanup_work` enqueues the write complete on ternfs_wq
|
|
//
|
|
// `cleanup_work` changes the callbacks, so that by the time
|
|
// we get to call this function all the callbacks s must be done.
|
|
//
|
|
// Moreover `block_socket_write/receive_req_locked` removes the request from the queue as it
|
|
// enqueues the work. So this should be safe.
|
|
|
|
kmem_cache_free(write_request_cachep, req);
|
|
}
|
|
|
|
static void write_block_complete_work(struct work_struct* work) {
|
|
write_block_complete(container_of(work, struct block_request, complete_work));
|
|
}
|
|
|
|
static void write_request_constructor(void* ptr) {
|
|
struct write_request* req = (struct write_request*)ptr;
|
|
|
|
INIT_LIST_HEAD(&req->pages);
|
|
INIT_WORK(&req->breq.complete_work, write_block_complete_work);
|
|
}
|
|
|
|
int ternfs_drop_write_block_sockets(void) {
|
|
return drop_sockets(&write_block_ops);
|
|
}
|
|
|
|
static int write_block_write_req(struct block_socket* socket, struct block_request* breq) {
|
|
struct write_request* req = get_write_request(breq);
|
|
int err = 0;
|
|
|
|
u32 write_size = TERNFS_BLOCKS_REQ_HEADER_SIZE + TERNFS_WRITE_BLOCK_REQ_SIZE + req->size;
|
|
u32 write_req_written0 = write_size - breq->left_to_write;
|
|
u32 write_req_written = write_req_written0;
|
|
|
|
#define BLOCK_WRITE_EXIT return write_req_written - write_req_written0;
|
|
|
|
// Still writing request -- we just serialize the buffer each time and send it out
|
|
while (write_req_written < TERNFS_BLOCKS_REQ_HEADER_SIZE+TERNFS_WRITE_BLOCK_REQ_SIZE) {
|
|
char req_msg_buf[TERNFS_BLOCKS_REQ_HEADER_SIZE+TERNFS_WRITE_BLOCK_REQ_SIZE];
|
|
char* req_msg = req_msg_buf;
|
|
put_unaligned_le32(TERNFS_BLOCKS_REQ_PROTOCOL_VERSION, req_msg); req_msg += 4;
|
|
put_unaligned_le64(req->block_service_id, req_msg); req_msg += 8;
|
|
*(u8*)req_msg = TERNFS_BLOCKS_WRITE_BLOCK; req_msg += 1;
|
|
{
|
|
struct ternfs_bincode_put_ctx ctx = {
|
|
.start = req_msg,
|
|
.cursor = req_msg,
|
|
.end = req_msg_buf + sizeof(req_msg_buf),
|
|
};
|
|
ternfs_write_block_req_put_start(&ctx, start);
|
|
ternfs_write_block_req_put_block_id(&ctx, start, req_block_id, req->block_id);
|
|
ternfs_write_block_req_put_crc(&ctx, req_block_id, req_crc, req->crc);
|
|
ternfs_write_block_req_put_size(&ctx, req_crc, req_size, req->size);
|
|
ternfs_write_block_req_put_certificate(&ctx, req_size, req_certificate, req->certificate);
|
|
ternfs_write_block_req_put_end(ctx, req_certificate, end);
|
|
BUILD_BUG_ON(ctx.cursor != ctx.end);
|
|
}
|
|
|
|
// send message
|
|
struct msghdr msg = {
|
|
.msg_flags = MSG_MORE | MSG_DONTWAIT,
|
|
};
|
|
ternfs_debug("sending write block req to %pI4:%d", &socket->addr.sin_addr, ntohs(socket->addr.sin_port));
|
|
struct kvec iov = {
|
|
.iov_base = req_msg_buf + write_req_written,
|
|
.iov_len = sizeof(req_msg_buf) - write_req_written,
|
|
};
|
|
int sent = kernel_sendmsg(socket->sock, &msg, &iov, 1, iov.iov_len);
|
|
if (sent == -EAGAIN) { BLOCK_WRITE_EXIT } // done for now, will be rescheduled by `sk_write_space`
|
|
if (sent < 0) { err = sent; goto out_err; }
|
|
write_req_written += sent;
|
|
}
|
|
|
|
// Write the pages out
|
|
while (write_req_written < write_size) {
|
|
struct page* page = list_first_entry(&req->pages, struct page, lru);
|
|
int sent = kernel_sendpage(
|
|
socket->sock, page,
|
|
page->index,
|
|
min((u32)(PAGE_SIZE - page->index), write_size - write_req_written),
|
|
MSG_MORE | MSG_DONTWAIT
|
|
);
|
|
if (sent == -EAGAIN) {
|
|
// done for now, will be rescheduled by `sk_write_space`
|
|
break;
|
|
}
|
|
if (sent < 0) { err = sent; goto out_err; }
|
|
page->index += sent;
|
|
write_req_written += sent;
|
|
if (page->index >= PAGE_SIZE) {
|
|
BUG_ON(page->index != PAGE_SIZE);
|
|
list_rotate_left(&req->pages);
|
|
if (write_size - write_req_written) { // otherwise this will be null
|
|
list_first_entry(&req->pages, struct page, lru)->index = 0;
|
|
list_first_entry(&req->pages, struct page, lru)->private = 0;
|
|
}
|
|
}
|
|
}
|
|
BLOCK_WRITE_EXIT;
|
|
|
|
out_err:
|
|
ternfs_debug("block write failed err=%d", err);
|
|
BUG_ON(err == 0);
|
|
atomic_cmpxchg(&breq->err, 0, err);
|
|
return err;
|
|
|
|
#undef BLOCK_WRITE_EXIT
|
|
}
|
|
|
|
static void write_block_write_work(struct work_struct* work) {
|
|
struct block_socket* socket = container_of(work, struct block_socket, write_work);
|
|
block_socket_write_work(&write_block_ops, socket);
|
|
}
|
|
|
|
static int write_block_receive_single_req(
|
|
struct block_request* breq,
|
|
struct sk_buff* skb,
|
|
unsigned int offset,
|
|
size_t len0
|
|
) {
|
|
struct write_request* req = get_write_request(breq);
|
|
size_t len = len0;
|
|
u32 write_resp_read = (TERNFS_BLOCKS_RESP_HEADER_SIZE + TERNFS_WRITE_BLOCK_RESP_SIZE) - req->breq.left_to_read;
|
|
|
|
#define BLOCK_WRITE_EXIT(i) do { \
|
|
if (i < 0) { \
|
|
atomic_cmpxchg(&req->breq.err, 0, i); \
|
|
} \
|
|
return i; \
|
|
} while (0)
|
|
|
|
// Write resp
|
|
|
|
#define HEADER_COPY(buf, count) ({ \
|
|
int read = count > 0 ? ternfs_skb_copy(buf, skb, offset, count) : 0; \
|
|
offset += read; \
|
|
len -= read; \
|
|
write_resp_read += read; \
|
|
read; \
|
|
})
|
|
|
|
// Header
|
|
{
|
|
int header_left = TERNFS_BLOCKS_RESP_HEADER_SIZE - (int)write_resp_read;
|
|
int read = HEADER_COPY(req->write_resp.buf + write_resp_read, header_left);
|
|
header_left -= read;
|
|
if (header_left > 0) { BLOCK_WRITE_EXIT(len0-len); }
|
|
}
|
|
|
|
// Protocol check
|
|
if (unlikely(le32_to_cpu(req->write_resp.data.protocol) != TERNFS_BLOCKS_RESP_PROTOCOL_VERSION)) {
|
|
ternfs_info("bad blocks resp protocol, expected %*pE, got %*pE", 4, &TERNFS_BLOCKS_RESP_PROTOCOL_VERSION, 4, &req->write_resp.data.protocol);
|
|
BLOCK_WRITE_EXIT(ternfs_error_to_linux(TERNFS_ERR_MALFORMED_RESPONSE));
|
|
}
|
|
|
|
// Error check
|
|
if (unlikely(req->write_resp.data.kind == 0)) {
|
|
int error_left = (TERNFS_BLOCKS_RESP_HEADER_SIZE + 2) - (int)write_resp_read;
|
|
int read = HEADER_COPY(req->write_resp.buf + write_resp_read, error_left);
|
|
error_left -= read;
|
|
if (error_left > 0) { BLOCK_WRITE_EXIT(len0-len); }
|
|
ternfs_info("writing block to bs=%016llx block_id=%016llx failed=%u", req->block_service_id, req->block_id, req->write_resp.data.error);
|
|
// We immediately start writing, so any error here means that the socket is kaput
|
|
int err = ternfs_error_to_linux(le16_to_cpu(req->write_resp.data.error));
|
|
BLOCK_WRITE_EXIT(err);
|
|
}
|
|
|
|
// We can finally get the proof
|
|
{
|
|
int proof_left = (TERNFS_BLOCKS_RESP_HEADER_SIZE + 8) - (int)write_resp_read;
|
|
int read = HEADER_COPY(req->write_resp.buf + write_resp_read, proof_left);
|
|
proof_left -= read;
|
|
if (proof_left > 0) {
|
|
BLOCK_WRITE_EXIT(len0-len);
|
|
}
|
|
}
|
|
|
|
#undef HEADER_COPY
|
|
|
|
BLOCK_WRITE_EXIT(len0-len);
|
|
|
|
#undef BLOCK_WRITE_EXIT
|
|
}
|
|
|
|
// Receive callback for write sockets: the generic receive loop, parsing each
// request with `write_block_receive_single_req`.
static int write_block_receive_req(read_descriptor_t* rd_desc, struct sk_buff* skb, unsigned int offset, size_t len) {
    int consumed = block_socket_receive_req_locked(write_block_receive_single_req, rd_desc, skb, offset, len);

    return consumed;
}
|
|
|
|
static void write_block_sk_data_ready(struct sock* sk) {
|
|
block_socket_sk_data_ready(&write_block_ops, sk);
|
|
}
|
|
|
|
int ternfs_write_block(
|
|
void (*callback)(void* data, struct list_head* pages, u64 block_id, u64 proof, int err),
|
|
void* data,
|
|
struct ternfs_block_service* bs,
|
|
u64 block_id,
|
|
u64 certificate,
|
|
u32 size,
|
|
u32 crc,
|
|
struct list_head* pages
|
|
) {
|
|
int err;
|
|
|
|
BUG_ON(list_empty(pages));
|
|
|
|
// can't write
|
|
if (unlikely(bs->flags & TERNFS_BLOCK_SERVICE_DONT_WRITE)) {
|
|
ternfs_debug("could not write block given block flags %02x", bs->flags);
|
|
return -EIO;
|
|
}
|
|
|
|
struct write_request* req = kmem_cache_alloc(write_request_cachep, GFP_KERNEL);
|
|
if (!req) { err = -ENOMEM; goto out_err; }
|
|
|
|
req->callback = callback;
|
|
req->data = data;
|
|
req->block_service_id = bs->id;
|
|
req->block_id = block_id;
|
|
req->crc = crc;
|
|
req->size = size;
|
|
req->certificate = certificate;
|
|
memset(&req->write_resp.buf, 0, sizeof(req->write_resp.buf));
|
|
list_replace_init(pages, &req->pages);
|
|
list_first_entry(&req->pages, struct page, lru)->index = 0;
|
|
list_first_entry(&req->pages, struct page, lru)->private = 1; // we use this to mark the first page
|
|
|
|
err = block_socket_start_req(
|
|
&write_block_ops,
|
|
&req->breq,
|
|
bs,
|
|
TERNFS_BLOCKS_REQ_HEADER_SIZE + TERNFS_WRITE_BLOCK_REQ_SIZE + size,
|
|
TERNFS_BLOCKS_RESP_HEADER_SIZE + TERNFS_WRITE_BLOCK_RESP_SIZE
|
|
);
|
|
if (err) {
|
|
goto out_err_req;
|
|
}
|
|
|
|
return 0;
|
|
|
|
out_err_req:
|
|
list_replace_init(&req->pages, pages);
|
|
kmem_cache_free(write_request_cachep, req);
|
|
out_err:
|
|
ternfs_info("couldn't start write block request, err=%d", err);
|
|
return err;
|
|
}
|
|
|
|
// Timeout
|
|
// --------------------------------------------------------------------
|
|
|
|
// Periodically checks all sockets and schedules timeouts
|
|
static void do_timeout_sockets(struct work_struct* w) {
|
|
u64 start = get_jiffies_64();
|
|
timeout_sockets(&fetch_block_pages_witch_crc_ops);
|
|
timeout_sockets(&write_block_ops);
|
|
u64 end = get_jiffies_64();
|
|
u64 dt = end - start;
|
|
if (dt > MSECS_TO_JIFFIES(100)) {
|
|
ternfs_warn("SLOW checking for socket timeouts %llums", jiffies64_to_msecs(dt));
|
|
}
|
|
queue_delayed_work(ternfs_fast_wq, &timeout_work, MSECS_TO_JIFFIES(1000));
|
|
}
|
|
|
|
// init/exit
|
|
// --------------------------------------------------------------------
|
|
|
|
// Sets up the request caches and the fetch/write ops, and kicks off the
// socket timeout work. Returns 0 on success, -ENOMEM if a cache could not be
// created.
int __init ternfs_block_init(void) {
    fetch_request_cachep = kmem_cache_create(
        "ternfs_fetch_block_request_cache",
        sizeof(struct fetch_request),
        0,
        SLAB_RECLAIM_ACCOUNT,
        fetch_request_constructor
    );
    if (!fetch_request_cachep) {
        return -ENOMEM;
    }

    write_request_cachep = kmem_cache_create(
        "ternfs_write_block_request_cache",
        sizeof(struct write_request),
        0,
        SLAB_RECLAIM_ACCOUNT,
        write_request_constructor
    );
    if (!write_request_cachep) {
        // undo the cache we did manage to create
        kmem_cache_destroy(fetch_request_cachep);
        return -ENOMEM;
    }

    block_ops_init(&fetch_block_pages_witch_crc_ops);
    block_ops_init(&write_block_ops);

    // first run of the timeout work, which re-queues itself from then on
    queue_work(ternfs_fast_wq, &timeout_work.work);

    return 0;
}
|
|
|
|
// Tears down what `ternfs_block_init` set up: stops the timeout work, shuts
// down the fetch/write ops, destroys the request caches.
void __cold ternfs_block_exit(void) {
    ternfs_debug("block exit");

    // no more socket timeouts from here on
    cancel_delayed_work_sync(&timeout_work);

    // By the time we get here all requests must have finished, but data
    // might still be coming down the sockets. So we let the normal cleanup
    // procedure run its course, by erroring out each socket and waiting for
    // all the sockets to be released.
    block_ops_exit(&fetch_block_pages_witch_crc_ops);
    block_ops_exit(&write_block_ops);

    // nothing can be using the request caches anymore
    kmem_cache_destroy(fetch_request_cachep);
    kmem_cache_destroy(write_request_cachep);
}
|