vfs-6.14-rc4.fixes

-----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQRAhzRXHqcMeLMyaSiRxhvAZXjcogUCZ7MONQAKCRCRxhvAZXjc
 ovw1AP4uB8c0hYfQHv/02XVTBad46zQm7uDh28EnEI8mrX7UBwEAnHw1PrrcX6ZH
 QFA47x5iGR+InXfQx4mmGqgvlD1XQgI=
 =x1hu
 -----END PGP SIGNATURE-----

Merge tag 'vfs-6.14-rc4.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs

Pull vfs fixes from Christian Brauner:
 "It was reported that the acct(2) system call can be used to trigger a
  NULL deref in cases where it is set to write to a file that triggers
  an internal lookup.

  This can e.g., happen when pointing acct(2) to /sys/power/resume. At
  the point the where the write to this file happens the calling task
  has already exited and called exit_fs() but an internal lookup might
  be triggered through lookup_bdev(). This may trigger a NULL-deref when
  accessing current->fs.

  Reorganize the code so that the the final write happens from the
  workqueue but with the caller's credentials. This preserves the
  (strange) permission model and has almost no regression risk.

  Also block access to kernel internal filesystems as well as procfs and
  sysfs in the first place.

  Various fixes for netfslib:

   - Fix a number of read-retry hangs, including:

      - Incorrect getting/putting of references on subreqs as we retry
        them

      - Failure to track whether a last old subrequest in a retried set
        is superfluous

      - Inconsistency in the usage of wait queues used for subrequests
        (ie. using clear_and_wake_up_bit() whilst waiting on a private
        waitqueue)

   - Add stats counters for retries and publish in /proc/fs/netfs/stats.
     This is not a fix per se, but is useful in debugging and shouldn't
     otherwise change the operation of the code

   - Fix the ordering of queuing subrequests with respect to setting the
     request flag that says we've now queued them all"

* tag 'vfs-6.14-rc4.fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/vfs/vfs:
  netfs: Fix setting NETFS_RREQ_ALL_QUEUED to be after all subreqs queued
  netfs: Add retry stat counters
  netfs: Fix a number of read-retry hangs
  acct: block access to kernel internal filesystems
  acct: perform last write from workqueue
This commit is contained in:
Linus Torvalds 2025-02-17 10:38:25 -08:00
commit 2408a807bf
10 changed files with 155 additions and 71 deletions

View File

@ -155,8 +155,9 @@ static void netfs_read_cache_to_pagecache(struct netfs_io_request *rreq,
netfs_cache_read_terminated, subreq);
}
static void netfs_issue_read(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
static void netfs_queue_read(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq,
bool last_subreq)
{
struct netfs_io_stream *stream = &rreq->io_streams[0];
@ -177,8 +178,17 @@ static void netfs_issue_read(struct netfs_io_request *rreq,
}
}
spin_unlock(&rreq->lock);
if (last_subreq) {
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
}
spin_unlock(&rreq->lock);
}
static void netfs_issue_read(struct netfs_io_request *rreq,
struct netfs_io_subrequest *subreq)
{
switch (subreq->source) {
case NETFS_DOWNLOAD_FROM_SERVER:
rreq->netfs_ops->issue_read(subreq);
@ -293,11 +303,8 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq)
}
size -= slice;
start += slice;
if (size <= 0) {
smp_wmb(); /* Write lists before ALL_QUEUED. */
set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags);
}
netfs_queue_read(rreq, subreq, size <= 0);
netfs_issue_read(rreq, subreq);
cond_resched();
} while (size > 0);

View File

@ -135,6 +135,8 @@ extern atomic_t netfs_n_rh_write_begin;
extern atomic_t netfs_n_rh_write_done;
extern atomic_t netfs_n_rh_write_failed;
extern atomic_t netfs_n_rh_write_zskip;
extern atomic_t netfs_n_rh_retry_read_req;
extern atomic_t netfs_n_rh_retry_read_subreq;
extern atomic_t netfs_n_wh_buffered_write;
extern atomic_t netfs_n_wh_writethrough;
extern atomic_t netfs_n_wh_dio_write;
@ -147,6 +149,8 @@ extern atomic_t netfs_n_wh_upload_failed;
extern atomic_t netfs_n_wh_write;
extern atomic_t netfs_n_wh_write_done;
extern atomic_t netfs_n_wh_write_failed;
extern atomic_t netfs_n_wh_retry_write_req;
extern atomic_t netfs_n_wh_retry_write_subreq;
extern atomic_t netfs_n_wb_lock_skip;
extern atomic_t netfs_n_wb_lock_wait;
extern atomic_t netfs_n_folioq;

View File

@ -470,7 +470,8 @@ void netfs_read_collection_worker(struct work_struct *work)
*/
void netfs_wake_read_collector(struct netfs_io_request *rreq)
{
if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) {
if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) &&
!test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) {
if (!work_pending(&rreq->work)) {
netfs_get_request(rreq, netfs_rreq_trace_get_work);
if (!queue_work(system_unbound_wq, &rreq->work))
@ -586,7 +587,8 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq)
smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */
/* If we are at the head of the queue, wake up the collector. */
if (list_is_first(&subreq->rreq_link, &stream->subrequests))
if (list_is_first(&subreq->rreq_link, &stream->subrequests) ||
test_bit(NETFS_RREQ_RETRYING, &rreq->flags))
netfs_wake_read_collector(rreq);
netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated);

View File

@ -14,7 +14,7 @@ static void netfs_reissue_read(struct netfs_io_request *rreq,
{
__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_stat(&netfs_n_rh_retry_read_subreq);
subreq->rreq->netfs_ops->issue_read(subreq);
}
@ -48,6 +48,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
subreq->retry_count++;
netfs_reset_iter(subreq);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_read(rreq, subreq);
}
}
@ -75,7 +76,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
struct iov_iter source;
unsigned long long start, len;
size_t part;
bool boundary = false;
bool boundary = false, subreq_superfluous = false;
/* Go through the subreqs and find the next span of contiguous
* buffer that we then rejig (cifs, for example, needs the
@ -116,8 +117,10 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
/* Work through the sublist. */
subreq = from;
list_for_each_entry_from(subreq, &stream->subrequests, rreq_link) {
if (!len)
if (!len) {
subreq_superfluous = true;
break;
}
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->start = start - subreq->transferred;
subreq->len = len + subreq->transferred;
@ -154,19 +157,21 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
netfs_reissue_read(rreq, subreq);
if (subreq == to)
if (subreq == to) {
subreq_superfluous = false;
break;
}
}
/* If we managed to use fewer subreqs, we can discard the
* excess; if we used the same number, then we're done.
*/
if (!len) {
if (subreq == to)
if (!subreq_superfluous)
continue;
list_for_each_entry_safe_from(subreq, tmp,
&stream->subrequests, rreq_link) {
trace_netfs_sreq(subreq, netfs_sreq_trace_discard);
trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous);
list_del(&subreq->rreq_link);
netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done);
if (subreq == to)
@ -187,14 +192,12 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq)
subreq->source = NETFS_DOWNLOAD_FROM_SERVER;
subreq->start = start;
subreq->len = len;
subreq->debug_index = atomic_inc_return(&rreq->subreq_counter);
subreq->stream_nr = stream->stream_nr;
subreq->retry_count = 1;
trace_netfs_sreq_ref(rreq->debug_id, subreq->debug_index,
refcount_read(&subreq->ref),
netfs_sreq_trace_new);
netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit);
list_add(&subreq->rreq_link, &to->rreq_link);
to = list_next_entry(to, rreq_link);
@ -256,14 +259,34 @@ void netfs_retry_reads(struct netfs_io_request *rreq)
{
struct netfs_io_subrequest *subreq;
struct netfs_io_stream *stream = &rreq->io_streams[0];
DEFINE_WAIT(myself);
netfs_stat(&netfs_n_rh_retry_read_req);
set_bit(NETFS_RREQ_RETRYING, &rreq->flags);
/* Wait for all outstanding I/O to quiesce before performing retries as
* we may need to renegotiate the I/O sizes.
*/
list_for_each_entry(subreq, &stream->subrequests, rreq_link) {
wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS,
TASK_UNINTERRUPTIBLE);
if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags))
continue;
trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue);
for (;;) {
prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE);
if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags))
break;
trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for);
schedule();
trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue);
}
finish_wait(&rreq->waitq, &myself);
}
clear_bit(NETFS_RREQ_RETRYING, &rreq->flags);
trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit);
netfs_retry_read_subrequests(rreq);

View File

@ -29,6 +29,8 @@ atomic_t netfs_n_rh_write_begin;
atomic_t netfs_n_rh_write_done;
atomic_t netfs_n_rh_write_failed;
atomic_t netfs_n_rh_write_zskip;
atomic_t netfs_n_rh_retry_read_req;
atomic_t netfs_n_rh_retry_read_subreq;
atomic_t netfs_n_wh_buffered_write;
atomic_t netfs_n_wh_writethrough;
atomic_t netfs_n_wh_dio_write;
@ -41,6 +43,8 @@ atomic_t netfs_n_wh_upload_failed;
atomic_t netfs_n_wh_write;
atomic_t netfs_n_wh_write_done;
atomic_t netfs_n_wh_write_failed;
atomic_t netfs_n_wh_retry_write_req;
atomic_t netfs_n_wh_retry_write_subreq;
atomic_t netfs_n_wb_lock_skip;
atomic_t netfs_n_wb_lock_wait;
atomic_t netfs_n_folioq;
@ -81,6 +85,11 @@ int netfs_stats_show(struct seq_file *m, void *v)
atomic_read(&netfs_n_wh_write),
atomic_read(&netfs_n_wh_write_done),
atomic_read(&netfs_n_wh_write_failed));
seq_printf(m, "Retries: rq=%u rs=%u wq=%u ws=%u\n",
atomic_read(&netfs_n_rh_retry_read_req),
atomic_read(&netfs_n_rh_retry_read_subreq),
atomic_read(&netfs_n_wh_retry_write_req),
atomic_read(&netfs_n_wh_retry_write_subreq));
seq_printf(m, "Objs : rr=%u sr=%u foq=%u wsc=%u\n",
atomic_read(&netfs_n_rh_rreq),
atomic_read(&netfs_n_rh_sreq),

View File

@ -253,6 +253,7 @@ void netfs_reissue_write(struct netfs_io_stream *stream,
subreq->retry_count++;
__clear_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags);
__set_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags);
netfs_stat(&netfs_n_wh_retry_write_subreq);
netfs_do_issue_write(stream, subreq);
}

View File

@ -203,6 +203,8 @@ void netfs_retry_writes(struct netfs_io_request *wreq)
struct netfs_io_stream *stream;
int s;
netfs_stat(&netfs_n_wh_retry_write_req);
/* Wait for all outstanding I/O to quiesce before performing retries as
* we may need to renegotiate the I/O sizes.
*/

View File

@ -278,7 +278,7 @@ struct netfs_io_request {
#define NETFS_RREQ_PAUSE 11 /* Pause subrequest generation */
#define NETFS_RREQ_USE_IO_ITER 12 /* Use ->io_iter rather than ->i_pages */
#define NETFS_RREQ_ALL_QUEUED 13 /* All subreqs are now queued */
#define NETFS_RREQ_NEED_RETRY 14 /* Need to try retrying */
#define NETFS_RREQ_RETRYING 14 /* Set if we're in the retry path */
#define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark
* write to cache on read */
const struct netfs_request_ops *netfs_ops;

View File

@ -99,7 +99,7 @@
EM(netfs_sreq_trace_limited, "LIMIT") \
EM(netfs_sreq_trace_need_clear, "N-CLR") \
EM(netfs_sreq_trace_partial_read, "PARTR") \
EM(netfs_sreq_trace_need_retry, "NRTRY") \
EM(netfs_sreq_trace_need_retry, "ND-RT") \
EM(netfs_sreq_trace_prepare, "PREP ") \
EM(netfs_sreq_trace_prep_failed, "PRPFL") \
EM(netfs_sreq_trace_progress, "PRGRS") \
@ -108,7 +108,9 @@
EM(netfs_sreq_trace_short, "SHORT") \
EM(netfs_sreq_trace_split, "SPLIT") \
EM(netfs_sreq_trace_submit, "SUBMT") \
EM(netfs_sreq_trace_superfluous, "SPRFL") \
EM(netfs_sreq_trace_terminated, "TERM ") \
EM(netfs_sreq_trace_wait_for, "_WAIT") \
EM(netfs_sreq_trace_write, "WRITE") \
EM(netfs_sreq_trace_write_skip, "SKIP ") \
E_(netfs_sreq_trace_write_term, "WTERM")

View File

@ -103,48 +103,50 @@ struct bsd_acct_struct {
atomic_long_t count;
struct rcu_head rcu;
struct mutex lock;
int active;
bool active;
bool check_space;
unsigned long needcheck;
struct file *file;
struct pid_namespace *ns;
struct work_struct work;
struct completion done;
acct_t ac;
};
static void do_acct_process(struct bsd_acct_struct *acct);
static void fill_ac(struct bsd_acct_struct *acct);
static void acct_write_process(struct bsd_acct_struct *acct);
/*
* Check the amount of free space and suspend/resume accordingly.
*/
static int check_free_space(struct bsd_acct_struct *acct)
static bool check_free_space(struct bsd_acct_struct *acct)
{
struct kstatfs sbuf;
if (time_is_after_jiffies(acct->needcheck))
goto out;
if (!acct->check_space)
return acct->active;
/* May block */
if (vfs_statfs(&acct->file->f_path, &sbuf))
goto out;
return acct->active;
if (acct->active) {
u64 suspend = sbuf.f_blocks * SUSPEND;
do_div(suspend, 100);
if (sbuf.f_bavail <= suspend) {
acct->active = 0;
acct->active = false;
pr_info("Process accounting paused\n");
}
} else {
u64 resume = sbuf.f_blocks * RESUME;
do_div(resume, 100);
if (sbuf.f_bavail >= resume) {
acct->active = 1;
acct->active = true;
pr_info("Process accounting resumed\n");
}
}
acct->needcheck = jiffies + ACCT_TIMEOUT*HZ;
out:
return acct->active;
}
@ -189,7 +191,11 @@ static void acct_pin_kill(struct fs_pin *pin)
{
struct bsd_acct_struct *acct = to_acct(pin);
mutex_lock(&acct->lock);
do_acct_process(acct);
/*
* Fill the accounting struct with the exiting task's info
* before punting to the workqueue.
*/
fill_ac(acct);
schedule_work(&acct->work);
wait_for_completion(&acct->done);
cmpxchg(&acct->ns->bacct, pin, NULL);
@ -202,6 +208,9 @@ static void close_work(struct work_struct *work)
{
struct bsd_acct_struct *acct = container_of(work, struct bsd_acct_struct, work);
struct file *file = acct->file;
/* We were fired by acct_pin_kill() which holds acct->lock. */
acct_write_process(acct);
if (file->f_op->flush)
file->f_op->flush(file, NULL);
__fput_sync(file);
@ -234,6 +243,20 @@ static int acct_on(struct filename *pathname)
return -EACCES;
}
/* Exclude kernel kernel internal filesystems. */
if (file_inode(file)->i_sb->s_flags & (SB_NOUSER | SB_KERNMOUNT)) {
kfree(acct);
filp_close(file, NULL);
return -EINVAL;
}
/* Exclude procfs and sysfs. */
if (file_inode(file)->i_sb->s_iflags & SB_I_USERNS_VISIBLE) {
kfree(acct);
filp_close(file, NULL);
return -EINVAL;
}
if (!(file->f_mode & FMODE_CAN_WRITE)) {
kfree(acct);
filp_close(file, NULL);
@ -430,13 +453,27 @@ static u32 encode_float(u64 value)
* do_exit() or when switching to a different output file.
*/
static void fill_ac(acct_t *ac)
static void fill_ac(struct bsd_acct_struct *acct)
{
struct pacct_struct *pacct = &current->signal->pacct;
struct file *file = acct->file;
acct_t *ac = &acct->ac;
u64 elapsed, run_time;
time64_t btime;
struct tty_struct *tty;
lockdep_assert_held(&acct->lock);
if (time_is_after_jiffies(acct->needcheck)) {
acct->check_space = false;
/* Don't fill in @ac if nothing will be written. */
if (!acct->active)
return;
} else {
acct->check_space = true;
}
/*
* Fill the accounting struct with the needed info as recorded
* by the different kernel functions.
@ -484,64 +521,61 @@ static void fill_ac(acct_t *ac)
ac->ac_majflt = encode_comp_t(pacct->ac_majflt);
ac->ac_exitcode = pacct->ac_exitcode;
spin_unlock_irq(&current->sighand->siglock);
}
/*
* do_acct_process does all actual work. Caller holds the reference to file.
*/
static void do_acct_process(struct bsd_acct_struct *acct)
{
acct_t ac;
unsigned long flim;
const struct cred *orig_cred;
struct file *file = acct->file;
/*
* Accounting records are not subject to resource limits.
*/
flim = rlimit(RLIMIT_FSIZE);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
/* Perform file operations on behalf of whoever enabled accounting */
orig_cred = override_creds(file->f_cred);
/*
* First check to see if there is enough free_space to continue
* the process accounting system.
*/
if (!check_free_space(acct))
goto out;
fill_ac(&ac);
/* we really need to bite the bullet and change layout */
ac.ac_uid = from_kuid_munged(file->f_cred->user_ns, orig_cred->uid);
ac.ac_gid = from_kgid_munged(file->f_cred->user_ns, orig_cred->gid);
ac->ac_uid = from_kuid_munged(file->f_cred->user_ns, current_uid());
ac->ac_gid = from_kgid_munged(file->f_cred->user_ns, current_gid());
#if ACCT_VERSION == 1 || ACCT_VERSION == 2
/* backward-compatible 16 bit fields */
ac.ac_uid16 = ac.ac_uid;
ac.ac_gid16 = ac.ac_gid;
ac->ac_uid16 = ac->ac_uid;
ac->ac_gid16 = ac->ac_gid;
#elif ACCT_VERSION == 3
{
struct pid_namespace *ns = acct->ns;
ac.ac_pid = task_tgid_nr_ns(current, ns);
ac->ac_pid = task_tgid_nr_ns(current, ns);
rcu_read_lock();
ac.ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent),
ns);
ac->ac_ppid = task_tgid_nr_ns(rcu_dereference(current->real_parent), ns);
rcu_read_unlock();
}
#endif
}
static void acct_write_process(struct bsd_acct_struct *acct)
{
struct file *file = acct->file;
const struct cred *cred;
acct_t *ac = &acct->ac;
/* Perform file operations on behalf of whoever enabled accounting */
cred = override_creds(file->f_cred);
/*
* Get freeze protection. If the fs is frozen, just skip the write
* as we could deadlock the system otherwise.
* First check to see if there is enough free_space to continue
* the process accounting system. Then get freeze protection. If
* the fs is frozen, just skip the write as we could deadlock
* the system otherwise.
*/
if (file_start_write_trylock(file)) {
if (check_free_space(acct) && file_start_write_trylock(file)) {
/* it's been opened O_APPEND, so position is irrelevant */
loff_t pos = 0;
__kernel_write(file, &ac, sizeof(acct_t), &pos);
__kernel_write(file, ac, sizeof(acct_t), &pos);
file_end_write(file);
}
out:
revert_creds(cred);
}
static void do_acct_process(struct bsd_acct_struct *acct)
{
unsigned long flim;
/* Accounting records are not subject to resource limits. */
flim = rlimit(RLIMIT_FSIZE);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
fill_ac(acct);
acct_write_process(acct);
current->signal->rlim[RLIMIT_FSIZE].rlim_cur = flim;
revert_creds(orig_cred);
}
/**