Merge 4c5b123ab2 ("blk-rq-qos: fix crash on rq_qos_wait vs. rq_qos_wake_function race") into android15-6.6-lts

Steps on the way to 6.6.57

Change-Id: I561755de546d2b23668440400c65d012c2700435
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
commit db82f3c8d1
Author: Greg Kroah-Hartman
Date:   2024-11-21 22:09:25 +00:00

41 changed files with 547 additions and 287 deletions


@@ -77,7 +77,7 @@ static int __diag_page_ref_service(struct kvm_vcpu *vcpu)
 	vcpu->stat.instruction_diagnose_258++;
 	if (vcpu->run->s.regs.gprs[rx] & 7)
 		return kvm_s390_inject_program_int(vcpu, PGM_SPECIFICATION);
-	rc = read_guest(vcpu, vcpu->run->s.regs.gprs[rx], rx, &parm, sizeof(parm));
+	rc = read_guest_real(vcpu, vcpu->run->s.regs.gprs[rx], &parm, sizeof(parm));
 	if (rc)
 		return kvm_s390_inject_prog_cond(vcpu, rc);
 	if (parm.parm_version != 2 || parm.parm_len < 5 || parm.code != 0x258)


@@ -1001,6 +1001,8 @@ static int access_guest_page(struct kvm *kvm, enum gacc_mode mode, gpa_t gpa,
 	const gfn_t gfn = gpa_to_gfn(gpa);
 	int rc;
 
+	if (!gfn_to_memslot(kvm, gfn))
+		return PGM_ADDRESSING;
 	if (mode == GACC_STORE)
 		rc = kvm_write_guest_page(kvm, gfn, data, offset, len);
 	else
@@ -1158,6 +1160,8 @@ int access_guest_real(struct kvm_vcpu *vcpu, unsigned long gra,
 		gra += fragment_len;
 		data += fragment_len;
 	}
+	if (rc > 0)
+		vcpu->arch.pgm.code = rc;
 	return rc;
 }
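
With the memslot check in place, a positive return from access_guest_real() now means pgm check info has already been stored and can be handed straight to the injection helper, as the diag 0x258 hunk above does. A minimal sketch of a caller relying on that contract — the handler name and buffer are hypothetical, not part of this patch:

    /* Hypothetical caller: rc > 0 means pgm info was stored above. */
    static int handle_example(struct kvm_vcpu *vcpu, unsigned long gra)
    {
            u8 parm[16];
            int rc;

            rc = read_guest_real(vcpu, gra, parm, sizeof(parm));
            if (rc)         /* -EFAULT or PGM_ADDRESSING */
                    return kvm_s390_inject_prog_cond(vcpu, rc);
            return 0;
    }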


@@ -405,11 +405,12 @@ int read_guest_abs(struct kvm_vcpu *vcpu, unsigned long gpa, void *data,
  * @len: number of bytes to copy
  *
  * Copy @len bytes from @data (kernel space) to @gra (guest real address).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
  * Guest low address and key protection are not checked.
  *
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying from @data failed, or
+ * PGM_ADDRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
  *
  * If an error occurs data may have been copied partially to guest memory.
  */
@@ -428,11 +429,12 @@ int write_guest_real(struct kvm_vcpu *vcpu, unsigned long gra, void *data,
  * @len: number of bytes to copy
  *
  * Copy @len bytes from @gra (guest real address) to @data (kernel space).
- * It is up to the caller to ensure that the entire guest memory range is
- * valid memory before calling this function.
  * Guest key protection is not checked.
  *
- * Returns zero on success or -EFAULT on error.
+ * Returns zero on success, -EFAULT when copying to @data failed, or
+ * PGM_ADDRESSING in case @gra is outside a memslot. In this case, pgm check info
+ * is also stored to allow injecting into the guest (if applicable) using
+ * kvm_s390_inject_prog_cond().
  *
  * If an error occurs data may have been copied partially to kernel space.
  */


@@ -9,6 +9,8 @@
 #include <asm/unwind_hints.h>
 #include <asm/segment.h>
 #include <asm/cache.h>
+#include <asm/cpufeatures.h>
+#include <asm/nospec-branch.h>
 
 .pushsection .noinstr.text, "ax"
 
@@ -17,6 +19,9 @@ SYM_FUNC_START(entry_ibpb)
 	movl $PRED_CMD_IBPB, %eax
 	xorl %edx, %edx
 	wrmsr
+
+	/* Make sure IBPB clears return stack predictions too. */
+	FILL_RETURN_BUFFER %rax, RSB_CLEAR_LOOPS, X86_BUG_IBPB_NO_RET
 	RET
 SYM_FUNC_END(entry_ibpb)
 
 /* For KVM */


@@ -216,7 +216,7 @@
 #define X86_FEATURE_SPEC_STORE_BYPASS_DISABLE	( 7*32+23) /* "" Disable Speculative Store Bypass. */
 #define X86_FEATURE_LS_CFG_SSBD		( 7*32+24)  /* "" AMD SSBD implementation via LS_CFG MSR */
 #define X86_FEATURE_IBRS		( 7*32+25) /* Indirect Branch Restricted Speculation */
-#define X86_FEATURE_IBPB		( 7*32+26) /* Indirect Branch Prediction Barrier */
+#define X86_FEATURE_IBPB		( 7*32+26) /* "ibpb" Indirect Branch Prediction Barrier without a guaranteed RSB flush */
 #define X86_FEATURE_STIBP		( 7*32+27) /* Single Thread Indirect Branch Predictors */
 #define X86_FEATURE_ZEN			( 7*32+28) /* "" Generic flag for all Zen and newer */
 #define X86_FEATURE_L1TF_PTEINV		( 7*32+29) /* "" L1TF workaround PTE inversion */
@@ -347,6 +347,7 @@
 #define X86_FEATURE_CPPC		(13*32+27) /* Collaborative Processor Performance Control */
 #define X86_FEATURE_AMD_PSFD		(13*32+28) /* "" Predictive Store Forwarding Disable */
 #define X86_FEATURE_BTC_NO		(13*32+29) /* "" Not vulnerable to Branch Type Confusion */
+#define X86_FEATURE_AMD_IBPB_RET	(13*32+30) /* "" IBPB clears return address predictor */
 #define X86_FEATURE_BRS			(13*32+31) /* Branch Sampling available */
 
 /* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
@@ -516,4 +517,5 @@
 #define X86_BUG_DIV0			X86_BUG(1*32 + 1) /* AMD DIV0 speculation bug */
 #define X86_BUG_RFDS			X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */
 #define X86_BUG_BHI			X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */
+#define X86_BUG_IBPB_NO_RET		X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */
 
 #endif /* _ASM_X86_CPUFEATURES_H */


@@ -1113,8 +1113,25 @@ do_cmd_auto:
 	case RETBLEED_MITIGATION_IBPB:
 		setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
+
+		/*
+		 * IBPB on entry already obviates the need for
+		 * software-based untraining so clear those in case some
+		 * other mitigation like SRSO has selected them.
+		 */
+		setup_clear_cpu_cap(X86_FEATURE_UNRET);
+		setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
+
 		setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
 		mitigate_smt = true;
 
+		/*
+		 * There is no need for RSB filling: entry_ibpb() ensures
+		 * all predictions, including the RSB, are invalidated,
+		 * regardless of IBPB implementation.
+		 */
+		setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
+
 		break;
 
 	case RETBLEED_MITIGATION_STUFF:
@@ -2610,6 +2627,14 @@ static void __init srso_select_mitigation(void)
 		if (has_microcode) {
 			setup_force_cpu_cap(X86_FEATURE_ENTRY_IBPB);
 			srso_mitigation = SRSO_MITIGATION_IBPB;
+
+			/*
+			 * IBPB on entry already obviates the need for
+			 * software-based untraining so clear those in case some
+			 * other mitigation like Retbleed has selected them.
+			 */
+			setup_clear_cpu_cap(X86_FEATURE_UNRET);
+			setup_clear_cpu_cap(X86_FEATURE_RETHUNK);
 		}
 	} else {
 		pr_err("WARNING: kernel not compiled with CPU_IBPB_ENTRY.\n");
@@ -2622,6 +2647,13 @@ static void __init srso_select_mitigation(void)
 		if (!boot_cpu_has(X86_FEATURE_ENTRY_IBPB) && has_microcode) {
 			setup_force_cpu_cap(X86_FEATURE_IBPB_ON_VMEXIT);
 			srso_mitigation = SRSO_MITIGATION_IBPB_ON_VMEXIT;
+
+			/*
+			 * There is no need for RSB filling: entry_ibpb() ensures
+			 * all predictions, including the RSB, are invalidated,
+			 * regardless of IBPB implementation.
+			 */
+			setup_clear_cpu_cap(X86_FEATURE_RSB_VMEXIT);
 		}
 	} else {
 		pr_err("WARNING: kernel not compiled with CPU_SRSO.\n");


@@ -1483,6 +1483,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c)
 	     boot_cpu_has(X86_FEATURE_HYPERVISOR)))
 		setup_force_cpu_bug(X86_BUG_BHI);
 
+	if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET))
+		setup_force_cpu_bug(X86_BUG_IBPB_NO_RET);
+
 	if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN))
 		return;
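
The new bug bit can also be queried from C like any other bug flag; a small sketch with a hypothetical helper (the real consumer is the FILL_RETURN_BUFFER alternative in entry_ibpb() above):

    #include <asm/cpufeature.h>

    /* Hypothetical helper: true when IBPB also flushes return predictions. */
    static bool ibpb_flushes_ret(void)
    {
            return !boot_cpu_has_bug(X86_BUG_IBPB_NO_RET);
    }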


@@ -219,8 +219,8 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr,
 	data->got_token = true;
 	smp_wmb();
-	list_del_init(&curr->entry);
 	wake_up_process(data->task);
+	list_del_init_careful(&curr->entry);
 	return 1;
 }
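
The reordering matters because curr and data live on the waiter's stack. A sketch of the race the old order allowed (simplified from the commit message, not verbatim code):

    /*
     *   rq_qos_wake_function()           rq_qos_wait()
     *   ----------------------           -------------
     *   data->got_token = true;
     *   smp_wmb();
     *   list_del_init(&curr->entry);     (running concurrently, e.g. not yet
     *                                    asleep) sees got_token and breaks
     *                                    out; finish_wait() finds the entry
     *                                    already unlinked, skips the
     *                                    waitqueue lock and returns, so the
     *                                    stack frame holding data/curr is gone
     *   wake_up_process(data->task);     use-after-return crash
     *
     * Waking first keeps the entry linked until the waker is done with it:
     * wake functions run under the waitqueue lock, which finish_wait() must
     * take as long as the entry is still queued.  list_del_init_careful()
     * then pairs with the lockless list_empty_careful() in finish_wait().
     */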


@@ -3925,8 +3925,10 @@ static int domain_context_clear_one_cb(struct pci_dev *pdev, u16 alias, void *op
  */
 static void domain_context_clear(struct device_domain_info *info)
 {
-	if (!dev_is_pci(info->dev))
+	if (!dev_is_pci(info->dev)) {
 		domain_context_clear_one(info, info->bus, info->devfn);
+		return;
+	}
 
 	pci_for_each_dma_alias(to_pci_dev(info->dev),
 			       &domain_context_clear_one_cb, info);


@@ -90,6 +90,30 @@
 #define FEC_PTP_MAX_NSEC_PERIOD		4000000000ULL
 #define FEC_PTP_MAX_NSEC_COUNTER	0x80000000ULL
 
+/**
+ * fec_ptp_read - read raw cycle counter (to be used by time counter)
+ * @cc: the cyclecounter structure
+ *
+ * this function reads the cyclecounter registers and is called by the
+ * cyclecounter structure used to construct a ns counter from the
+ * arbitrary fixed point registers
+ */
+static u64 fec_ptp_read(const struct cyclecounter *cc)
+{
+	struct fec_enet_private *fep =
+		container_of(cc, struct fec_enet_private, cc);
+	u32 tempval;
+
+	tempval = readl(fep->hwp + FEC_ATIME_CTRL);
+	tempval |= FEC_T_CTRL_CAPTURE;
+	writel(tempval, fep->hwp + FEC_ATIME_CTRL);
+
+	if (fep->quirks & FEC_QUIRK_BUG_CAPTURE)
+		udelay(1);
+
+	return readl(fep->hwp + FEC_ATIME);
+}
+
 /**
  * fec_ptp_enable_pps
  * @fep: the fec_enet_private structure handle
@@ -136,7 +160,7 @@ static int fec_ptp_enable_pps(struct fec_enet_private *fep, uint enable)
 	 * NSEC_PER_SEC - ts.tv_nsec. Add the remaining nanoseconds
 	 * to current timer would be next second.
 	 */
-	tempval = fep->cc.read(&fep->cc);
+	tempval = fec_ptp_read(&fep->cc);
 	/* Convert the ptp local counter to 1588 timestamp */
 	ns = timecounter_cyc2time(&fep->tc, tempval);
 	ts = ns_to_timespec64(ns);
@@ -211,13 +235,7 @@ static int fec_ptp_pps_perout(struct fec_enet_private *fep)
 	timecounter_read(&fep->tc);
 
 	/* Get the current ptp hardware time counter */
-	temp_val = readl(fep->hwp + FEC_ATIME_CTRL);
-	temp_val |= FEC_T_CTRL_CAPTURE;
-	writel(temp_val, fep->hwp + FEC_ATIME_CTRL);
-	if (fep->quirks & FEC_QUIRK_BUG_CAPTURE)
-		udelay(1);
-
-	ptp_hc = readl(fep->hwp + FEC_ATIME);
+	ptp_hc = fec_ptp_read(&fep->cc);
 
 	/* Convert the ptp local counter to 1588 timestamp */
 	curr_time = timecounter_cyc2time(&fep->tc, ptp_hc);
@@ -271,30 +289,6 @@ static enum hrtimer_restart fec_ptp_pps_perout_handler(struct hrtimer *timer)
 	return HRTIMER_NORESTART;
 }
 
-/**
- * fec_ptp_read - read raw cycle counter (to be used by time counter)
- * @cc: the cyclecounter structure
- *
- * this function reads the cyclecounter registers and is called by the
- * cyclecounter structure used to construct a ns counter from the
- * arbitrary fixed point registers
- */
-static u64 fec_ptp_read(const struct cyclecounter *cc)
-{
-	struct fec_enet_private *fep =
-		container_of(cc, struct fec_enet_private, cc);
-	u32 tempval;
-
-	tempval = readl(fep->hwp + FEC_ATIME_CTRL);
-	tempval |= FEC_T_CTRL_CAPTURE;
-	writel(tempval, fep->hwp + FEC_ATIME_CTRL);
-
-	if (fep->quirks & FEC_QUIRK_BUG_CAPTURE)
-		udelay(1);
-
-	return readl(fep->hwp + FEC_ATIME);
-}
-
 /**
  * fec_ptp_start_cyclecounter - create the cycle counter from hw
  * @ndev: network device
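
For context, a minimal standalone sketch of the cyclecounter/timecounter pattern this driver uses (field values are illustrative, not the FEC's):

    #include <linux/timecounter.h>

    static u64 demo_read(const struct cyclecounter *cc)
    {
            static u64 fake;

            return ++fake;  /* stands in for a hardware counter register read */
    }

    static struct cyclecounter demo_cc = {
            .read   = demo_read,
            .mask   = CYCLECOUNTER_MASK(31),
            .mult   = 1 << 8,       /* illustrative fixed-point scaling */
            .shift  = 8,
    };

    static struct timecounter demo_tc;

    static u64 demo_now_ns(void)
    {
            /* Convert a raw counter value to nanoseconds, as the
             * timecounter_cyc2time() calls in the hunks above do. */
            return timecounter_cyc2time(&demo_tc, demo_cc.read(&demo_cc));
    }

(demo_tc would first be anchored once via timecounter_init(&demo_tc, &demo_cc, ktime_get_real_ns()).)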


@@ -1195,7 +1195,8 @@ sclp_reboot_event(struct notifier_block *this, unsigned long event, void *ptr)
 }
 
 static struct notifier_block sclp_reboot_notifier = {
-	.notifier_call = sclp_reboot_event
+	.notifier_call = sclp_reboot_event,
+	.priority = INT_MIN,
 };
 
 static ssize_t con_pages_show(struct device_driver *dev, char *buf)


@@ -319,7 +319,7 @@ sclp_vt220_add_msg(struct sclp_vt220_request *request,
 	buffer = (void *) ((addr_t) sccb + sccb->header.length);
 
 	if (convertlf) {
-		/* Perform Linefeed conversion (0x0a -> 0x0a 0x0d)*/
+		/* Perform Linefeed conversion (0x0a -> 0x0d 0x0a)*/
 		for (from=0, to=0;
 		     (from < count) && (to < sclp_vt220_space_left(request));
 		     from++) {
@@ -328,8 +328,8 @@ sclp_vt220_add_msg(struct sclp_vt220_request *request,
 			/* Perform conversion */
 			if (c == 0x0a) {
 				if (to + 1 < sclp_vt220_space_left(request)) {
-					((unsigned char *) buffer)[to++] = c;
 					((unsigned char *) buffer)[to++] = 0x0d;
+					((unsigned char *) buffer)[to++] = c;
 				} else
 					break;
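
A standalone userspace sketch of the corrected conversion, showing both the CR-before-LF order and the two-bytes-must-fit check:

    #include <stddef.h>

    /* Copy src to dst, expanding 0x0a to 0x0d 0x0a; returns bytes written. */
    static size_t lf_to_crlf(const char *src, size_t count,
                             char *dst, size_t space)
    {
            size_t from, to = 0;

            for (from = 0; from < count && to < space; from++) {
                    if (src[from] == 0x0a) {
                            if (to + 1 >= space)
                                    break;  /* need room for CR and LF */
                            dst[to++] = 0x0d;
                            dst[to++] = 0x0a;
                    } else {
                            dst[to++] = src[from];
                    }
            }
            return to;
    }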


@@ -1565,12 +1565,23 @@ out_release:
 	return error;
 }
 
+/* Enforce that there is at most one namespace bit per attr. */
+inline bool xfs_attr_check_namespace(unsigned int attr_flags)
+{
+	return hweight32(attr_flags & XFS_ATTR_NSP_ONDISK_MASK) < 2;
+}
+
 /* Returns true if the attribute entry name is valid. */
 bool
 xfs_attr_namecheck(
+	unsigned int	attr_flags,
 	const void	*name,
 	size_t		length)
 {
+	/* Only one namespace bit allowed. */
+	if (!xfs_attr_check_namespace(attr_flags))
+		return false;
+
 	/*
 	 * MAXNAMELEN includes the trailing null, but (name/length) leave it
 	 * out, so use >= for the length check.
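
hweight32() counts set bits, so the "< 2" test accepts zero or one namespace bit. For illustration, an equivalent standalone predicate:

    #include <stdbool.h>

    /* True when at most one bit of x is set (x is 0 or a power of two). */
    static inline bool at_most_one_bit(unsigned int x)
    {
            return (x & (x - 1)) == 0;
    }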


@@ -547,7 +547,9 @@ int xfs_attr_get(struct xfs_da_args *args);
 int xfs_attr_set(struct xfs_da_args *args);
 int xfs_attr_set_iter(struct xfs_attr_intent *attr);
 int xfs_attr_remove_iter(struct xfs_attr_intent *attr);
-bool xfs_attr_namecheck(const void *name, size_t length);
+bool xfs_attr_check_namespace(unsigned int attr_flags);
+bool xfs_attr_namecheck(unsigned int attr_flags, const void *name,
+		size_t length);
 int xfs_attr_calc_size(struct xfs_da_args *args, int *local);
 void xfs_init_attr_trans(struct xfs_da_args *args, struct xfs_trans_res *tres,
 		unsigned int *total);


@@ -984,6 +984,10 @@ xfs_attr_shortform_to_leaf(
 		nargs.hashval = xfs_da_hashname(sfe->nameval,
 						sfe->namelen);
 		nargs.attr_filter = sfe->flags & XFS_ATTR_NSP_ONDISK_MASK;
+		if (!xfs_attr_check_namespace(sfe->flags)) {
+			error = -EFSCORRUPTED;
+			goto out;
+		}
 		error = xfs_attr3_leaf_lookup_int(bp, &nargs); /* set a->index */
 		ASSERT(error == -ENOATTR);
 		error = xfs_attr3_leaf_add(bp, &nargs);
@@ -1105,7 +1109,7 @@ xfs_attr_shortform_verify(
 		 * one namespace flag per xattr, so we can just count the
 		 * bits (i.e. hweight) here.
 		 */
-		if (hweight8(sfep->flags & XFS_ATTR_NSP_ONDISK_MASK) > 1)
+		if (!xfs_attr_check_namespace(sfep->flags))
 			return __this_address;
 
 		sfep = next_sfep;


@@ -619,7 +619,6 @@ xfs_attr_rmtval_set_blk(
 	if (error)
 		return error;
 
-	ASSERT(nmap == 1);
 	ASSERT((map->br_startblock != DELAYSTARTBLOCK) &&
 	       (map->br_startblock != HOLESTARTBLOCK));


@@ -1549,6 +1549,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 		}
+		ASSERT(da_new <= da_old);
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_LEFT_CONTIG:
@@ -1578,6 +1579,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 		}
+		ASSERT(da_new <= da_old);
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
@@ -1611,6 +1613,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 		}
+		ASSERT(da_new <= da_old);
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING:
@@ -1643,6 +1646,7 @@ xfs_bmap_add_extent_delay_real(
 				goto done;
 			}
 		}
+		ASSERT(da_new <= da_old);
 		break;
 
 	case BMAP_LEFT_FILLING | BMAP_LEFT_CONTIG:
@@ -1680,6 +1684,7 @@ xfs_bmap_add_extent_delay_real(
 			if (error)
 				goto done;
 		}
+		ASSERT(da_new <= da_old);
 		break;
 
 	case BMAP_LEFT_FILLING:
@@ -1767,6 +1772,7 @@ xfs_bmap_add_extent_delay_real(
 		xfs_iext_update_extent(bma->ip, state, &bma->icur, &PREV);
 		xfs_iext_next(ifp, &bma->icur);
 		xfs_iext_update_extent(bma->ip, state, &bma->icur, &RIGHT);
+		ASSERT(da_new <= da_old);
 		break;
 
 	case BMAP_RIGHT_FILLING:
@@ -1814,6 +1820,7 @@ xfs_bmap_add_extent_delay_real(
 		PREV.br_blockcount = temp;
 		xfs_iext_insert(bma->ip, &bma->icur, &PREV, state);
 		xfs_iext_next(ifp, &bma->icur);
+		ASSERT(da_new <= da_old);
 		break;
 
 	case 0:
@@ -1934,11 +1941,9 @@ xfs_bmap_add_extent_delay_real(
 	}
 
 	/* adjust for changes in reserved delayed indirect blocks */
-	if (da_new != da_old) {
-		ASSERT(state == 0 || da_new < da_old);
+	if (da_new != da_old)
 		error = xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new),
-				false);
-	}
+				true);
 
 	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
 done:
@@ -3969,20 +3974,32 @@ xfs_bmapi_reserve_delalloc(
 	xfs_extlen_t		alen;
 	xfs_extlen_t		indlen;
 	int			error;
-	xfs_fileoff_t		aoff = off;
+	xfs_fileoff_t		aoff;
+	bool			use_cowextszhint =
+					whichfork == XFS_COW_FORK && !prealloc;
 
+retry:
 	/*
 	 * Cap the alloc length. Keep track of prealloc so we know whether to
 	 * tag the inode before we return.
 	 */
+	aoff = off;
 	alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN);
 	if (!eof)
 		alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
 	if (prealloc && alen >= len)
 		prealloc = alen - len;
 
-	/* Figure out the extent size, adjust alen */
-	if (whichfork == XFS_COW_FORK) {
+	/*
+	 * If we're targeting the COW fork but aren't creating a speculative
+	 * posteof preallocation, try to expand the reservation to align with
+	 * the COW extent size hint if there's sufficient free space.
+	 *
+	 * Unlike the data fork, the CoW cancellation functions will free all
+	 * the reservations at inactivation, so we don't require that every
+	 * delalloc reservation have a dirty pagecache.
+	 */
+	if (use_cowextszhint) {
 		struct xfs_bmbt_irec	prev;
 		xfs_extlen_t		extsz = xfs_get_cowextsz_hint(ip);
@@ -4001,7 +4018,7 @@ xfs_bmapi_reserve_delalloc(
 	 */
 	error = xfs_quota_reserve_blkres(ip, alen);
 	if (error)
-		return error;
+		goto out;
 
 	/*
 	 * Split changing sb for alen and indlen since they could be coming
@@ -4046,6 +4063,17 @@ out_unreserve_blocks:
 out_unreserve_quota:
 	if (XFS_IS_QUOTA_ON(mp))
 		xfs_quota_unreserve_blkres(ip, alen);
+out:
+	if (error == -ENOSPC || error == -EDQUOT) {
+		trace_xfs_delalloc_enospc(ip, off, len);
+		if (prealloc || use_cowextszhint) {
+			/* retry without any preallocation */
+			use_cowextszhint = false;
+			prealloc = 0;
+			goto retry;
+		}
+	}
 	return error;
 }
@@ -4128,8 +4156,10 @@ xfs_bmapi_allocate(
 	} else {
 		error = xfs_bmap_alloc_userdata(bma);
 	}
-	if (error || bma->blkno == NULLFSBLOCK)
+	if (error)
 		return error;
+	if (bma->blkno == NULLFSBLOCK)
+		return -ENOSPC;
 
 	if (bma->flags & XFS_BMAPI_ZERO) {
 		error = xfs_zero_extent(bma->ip, bma->blkno, bma->length);
@@ -4309,6 +4339,15 @@ xfs_bmapi_finish(
  * extent state if necessary.  Details behaviour is controlled by the flags
  * parameter.  Only allocates blocks from a single allocation group, to avoid
  * locking problems.
+ *
+ * Returns 0 on success and places the extent mappings in mval.  nmaps is used
+ * as an input/output parameter where the caller specifies the maximum number
+ * of mappings that may be returned and xfs_bmapi_write passes back the number
+ * of mappings (including existing mappings) it found.
+ *
+ * Returns a negative error code on failure, including -ENOSPC when it could not
+ * allocate any blocks and -ENOSR when it did allocate blocks to convert a
+ * delalloc range, but those blocks were before the passed in range.
  */
 int
 xfs_bmapi_write(
@@ -4436,10 +4475,16 @@ xfs_bmapi_write(
 			ASSERT(len > 0);
 			ASSERT(bma.length > 0);
 			error = xfs_bmapi_allocate(&bma);
-			if (error)
+			if (error) {
+				/*
+				 * If we already allocated space in a previous
+				 * iteration return what we got so far when
+				 * running out of space.
+				 */
+				if (error == -ENOSPC && bma.nallocs)
+					break;
 				goto error0;
-			if (bma.blkno == NULLFSBLOCK)
-				break;
+			}
 
 			/*
 			 * If this is a CoW allocation, record the data in
@@ -4477,7 +4522,6 @@ xfs_bmapi_write(
 		if (!xfs_iext_next_extent(ifp, &bma.icur, &bma.got))
 			eof = true;
 	}
-	*nmap = n;
 
 	error = xfs_bmap_btree_to_extents(tp, ip, bma.cur, &bma.logflags,
 			whichfork);
@@ -4488,7 +4532,22 @@ xfs_bmapi_write(
 	       ifp->if_nextents > XFS_IFORK_MAXEXT(ip, whichfork));
 	xfs_bmapi_finish(&bma, whichfork, 0);
 	xfs_bmap_validate_ret(orig_bno, orig_len, orig_flags, orig_mval,
-		orig_nmap, *nmap);
+		orig_nmap, n);
+
+	/*
+	 * When converting delayed allocations, xfs_bmapi_allocate ignores
+	 * the passed in bno and always converts from the start of the found
+	 * delalloc extent.
+	 *
+	 * To avoid a successful return with *nmap set to 0, return the magic
+	 * -ENOSR error code for this particular case so that the caller can
+	 * handle it.
+	 */
+	if (!n) {
+		ASSERT(bma.nallocs >= *nmap);
+		return -ENOSR;
+	}
+	*nmap = n;
 	return 0;
 error0:
 	xfs_bmapi_finish(&bma, whichfork, error);
@@ -4501,8 +4560,8 @@ error0:
  * invocations to allocate the target offset if a large enough physical extent
  * is not available.
  */
-int
-xfs_bmapi_convert_delalloc(
+static int
+xfs_bmapi_convert_one_delalloc(
 	struct xfs_inode	*ip,
 	int			whichfork,
 	xfs_off_t		offset,
@@ -4559,7 +4618,8 @@ xfs_bmapi_convert_delalloc(
 	if (!isnullstartblock(bma.got.br_startblock)) {
 		xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
 				xfs_iomap_inode_sequence(ip, flags));
-		*seq = READ_ONCE(ifp->if_seq);
+		if (seq)
+			*seq = READ_ONCE(ifp->if_seq);
 		goto out_trans_cancel;
 	}
@@ -4595,9 +4655,6 @@ xfs_bmapi_convert_delalloc(
 	if (error)
 		goto out_finish;
 
-	error = -ENOSPC;
-	if (WARN_ON_ONCE(bma.blkno == NULLFSBLOCK))
-		goto out_finish;
 	error = -EFSCORRUPTED;
 	if (WARN_ON_ONCE(!xfs_valid_startblock(ip, bma.got.br_startblock)))
 		goto out_finish;
@@ -4608,7 +4665,8 @@ xfs_bmapi_convert_delalloc(
 	ASSERT(!isnullstartblock(bma.got.br_startblock));
 	xfs_bmbt_to_iomap(ip, iomap, &bma.got, 0, flags,
 			xfs_iomap_inode_sequence(ip, flags));
-	*seq = READ_ONCE(ifp->if_seq);
+	if (seq)
+		*seq = READ_ONCE(ifp->if_seq);
 
 	if (whichfork == XFS_COW_FORK)
 		xfs_refcount_alloc_cow_extent(tp, bma.blkno, bma.length);
@@ -4631,6 +4689,36 @@ out_trans_cancel:
 	return error;
 }
 
+/*
+ * Pass in a delalloc extent and convert it to real extents, return the real
+ * extent that maps offset_fsb in iomap.
+ */
+int
+xfs_bmapi_convert_delalloc(
+	struct xfs_inode	*ip,
+	int			whichfork,
+	loff_t			offset,
+	struct iomap		*iomap,
+	unsigned int		*seq)
+{
+	int			error;
+
+	/*
+	 * Attempt to allocate whatever delalloc extent currently backs offset
+	 * and put the result into iomap.  Allocate in a loop because it may
+	 * take several attempts to allocate real blocks for a contiguous
+	 * delalloc extent if free space is sufficiently fragmented.
+	 */
+	do {
+		error = xfs_bmapi_convert_one_delalloc(ip, whichfork, offset,
+				iomap, seq);
+		if (error)
+			return error;
+	} while (iomap->offset + iomap->length <= offset);
+
+	return 0;
+}
+
 int
 xfs_bmapi_remap(
 	struct xfs_trans	*tp,
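
Taken together, xfs_bmapi_write() callers now see three outcomes; a sketch of the resulting call-site pattern (this is exactly what the xfs_alloc_file_space() hunk further down adopts):

    error = xfs_bmapi_write(tp, ip, startoffset_fsb, allocatesize_fsb,
                            XFS_BMAPI_PREALLOC, 0, imapp, &nimaps);
    if (error) {
            if (error != -ENOSR)    /* hard failure, including -ENOSPC */
                    goto error;
            error = 0;              /* allocated before the range: loop again */
    } else {
            startoffset_fsb += imapp->br_blockcount;
            allocatesize_fsb -= imapp->br_blockcount;
    }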


@@ -2158,8 +2158,8 @@ xfs_da_grow_inode_int(
 	struct xfs_inode	*dp = args->dp;
 	int			w = args->whichfork;
 	xfs_rfsblock_t		nblks = dp->i_nblocks;
-	struct xfs_bmbt_irec	map, *mapp;
-	int			nmap, error, got, i, mapi;
+	struct xfs_bmbt_irec	map, *mapp = &map;
+	int			nmap, error, got, i, mapi = 1;
 
 	/*
 	 * Find a spot in the file space to put the new block.
@@ -2175,14 +2175,7 @@ xfs_da_grow_inode_int(
 	error = xfs_bmapi_write(tp, dp, *bno, count,
 			xfs_bmapi_aflag(w)|XFS_BMAPI_METADATA|XFS_BMAPI_CONTIG,
 			args->total, &map, &nmap);
-	if (error)
-		return error;
-
-	ASSERT(nmap <= 1);
-	if (nmap == 1) {
-		mapp = &map;
-		mapi = 1;
-	} else if (nmap == 0 && count > 1) {
+	if (error == -ENOSPC && count > 1) {
 		xfs_fileoff_t	b;
 		int		c;
@@ -2199,16 +2192,13 @@ xfs_da_grow_inode_int(
 					args->total, &mapp[mapi], &nmap);
 			if (error)
 				goto out_free_map;
-			if (nmap < 1)
-				break;
 			mapi += nmap;
 			b = mapp[mapi - 1].br_startoff +
 			    mapp[mapi - 1].br_blockcount;
 		}
-	} else {
-		mapi = 0;
-		mapp = NULL;
 	}
+	if (error)
+		goto out_free_map;
 
 	/*
 	 * Count the blocks we got, make sure it matches the total.


@@ -703,8 +703,13 @@ struct xfs_attr3_leafblock {
 #define XFS_ATTR_ROOT		(1u << XFS_ATTR_ROOT_BIT)
 #define XFS_ATTR_SECURE		(1u << XFS_ATTR_SECURE_BIT)
 #define XFS_ATTR_INCOMPLETE	(1u << XFS_ATTR_INCOMPLETE_BIT)
 #define XFS_ATTR_NSP_ONDISK_MASK	(XFS_ATTR_ROOT | XFS_ATTR_SECURE)
+#define XFS_ATTR_ONDISK_MASK	(XFS_ATTR_NSP_ONDISK_MASK | \
+				 XFS_ATTR_LOCAL | \
+				 XFS_ATTR_INCOMPLETE)
+
 /*
  * Alignment for namelist and valuelist entries (since they are mixed
  * there can be only one alignment value)


@@ -366,17 +366,40 @@ xfs_dinode_verify_fork(
 	/*
 	 * For fork types that can contain local data, check that the fork
 	 * format matches the size of local data contained within the fork.
-	 *
-	 * For all types, check that when the size says the should be in extent
-	 * or btree format, the inode isn't claiming it is in local format.
 	 */
 	if (whichfork == XFS_DATA_FORK) {
-		if (S_ISDIR(mode) || S_ISLNK(mode)) {
-			if (be64_to_cpu(dip->di_size) <= fork_size &&
+		/*
+		 * A directory small enough to fit in the inode must be stored
+		 * in local format.  The directory sf <-> extents conversion
+		 * code updates the directory size accordingly.  Directories
+		 * being truncated have zero size and are not subject to this
+		 * check.
+		 */
+		if (S_ISDIR(mode)) {
+			if (dip->di_size &&
+			    be64_to_cpu(dip->di_size) <= fork_size &&
 			    fork_format != XFS_DINODE_FMT_LOCAL)
 				return __this_address;
 		}
 
+		/*
+		 * A symlink with a target small enough to fit in the inode can
+		 * be stored in extents format if xattrs were added (thus
+		 * converting the data fork from shortform to remote format)
+		 * and then removed.
+		 */
+		if (S_ISLNK(mode)) {
+			if (be64_to_cpu(dip->di_size) <= fork_size &&
+			    fork_format != XFS_DINODE_FMT_EXTENTS &&
+			    fork_format != XFS_DINODE_FMT_LOCAL)
+				return __this_address;
+		}
+
+		/*
+		 * For all types, check that when the size says the fork should
+		 * be in extent or btree format, the inode isn't claiming to be
+		 * in local format.
+		 */
 		if (be64_to_cpu(dip->di_size) > fork_size &&
 		    fork_format == XFS_DINODE_FMT_LOCAL)
 			return __this_address;
@@ -492,9 +515,19 @@ xfs_dinode_verify(
 	if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN)
 		return __this_address;
 
-	/* No zero-length symlinks/dirs. */
-	if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0)
-		return __this_address;
+	/*
+	 * No zero-length symlinks/dirs unless they're unlinked and hence being
+	 * inactivated.
+	 */
+	if ((S_ISLNK(mode) || S_ISDIR(mode)) && di_size == 0) {
+		if (dip->di_version > 1) {
+			if (dip->di_nlink)
+				return __this_address;
+		} else {
+			if (dip->di_onlink)
+				return __this_address;
+		}
+	}
 
 	fa = xfs_dinode_verify_nrext64(mp, dip);
 	if (fa)


@@ -1031,11 +1031,12 @@ xfs_log_sb(
 	 * and hence we don't need have to update it here.
 	 */
 	if (xfs_has_lazysbcount(mp)) {
-		mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+		mp->m_sb.sb_icount = percpu_counter_sum_positive(&mp->m_icount);
 		mp->m_sb.sb_ifree = min_t(uint64_t,
-				percpu_counter_sum(&mp->m_ifree),
+				percpu_counter_sum_positive(&mp->m_ifree),
 				mp->m_sb.sb_icount);
-		mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+		mp->m_sb.sb_fdblocks =
+				percpu_counter_sum_positive(&mp->m_fdblocks);
 	}
 
 	xfs_sb_to_disk(bp->b_addr, &mp->m_sb);


@@ -182,32 +182,23 @@ xchk_xattr_listent(
 		return;
 	}
 
+	if (flags & ~XFS_ATTR_ONDISK_MASK) {
+		xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
+		goto fail_xref;
+	}
+
 	if (flags & XFS_ATTR_INCOMPLETE) {
 		/* Incomplete attr key, just mark the inode for preening. */
 		xchk_ino_set_preen(sx->sc, context->dp->i_ino);
 		return;
 	}
 
-	/* Only one namespace bit allowed. */
-	if (hweight32(flags & XFS_ATTR_NSP_ONDISK_MASK) > 1) {
-		xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
-		goto fail_xref;
-	}
-
 	/* Does this name make sense? */
-	if (!xfs_attr_namecheck(name, namelen)) {
+	if (!xfs_attr_namecheck(flags, name, namelen)) {
 		xchk_fblock_set_corrupt(sx->sc, XFS_ATTR_FORK, args.blkno);
 		goto fail_xref;
 	}
 
-	/*
-	 * Local xattr values are stored in the attr leaf block, so we don't
-	 * need to retrieve the value from a remote block to detect corruption
-	 * problems.
-	 */
-	if (flags & XFS_ATTR_LOCAL)
-		goto fail_xref;
-
 	/*
 	 * Try to allocate enough memory to extrat the attr value.  If that
 	 * doesn't work, we overload the seen_enough variable to convey
@@ -223,6 +214,11 @@ xchk_xattr_listent(
 	args.value = ab->value;
 
+	/*
+	 * Get the attr value to ensure that lookup can find this attribute
+	 * through the dabtree indexing and that remote value retrieval also
+	 * works correctly.
+	 */
 	error = xfs_attr_get_ilocked(&args);
 	/* ENODATA means the hash lookup failed and the attr is bad */
 	if (error == -ENODATA)
@@ -463,7 +459,6 @@ xchk_xattr_rec(
 	xfs_dahash_t			hash;
 	int				nameidx;
 	int				hdrsize;
-	unsigned int			badflags;
 	int				error;
 
 	ASSERT(blk->magic == XFS_ATTR_LEAF_MAGIC);
@@ -493,10 +488,15 @@ xchk_xattr_rec(
 	/* Retrieve the entry and check it. */
 	hash = be32_to_cpu(ent->hashval);
-	badflags = ~(XFS_ATTR_LOCAL | XFS_ATTR_ROOT | XFS_ATTR_SECURE |
-			XFS_ATTR_INCOMPLETE);
-	if ((ent->flags & badflags) != 0)
+	if (ent->flags & ~XFS_ATTR_ONDISK_MASK) {
 		xchk_da_set_corrupt(ds, level);
+		return 0;
+	}
+	if (!xfs_attr_check_namespace(ent->flags)) {
+		xchk_da_set_corrupt(ds, level);
+		return 0;
+	}
+
 	if (ent->flags & XFS_ATTR_LOCAL) {
 		lentry = (struct xfs_attr_leaf_name_local *)
 				(((char *)bp->b_addr) + nameidx);
@@ -561,6 +561,15 @@ xchk_xattr_check_sf(
 			break;
 		}
 
+		/*
+		 * Shortform entries do not set LOCAL or INCOMPLETE, so the
+		 * only valid flag bits here are for namespaces.
+		 */
+		if (sfe->flags & ~XFS_ATTR_NSP_ONDISK_MASK) {
+			xchk_fblock_set_corrupt(sc, XFS_ATTR_FORK, 0);
+			break;
+		}
+
 		if (!xchk_xattr_set_map(sc, ab->usedmap,
 				(char *)sfe - (char *)sf,
 				sizeof(struct xfs_attr_sf_entry))) {


@@ -735,7 +735,7 @@ xchk_iget(
 {
 	ASSERT(sc->tp != NULL);
 
-	return xfs_iget(sc->mp, sc->tp, inum, XFS_IGET_UNTRUSTED, 0, ipp);
+	return xfs_iget(sc->mp, sc->tp, inum, XCHK_IGET_FLAGS, 0, ipp);
 }
 
 /*
@@ -786,8 +786,8 @@ again:
 	if (error)
 		return error;
 
-	error = xfs_iget(mp, tp, inum,
-			XFS_IGET_NORETRY | XFS_IGET_UNTRUSTED, 0, ipp);
+	error = xfs_iget(mp, tp, inum, XFS_IGET_NORETRY | XCHK_IGET_FLAGS, 0,
+			ipp);
 	if (error == -EAGAIN) {
 		/*
 		 * The inode may be in core but temporarily unavailable and may
@@ -994,12 +994,6 @@ xchk_irele(
 		spin_lock(&VFS_I(ip)->i_lock);
 		VFS_I(ip)->i_state &= ~I_DONTCACHE;
 		spin_unlock(&VFS_I(ip)->i_lock);
-	} else if (atomic_read(&VFS_I(ip)->i_count) == 1) {
-		/*
-		 * If this is the last reference to the inode and the caller
-		 * permits it, set DONTCACHE to avoid thrashing.
-		 */
-		d_mark_dontcache(VFS_I(ip));
 	}
 
 	xfs_irele(ip);


@@ -17,6 +17,13 @@ struct xfs_scrub;
 #define XCHK_GFP_FLAGS	((__force gfp_t)(GFP_KERNEL | __GFP_NOWARN | \
 					 __GFP_RETRY_MAYFAIL))
 
+/*
+ * For opening files by handle for fsck operations, we don't trust the inumber
+ * or the allocation state; therefore, perform an untrusted lookup.  We don't
+ * want these inodes to pollute the cache, so mark them for immediate removal.
+ */
+#define XCHK_IGET_FLAGS	(XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE)
+
 /* Type info and names for the scrub types. */
 enum xchk_type {
 	ST_NONE = 1,	/* disabled */


@@ -233,45 +233,6 @@ xfs_imap_valid(
 	return true;
 }
 
-/*
- * Pass in a dellalloc extent and convert it to real extents, return the real
- * extent that maps offset_fsb in wpc->iomap.
- *
- * The current page is held locked so nothing could have removed the block
- * backing offset_fsb, although it could have moved from the COW to the data
- * fork by another thread.
- */
-static int
-xfs_convert_blocks(
-	struct iomap_writepage_ctx *wpc,
-	struct xfs_inode	*ip,
-	int			whichfork,
-	loff_t			offset)
-{
-	int			error;
-	unsigned		*seq;
-
-	if (whichfork == XFS_COW_FORK)
-		seq = &XFS_WPC(wpc)->cow_seq;
-	else
-		seq = &XFS_WPC(wpc)->data_seq;
-
-	/*
-	 * Attempt to allocate whatever delalloc extent currently backs offset
-	 * and put the result into wpc->iomap.  Allocate in a loop because it
-	 * may take several attempts to allocate real blocks for a contiguous
-	 * delalloc extent if free space is sufficiently fragmented.
-	 */
-	do {
-		error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
-				&wpc->iomap, seq);
-		if (error)
-			return error;
-	} while (wpc->iomap.offset + wpc->iomap.length <= offset);
-
-	return 0;
-}
-
 static int
 xfs_map_blocks(
 	struct iomap_writepage_ctx *wpc,
@@ -289,6 +250,7 @@ xfs_map_blocks(
 	struct xfs_iext_cursor	icur;
 	int			retries = 0;
 	int			error = 0;
+	unsigned int		*seq;
 
 	if (xfs_is_shutdown(mp))
 		return -EIO;
@@ -386,7 +348,19 @@ retry:
 	trace_xfs_map_blocks_found(ip, offset, count, whichfork, &imap);
 	return 0;
 allocate_blocks:
-	error = xfs_convert_blocks(wpc, ip, whichfork, offset);
+	/*
+	 * Convert a delalloc extent to a real one. The current page is held
+	 * locked so nothing could have removed the block backing offset_fsb,
+	 * although it could have moved from the COW to the data fork by another
+	 * thread.
+	 */
+	if (whichfork == XFS_COW_FORK)
+		seq = &XFS_WPC(wpc)->cow_seq;
+	else
+		seq = &XFS_WPC(wpc)->data_seq;
+
+	error = xfs_bmapi_convert_delalloc(ip, whichfork, offset,
+			&wpc->iomap, seq);
 	if (error) {
 		/*
 		 * If we failed to find the extent in the COW fork we might have


@@ -510,6 +510,9 @@ xfs_attri_validate(
 	unsigned int			op = attrp->alfi_op_flags &
 					     XFS_ATTRI_OP_FLAGS_TYPE_MASK;
 
+	if (!xfs_sb_version_haslogxattrs(&mp->m_sb))
+		return false;
+
 	if (attrp->__pad != 0)
 		return false;
@@ -519,6 +522,10 @@ xfs_attri_validate(
 	if (attrp->alfi_attr_filter & ~XFS_ATTRI_FILTER_MASK)
 		return false;
 
+	if (!xfs_attr_check_namespace(attrp->alfi_attr_filter &
+				XFS_ATTR_NSP_ONDISK_MASK))
+		return false;
+
 	/* alfi_op_flags should be either a set or remove */
 	switch (op) {
 	case XFS_ATTRI_OP_FLAGS_SET:
@@ -569,7 +576,8 @@ xfs_attri_item_recover(
 	 */
 	attrp = &attrip->attri_format;
 	if (!xfs_attri_validate(mp, attrp) ||
-	    !xfs_attr_namecheck(nv->name.i_addr, nv->name.i_len))
+	    !xfs_attr_namecheck(attrp->alfi_attr_filter, nv->name.i_addr,
+				nv->name.i_len))
 		return -EFSCORRUPTED;
 
 	error = xlog_recover_iget(mp, attrp->alfi_ino, &ip);
@@ -602,8 +610,6 @@ xfs_attri_item_recover(
 	args->op_flags = XFS_DA_OP_RECOVERY | XFS_DA_OP_OKNOENT |
 			 XFS_DA_OP_LOGGED;
 
-	ASSERT(xfs_sb_version_haslogxattrs(&mp->m_sb));
-
 	switch (attr->xattri_op_flags) {
 	case XFS_ATTRI_OP_FLAGS_SET:
 	case XFS_ATTRI_OP_FLAGS_REPLACE:
@@ -718,48 +724,112 @@ xlog_recover_attri_commit_pass2(
 	const void			*attr_value = NULL;
 	const void			*attr_name;
 	size_t				len;
-
-	attri_formatp = item->ri_buf[0].i_addr;
-	attr_name = item->ri_buf[1].i_addr;
+	unsigned int			op, i = 0;
 
 	/* Validate xfs_attri_log_format before the large memory allocation */
 	len = sizeof(struct xfs_attri_log_format);
-	if (item->ri_buf[0].i_len != len) {
+	if (item->ri_buf[i].i_len != len) {
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 				item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
 		return -EFSCORRUPTED;
 	}
 
+	attri_formatp = item->ri_buf[i].i_addr;
 	if (!xfs_attri_validate(mp, attri_formatp)) {
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
-				item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+				attri_formatp, len);
 		return -EFSCORRUPTED;
 	}
 
+	/* Check the number of log iovecs makes sense for the op code. */
+	op = attri_formatp->alfi_op_flags & XFS_ATTRI_OP_FLAGS_TYPE_MASK;
+	switch (op) {
+	case XFS_ATTRI_OP_FLAGS_SET:
+	case XFS_ATTRI_OP_FLAGS_REPLACE:
+		/* Log item, attr name, attr value */
+		if (item->ri_total != 3) {
+			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+					attri_formatp, len);
+			return -EFSCORRUPTED;
+		}
+		break;
+	case XFS_ATTRI_OP_FLAGS_REMOVE:
+		/* Log item, attr name */
+		if (item->ri_total != 2) {
+			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+					attri_formatp, len);
+			return -EFSCORRUPTED;
+		}
+		break;
+	default:
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+				attri_formatp, len);
+		return -EFSCORRUPTED;
+	}
+	i++;
+
 	/* Validate the attr name */
-	if (item->ri_buf[1].i_len !=
+	if (item->ri_buf[i].i_len !=
 			xlog_calc_iovec_len(attri_formatp->alfi_name_len)) {
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
-				item->ri_buf[0].i_addr, item->ri_buf[0].i_len);
+				attri_formatp, len);
 		return -EFSCORRUPTED;
 	}
 
-	if (!xfs_attr_namecheck(attr_name, attri_formatp->alfi_name_len)) {
+	attr_name = item->ri_buf[i].i_addr;
+	if (!xfs_attr_namecheck(attri_formatp->alfi_attr_filter, attr_name,
+				attri_formatp->alfi_name_len)) {
 		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
-				item->ri_buf[1].i_addr, item->ri_buf[1].i_len);
+				attri_formatp, len);
 		return -EFSCORRUPTED;
 	}
+	i++;
 
 	/* Validate the attr value, if present */
 	if (attri_formatp->alfi_value_len != 0) {
-		if (item->ri_buf[2].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) {
+		if (item->ri_buf[i].i_len != xlog_calc_iovec_len(attri_formatp->alfi_value_len)) {
			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
 					item->ri_buf[0].i_addr,
 					item->ri_buf[0].i_len);
 			return -EFSCORRUPTED;
 		}
 
-		attr_value = item->ri_buf[2].i_addr;
+		attr_value = item->ri_buf[i].i_addr;
+		i++;
 	}
 
+	/*
+	 * Make sure we got the correct number of buffers for the operation
+	 * that we just loaded.
+	 */
+	if (i != item->ri_total) {
+		XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+				attri_formatp, len);
+		return -EFSCORRUPTED;
+	}
+
+	switch (op) {
+	case XFS_ATTRI_OP_FLAGS_REMOVE:
+		/* Regular remove operations operate only on names. */
+		if (attr_value != NULL || attri_formatp->alfi_value_len != 0) {
+			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+					attri_formatp, len);
+			return -EFSCORRUPTED;
+		}
+		fallthrough;
+	case XFS_ATTRI_OP_FLAGS_SET:
+	case XFS_ATTRI_OP_FLAGS_REPLACE:
+		/*
+		 * Regular xattr set/remove/replace operations require a name
+		 * and do not take a newname.  Values are optional for set and
		 * replace.
+		 */
+		if (attr_name == NULL || attri_formatp->alfi_name_len == 0) {
+			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_LOW, mp,
+					attri_formatp, len);
+			return -EFSCORRUPTED;
+		}
+		break;
+	}
 
 	/*
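
In summary, the iovec layout these checks collectively enforce during recovery (a sketch derived from the checks above, not a verbatim comment from the patch):

    /*
     * item->ri_buf[] layout expected by xlog_recover_attri_commit_pass2():
     *
     *   SET/REPLACE:  [0] xfs_attri_log_format   [1] name   [2] value
     *   REMOVE:       [0] xfs_attri_log_format   [1] name
     *                 (no value iovec; alfi_value_len must be zero)
     */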


@@ -82,7 +82,8 @@ xfs_attr_shortform_list(
 	    (dp->i_af.if_bytes + sf->hdr.count * 16) < context->bufsize)) {
 		for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
 			if (XFS_IS_CORRUPT(context->dp->i_mount,
-					   !xfs_attr_namecheck(sfe->nameval,
+					   !xfs_attr_namecheck(sfe->flags,
+							       sfe->nameval,
 							       sfe->namelen)))
 				return -EFSCORRUPTED;
 			context->put_listent(context,
@@ -120,7 +121,8 @@ xfs_attr_shortform_list(
 	for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) {
 		if (unlikely(
 		    ((char *)sfe < (char *)sf) ||
-		    ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)))) {
+		    ((char *)sfe >= ((char *)sf + dp->i_af.if_bytes)) ||
+		    !xfs_attr_check_namespace(sfe->flags))) {
 			XFS_CORRUPTION_ERROR("xfs_attr_shortform_list",
 					     XFS_ERRLEVEL_LOW,
 					     context->dp->i_mount, sfe,
@@ -174,7 +176,7 @@ xfs_attr_shortform_list(
 			cursor->offset = 0;
 		}
 		if (XFS_IS_CORRUPT(context->dp->i_mount,
-				   !xfs_attr_namecheck(sbp->name,
+				   !xfs_attr_namecheck(sbp->flags, sbp->name,
 						       sbp->namelen))) {
 			error = -EFSCORRUPTED;
 			goto out;
@@ -465,7 +467,8 @@ xfs_attr3_leaf_list_int(
 		}
 
 		if (XFS_IS_CORRUPT(context->dp->i_mount,
-				   !xfs_attr_namecheck(name, namelen)))
+				   !xfs_attr_namecheck(entry->flags, name,
+						       namelen)))
 			return -EFSCORRUPTED;
 		context->put_listent(context, entry->flags,
 				name, namelen, valuelen);


@@ -636,13 +636,11 @@ out_unlock:
 
 /*
  * Test whether it is appropriate to check an inode for and free post EOF
- * blocks. The 'force' parameter determines whether we should also consider
- * regular files that are marked preallocated or append-only.
+ * blocks.
  */
 bool
 xfs_can_free_eofblocks(
-	struct xfs_inode	*ip,
-	bool			force)
+	struct xfs_inode	*ip)
 {
 	struct xfs_bmbt_irec	imap;
 	struct xfs_mount	*mp = ip->i_mount;
@@ -676,11 +674,11 @@ xfs_can_free_eofblocks(
 		return false;
 
 	/*
-	 * Do not free real preallocated or append-only files unless the file
-	 * has delalloc blocks and we are forced to remove them.
+	 * Only free real extents for inodes with persistent preallocations or
+	 * the append-only flag.
 	 */
 	if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND))
-		if (!force || ip->i_delayed_blks == 0)
+		if (ip->i_delayed_blks == 0)
 			return false;
 
 	/*
@@ -734,6 +732,22 @@ xfs_free_eofblocks(
 	/* Wait on dio to ensure i_size has settled. */
 	inode_dio_wait(VFS_I(ip));
 
+	/*
+	 * For preallocated files only free delayed allocations.
+	 *
+	 * Note that this means we also leave speculative preallocations in
+	 * place for preallocated files.
+	 */
+	if (ip->i_diflags & (XFS_DIFLAG_PREALLOC | XFS_DIFLAG_APPEND)) {
+		if (ip->i_delayed_blks) {
+			xfs_bmap_punch_delalloc_range(ip,
+				round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize),
+				LLONG_MAX);
+		}
+		xfs_inode_clear_eofblocks_tag(ip);
+		return 0;
+	}
+
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
 	if (error) {
 		ASSERT(xfs_is_shutdown(mp));
@@ -868,33 +882,32 @@ xfs_alloc_file_space(
 		if (error)
 			goto error;
 
+		/*
+		 * If the allocator cannot find a single free extent large
+		 * enough to cover the start block of the requested range,
+		 * xfs_bmapi_write will return -ENOSR.
+		 *
+		 * In that case we simply need to keep looping with the same
+		 * startoffset_fsb so that one of the following allocations
+		 * will eventually reach the requested range.
+		 */
 		error = xfs_bmapi_write(tp, ip, startoffset_fsb,
 				allocatesize_fsb, XFS_BMAPI_PREALLOC, 0, imapp,
 				&nimaps);
-		if (error)
-			goto error;
+		if (error) {
+			if (error != -ENOSR)
+				goto error;
+			error = 0;
+		} else {
+			startoffset_fsb += imapp->br_blockcount;
+			allocatesize_fsb -= imapp->br_blockcount;
+		}
 
 		ip->i_diflags |= XFS_DIFLAG_PREALLOC;
 		xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 
 		error = xfs_trans_commit(tp);
 		xfs_iunlock(ip, XFS_ILOCK_EXCL);
-		if (error)
-			break;
-
-		/*
-		 * If the allocator cannot find a single free extent large
-		 * enough to cover the start block of the requested range,
-		 * xfs_bmapi_write will return 0 but leave *nimaps set to 0.
-		 *
-		 * In that case we simply need to keep looping with the same
-		 * startoffset_fsb so that one of the following allocations
-		 * will eventually reach the requested range.
-		 */
-		if (nimaps) {
-			startoffset_fsb += imapp->br_blockcount;
-			allocatesize_fsb -= imapp->br_blockcount;
-		}
 	}
 
 	return error;
@@ -1049,7 +1062,7 @@ xfs_prepare_shift(
 	 * Trim eofblocks to avoid shifting uninitialized post-eof preallocation
 	 * into the accessible region of the file.
	 */
-	if (xfs_can_free_eofblocks(ip, true)) {
+	if (xfs_can_free_eofblocks(ip)) {
 		error = xfs_free_eofblocks(ip);
 		if (error)
 			return error;


@@ -63,7 +63,7 @@ int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset,
 		xfs_off_t len);
 
 /* EOF block manipulation functions */
-bool	xfs_can_free_eofblocks(struct xfs_inode *ip, bool force);
+bool	xfs_can_free_eofblocks(struct xfs_inode *ip);
 int	xfs_free_eofblocks(struct xfs_inode *ip);
 
 int	xfs_swap_extents(struct xfs_inode *ip, struct xfs_inode *tip,


@@ -333,7 +333,6 @@ xfs_dquot_disk_alloc(
 		goto err_cancel;
 
 	ASSERT(map.br_blockcount == XFS_DQUOT_CLUSTER_SIZE_FSB);
-	ASSERT(nmaps == 1);
 	ASSERT((map.br_startblock != DELAYSTARTBLOCK) &&
 	       (map.br_startblock != HOLESTARTBLOCK));


@@ -1149,7 +1149,7 @@ xfs_inode_free_eofblocks(
 	}
 	*lockflags |= XFS_IOLOCK_EXCL;
 
-	if (xfs_can_free_eofblocks(ip, false))
+	if (xfs_can_free_eofblocks(ip))
 		return xfs_free_eofblocks(ip);
 
 	/* inode could be preallocated or append-only */


@@ -1469,7 +1469,7 @@ xfs_release(
 	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL))
 		return 0;
 
-	if (xfs_can_free_eofblocks(ip, false)) {
+	if (xfs_can_free_eofblocks(ip)) {
 		/*
 		 * Check if the inode is being opened, written and closed
 		 * frequently and we have delayed allocation blocks outstanding
@@ -1685,15 +1685,13 @@ xfs_inode_needs_inactive(
 	/*
 	 * This file isn't being freed, so check if there are post-eof blocks
-	 * to free. @force is true because we are evicting an inode from the
-	 * cache. Post-eof blocks must be freed, lest we end up with broken
-	 * free space accounting.
+	 * to free.
 	 *
 	 * Note: don't bother with iolock here since lockdep complains about
 	 * acquiring it in reclaim context. We have the only reference to the
 	 * inode at this point anyways.
 	 */
-	return xfs_can_free_eofblocks(ip, true);
+	return xfs_can_free_eofblocks(ip);
 }
 
 /*
@@ -1741,15 +1739,11 @@ xfs_inactive(
 	if (VFS_I(ip)->i_nlink != 0) {
 		/*
-		 * force is true because we are evicting an inode from the
-		 * cache. Post-eof blocks must be freed, lest we end up with
-		 * broken free space accounting.
-		 *
 		 * Note: don't bother with iolock here since lockdep complains
 		 * about acquiring it in reclaim context. We have the only
 		 * reference to the inode at this point anyways.
 		 */
-		if (xfs_can_free_eofblocks(ip, true))
+		if (xfs_can_free_eofblocks(ip))
 			error = xfs_free_eofblocks(ip);
 
 		goto out;
@@ -2329,11 +2323,26 @@ xfs_ifree_cluster(
 		 * This buffer may not have been correctly initialised as we
 		 * didn't read it from disk. That's not important because we are
 		 * only using to mark the buffer as stale in the log, and to
-		 * attach stale cached inodes on it. That means it will never be
-		 * dispatched for IO. If it is, we want to know about it, and we
-		 * want it to fail. We can acheive this by adding a write
-		 * verifier to the buffer.
+		 * attach stale cached inodes on it.
+		 *
+		 * For the inode that triggered the cluster freeing, this
+		 * attachment may occur in xfs_inode_item_precommit() after we
+		 * have marked this buffer stale. If this buffer was not in
+		 * memory before xfs_ifree_cluster() started, it will not be
+		 * marked XBF_DONE and this will cause problems later in
+		 * xfs_inode_item_precommit() when we trip over a (stale, !done)
+		 * buffer to attached to the transaction.
+		 *
+		 * Hence we have to mark the buffer as XFS_DONE here. This is
+		 * safe because we are also marking the buffer as XBF_STALE and
+		 * XFS_BLI_STALE. That means it will never be dispatched for
+		 * IO and it won't be unlocked until the cluster freeing has
+		 * been committed to the journal and the buffer unpinned. If it
+		 * is written, we want to know about it, and we want it to
+		 * fail. We can acheive this by adding a write verifier to the
+		 * buffer.
 		 */
+		bp->b_flags |= XBF_DONE;
 		bp->b_ops = &xfs_inode_buf_ops;
 
 		/*
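The rewritten comment describes an ordering hazard: a cluster buffer instantiated here was never read from disk, gets marked stale, and xfs_inode_item_precommit() may later find it in the (stale, !done) state. Setting XBF_DONE up front preserves the invariant the precommit path relies on. A runnable analog of that invariant, with illustrative flag values rather than the kernel's:

#include <assert.h>
#include <stdio.h>

/* Illustrative flag bits, not the kernel's actual values. */
#define XBF_DONE	(1u << 0)
#define XBF_STALE	(1u << 1)

/* Hypothetical analog of the fix: an uncached buffer (never read,
 * so DONE unset) must be marked done before it is marked stale. */
static void ifree_mark_stale(unsigned int *bflags)
{
	*bflags |= XBF_DONE;	/* the added line in the hunk above */
	*bflags |= XBF_STALE;
}

/* What the precommit path effectively relies on: stale implies done. */
static void precommit_check(unsigned int bflags)
{
	assert(!(bflags & XBF_STALE) || (bflags & XBF_DONE));
}

int main(void)
{
	unsigned int bflags = 0;	/* freshly instantiated buffer */

	ifree_mark_stale(&bflags);
	precommit_check(bflags);
	puts("stale buffer is also done; precommit invariant holds");
	return 0;
}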
@@ -317,14 +317,6 @@ xfs_iomap_write_direct(
 	if (error)
 		goto out_unlock;
 
-	/*
-	 * Copy any maps to caller's array and return any error.
-	 */
-	if (nimaps == 0) {
-		error = -ENOSPC;
-		goto out_unlock;
-	}
-
 	if (unlikely(!xfs_valid_startblock(ip, imap->br_startblock)))
 		error = xfs_alert_fsblock_zero(ip, imap);
@@ -1013,6 +1005,24 @@ xfs_buffered_write_iomap_begin(
 		goto out_unlock;
 	}
 
+	/*
+	 * For zeroing, trim a delalloc extent that extends beyond the EOF
+	 * block. If it starts beyond the EOF block, convert it to an
+	 * unwritten extent.
+	 */
+	if ((flags & IOMAP_ZERO) && imap.br_startoff <= offset_fsb &&
+	    isnullstartblock(imap.br_startblock)) {
+		xfs_fileoff_t eof_fsb = XFS_B_TO_FSB(mp, XFS_ISIZE(ip));
+
+		if (offset_fsb >= eof_fsb)
+			goto convert_delay;
+		if (end_fsb > eof_fsb) {
+			end_fsb = eof_fsb;
+			xfs_trim_extent(&imap, offset_fsb,
+					end_fsb - offset_fsb);
+		}
+	}
+
 	/*
 	 * Search the COW fork extent list even if we did not find a data fork
 	 * extent. This serves two purposes: first this implements the
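The trimming logic above hinges on block-unit arithmetic: round i_size up to the EOF block, convert if the zeroing range starts at or past it, otherwise clamp the end. A small standalone sketch of that arithmetic, assuming a 4k block size and a round-up b_to_fsb() like XFS_B_TO_FSB:

#include <stdio.h>

#define BLKSZ 4096ULL

/* Round a byte count up to file-system blocks, like XFS_B_TO_FSB. */
static unsigned long long b_to_fsb(unsigned long long bytes)
{
	return (bytes + BLKSZ - 1) / BLKSZ;
}

int main(void)
{
	unsigned long long isize = 10000;		/* i_size in bytes */
	unsigned long long eof_fsb = b_to_fsb(isize);	/* == 3 */
	unsigned long long offset_fsb = 1, end_fsb = 8;

	if (offset_fsb >= eof_fsb)
		puts("range starts post-EOF: convert delalloc to unwritten");
	else if (end_fsb > eof_fsb)
		end_fsb = eof_fsb;			/* trim at EOF block */

	printf("zeroing covers fsb [%llu, %llu)\n", offset_fsb, end_fsb);
	return 0;
}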
@@ -1117,47 +1127,48 @@ xfs_buffered_write_iomap_begin(
 		}
 	}
 
-retry:
-	error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
-			end_fsb - offset_fsb, prealloc_blocks,
-			allocfork == XFS_DATA_FORK ? &imap : &cmap,
-			allocfork == XFS_DATA_FORK ? &icur : &ccur,
-			allocfork == XFS_DATA_FORK ? eof : cow_eof);
-	switch (error) {
-	case 0:
-		break;
-	case -ENOSPC:
-	case -EDQUOT:
-		/* retry without any preallocation */
-		trace_xfs_delalloc_enospc(ip, offset, count);
-		if (prealloc_blocks) {
-			prealloc_blocks = 0;
-			goto retry;
-		}
-		fallthrough;
-	default:
-		goto out_unlock;
-	}
-
 	if (allocfork == XFS_COW_FORK) {
+		error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+				end_fsb - offset_fsb, prealloc_blocks, &cmap,
+				&ccur, cow_eof);
+		if (error)
+			goto out_unlock;
+
 		trace_xfs_iomap_alloc(ip, offset, count, allocfork, &cmap);
 		goto found_cow;
 	}
 
+	error = xfs_bmapi_reserve_delalloc(ip, allocfork, offset_fsb,
+			end_fsb - offset_fsb, prealloc_blocks, &imap, &icur,
+			eof);
+	if (error)
+		goto out_unlock;
+
 	/*
 	 * Flag newly allocated delalloc blocks with IOMAP_F_NEW so we punch
 	 * them out if the write happens to fail.
 	 */
 	seq = xfs_iomap_inode_sequence(ip, IOMAP_F_NEW);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_iunlock(ip, lockmode);
 	trace_xfs_iomap_alloc(ip, offset, count, allocfork, &imap);
 	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, IOMAP_F_NEW, seq);
 
 found_imap:
 	seq = xfs_iomap_inode_sequence(ip, 0);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_iunlock(ip, lockmode);
 	return xfs_bmbt_to_iomap(ip, iomap, &imap, flags, 0, seq);
 
+convert_delay:
+	xfs_iunlock(ip, lockmode);
+	truncate_pagecache(inode, offset);
+	error = xfs_bmapi_convert_delalloc(ip, XFS_DATA_FORK, offset,
+			iomap, NULL);
+	if (error)
+		return error;
+
+	trace_xfs_iomap_alloc(ip, offset, count, XFS_DATA_FORK, &imap);
+	return 0;
+
 found_cow:
 	seq = xfs_iomap_inode_sequence(ip, 0);
 	if (imap.br_startoff <= offset_fsb) {
@@ -1165,17 +1176,17 @@ found_cow:
 		if (error)
 			goto out_unlock;
 		seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED);
-		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		xfs_iunlock(ip, lockmode);
 		return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags,
 				IOMAP_F_SHARED, seq);
 	}
 
 	xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_iunlock(ip, lockmode);
 	return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, 0, seq);
 
 out_unlock:
-	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	xfs_iunlock(ip, lockmode);
 	return error;
 }
@@ -429,13 +429,6 @@ xfs_reflink_fill_cow_hole(
 	if (error)
 		return error;
 
-	/*
-	 * Allocation succeeded but the requested range was not even partially
-	 * satisfied? Bail out!
-	 */
-	if (nimaps == 0)
-		return -ENOSPC;
-
 convert:
 	return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
@@ -498,13 +491,6 @@ xfs_reflink_fill_delalloc(
 		error = xfs_trans_commit(tp);
 		if (error)
 			return error;
-
-		/*
-		 * Allocation succeeded but the requested range was not even
-		 * partially satisfied? Bail out!
-		 */
-		if (nimaps == 0)
-			return -ENOSPC;
 	} while (cmap->br_startoff + cmap->br_blockcount <= imap->br_startoff);
 
 	return xfs_reflink_convert_unwritten(ip, imap, cmap, convert_now);
@@ -730,12 +716,6 @@ xfs_reflink_end_cow_extent(
 	int		nmaps;
 	int		error;
 
-	/* No COW extents? That's easy! */
-	if (ifp->if_bytes == 0) {
-		*offset_fsb = end_fsb;
-		return 0;
-	}
-
 	resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK);
 	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0,
 			XFS_TRANS_RESERVE, &tp);
@@ -840,8 +840,6 @@ xfs_growfs_rt_alloc(
 		nmap = 1;
 		error = xfs_bmapi_write(tp, ip, oblocks, nblocks - oblocks,
 				XFS_BMAPI_METADATA, 0, &map, &nmap);
-		if (!error && nmap < 1)
-			error = -ENOSPC;
 		if (error)
 			goto out_trans_cancel;
 
 		/*
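This is the same simplification as the xfs_dquot.c, xfs_iomap.c and xfs_reflink.c hunks above: once the allocator reports an allocation shortfall through its return value, as the rest of this series arranges, each caller's nmap-based -ENOSPC fixup is dead code. A toy userspace analog of the calling-convention change, with hypothetical alloc_old()/alloc_new() helpers:

#include <errno.h>
#include <stdio.h>

/* Old style: "nothing mapped" signalled only via the out-parameter,
 * so every caller repeated the same fixup. New style: the shortfall
 * is folded into the return value. */
static int alloc_old(int *nmap) { *nmap = 0; return 0; }
static int alloc_new(int *nmap) { *nmap = 0; return -ENOSPC; }

int main(void)
{
	int nmap, error;

	error = alloc_old(&nmap);
	if (!error && nmap < 1)		/* boilerplate the hunks delete */
		error = -ENOSPC;
	printf("old style: error=%d\n", error);

	error = alloc_new(&nmap);	/* shortfall reported directly */
	printf("new style: error=%d\n", error);
	return 0;
}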
@@ -2240,6 +2240,8 @@ static inline struct maple_enode *mte_node_or_none(struct maple_enode *enode)
 
 /*
  * mas_wr_node_walk() - Find the correct offset for the index in the @mas.
+ *                      If @mas->index cannot be found within the containing
+ *                      node, we traverse to the last entry in the node.
  * @wr_mas: The maple write state
  *
  * Uses mas_slot_locked() and does not need to worry about dead nodes.
@@ -3655,7 +3657,7 @@ static bool mas_wr_walk(struct ma_wr_state *wr_mas)
 	return true;
 }
 
-static bool mas_wr_walk_index(struct ma_wr_state *wr_mas)
+static void mas_wr_walk_index(struct ma_wr_state *wr_mas)
 {
 	struct ma_state *mas = wr_mas->mas;
@@ -3664,11 +3666,9 @@ static bool mas_wr_walk_index(struct ma_wr_state *wr_mas)
 		wr_mas->content = mas_slot_locked(mas, wr_mas->slots,
 						  mas->offset);
 		if (ma_is_leaf(wr_mas->type))
-			return true;
+			return;
 		mas_wr_walk_traverse(wr_mas);
 	}
-	return true;
 }
 
 /*
  * mas_extend_spanning_null() - Extend a store of a %NULL to include surrounding %NULLs.
@@ -3904,8 +3904,8 @@ static inline int mas_wr_spanning_store(struct ma_wr_state *wr_mas)
 	memset(&b_node, 0, sizeof(struct maple_big_node));
 	/* Copy l_mas and store the value in b_node. */
 	mas_store_b_node(&l_wr_mas, &b_node, l_wr_mas.node_end);
-	/* Copy r_mas into b_node. */
-	if (r_mas.offset <= r_wr_mas.node_end)
+	/* Copy r_mas into b_node if there is anything to copy. */
+	if (r_mas.max > r_mas.last)
 		mas_mab_cp(&r_mas, r_mas.offset, r_wr_mas.node_end,
 			   &b_node, b_node.b_end + 1);
 	else
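The new test reads directly off the write state: the right-hand node only contributes entries to b_node when its maximum index lies beyond the last index the spanning store wrote. A tiny standalone illustration of that predicate (the values are made up):

#include <stdbool.h>
#include <stdio.h>

/* Analog of the new spanning-store test: copy from the right node
 * only when its max index exceeds the store's last index. */
static bool copy_right(unsigned long r_max, unsigned long r_last)
{
	return r_max > r_last;
}

int main(void)
{
	/* store ends at 70, right node spans up to 100: copy the rest */
	printf("partial overwrite -> copy: %d\n", copy_right(100, 70));
	/* store ends at 100, consuming the node: nothing left to copy */
	printf("full overwrite    -> copy: %d\n", copy_right(100, 100));
	return 0;
}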
@@ -15,6 +15,7 @@ static const struct snmp_mib mptcp_snmp_list[] = {
 	SNMP_MIB_ITEM("MPCapableACKRX", MPTCP_MIB_MPCAPABLEPASSIVEACK),
 	SNMP_MIB_ITEM("MPCapableFallbackACK", MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK),
 	SNMP_MIB_ITEM("MPCapableFallbackSYNACK", MPTCP_MIB_MPCAPABLEACTIVEFALLBACK),
+	SNMP_MIB_ITEM("MPCapableEndpAttempt", MPTCP_MIB_MPCAPABLEENDPATTEMPT),
 	SNMP_MIB_ITEM("MPFallbackTokenInit", MPTCP_MIB_TOKENFALLBACKINIT),
 	SNMP_MIB_ITEM("MPTCPRetrans", MPTCP_MIB_RETRANSSEGS),
 	SNMP_MIB_ITEM("MPJoinNoTokenFound", MPTCP_MIB_JOINNOTOKEN),
@@ -8,6 +8,7 @@ enum linux_mptcp_mib_field {
 	MPTCP_MIB_MPCAPABLEPASSIVEACK,		/* Received third ACK with MP_CAPABLE */
 	MPTCP_MIB_MPCAPABLEPASSIVEFALLBACK,	/* Server-side fallback during 3-way handshake */
 	MPTCP_MIB_MPCAPABLEACTIVEFALLBACK,	/* Client-side fallback during 3-way handshake */
+	MPTCP_MIB_MPCAPABLEENDPATTEMPT,		/* Prohibited MPC to port-based endp */
 	MPTCP_MIB_TOKENFALLBACKINIT,		/* Could not init/allocate token */
 	MPTCP_MIB_RETRANSSEGS,			/* Segments retransmitted at the MPTCP-level */
 	MPTCP_MIB_JOINNOTOKEN,			/* Received MP_JOIN but the token was not found */
@@ -1125,6 +1125,7 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk,
 	 */
 	inet_sk_state_store(newsk, TCP_LISTEN);
 	lock_sock(ssk);
+	WRITE_ONCE(mptcp_subflow_ctx(ssk)->pm_listener, true);
 	err = __inet_listen_sk(ssk, backlog);
 	if (!err)
 		mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CREATED);
@@ -504,6 +504,7 @@ struct mptcp_subflow_context {
 		__unused : 9;
 	enum mptcp_data_avail data_avail;
 	bool	scheduled;
+	bool	pm_listener;	/* a listener managed by the kernel PM? */
 	u32	remote_nonce;
 	u64	thmac;
 	u32	local_nonce;
@@ -132,6 +132,13 @@ static void subflow_add_reset_reason(struct sk_buff *skb, u8 reason)
 	}
 }
 
+static int subflow_reset_req_endp(struct request_sock *req, struct sk_buff *skb)
+{
+	SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEENDPATTEMPT);
+	subflow_add_reset_reason(skb, MPTCP_RST_EPROHIBIT);
+	return -EPERM;
+}
+
 /* Init mptcp request socket.
  *
  * Returns an error code if a JOIN has failed and a TCP reset
@@ -163,6 +170,8 @@ static int subflow_check_req(struct request_sock *req,
 	if (opt_mp_capable) {
 		SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_MPCAPABLEPASSIVE);
 
+		if (unlikely(listener->pm_listener))
+			return subflow_reset_req_endp(req, skb);
 		if (opt_mp_join)
 			return 0;
 	} else if (opt_mp_join) {
@@ -170,6 +179,8 @@ static int subflow_check_req(struct request_sock *req,
 		if (mp_opt.backup)
 			SUBFLOW_REQ_INC_STATS(req, MPTCP_MIB_JOINSYNBACKUPRX);
+	} else if (unlikely(listener->pm_listener)) {
+		return subflow_reset_req_endp(req, skb);
 	}
 
 	if (opt_mp_capable && listener->request_mptcp) {