mm: perform VMA allocation, freeing, duplication in mm

Right now these are performed in kernel/fork.c which is odd and a
violation of separation of concerns, as well as preventing us from
integrating this and related logic into userland VMA testing going
forward.

There is a fly in the ointment - nommu - mmap.c is not compiled if
CONFIG_MMU not set, and neither is vma.c.

To square the circle, let's add a new file - vma_init.c.  This will be
compiled for both CONFIG_MMU and nommu builds, and will also form part of
the VMA userland testing.

This allows us to de-duplicate code, while maintaining separation of
concerns and the ability for us to userland test this logic.

Update the VMA userland tests accordingly, additionally adding a
detach_free_vma() helper function to correctly detach VMAs before freeing
them in test code, as this change was triggering the assert for this.

[akpm@linux-foundation.org: remove stray newline, per Liam]
Link: https://lkml.kernel.org/r/f97b3a85a6da0196b28070df331b99e22b263be8.1745853549.git.lorenzo.stoakes@oracle.com
Signed-off-by: Lorenzo Stoakes <lorenzo.stoakes@oracle.com>
Reviewed-by: Liam R. Howlett <Liam.Howlett@oracle.com>
Reviewed-by: Pedro Falcato <pfalcato@suse.de>
Reviewed-by: David Hildenbrand <david@redhat.com>
Reviewed-by: Kees Cook <kees@kernel.org>
Reviewed-by: Suren Baghdasaryan <surenb@google.com>
Reviewed-by: Vlastimil Babka <vbabka@suse.cz>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christian Brauner <brauner@kernel.org>
Cc: Jan Kara <jack@suse.cz>
Cc: Jann Horn <jannh@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
Lorenzo Stoakes 2025-04-28 16:28:17 +01:00 committed by Andrew Morton
parent 26a8f57760
commit 3e43e260f1
10 changed files with 250 additions and 126 deletions

View File

@ -15683,6 +15683,7 @@ F: mm/mseal.c
F: mm/vma.c
F: mm/vma.h
F: mm/vma_exec.c
F: mm/vma_init.c
F: mm/vma_internal.h
F: tools/testing/selftests/mm/merge.c
F: tools/testing/vma/

View File

@ -431,88 +431,9 @@ struct kmem_cache *files_cachep;
/* SLAB cache for fs_struct structures (tsk->fs) */
struct kmem_cache *fs_cachep;
/* SLAB cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
struct vm_area_struct *vma;
vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!vma)
return NULL;
vma_init(vma, mm);
return vma;
}
static void vm_area_init_from(const struct vm_area_struct *src,
struct vm_area_struct *dest)
{
dest->vm_mm = src->vm_mm;
dest->vm_ops = src->vm_ops;
dest->vm_start = src->vm_start;
dest->vm_end = src->vm_end;
dest->anon_vma = src->anon_vma;
dest->vm_pgoff = src->vm_pgoff;
dest->vm_file = src->vm_file;
dest->vm_private_data = src->vm_private_data;
vm_flags_init(dest, src->vm_flags);
memcpy(&dest->vm_page_prot, &src->vm_page_prot,
sizeof(dest->vm_page_prot));
/*
* src->shared.rb may be modified concurrently when called from
* dup_mmap(), but the clone will reinitialize it.
*/
data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
sizeof(dest->vm_userfaultfd_ctx));
#ifdef CONFIG_ANON_VMA_NAME
dest->anon_name = src->anon_name;
#endif
#ifdef CONFIG_SWAP
memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
sizeof(dest->swap_readahead_info));
#endif
#ifndef CONFIG_MMU
dest->vm_region = src->vm_region;
#endif
#ifdef CONFIG_NUMA
dest->vm_policy = src->vm_policy;
#endif
}
struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
return NULL;
ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
vm_area_init_from(orig, new);
vma_lock_init(new, true);
INIT_LIST_HEAD(&new->anon_vma_chain);
vma_numab_state_init(new);
dup_anon_vma_name(orig, new);
return new;
}
void vm_area_free(struct vm_area_struct *vma)
{
/* The vma should be detached while being destroyed. */
vma_assert_detached(vma);
vma_numab_state_free(vma);
free_anon_vma_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}
static void account_kernel_stack(struct task_struct *tsk, int account)
{
if (IS_ENABLED(CONFIG_VMAP_STACK)) {
@ -3033,11 +2954,6 @@ void __init mm_cache_init(void)
void __init proc_caches_init(void)
{
struct kmem_cache_args args = {
.use_freeptr_offset = true,
.freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
};
sighand_cachep = kmem_cache_create("sighand_cache",
sizeof(struct sighand_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
@ -3054,10 +2970,6 @@ void __init proc_caches_init(void)
sizeof(struct fs_struct), 0,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
NULL);
vm_area_cachep = kmem_cache_create("vm_area_struct",
sizeof(struct vm_area_struct), &args,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
SLAB_ACCOUNT);
mmap_init();
nsproxy_cache_init();
}

View File

@ -55,7 +55,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o percpu.o slab_common.o \
compaction.o show_mem.o \
interval_tree.o list_lru.o workingset.o \
debug.o gup.o mmap_lock.o $(mmu-y)
debug.o gup.o mmap_lock.o vma_init.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o

View File

@ -1554,7 +1554,7 @@ static const struct ctl_table mmap_table[] = {
#endif /* CONFIG_SYSCTL */
/*
* initialise the percpu counter for VM
* initialise the percpu counter for VM, initialise VMA state.
*/
void __init mmap_init(void)
{
@ -1565,6 +1565,7 @@ void __init mmap_init(void)
#ifdef CONFIG_SYSCTL
register_sysctl_init("vm", mmap_table);
#endif
vma_state_init();
}
/*

View File

@ -399,7 +399,8 @@ static const struct ctl_table nommu_table[] = {
};
/*
* initialise the percpu counter for VM and region record slabs
* initialise the percpu counter for VM and region record slabs, initialise VMA
* state.
*/
void __init mmap_init(void)
{
@ -409,6 +410,7 @@ void __init mmap_init(void)
VM_BUG_ON(ret);
vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
register_sysctl_init("vm", nommu_table);
vma_state_init();
}
/*

View File

@ -550,6 +550,12 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock);
int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma);
/* vma_init.h, shared between CONFIG_MMU and nommu. */
void __init vma_state_init(void);
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm);
struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig);
void vm_area_free(struct vm_area_struct *vma);
/* vma_exec.c */
#ifdef CONFIG_MMU
int create_init_stack_vma(struct mm_struct *mm, struct vm_area_struct **vmap,

101
mm/vma_init.c Normal file
View File

@ -0,0 +1,101 @@
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Functions for initialisaing, allocating, freeing and duplicating VMAs. Shared
* between CONFIG_MMU and non-CONFIG_MMU kernel configurations.
*/
#include "vma_internal.h"
#include "vma.h"
/* SLAB cache for vm_area_struct structures */
static struct kmem_cache *vm_area_cachep;
void __init vma_state_init(void)
{
struct kmem_cache_args args = {
.use_freeptr_offset = true,
.freeptr_offset = offsetof(struct vm_area_struct, vm_freeptr),
};
vm_area_cachep = kmem_cache_create("vm_area_struct",
sizeof(struct vm_area_struct), &args,
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
SLAB_ACCOUNT);
}
struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
{
struct vm_area_struct *vma;
vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!vma)
return NULL;
vma_init(vma, mm);
return vma;
}
static void vm_area_init_from(const struct vm_area_struct *src,
struct vm_area_struct *dest)
{
dest->vm_mm = src->vm_mm;
dest->vm_ops = src->vm_ops;
dest->vm_start = src->vm_start;
dest->vm_end = src->vm_end;
dest->anon_vma = src->anon_vma;
dest->vm_pgoff = src->vm_pgoff;
dest->vm_file = src->vm_file;
dest->vm_private_data = src->vm_private_data;
vm_flags_init(dest, src->vm_flags);
memcpy(&dest->vm_page_prot, &src->vm_page_prot,
sizeof(dest->vm_page_prot));
/*
* src->shared.rb may be modified concurrently when called from
* dup_mmap(), but the clone will reinitialize it.
*/
data_race(memcpy(&dest->shared, &src->shared, sizeof(dest->shared)));
memcpy(&dest->vm_userfaultfd_ctx, &src->vm_userfaultfd_ctx,
sizeof(dest->vm_userfaultfd_ctx));
#ifdef CONFIG_ANON_VMA_NAME
dest->anon_name = src->anon_name;
#endif
#ifdef CONFIG_SWAP
memcpy(&dest->swap_readahead_info, &src->swap_readahead_info,
sizeof(dest->swap_readahead_info));
#endif
#ifndef CONFIG_MMU
dest->vm_region = src->vm_region;
#endif
#ifdef CONFIG_NUMA
dest->vm_policy = src->vm_policy;
#endif
}
struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
{
struct vm_area_struct *new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
if (!new)
return NULL;
ASSERT_EXCLUSIVE_WRITER(orig->vm_flags);
ASSERT_EXCLUSIVE_WRITER(orig->vm_file);
vm_area_init_from(orig, new);
vma_lock_init(new, true);
INIT_LIST_HEAD(&new->anon_vma_chain);
vma_numab_state_init(new);
dup_anon_vma_name(orig, new);
return new;
}
void vm_area_free(struct vm_area_struct *vma)
{
/* The vma should be detached while being destroyed. */
vma_assert_detached(vma);
vma_numab_state_free(vma);
free_anon_vma_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}

View File

@ -9,7 +9,7 @@ include ../shared/shared.mk
OFILES = $(SHARED_OFILES) vma.o maple-shim.o
TARGETS = vma
vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_exec.c ../../../mm/vma.h
vma.o: vma.c vma_internal.h ../../../mm/vma.c ../../../mm/vma_init.c ../../../mm/vma_exec.c ../../../mm/vma.h
vma: $(OFILES)
$(CC) $(CFLAGS) -o $@ $(OFILES) $(LDLIBS)

View File

@ -28,6 +28,7 @@ unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
* Directly import the VMA implementation here. Our vma_internal.h wrapper
* provides userland-equivalent functionality for everything vma.c uses.
*/
#include "../../../mm/vma_init.c"
#include "../../../mm/vma_exec.c"
#include "../../../mm/vma.c"
@ -91,6 +92,12 @@ static int attach_vma(struct mm_struct *mm, struct vm_area_struct *vma)
return res;
}
static void detach_free_vma(struct vm_area_struct *vma)
{
vma_mark_detached(vma);
vm_area_free(vma);
}
/* Helper function to allocate a VMA and link it to the tree. */
static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
unsigned long start,
@ -104,7 +111,7 @@ static struct vm_area_struct *alloc_and_link_vma(struct mm_struct *mm,
return NULL;
if (attach_vma(mm, vma)) {
vm_area_free(vma);
detach_free_vma(vma);
return NULL;
}
@ -249,7 +256,7 @@ static int cleanup_mm(struct mm_struct *mm, struct vma_iterator *vmi)
vma_iter_set(vmi, 0);
for_each_vma(*vmi, vma) {
vm_area_free(vma);
detach_free_vma(vma);
count++;
}
@ -319,7 +326,7 @@ static bool test_simple_merge(void)
ASSERT_EQ(vma->vm_pgoff, 0);
ASSERT_EQ(vma->vm_flags, flags);
vm_area_free(vma);
detach_free_vma(vma);
mtree_destroy(&mm.mm_mt);
return true;
@ -361,7 +368,7 @@ static bool test_simple_modify(void)
ASSERT_EQ(vma->vm_end, 0x1000);
ASSERT_EQ(vma->vm_pgoff, 0);
vm_area_free(vma);
detach_free_vma(vma);
vma_iter_clear(&vmi);
vma = vma_next(&vmi);
@ -370,7 +377,7 @@ static bool test_simple_modify(void)
ASSERT_EQ(vma->vm_end, 0x2000);
ASSERT_EQ(vma->vm_pgoff, 1);
vm_area_free(vma);
detach_free_vma(vma);
vma_iter_clear(&vmi);
vma = vma_next(&vmi);
@ -379,7 +386,7 @@ static bool test_simple_modify(void)
ASSERT_EQ(vma->vm_end, 0x3000);
ASSERT_EQ(vma->vm_pgoff, 2);
vm_area_free(vma);
detach_free_vma(vma);
mtree_destroy(&mm.mm_mt);
return true;
@ -407,7 +414,7 @@ static bool test_simple_expand(void)
ASSERT_EQ(vma->vm_end, 0x3000);
ASSERT_EQ(vma->vm_pgoff, 0);
vm_area_free(vma);
detach_free_vma(vma);
mtree_destroy(&mm.mm_mt);
return true;
@ -428,7 +435,7 @@ static bool test_simple_shrink(void)
ASSERT_EQ(vma->vm_end, 0x1000);
ASSERT_EQ(vma->vm_pgoff, 0);
vm_area_free(vma);
detach_free_vma(vma);
mtree_destroy(&mm.mm_mt);
return true;
@ -619,7 +626,7 @@ static bool test_merge_new(void)
ASSERT_EQ(vma->vm_pgoff, 0);
ASSERT_EQ(vma->anon_vma, &dummy_anon_vma);
vm_area_free(vma);
detach_free_vma(vma);
count++;
}
@ -1668,6 +1675,7 @@ int main(void)
int num_tests = 0, num_fail = 0;
maple_tree_init();
vma_state_init();
#define TEST(name) \
do { \

View File

@ -155,6 +155,10 @@ typedef __bitwise unsigned int vm_fault_t;
*/
#define pr_warn_once pr_err
#define data_race(expr) expr
#define ASSERT_EXCLUSIVE_WRITER(x)
struct kref {
refcount_t refcount;
};
@ -255,6 +259,8 @@ struct file {
#define VMA_LOCK_OFFSET 0x40000000
typedef struct { unsigned long v; } freeptr_t;
struct vm_area_struct {
/* The first cache line has the info for VMA tree walking. */
@ -264,9 +270,7 @@ struct vm_area_struct {
unsigned long vm_start;
unsigned long vm_end;
};
#ifdef CONFIG_PER_VMA_LOCK
struct rcu_head vm_rcu; /* Used for deferred freeing. */
#endif
freeptr_t vm_freeptr; /* Pointer used by SLAB_TYPESAFE_BY_RCU */
};
struct mm_struct *vm_mm; /* The address space we belong to. */
@ -463,6 +467,65 @@ struct pagetable_move_control {
.len_in = len_, \
}
struct kmem_cache_args {
/**
* @align: The required alignment for the objects.
*
* %0 means no specific alignment is requested.
*/
unsigned int align;
/**
* @useroffset: Usercopy region offset.
*
* %0 is a valid offset, when @usersize is non-%0
*/
unsigned int useroffset;
/**
* @usersize: Usercopy region size.
*
* %0 means no usercopy region is specified.
*/
unsigned int usersize;
/**
* @freeptr_offset: Custom offset for the free pointer
* in &SLAB_TYPESAFE_BY_RCU caches
*
* By default &SLAB_TYPESAFE_BY_RCU caches place the free pointer
* outside of the object. This might cause the object to grow in size.
* Cache creators that have a reason to avoid this can specify a custom
* free pointer offset in their struct where the free pointer will be
* placed.
*
* Note that placing the free pointer inside the object requires the
* caller to ensure that no fields are invalidated that are required to
* guard against object recycling (See &SLAB_TYPESAFE_BY_RCU for
* details).
*
* Using %0 as a value for @freeptr_offset is valid. If @freeptr_offset
* is specified, %use_freeptr_offset must be set %true.
*
* Note that @ctor currently isn't supported with custom free pointers
* as a @ctor requires an external free pointer.
*/
unsigned int freeptr_offset;
/**
* @use_freeptr_offset: Whether a @freeptr_offset is used.
*/
bool use_freeptr_offset;
/**
* @ctor: A constructor for the objects.
*
* The constructor is invoked for each object in a newly allocated slab
* page. It is the cache user's responsibility to free object in the
* same state as after calling the constructor, or deal appropriately
* with any differences between a freshly constructed and a reallocated
* object.
*
* %NULL means no constructor.
*/
void (*ctor)(void *);
};
static inline void vma_iter_invalidate(struct vma_iterator *vmi)
{
mas_pause(&vmi->mas);
@ -547,31 +610,38 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm)
vma->vm_lock_seq = UINT_MAX;
}
static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm)
struct kmem_cache {
const char *name;
size_t object_size;
struct kmem_cache_args *args;
};
static inline struct kmem_cache *__kmem_cache_create(const char *name,
size_t object_size,
struct kmem_cache_args *args)
{
struct vm_area_struct *vma = calloc(1, sizeof(struct vm_area_struct));
struct kmem_cache *ret = malloc(sizeof(struct kmem_cache));
if (!vma)
return NULL;
ret->name = name;
ret->object_size = object_size;
ret->args = args;
vma_init(vma, mm);
return vma;
return ret;
}
static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
#define kmem_cache_create(__name, __object_size, __args, ...) \
__kmem_cache_create((__name), (__object_size), (__args))
static inline void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
{
struct vm_area_struct *new = calloc(1, sizeof(struct vm_area_struct));
(void)gfpflags;
if (!new)
return NULL;
return calloc(s->object_size, 1);
}
memcpy(new, orig, sizeof(*new));
refcount_set(&new->vm_refcnt, 0);
new->vm_lock_seq = UINT_MAX;
INIT_LIST_HEAD(&new->anon_vma_chain);
return new;
static inline void kmem_cache_free(struct kmem_cache *s, void *x)
{
free(x);
}
/*
@ -738,11 +808,6 @@ static inline void mpol_put(struct mempolicy *)
{
}
static inline void vm_area_free(struct vm_area_struct *vma)
{
free(vma);
}
static inline void lru_add_drain(void)
{
}
@ -1312,4 +1377,32 @@ static inline void ksm_exit(struct mm_struct *mm)
(void)mm;
}
static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt)
{
(void)vma;
(void)reset_refcnt;
}
static inline void vma_numab_state_init(struct vm_area_struct *vma)
{
(void)vma;
}
static inline void vma_numab_state_free(struct vm_area_struct *vma)
{
(void)vma;
}
static inline void dup_anon_vma_name(struct vm_area_struct *orig_vma,
struct vm_area_struct *new_vma)
{
(void)orig_vma;
(void)new_vma;
}
static inline void free_anon_vma_name(struct vm_area_struct *vma)
{
(void)vma;
}
#endif /* __MM_VMA_INTERNAL_H */