mirror of
				git://git.yoctoproject.org/linux-yocto.git
				synced 2025-10-23 07:23:12 +02:00 
			
		
		
		
	mm: replace vm_lock and detached flag with a reference count
rw_semaphore is a sizable structure of 40 bytes and consumes considerable space for each vm_area_struct. However, vma_lock has two important properties which can be used to replace rw_semaphore with a simpler structure: 1. Readers never wait. They try to take the vma_lock and fall back to mmap_lock if that fails. 2. Only one writer at a time will ever try to write-lock a vma_lock because writers first take mmap_lock in write mode. Because of these properties, full rw_semaphore functionality is not needed and we can replace rw_semaphore and the vma->detached flag with a refcount (vm_refcnt). When a vma is in the detached state, vm_refcnt is 0 and only a call to vma_mark_attached() can take it out of this state. Note that, unlike before, we now require both vma_mark_attached() and vma_mark_detached() to be called only after the vma has been write-locked. vma_mark_attached() changes vm_refcnt to 1 to indicate that it has been attached to the vma tree. When a reader takes the read lock, it increments vm_refcnt, unless the top usable bit of vm_refcnt (0x40000000) is set, indicating the presence of a writer. When a writer takes the write lock, it sets the top usable bit to indicate its presence. If there are readers, the writer will wait using the newly introduced mm->vma_writer_wait. Since all writers take mmap_lock in write mode first, there can be only one writer at a time. The last reader to release the lock will signal the writer to wake up. The refcount might overflow if there are many competing readers, in which case read-locking will fail. Readers are expected to handle such failures. In summary: 1. all readers increment the vm_refcnt; 2. the writer sets the top usable (writer) bit of vm_refcnt; 3. readers cannot increment the vm_refcnt if the writer bit is set; 4. in the presence of readers, the writer must wait for the vm_refcnt to drop to 1 (plus the VMA_LOCK_OFFSET writer bit), indicating an attached vma with no readers; 5. vm_refcnt overflow is handled by the readers. 
While this vm_lock replacement does not yet result in a smaller vm_area_struct (it stays at 256 bytes due to cacheline alignment), it allows for further size optimization by structure member regrouping to bring the size of vm_area_struct below 192 bytes. [surenb@google.com: fix a crash due to vma_end_read() that should have been removed] Link: https://lkml.kernel.org/r/20250220200208.323769-1-surenb@google.com Link: https://lkml.kernel.org/r/20250213224655.1680278-13-surenb@google.com Signed-off-by: Suren Baghdasaryan <surenb@google.com> Suggested-by: Peter Zijlstra <peterz@infradead.org> Suggested-by: Matthew Wilcox <willy@infradead.org> Tested-by: Shivank Garg <shivankg@amd.com> Link: https://lkml.kernel.org/r/5e19ec93-8307-47c2-bb13-3ddf7150624e@amd.com Reviewed-by: Vlastimil Babka <vbabka@suse.cz> Cc: Christian Brauner <brauner@kernel.org> Cc: David Hildenbrand <david@redhat.com> Cc: David Howells <dhowells@redhat.com> Cc: Davidlohr Bueso <dave@stgolabs.net> Cc: Hugh Dickins <hughd@google.com> Cc: Jann Horn <jannh@google.com> Cc: Johannes Weiner <hannes@cmpxchg.org> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Klara Modin <klarasmodin@gmail.com> Cc: Liam R. Howlett <Liam.Howlett@Oracle.com> Cc: Lokesh Gidra <lokeshgidra@google.com> Cc: Lorenzo Stoakes <lorenzo.stoakes@oracle.com> Cc: Mateusz Guzik <mjguzik@gmail.com> Cc: Mel Gorman <mgorman@techsingularity.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Minchan Kim <minchan@google.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Pasha Tatashin <pasha.tatashin@soleen.com> Cc: "Paul E . McKenney" <paulmck@kernel.org> Cc: Peter Xu <peterx@redhat.com> Cc: Shakeel Butt <shakeel.butt@linux.dev> Cc: Sourav Panda <souravpanda@google.com> Cc: Wei Yang <richard.weiyang@gmail.com> Cc: Will Deacon <will@kernel.org> Cc: Heiko Carstens <hca@linux.ibm.com> Cc: Stephen Rothwell <sfr@canb.auug.org.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
This commit is contained in:
		
							parent
							
								
									4e0dbe105d
								
							
						
					
					
						commit
						f35ab95ca0
					
				|  | @ -32,6 +32,7 @@ | ||||||
| #include <linux/memremap.h> | #include <linux/memremap.h> | ||||||
| #include <linux/slab.h> | #include <linux/slab.h> | ||||||
| #include <linux/cacheinfo.h> | #include <linux/cacheinfo.h> | ||||||
|  | #include <linux/rcuwait.h> | ||||||
| 
 | 
 | ||||||
| struct mempolicy; | struct mempolicy; | ||||||
| struct anon_vma; | struct anon_vma; | ||||||
|  | @ -697,19 +698,54 @@ static inline void vma_numab_state_free(struct vm_area_struct *vma) {} | ||||||
| #endif /* CONFIG_NUMA_BALANCING */ | #endif /* CONFIG_NUMA_BALANCING */ | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_PER_VMA_LOCK | #ifdef CONFIG_PER_VMA_LOCK | ||||||
| static inline void vma_lock_init(struct vm_area_struct *vma) | static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) | ||||||
| { | { | ||||||
| 	init_rwsem(&vma->vm_lock.lock); | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||||||
|  | 	static struct lock_class_key lockdep_key; | ||||||
|  | 
 | ||||||
|  | 	lockdep_init_map(&vma->vmlock_dep_map, "vm_lock", &lockdep_key, 0); | ||||||
|  | #endif | ||||||
|  | 	if (reset_refcnt) | ||||||
|  | 		refcount_set(&vma->vm_refcnt, 0); | ||||||
| 	vma->vm_lock_seq = UINT_MAX; | 	vma->vm_lock_seq = UINT_MAX; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | static inline bool is_vma_writer_only(int refcnt) | ||||||
|  | { | ||||||
|  | 	/*
 | ||||||
|  | 	 * With a writer and no readers, refcnt is VMA_LOCK_OFFSET if the vma | ||||||
|  | 	 * is detached and (VMA_LOCK_OFFSET + 1) if it is attached. Waiting on | ||||||
|  | 	 * a detached vma happens only in vma_mark_detached() and is a rare | ||||||
|  | 	 * case, therefore most of the time there will be no unnecessary wakeup. | ||||||
|  | 	 */ | ||||||
|  | 	return refcnt & VMA_LOCK_OFFSET && refcnt <= VMA_LOCK_OFFSET + 1; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static inline void vma_refcount_put(struct vm_area_struct *vma) | ||||||
|  | { | ||||||
|  | 	/* Use a copy of vm_mm in case vma is freed after we drop vm_refcnt */ | ||||||
|  | 	struct mm_struct *mm = vma->vm_mm; | ||||||
|  | 	int oldcnt; | ||||||
|  | 
 | ||||||
|  | 	rwsem_release(&vma->vmlock_dep_map, _RET_IP_); | ||||||
|  | 	if (!__refcount_dec_and_test(&vma->vm_refcnt, &oldcnt)) { | ||||||
|  | 
 | ||||||
|  | 		if (is_vma_writer_only(oldcnt - 1)) | ||||||
|  | 			rcuwait_wake_up(&mm->vma_writer_wait); | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | 
 | ||||||
| /*
 | /*
 | ||||||
|  * Try to read-lock a vma. The function is allowed to occasionally yield false |  * Try to read-lock a vma. The function is allowed to occasionally yield false | ||||||
|  * locked result to avoid performance overhead, in which case we fall back to |  * locked result to avoid performance overhead, in which case we fall back to | ||||||
|  * using mmap_lock. The function should never yield false unlocked result. |  * using mmap_lock. The function should never yield false unlocked result. | ||||||
|  |  * Returns the vma on success, NULL on failure to lock and ERR_PTR(-EAGAIN) if | ||||||
|  |  * the vma got detached. | ||||||
|  */ |  */ | ||||||
| static inline bool vma_start_read(struct vm_area_struct *vma) | static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) | ||||||
| { | { | ||||||
|  | 	int oldcnt; | ||||||
|  | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Check before locking. A race might cause false locked result. | 	 * Check before locking. A race might cause false locked result. | ||||||
| 	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need | 	 * We can use READ_ONCE() for the mm_lock_seq here, and don't need | ||||||
|  | @ -718,15 +754,25 @@ static inline bool vma_start_read(struct vm_area_struct *vma) | ||||||
| 	 * need ordering is below. | 	 * need ordering is below. | ||||||
| 	 */ | 	 */ | ||||||
| 	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) | 	if (READ_ONCE(vma->vm_lock_seq) == READ_ONCE(vma->vm_mm->mm_lock_seq.sequence)) | ||||||
| 		return false; | 		return NULL; | ||||||
| 
 |  | ||||||
| 	if (unlikely(down_read_trylock(&vma->vm_lock.lock) == 0)) |  | ||||||
| 		return false; |  | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Overflow might produce false locked result. | 	 * If VMA_LOCK_OFFSET is set, __refcount_inc_not_zero_limited_acquire() | ||||||
|  | 	 * will fail because VMA_REF_LIMIT is less than VMA_LOCK_OFFSET. | ||||||
|  | 	 * Acquire fence is required here to avoid reordering against later | ||||||
|  | 	 * vm_lock_seq check and checks inside lock_vma_under_rcu(). | ||||||
|  | 	 */ | ||||||
|  | 	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, | ||||||
|  | 							      VMA_REF_LIMIT))) { | ||||||
|  | 		/* return EAGAIN if vma got detached from under us */ | ||||||
|  | 		return oldcnt ? NULL : ERR_PTR(-EAGAIN); | ||||||
|  | 	} | ||||||
|  | 
 | ||||||
|  | 	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); | ||||||
|  | 	/*
 | ||||||
|  | 	 * Overflow of vm_lock_seq/mm_lock_seq might produce false locked result. | ||||||
| 	 * False unlocked result is impossible because we modify and check | 	 * False unlocked result is impossible because we modify and check | ||||||
| 	 * vma->vm_lock_seq under vma->vm_lock protection and mm->mm_lock_seq | 	 * vma->vm_lock_seq under vma->vm_refcnt protection and mm->mm_lock_seq | ||||||
| 	 * modification invalidates all existing locks. | 	 * modification invalidates all existing locks. | ||||||
| 	 * | 	 * | ||||||
| 	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are | 	 * We must use ACQUIRE semantics for the mm_lock_seq so that if we are | ||||||
|  | @ -735,10 +781,11 @@ static inline bool vma_start_read(struct vm_area_struct *vma) | ||||||
| 	 * This pairs with RELEASE semantics in vma_end_write_all(). | 	 * This pairs with RELEASE semantics in vma_end_write_all(). | ||||||
| 	 */ | 	 */ | ||||||
| 	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { | 	if (unlikely(vma->vm_lock_seq == raw_read_seqcount(&vma->vm_mm->mm_lock_seq))) { | ||||||
| 		up_read(&vma->vm_lock.lock); | 		vma_refcount_put(vma); | ||||||
| 		return false; | 		return NULL; | ||||||
| 	} | 	} | ||||||
| 	return true; | 
 | ||||||
|  | 	return vma; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /*
 | /*
 | ||||||
|  | @ -749,8 +796,14 @@ static inline bool vma_start_read(struct vm_area_struct *vma) | ||||||
|  */ |  */ | ||||||
| static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) | static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int subclass) | ||||||
| { | { | ||||||
|  | 	int oldcnt; | ||||||
|  | 
 | ||||||
| 	mmap_assert_locked(vma->vm_mm); | 	mmap_assert_locked(vma->vm_mm); | ||||||
| 	down_read_nested(&vma->vm_lock.lock, subclass); | 	if (unlikely(!__refcount_inc_not_zero_limited_acquire(&vma->vm_refcnt, &oldcnt, | ||||||
|  | 							      VMA_REF_LIMIT))) | ||||||
|  | 		return false; | ||||||
|  | 
 | ||||||
|  | 	rwsem_acquire_read(&vma->vmlock_dep_map, 0, 1, _RET_IP_); | ||||||
| 	return true; | 	return true; | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | @ -762,16 +815,12 @@ static inline bool vma_start_read_locked_nested(struct vm_area_struct *vma, int | ||||||
|  */ |  */ | ||||||
| static inline bool vma_start_read_locked(struct vm_area_struct *vma) | static inline bool vma_start_read_locked(struct vm_area_struct *vma) | ||||||
| { | { | ||||||
| 	mmap_assert_locked(vma->vm_mm); | 	return vma_start_read_locked_nested(vma, 0); | ||||||
| 	down_read(&vma->vm_lock.lock); |  | ||||||
| 	return true; |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline void vma_end_read(struct vm_area_struct *vma) | static inline void vma_end_read(struct vm_area_struct *vma) | ||||||
| { | { | ||||||
| 	rcu_read_lock(); /* keeps vma alive till the end of up_read */ | 	vma_refcount_put(vma); | ||||||
| 	up_read(&vma->vm_lock.lock); |  | ||||||
| 	rcu_read_unlock(); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ | /* WARNING! Can only be used if mmap_lock is expected to be write-locked */ | ||||||
|  | @ -813,38 +862,35 @@ static inline void vma_assert_write_locked(struct vm_area_struct *vma) | ||||||
| 
 | 
 | ||||||
| static inline void vma_assert_locked(struct vm_area_struct *vma) | static inline void vma_assert_locked(struct vm_area_struct *vma) | ||||||
| { | { | ||||||
| 	if (!rwsem_is_locked(&vma->vm_lock.lock)) | 	unsigned int mm_lock_seq; | ||||||
| 		vma_assert_write_locked(vma); | 
 | ||||||
|  | 	VM_BUG_ON_VMA(refcount_read(&vma->vm_refcnt) <= 1 && | ||||||
|  | 		      !__is_vma_write_locked(vma, &mm_lock_seq), vma); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
|  | /*
 | ||||||
|  |  * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these | ||||||
|  |  * assertions should be made either under mmap_write_lock or when the object | ||||||
|  |  * has been isolated under mmap_write_lock, ensuring no competing writers. | ||||||
|  |  */ | ||||||
| static inline void vma_assert_attached(struct vm_area_struct *vma) | static inline void vma_assert_attached(struct vm_area_struct *vma) | ||||||
| { | { | ||||||
| 	WARN_ON_ONCE(vma->detached); | 	WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline void vma_assert_detached(struct vm_area_struct *vma) | static inline void vma_assert_detached(struct vm_area_struct *vma) | ||||||
| { | { | ||||||
| 	WARN_ON_ONCE(!vma->detached); | 	WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline void vma_mark_attached(struct vm_area_struct *vma) | static inline void vma_mark_attached(struct vm_area_struct *vma) | ||||||
| { | { | ||||||
| 	vma_assert_detached(vma); |  | ||||||
| 	vma->detached = false; |  | ||||||
| } |  | ||||||
| 
 |  | ||||||
| static inline void vma_mark_detached(struct vm_area_struct *vma) |  | ||||||
| { |  | ||||||
| 	/* When detaching vma should be write-locked */ |  | ||||||
| 	vma_assert_write_locked(vma); | 	vma_assert_write_locked(vma); | ||||||
| 	vma_assert_attached(vma); | 	vma_assert_detached(vma); | ||||||
| 	vma->detached = true; | 	refcount_set(&vma->vm_refcnt, 1); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline bool is_vma_detached(struct vm_area_struct *vma) | void vma_mark_detached(struct vm_area_struct *vma); | ||||||
| { |  | ||||||
| 	return vma->detached; |  | ||||||
| } |  | ||||||
| 
 | 
 | ||||||
| static inline void release_fault_lock(struct vm_fault *vmf) | static inline void release_fault_lock(struct vm_fault *vmf) | ||||||
| { | { | ||||||
|  | @ -867,9 +913,9 @@ struct vm_area_struct *lock_vma_under_rcu(struct mm_struct *mm, | ||||||
| 
 | 
 | ||||||
| #else /* CONFIG_PER_VMA_LOCK */ | #else /* CONFIG_PER_VMA_LOCK */ | ||||||
| 
 | 
 | ||||||
| static inline void vma_lock_init(struct vm_area_struct *vma) {} | static inline void vma_lock_init(struct vm_area_struct *vma, bool reset_refcnt) {} | ||||||
| static inline bool vma_start_read(struct vm_area_struct *vma) | static inline struct vm_area_struct *vma_start_read(struct vm_area_struct *vma) | ||||||
| 		{ return false; } | 		{ return NULL; } | ||||||
| static inline void vma_end_read(struct vm_area_struct *vma) {} | static inline void vma_end_read(struct vm_area_struct *vma) {} | ||||||
| static inline void vma_start_write(struct vm_area_struct *vma) {} | static inline void vma_start_write(struct vm_area_struct *vma) {} | ||||||
| static inline void vma_assert_write_locked(struct vm_area_struct *vma) | static inline void vma_assert_write_locked(struct vm_area_struct *vma) | ||||||
|  | @ -910,12 +956,8 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) | ||||||
| 	vma->vm_mm = mm; | 	vma->vm_mm = mm; | ||||||
| 	vma->vm_ops = &vma_dummy_vm_ops; | 	vma->vm_ops = &vma_dummy_vm_ops; | ||||||
| 	INIT_LIST_HEAD(&vma->anon_vma_chain); | 	INIT_LIST_HEAD(&vma->anon_vma_chain); | ||||||
| #ifdef CONFIG_PER_VMA_LOCK |  | ||||||
| 	/* vma is not locked, can't use vma_mark_detached() */ |  | ||||||
| 	vma->detached = true; |  | ||||||
| #endif |  | ||||||
| 	vma_numab_state_init(vma); | 	vma_numab_state_init(vma); | ||||||
| 	vma_lock_init(vma); | 	vma_lock_init(vma, false); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| /* Use when VMA is not part of the VMA tree and needs no locking */ | /* Use when VMA is not part of the VMA tree and needs no locking */ | ||||||
|  |  | ||||||
|  | @ -19,6 +19,7 @@ | ||||||
| #include <linux/workqueue.h> | #include <linux/workqueue.h> | ||||||
| #include <linux/seqlock.h> | #include <linux/seqlock.h> | ||||||
| #include <linux/percpu_counter.h> | #include <linux/percpu_counter.h> | ||||||
|  | #include <linux/types.h> | ||||||
| 
 | 
 | ||||||
| #include <asm/mmu.h> | #include <asm/mmu.h> | ||||||
| 
 | 
 | ||||||
|  | @ -629,9 +630,8 @@ static inline struct anon_vma_name *anon_vma_name_alloc(const char *name) | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| struct vma_lock { | #define VMA_LOCK_OFFSET	0x40000000 | ||||||
| 	struct rw_semaphore lock; | #define VMA_REF_LIMIT	(VMA_LOCK_OFFSET - 1) | ||||||
| }; |  | ||||||
| 
 | 
 | ||||||
| struct vma_numab_state { | struct vma_numab_state { | ||||||
| 	/*
 | 	/*
 | ||||||
|  | @ -709,19 +709,13 @@ struct vm_area_struct { | ||||||
| 	}; | 	}; | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_PER_VMA_LOCK | #ifdef CONFIG_PER_VMA_LOCK | ||||||
| 	/*
 |  | ||||||
| 	 * Flag to indicate areas detached from the mm->mm_mt tree. |  | ||||||
| 	 * Unstable RCU readers are allowed to read this. |  | ||||||
| 	 */ |  | ||||||
| 	bool detached; |  | ||||||
| 
 |  | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Can only be written (using WRITE_ONCE()) while holding both: | 	 * Can only be written (using WRITE_ONCE()) while holding both: | ||||||
| 	 *  - mmap_lock (in write mode) | 	 *  - mmap_lock (in write mode) | ||||||
| 	 *  - vm_lock->lock (in write mode) | 	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set | ||||||
| 	 * Can be read reliably while holding one of: | 	 * Can be read reliably while holding one of: | ||||||
| 	 *  - mmap_lock (in read or write mode) | 	 *  - mmap_lock (in read or write mode) | ||||||
| 	 *  - vm_lock->lock (in read or write mode) | 	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 | ||||||
| 	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout | 	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout | ||||||
| 	 * while holding nothing (except RCU to keep the VMA struct allocated). | 	 * while holding nothing (except RCU to keep the VMA struct allocated). | ||||||
| 	 * | 	 * | ||||||
|  | @ -784,7 +778,10 @@ struct vm_area_struct { | ||||||
| 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx; | 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx; | ||||||
| #ifdef CONFIG_PER_VMA_LOCK | #ifdef CONFIG_PER_VMA_LOCK | ||||||
| 	/* Unstable RCU readers are allowed to read this. */ | 	/* Unstable RCU readers are allowed to read this. */ | ||||||
| 	struct vma_lock vm_lock ____cacheline_aligned_in_smp; | 	refcount_t vm_refcnt ____cacheline_aligned_in_smp; | ||||||
|  | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||||||
|  | 	struct lockdep_map vmlock_dep_map; | ||||||
|  | #endif | ||||||
| #endif | #endif | ||||||
| } __randomize_layout; | } __randomize_layout; | ||||||
| 
 | 
 | ||||||
|  | @ -920,6 +917,7 @@ struct mm_struct { | ||||||
| 					  * by mmlist_lock | 					  * by mmlist_lock | ||||||
| 					  */ | 					  */ | ||||||
| #ifdef CONFIG_PER_VMA_LOCK | #ifdef CONFIG_PER_VMA_LOCK | ||||||
|  | 		struct rcuwait vma_writer_wait; | ||||||
| 		/*
 | 		/*
 | ||||||
| 		 * This field has lock-like semantics, meaning it is sometimes | 		 * This field has lock-like semantics, meaning it is sometimes | ||||||
| 		 * accessed with ACQUIRE/RELEASE semantics. | 		 * accessed with ACQUIRE/RELEASE semantics. | ||||||
|  |  | ||||||
|  | @ -463,12 +463,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) | ||||||
| 	 * will be reinitialized. | 	 * will be reinitialized. | ||||||
| 	 */ | 	 */ | ||||||
| 	data_race(memcpy(new, orig, sizeof(*new))); | 	data_race(memcpy(new, orig, sizeof(*new))); | ||||||
| 	vma_lock_init(new); | 	vma_lock_init(new, true); | ||||||
| 	INIT_LIST_HEAD(&new->anon_vma_chain); | 	INIT_LIST_HEAD(&new->anon_vma_chain); | ||||||
| #ifdef CONFIG_PER_VMA_LOCK |  | ||||||
| 	/* vma is not locked, can't use vma_mark_detached() */ |  | ||||||
| 	new->detached = true; |  | ||||||
| #endif |  | ||||||
| 	vma_numab_state_init(new); | 	vma_numab_state_init(new); | ||||||
| 	dup_anon_vma_name(orig, new); | 	dup_anon_vma_name(orig, new); | ||||||
| 
 | 
 | ||||||
|  | @ -477,6 +473,8 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) | ||||||
| 
 | 
 | ||||||
| void __vm_area_free(struct vm_area_struct *vma) | void __vm_area_free(struct vm_area_struct *vma) | ||||||
| { | { | ||||||
|  | 	/* The vma should be detached while being destroyed. */ | ||||||
|  | 	vma_assert_detached(vma); | ||||||
| 	vma_numab_state_free(vma); | 	vma_numab_state_free(vma); | ||||||
| 	free_anon_vma_name(vma); | 	free_anon_vma_name(vma); | ||||||
| 	kmem_cache_free(vm_area_cachep, vma); | 	kmem_cache_free(vm_area_cachep, vma); | ||||||
|  | @ -488,8 +486,6 @@ static void vm_area_free_rcu_cb(struct rcu_head *head) | ||||||
| 	struct vm_area_struct *vma = container_of(head, struct vm_area_struct, | 	struct vm_area_struct *vma = container_of(head, struct vm_area_struct, | ||||||
| 						  vm_rcu); | 						  vm_rcu); | ||||||
| 
 | 
 | ||||||
| 	/* The vma should not be locked while being destroyed. */ |  | ||||||
| 	VM_BUG_ON_VMA(rwsem_is_locked(&vma->vm_lock.lock), vma); |  | ||||||
| 	__vm_area_free(vma); | 	__vm_area_free(vma); | ||||||
| } | } | ||||||
| #endif | #endif | ||||||
|  | @ -1234,6 +1230,9 @@ static void mmap_init_lock(struct mm_struct *mm) | ||||||
| { | { | ||||||
| 	init_rwsem(&mm->mmap_lock); | 	init_rwsem(&mm->mmap_lock); | ||||||
| 	mm_lock_seqcount_init(mm); | 	mm_lock_seqcount_init(mm); | ||||||
|  | #ifdef CONFIG_PER_VMA_LOCK | ||||||
|  | 	rcuwait_init(&mm->vma_writer_wait); | ||||||
|  | #endif | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, | static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, | ||||||
|  |  | ||||||
|  | @ -40,6 +40,7 @@ struct mm_struct init_mm = { | ||||||
| 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), | 	.arg_lock	=  __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), | ||||||
| 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist), | 	.mmlist		= LIST_HEAD_INIT(init_mm.mmlist), | ||||||
| #ifdef CONFIG_PER_VMA_LOCK | #ifdef CONFIG_PER_VMA_LOCK | ||||||
|  | 	.vma_writer_wait = __RCUWAIT_INITIALIZER(init_mm.vma_writer_wait), | ||||||
| 	.mm_lock_seq	= SEQCNT_ZERO(init_mm.mm_lock_seq), | 	.mm_lock_seq	= SEQCNT_ZERO(init_mm.mm_lock_seq), | ||||||
| #endif | #endif | ||||||
| 	.user_ns	= &init_user_ns, | 	.user_ns	= &init_user_ns, | ||||||
|  |  | ||||||
							
								
								
									
										90
									
								
								mm/memory.c
									
									
									
									
									
								
							
							
						
						
									
										90
									
								
								mm/memory.c
									
									
									
									
									
								
							|  | @ -6353,9 +6353,47 @@ fail: | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_PER_VMA_LOCK | #ifdef CONFIG_PER_VMA_LOCK | ||||||
|  | static inline bool __vma_enter_locked(struct vm_area_struct *vma, bool detaching) | ||||||
|  | { | ||||||
|  | 	unsigned int tgt_refcnt = VMA_LOCK_OFFSET; | ||||||
|  | 
 | ||||||
|  | 	/* Additional refcnt if the vma is attached. */ | ||||||
|  | 	if (!detaching) | ||||||
|  | 		tgt_refcnt++; | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * If vma is detached then only vma_mark_attached() can raise the | ||||||
|  | 	 * vm_refcnt. mmap_write_lock prevents racing with vma_mark_attached(). | ||||||
|  | 	 */ | ||||||
|  | 	if (!refcount_add_not_zero(VMA_LOCK_OFFSET, &vma->vm_refcnt)) | ||||||
|  | 		return false; | ||||||
|  | 
 | ||||||
|  | 	rwsem_acquire(&vma->vmlock_dep_map, 0, 0, _RET_IP_); | ||||||
|  | 	rcuwait_wait_event(&vma->vm_mm->vma_writer_wait, | ||||||
|  | 		   refcount_read(&vma->vm_refcnt) == tgt_refcnt, | ||||||
|  | 		   TASK_UNINTERRUPTIBLE); | ||||||
|  | 	lock_acquired(&vma->vmlock_dep_map, _RET_IP_); | ||||||
|  | 
 | ||||||
|  | 	return true; | ||||||
|  | } | ||||||
|  | 
 | ||||||
|  | static inline void __vma_exit_locked(struct vm_area_struct *vma, bool *detached) | ||||||
|  | { | ||||||
|  | 	*detached = refcount_sub_and_test(VMA_LOCK_OFFSET, &vma->vm_refcnt); | ||||||
|  | 	rwsem_release(&vma->vmlock_dep_map, _RET_IP_); | ||||||
|  | } | ||||||
|  | 
 | ||||||
| void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) | void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) | ||||||
| { | { | ||||||
| 	down_write(&vma->vm_lock.lock); | 	bool locked; | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * __vma_enter_locked() returns false immediately if the vma is not | ||||||
|  | 	 * attached, otherwise it waits until refcnt is indicating that vma | ||||||
|  | 	 * is attached with no readers. | ||||||
|  | 	 */ | ||||||
|  | 	locked = __vma_enter_locked(vma, false); | ||||||
|  | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * We should use WRITE_ONCE() here because we can have concurrent reads | 	 * We should use WRITE_ONCE() here because we can have concurrent reads | ||||||
| 	 * from the early lockless pessimistic check in vma_start_read(). | 	 * from the early lockless pessimistic check in vma_start_read(). | ||||||
|  | @ -6363,10 +6401,40 @@ void __vma_start_write(struct vm_area_struct *vma, unsigned int mm_lock_seq) | ||||||
| 	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. | 	 * we should use WRITE_ONCE() for cleanliness and to keep KCSAN happy. | ||||||
| 	 */ | 	 */ | ||||||
| 	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); | 	WRITE_ONCE(vma->vm_lock_seq, mm_lock_seq); | ||||||
| 	up_write(&vma->vm_lock.lock); | 
 | ||||||
|  | 	if (locked) { | ||||||
|  | 		bool detached; | ||||||
|  | 
 | ||||||
|  | 		__vma_exit_locked(vma, &detached); | ||||||
|  | 		WARN_ON_ONCE(detached); /* vma should remain attached */ | ||||||
|  | 	} | ||||||
| } | } | ||||||
| EXPORT_SYMBOL_GPL(__vma_start_write); | EXPORT_SYMBOL_GPL(__vma_start_write); | ||||||
| 
 | 
 | ||||||
|  | void vma_mark_detached(struct vm_area_struct *vma) | ||||||
|  | { | ||||||
|  | 	vma_assert_write_locked(vma); | ||||||
|  | 	vma_assert_attached(vma); | ||||||
|  | 
 | ||||||
|  | 	/*
 | ||||||
|  | 	 * We are the only writer, so no need to use vma_refcount_put(). | ||||||
|  | 	 * The condition below is unlikely because the vma has been already | ||||||
|  | 	 * write-locked and readers can increment vm_refcnt only temporarily | ||||||
|  | 	 * before they check vm_lock_seq, realize the vma is locked and drop | ||||||
|  | 	 * back the vm_refcnt. That is a narrow window for observing a raised | ||||||
|  | 	 * vm_refcnt. | ||||||
|  | 	 */ | ||||||
|  | 	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { | ||||||
|  | 		/* Wait until vma is detached with no readers. */ | ||||||
|  | 		if (__vma_enter_locked(vma, true)) { | ||||||
|  | 			bool detached; | ||||||
|  | 
 | ||||||
|  | 			__vma_exit_locked(vma, &detached); | ||||||
|  | 			WARN_ON_ONCE(!detached); | ||||||
|  | 		} | ||||||
|  | 	} | ||||||
|  | } | ||||||
|  | 
 | ||||||
| /*
 | /*
 | ||||||
|  * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be |  * Lookup and lock a VMA under RCU protection. Returned VMA is guaranteed to be | ||||||
|  * stable and not isolated. If the VMA is not found or is being modified the |  * stable and not isolated. If the VMA is not found or is being modified the | ||||||
|  | @ -6384,15 +6452,17 @@ retry: | ||||||
| 	if (!vma) | 	if (!vma) | ||||||
| 		goto inval; | 		goto inval; | ||||||
| 
 | 
 | ||||||
| 	if (!vma_start_read(vma)) | 	vma = vma_start_read(vma); | ||||||
| 		goto inval; | 	if (IS_ERR_OR_NULL(vma)) { | ||||||
|  | 		/* Check if the VMA got isolated after we found it */ | ||||||
|  | 		if (PTR_ERR(vma) == -EAGAIN) { | ||||||
|  | 			count_vm_vma_lock_event(VMA_LOCK_MISS); | ||||||
|  | 			/* The area was replaced with another one */ | ||||||
|  | 			goto retry; | ||||||
|  | 		} | ||||||
| 
 | 
 | ||||||
| 	/* Check if the VMA got isolated after we found it */ | 		/* Failed to lock the VMA */ | ||||||
| 	if (is_vma_detached(vma)) { | 		goto inval; | ||||||
| 		vma_end_read(vma); |  | ||||||
| 		count_vm_vma_lock_event(VMA_LOCK_MISS); |  | ||||||
| 		/* The area was replaced with another one */ |  | ||||||
| 		goto retry; |  | ||||||
| 	} | 	} | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * At this point, we have a stable reference to a VMA: The VMA is | 	 * At this point, we have a stable reference to a VMA: The VMA is | ||||||
|  |  | ||||||
|  | @ -9,4 +9,9 @@ | ||||||
| #define atomic_set(x, y) uatomic_set(x, y) | #define atomic_set(x, y) uatomic_set(x, y) | ||||||
| #define U8_MAX UCHAR_MAX | #define U8_MAX UCHAR_MAX | ||||||
| 
 | 
 | ||||||
|  | #ifndef atomic_cmpxchg_relaxed | ||||||
|  | #define  atomic_cmpxchg_relaxed		uatomic_cmpxchg | ||||||
|  | #define  atomic_cmpxchg_release         uatomic_cmpxchg | ||||||
|  | #endif /* atomic_cmpxchg_relaxed */ | ||||||
|  | 
 | ||||||
| #endif	/* _LINUX_ATOMIC_H */ | #endif	/* _LINUX_ATOMIC_H */ | ||||||
|  |  | ||||||
|  | @ -25,7 +25,7 @@ | ||||||
| #include <linux/maple_tree.h> | #include <linux/maple_tree.h> | ||||||
| #include <linux/mm.h> | #include <linux/mm.h> | ||||||
| #include <linux/rbtree.h> | #include <linux/rbtree.h> | ||||||
| #include <linux/rwsem.h> | #include <linux/refcount.h> | ||||||
| 
 | 
 | ||||||
| extern unsigned long stack_guard_gap; | extern unsigned long stack_guard_gap; | ||||||
| #ifdef CONFIG_MMU | #ifdef CONFIG_MMU | ||||||
|  | @ -135,10 +135,6 @@ typedef __bitwise unsigned int vm_fault_t; | ||||||
|  */ |  */ | ||||||
| #define pr_warn_once pr_err | #define pr_warn_once pr_err | ||||||
| 
 | 
 | ||||||
| typedef struct refcount_struct { |  | ||||||
| 	atomic_t refs; |  | ||||||
| } refcount_t; |  | ||||||
| 
 |  | ||||||
| struct kref { | struct kref { | ||||||
| 	refcount_t refcount; | 	refcount_t refcount; | ||||||
| }; | }; | ||||||
|  | @ -233,15 +229,12 @@ struct mm_struct { | ||||||
| 	unsigned long flags; /* Must use atomic bitops to access */ | 	unsigned long flags; /* Must use atomic bitops to access */ | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
| struct vma_lock { |  | ||||||
| 	struct rw_semaphore lock; |  | ||||||
| }; |  | ||||||
| 
 |  | ||||||
| 
 |  | ||||||
| struct file { | struct file { | ||||||
| 	struct address_space	*f_mapping; | 	struct address_space	*f_mapping; | ||||||
| }; | }; | ||||||
| 
 | 
 | ||||||
|  | #define VMA_LOCK_OFFSET	0x40000000 | ||||||
|  | 
 | ||||||
| struct vm_area_struct { | struct vm_area_struct { | ||||||
| 	/* The first cache line has the info for VMA tree walking. */ | 	/* The first cache line has the info for VMA tree walking. */ | ||||||
| 
 | 
 | ||||||
|  | @ -269,16 +262,13 @@ struct vm_area_struct { | ||||||
| 	}; | 	}; | ||||||
| 
 | 
 | ||||||
| #ifdef CONFIG_PER_VMA_LOCK | #ifdef CONFIG_PER_VMA_LOCK | ||||||
| 	/* Flag to indicate areas detached from the mm->mm_mt tree */ |  | ||||||
| 	bool detached; |  | ||||||
| 
 |  | ||||||
| 	/*
 | 	/*
 | ||||||
| 	 * Can only be written (using WRITE_ONCE()) while holding both: | 	 * Can only be written (using WRITE_ONCE()) while holding both: | ||||||
| 	 *  - mmap_lock (in write mode) | 	 *  - mmap_lock (in write mode) | ||||||
| 	 *  - vm_lock.lock (in write mode) | 	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set | ||||||
| 	 * Can be read reliably while holding one of: | 	 * Can be read reliably while holding one of: | ||||||
| 	 *  - mmap_lock (in read or write mode) | 	 *  - mmap_lock (in read or write mode) | ||||||
| 	 *  - vm_lock.lock (in read or write mode) | 	 *  - vm_refcnt bit at VMA_LOCK_OFFSET is set or vm_refcnt > 1 | ||||||
| 	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout | 	 * Can be read unreliably (using READ_ONCE()) for pessimistic bailout | ||||||
| 	 * while holding nothing (except RCU to keep the VMA struct allocated). | 	 * while holding nothing (except RCU to keep the VMA struct allocated). | ||||||
| 	 * | 	 * | ||||||
|  | @ -287,7 +277,6 @@ struct vm_area_struct { | ||||||
| 	 * slowpath. | 	 * slowpath. | ||||||
| 	 */ | 	 */ | ||||||
| 	unsigned int vm_lock_seq; | 	unsigned int vm_lock_seq; | ||||||
| 	struct vma_lock vm_lock; |  | ||||||
| #endif | #endif | ||||||
| 
 | 
 | ||||||
| 	/*
 | 	/*
 | ||||||
|  | @ -340,6 +329,10 @@ struct vm_area_struct { | ||||||
| 	struct vma_numab_state *numab_state;	/* NUMA Balancing state */ | 	struct vma_numab_state *numab_state;	/* NUMA Balancing state */ | ||||||
| #endif | #endif | ||||||
| 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx; | 	struct vm_userfaultfd_ctx vm_userfaultfd_ctx; | ||||||
|  | #ifdef CONFIG_PER_VMA_LOCK | ||||||
|  | 	/* Unstable RCU readers are allowed to read this. */ | ||||||
|  | 	refcount_t vm_refcnt; | ||||||
|  | #endif | ||||||
| } __randomize_layout; | } __randomize_layout; | ||||||
| 
 | 
 | ||||||
| struct vm_fault {}; | struct vm_fault {}; | ||||||
|  | @ -464,33 +457,40 @@ static inline struct vm_area_struct *vma_next(struct vma_iterator *vmi) | ||||||
| 	return mas_find(&vmi->mas, ULONG_MAX); | 	return mas_find(&vmi->mas, ULONG_MAX); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline void vma_lock_init(struct vm_area_struct *vma) | /*
 | ||||||
| { |  * WARNING: to avoid racing with vma_mark_attached()/vma_mark_detached(), these | ||||||
| 	init_rwsem(&vma->vm_lock.lock); |  * assertions should be made either under mmap_write_lock or when the object | ||||||
| 	vma->vm_lock_seq = UINT_MAX; |  * has been isolated under mmap_write_lock, ensuring no competing writers. | ||||||
| } |  */ | ||||||
| 
 |  | ||||||
| static inline void vma_assert_attached(struct vm_area_struct *vma) | static inline void vma_assert_attached(struct vm_area_struct *vma) | ||||||
| { | { | ||||||
| 	WARN_ON_ONCE(vma->detached); | 	WARN_ON_ONCE(!refcount_read(&vma->vm_refcnt)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline void vma_assert_detached(struct vm_area_struct *vma) | static inline void vma_assert_detached(struct vm_area_struct *vma) | ||||||
| { | { | ||||||
| 	WARN_ON_ONCE(!vma->detached); | 	WARN_ON_ONCE(refcount_read(&vma->vm_refcnt)); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline void vma_assert_write_locked(struct vm_area_struct *); | static inline void vma_assert_write_locked(struct vm_area_struct *); | ||||||
| static inline void vma_mark_attached(struct vm_area_struct *vma) | static inline void vma_mark_attached(struct vm_area_struct *vma) | ||||||
| { | { | ||||||
| 	vma->detached = false; | 	vma_assert_write_locked(vma); | ||||||
|  | 	vma_assert_detached(vma); | ||||||
|  | 	refcount_set(&vma->vm_refcnt, 1); | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline void vma_mark_detached(struct vm_area_struct *vma) | static inline void vma_mark_detached(struct vm_area_struct *vma) | ||||||
| { | { | ||||||
| 	/* When detaching vma should be write-locked */ |  | ||||||
| 	vma_assert_write_locked(vma); | 	vma_assert_write_locked(vma); | ||||||
| 	vma->detached = true; | 	vma_assert_attached(vma); | ||||||
|  | 	/* We are the only writer, so no need to use vma_refcount_put(). */ | ||||||
|  | 	if (unlikely(!refcount_dec_and_test(&vma->vm_refcnt))) { | ||||||
|  | 		/*
 | ||||||
|  | 		 * Reader must have temporarily raised vm_refcnt but it will | ||||||
|  | 		 * drop it without using the vma since vma is write-locked. | ||||||
|  | 		 */ | ||||||
|  | 	} | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| extern const struct vm_operations_struct vma_dummy_vm_ops; | extern const struct vm_operations_struct vma_dummy_vm_ops; | ||||||
|  | @ -503,9 +503,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) | ||||||
| 	vma->vm_mm = mm; | 	vma->vm_mm = mm; | ||||||
| 	vma->vm_ops = &vma_dummy_vm_ops; | 	vma->vm_ops = &vma_dummy_vm_ops; | ||||||
| 	INIT_LIST_HEAD(&vma->anon_vma_chain); | 	INIT_LIST_HEAD(&vma->anon_vma_chain); | ||||||
| 	/* vma is not locked, can't use vma_mark_detached() */ | 	vma->vm_lock_seq = UINT_MAX; | ||||||
| 	vma->detached = true; |  | ||||||
| 	vma_lock_init(vma); |  | ||||||
| } | } | ||||||
| 
 | 
 | ||||||
| static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) | static inline struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) | ||||||
|  | @ -528,10 +526,9 @@ static inline struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) | ||||||
| 		return NULL; | 		return NULL; | ||||||
| 
 | 
 | ||||||
| 	memcpy(new, orig, sizeof(*new)); | 	memcpy(new, orig, sizeof(*new)); | ||||||
| 	vma_lock_init(new); | 	refcount_set(&new->vm_refcnt, 0); | ||||||
|  | 	new->vm_lock_seq = UINT_MAX; | ||||||
| 	INIT_LIST_HEAD(&new->anon_vma_chain); | 	INIT_LIST_HEAD(&new->anon_vma_chain); | ||||||
| 	/* vma is not locked, can't use vma_mark_detached() */ |  | ||||||
| 	new->detached = true; |  | ||||||
| 
 | 
 | ||||||
| 	return new; | 	return new; | ||||||
| } | } | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue
	
	Block a user
	 Suren Baghdasaryan
						Suren Baghdasaryan