linux-yocto/include/linux/dax.h
Alistair Popple 38607c62b3 fs/dax: properly refcount fs dax pages
Currently fs dax pages are considered free when the refcount drops to one
and their refcounts are not increased when mapped via PTEs or decreased
when unmapped.  This requires special logic in mm paths to detect that
these pages should not be properly refcounted, and to detect when the
refcount drops to one instead of zero.

On the other hand get_user_pages(), etc.  will properly refcount fs dax
pages by taking a reference and dropping it when the page is unpinned.

Tracking this special behaviour requires extra PTE bits (eg.  pte_devmap)
and introduces rules that are potentially confusing and specific to FS DAX
pages.  To fix this, and to possibly allow removal of the special PTE bits
in future, convert the fs dax page refcounts to be zero based and instead
take a reference on the page each time it is mapped as is currently the
case for normal pages.

This may also allow a future clean-up to remove the pgmap refcounting that
is currently done in mm/gup.c.

Link: https://lkml.kernel.org/r/c7d886ad7468a20452ef6e0ddab6cfe220874e7c.1740713401.git-series.apopple@nvidia.com
Signed-off-by: Alistair Popple <apopple@nvidia.com>
Reviewed-by: Dan Williams <dan.j.williams@intel.com>
Tested-by: Alison Schofield <alison.schofield@intel.com>
Acked-by: David Hildenbrand <david@redhat.com>
Cc: Alexander Gordeev <agordeev@linux.ibm.com>
Cc: Asahi Lina <lina@asahilina.net>
Cc: Balbir Singh <balbirs@nvidia.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Christian Borntraeger <borntraeger@linux.ibm.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Chunyan Zhang <zhang.lyra@gmail.com>
Cc: "Darrick J. Wong" <djwong@kernel.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Dave Jiang <dave.jiang@intel.com>
Cc: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Huacai Chen <chenhuacai@kernel.org>
Cc: Ira Weiny <ira.weiny@intel.com>
Cc: Jan Kara <jack@suse.cz>
Cc: Jason Gunthorpe <jgg@nvidia.com>
Cc: Jason Gunthorpe <jgg@ziepe.ca>
Cc: John Hubbard <jhubbard@nvidia.com>
Cc: linmiaohe <linmiaohe@huawei.com>
Cc: Logan Gunthorpe <logang@deltatee.com>
Cc: Matthew Wilcow (Oracle) <willy@infradead.org>
Cc: Michael "Camp Drill Sergeant" Ellerman <mpe@ellerman.id.au>
Cc: Nicholas Piggin <npiggin@gmail.com>
Cc: Peter Xu <peterx@redhat.com>
Cc: Sven Schnelle <svens@linux.ibm.com>
Cc: Ted Ts'o <tytso@mit.edu>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Cc: Vishal Verma <vishal.l.verma@intel.com>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: WANG Xuerui <kernel@xen0n.name>
Cc: Will Deacon <will@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
2025-03-17 22:06:41 -07:00

312 lines
9.5 KiB
C

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_DAX_H
#define _LINUX_DAX_H
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/radix-tree.h>
typedef unsigned long dax_entry_t;
struct dax_device;
struct gendisk;
struct iomap_ops;
struct iomap_iter;
struct iomap;
enum dax_access_mode {
DAX_ACCESS,
DAX_RECOVERY_WRITE,
};
struct dax_operations {
/*
* direct_access: translate a device-relative
* logical-page-offset into an absolute physical pfn. Return the
* number of pages available for DAX at that pfn.
*/
long (*direct_access)(struct dax_device *, pgoff_t, long,
enum dax_access_mode, void **, pfn_t *);
/* zero_page_range: required operation. Zero page range */
int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
/*
* recovery_write: recover a poisoned range by DAX device driver
* capable of clearing poison.
*/
size_t (*recovery_write)(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *iter);
};
struct dax_holder_operations {
/*
* notify_failure - notify memory failure into inner holder device
* @dax_dev: the dax device which contains the holder
* @offset: offset on this dax device where memory failure occurs
* @len: length of this memory failure event
* @flags: action flags for memory failure handler
*/
int (*notify_failure)(struct dax_device *dax_dev, u64 offset,
u64 len, int mf_flags);
};
#if IS_ENABLED(CONFIG_DAX)
struct dax_device *alloc_dax(void *private, const struct dax_operations *ops);
void *dax_holder(struct dax_device *dax_dev);
void put_dax(struct dax_device *dax_dev);
void kill_dax(struct dax_device *dax_dev);
void dax_write_cache(struct dax_device *dax_dev, bool wc);
bool dax_write_cache_enabled(struct dax_device *dax_dev);
bool dax_synchronous(struct dax_device *dax_dev);
void set_dax_nocache(struct dax_device *dax_dev);
void set_dax_nomc(struct dax_device *dax_dev);
void set_dax_synchronous(struct dax_device *dax_dev);
size_t dax_recovery_write(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i);
/*
* Check if given mapping is supported by the file / underlying device.
*/
static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
struct dax_device *dax_dev)
{
if (!(vma->vm_flags & VM_SYNC))
return true;
if (!IS_DAX(file_inode(vma->vm_file)))
return false;
return dax_synchronous(dax_dev);
}
#else
static inline void *dax_holder(struct dax_device *dax_dev)
{
return NULL;
}
static inline struct dax_device *alloc_dax(void *private,
const struct dax_operations *ops)
{
return ERR_PTR(-EOPNOTSUPP);
}
static inline void put_dax(struct dax_device *dax_dev)
{
}
static inline void kill_dax(struct dax_device *dax_dev)
{
}
static inline void dax_write_cache(struct dax_device *dax_dev, bool wc)
{
}
static inline bool dax_write_cache_enabled(struct dax_device *dax_dev)
{
return false;
}
static inline bool dax_synchronous(struct dax_device *dax_dev)
{
return true;
}
static inline void set_dax_nocache(struct dax_device *dax_dev)
{
}
static inline void set_dax_nomc(struct dax_device *dax_dev)
{
}
static inline void set_dax_synchronous(struct dax_device *dax_dev)
{
}
static inline bool daxdev_mapping_supported(struct vm_area_struct *vma,
struct dax_device *dax_dev)
{
return !(vma->vm_flags & VM_SYNC);
}
static inline size_t dax_recovery_write(struct dax_device *dax_dev,
pgoff_t pgoff, void *addr, size_t bytes, struct iov_iter *i)
{
return 0;
}
#endif
struct writeback_control;
#if defined(CONFIG_BLOCK) && defined(CONFIG_FS_DAX)
int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk);
void dax_remove_host(struct gendisk *disk);
struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev, u64 *start_off,
void *holder, const struct dax_holder_operations *ops);
void fs_put_dax(struct dax_device *dax_dev, void *holder);
#else
static inline int dax_add_host(struct dax_device *dax_dev, struct gendisk *disk)
{
return 0;
}
static inline void dax_remove_host(struct gendisk *disk)
{
}
static inline struct dax_device *fs_dax_get_by_bdev(struct block_device *bdev,
u64 *start_off, void *holder,
const struct dax_holder_operations *ops)
{
return NULL;
}
static inline void fs_put_dax(struct dax_device *dax_dev, void *holder)
{
}
#endif /* CONFIG_BLOCK && CONFIG_FS_DAX */
#if IS_ENABLED(CONFIG_FS_DAX)
int dax_writeback_mapping_range(struct address_space *mapping,
struct dax_device *dax_dev, struct writeback_control *wbc);
struct page *dax_layout_busy_page(struct address_space *mapping);
struct page *dax_layout_busy_page_range(struct address_space *mapping, loff_t start, loff_t end);
dax_entry_t dax_lock_folio(struct folio *folio);
void dax_unlock_folio(struct folio *folio, dax_entry_t cookie);
dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
unsigned long index, struct page **page);
void dax_unlock_mapping_entry(struct address_space *mapping,
unsigned long index, dax_entry_t cookie);
#else
static inline struct page *dax_layout_busy_page(struct address_space *mapping)
{
return NULL;
}
static inline struct page *dax_layout_busy_page_range(struct address_space *mapping, pgoff_t start, pgoff_t nr_pages)
{
return NULL;
}
static inline int dax_writeback_mapping_range(struct address_space *mapping,
struct dax_device *dax_dev, struct writeback_control *wbc)
{
return -EOPNOTSUPP;
}
static inline dax_entry_t dax_lock_folio(struct folio *folio)
{
if (IS_DAX(folio->mapping->host))
return ~0UL;
return 0;
}
static inline void dax_unlock_folio(struct folio *folio, dax_entry_t cookie)
{
}
static inline dax_entry_t dax_lock_mapping_entry(struct address_space *mapping,
unsigned long index, struct page **page)
{
return 0;
}
static inline void dax_unlock_mapping_entry(struct address_space *mapping,
unsigned long index, dax_entry_t cookie)
{
}
#endif
int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len,
const struct iomap_ops *ops);
int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero,
const struct iomap_ops *ops);
int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero,
const struct iomap_ops *ops);
static inline bool dax_page_is_idle(struct page *page)
{
return page && page_ref_count(page) == 0;
}
#if IS_ENABLED(CONFIG_DAX)
int dax_read_lock(void);
void dax_read_unlock(int id);
#else
static inline int dax_read_lock(void)
{
return 0;
}
static inline void dax_read_unlock(int id)
{
}
#endif /* CONFIG_DAX */
#if !IS_ENABLED(CONFIG_FS_DAX)
static inline int __must_check dax_break_layout(struct inode *inode,
loff_t start, loff_t end, void (cb)(struct inode *))
{
return 0;
}
static inline void dax_break_layout_final(struct inode *inode)
{
}
#endif
bool dax_alive(struct dax_device *dax_dev);
void *dax_get_private(struct dax_device *dax_dev);
long dax_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, long nr_pages,
enum dax_access_mode mode, void **kaddr, pfn_t *pfn);
size_t dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i);
size_t dax_copy_to_iter(struct dax_device *dax_dev, pgoff_t pgoff, void *addr,
size_t bytes, struct iov_iter *i);
int dax_zero_page_range(struct dax_device *dax_dev, pgoff_t pgoff,
size_t nr_pages);
int dax_holder_notify_failure(struct dax_device *dax_dev, u64 off, u64 len,
int mf_flags);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size);
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops);
vm_fault_t dax_iomap_fault(struct vm_fault *vmf, unsigned int order,
pfn_t *pfnp, int *errp, const struct iomap_ops *ops);
vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf,
unsigned int order, pfn_t pfn);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
void dax_delete_mapping_range(struct address_space *mapping,
loff_t start, loff_t end);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index);
int __must_check dax_break_layout(struct inode *inode, loff_t start,
loff_t end, void (cb)(struct inode *));
static inline int __must_check dax_break_layout_inode(struct inode *inode,
void (cb)(struct inode *))
{
return dax_break_layout(inode, 0, LLONG_MAX, cb);
}
void dax_break_layout_final(struct inode *inode);
int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
struct inode *dest, loff_t destoff,
loff_t len, bool *is_same,
const struct iomap_ops *ops);
int dax_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct file *file_out, loff_t pos_out,
loff_t *len, unsigned int remap_flags,
const struct iomap_ops *ops);
static inline bool dax_mapping(struct address_space *mapping)
{
return mapping->host && IS_DAX(mapping->host);
}
/*
* Due to dax's memory and block duo personalities, hwpoison reporting
* takes into consideration which personality is presently visible.
* When dax acts like a block device, such as in block IO, an encounter of
* dax hwpoison is reported as -EIO.
* When dax acts like memory, such as in page fault, a detection of hwpoison
* is reported as -EHWPOISON which leads to VM_FAULT_HWPOISON.
*/
static inline int dax_mem2blk_err(int err)
{
return (err == -EHWPOISON) ? -EIO : err;
}
#ifdef CONFIG_DEV_DAX_HMEM_DEVICES
void hmem_register_resource(int target_nid, struct resource *r);
#else
static inline void hmem_register_resource(int target_nid, struct resource *r)
{
}
#endif
typedef int (*walk_hmem_fn)(struct device *dev, int target_nid,
const struct resource *res);
int walk_hmem_resources(struct device *dev, walk_hmem_fn fn);
#endif