summaryrefslogtreecommitdiffstats
path: root/mm/hugetlb.c
diff options
context:
space:
mode:
authorJohn Hubbard <jhubbard@nvidia.com>2020-04-01 21:05:29 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2020-04-02 09:35:27 -0700
commit3faa52c03f440d1b9ddef18c4f189f4790d52d7e (patch)
treebb6a2fef66352f853961d6a90c379f01b0a9ada4 /mm/hugetlb.c
parent94202f126f698691f8865906ad6a68203e5dde8c (diff)
downloadlinux-3faa52c03f440d1b9ddef18c4f189f4790d52d7e.tar.gz
linux-3faa52c03f440d1b9ddef18c4f189f4790d52d7e.tar.xz
mm/gup: track FOLL_PIN pages
Add tracking of pages that were pinned via FOLL_PIN. This tracking is implemented via overloading of page->_refcount: pins are added by adding GUP_PIN_COUNTING_BIAS (1024) to the refcount. This provides a fuzzy indication of pinning, and it can have false positives (and that's OK). Please see the pre-existing Documentation/core-api/pin_user_pages.rst for details. As mentioned in pin_user_pages.rst, callers who effectively set FOLL_PIN (typically via pin_user_pages*()) are required to ultimately free such pages via unpin_user_page(). Please also note the limitation, discussed in pin_user_pages.rst under the "TODO: for 1GB and larger huge pages" section. (That limitation will be removed in a following patch.) The effect of a FOLL_PIN flag is similar to that of FOLL_GET, and may be thought of as "FOLL_GET for DIO and/or RDMA use". Pages that have been pinned via FOLL_PIN are identifiable via a new function call: bool page_maybe_dma_pinned(struct page *page); What to do in response to encountering such a page, is left to later patchsets. There is discussion about this in [1], [2], [3], and [4]. This also changes a BUG_ON(), to a WARN_ON(), in follow_page_mask(). [1] Some slow progress on get_user_pages() (Apr 2, 2019): https://lwn.net/Articles/784574/ [2] DMA and get_user_pages() (LPC: Dec 12, 2018): https://lwn.net/Articles/774411/ [3] The trouble with get_user_pages() (Apr 30, 2018): https://lwn.net/Articles/753027/ [4] LWN kernel index: get_user_pages(): https://lwn.net/Kernel/Index/#Memory_management-get_user_pages [jhubbard@nvidia.com: add kerneldoc] Link: http://lkml.kernel.org/r/20200307021157.235726-1-jhubbard@nvidia.com [imbrenda@linux.ibm.com: if pin fails, we need to unpin, a simple put_page will not be enough] Link: http://lkml.kernel.org/r/20200306132537.783769-2-imbrenda@linux.ibm.com [akpm@linux-foundation.org: fix put_compound_head defined but not used] Suggested-by: Jan Kara <jack@suse.cz> Suggested-by: Jérôme Glisse <jglisse@redhat.com> Signed-off-by: John Hubbard <jhubbard@nvidia.com> Signed-off-by: Claudio Imbrenda <imbrenda@linux.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Reviewed-by: Jan Kara <jack@suse.cz> Acked-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Cc: Ira Weiny <ira.weiny@intel.com> Cc: "Matthew Wilcox (Oracle)" <willy@infradead.org> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Christoph Hellwig <hch@infradead.org> Cc: Dan Williams <dan.j.williams@intel.com> Cc: Dave Chinner <david@fromorbit.com> Cc: Jason Gunthorpe <jgg@ziepe.ca> Cc: Jonathan Corbet <corbet@lwn.net> Cc: Michal Hocko <mhocko@suse.com> Cc: Mike Kravetz <mike.kravetz@oracle.com> Cc: Shuah Khan <shuah@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Link: http://lkml.kernel.org/r/20200211001536.1027652-7-jhubbard@nvidia.com Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--mm/hugetlb.c54
1 files changed, 36 insertions, 18 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index dd8737a94bec..ba1de6bc1402 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -4376,19 +4376,6 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
page = pte_page(huge_ptep_get(pte));
/*
- * Instead of doing 'try_get_page()' below in the same_page
- * loop, just check the count once here.
- */
- if (unlikely(page_count(page) <= 0)) {
- if (pages) {
- spin_unlock(ptl);
- remainder = 0;
- err = -ENOMEM;
- break;
- }
- }
-
- /*
* If subpage information not requested, update counters
* and skip the same_page loop below.
*/
@@ -4405,7 +4392,22 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
same_page:
if (pages) {
pages[i] = mem_map_offset(page, pfn_offset);
- get_page(pages[i]);
+ /*
+ * try_grab_page() should always succeed here, because:
+ * a) we hold the ptl lock, and b) we've just checked
+ * that the huge page is present in the page tables. If
+ * the huge page is present, then the tail pages must
+ * also be present. The ptl prevents the head page and
+ * tail pages from being rearranged in any way. So this
+ * page must be available at this point, unless the page
+ * refcount overflowed:
+ */
+ if (WARN_ON_ONCE(!try_grab_page(pages[i], flags))) {
+ spin_unlock(ptl);
+ remainder = 0;
+ err = -ENOMEM;
+ break;
+ }
}
if (vmas)
@@ -4965,6 +4967,12 @@ follow_huge_pmd(struct mm_struct *mm, unsigned long address,
struct page *page = NULL;
spinlock_t *ptl;
pte_t pte;
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) ==
+ (FOLL_PIN | FOLL_GET)))
+ return NULL;
+
retry:
ptl = pmd_lockptr(mm, pmd);
spin_lock(ptl);
@@ -4977,8 +4985,18 @@ retry:
pte = huge_ptep_get((pte_t *)pmd);
if (pte_present(pte)) {
page = pmd_page(*pmd) + ((address & ~PMD_MASK) >> PAGE_SHIFT);
- if (flags & FOLL_GET)
- get_page(page);
+ /*
+ * try_grab_page() should always succeed here, because: a) we
+ * hold the pmd (ptl) lock, and b) we've just checked that the
+ * huge pmd (head) page is present in the page tables. The ptl
+ * prevents the head page and tail pages from being rearranged
+ * in any way. So this page must be available at this point,
+ * unless the page refcount overflowed:
+ */
+ if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+ page = NULL;
+ goto out;
+ }
} else {
if (is_hugetlb_entry_migration(pte)) {
spin_unlock(ptl);
@@ -4999,7 +5017,7 @@ struct page * __weak
follow_huge_pud(struct mm_struct *mm, unsigned long address,
pud_t *pud, int flags)
{
- if (flags & FOLL_GET)
+ if (flags & (FOLL_GET | FOLL_PIN))
return NULL;
return pte_page(*(pte_t *)pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
@@ -5008,7 +5026,7 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address,
struct page * __weak
follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags)
{
- if (flags & FOLL_GET)
+ if (flags & (FOLL_GET | FOLL_PIN))
return NULL;
return pte_page(*(pte_t *)pgd) + ((address & ~PGDIR_MASK) >> PAGE_SHIFT);