Basic operation of page tables in Linux

Translating a virtual address by walking the page tables to reach the corresponding physical memory is a very common operation, so the whole walk should take as little time as possible.

show_pte()

The show_pte() function is taken as an example to illustrate how the physical page is obtained from a virtual address addr (described here in terms of x86 5-level address translation):

/*
 * This is useful to dump out the page tables associated with
 * 'addr' in mm 'mm'.
 */
static void show_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd;

	if (mm) {
		pgd = mm->pgd;
	} else {
		pgd = get_TTB();

		if (unlikely(!pgd))
			pgd = swapper_pg_dir;
	}

	pr_alert("pgd = %p\n", pgd);
	pgd += pgd_index(addr);
	pr_alert("[%08lx] *pgd=%0*llx", addr, (u32)(sizeof(*pgd) * 2),
		 (u64)pgd_val(*pgd));

	do {
		p4d_t *p4d;
		pud_t *pud;
		pmd_t *pmd;
		pte_t *pte;

		if (pgd_none(*pgd))
			break;

		if (pgd_bad(*pgd)) {
			pr_cont("(bad)");
			break;
		}

		p4d = p4d_offset(pgd, addr);
		if (PTRS_PER_P4D != 1)
			pr_cont(", *p4d=%0*Lx", (u32)(sizeof(*p4d) * 2),
			        (u64)p4d_val(*p4d));

		if (p4d_none(*p4d))
			break;

		if (p4d_bad(*p4d)) {
			pr_cont("(bad)");
			break;
		}

		pud = pud_offset(p4d, addr);
		if (PTRS_PER_PUD != 1)
			pr_cont(", *pud=%0*llx", (u32)(sizeof(*pud) * 2),
				(u64)pud_val(*pud));

		if (pud_none(*pud))
			break;

		if (pud_bad(*pud)) {
			pr_cont("(bad)");
			break;
		}

		pmd = pmd_offset(pud, addr);
		if (PTRS_PER_PMD != 1)
			pr_cont(", *pmd=%0*llx", (u32)(sizeof(*pmd) * 2),
				(u64)pmd_val(*pmd));

		if (pmd_none(*pmd))
			break;

		if (pmd_bad(*pmd)) {
			pr_cont("(bad)");
			break;
		}

		/* We must not map this if we have highmem enabled */
		if (PageHighMem(pfn_to_page(pmd_val(*pmd) >> PAGE_SHIFT)))
			break;

		pte = pte_offset_kernel(pmd, addr);
		pr_cont(", *pte=%0*llx", (u32)(sizeof(*pte) * 2),
			(u64)pte_val(*pte));
	} while (0);

	pr_cont("\n");
}

show_pte() takes the virtual address addr, while the mm parameter indicates which process the address belongs to. The main steps are (a condensed walk combining them is sketched after this list):

  • pgd = mm->pgd: get the base address of the process's pgd table
  • pgd += pgd_index(addr): locate the pgd entry using the pgd index parsed from the virtual address
  • p4d = p4d_offset(pgd, addr): get the p4d entry from the pgd entry (which holds the base address of the p4d table) and the p4d index part of addr
  • pud = pud_offset(p4d, addr): get the pud entry from the p4d entry (the base address of the pud table) and the pud index part of addr
  • pmd = pmd_offset(pud, addr): get the pmd entry from the pud entry (the base address of the pmd table) and the pmd index part of addr
  • pte = pte_offset_kernel(pmd, addr): get the pte entry from the pmd entry (the base address of the pte table) and the pte index part of addr
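
Putting these steps together, a condensed page table walk might look like the following sketch. This is not actual kernel code; the hypothetical helper walk_to_page() simply chains the helpers described below, ignores huge pages, highmem and locking, and returns the struct page that backs the address:

/*
 * A minimal sketch (not kernel code): chain the per-level helpers
 * to find the struct page backing a virtual address. Huge pages,
 * highmem and locking are deliberately ignored.
 */
static struct page *walk_to_page(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	p4d_t *p4d;
	pud_t *pud;
	pmd_t *pmd;
	pte_t *pte;

	if (pgd_none(*pgd) || pgd_bad(*pgd))
		return NULL;
	p4d = p4d_offset(pgd, addr);
	if (p4d_none(*p4d) || p4d_bad(*p4d))
		return NULL;
	pud = pud_offset(p4d, addr);
	if (pud_none(*pud) || pud_bad(*pud))
		return NULL;
	pmd = pmd_offset(pud, addr);
	if (pmd_none(*pmd) || pmd_bad(*pmd))
		return NULL;
	pte = pte_offset_kernel(pmd, addr);
	if (!pte_present(*pte))
		return NULL;

	/* pte_pfn()/pte_page() finally yield the physical page */
	return pte_page(*pte);
}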

The kernel provides a corresponding set of helper interfaces for operating on each level of the page table.

pgd table operation

pgd_index

Extracting the pgd index from the virtual address addr is a typical shift-and-mask operation:

#define pgd_index(a)  (((a) >> PGDIR_SHIFT) & (PTRS_PER_PGD - 1))

The address addr is shifted right by PGDIR_SHIFT and masked to keep only the pgd index bits.
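
As a standalone illustration (not kernel code), with 5-level paging on x86-64 the shifts are PGDIR_SHIFT = 48, P4D_SHIFT = 39, PUD_SHIFT = 30, PMD_SHIFT = 21 and PAGE_SHIFT = 12, with 512 entries per table, so all five indexes of a virtual address can be extracted with the same shift-and-mask pattern:

#include <stdio.h>

/* Illustration only: values for x86-64 5-level paging. */
#define PAGE_SHIFT   12
#define PMD_SHIFT    21
#define PUD_SHIFT    30
#define P4D_SHIFT    39
#define PGDIR_SHIFT  48
#define PTRS         512	/* 9 index bits per level */

int main(void)
{
	unsigned long addr = 0x00007f1234567000UL;

	printf("pgd index: %lu\n", (addr >> PGDIR_SHIFT) & (PTRS - 1));
	printf("p4d index: %lu\n", (addr >> P4D_SHIFT) & (PTRS - 1));
	printf("pud index: %lu\n", (addr >> PUD_SHIFT) & (PTRS - 1));
	printf("pmd index: %lu\n", (addr >> PMD_SHIFT) & (PTRS - 1));
	printf("pte index: %lu\n", (addr >> PAGE_SHIFT) & (PTRS - 1));
	return 0;
}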

pgd_offset

pgd_offset() takes the mm and the virtual address as parameters:

#define pgd_offset(mm, address)		pgd_offset_pgd((mm)->pgd, (address))

pgd_t

pgd_t is defined for x86 as follows (/arch/x86/include/asm/pgtable_types.h):

typedef struct { pgdval_t pgd; } pgd_t;

pgd_none()

pgd_none() checks whether the pgd entry corresponding to the current virtual address is empty. With 5-level paging enabled on the 64-bit x86 architecture, the function is implemented in /arch/x86/include/asm/pgtable.h:

static inline int pgd_none(pgd_t pgd)
{
	if (!pgtable_l5_enabled())
		return 0;
	/*
	 * There is no need to do a workaround for the KNL stray
	 * A/D bit erratum here.  PGDs only point to page tables
	 * except on 32-bit non-PAE which is not supported on
	 * KNL.
	 */
	return !native_pgd_val(pgd);
}

If 5-level paging is not enabled, 0 is returned; otherwise native_pgd_val() is called to check whether the pgd entry holds a value:

static inline pgdval_t native_pgd_val(pgd_t pgd)
{
	return pgd.pgd & PGD_ALLOWED_BITS;
}

pgd_bad

pgd_bad() checks whether the pgd entry is invalid or corrupted:

static inline int pgd_bad(pgd_t pgd)
{
	unsigned long ignore_flags = _PAGE_USER;

	if (!pgtable_l5_enabled())
		return 0;

	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
		ignore_flags |= _PAGE_NX;

	return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE;
}

Validity is determined by checking the flag bits in the pgd entry. The flag bits allowed in a pgd entry are defined as follows (described in more detail later):

#define PGD_ALLOWED_BITS	(PGD_PAE_PHYS_MASK | _PAGE_PRESENT | \
				 _PAGE_PWT | _PAGE_PCD | \
				 _PAGE_SOFTW1 | _PAGE_SOFTW2 | _PAGE_SOFTW3)

pgd_page

Get the physical page referenced by the pgd entry (i.e. the page holding the p4d table):

#define pgd_page(pgd)	pfn_to_page(pgd_pfn(pgd))

pgd_pfn

Physical page frame number (pfn) referenced by the pgd entry:

static inline unsigned long pgd_pfn(pgd_t pgd)
{
	return (pgd_val(pgd) & PTE_PFN_MASK) >> PAGE_SHIFT;
}

p4d table operation

p4d_offset

Obtain the p4d entry from the pgd entry for the virtual address and the p4d index part of addr:

static inline p4d_t *p4d_offset(pgd_t *pgd, unsigned long address)
{
	if (!pgtable_l5_enabled())
		return (p4d_t *)pgd;
	return (p4d_t *)pgd_page_vaddr(*pgd) + p4d_index(address);
}

The processing is divided into two parts:

  • the pgd entry is converted into the base (virtual) address of the p4d table
  • the p4d index is extracted from the address

pgd_page_vaddr

pgd_page_vaddr() converts the content of the pgd entry, i.e. the physical address stored there, into the virtual base address of the p4d table:

static inline unsigned long pgd_page_vaddr(pgd_t pgd)
{
	return (unsigned long)__va((unsigned long)pgd_val(pgd) & PTE_PFN_MASK);
}
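
On x86-64, __va() maps the physical address stored in the entry into the kernel's direct mapping by adding PAGE_OFFSET, so conceptually the conversion is nothing more than the following sketch (illustration only; the helper name is hypothetical):

/* Sketch: __va() on x86-64 returns the direct-map virtual address
 * for a physical address, i.e. phys + PAGE_OFFSET. */
static inline void *direct_map_va(unsigned long phys)
{
	return (void *)(phys + PAGE_OFFSET);
}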

p4d_index 

Extract the p4d index from addr:

static inline unsigned long p4d_index(unsigned long address)
{
	return (address >> P4D_SHIFT) & (PTRS_PER_P4D - 1);
}

p4d_none

p4d_none() checks whether the p4d entry is empty:

static inline int p4d_none(p4d_t p4d)
{
	return (native_p4d_val(p4d) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
}

p4d_bad

p4d_bad() checks whether the p4d entry is invalid:

static inline int p4d_bad(p4d_t p4d)
{
	unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER;

	if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION))
		ignore_flags |= _PAGE_NX;

	return (p4d_flags(p4d) & ~ignore_flags) != 0;
}

p4d_present

p4d_present() is another helper that checks whether the p4d entry points to present physical memory:

static inline int p4d_present(p4d_t p4d)
{
	return p4d_flags(p4d) & _PAGE_PRESENT;
}

p4d_page_vaddr

Convert the p4d entry into the virtual base address of the next-level table:

static inline unsigned long p4d_page_vaddr(p4d_t p4d)
{
	return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
}

p4d_page

Get the physical page referenced by the p4d entry (the page holding the pud table):

#define p4d_page(p4d)	pfn_to_page(p4d_pfn(p4d))

p4d_pfn

Physical page frame number (pfn) referenced by the p4d entry:

static inline unsigned long p4d_pfn(p4d_t p4d)
{
	return (p4d_val(p4d) & p4d_pfn_mask(p4d)) >> PAGE_SHIFT;
}

pud table operation

pud_offset

The pud entry is obtained from the p4d entry (i.e. the base address of the pud table) and the pud index part of addr:

static inline pud_t *pud_offset(p4d_t *p4d, unsigned long address)
{
	return (pud_t *)p4d_page_vaddr(*p4d) + pud_index(address);
}

The steps are similar to those of p4d_offset: first convert the p4d entry into the base address of the pud table, then add the pud index to locate the pud entry.

p4d_page_vaddr

Convert the p4d entry into the base address of the pud table:

static inline unsigned long p4d_page_vaddr(p4d_t p4d)
{
	return (unsigned long)__va(p4d_val(p4d) & p4d_pfn_mask(p4d));
}

pud_index

Get the pud part from the virtual address:

static inline unsigned long pud_index(unsigned long address)
{
	return (address >> PUD_SHIFT) & (PTRS_PER_PUD - 1);
}

pud_none

pud_none() checks whether the pud entry is empty:

static inline int pud_none(pud_t pud)
{
	return (native_pud_val(pud) & ~(_PAGE_KNL_ERRATUM_MASK)) == 0;
}

pud_bad

pud_bad() checks whether the pud entry is invalid or corrupted:

static inline int pud_bad(pud_t pud)
{
	return (pud_flags(pud) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0;
}

pud_present 

pud_present() checks whether the pud entry points to a present physical page:

static inline int pud_present(pud_t pud)
{
	return pud_flags(pud) & _PAGE_PRESENT;
}

pud_pfn

Physical page frame number (pfn) referenced by the pud entry:

static inline unsigned long pud_pfn(pud_t pud)
{
	phys_addr_t pfn = pud_val(pud);
	pfn ^= protnone_mask(pfn);
	return (pfn & pud_pfn_mask(pud)) >> PAGE_SHIFT;
}

pmd table operation

pmd_offset

The pmd entry is obtained from the pud entry (i.e. the base address of the pmd table) and the pmd index part of addr:

static inline pmd_t *pmd_offset(pud_t *pud, unsigned long address)
{
	return pud_pgtable(*pud) + pmd_index(address);
}

The steps are similar to those of pud_offset: first convert the pud entry into the base address of the pmd table, then add the pmd index to locate the pmd entry.

pud_page_vaddr

Convert the pud entry into the base address of the pmd table:

static inline unsigned long pud_page_vaddr(pud_t pud)
{
	return (unsigned long)__va(pud_val(pud) & pud_pfn_mask(pud));
}

pmd_index

Get the pmd part from the virtual address:

static inline unsigned long pmd_index(unsigned long address)
{
	return (address >> PMD_SHIFT) & (PTRS_PER_PMD - 1);
}

pmd_none

pmd_none() checks whether the pmd entry is empty:

static inline int pmd_none(pmd_t pmd)
{
	/* Only check low word on 32-bit platforms, since it might be
	   out of sync with upper half. */
	unsigned long val = native_pmd_val(pmd);
	return (val & ~_PAGE_KNL_ERRATUM_MASK) == 0;
}

pmd_bad

pmd_bad() checks whether the pmd entry is invalid or corrupted:

static inline int pmd_bad(pmd_t pmd)
{
	return (pmd_flags(pmd) & ~_PAGE_USER) != _KERNPG_TABLE;
}

pmd_present 

pmd_present() checks whether the pmd entry maps a present physical page:

static inline int pmd_present(pmd_t pmd)
{
	/*
	 * Checking for _PAGE_PSE is needed too because
	 * split_huge_page will temporarily clear the present bit (but
	 * the _PAGE_PSE flag will remain set at all times while the
	 * _PAGE_PRESENT bit is clear).
	 */
	return pmd_flags(pmd) & (_PAGE_PRESENT | _PAGE_PROTNONE | _PAGE_PSE);
}
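
The _PAGE_PSE bit mentioned in the comment is also what marks a 2 MiB huge mapping at the pmd level. A walk that wants to stop early for huge pages could test for it roughly as in this sketch (illustration only, not the kernel's actual fault path; the helper name is hypothetical):

/* Sketch: if the pmd is a 2 MiB (PSE) mapping, compute the physical
 * address directly instead of descending to a pte table. */
static inline bool pmd_maps_huge(pmd_t pmd, unsigned long addr,
				 unsigned long *phys)
{
	if (!(pmd_flags(pmd) & _PAGE_PSE))
		return false;

	*phys = (pmd_pfn(pmd) << PAGE_SHIFT) | (addr & (PMD_SIZE - 1));
	return true;
}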

pmd_pfn

Physical page frame number (pfn) referenced by the pmd entry:

static inline unsigned long pmd_pfn(pmd_t pmd)
{
	phys_addr_t pfn = pmd_val(pmd);
	pfn ^= protnone_mask(pfn);
	return (pfn & pmd_pfn_mask(pmd)) >> PAGE_SHIFT;
}

pmd_page 

Physical page referenced by the pmd entry:

#define pmd_page(pmd)	pfn_to_page(pmd_pfn(pmd))

pte table operation

pte_offset_kernel

The pte entry is obtained from the pmd entry (i.e. the base address of the pte table) and the pte index part of addr:

static inline pte_t *pte_offset_kernel(pmd_t *pmd, unsigned long address)
{
	return (pte_t *)pmd_page_vaddr(*pmd) + pte_index(address);
}

The steps are similar to those of pmd_offset: first convert the pmd entry into the base address of the pte table, then add the pte index to locate the pte entry.

pmd_page_vaddr

Convert the pmd entry into the base address of the pte table:

static inline unsigned long pmd_page_vaddr(pmd_t pmd)
{
	return (unsigned long)__va(pmd_val(pmd) & pmd_pfn_mask(pmd));
}

pte_index

Extract the pte index from the virtual address:

static inline unsigned long pte_index(unsigned long address)
{
	return (address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1);
}

pte_none

pte_none() checks whether the pte entry is empty:

static inline int pte_none(pte_t pte)
{
	return !(pte.pte & ~(_PAGE_KNL_ERRATUM_MASK));
}

pte_present 

pte_present() checks whether the pte maps a present physical page:

static inline int pte_present(pte_t a)
{
	return pte_flags(a) & (_PAGE_PRESENT | _PAGE_PROTNONE);
}

pte_pfn

The page frame number (pfn) of the physical page that backs the virtual address:

static inline unsigned long pte_pfn(pte_t pte)
{
	phys_addr_t pfn = pte_val(pte);
	pfn ^= protnone_mask(pfn);
	return (pfn & PTE_PFN_MASK) >> PAGE_SHIFT;
}

pte_page

Physical page corresponding to virtual address:

#define pte_page(pte)	pfn_to_page(pte_pfn(pte))
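
Once the pte has been located, the full physical address of the original virtual address can be reconstructed from the pfn and the offset inside the page, roughly as in this sketch (hypothetical helper, assuming 4 KiB pages):

/* Sketch: page frame number << PAGE_SHIFT plus the in-page offset. */
static inline unsigned long pte_phys_addr(pte_t pte, unsigned long addr)
{
	return (pte_pfn(pte) << PAGE_SHIFT) | (addr & (PAGE_SIZE - 1));
}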

Page table allocation at all levels

Creating page tables at each level is also handled by dedicated functions. In essence, physical memory is allocated from the slab allocator or directly from the buddy allocator to hold the page table entries of each level.

pgd_alloc

pgd_alloc() allocates a pgd table. Typically, when a process is created via fork or vfork, the process's pgd table is built first. On x86 the function is implemented in /arch/x86/mm/pgtable.c; its main work is as follows:

pgd_t *pgd_alloc(struct mm_struct *mm)
{
	pgd_t *pgd;
	pmd_t *u_pmds[MAX_PREALLOCATED_USER_PMDS];
	pmd_t *pmds[MAX_PREALLOCATED_PMDS];

	pgd = _pgd_alloc();

	if (pgd == NULL)
		goto out;

	mm->pgd = pgd;

	if (preallocate_pmds(mm, pmds, PREALLOCATED_PMDS) != 0)
		goto out_free_pgd;

	if (preallocate_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS) != 0)
		goto out_free_pmds;

	if (paravirt_pgd_alloc(mm) != 0)
		goto out_free_user_pmds;

	/*
	 * Make sure that pre-populating the pmds is atomic with
	 * respect to anything walking the pgd_list, so that they
	 * never see a partially populated pgd.
	 */
	spin_lock(&pgd_lock);

	pgd_ctor(mm, pgd);
	pgd_prepopulate_pmd(mm, pgd, pmds);
	pgd_prepopulate_user_pmd(mm, pgd, u_pmds);

	spin_unlock(&pgd_lock);

	return pgd;

out_free_user_pmds:
	free_pmds(mm, u_pmds, PREALLOCATED_USER_PMDS);
out_free_pmds:
	free_pmds(mm, pmds, PREALLOCATED_PMDS);
out_free_pgd:
	_pgd_free(pgd);
out:
	return NULL;
}

The main processing steps are (a short usage sketch follows this list):

  • call _pgd_alloc() to allocate physical memory for the pgd table
  • save the allocated pgd into the process address space: mm->pgd = pgd;
  • preallocate a number of pmd tables to speed up later pmd allocations; these are mainly used for kernel page table mappings
  • preallocate a number of user pmd tables (u_pmds), mainly used for user-space page table mappings
  • populate the pgd with the preallocated pmd tables
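
For context, pgd_alloc() is typically called when a new address space is created during fork; the generic code does roughly the following (a simplified sketch in the spirit of the mm_alloc_pgd()-style helper in kernel/fork.c):

static inline int mm_alloc_pgd(struct mm_struct *mm)
{
	/* allocate the top-level table and remember it in the mm */
	mm->pgd = pgd_alloc(mm);
	if (unlikely(!mm->pgd))
		return -ENOMEM;
	return 0;
}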

_pgd_alloc

_pgd_alloc() requests physical memory for the pgd table:

static inline pgd_t *_pgd_alloc(void)
{
	/*
	 * If no SHARED_KERNEL_PMD, PAE kernel is running as a Xen domain.
	 * We allocate one page for pgd.
	 */
	if (!SHARED_KERNEL_PMD)
		return (pgd_t *)__get_free_pages(GFP_PGTABLE_USER,
						 PGD_ALLOCATION_ORDER);

	/*
	 * Now PAE kernel is not running as a Xen domain. We can allocate
	 * a 32-byte slab for pgd to save memory space.
	 */
	return kmem_cache_alloc(pgd_cache, GFP_PGTABLE_USER);
}

If SHARED_KERNEL_PMD is not set, a physical page is requested directly from the buddy allocator to use as the pgd table; otherwise the memory is requested from the slab cache.

p4d_alloc

Allocate physical memory for the p4d table if it does not yet exist:

static inline p4d_t *p4d_alloc(struct mm_struct *mm, pgd_t *pgd,
		unsigned long address)
{
	return (unlikely(pgd_none(*pgd)) && __p4d_alloc(mm, pgd, address)) ?
		NULL : p4d_offset(pgd, address);
}

If the pgd entry is empty, __p4d_alloc() is called to allocate physical memory for the p4d table.

__p4d_alloc

/*
 * Allocate p4d page table.
 * We've already handled the fast-path in-line.
 */
int __p4d_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
{
	p4d_t *new = p4d_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
	if (pgd_present(*pgd))		/* Another has populated it */
		p4d_free(mm, new);
	else
		pgd_populate(mm, pgd, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}

  • p4d_alloc_one(): allocate the p4d table
  • smp_wmb(): on SMP systems, ensure the new table is fully initialized and visible before it is linked into the pgd
  • pgd_populate(): install the newly allocated p4d table into the corresponding pgd entry

pud_alloc

Allocate a pud table if needed:

static inline pud_t *pud_alloc(struct mm_struct *mm, p4d_t *p4d,
		unsigned long address)
{
	return (unlikely(p4d_none(*p4d)) && __pud_alloc(mm, p4d, address)) ?
		NULL : pud_offset(p4d, address);
}

__pud_alloc

__pud_alloc() requests physical memory for the pud table:

/*
 * Allocate page upper directory.
 * We've already handled the fast-path in-line.
 */
int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
{
	pud_t *new = pud_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	spin_lock(&mm->page_table_lock);
	if (!p4d_present(*p4d)) {
		mm_inc_nr_puds(mm);
		p4d_populate(mm, p4d, new);
	} else	/* Another has populated it */
		pud_free(mm, new);
	spin_unlock(&mm->page_table_lock);
	return 0;
}

  • pud_alloc_one(): allocate physical memory for the pud table
  • p4d_populate(): install the newly allocated pud table into the corresponding p4d entry

pmd_alloc

Allocate a pmd table if needed:

static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
		NULL: pmd_offset(pud, address);
}

__pmd_alloc

/*
 * Allocate page middle directory.
 * We've already handled the fast-path in-line.
 */
int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
{
	spinlock_t *ptl;
	pmd_t *new = pmd_alloc_one(mm, address);
	if (!new)
		return -ENOMEM;

	smp_wmb(); /* See comment in __pte_alloc */

	ptl = pud_lock(mm, pud);
	if (!pud_present(*pud)) {
		mm_inc_nr_pmds(mm);
		pud_populate(mm, pud, new);
	} else	/* Another has populated it */
		pmd_free(mm, new);
	spin_unlock(ptl);
	return 0;
}

  • pmd_alloc_one(): allocate the pmd table
  • pud_populate(): install the newly allocated pmd table into the pud entry.

pte_alloc

Allocate a pte table if needed:

#define pte_alloc(mm, pmd) (unlikely(pmd_none(*(pmd))) && __pte_alloc(mm, pmd))

__pte_alloc

int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
{
	spinlock_t *ptl;
	pgtable_t new = pte_alloc_one(mm);
	if (!new)
		return -ENOMEM;

	/*
	 * Ensure all pte setup (eg. pte page lock and page clearing) are
	 * visible before the pte is made visible to other CPUs by being
	 * put into page tables.
	 *
	 * The other side of the story is the pointer chasing in the page
	 * table walking code (when walking the page table without locking;
	 * ie. most of the time). Fortunately, these data accesses consist
	 * of a chain of data-dependent loads, meaning most CPUs (alpha
	 * being the notable exception) will already guarantee loads are
	 * seen in-order. See the alpha page table accessors for the
	 * smp_read_barrier_depends() barriers in page table walking code.
	 */
	smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */

	ptl = pmd_lock(mm, pmd);
	if (likely(pmd_none(*pmd))) {	/* Has another populated it ? */
		mm_inc_nr_ptes(mm);
		pmd_populate(mm, pmd, new);
		new = NULL;
	}
	spin_unlock(ptl);
	if (new)
		pte_free(mm, new);
	return 0;
}

  • pte_alloc_one(): allocate the pte table
  • pmd_populate(): install the newly allocated pte table into the pmd entry (see the sketch below for how these helpers chain together).
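
Taken together, these allocation helpers are normally chained when the kernel must guarantee that a pte slot exists for a given address, for instance while handling a page fault. A simplified sketch (hypothetical helper, using only the interfaces shown above and ignoring huge pages) could look like this:

/*
 * Sketch only: make sure every intermediate table exists for addr
 * and return the address of its pte slot.
 */
static pte_t *ensure_pte(struct mm_struct *mm, unsigned long addr)
{
	pgd_t *pgd = pgd_offset(mm, addr);
	p4d_t *p4d = p4d_alloc(mm, pgd, addr);
	pud_t *pud;
	pmd_t *pmd;

	if (!p4d)
		return NULL;
	pud = pud_alloc(mm, p4d, addr);
	if (!pud)
		return NULL;
	pmd = pmd_alloc(mm, pud, addr);
	if (!pmd)
		return NULL;
	if (pte_alloc(mm, pmd))		/* nonzero means allocation failed */
		return NULL;
	return pte_offset_kernel(pmd, addr);
}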

Page table release at all levels

pgd_free

Release pgd table:

void pgd_free(struct mm_struct *mm, pgd_t *pgd)
{
	pgd_mop_up_pmds(mm, pgd);
	pgd_dtor(pgd);
	paravirt_pgd_free(mm, pgd);
	_pgd_free(pgd);
}

Finally, _pgd_free() is called to release the pgd table.

_pgd_free

static inline void _pgd_free(pgd_t *pgd)
{
	if (!SHARED_KERNEL_PMD)
		free_pages((unsigned long)pgd, PGD_ALLOCATION_ORDER);
	else
		kmem_cache_free(pgd_cache, pgd);
}

The memory is released with free_pages() or returned to the slab cache, matching how it was allocated.

p4d_free

p4d table release:

static inline void p4d_free(struct mm_struct *mm, p4d_t *p4d)
{
	if (!pgtable_l5_enabled())
		return;

	BUG_ON((unsigned long)p4d & (PAGE_SIZE-1));
	free_page((unsigned long)p4d);
}

free_page() releases the physical page occupied by the p4d table.

pud_free

Release pud table:

static inline void pud_free(struct mm_struct *mm, pud_t *pud)
{
	BUG_ON((unsigned long)pud & (PAGE_SIZE-1));
	free_page((unsigned long)pud);
}

pmd_free

Release pmd table:

static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
{
	BUG_ON((unsigned long)pmd & (PAGE_SIZE-1));
	pgtable_pmd_page_dtor(virt_to_page(pmd));
	free_page((unsigned long)pmd);
}

pte_free

Release pte table:

static inline void pte_free(struct mm_struct *mm, struct page *pte_page)
{
	pgtable_pte_page_dtor(pte_page);
	__free_page(pte_page);
}
