From 4b47b0fa4b15e0de916e7dd93cd787fdab208ff2 Mon Sep 17 00:00:00 2001 From: Muhammad Muzammil Date: Fri, 13 Oct 2023 10:31:18 +0500 Subject: [PATCH 01/76] powerpc/bpf: Fixed 'instead' typo in bpf_jit_build_body() Fixed 'instead' typo. Signed-off-by: Muhammad Muzammil Signed-off-by: Michael Ellerman Link: https://msgid.link/20231013053118.11221-1-m.muzzammilashraf@gmail.com --- arch/powerpc/net/bpf_jit_comp32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index 7f91ea064c087..bc7f92ec7f2d7 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -940,7 +940,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * * !fp->aux->verifier_zext. Emit NOP otherwise. * * Note that "li reg_h,0" is emitted for BPF_B/H/W case, - * if necessary. So, jump there insted of emitting an + * if necessary. So, jump there instead of emitting an * additional "li reg_h,0" instruction. */ if (size == BPF_DW && !fp->aux->verifier_zext) From cc8ee288f484a2a59c01ccd4d8a417d6ed3466e3 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:17 +0200 Subject: [PATCH 02/76] powerpc/40x: Remove stale PTE_ATOMIC_UPDATES macro 40x TLB handlers were reworked by commit 2c74e2586bb9 ("powerpc/40x: Rework 40x PTE access and TLB miss") to not require PTE_ATOMIC_UPDATES anymore. Then commit 4e1df545e2fa ("powerpc/pgtable: Drop PTE_ATOMIC_UPDATES") removed all code related to PTE_ATOMIC_UPDATES. Remove left over PTE_ATOMIC_UPDATES macro. Fixes: 2c74e2586bb9 ("powerpc/40x: Rework 40x PTE access and TLB miss") Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/f061db5857fcd748f84a6707aad01754686ce97e.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pte-40x.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h b/arch/powerpc/include/asm/nohash/32/pte-40x.h index 6fe46e7545566..0b4e5f8ce3e8a 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-40x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h @@ -69,9 +69,6 @@ #define _PTE_NONE_MASK 0 -/* Until my rework is finished, 40x still needs atomic PTE updates */ -#define PTE_ATOMIC_UPDATES 1 - #define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) #define _PAGE_BASE (_PAGE_BASE_NC) From 3b8547ec4d35778c9f4cc261d85c0cae6c1a8ecb Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:18 +0200 Subject: [PATCH 03/76] powerpc: Remove pte_ERROR() pte_ERROR() is used neither in powerpc code nor in common mm code. Remove it. 
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/bec9eb973ecc1cba091e5c9201d877a7797f3242.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/pgtable.h | 3 --- arch/powerpc/include/asm/book3s/64/pgtable.h | 2 -- arch/powerpc/include/asm/nohash/32/pgtable.h | 3 --- arch/powerpc/include/asm/nohash/64/pgtable.h | 2 -- 4 files changed, 10 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 9b13eb14e21bf..543c3691839b7 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -224,9 +224,6 @@ void unmap_kernel_page(unsigned long va); /* Bits to mask out from a PGD to get to the PUD page */ #define PGD_MASKED_BITS 0 -#define pte_ERROR(e) \ - pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \ - (unsigned long long)pte_val(e)) #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) /* diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 5c497c862d757..7c4ad1e03a49c 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1014,8 +1014,6 @@ static inline pmd_t *pud_pgtable(pud_t pud) return (pmd_t *)__va(pud_val(pud) & ~PUD_MASKED_BITS); } -#define pte_ERROR(e) \ - pr_err("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_ERROR(e) \ diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index f99c53a5f184b..868aecbec8d15 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -55,9 +55,6 @@ extern int icache_44x_need_flush; #define USER_PTRS_PER_PGD (TASK_SIZE / PGDIR_SIZE) -#define pte_ERROR(e) \ - pr_err("%s:%d: bad pte %llx.\n", __FILE__, __LINE__, \ - (unsigned long long)pte_val(e)) #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index eb6891e34cbde..8083c04a1e6da 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -269,8 +269,6 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, flush_tlb_page(vma, address); } -#define pte_ERROR(e) \ - pr_err("%s:%d: bad pte %08lx.\n", __FILE__, __LINE__, pte_val(e)) #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) #define pgd_ERROR(e) \ From 93f81f6eea10f497e892c52998a2194b4e16c91d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:19 +0200 Subject: [PATCH 04/76] powerpc: Deduplicate prototypes of ptep_set_access_flags() and phys_mem_access_prot() Prototypes of ptep_set_access_flags() and phys_mem_access_prot() are identical for book3s and nohash. Deduplicate them. 
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/b846832cce842a2852615b7356937fb9507e436d.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/pgtable.h | 9 --------- arch/powerpc/include/asm/nohash/pgtable.h | 10 ---------- arch/powerpc/include/asm/pgtable.h | 10 ++++++++++ 3 files changed, 10 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h index 3b7bd36a23210..6f4578daea6c8 100644 --- a/arch/powerpc/include/asm/book3s/pgtable.h +++ b/arch/powerpc/include/asm/book3s/pgtable.h @@ -9,15 +9,6 @@ #endif

#ifndef __ASSEMBLY__ -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, - pte_t *ptep, pte_t entry, int dirty); - -struct file; -extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, - unsigned long size, pgprot_t vma_prot); -#define __HAVE_PHYS_MEM_ACCESS_PROT - void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); /* diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index c721478c59347..5b6647fb398b3 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -207,11 +207,6 @@ static inline void __set_pte_at(struct mm_struct *mm, unsigned long addr, mb(); } - -#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS -extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, - pte_t *ptep, pte_t entry, int dirty); - /* * Macro to mark a page protection value as "uncacheable". */ @@ -240,11 +235,6 @@ extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long addre #define pgprot_writecombine pgprot_noncached_wc

-struct file; -extern pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, - unsigned long size, pgprot_t vma_prot); -#define __HAVE_PHYS_MEM_ACCESS_PROT - #ifdef CONFIG_HUGETLB_PAGE static inline int hugepd_ok(hugepd_t hpd) { diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index d0ee46de248ea..bcdbdeda65d3d 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -110,6 +110,16 @@ void mark_initmem_nx(void); static inline void mark_initmem_nx(void) { } #endif

+#define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS +int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, pte_t entry, int dirty); + +struct file; +pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, + unsigned long size, pgprot_t vma_prot); +#define __HAVE_PHYS_MEM_ACCESS_PROT + + /* * When used, PTE_FRAG_NR is defined in subarch pgtable.h * so we are sure it is included when arriving here. From da9554e0fe3c7b46912a361a803b50f2655ff30f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:20 +0200 Subject: [PATCH 05/76] powerpc: Refactor update_mmu_cache_range() On nohash, this function is a no-op except for E500 with hugepages. On book3s, this function is for hash MMUs only. Combine those tests and rename E500 update_mmu_cache_range() as __update_mmu_cache(), which gets called by update_mmu_cache_range(). 
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/b029842cb6783cbeb43d202e69a90341d65295a4.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/pgtable.h | 24 ----------------------- arch/powerpc/include/asm/nohash/pgtable.h | 15 -------------- arch/powerpc/include/asm/pgtable.h | 19 ++++++++++++++++++ arch/powerpc/mm/nohash/e500_hugetlbpage.c | 3 +-- 4 files changed, 20 insertions(+), 41 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/pgtable.h b/arch/powerpc/include/asm/book3s/pgtable.h index 6f4578daea6c8..f42d68c6b3140 100644 --- a/arch/powerpc/include/asm/book3s/pgtable.h +++ b/arch/powerpc/include/asm/book3s/pgtable.h @@ -8,28 +8,4 @@ #include #endif -#ifndef __ASSEMBLY__ -void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); - -/* - * This gets called at the end of handling a page fault, when - * the kernel has put a new PTE into the page table for the process. - * We use it to ensure coherency between the i-cache and d-cache - * for the page which has just been mapped in. - * On machines which use an MMU hash table, we use this to put a - * corresponding HPTE into the hash table ahead of time, instead of - * waiting for the inevitable extra hash-table miss exception. - */ -static inline void update_mmu_cache_range(struct vm_fault *vmf, - struct vm_area_struct *vma, unsigned long address, - pte_t *ptep, unsigned int nr) -{ - if (IS_ENABLED(CONFIG_PPC32) && !mmu_has_feature(MMU_FTR_HPTE_TABLE)) - return; - if (radix_enabled()) - return; - __update_mmu_cache(vma, address, ptep); -} - -#endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 5b6647fb398b3..a9056f4fad48d 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -259,20 +259,5 @@ static inline int pud_huge(pud_t pud) #define is_hugepd(hpd) (hugepd_ok(hpd)) #endif -/* - * This gets called at the end of handling a page fault, when - * the kernel has put a new PTE into the page table for the process. - * We use it to ensure coherency between the i-cache and d-cache - * for the page which has just been mapped in. - */ -#if defined(CONFIG_PPC_E500) && defined(CONFIG_HUGETLB_PAGE) -void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, unsigned int nr); -#else -static inline void update_mmu_cache_range(struct vm_fault *vmf, - struct vm_area_struct *vma, unsigned long address, - pte_t *ptep, unsigned int nr) {} -#endif - #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index bcdbdeda65d3d..966e7c5119f63 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -119,6 +119,25 @@ pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot); #define __HAVE_PHYS_MEM_ACCESS_PROT +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep); + +/* + * This gets called at the end of handling a page fault, when + * the kernel has put a new PTE into the page table for the process. + * We use it to ensure coherency between the i-cache and d-cache + * for the page which has just been mapped in. 
+ * On machines which use an MMU hash table, we use this to put a + * corresponding HPTE into the hash table ahead of time, instead of + * waiting for the inevitable extra hash-table miss exception. + */ +static inline void update_mmu_cache_range(struct vm_fault *vmf, + struct vm_area_struct *vma, unsigned long address, + pte_t *ptep, unsigned int nr) +{ + if ((mmu_has_feature(MMU_FTR_HPTE_TABLE) && !radix_enabled()) || + (IS_ENABLED(CONFIG_PPC_E500) && IS_ENABLED(CONFIG_HUGETLB_PAGE))) + __update_mmu_cache(vma, address, ptep); +} /* * When used, PTE_FRAG_NR is defined in subarch pgtable.h * so we are sure it is included when arriving here. diff --git a/arch/powerpc/mm/nohash/e500_hugetlbpage.c b/arch/powerpc/mm/nohash/e500_hugetlbpage.c index 6b30e40d45903..a134d28a0e4d3 100644 --- a/arch/powerpc/mm/nohash/e500_hugetlbpage.c +++ b/arch/powerpc/mm/nohash/e500_hugetlbpage.c @@ -178,8 +178,7 @@ book3e_hugetlb_preload(struct vm_area_struct *vma, unsigned long ea, pte_t pte) * * This must always be called with the pte lock held. */ -void update_mmu_cache_range(struct vm_fault *vmf, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, unsigned int nr) +void __update_mmu_cache(struct vm_area_struct *vma, unsigned long address, pte_t *ptep) { if (is_vm_hugetlb_page(vma)) book3e_hugetlb_preload(vma, address, *ptep); From d3e01796728add53ab778298573772d44d52d19c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:21 +0200 Subject: [PATCH 06/76] powerpc: Untangle fixmap.h and pgtable.h and mmu.h fixmap.h needs pgtable.h for [un]map_kernel_page(). pgtable.h needs fixmap.h for FIXADDR_TOP. Untangle the two files by moving FIXADDR_TOP into pgtable.h. Also move VIRT_IMMR_BASE to fixmap.h to avoid including fixmap.h in mmu.h. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/5eba12392a018be28ad0a02ed844767b132589e7.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/pgtable.h | 9 ++++++++- arch/powerpc/include/asm/book3s/64/pgtable.h | 1 + arch/powerpc/include/asm/fixmap.h | 16 ++++------------ arch/powerpc/include/asm/nohash/32/mmu-8xx.h | 1 - arch/powerpc/include/asm/nohash/32/pgtable.h | 9 ++++++++- arch/powerpc/include/asm/nohash/64/pgtable.h | 1 + arch/powerpc/mm/init_32.c | 1 + arch/powerpc/mm/mem.c | 1 + arch/powerpc/mm/nohash/8xx.c | 2 ++ arch/powerpc/platforms/83xx/misc.c | 2 ++ arch/powerpc/platforms/8xx/cpm1.c | 1 + 11 files changed, 29 insertions(+), 15 deletions(-)

diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 543c3691839b7..45b69ae2631e0 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -170,7 +170,14 @@ void unmap_kernel_page(unsigned long va); * value (for now) on others, from where we can start layout kernel * virtual space that goes below PKMAP and FIXMAP */ -#include + +#define FIXADDR_SIZE 0 +#ifdef CONFIG_KASAN +#include +#define FIXADDR_TOP (KASAN_SHADOW_START - PAGE_SIZE) +#else +#define FIXADDR_TOP ((unsigned long)(-PAGE_SIZE)) +#endif

/* * ioremap_bot starts at that address. 
Early ioremaps move down from there, diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 7c4ad1e03a49c..dbd545e73161a 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -316,6 +316,7 @@ extern unsigned long pci_io_base; #define IOREMAP_START (ioremap_bot) #define IOREMAP_END (KERN_IO_END - FIXADDR_SIZE) #define FIXADDR_SIZE SZ_32M +#define FIXADDR_TOP (IOREMAP_END + FIXADDR_SIZE) #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h index a832aeafe5601..f9068dd8dfce7 100644 --- a/arch/powerpc/include/asm/fixmap.h +++ b/arch/powerpc/include/asm/fixmap.h @@ -23,18 +23,6 @@ #include #endif -#ifdef CONFIG_PPC64 -#define FIXADDR_TOP (IOREMAP_END + FIXADDR_SIZE) -#else -#define FIXADDR_SIZE 0 -#ifdef CONFIG_KASAN -#include -#define FIXADDR_TOP (KASAN_SHADOW_START - PAGE_SIZE) -#else -#define FIXADDR_TOP ((unsigned long)(-PAGE_SIZE)) -#endif -#endif - /* * Here we define all the compile-time 'special' virtual * addresses. The point is to have a constant address at @@ -119,5 +107,9 @@ static inline void __set_fixmap(enum fixed_addresses idx, #define __early_set_fixmap __set_fixmap +#ifdef CONFIG_PPC_8xx +#define VIRT_IMMR_BASE (__fix_to_virt(FIX_IMMR_BASE)) +#endif + #endif /* !__ASSEMBLY__ */ #endif diff --git a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h index 0e93a4728c9e1..141d82e249a86 100644 --- a/arch/powerpc/include/asm/nohash/32/mmu-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/mmu-8xx.h @@ -188,7 +188,6 @@ typedef struct { } mm_context_t; #define PHYS_IMMR_BASE (mfspr(SPRN_IMMR) & 0xfff80000) -#define VIRT_IMMR_BASE (__fix_to_virt(FIX_IMMR_BASE)) /* Page size definitions, common between 32 and 64-bit * diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 868aecbec8d15..c8311ee088116 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -71,7 +71,14 @@ void unmap_kernel_page(unsigned long va); * value (for now) on others, from where we can start layout kernel * virtual space that goes below PKMAP and FIXMAP */ -#include + +#define FIXADDR_SIZE 0 +#ifdef CONFIG_KASAN +#include +#define FIXADDR_TOP (KASAN_SHADOW_START - PAGE_SIZE) +#else +#define FIXADDR_TOP ((unsigned long)(-PAGE_SIZE)) +#endif /* * ioremap_bot starts at that address. 
Early ioremaps move down from there, diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 8083c04a1e6da..dee3fc654d401 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -57,6 +57,7 @@ #define IOREMAP_START (ioremap_bot) #define IOREMAP_END (KERN_IO_START + KERN_IO_SIZE - FIXADDR_SIZE) #define FIXADDR_SIZE SZ_32M +#define FIXADDR_TOP (IOREMAP_END + FIXADDR_SIZE) /* * Defines the address of the vmemap area, in its own region on diff --git a/arch/powerpc/mm/init_32.c b/arch/powerpc/mm/init_32.c index d8adc452f4310..4e71dfe7d026d 100644 --- a/arch/powerpc/mm/init_32.c +++ b/arch/powerpc/mm/init_32.c @@ -39,6 +39,7 @@ #include #include #include +#include #include diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 8b121df7b08f8..08f3ec9d522b0 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -26,6 +26,7 @@ #include #include #include +#include #include diff --git a/arch/powerpc/mm/nohash/8xx.c b/arch/powerpc/mm/nohash/8xx.c index dbbfe897455dc..bb9c39b449d17 100644 --- a/arch/powerpc/mm/nohash/8xx.c +++ b/arch/powerpc/mm/nohash/8xx.c @@ -10,6 +10,8 @@ #include #include +#include + #include #define IMMR_SIZE (FIX_IMMR_SIZE << PAGE_SHIFT) diff --git a/arch/powerpc/platforms/83xx/misc.c b/arch/powerpc/platforms/83xx/misc.c index 2fb2a85d131fd..1135c1ab923cc 100644 --- a/arch/powerpc/platforms/83xx/misc.c +++ b/arch/powerpc/platforms/83xx/misc.c @@ -14,6 +14,8 @@ #include #include #include +#include + #include #include diff --git a/arch/powerpc/platforms/8xx/cpm1.c b/arch/powerpc/platforms/8xx/cpm1.c index ebb5f6a27dbf3..b24d4102fbf61 100644 --- a/arch/powerpc/platforms/8xx/cpm1.c +++ b/arch/powerpc/platforms/8xx/cpm1.c @@ -40,6 +40,7 @@ #include #include #include +#include #include From 81fbb9997057b6e6e5795a08d9a8e10e9f48236f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:22 +0200 Subject: [PATCH 07/76] powerpc/nohash: Remove {pte/pmd}_protnone() Only book3s/64 selects ARCH_SUPPORTS_NUMA_BALANCING so CONFIG_NUMA_BALANCING can't be selected on nohash targets. Remove pte_protnone() and pmd_protnone(). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/a50c1a19828a8eced82cbcc5c61754b667037d21.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/pgtable.h | 17 ----------------- 1 file changed, 17 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index a9056f4fad48d..ab26af2b421ae 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -35,23 +35,6 @@ static inline bool pte_hashpte(pte_t pte) { return false; } static inline bool pte_ci(pte_t pte) { return pte_val(pte) & _PAGE_NO_CACHE; } static inline bool pte_exec(pte_t pte) { return pte_val(pte) & _PAGE_EXEC; } -#ifdef CONFIG_NUMA_BALANCING -/* - * These work without NUMA balancing but the kernel does not care. See the - * comment in include/linux/pgtable.h . On powerpc, this will only - * work for user pages and always return true for kernel pages. 
- */ -static inline int pte_protnone(pte_t pte) -{ - return pte_present(pte) && !pte_user(pte); -} - -static inline int pmd_protnone(pmd_t pmd) -{ - return pte_protnone(pmd_pte(pmd)); -} -#endif /* CONFIG_NUMA_BALANCING */ - static inline int pte_present(pte_t pte) { return pte_val(pte) & _PAGE_PRESENT; From 7835006979e5415aa4c9bc0e3e7063b5c5943ed4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:23 +0200 Subject: [PATCH 08/76] powerpc/nohash: Refactor declaration of {map/unmap}_kernel_page() map_kernel_page() and unmap_kernel_page() have the same prototypes on nohash/32 and nohash/64, keep only one declaration. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/7fec5f3288cf0d0eac61b1b3f48c3ea54eb80cad.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pgtable.h | 8 -------- arch/powerpc/include/asm/nohash/64/pgtable.h | 2 -- arch/powerpc/include/asm/nohash/pgtable.h | 3 +++ arch/powerpc/mm/nohash/book3e_pgtable.c | 2 +- 4 files changed, 4 insertions(+), 11 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index c8311ee088116..26289e4e767c9 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -58,14 +58,6 @@ extern int icache_44x_need_flush; #define pgd_ERROR(e) \ pr_err("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, pgd_val(e)) -#ifndef __ASSEMBLY__ - -int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); -void unmap_kernel_page(unsigned long va); - -#endif /* !__ASSEMBLY__ */ - - /* * This is the bottom of the PKMAP area with HIGHMEM or an arbitrary * value (for now) on others, from where we can start layout kernel diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index dee3fc654d401..f5a8e8a9dba46 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -309,8 +309,6 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, /* We borrow MSB 56 (LSB 7) to store the exclusive marker in swap PTEs. 
*/ #define _PAGE_SWP_EXCLUSIVE 0x80 -int map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot); -void unmap_kernel_page(unsigned long va); extern int __meminit vmemmap_create_mapping(unsigned long start, unsigned long page_size, unsigned long phys); diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index ab26af2b421ae..3d684b500fe61 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -242,5 +242,8 @@ static inline int pud_huge(pud_t pud) #define is_hugepd(hpd) (hugepd_ok(hpd)) #endif +int map_kernel_page(unsigned long va, phys_addr_t pa, pgprot_t prot); +void unmap_kernel_page(unsigned long va); + #endif /* __ASSEMBLY__ */ #endif diff --git a/arch/powerpc/mm/nohash/book3e_pgtable.c b/arch/powerpc/mm/nohash/book3e_pgtable.c index b80fc4a91a534..1c5e4ecbebebf 100644 --- a/arch/powerpc/mm/nohash/book3e_pgtable.c +++ b/arch/powerpc/mm/nohash/book3e_pgtable.c @@ -71,7 +71,7 @@ static void __init *early_alloc_pgtable(unsigned long size) * map_kernel_page adds an entry to the ioremap page table * and adds an entry to the HPT, possibly bolting it */ -int __ref map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) +int __ref map_kernel_page(unsigned long ea, phys_addr_t pa, pgprot_t prot) { pgd_t *pgdp; p4d_t *p4dp; From 4c1a89d983be951a3e39d7f9c1d6987f3054e32d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:24 +0200 Subject: [PATCH 09/76] powerpc/nohash: Move 8xx version of pte_update() into pte-8xx.h No point in having 8xx special pte_update() in common header, move it into pte-8xx.h Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/17e209b1a1a43ed219e9e1f2947ec594ed4f9394.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pgtable.h | 57 +------------------- arch/powerpc/include/asm/nohash/32/pte-8xx.h | 57 ++++++++++++++++++++ 2 files changed, 58 insertions(+), 56 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 26289e4e767c9..be8bca42bdce3 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -219,63 +219,8 @@ static inline void pmd_clear(pmd_t *pmdp) * that an executable user mapping was modified, which is needed * to properly flush the virtually tagged instruction cache of * those implementations. - * - * On the 8xx, the page tables are a bit special. For 16k pages, we have - * 4 identical entries. For 512k pages, we have 128 entries as if it was - * 4k pages, but they are flagged as 512k pages for the hardware. - * For other page sizes, we have a single entry in the table. 
*/ -#ifdef CONFIG_PPC_8xx -static pmd_t *pmd_off(struct mm_struct *mm, unsigned long addr); -static int hugepd_ok(hugepd_t hpd); - -static int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t val, int huge) -{ - if (!huge) - return PAGE_SIZE / SZ_4K; - else if (hugepd_ok(*((hugepd_t *)pmd))) - return 1; - else if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !(val & _PAGE_HUGE)) - return SZ_16K / SZ_4K; - else - return SZ_512K / SZ_4K; -} - -static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p, - unsigned long clr, unsigned long set, int huge) -{ - pte_basic_t *entry = (pte_basic_t *)p; - pte_basic_t old = pte_val(*p); - pte_basic_t new = (old & ~(pte_basic_t)clr) | set; - int num, i; - pmd_t *pmd = pmd_off(mm, addr); - - num = number_of_cells_per_pte(pmd, new, huge); - - for (i = 0; i < num; i += PAGE_SIZE / SZ_4K, new += PAGE_SIZE) { - *entry++ = new; - if (IS_ENABLED(CONFIG_PPC_16K_PAGES) && num != 1) { - *entry++ = new; - *entry++ = new; - *entry++ = new; - } - } - - return old; -} - -#ifdef CONFIG_PPC_16K_PAGES -#define ptep_get ptep_get -static inline pte_t ptep_get(pte_t *ptep) -{ - pte_basic_t val = READ_ONCE(ptep->pte); - pte_t pte = {val, val, val, val}; - - return pte; -} -#endif /* CONFIG_PPC_16K_PAGES */ - -#else +#ifndef pte_update static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p, unsigned long clr, unsigned long set, int huge) { diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index e6fe1d5731f2c..52395a5ecd707 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -187,6 +187,63 @@ static inline unsigned long pte_leaf_size(pte_t pte) #define pte_leaf_size pte_leaf_size +/* + * On the 8xx, the page tables are a bit special. For 16k pages, we have + * 4 identical entries. For 512k pages, we have 128 entries as if it was + * 4k pages, but they are flagged as 512k pages for the hardware. + * For other page sizes, we have a single entry in the table. 
+ */ +static pmd_t *pmd_off(struct mm_struct *mm, unsigned long addr); +static int hugepd_ok(hugepd_t hpd); + +static inline int number_of_cells_per_pte(pmd_t *pmd, pte_basic_t val, int huge) +{ + if (!huge) + return PAGE_SIZE / SZ_4K; + else if (hugepd_ok(*((hugepd_t *)pmd))) + return 1; + else if (IS_ENABLED(CONFIG_PPC_4K_PAGES) && !(val & _PAGE_HUGE)) + return SZ_16K / SZ_4K; + else + return SZ_512K / SZ_4K; +} + +static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p, + unsigned long clr, unsigned long set, int huge) +{ + pte_basic_t *entry = (pte_basic_t *)p; + pte_basic_t old = pte_val(*p); + pte_basic_t new = (old & ~(pte_basic_t)clr) | set; + int num, i; + pmd_t *pmd = pmd_off(mm, addr); + + num = number_of_cells_per_pte(pmd, new, huge); + + for (i = 0; i < num; i += PAGE_SIZE / SZ_4K, new += PAGE_SIZE) { + *entry++ = new; + if (IS_ENABLED(CONFIG_PPC_16K_PAGES) && num != 1) { + *entry++ = new; + *entry++ = new; + *entry++ = new; + } + } + + return old; +} + +#define pte_update pte_update + +#ifdef CONFIG_PPC_16K_PAGES +#define ptep_get ptep_get +static inline pte_t ptep_get(pte_t *ptep) +{ + pte_basic_t val = READ_ONCE(ptep->pte); + pte_t pte = {val, val, val, val}; + + return pte; +} +#endif /* CONFIG_PPC_16K_PAGES */ + #endif #endif /* __KERNEL__ */ From 0f4027eab59261f2fb72586f18efb44be3594dd4 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:25 +0200 Subject: [PATCH 10/76] powerpc/nohash: Replace #ifdef CONFIG_44x by IS_ENABLED(CONFIG_44x) in pgtable.h No need of a #ifdef, use IS_ENABLED(CONFIG_44x) Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/7c7d97322a6a05a9842b1e8c4b41265916f542ca.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pgtable.h | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index be8bca42bdce3..a74476de1ef67 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -9,9 +9,7 @@ #include #include /* For sub-arch specific PPC_PIN_SIZE */ -#ifdef CONFIG_44x extern int icache_44x_need_flush; -#endif #endif /* __ASSEMBLY__ */ @@ -229,10 +227,9 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p *p = __pte(new); -#ifdef CONFIG_44x - if ((old & _PAGE_USER) && (old & _PAGE_EXEC)) + if (IS_ENABLED(CONFIG_44x) && (old & _PAGE_USER) && (old & _PAGE_EXEC)) icache_44x_need_flush = 1; -#endif + return old; } #endif From 42a2722319f0d3d5612ca8efd3ce7d7eae512291 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:26 +0200 Subject: [PATCH 11/76] powerpc/nohash: Refactor pte_update() pte_update() is similar. Take the nohash/32 version which works on nohash/64 and add the debug call to assert_pte_locked() which is only on nohash/64. 
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/e01cb630cad42f645915ce7702d23985241b71fc.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pgtable.h | 33 --------------- arch/powerpc/include/asm/nohash/64/pgtable.h | 17 -------- arch/powerpc/include/asm/nohash/pgtable.h | 42 ++++++++++++++++++++ 3 files changed, 42 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index a74476de1ef67..ae7f3c8afd4fa 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -9,8 +9,6 @@ #include #include /* For sub-arch specific PPC_PIN_SIZE */ -extern int icache_44x_need_flush; - #endif /* __ASSEMBLY__ */ #define PTE_INDEX_SIZE PTE_SHIFT @@ -203,37 +201,6 @@ static inline void pmd_clear(pmd_t *pmdp) *pmdp = __pmd(0); } -/* - * PTE updates. This function is called whenever an existing - * valid PTE is updated. This does -not- include set_pte_at() - * which nowadays only sets a new PTE. - * - * Depending on the type of MMU, we may need to use atomic updates - * and the PTE may be either 32 or 64 bit wide. In the later case, - * when using atomic updates, only the low part of the PTE is - * accessed atomically. - * - * In addition, on 44x, we also maintain a global flag indicating - * that an executable user mapping was modified, which is needed - * to properly flush the virtually tagged instruction cache of - * those implementations. - */ -#ifndef pte_update -static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p, - unsigned long clr, unsigned long set, int huge) -{ - pte_basic_t old = pte_val(*p); - pte_basic_t new = (old & ~(pte_basic_t)clr) | set; - - *p = __pte(new); - - if (IS_ENABLED(CONFIG_44x) && (old & _PAGE_USER) && (old & _PAGE_EXEC)) - icache_44x_need_flush = 1; - - return old; -} -#endif - #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int __ptep_test_and_clear_young(struct mm_struct *mm, unsigned long addr, pte_t *ptep) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index f5a8e8a9dba46..b149a39f26853 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -171,23 +171,6 @@ static inline void p4d_set(p4d_t *p4dp, unsigned long val) *p4dp = __p4d(val); } -/* Atomic PTE updates */ -static inline unsigned long pte_update(struct mm_struct *mm, - unsigned long addr, - pte_t *ptep, unsigned long clr, - unsigned long set, - int huge) -{ - unsigned long old = pte_val(*ptep); - *ptep = __pte((old & ~clr) | set); - - /* huge pages use the old page table lock */ - if (!huge) - assert_pte_locked(mm, addr); - - return old; -} - static inline int pte_young(pte_t pte) { return pte_val(pte) & _PAGE_ACCESSED; diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 3d684b500fe61..bd5c3a4baabd0 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -2,6 +2,11 @@ #ifndef _ASM_POWERPC_NOHASH_PGTABLE_H #define _ASM_POWERPC_NOHASH_PGTABLE_H +#ifndef __ASSEMBLY__ +static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p, + unsigned long clr, unsigned long set, int huge); +#endif + #if defined(CONFIG_PPC64) #include #else @@ -18,6 +23,43 @@ #ifndef __ASSEMBLY__ +extern int icache_44x_need_flush; + +/* + * PTE updates. 
This function is called whenever an existing + * valid PTE is updated. This does -not- include set_pte_at() + * which nowadays only sets a new PTE. + * + * Depending on the type of MMU, we may need to use atomic updates + * and the PTE may be either 32 or 64 bit wide. In the later case, + * when using atomic updates, only the low part of the PTE is + * accessed atomically. + * + * In addition, on 44x, we also maintain a global flag indicating + * that an executable user mapping was modified, which is needed + * to properly flush the virtually tagged instruction cache of + * those implementations. + */ +#ifndef pte_update +static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, pte_t *p, + unsigned long clr, unsigned long set, int huge) +{ + pte_basic_t old = pte_val(*p); + pte_basic_t new = (old & ~(pte_basic_t)clr) | set; + + *p = __pte(new); + + if (IS_ENABLED(CONFIG_44x) && (old & _PAGE_USER) && (old & _PAGE_EXEC)) + icache_44x_need_flush = 1; + + /* huge pages use the old page table lock */ + if (!huge) + assert_pte_locked(mm, addr); + + return old; +} +#endif + /* Generic accessors to PTE bits */ #ifndef pte_write static inline int pte_write(pte_t pte) From 7c929ad0b3167e980a3963e03403a761138a4350 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:27 +0200 Subject: [PATCH 12/76] powerpc/nohash: Refactor checking of no-change in pte_update() On nohash/64, a few callers of pte_update() check if there is really a change in order to avoid an unnecessary write. Refactor that inside pte_update(). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/076563e611c2b51036686a8d378bfd5ef1726341.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/64/pgtable.h | 9 --------- arch/powerpc/include/asm/nohash/pgtable.h | 3 +++ 2 files changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index b149a39f26853..cba08a62c52cd 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -181,8 +181,6 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, { unsigned long old; - if (!pte_young(*ptep)) - return 0; old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); return (old & _PAGE_ACCESSED) != 0; } @@ -198,10 +196,6 @@ static inline int __ptep_test_and_clear_young(struct mm_struct *mm, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - - if ((pte_val(*ptep) & _PAGE_RW) == 0) - return; - pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); } @@ -209,9 +203,6 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - if ((pte_val(*ptep) & _PAGE_RW) == 0) - return; - pte_update(mm, addr, ptep, _PAGE_RW, 0, 1); } diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index bd5c3a4baabd0..8adaacbbdd1d6 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -47,6 +47,9 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p pte_basic_t old = pte_val(*p); pte_basic_t new = (old & ~(pte_basic_t)clr) | set; + if (new == old) + return old; + *p = __pte(new); if (IS_ENABLED(CONFIG_44x) && (old & _PAGE_USER) && (old & _PAGE_EXEC)) From 
27672be7751f25566e69bc228c8b8440a0772f8b Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:28 +0200 Subject: [PATCH 13/76] powerpc/nohash: Deduplicate _PAGE_CHG_MASK _PAGE_CHG_MASK is identical between nohash/32 and nohash/64, deduplicate it. While at it, clean the #ifdef for PTE_RPN_MASK in nohash/32 as it is already CONFIG_PPC32. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/c1a1893dbba4afb825bfa78b262f0b0e0fc3b490.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pgtable.h | 8 +------- arch/powerpc/include/asm/nohash/64/pgtable.h | 6 ------ arch/powerpc/include/asm/nohash/pgtable.h | 6 ++++++ 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index ae7f3c8afd4fa..a39ecd4980841 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -143,7 +143,7 @@ * The mask covered by the RPN must be a ULL on 32-bit platforms with * 64-bit PTEs. */ -#if defined(CONFIG_PPC32) && defined(CONFIG_PTE_64BIT) +#ifdef CONFIG_PTE_64BIT #define PTE_RPN_MASK (~((1ULL << PTE_RPN_SHIFT) - 1)) #define MAX_POSSIBLE_PHYSMEM_BITS 36 #else @@ -151,12 +151,6 @@ #define MAX_POSSIBLE_PHYSMEM_BITS 32 #endif -/* - * _PAGE_CHG_MASK masks of bits that are to be preserved across - * pgprot changes. - */ -#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPECIAL) - #ifndef __ASSEMBLY__ #define pte_clear(mm, addr, ptep) \ diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index cba08a62c52cd..34a518a1c04d0 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -75,12 +75,6 @@ #define PTE_RPN_MASK (~((1UL << PTE_RPN_SHIFT) - 1)) -/* - * _PAGE_CHG_MASK masks of bits that are to be preserved across - * pgprot changes. - */ -#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPECIAL) - #define H_PAGE_4K_PFN 0 #ifndef __ASSEMBLY__ diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 8adaacbbdd1d6..c64a040f4a6a7 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -13,6 +13,12 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p #include #endif +/* + * _PAGE_CHG_MASK masks of bits that are to be preserved across + * pgprot changes. 
+ */ +#define _PAGE_CHG_MASK (PTE_RPN_MASK | _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_SPECIAL) + /* Permission masks used for kernel mappings */ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) #define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_NO_CACHE) From 3a4288164d631b88a57119777b15099eb23c6fbf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:29 +0200 Subject: [PATCH 14/76] powerpc/nohash: Deduplicate pte helpers Deduplicate following helpers that are identical on nohash/32 and nohash/64: pte_mkwrite_novma() pte_mkdirty() pte_mkyoung() pte_wrprotect() pte_mkexec() pte_young() Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/485f3cbafaa948143f92692e17ca652dfed68da2.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pgtable.h | 36 -------------------- arch/powerpc/include/asm/nohash/64/pgtable.h | 25 -------------- arch/powerpc/include/asm/nohash/pgtable.h | 36 ++++++++++++++++++++ 3 files changed, 36 insertions(+), 61 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index a39ecd4980841..de51f78449a05 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -156,37 +156,6 @@ #define pte_clear(mm, addr, ptep) \ do { pte_update(mm, addr, ptep, ~0, 0, 0); } while (0) -#ifndef pte_mkwrite_novma -static inline pte_t pte_mkwrite_novma(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_RW); -} -#endif - -static inline pte_t pte_mkdirty(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_DIRTY); -} - -static inline pte_t pte_mkyoung(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_ACCESSED); -} - -#ifndef pte_wrprotect -static inline pte_t pte_wrprotect(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_RW); -} -#endif - -#ifndef pte_mkexec -static inline pte_t pte_mkexec(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_EXEC); -} -#endif - #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (pmd_val(pmd) & _PMD_BAD) #define pmd_present(pmd) (pmd_val(pmd) & _PMD_PRESENT_MASK) @@ -238,11 +207,6 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, } #endif -static inline int pte_young(pte_t pte) -{ - return pte_val(pte) & _PAGE_ACCESSED; -} - /* * Note that on Book E processors, the pmd contains the kernel virtual * (lowmem) address of the pte page. 
The physical address is less useful diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 34a518a1c04d0..e8bbc6ec10849 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -80,26 +80,6 @@ #ifndef __ASSEMBLY__ /* pte_clear moved to later in this file */ -static inline pte_t pte_mkwrite_novma(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_RW); -} - -static inline pte_t pte_mkdirty(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_DIRTY); -} - -static inline pte_t pte_mkyoung(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_ACCESSED); -} - -static inline pte_t pte_wrprotect(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_RW); -} - #define PMD_BAD_BITS (PTE_TABLE_SIZE-1) #define PUD_BAD_BITS (PMD_TABLE_SIZE-1) @@ -165,11 +145,6 @@ static inline void p4d_set(p4d_t *p4dp, unsigned long val) *p4dp = __p4d(val); } -static inline int pte_young(pte_t pte) -{ - return pte_val(pte) & _PAGE_ACCESSED; -} - static inline int __ptep_test_and_clear_young(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index c64a040f4a6a7..21f232d2e34f1 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -70,6 +70,37 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p #endif /* Generic accessors to PTE bits */ +#ifndef pte_mkwrite_novma +static inline pte_t pte_mkwrite_novma(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_RW); +} +#endif + +static inline pte_t pte_mkdirty(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_DIRTY); +} + +static inline pte_t pte_mkyoung(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_ACCESSED); +} + +#ifndef pte_wrprotect +static inline pte_t pte_wrprotect(pte_t pte) +{ + return __pte(pte_val(pte) & ~_PAGE_RW); +} +#endif + +#ifndef pte_mkexec +static inline pte_t pte_mkexec(pte_t pte) +{ + return __pte(pte_val(pte) | _PAGE_EXEC); +} +#endif + #ifndef pte_write static inline int pte_write(pte_t pte) { @@ -96,6 +127,11 @@ static inline bool pte_hw_valid(pte_t pte) return pte_val(pte) & _PAGE_PRESENT; } +static inline int pte_young(pte_t pte) +{ + return pte_val(pte) & _PAGE_ACCESSED; +} + /* * Don't just check for any non zero bits in __PAGE_USER, since for book3e * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in From 8c3d9eb323bbf2b37cdc5c01ebf9604175b5970f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:30 +0200 Subject: [PATCH 15/76] powerpc/nohash: Refactor ptep_test_and_clear_young() Remove ptep_test_and_clear_young() macro, make __ptep_test_and_clear_young() common to nohash/32 and nohash/64 and change it to become ptep_test_and_clear_young() Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/de90679ae169104fa3c98d0a8828d7ede228fc52.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pgtable.h | 11 ----------- arch/powerpc/include/asm/nohash/64/pgtable.h | 19 +------------------ arch/powerpc/include/asm/nohash/pgtable.h | 11 +++++++++++ 3 files changed, 12 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index de51f78449a05..b7605000bd919 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -164,17 +164,6 @@ static 
inline void pmd_clear(pmd_t *pmdp) *pmdp = __pmd(0); } -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -static inline int __ptep_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - unsigned long old; - old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); - return (old & _PAGE_ACCESSED) != 0; -} -#define ptep_test_and_clear_young(__vma, __addr, __ptep) \ - __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep) - #define __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index e8bbc6ec10849..56041036fa34e 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -145,22 +145,6 @@ static inline void p4d_set(p4d_t *p4dp, unsigned long val) *p4dp = __p4d(val); } -static inline int __ptep_test_and_clear_young(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - unsigned long old; - - old = pte_update(mm, addr, ptep, _PAGE_ACCESSED, 0, 0); - return (old & _PAGE_ACCESSED) != 0; -} -#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG -#define ptep_test_and_clear_young(__vma, __addr, __ptep) \ -({ \ - int __r; \ - __r = __ptep_test_and_clear_young((__vma)->vm_mm, __addr, __ptep); \ - __r; \ -}) - #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -178,8 +162,7 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH #define ptep_clear_flush_young(__vma, __address, __ptep) \ ({ \ - int __young = __ptep_test_and_clear_young((__vma)->vm_mm, __address, \ - __ptep); \ + int __young = ptep_test_and_clear_young(__vma, __address, __ptep);\ __young; \ }) diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 21f232d2e34f1..2b043b72f642e 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -69,6 +69,17 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p } #endif +static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, + unsigned long addr, pte_t *ptep) +{ + unsigned long old; + + old = pte_update(vma->vm_mm, addr, ptep, _PAGE_ACCESSED, 0, 0); + + return (old & _PAGE_ACCESSED) != 0; +} +#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG + /* Generic accessors to PTE bits */ #ifndef pte_mkwrite_novma static inline pte_t pte_mkwrite_novma(pte_t pte) From cc68d77febe055b6499dda5fa13bda976a12a85c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:31 +0200 Subject: [PATCH 16/76] powerpc/nohash: Deduplicate ptep_set_wrprotect() and ptep_get_and_clear() ptep_set_wrprotect() and ptep_get_and_clear are identical for nohash/32 and nohash/64. Make them common. 
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/ffe46edecdabce915e2d1a4b79a3b2ab770f2248.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pgtable.h | 16 ---------------- arch/powerpc/include/asm/nohash/64/pgtable.h | 15 --------------- arch/powerpc/include/asm/nohash/pgtable.h | 16 ++++++++++++++++ 3 files changed, 16 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index b7605000bd919..0be464af4cb1e 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -164,22 +164,6 @@ static inline void pmd_clear(pmd_t *pmdp) *pmdp = __pmd(0); }

-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - return __pte(pte_update(mm, addr, ptep, ~0, 0, 0)); -} - -#define __HAVE_ARCH_PTEP_SET_WRPROTECT -#ifndef ptep_set_wrprotect -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); -} -#endif - #ifndef __ptep_set_access_flags static inline void __ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, pte_t entry, diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 56041036fa34e..dc6e35c3a53f3 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -145,13 +145,6 @@ static inline void p4d_set(p4d_t *p4dp, unsigned long val) *p4dp = __p4d(val); }

-#define __HAVE_ARCH_PTEP_SET_WRPROTECT -static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); -} - #define __HAVE_ARCH_HUGE_PTEP_SET_WRPROTECT static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -166,14 +159,6 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, __young; \ })

-#define __HAVE_ARCH_PTEP_GET_AND_CLEAR -static inline pte_t ptep_get_and_clear(struct mm_struct *mm, - unsigned long addr, pte_t *ptep) -{ - unsigned long old = pte_update(mm, addr, ptep, ~0UL, 0, 0); - return __pte(old); -} - static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t * ptep) { diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 2b043b72f642e..7e810a84ac150 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -80,6 +80,22 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, } #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG

+#ifndef ptep_set_wrprotect +static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); +} +#endif +#define __HAVE_ARCH_PTEP_SET_WRPROTECT +
+static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + return __pte(pte_update(mm, addr, ptep, ~0UL, 0, 0)); +} +#define __HAVE_ARCH_PTEP_GET_AND_CLEAR + /* Generic accessors to PTE bits */ #ifndef pte_mkwrite_novma static inline pte_t pte_mkwrite_novma(pte_t pte) From 2ef9f4bb9c47ed30ff3c7961744cae545c034154 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:32 +0200 Subject: [PATCH 17/76] powerpc/nohash: Refactor pte_clear() pte_clear() does the same on 
nohash/32 and nohash/64. Keep the static inline version from nohash/64, make it common, and remove the macro version from nohash/32. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/818f7df83d7e9e18f55e274cd3c44f2871ade4dd.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pgtable.h | 3 --- arch/powerpc/include/asm/nohash/64/pgtable.h | 7 ------- arch/powerpc/include/asm/nohash/pgtable.h | 5 +++++ 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 0be464af4cb1e..481594097f460 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -153,9 +153,6 @@

#ifndef __ASSEMBLY__

-#define pte_clear(mm, addr, ptep) \ - do { pte_update(mm, addr, ptep, ~0, 0, 0); } while (0) - #define pmd_none(pmd) (!pmd_val(pmd)) #define pmd_bad(pmd) (pmd_val(pmd) & _PMD_BAD) #define pmd_present(pmd) (pmd_val(pmd) & _PMD_PRESENT_MASK) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index dc6e35c3a53f3..b59fbf754f820 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -159,13 +159,6 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, __young; \ })

-static inline void pte_clear(struct mm_struct *mm, unsigned long addr, - pte_t * ptep) -{ - pte_update(mm, addr, ptep, ~0UL, 0, 0); -} - - /* Set the dirty and/or accessed bits atomically in a linux PTE */ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, pte_t *ptep, pte_t entry, diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 7e810a84ac150..b974d1ab8a976 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -96,6 +96,11 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, } #define __HAVE_ARCH_PTEP_GET_AND_CLEAR

+static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +{ + pte_update(mm, addr, ptep, ~0UL, 0, 0); +} + /* Generic accessors to PTE bits */ #ifndef pte_mkwrite_novma static inline pte_t pte_mkwrite_novma(pte_t pte) From 799d8836a7c4f4327833e4a5ca952a1700acdb14 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:33 +0200 Subject: [PATCH 18/76] powerpc/nohash: Refactor __ptep_set_access_flags() The nohash/32 version of __ptep_set_access_flags() does the same as the nohash/64 version; the only difference is that the nohash/32 version is more complete and uses pte_update(). Make it common and remove the nohash/64 version. 
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/e296885df46289d3e5f4cb51efeefe593f76ef24.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pgtable.h | 16 ---------------- arch/powerpc/include/asm/nohash/64/pgtable.h | 15 --------------- arch/powerpc/include/asm/nohash/pgtable.h | 17 +++++++++++++++++ 3 files changed, 17 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/include/asm/nohash/32/pgtable.h b/arch/powerpc/include/asm/nohash/32/pgtable.h index 481594097f460..9164a9e41b020 100644 --- a/arch/powerpc/include/asm/nohash/32/pgtable.h +++ b/arch/powerpc/include/asm/nohash/32/pgtable.h @@ -161,22 +161,6 @@ static inline void pmd_clear(pmd_t *pmdp) *pmdp = __pmd(0); }

-#ifndef __ptep_set_access_flags -static inline void __ptep_set_access_flags(struct vm_area_struct *vma, - pte_t *ptep, pte_t entry, - unsigned long address, - int psize) -{ - unsigned long set = pte_val(entry) & - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); - int huge = psize > mmu_virtual_psize ? 1 : 0; - - pte_update(vma->vm_mm, address, ptep, 0, set, huge); - - flush_tlb_page(vma, address); -} -#endif - /* * Note that on Book E processors, the pmd contains the kernel virtual * (lowmem) address of the pte page. The physical address is less useful diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index b59fbf754f820..36b9bad428ccc 100644 --- a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -159,21 +159,6 @@ static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, __young; \ })

-/* Set the dirty and/or accessed bits atomically in a linux PTE */ -static inline void __ptep_set_access_flags(struct vm_area_struct *vma, - pte_t *ptep, pte_t entry, - unsigned long address, - int psize) -{ - unsigned long bits = pte_val(entry) & - (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); - - unsigned long old = pte_val(*ptep); - *ptep = __pte(old | bits); - - flush_tlb_page(vma, address); -} - #define pmd_ERROR(e) \ pr_err("%s:%d: bad pmd %08lx.\n", __FILE__, __LINE__, pmd_val(e)) #define pgd_ERROR(e) \ diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index b974d1ab8a976..c5f21eda2a43b 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -101,6 +101,23 @@ static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *pt pte_update(mm, addr, ptep, ~0UL, 0, 0); }

+/* Set the dirty and/or accessed bits atomically in a linux PTE */ +#ifndef __ptep_set_access_flags +static inline void __ptep_set_access_flags(struct vm_area_struct *vma, + pte_t *ptep, pte_t entry, + unsigned long address, + int psize) +{ + unsigned long set = pte_val(entry) & + (_PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_RW | _PAGE_EXEC); + int huge = psize > mmu_virtual_psize ? 1 : 0; + + pte_update(vma->vm_mm, address, ptep, 0, set, huge); + + flush_tlb_page(vma, address); +} +#endif + /* Generic accessors to PTE bits */ #ifndef pte_mkwrite_novma static inline pte_t pte_mkwrite_novma(pte_t pte) From 4c8dd6c9872d4e89fd2b3a6fc92fd6cc9cdce347 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:34 +0200 Subject: [PATCH 19/76] powerpc/e500: Simplify pte_mkexec() Commit b6cb20fdc273 ("powerpc/book3e: Fix set_memory_x() and set_memory_nx()") implemented a more elaborate version of pte_mkexec() suitable for both kernel and user pages. 
That was needed because set_memory_x() was using pte_mkexec(). But since commit a4c182ecf335 ("powerpc/set_memory: Avoid spinlock recursion in change_page_attr()") pte_mkexec() is not used anymore by set_memory_x() so pte_mkexec() can be simplified as it is only used for user pages. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/cdc822322fe2ff4b0f5ecfde71d09d950b1c7557.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/pte-e500.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h index d8924cbd61e4a..99288e26b6b09 100644 --- a/arch/powerpc/include/asm/nohash/pte-e500.h +++ b/arch/powerpc/include/asm/nohash/pte-e500.h @@ -115,10 +115,7 @@ static inline pte_t pte_mkuser(pte_t pte) static inline pte_t pte_mkexec(pte_t pte) { - if (pte_val(pte) & _PAGE_BAP_UR) - return __pte((pte_val(pte) & ~_PAGE_BAP_SX) | _PAGE_BAP_UX); - else - return __pte((pte_val(pte) & ~_PAGE_BAP_UX) | _PAGE_BAP_SX); + return __pte((pte_val(pte) & ~_PAGE_BAP_SX) | _PAGE_BAP_UX); } #define pte_mkexec pte_mkexec From d3c0dfcfc95796701e82719722fd997ec5256013 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:35 +0200 Subject: [PATCH 20/76] powerpc: Implement and use pgprot_nx() ioremap_page_range() calls pgprot_nx(). vmap() and vmap_pfn() clear execute permission by calling pgprot_nx(). When pgprot_nx() is not defined it falls back to a nop. Implement it for powerpc then use it in early_ioremap_range(). Then the call to pte_exprotect() can be removed from ioremap_prot(). Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/5993a7a097e989af1c97fc4a6c011fefc67dbe6e.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/pgtable.h | 6 ++++++ arch/powerpc/mm/ioremap.c | 5 ++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/pgtable.h b/arch/powerpc/include/asm/pgtable.h index 966e7c5119f63..2bfb7dd3b49e6 100644 --- a/arch/powerpc/include/asm/pgtable.h +++ b/arch/powerpc/include/asm/pgtable.h @@ -71,6 +71,12 @@ static inline pgprot_t pte_pgprot(pte_t pte) return __pgprot(pte_flags); } +static inline pgprot_t pgprot_nx(pgprot_t prot) +{ + return pte_pgprot(pte_exprotect(__pte(pgprot_val(prot)))); +} +#define pgprot_nx pgprot_nx + #ifndef pmd_page_vaddr static inline const void *pmd_page_vaddr(pmd_t pmd) { diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 705e8e8ffde4d..d5159f2053800 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -50,8 +50,7 @@ void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long flags) if (pte_write(pte)) pte = pte_mkdirty(pte); - /* we don't want to let _PAGE_USER and _PAGE_EXEC leak out */ - pte = pte_exprotect(pte); + /* we don't want to let _PAGE_USER leak out */ pte = pte_mkprivileged(pte); if (iowa_is_active()) return iowa_ioremap(addr, size, pte_pgprot(pte), caller); @@ -66,7 +65,7 @@ int early_ioremap_range(unsigned long ea, phys_addr_t pa, unsigned long i; for (i = 0; i < size; i += PAGE_SIZE) { - int err = map_kernel_page(ea + i, pa + i, prot); + int err = map_kernel_page(ea + i, pa + i, pgprot_nx(prot)); if (WARN_ON_ONCE(err)) /* Should clean up */ return err; From c7263f156395d1f2a2142375a75b7b040686a07a Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:36 +0200 Subject: [PATCH 21/76] powerpc: Fail ioremap() instead of silently ignoring flags when PAGE_USER is set Calling
ioremap() with _PAGE_USER (or _PAGE_PRIVILEGED unset) is wrong. Loudly fail the call to ioremap() instead of blindly clearing the flags. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/b6dd5485ad00d2aafd2bb9b7c2c4eac3ebf2cdaf.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/ioremap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index d5159f2053800..7823c38f09dee 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -51,7 +51,8 @@ void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long flags) pte = pte_mkdirty(pte); /* we don't want to let _PAGE_USER leak out */ - pte = pte_mkprivileged(pte); + if (WARN_ON(pte_user(pte))) + return NULL; if (iowa_is_active()) return iowa_ioremap(addr, size, pte_pgprot(pte), caller); From 69339071bb27f0b1371cd23d6dada3f976261c20 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:37 +0200 Subject: [PATCH 22/76] powerpc: Remove pte_mkuser() and pte_mkprivileged() pte_mkuser() is never used. Remove it. pte_mkprivileged() is not used anymore. Remove it too. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/1a1dc18816456c637dc8a9c38d532f7598b60ac4.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/pgtable.h | 10 ---------- arch/powerpc/include/asm/book3s/64/pgtable.h | 10 ---------- arch/powerpc/include/asm/nohash/32/pte-8xx.h | 14 -------------- arch/powerpc/include/asm/nohash/pgtable.h | 14 -------------- arch/powerpc/include/asm/nohash/pte-e500.h | 15 --------------- 5 files changed, 63 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 45b69ae2631e0..80505915c77cf 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -522,16 +522,6 @@ static inline pte_t pte_mkhuge(pte_t pte) return pte; } -static inline pte_t pte_mkprivileged(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_USER); -} - -static inline pte_t pte_mkuser(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_USER); -} - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index dbd545e73161a..c3b921769ecea 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -630,16 +630,6 @@ static inline pte_t pte_mkdevmap(pte_t pte) return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_SPECIAL | _PAGE_DEVMAP)); } -static inline pte_t pte_mkprivileged(pte_t pte) -{ - return __pte_raw(pte_raw(pte) | cpu_to_be64(_PAGE_PRIVILEGED)); -} - -static inline pte_t pte_mkuser(pte_t pte) -{ - return __pte_raw(pte_raw(pte) & cpu_to_be64(~_PAGE_PRIVILEGED)); -} - /* * This is potentially called with a pmd as the argument, in which case it's not * safe to check _PAGE_DEVMAP unless we also confirm that _PAGE_PTE is set.
diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index 52395a5ecd707..843fe0138a663 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -122,20 +122,6 @@ static inline bool pte_user(pte_t pte) #define pte_user pte_user -static inline pte_t pte_mkprivileged(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_SH); -} - -#define pte_mkprivileged pte_mkprivileged - -static inline pte_t pte_mkuser(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_SH); -} - -#define pte_mkuser pte_mkuser - static inline pte_t pte_mkhuge(pte_t pte) { return __pte(pte_val(pte) | _PAGE_SPS | _PAGE_HUGE); diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index c5f21eda2a43b..18c1c9430a864 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -251,20 +251,6 @@ static inline pte_t pte_mkhuge(pte_t pte) } #endif -#ifndef pte_mkprivileged -static inline pte_t pte_mkprivileged(pte_t pte) -{ - return __pte(pte_val(pte) & ~_PAGE_USER); -} -#endif - -#ifndef pte_mkuser -static inline pte_t pte_mkuser(pte_t pte) -{ - return __pte(pte_val(pte) | _PAGE_USER); -} -#endif - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); } diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h index 99288e26b6b09..9f9e3f02d4146 100644 --- a/arch/powerpc/include/asm/nohash/pte-e500.h +++ b/arch/powerpc/include/asm/nohash/pte-e500.h @@ -54,7 +54,6 @@ #define _PAGE_KERNEL_RWX (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY | _PAGE_BAP_SX) #define _PAGE_KERNEL_ROX (_PAGE_BAP_SR | _PAGE_BAP_SX) #define _PAGE_USER (_PAGE_BAP_UR | _PAGE_BAP_SR) /* Can be read */ -#define _PAGE_PRIVILEGED (_PAGE_BAP_SR) #define _PAGE_SPECIAL _PAGE_SW0 @@ -99,20 +98,6 @@ #define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_BAP_UX) #ifndef __ASSEMBLY__ -static inline pte_t pte_mkprivileged(pte_t pte) -{ - return __pte((pte_val(pte) & ~_PAGE_USER) | _PAGE_PRIVILEGED); -} - -#define pte_mkprivileged pte_mkprivileged - -static inline pte_t pte_mkuser(pte_t pte) -{ - return __pte((pte_val(pte) & ~_PAGE_PRIVILEGED) | _PAGE_USER); -} - -#define pte_mkuser pte_mkuser - static inline pte_t pte_mkexec(pte_t pte) { return __pte((pte_val(pte) & ~_PAGE_BAP_SX) | _PAGE_BAP_UX); From a78587473642aec302697cdaceb719a7f8791369 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:38 +0200 Subject: [PATCH 23/76] powerpc: Rely on address instead of pte_user() pte_user() may return 'false' when a user page is PAGE_NONE. In that case it is still a user page and needs to be handled as such. So use is_kernel_addr() instead. And remove "user" text from ptdump as ptdump only dumps kernel tables. Note: no change done for book3s/64 which still has its 'privileged' bit.
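The replacement test is a plain address comparison rather than a PTE flag check. A hedged sketch of the idea (is_kernel_addr() is roughly an addr >= PAGE_OFFSET test on these platforms; the constant below is only an example):

    #include <stdbool.h>
    #include <stdio.h>

    #define EXAMPLE_PAGE_OFFSET 0xc0000000UL  /* typical ppc32 lowmem base */

    /* Rough model of is_kernel_addr(): kernel space starts at PAGE_OFFSET */
    static bool example_is_kernel_addr(unsigned long addr)
    {
            return addr >= EXAMPLE_PAGE_OFFSET;
    }

    int main(void)
    {
            printf("%d\n", example_is_kernel_addr(0x10000000UL)); /* 0: user */
            printf("%d\n", example_is_kernel_addr(0xc0100000UL)); /* 1: kernel */
            return 0;
    }

Unlike pte_user(), this stays true for a user address even when the PTE is PAGE_NONE, which is the point of the change.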
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/c778dad89fad07727c31717a9c62f45357c29ebc.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/pgtable.h | 2 +- arch/powerpc/mm/book3s32/mmu.c | 4 ++-- arch/powerpc/mm/nohash/e500.c | 2 +- arch/powerpc/mm/pgtable.c | 22 +++++++++++----------- arch/powerpc/mm/ptdump/8xx.c | 5 ----- arch/powerpc/mm/ptdump/shared.c | 5 ----- 6 files changed, 15 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 18c1c9430a864..942f46e86b4b7 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -58,7 +58,7 @@ static inline pte_basic_t pte_update(struct mm_struct *mm, unsigned long addr, p *p = __pte(new); - if (IS_ENABLED(CONFIG_44x) && (old & _PAGE_USER) && (old & _PAGE_EXEC)) + if (IS_ENABLED(CONFIG_44x) && !is_kernel_addr(addr) && (old & _PAGE_EXEC)) icache_44x_need_flush = 1; /* huge pages use the old page table lock */ diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index 850783cfa9c73..d1041c946ce20 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -127,7 +127,7 @@ static void setibat(int index, unsigned long virt, phys_addr_t phys, wimgxpp = (flags & _PAGE_COHERENT) | (_PAGE_EXEC ? BPP_RX : BPP_XX); bat[0].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ bat[0].batl = BAT_PHYS_ADDR(phys) | wimgxpp; - if (flags & _PAGE_USER) + if (!is_kernel_addr(virt)) bat[0].batu |= 1; /* Vp = 1 */ } @@ -280,7 +280,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX; bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; - if (flags & _PAGE_USER) + if (!is_kernel_addr(virt)) bat[1].batu |= 1; /* Vp = 1 */ if (flags & _PAGE_GUARDED) { /* G bit must be zero in IBATs */ diff --git a/arch/powerpc/mm/nohash/e500.c b/arch/powerpc/mm/nohash/e500.c index 40a4e69ae1a91..5b7d7a932bfd4 100644 --- a/arch/powerpc/mm/nohash/e500.c +++ b/arch/powerpc/mm/nohash/e500.c @@ -122,7 +122,7 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, TLBCAM[index].MAS7 = (u64)phys >> 32; /* Below is unlikely -- only for large user pages or similar */ - if (pte_user(__pte(flags))) { + if (!is_kernel_addr(virt)) { TLBCAM[index].MAS3 |= MAS3_UR; TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_UX : 0; TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_UW : 0; diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 3f86fd217690b..781a68c69c2f0 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -46,13 +46,13 @@ static inline int is_exec_fault(void) * and we avoid _PAGE_SPECIAL and cache inhibited pte. We also only do that * on userspace PTEs */ -static inline int pte_looks_normal(pte_t pte) +static inline int pte_looks_normal(pte_t pte, unsigned long addr) { if (pte_present(pte) && !pte_special(pte)) { if (pte_ci(pte)) return 0; - if (pte_user(pte)) + if (!is_kernel_addr(addr)) return 1; } return 0; @@ -79,11 +79,11 @@ static struct folio *maybe_pte_to_folio(pte_t pte) * support falls into the same category. 
*/ -static pte_t set_pte_filter_hash(pte_t pte) +static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) { pte = __pte(pte_val(pte) & ~_PAGE_HPTEFLAGS); - if (pte_looks_normal(pte) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || - cpu_has_feature(CPU_FTR_NOEXECUTE))) { + if (pte_looks_normal(pte, addr) && !(cpu_has_feature(CPU_FTR_COHERENT_ICACHE) || + cpu_has_feature(CPU_FTR_NOEXECUTE))) { struct folio *folio = maybe_pte_to_folio(pte); if (!folio) return pte; @@ -97,7 +97,7 @@ static pte_t set_pte_filter_hash(pte_t pte) #else /* CONFIG_PPC_BOOK3S */ -static pte_t set_pte_filter_hash(pte_t pte) { return pte; } +static pte_t set_pte_filter_hash(pte_t pte, unsigned long addr) { return pte; } #endif /* CONFIG_PPC_BOOK3S */ @@ -105,7 +105,7 @@ static pte_t set_pte_filter_hash(pte_t pte) { return pte; } * as we don't have two bits to spare for _PAGE_EXEC and _PAGE_HWEXEC so * instead we "filter out" the exec permission for non clean pages. */ -static inline pte_t set_pte_filter(pte_t pte) +static inline pte_t set_pte_filter(pte_t pte, unsigned long addr) { struct folio *folio; @@ -113,10 +113,10 @@ static inline pte_t set_pte_filter(pte_t pte) return pte; if (mmu_has_feature(MMU_FTR_HPTE_TABLE)) - return set_pte_filter_hash(pte); + return set_pte_filter_hash(pte, addr); /* No exec permission in the first place, move on */ - if (!pte_exec(pte) || !pte_looks_normal(pte)) + if (!pte_exec(pte) || !pte_looks_normal(pte, addr)) return pte; /* If you set _PAGE_EXEC on weird pages you're on your own */ @@ -200,7 +200,7 @@ void set_ptes(struct mm_struct *mm, unsigned long addr, pte_t *ptep, * this context might not have been activated yet when this * is called. */ - pte = set_pte_filter(pte); + pte = set_pte_filter(pte, addr); /* Perform the setting of the PTE */ arch_enter_lazy_mmu_mode(); @@ -301,7 +301,7 @@ void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_ */ VM_WARN_ON(pte_hw_valid(*ptep) && !pte_protnone(*ptep)); - pte = set_pte_filter(pte); + pte = set_pte_filter(pte, addr); val = pte_val(pte); diff --git a/arch/powerpc/mm/ptdump/8xx.c b/arch/powerpc/mm/ptdump/8xx.c index fac932eb8f9aa..b5c79b11ea3c2 100644 --- a/arch/powerpc/mm/ptdump/8xx.c +++ b/arch/powerpc/mm/ptdump/8xx.c @@ -20,11 +20,6 @@ static const struct flag_info flag_array[] = { #endif .set = "huge", .clear = " ", - }, { - .mask = _PAGE_SH, - .val = 0, - .set = "user", - .clear = " ", }, { .mask = _PAGE_RO | _PAGE_NA, .val = 0, diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c index f884760ca5cfe..5ff101654c45b 100644 --- a/arch/powerpc/mm/ptdump/shared.c +++ b/arch/powerpc/mm/ptdump/shared.c @@ -11,11 +11,6 @@ static const struct flag_info flag_array[] = { { - .mask = _PAGE_USER, - .val = _PAGE_USER, - .set = "user", - .clear = " ", - }, { .mask = _PAGE_RW, .val = 0, .set = "r ", From a5a08dc90f4513d1a78582ec24b687fad01cc843 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:39 +0200 Subject: [PATCH 24/76] powerpc: Refactor permission masks used for __P/__S table and kernel memory flags Prepare a common version of the permission masks that will be based on _PAGE_NA, _PAGE_RO, _PAGE_ROX, _PAGE_RW, _PAGE_RWX that will be defined in platform specific headers in later patches. Put them in a new header pgtable-masks.h which will be included by platforms. And prepare a common version of flags used for mapping kernel memory that will be based on _PAGE_RO, _PAGE_ROX, _PAGE_RW, _PAGE_RWX that will be defined in platform specific headers. 
Put them in pgtable-masks.h unless _PAGE_KERNEL_RO is already defined so that platform specific definitions can be dismantled one by one. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/d31b59efcdb38e675479563307af891aeadf6ec9.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/pgtable-masks.h | 30 ++++++++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 arch/powerpc/include/asm/pgtable-masks.h diff --git a/arch/powerpc/include/asm/pgtable-masks.h b/arch/powerpc/include/asm/pgtable-masks.h new file mode 100644 index 0000000000000..808a3b9e8fc05 --- /dev/null +++ b/arch/powerpc/include/asm/pgtable-masks.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_POWERPC_PGTABLE_MASKS_H +#define _ASM_POWERPC_PGTABLE_MASKS_H + +#ifndef _PAGE_NA +#define _PAGE_NA 0 +#define _PAGE_RO _PAGE_READ +#define _PAGE_ROX (_PAGE_READ | _PAGE_EXEC) +#define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) +#define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC) +#endif + +/* Permission flags for kernel mappings */ +#ifndef _PAGE_KERNEL_RO +#define _PAGE_KERNEL_RO _PAGE_RO +#define _PAGE_KERNEL_ROX _PAGE_ROX +#define _PAGE_KERNEL_RW (_PAGE_RW | _PAGE_DIRTY) +#define _PAGE_KERNEL_RWX (_PAGE_RWX | _PAGE_DIRTY) +#endif + +/* Permission masks used to generate the __P and __S table */ +#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_NA) +#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW) +#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_RWX) +#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_RO) +#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_ROX) +#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_RO) +#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_ROX) + +#endif /* _ASM_POWERPC_PGTABLE_MASKS_H */ From f9f09b93e80148fc5824afb338c318272abde529 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:40 +0200 Subject: [PATCH 25/76] powerpc/8xx: Use generic permission masks 8xx already has _PAGE_NA and _PAGE_RO. So add _PAGE_ROX, _PAGE_RW and _PAGE_RWX and remove specific permission masks.
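The effect of the generic header can be checked in isolation. A self-contained sketch of the #ifndef _PAGE_NA fallback from pgtable-masks.h, using made-up flag values for a platform that only provides READ/WRITE/EXEC:

    /* Stand-in flags for a platform without _PAGE_NA (values are made up) */
    #define _PAGE_READ  0x1
    #define _PAGE_WRITE 0x2
    #define _PAGE_EXEC  0x4

    /* The generic fallback, as in pgtable-masks.h above */
    #ifndef _PAGE_NA
    #define _PAGE_NA  0
    #define _PAGE_RO  _PAGE_READ
    #define _PAGE_ROX (_PAGE_READ | _PAGE_EXEC)
    #define _PAGE_RW  (_PAGE_READ | _PAGE_WRITE)
    #define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC)
    #endif

    _Static_assert(_PAGE_RWX == (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC),
                   "write and exec imply read in the generic masks");

    int main(void) { return 0; }

8xx takes the other path: it keeps its own _PAGE_NA and _PAGE_RO (and defines _PAGE_RW as 0, since write permission there is the absence of _PAGE_RO), so only the PAGE_* table at the bottom of the header is used.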
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/5d0b5ce43485f697313eee4326ddff97157fb219.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pte-8xx.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index 843fe0138a663..62c965a4511ab 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -48,6 +48,10 @@ #define _PAGE_HUGE 0x0800 /* Copied to L1 PS bit 29 */ +#define _PAGE_ROX (_PAGE_RO | _PAGE_EXEC) +#define _PAGE_RW 0 +#define _PAGE_RWX _PAGE_EXEC + /* cache related flags non existing on 8xx */ #define _PAGE_COHERENT 0 #define _PAGE_WRITETHRU 0 @@ -77,14 +81,7 @@ #define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_PSIZE) #define _PAGE_BASE (_PAGE_BASE_NC) -/* Permission masks used to generate the __P and __S table */ -#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_NA) -#define PAGE_SHARED __pgprot(_PAGE_BASE) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_RO) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_RO | _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_RO) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_RO | _PAGE_EXEC) +#include <asm/pgtable-masks.h> #ifndef __ASSEMBLY__ static inline pte_t pte_wrprotect(pte_t pte) From 58f534623c4d8800c2e5d63da9783530848e570c Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:41 +0200 Subject: [PATCH 26/76] powerpc/64s: Use generic permission masks book3s64 needs specific masks because it needs _PAGE_PRIVILEGED for PAGE_NONE. book3s64 already has _PAGE_RW and _PAGE_RWX. So add _PAGE_NA, _PAGE_RO and _PAGE_ROX and remove specific permission masks. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/d37a418de52e2c950cad6797e81663b56991366f.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/64/pgtable.h | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index c3b921769ecea..0fd12bdc7b5eb 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -17,6 +17,9 @@ #define _PAGE_EXEC 0x00001 /* execute permission */ #define _PAGE_WRITE 0x00002 /* write access allowed */ #define _PAGE_READ 0x00004 /* read access allowed */ +#define _PAGE_NA _PAGE_PRIVILEGED +#define _PAGE_RO _PAGE_READ +#define _PAGE_ROX (_PAGE_READ | _PAGE_EXEC) #define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) #define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_EXEC) #define _PAGE_PRIVILEGED 0x00008 /* kernel access only */ @@ -136,21 +139,8 @@ #define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) #define _PAGE_BASE (_PAGE_BASE_NC) -/* Permission masks used to generate the __P and __S table, - * - * Note:__pgprot is defined in arch/powerpc/include/asm/page.h - * - * Write permissions imply read permissions for now (we could make write-only - * pages on BookE but we don't bother for now).
Execute permission control is - possible on platforms that define _PAGE_EXEC */ -#define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_PRIVILEGED) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_RW | _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_READ) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_READ) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_READ | _PAGE_EXEC) +#include <asm/pgtable-masks.h> + /* Radix only, Hash uses PAGE_READONLY_X + execute-only pkey instead */ #define PAGE_EXECONLY __pgprot(_PAGE_BASE | _PAGE_EXEC) From d20506d4728c3b7408e84d9aececbcb78c3061ee Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:42 +0200 Subject: [PATCH 27/76] powerpc/nohash: Add _PAGE_WRITE to supplement _PAGE_RW In several places, _PAGE_RW maps to write permission and does not always imply read. To make this clearer, do as book3s/64 in commit c7d54842deb1 ("powerpc/mm: Use _PAGE_READ to indicate Read access") and use _PAGE_WRITE when more relevant. For the time being _PAGE_WRITE is equivalent to _PAGE_RW but that will change when _PAGE_READ gets added in following patches. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/1f79b88db54d030ada776dc9845e0e88345bfc28.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pte-40x.h | 2 ++ arch/powerpc/include/asm/nohash/32/pte-44x.h | 2 ++ arch/powerpc/include/asm/nohash/32/pte-85xx.h | 2 ++ arch/powerpc/include/asm/nohash/64/pgtable.h | 2 +- arch/powerpc/include/asm/nohash/pgtable.h | 9 ++++++--- arch/powerpc/include/asm/nohash/pte-e500.h | 2 ++ arch/powerpc/kernel/head_40x.S | 12 ++++++------ arch/powerpc/kernel/head_44x.S | 4 ++-- arch/powerpc/kernel/head_85xx.S | 2 +- arch/powerpc/mm/nohash/e500.c | 4 ++-- 10 files changed, 26 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h b/arch/powerpc/include/asm/nohash/32/pte-40x.h index 0b4e5f8ce3e8a..e28ef0f5781e7 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-40x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h @@ -49,6 +49,8 @@ #define _PAGE_EXEC 0x200 /* hardware: EX permission */ #define _PAGE_ACCESSED 0x400 /* software: R: page referenced */ +#define _PAGE_WRITE _PAGE_RW + /* No page size encoding in the linux PTE */ #define _PAGE_PSIZE 0 diff --git a/arch/powerpc/include/asm/nohash/32/pte-44x.h b/arch/powerpc/include/asm/nohash/32/pte-44x.h index b7ed13cee1378..fc0c075006ead 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-44x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h @@ -75,6 +75,8 @@ #define _PAGE_NO_CACHE 0x00000400 /* H: I bit */ #define _PAGE_WRITETHRU 0x00000800 /* H: W bit */ +#define _PAGE_WRITE _PAGE_RW + /* No page size encoding in the linux PTE */ #define _PAGE_PSIZE 0 diff --git a/arch/powerpc/include/asm/nohash/32/pte-85xx.h b/arch/powerpc/include/asm/nohash/32/pte-85xx.h index 16451df5ddb05..462acf69e3021 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-85xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-85xx.h @@ -31,6 +31,8 @@ #define _PAGE_WRITETHRU 0x00400 /* H: W bit */ #define _PAGE_SPECIAL 0x00800 /* S: Special page */ +#define _PAGE_WRITE _PAGE_RW + #define _PAGE_KERNEL_RO 0 #define _PAGE_KERNEL_ROX _PAGE_EXEC #define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW) diff --git a/arch/powerpc/include/asm/nohash/64/pgtable.h b/arch/powerpc/include/asm/nohash/64/pgtable.h index 36b9bad428ccc..2202c78730e8e ---
a/arch/powerpc/include/asm/nohash/64/pgtable.h +++ b/arch/powerpc/include/asm/nohash/64/pgtable.h @@ -149,7 +149,7 @@ static inline void p4d_set(p4d_t *p4dp, unsigned long val) static inline void huge_ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_update(mm, addr, ptep, _PAGE_RW, 0, 1); + pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 1); } #define __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 942f46e86b4b7..9a91efa6455b8 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -84,7 +84,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); + pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0); } #endif #define __HAVE_ARCH_PTEP_SET_WRPROTECT @@ -122,6 +122,9 @@ static inline void __ptep_set_access_flags(struct vm_area_struct *vma, #ifndef pte_mkwrite_novma static inline pte_t pte_mkwrite_novma(pte_t pte) { + /* + * write implies read, hence set both + */ return __pte(pte_val(pte) | _PAGE_RW); } #endif @@ -139,7 +142,7 @@ static inline pte_t pte_mkyoung(pte_t pte) #ifndef pte_wrprotect static inline pte_t pte_wrprotect(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_RW); + return __pte(pte_val(pte) & ~_PAGE_WRITE); } #endif @@ -153,7 +156,7 @@ static inline pte_t pte_mkexec(pte_t pte) #ifndef pte_write static inline int pte_write(pte_t pte) { - return pte_val(pte) & _PAGE_RW; + return pte_val(pte) & _PAGE_WRITE; } #endif #ifndef pte_read diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h index 9f9e3f02d4146..b775c7d465a41 100644 --- a/arch/powerpc/include/asm/nohash/pte-e500.h +++ b/arch/powerpc/include/asm/nohash/pte-e500.h @@ -55,6 +55,8 @@ #define _PAGE_KERNEL_ROX (_PAGE_BAP_SR | _PAGE_BAP_SX) #define _PAGE_USER (_PAGE_BAP_UR | _PAGE_BAP_SR) /* Can be read */ +#define _PAGE_WRITE _PAGE_RW + #define _PAGE_SPECIAL _PAGE_SW0 /* Base page size */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index b32e7b2ebdcfd..9f92f5c5e6aa7 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -316,9 +316,9 @@ _ASM_NOKPROBE_SYMBOL(\name\()_virt) andc. r9, r9, r11 /* Check permission */ bne 5f - rlwinm r9, r11, 1, _PAGE_RW /* dirty => rw */ - and r9, r9, r11 /* hwwrite = dirty & rw */ - rlwimi r11, r9, 0, _PAGE_RW /* replace rw by hwwrite */ + rlwinm r9, r11, 1, _PAGE_WRITE /* dirty => w */ + and r9, r9, r11 /* hwwrite = dirty & w */ + rlwimi r11, r9, 0, _PAGE_WRITE /* replace w by hwwrite */ /* Create TLB tag. This is the faulting address plus a static * set of bits. These are size, valid, E, U0. @@ -400,9 +400,9 @@ _ASM_NOKPROBE_SYMBOL(\name\()_virt) andc. r9, r9, r11 /* Check permission */ bne 5f - rlwinm r9, r11, 1, _PAGE_RW /* dirty => rw */ - and r9, r9, r11 /* hwwrite = dirty & rw */ - rlwimi r11, r9, 0, _PAGE_RW /* replace rw by hwwrite */ + rlwinm r9, r11, 1, _PAGE_WRITE /* dirty => w */ + and r9, r9, r11 /* hwwrite = dirty & w */ + rlwimi r11, r9, 0, _PAGE_WRITE /* replace w by hwwrite */ /* Create TLB tag. This is the faulting address plus a static * set of bits. These are size, valid, E, U0. 
diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index a3197c9f721cd..858dabf844329 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -342,7 +342,7 @@ interrupt_base: mtspr SPRN_MMUCR,r12 /* Mask of required permission bits. Note that while we - * do copy ESR:ST to _PAGE_RW position as trying to write + * do copy ESR:ST to _PAGE_WRITE position as trying to write * to an RO page is pretty common, we don't do it with * _PAGE_DIRTY. We could do it, but it's a fairly rare * event so I'd rather take the overhead when it happens @@ -586,7 +586,7 @@ finish_tlb_load_44x: 4: mtspr SPRN_MMUCR,r12 /* Set MMUCR */ /* Mask of required permission bits. Note that while we - * do copy ESR:ST to _PAGE_RW position as trying to write + * do copy ESR:ST to _PAGE_WRITE position as trying to write * to an RO page is pretty common, we don't do it with * _PAGE_DIRTY. We could do it, but it's a fairly rare * event so I'd rather take the overhead when it happens diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S index 0f1641a31250d..1b122cba557b7 100644 --- a/arch/powerpc/kernel/head_85xx.S +++ b/arch/powerpc/kernel/head_85xx.S @@ -471,7 +471,7 @@ END_BTB_FLUSH_SECTION 4: /* Mask of required permission bits. Note that while we - * do copy ESR:ST to _PAGE_RW position as trying to write + * do copy ESR:ST to _PAGE_WRITE position as trying to write * to an RO page is pretty common, we don't do it with * _PAGE_DIRTY. We could do it, but it's a fairly rare * event so I'd rather take the overhead when it happens diff --git a/arch/powerpc/mm/nohash/e500.c b/arch/powerpc/mm/nohash/e500.c index 5b7d7a932bfd4..921c3521ec113 100644 --- a/arch/powerpc/mm/nohash/e500.c +++ b/arch/powerpc/mm/nohash/e500.c @@ -117,7 +117,7 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, TLBCAM[index].MAS2 |= (flags & _PAGE_ENDIAN) ? MAS2_E : 0; TLBCAM[index].MAS3 = (phys & MAS3_RPN) | MAS3_SR; - TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_SW : 0; + TLBCAM[index].MAS3 |= (flags & _PAGE_WRITE) ? MAS3_SW : 0; if (mmu_has_feature(MMU_FTR_BIG_PHYS)) TLBCAM[index].MAS7 = (u64)phys >> 32; @@ -125,7 +125,7 @@ static void settlbcam(int index, unsigned long virt, phys_addr_t phys, if (!is_kernel_addr(virt)) { TLBCAM[index].MAS3 |= MAS3_UR; TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_UX : 0; - TLBCAM[index].MAS3 |= (flags & _PAGE_RW) ? MAS3_UW : 0; + TLBCAM[index].MAS3 |= (flags & _PAGE_WRITE) ? MAS3_UW : 0; } else { TLBCAM[index].MAS3 |= (flags & _PAGE_EXEC) ? MAS3_SX : 0; } From 8e9bd41e4ce1001f5b89e4c9a69f870f39d56c12 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:43 +0200 Subject: [PATCH 28/76] powerpc/nohash: Replace pte_user() by pte_read() pte_user() is now only used in pte_access_permitted() to check access on vmas. User flag is cleared to make a page unreadable. So rename it pte_read() and remove pte_user() which isn't used anymore. For the time being it checks _PAGE_USER but in the near future all platforms will be converted to _PAGE_READ so let's support both for now.
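The resulting access check can be restated in a stand-alone form (a toy struct instead of real PTEs, purely for illustration):

    #include <stdbool.h>

    struct sw_pte { bool present, read, write; };

    /* Mirrors the nohash pte_access_permitted() logic after this change */
    static bool access_permitted(struct sw_pte pte, bool write)
    {
            if (!pte.present || !pte.read)  /* pte_read() replaces pte_user() */
                    return false;
            if (write && !pte.write)
                    return false;
            return true;
    }

    int main(void)
    {
            struct sw_pte none = { .present = true };  /* PAGE_NONE-like */
            return access_permitted(none, false);      /* 0: no access */
    }

A PAGE_NONE mapping keeps _PAGE_READ clear, so it is rejected here without any separate user-bit test.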
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/72cbb5be595e9ef884140def73815ed0b0b37010.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pte-8xx.h | 7 ------- arch/powerpc/include/asm/nohash/pgtable.h | 13 +++++++------ arch/powerpc/mm/ioremap.c | 4 ---- 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index 62c965a4511ab..1ee38befd29a2 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -112,13 +112,6 @@ static inline pte_t pte_mkwrite_novma(pte_t pte) #define pte_mkwrite_novma pte_mkwrite_novma -static inline bool pte_user(pte_t pte) -{ - return !(pte_val(pte) & _PAGE_SH); -} - -#define pte_user pte_user - static inline pte_t pte_mkhuge(pte_t pte) { return __pte(pte_val(pte) | _PAGE_SPS | _PAGE_HUGE); diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 9a91efa6455b8..84b6a160d2fc3 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -159,9 +159,6 @@ static inline int pte_write(pte_t pte) return pte_val(pte) & _PAGE_WRITE; } #endif -#ifndef pte_read -static inline int pte_read(pte_t pte) { return 1; } -#endif static inline int pte_dirty(pte_t pte) { return pte_val(pte) & _PAGE_DIRTY; } static inline int pte_special(pte_t pte) { return pte_val(pte) & _PAGE_SPECIAL; } static inline int pte_none(pte_t pte) { return (pte_val(pte) & ~_PTE_NONE_MASK) == 0; } @@ -189,10 +186,14 @@ static inline int pte_young(pte_t pte) * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. */ -#ifndef pte_user -static inline bool pte_user(pte_t pte) +#ifndef pte_read +static inline bool pte_read(pte_t pte) { +#ifdef _PAGE_READ + return (pte_val(pte) & _PAGE_READ) == _PAGE_READ; +#else return (pte_val(pte) & _PAGE_USER) == _PAGE_USER; +#endif } #endif @@ -207,7 +208,7 @@ static inline bool pte_access_permitted(pte_t pte, bool write) * A read-only access is controlled by _PAGE_USER bit. * We have _PAGE_READ set for WRITE and EXECUTE */ - if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte)) + if (!pte_present(pte) || !pte_read(pte)) return false; if (write && !pte_write(pte)) diff --git a/arch/powerpc/mm/ioremap.c b/arch/powerpc/mm/ioremap.c index 7823c38f09dee..7b0afcabd89f0 100644 --- a/arch/powerpc/mm/ioremap.c +++ b/arch/powerpc/mm/ioremap.c @@ -50,10 +50,6 @@ void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long flags) if (pte_write(pte)) pte = pte_mkdirty(pte); - /* we don't want to let _PAGE_USER leak out */ - if (WARN_ON(pte_user(pte))) - return NULL; - if (iowa_is_active()) return iowa_ioremap(addr, size, pte_pgprot(pte), caller); return __ioremap_caller(addr, size, pte_pgprot(pte), caller); From 48cf93bb168d506a8278a6fb25c2f88c1c93ce6e Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:44 +0200 Subject: [PATCH 29/76] powerpc/e500: Introduce _PAGE_READ and remove _PAGE_USER e500 MMU has 6 page protection bits: - R, W, X for supervisor - R, W, X for user It means that it can support X without R. To do that, _PAGE_READ flag is needed. With 32 bits PTE there is no bit available for it in PTE. On the other hand the only real use of _PAGE_USER is to implement PAGE_NONE by clearing _PAGE_USER. 
As _PAGE_NONE can also be implemented by clearing _PAGE_READ, remove _PAGE_USER and add _PAGE_READ. Move _PAGE_PRESENT into bit 30 so that _PAGE_READ can match SR bit. With 64 bits PTE _PAGE_USER is already the combination of SR and UR so all we need to do is to rename it _PAGE_READ. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/0849ab6bf7ae2af23f94b0457fa40d0ea3983fe4.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pte-85xx.h | 22 ++++--------------- arch/powerpc/include/asm/nohash/pte-e500.h | 20 ++++++++--------- arch/powerpc/kernel/head_85xx.S | 10 ++++----- 3 files changed, 18 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pte-85xx.h b/arch/powerpc/include/asm/nohash/32/pte-85xx.h index 462acf69e3021..653a342d3b253 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-85xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-85xx.h @@ -17,9 +17,9 @@ */ /* Definitions for FSL Book-E Cores */ -#define _PAGE_PRESENT 0x00001 /* S: PTE contains a translation */ -#define _PAGE_USER 0x00002 /* S: User page (maps to UR) */ -#define _PAGE_RW 0x00004 /* S: Write permission (SW) */ +#define _PAGE_READ 0x00001 /* H: Read permission (SR) */ +#define _PAGE_PRESENT 0x00002 /* S: PTE contains a translation */ +#define _PAGE_WRITE 0x00004 /* S: Write permission (SW) */ #define _PAGE_DIRTY 0x00008 /* S: Page dirty */ #define _PAGE_EXEC 0x00010 /* H: SX permission */ #define _PAGE_ACCESSED 0x00020 /* S: Page referenced */ @@ -31,13 +31,6 @@ #define _PAGE_WRITETHRU 0x00400 /* H: W bit */ #define _PAGE_SPECIAL 0x00800 /* S: Special page */ -#define _PAGE_WRITE _PAGE_RW - -#define _PAGE_KERNEL_RO 0 -#define _PAGE_KERNEL_ROX _PAGE_EXEC -#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW) -#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) - /* No page size encoding in the linux PTE */ #define _PAGE_PSIZE 0 @@ -63,14 +56,7 @@ #define _PAGE_BASE (_PAGE_BASE_NC) #endif -/* Permission masks used to generate the __P and __S table */ -#define PAGE_NONE __pgprot(_PAGE_BASE) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#include <asm/pgtable-masks.h> #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_32_PTE_FSL_85xx_H */ diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h index b775c7d465a41..31d2c3ea7df8f 100644 --- a/arch/powerpc/include/asm/nohash/pte-e500.h +++ b/arch/powerpc/include/asm/nohash/pte-e500.h @@ -48,14 +48,19 @@ /* "Higher level" linux bit combinations */ #define _PAGE_EXEC (_PAGE_BAP_SX | _PAGE_BAP_UX) /* ..
and was cache cleaned */ -#define _PAGE_RW (_PAGE_BAP_SW | _PAGE_BAP_UW) /* User write permission */ +#define _PAGE_READ (_PAGE_BAP_SR | _PAGE_BAP_UR) /* User read permission */ +#define _PAGE_WRITE (_PAGE_BAP_SW | _PAGE_BAP_UW) /* User write permission */ + #define _PAGE_KERNEL_RW (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY) #define _PAGE_KERNEL_RO (_PAGE_BAP_SR) #define _PAGE_KERNEL_RWX (_PAGE_BAP_SW | _PAGE_BAP_SR | _PAGE_DIRTY | _PAGE_BAP_SX) #define _PAGE_KERNEL_ROX (_PAGE_BAP_SR | _PAGE_BAP_SX) -#define _PAGE_USER (_PAGE_BAP_UR | _PAGE_BAP_SR) /* Can be read */ -#define _PAGE_WRITE _PAGE_RW +#define _PAGE_NA 0 +#define _PAGE_RO _PAGE_READ +#define _PAGE_ROX (_PAGE_READ | _PAGE_BAP_UX) +#define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) +#define _PAGE_RWX (_PAGE_READ | _PAGE_WRITE | _PAGE_BAP_UX) #define _PAGE_SPECIAL _PAGE_SW0 @@ -90,14 +95,7 @@ #define _PAGE_BASE (_PAGE_BASE_NC) #endif -/* Permission masks used to generate the __P and __S table */ -#define PAGE_NONE __pgprot(_PAGE_BASE) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_BAP_UX) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_BAP_UX) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_BAP_UX) +#include <asm/pgtable-masks.h> #ifndef __ASSEMBLY__ static inline pte_t pte_mkexec(pte_t pte) diff --git a/arch/powerpc/kernel/head_85xx.S b/arch/powerpc/kernel/head_85xx.S index 1b122cba557b7..39724ff5ae1f5 100644 --- a/arch/powerpc/kernel/head_85xx.S +++ b/arch/powerpc/kernel/head_85xx.S @@ -485,10 +485,10 @@ END_BTB_FLUSH_SECTION */ mfspr r12,SPRN_ESR #ifdef CONFIG_PTE_64BIT - li r13,_PAGE_PRESENT + li r13,_PAGE_PRESENT|_PAGE_BAP_SR oris r13,r13,_PAGE_ACCESSED@h #else - li r13,_PAGE_PRESENT|_PAGE_ACCESSED + li r13,_PAGE_PRESENT|_PAGE_READ|_PAGE_ACCESSED #endif rlwimi r13,r12,11,29,29 @@ -783,15 +783,15 @@ BEGIN_MMU_FTR_SECTION mtspr SPRN_MAS7, r10 END_MMU_FTR_SECTION_IFSET(MMU_FTR_BIG_PHYS) #else - li r10, (_PAGE_EXEC | _PAGE_PRESENT) + li r10, (_PAGE_EXEC | _PAGE_READ) mr r13, r11 rlwimi r10, r11, 31, 29, 29 /* extract _PAGE_DIRTY into SW */ and r12, r11, r10 - andi. r10, r11, _PAGE_USER /* Test for _PAGE_USER */ + mcrf cr0, cr5 /* Test for user page */ slwi r10, r12, 1 or r10, r10, r12 rlwinm r10, r10, 0, ~_PAGE_EXEC /* Clear SX on user pages */ - iseleq r12, r12, r10 + isellt r12, r10, r12 rlwimi r13, r12, 0, 20, 31 /* Get RPN from PTE, merge w/ perms */ mtspr SPRN_MAS3, r13 #endif From 93820bfeefc4a125a6cedd1ee1a956eeb3eb2580 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:45 +0200 Subject: [PATCH 30/76] powerpc/44x: Introduce _PAGE_READ and remove _PAGE_USER 44x MMU has 6 page protection bits: - R, W, X for supervisor - R, W, X for user It means that it can support X without R. To do that, _PAGE_READ flag is needed but there is no bit available for it in PTE. On the other hand the only real use of _PAGE_USER is to implement PAGE_NONE by clearing _PAGE_USER. As _PAGE_NONE can also be implemented by clearing _PAGE_READ, remove _PAGE_USER and add _PAGE_READ. In order to insert bits in one go during TLB miss, move _PAGE_ACCESSED and put _PAGE_READ just after _PAGE_DIRTY so that _PAGE_DIRTY is copied into SW and _PAGE_READ into SR at once. With that change, 44x now also honors execute-only protection.
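From user space the new behaviour is observable with an execute-only mapping: a data read through a PROT_EXEC-only page should now fault. A small demonstration program (whether the access actually faults depends on the platform honoring execute-only, as described above):

    #include <stdio.h>
    #include <sys/mman.h>
    #include <unistd.h>

    int main(void)
    {
            long psize = sysconf(_SC_PAGESIZE);
            /* executable but neither readable nor writable */
            void *p = mmap(NULL, psize, PROT_EXEC,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED) {
                    perror("mmap");
                    return 1;
            }
            printf("exec-only page at %p, data read should SIGSEGV\n", p);
            printf("%d\n", *(volatile int *)p);  /* expected to fault */
            return 0;
    }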
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/043e17987b260b99b45094138c6cb2e89e63d499.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pte-44x.h | 22 +++--------- arch/powerpc/kernel/head_44x.S | 36 ++++++++++---------- 2 files changed, 22 insertions(+), 36 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pte-44x.h b/arch/powerpc/include/asm/nohash/32/pte-44x.h index fc0c075006ead..8518137252371 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-44x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-44x.h @@ -63,28 +63,21 @@ */ #define _PAGE_PRESENT 0x00000001 /* S: PTE valid */ -#define _PAGE_RW 0x00000002 /* S: Write permission */ +#define _PAGE_WRITE 0x00000002 /* S: Write permission */ #define _PAGE_EXEC 0x00000004 /* H: Execute permission */ -#define _PAGE_ACCESSED 0x00000008 /* S: Page referenced */ +#define _PAGE_READ 0x00000008 /* S: Read permission */ #define _PAGE_DIRTY 0x00000010 /* S: Page dirty */ #define _PAGE_SPECIAL 0x00000020 /* S: Special page */ -#define _PAGE_USER 0x00000040 /* S: User page */ +#define _PAGE_ACCESSED 0x00000040 /* S: Page referenced */ #define _PAGE_ENDIAN 0x00000080 /* H: E bit */ #define _PAGE_GUARDED 0x00000100 /* H: G bit */ #define _PAGE_COHERENT 0x00000200 /* H: M bit */ #define _PAGE_NO_CACHE 0x00000400 /* H: I bit */ #define _PAGE_WRITETHRU 0x00000800 /* H: W bit */ -#define _PAGE_WRITE _PAGE_RW - /* No page size encoding in the linux PTE */ #define _PAGE_PSIZE 0 -#define _PAGE_KERNEL_RO 0 -#define _PAGE_KERNEL_ROX _PAGE_EXEC -#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW) -#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) - /* TODO: Add large page lowmem mapping support */ #define _PMD_PRESENT 0 #define _PMD_PRESENT_MASK (PAGE_MASK) @@ -107,14 +100,7 @@ #define _PAGE_BASE (_PAGE_BASE_NC) #endif -/* Permission masks used to generate the __P and __S table */ -#define PAGE_NONE __pgprot(_PAGE_BASE) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#include <asm/pgtable-masks.h> #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_32_PTE_44x_H */ diff --git a/arch/powerpc/kernel/head_44x.S b/arch/powerpc/kernel/head_44x.S index 858dabf844329..25642e802ed3d 100644 --- a/arch/powerpc/kernel/head_44x.S +++ b/arch/powerpc/kernel/head_44x.S @@ -314,8 +314,8 @@ interrupt_base: * kernel page tables. */ lis r11, PAGE_OFFSET@h - cmplw r10, r11 - blt+ 3f + cmplw cr7, r10, r11 + blt+ cr7, 3f lis r11, swapper_pg_dir@h ori r11, r11, swapper_pg_dir@l @@ -355,7 +355,7 @@ interrupt_base: * place or can we save a couple of instructions here ? */ mfspr r12,SPRN_ESR - li r13,_PAGE_PRESENT|_PAGE_ACCESSED + li r13,_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_READ rlwimi r13,r12,10,30,30 /* Load the PTE */ @@ -428,8 +428,8 @@ interrupt_base: * kernel page tables. */ lis r11, PAGE_OFFSET@h - cmplw r10, r11 - blt+ 3f + cmplw cr7, r10, r11 + blt+ cr7, 3f lis r11, swapper_pg_dir@h ori r11, r11, swapper_pg_dir@l @@ -515,6 +515,7 @@ interrupt_base: * r11 - PTE high word value * r12 - PTE low word value * r13 - TLB index + * cr7 - Result of comparison with PAGE_OFFSET * MMUCR - loaded with proper value when we get here * Upon exit, we reload everything and RFI.
*/ @@ -533,11 +534,10 @@ finish_tlb_load_44x: tlbwe r10,r13,PPC44x_TLB_PAGEID /* Write PAGEID */ /* And WS 2 */ - li r10,0xf85 /* Mask to apply from PTE */ - rlwimi r10,r12,29,30,30 /* DIRTY -> SW position */ + li r10,0xf84 /* Mask to apply from PTE */ + rlwimi r10,r12,29,30,31 /* DIRTY,READ -> SW,SR position */ and r11,r12,r10 /* Mask PTE bits to keep */ - andi. r10,r12,_PAGE_USER /* User page ? */ - beq 1f /* nope, leave U bits empty */ + bge cr7,1f /* User page ? no, leave U bits empty */ rlwimi r11,r11,3,26,28 /* yes, copy S bits to U */ rlwinm r11,r11,0,~PPC44x_TLB_SX /* Clear SX if User page */ 1: tlbwe r11,r13,PPC44x_TLB_ATTRIB /* Write ATTRIB */ @@ -568,8 +568,8 @@ finish_tlb_load_44x: * kernel page tables. */ lis r11,PAGE_OFFSET@h - cmplw cr0,r10,r11 - blt+ 3f + cmplw cr7,r10,r11 + blt+ cr7,3f lis r11,swapper_pg_dir@h ori r11,r11, swapper_pg_dir@l li r12,0 /* MMUCR = 0 */ @@ -599,7 +599,7 @@ finish_tlb_load_44x: * place or can we save a couple of instructions here ? */ mfspr r12,SPRN_ESR - li r13,_PAGE_PRESENT|_PAGE_ACCESSED + li r13,_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_READ rlwimi r13,r12,10,30,30 /* Load the PTE */ @@ -669,8 +669,8 @@ finish_tlb_load_44x: * kernel page tables. */ lis r11,PAGE_OFFSET@h - cmplw cr0,r10,r11 - blt+ 3f + cmplw cr7,r10,r11 + blt+ cr7,3f lis r11,swapper_pg_dir@h ori r11,r11, swapper_pg_dir@l li r12,0 /* MMUCR = 0 */ @@ -744,6 +744,7 @@ finish_tlb_load_44x: * r11 - PTE high word value * r12 - PTE low word value * r13 - free to use + * cr7 - Result of comparison with PAGE_OFFSET * MMUCR - loaded with proper value when we get here * Upon exit, we reload everything and RFI. */ @@ -753,11 +754,10 @@ finish_tlb_load_47x: tlbwe r11,r13,1 /* And make up word 2 */ - li r10,0xf85 /* Mask to apply from PTE */ - rlwimi r10,r12,29,30,30 /* DIRTY -> SW position */ + li r10,0xf84 /* Mask to apply from PTE */ + rlwimi r10,r12,29,30,31 /* DIRTY,READ -> SW,SR position */ and r11,r12,r10 /* Mask PTE bits to keep */ - andi. r10,r12,_PAGE_USER /* User page ? */ - beq 1f /* nope, leave U bits empty */ + bge cr7,1f /* User page ? no, leave U bits empty */ rlwimi r11,r11,3,26,28 /* yes, copy S bits to U */ rlwinm r11,r11,0,~PPC47x_TLB2_SX /* Clear SX if User page */ 1: tlbwe r11,r13,2 From ed815bd3fe9b14a742e2ae094f7f55f70918dbbc Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:46 +0200 Subject: [PATCH 31/76] powerpc/40x: Introduce _PAGE_READ and remove _PAGE_USER _PAGE_USER is used to select the zone. Today zone 0 is kernel and zone 1 is user. To implement _PAGE_NONE, _PAGE_USER is cleared, leading to no access for user but kernel still has access to the page so it's possible for a user application to write in that page by using a kernel function as trampoline. What is really wanted is to have user rights on pages below TASK_SIZE and no user rights on pages above TASK_SIZE. Use zones for that. There are 16 zones so let's use the 4 upper address bits to set the zone and declare zone rights based on TASK_SIZE. Then drop _PAGE_USER and reuse it as _PAGE_READ that will be checked in Data TLB miss handler. That will properly handle PAGE_NONE for both kernel and user. In addition, it partially implements execute-only right. The implementation won't be complete because once a TLB has been loaded via the Instruction TLB miss handler, it will be possible to read the page. But at least it can't be read unless it is executed first.
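The ZPR value this produces can be computed in isolation; a user-space sketch of the loop added to MMU_init_hw() in the diff below (TASK_SIZE is shown as 0xc0000000 purely as an example):

    #include <stdio.h>

    #define EXAMPLE_TASK_SIZE 0xc0000000UL  /* example value only */

    int main(void)
    {
            unsigned long zpr = 0;
            unsigned long i;

            /* 01 (user access as per PTE) for each 256MB zone below
             * TASK_SIZE, 00 (kernel only) for every zone above it.
             */
            for (i = 0; i < EXAMPLE_TASK_SIZE >> 28; i++)
                    zpr |= 1UL << (30 - i * 2);

            printf("ZPR = 0x%08lx\n", zpr);  /* 0x55555500 for 12 user zones */
            return 0;
    }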
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/2a13e3ba8a5dec43143cc1f9a91ec71ea1529f3c.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/32/pte-40x.h | 20 +++----------------- arch/powerpc/kernel/head_40x.S | 7 ++++--- arch/powerpc/mm/nohash/40x.c | 19 ++++++++++++------- 3 files changed, 19 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/32/pte-40x.h b/arch/powerpc/include/asm/nohash/32/pte-40x.h index e28ef0f5781e7..d759cfd747541 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-40x.h +++ b/arch/powerpc/include/asm/nohash/32/pte-40x.h @@ -42,26 +42,19 @@ #define _PAGE_PRESENT 0x002 /* software: PTE contains a translation */ #define _PAGE_NO_CACHE 0x004 /* I: caching is inhibited */ #define _PAGE_WRITETHRU 0x008 /* W: caching is write-through */ -#define _PAGE_USER 0x010 /* matches one of the zone permission bits */ +#define _PAGE_READ 0x010 /* software: read permission */ #define _PAGE_SPECIAL 0x020 /* software: Special page */ #define _PAGE_DIRTY 0x080 /* software: dirty page */ -#define _PAGE_RW 0x100 /* hardware: WR, anded with dirty in exception */ +#define _PAGE_WRITE 0x100 /* hardware: WR, anded with dirty in exception */ #define _PAGE_EXEC 0x200 /* hardware: EX permission */ #define _PAGE_ACCESSED 0x400 /* software: R: page referenced */ -#define _PAGE_WRITE _PAGE_RW - /* No page size encoding in the linux PTE */ #define _PAGE_PSIZE 0 /* cache related flags non existing on 40x */ #define _PAGE_COHERENT 0 -#define _PAGE_KERNEL_RO 0 -#define _PAGE_KERNEL_ROX _PAGE_EXEC -#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW) -#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) - #define _PMD_PRESENT 0x400 /* PMD points to page of PTEs */ #define _PMD_PRESENT_MASK _PMD_PRESENT #define _PMD_BAD 0x802 @@ -74,14 +67,7 @@ #define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) #define _PAGE_BASE (_PAGE_BASE_NC) -/* Permission masks used to generate the __P and __S table */ -#define PAGE_NONE __pgprot(_PAGE_BASE) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#include <asm/pgtable-masks.h> #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_NOHASH_32_PTE_40x_H */ diff --git a/arch/powerpc/kernel/head_40x.S b/arch/powerpc/kernel/head_40x.S index 9f92f5c5e6aa7..9fc90410b385c 100644 --- a/arch/powerpc/kernel/head_40x.S +++ b/arch/powerpc/kernel/head_40x.S @@ -312,7 +312,7 @@ _ASM_NOKPROBE_SYMBOL(\name\()_virt) rlwimi r11, r10, 22, 20, 29 /* Compute PTE address */ lwz r11, 0(r11) /* Get Linux PTE */ - li r9, _PAGE_PRESENT | _PAGE_ACCESSED + li r9, _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_READ andc. r9, r9, r11 /* Check permission */ bne 5f @@ -561,10 +561,11 @@ finish_tlb_load: /* * Clear out the software-only bits in the PTE to generate the * TLB_DATA value. These are the bottom 2 bits of the RPM, the - * top 3 bits of the zone field, and M. + * 4 bits of the zone field, and M. */ - li r9, 0x0ce2 + li r9, 0x0cf2 andc r11, r11, r9 + rlwimi r11, r10, 8, 24, 27 /* Copy 4 upper address bit into zone */ /* load the next available TLB index.
*/ lwz r9, tlb_4xx_index@l(0) diff --git a/arch/powerpc/mm/nohash/40x.c b/arch/powerpc/mm/nohash/40x.c index 3684d6e570fb5..e835e80c09dba 100644 --- a/arch/powerpc/mm/nohash/40x.c +++ b/arch/powerpc/mm/nohash/40x.c @@ -48,20 +48,25 @@ */ void __init MMU_init_hw(void) { + int i; + unsigned long zpr; + /* * The Zone Protection Register (ZPR) defines how protection will - * be applied to every page which is a member of a given zone. At - * present, we utilize only two of the 4xx's zones. + * be applied to every page which is a member of a given zone. * The zone index bits (of ZSEL) in the PTE are used for software - * indicators, except the LSB. For user access, zone 1 is used, - * for kernel access, zone 0 is used. We set all but zone 1 - * to zero, allowing only kernel access as indicated in the PTE. - * For zone 1, we set a 01 binary (a value of 10 will not work) + * indicators. We use the 4 upper bits of virtual address to select + * the zone. We set all zones above TASK_SIZE to zero, allowing + * only kernel access as indicated in the PTE. For zones below + * TASK_SIZE, we set a 01 binary (a value of 10 will not work) * to allow user access as indicated in the PTE. This also allows * kernel access as indicated in the PTE. */ - mtspr(SPRN_ZPR, 0x10000000); + for (i = 0, zpr = 0; i < TASK_SIZE >> 28; i++) + zpr |= 1 << (30 - i * 2); + + mtspr(SPRN_ZPR, zpr); flush_instruction_cache(); From 46ebef51fd92f52ba7dca21d3c4332e4127de515 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:47 +0200 Subject: [PATCH 32/76] powerpc/32s: Add _PAGE_WRITE to supplement _PAGE_RW In several places, _PAGE_RW maps to write permission and does not always imply read. To make this clearer, do as book3s/64 in commit c7d54842deb1 ("powerpc/mm: Use _PAGE_READ to indicate Read access") and use _PAGE_WRITE when more relevant. For the time being _PAGE_WRITE is equivalent to _PAGE_RW but that will change when _PAGE_READ gets added in following patches.
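As the hash_low.S comments below spell out, write access through a hash PTE also depends on the dirty bit: the HPTE is only made writable when the Linux PTE has both _PAGE_WRITE and _PAGE_DIRTY, so a clean page first takes a fault that sets the dirty bit. Schematically (illustrative flag values, not the real layout):

    #include <stdbool.h>

    #define SW_PAGE_DIRTY 0x080  /* illustrative values only */
    #define SW_PAGE_WRITE 0x400

    /* Models the "writable if _RW & _DIRTY" rule from create_hpte() */
    static bool hpte_writable(unsigned long pte)
    {
            return (pte & SW_PAGE_WRITE) && (pte & SW_PAGE_DIRTY);
    }

    int main(void)
    {
            return hpte_writable(SW_PAGE_WRITE);  /* 0: clean page, read-only */
    }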
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/5798782869fe4d2698f104948dabd17657b89395.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/pgtable.h | 15 ++++++++++++--- arch/powerpc/kernel/head_book3s_32.S | 6 +++--- arch/powerpc/mm/book3s32/hash_low.S | 12 ++++++------ arch/powerpc/mm/book3s32/mmu.c | 2 +- 4 files changed, 22 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 80505915c77cf..480ad6b4fd6fe 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -31,6 +31,8 @@ #define _PAGE_RW 0x400 /* software: user write access allowed */ #define _PAGE_SPECIAL 0x800 /* software: Special page */ +#define _PAGE_WRITE _PAGE_RW + #ifdef CONFIG_PTE_64BIT /* We never clear the high word of the pte */ #define _PTE_NONE_MASK (0xffffffff00000000ULL | _PAGE_HASHPTE) @@ -347,7 +349,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { - pte_update(mm, addr, ptep, _PAGE_RW, 0, 0); + pte_update(mm, addr, ptep, _PAGE_WRITE, 0, 0); } static inline void __ptep_set_access_flags(struct vm_area_struct *vma, @@ -406,7 +408,11 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) } /* Generic accessors to PTE bits */ -static inline int pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_RW);} +static inline bool pte_write(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_WRITE); +} + static inline int pte_read(pte_t pte) { return 1; } static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } @@ -469,7 +475,7 @@ static inline pte_t pfn_pte(unsigned long pfn, pgprot_t pgprot) /* Generic modifiers for PTE bits */ static inline pte_t pte_wrprotect(pte_t pte) { - return __pte(pte_val(pte) & ~_PAGE_RW); + return __pte(pte_val(pte) & ~_PAGE_WRITE); } static inline pte_t pte_exprotect(pte_t pte) @@ -499,6 +505,9 @@ static inline pte_t pte_mkpte(pte_t pte) static inline pte_t pte_mkwrite_novma(pte_t pte) { + /* + * write implies read, hence set both + */ return __pte(pte_val(pte) | _PAGE_RW); } diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index 6764b98ca360f..615d429d7bd10 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -503,9 +503,9 @@ DataLoadTLBMiss: andc. 
r1,r1,r0 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1,r0,32-9,30,30 /* _PAGE_RW -> PP msb */ + rlwinm r1,r0,32-9,30,30 /* _PAGE_WRITE -> PP msb */ rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r1,r0,32-3,24,24 /* _PAGE_RW -> _PAGE_DIRTY */ + rlwimi r1,r0,32-3,24,24 /* _PAGE_WRITE -> _PAGE_DIRTY */ rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ xori r1,r1,_PAGE_DIRTY /* clear dirty when not rw */ ori r1,r1,0xe04 /* clear out reserved bits */ @@ -689,7 +689,7 @@ hash_page_dsi: mfdar r4 mfsrr0 r5 mfsrr1 r9 - rlwinm r3, r3, 32 - 15, _PAGE_RW /* DSISR_STORE -> _PAGE_RW */ + rlwinm r3, r3, 32 - 15, _PAGE_WRITE /* DSISR_STORE -> _PAGE_WRITE */ bl hash_page mfspr r10, SPRN_SPRG_THREAD restore_regs_thread r10 diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S index 8b804e1a9fa44..acb0584c174cb 100644 --- a/arch/powerpc/mm/book3s32/hash_low.S +++ b/arch/powerpc/mm/book3s32/hash_low.S @@ -37,7 +37,7 @@ /* * Load a PTE into the hash table, if possible. * The address is in r4, and r3 contains an access flag: - * _PAGE_RW (0x400) if a write. + * _PAGE_WRITE (0x400) if a write. * r9 contains the SRR1 value, from which we use the MSR_PR bit. * SPRG_THREAD contains the physical address of the current task's thread. * @@ -113,15 +113,15 @@ _GLOBAL(hash_page) lwarx r6,0,r8 /* get linux-style pte, flag word */ #ifdef CONFIG_PPC_KUAP mfsrin r5,r4 - rlwinm r0,r9,28,_PAGE_RW /* MSR[PR] => _PAGE_RW */ - rlwinm r5,r5,12,_PAGE_RW /* Ks => _PAGE_RW */ + rlwinm r0,r9,28,_PAGE_WRITE /* MSR[PR] => _PAGE_WRITE */ + rlwinm r5,r5,12,_PAGE_WRITE /* Ks => _PAGE_WRITE */ andc r5,r5,r0 /* Ks & ~MSR[PR] */ - andc r5,r6,r5 /* Clear _PAGE_RW when Ks = 1 && MSR[PR] = 0 */ + andc r5,r6,r5 /* Clear _PAGE_WRITE when Ks = 1 && MSR[PR] = 0 */ andc. r5,r3,r5 /* check access & ~permission */ #else andc. r5,r3,r6 /* check access & ~permission */ #endif - rlwinm r0,r3,32-3,24,24 /* _PAGE_RW access -> _PAGE_DIRTY */ + rlwinm r0,r3,32-3,24,24 /* _PAGE_WRITE access -> _PAGE_DIRTY */ ori r0,r0,_PAGE_ACCESSED|_PAGE_HASHPTE #ifdef CONFIG_SMP bne- .Lhash_page_out /* return if access not permitted */ @@ -307,7 +307,7 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64) __REF _GLOBAL(create_hpte) /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ - rlwinm r8,r5,32-9,30,30 /* _PAGE_RW -> PP msb */ + rlwinm r8,r5,32-9,30,30 /* _PAGE_WRITE -> PP msb */ rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */ and r8,r8,r0 /* writable if _RW & _DIRTY */ rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ diff --git a/arch/powerpc/mm/book3s32/mmu.c b/arch/powerpc/mm/book3s32/mmu.c index d1041c946ce20..5445587bfe841 100644 --- a/arch/powerpc/mm/book3s32/mmu.c +++ b/arch/powerpc/mm/book3s32/mmu.c @@ -277,7 +277,7 @@ void __init setbat(int index, unsigned long virt, phys_addr_t phys, /* Do DBAT first */ wimgxpp = flags & (_PAGE_WRITETHRU | _PAGE_NO_CACHE | _PAGE_COHERENT | _PAGE_GUARDED); - wimgxpp |= (flags & _PAGE_RW)? BPP_RW: BPP_RX; + wimgxpp |= (flags & _PAGE_WRITE) ? 
BPP_RW : BPP_RX; bat[1].batu = virt | (bl << 2) | 2; /* Vs=1, Vp=0 */ bat[1].batl = BAT_PHYS_ADDR(phys) | wimgxpp; if (!is_kernel_addr(virt)) From bac4cffc7c4a009cf0d16f1785a275e0a7715e8d Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:48 +0200 Subject: [PATCH 33/76] powerpc/32s: Introduce _PAGE_READ and remove _PAGE_USER On 603 MMU, TLB misses are handled by SW and there are separate DTLB and ITLB. It is therefore possible to implement execute-only protection by not loading DTLB when read access is not permitted. To do that, the _PAGE_READ flag is needed but there is no bit available for it in the PTE. On the other hand, the only real use of _PAGE_USER is to implement PAGE_NONE by clearing _PAGE_USER. As PAGE_NONE can also be implemented by clearing _PAGE_READ, remove _PAGE_USER and add _PAGE_READ. Then use the virtual address to know whether user rights or kernel rights are to be used. With that change, 603 MMU now honors execute-only protection. For hash (604) MMU it is more tricky because the hash table is common to load/store and execute. Nevertheless it is still possible to check whether _PAGE_READ is set before loading the hash table for a load/store access. At least a page can't be read unless it has been executed first. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/b7702dd5a041ec59055ed2880f4952e94c087a2e.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/pgtable.h | 48 ++++----------- arch/powerpc/kernel/head_book3s_32.S | 61 +++++++++++--------- arch/powerpc/mm/book3s32/hash_low.S | 22 ++++--- 3 files changed, 60 insertions(+), 71 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 480ad6b4fd6fe..244621c88510e 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -20,7 +20,7 @@ #define _PAGE_PRESENT 0x001 /* software: pte contains a translation */ #define _PAGE_HASHPTE 0x002 /* hash_page has made an HPTE for this pte */ -#define _PAGE_USER 0x004 /* usermode access allowed */ +#define _PAGE_READ 0x004 /* software: read access allowed */ #define _PAGE_GUARDED 0x008 /* G: prohibit speculative access */ #define _PAGE_COHERENT 0x010 /* M: enforce memory coherence (SMP systems) */ #define _PAGE_NO_CACHE 0x020 /* I: cache inhibit */ @@ -28,11 +28,9 @@ #define _PAGE_DIRTY 0x080 /* C: page changed */ #define _PAGE_ACCESSED 0x100 /* R: page referenced */ #define _PAGE_EXEC 0x200 /* software: exec allowed */ -#define _PAGE_RW 0x400 /* software: user write access allowed */ +#define _PAGE_WRITE 0x400 /* software: user write access allowed */ #define _PAGE_SPECIAL 0x800 /* software: Special page */ -#define _PAGE_WRITE _PAGE_RW - #ifdef CONFIG_PTE_64BIT /* We never clear the high word of the pte */ #define _PTE_NONE_MASK (0xffffffff00000000ULL | _PAGE_HASHPTE) @@ -44,26 +42,13 @@ #define _PMD_PRESENT_MASK (PAGE_MASK) #define _PMD_BAD (~PAGE_MASK) -/* We borrow the _PAGE_USER bit to store the exclusive marker in swap PTEs. */ -#define _PAGE_SWP_EXCLUSIVE _PAGE_USER +/* We borrow the _PAGE_READ bit to store the exclusive marker in swap PTEs.
*/ +#define _PAGE_SWP_EXCLUSIVE _PAGE_READ /* And here we include common definitions */ -#define _PAGE_KERNEL_RO 0 -#define _PAGE_KERNEL_ROX (_PAGE_EXEC) -#define _PAGE_KERNEL_RW (_PAGE_DIRTY | _PAGE_RW) -#define _PAGE_KERNEL_RWX (_PAGE_DIRTY | _PAGE_RW | _PAGE_EXEC) - #define _PAGE_HPTEFLAGS _PAGE_HASHPTE -#ifndef __ASSEMBLY__ - -static inline bool pte_user(pte_t pte) -{ - return pte_val(pte) & _PAGE_USER; -} -#endif /* __ASSEMBLY__ */ - /* * Location of the PFN in the PTE. Most 32-bit platforms use the same * as _PAGE_SHIFT here (ie, naturally aligned). @@ -99,20 +84,7 @@ static inline bool pte_user(pte_t pte) #define _PAGE_BASE_NC (_PAGE_PRESENT | _PAGE_ACCESSED) #define _PAGE_BASE (_PAGE_BASE_NC | _PAGE_COHERENT) -/* - * Permission masks used to generate the __P and __S table. - * - * Note:__pgprot is defined in arch/powerpc/include/asm/page.h - * - * Write permissions imply read permissions for now. - */ -#define PAGE_NONE __pgprot(_PAGE_BASE) -#define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW) -#define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_RW | _PAGE_EXEC) -#define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_COPY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) -#define PAGE_READONLY __pgprot(_PAGE_BASE | _PAGE_USER) -#define PAGE_READONLY_X __pgprot(_PAGE_BASE | _PAGE_USER | _PAGE_EXEC) +#include <asm/pgtable-masks.h> /* Permission masks used for kernel mappings */ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) @@ -408,12 +380,16 @@ static inline pte_t pte_swp_clear_exclusive(pte_t pte) } /* Generic accessors to PTE bits */ +static inline bool pte_read(pte_t pte) +{ + return !!(pte_val(pte) & _PAGE_READ); +} + static inline bool pte_write(pte_t pte) { return !!(pte_val(pte) & _PAGE_WRITE); } -static inline int pte_read(pte_t pte) { return 1; } static inline int pte_dirty(pte_t pte) { return !!(pte_val(pte) & _PAGE_DIRTY); } static inline int pte_young(pte_t pte) { return !!(pte_val(pte) & _PAGE_ACCESSED); } static inline int pte_special(pte_t pte) { return !!(pte_val(pte) & _PAGE_SPECIAL); } @@ -448,10 +424,10 @@ static inline bool pte_ci(pte_t pte) static inline bool pte_access_permitted(pte_t pte, bool write) { /* - * A read-only access is controlled by _PAGE_USER bit. + * A read-only access is controlled by _PAGE_READ bit. * We have _PAGE_READ set for WRITE and EXECUTE */ - if (!pte_present(pte) || !pte_user(pte) || !pte_read(pte)) return false; if (write && !pte_write(pte)) + if (!pte_present(pte) || !pte_read(pte)) return false; if (write && !pte_write(pte)) diff --git a/arch/powerpc/kernel/head_book3s_32.S b/arch/powerpc/kernel/head_book3s_32.S index 615d429d7bd10..c1d89764dd222 100644 --- a/arch/powerpc/kernel/head_book3s_32.S +++ b/arch/powerpc/kernel/head_book3s_32.S @@ -412,10 +412,10 @@ END_FTR_SECTION_IFSET(CPU_FTR_FPU_UNAVAILABLE) .
= INTERRUPT_INST_TLB_MISS_603 InstructionTLBMiss: /* - * r0: scratch + * r0: userspace flag (later scratch) * r1: linux style pte ( later becomes ppc hardware pte ) * r2: ptr to linux-style pte - * r3: scratch + * r3: fault address */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_IMISS @@ -424,12 +424,13 @@ InstructionTLBMiss: cmplw 0,r1,r3 #endif mfspr r2, SPRN_SDR1 - li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC | _PAGE_USER + li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC rlwinm r2, r2, 28, 0xfffff000 #ifdef CONFIG_MODULES + li r0, 3 bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ - li r1,_PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_EXEC + li r0, 0 addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ #endif 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ @@ -437,13 +438,15 @@ InstructionTLBMiss: rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- InstructionAddressInvalid /* return if no mapping */ rlwimi r2,r3,22,20,29 /* insert next 10 bits of address */ - lwz r0,0(r2) /* get linux-style pte */ - andc. r1,r1,r0 /* check access & ~permission */ + lwz r2,0(r2) /* get linux-style pte */ + andc. r1,r1,r2 /* check access & ~permission */ bne- InstructionAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-2,31,31 /* _PAGE_USER -> PP lsb */ +#ifdef CONFIG_MODULES + rlwimi r2, r0, 0, 31, 31 /* userspace ? -> PP lsb */ +#endif ori r1, r1, 0xe06 /* clear out reserved bits */ - andc r1, r0, r1 /* PP = user? 1 : 0 */ + andc r1, r2, r1 /* PP = user? 1 : 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -478,38 +481,38 @@ InstructionAddressInvalid: . = INTERRUPT_DATA_LOAD_TLB_MISS_603 DataLoadTLBMiss: /* - * r0: scratch + * r0: userspace flag (later scratch) * r1: linux style pte ( later becomes ppc hardware pte ) * r2: ptr to linux-style pte - * r3: scratch + * r3: fault address */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_DMISS lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 mfspr r2, SPRN_SDR1 - li r1, _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_USER + li r1, _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_READ rlwinm r2, r2, 28, 0xfffff000 + li r0, 3 bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ - li r1, _PAGE_PRESENT | _PAGE_ACCESSED + li r0, 0 addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- DataAddressInvalid /* return if no mapping */ rlwimi r2,r3,22,20,29 /* insert next 10 bits of address */ - lwz r0,0(r2) /* get linux-style pte */ - andc. r1,r1,r0 /* check access & ~permission */ + lwz r2,0(r2) /* get linux-style pte */ + andc. r1,r1,r2 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwinm r1,r0,32-9,30,30 /* _PAGE_WRITE -> PP msb */ - rlwimi r0,r0,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r1,r0,32-3,24,24 /* _PAGE_WRITE -> _PAGE_DIRTY */ - rlwimi r0,r0,32-1,31,31 /* _PAGE_USER -> PP lsb */ + rlwinm r1,r2,32-9,30,30 /* _PAGE_WRITE -> PP msb */ + rlwimi r2,r0,0,30,31 /* userspace ? 
-> PP */ + rlwimi r1,r2,32-3,24,24 /* _PAGE_WRITE -> _PAGE_DIRTY */ xori r1,r1,_PAGE_DIRTY /* clear dirty when not rw */ ori r1,r1,0xe04 /* clear out reserved bits */ - andc r1,r0,r1 /* PP = user? rw? 1: 3: 0 */ + andc r1,r2,r1 /* PP = user? rw? 1: 3: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -558,34 +561,35 @@ DataAddressInvalid: . = INTERRUPT_DATA_STORE_TLB_MISS_603 DataStoreTLBMiss: /* - * r0: scratch + * r0: userspace flag (later scratch) * r1: linux style pte ( later becomes ppc hardware pte ) * r2: ptr to linux-style pte - * r3: scratch + * r3: fault address */ /* Get PTE (linux-style) and check access */ mfspr r3,SPRN_DMISS lis r1, TASK_SIZE@h /* check if kernel address */ cmplw 0,r1,r3 mfspr r2, SPRN_SDR1 - li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED | _PAGE_USER + li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED rlwinm r2, r2, 28, 0xfffff000 + li r0, 3 bgt- 112f lis r2, (swapper_pg_dir - PAGE_OFFSET)@ha /* if kernel address, use */ - li r1, _PAGE_RW | _PAGE_DIRTY | _PAGE_PRESENT | _PAGE_ACCESSED + li r0, 0 addi r2, r2, (swapper_pg_dir - PAGE_OFFSET)@l /* kernel page table */ 112: rlwimi r2,r3,12,20,29 /* insert top 10 bits of address */ lwz r2,0(r2) /* get pmd entry */ rlwinm. r2,r2,0,0,19 /* extract address of pte page */ beq- DataAddressInvalid /* return if no mapping */ rlwimi r2,r3,22,20,29 /* insert next 10 bits of address */ - lwz r0,0(r2) /* get linux-style pte */ - andc. r1,r1,r0 /* check access & ~permission */ + lwz r2,0(r2) /* get linux-style pte */ + andc. r1,r1,r2 /* check access & ~permission */ bne- DataAddressInvalid /* return if access not permitted */ /* Convert linux-style PTE to low word of PPC-style PTE */ - rlwimi r0,r0,32-2,31,31 /* _PAGE_USER -> PP lsb */ + rlwimi r2,r0,0,31,31 /* userspace ? -> PP lsb */ li r1,0xe06 /* clear out reserved bits & PP msb */ - andc r1,r0,r1 /* PP = user? 1: 0 */ + andc r1,r2,r1 /* PP = user? 1: 0 */ BEGIN_FTR_SECTION rlwinm r1,r1,0,~_PAGE_COHERENT /* clear M (coherence not required) */ END_FTR_SECTION_IFCLR(CPU_FTR_NEED_COHERENT) @@ -690,6 +694,7 @@ hash_page_dsi: mfsrr0 r5 mfsrr1 r9 rlwinm r3, r3, 32 - 15, _PAGE_WRITE /* DSISR_STORE -> _PAGE_WRITE */ + ori r3, r3, _PAGE_PRESENT | _PAGE_READ bl hash_page mfspr r10, SPRN_SPRG_THREAD restore_regs_thread r10 @@ -699,7 +704,7 @@ hash_page_isi: mr r11, r10 mfspr r10, SPRN_SPRG_THREAD save_regs_thread r10 - li r3, 0 + li r3, _PAGE_PRESENT | _PAGE_EXEC lwz r4, SRR0(r10) lwz r9, SRR1(r10) bl hash_page diff --git a/arch/powerpc/mm/book3s32/hash_low.S b/arch/powerpc/mm/book3s32/hash_low.S index acb0584c174cb..4ed0efd03db54 100644 --- a/arch/powerpc/mm/book3s32/hash_low.S +++ b/arch/powerpc/mm/book3s32/hash_low.S @@ -36,8 +36,9 @@ /* * Load a PTE into the hash table, if possible. - * The address is in r4, and r3 contains an access flag: - * _PAGE_WRITE (0x400) if a write. + * The address is in r4, and r3 contains required access flags: + * - For ISI: _PAGE_PRESENT | _PAGE_EXEC + * - For DSI: _PAGE_PRESENT | _PAGE_READ | _PAGE_WRITE if a write. * r9 contains the SRR1 value, from which we use the MSR_PR bit. * SPRG_THREAD contains the physical address of the current task's thread. 
* @@ -67,12 +68,16 @@ _GLOBAL(hash_page) lis r0, TASK_SIZE@h /* check if kernel address */ cmplw 0,r4,r0 mfspr r8,SPRN_SPRG_THREAD /* current task's THREAD (phys) */ - ori r3,r3,_PAGE_USER|_PAGE_PRESENT /* test low addresses as user */ lwz r5,PGDIR(r8) /* virt page-table root */ blt+ 112f /* assume user more likely */ lis r5,swapper_pg_dir@ha /* if kernel address, use */ + andi. r0,r9,MSR_PR /* Check usermode */ addi r5,r5,swapper_pg_dir@l /* kernel page table */ - rlwimi r3,r9,32-12,29,29 /* MSR_PR -> _PAGE_USER */ +#ifdef CONFIG_SMP + bne- .Lhash_page_out /* return if usermode */ +#else + bnelr- +#endif 112: tophys(r5, r5) #ifndef CONFIG_PTE_64BIT rlwimi r5,r4,12,20,29 /* insert top 10 bits of address */ @@ -307,12 +312,15 @@ Hash_msk = (((1 << Hash_bits) - 1) * 64) __REF _GLOBAL(create_hpte) /* Convert linux-style PTE (r5) to low word of PPC-style PTE (r8) */ + lis r0, TASK_SIZE@h + rlwinm r5,r5,0,~3 /* Clear PP bits */ + cmplw r4,r0 rlwinm r8,r5,32-9,30,30 /* _PAGE_WRITE -> PP msb */ rlwinm r0,r5,32-6,30,30 /* _PAGE_DIRTY -> PP msb */ and r8,r8,r0 /* writable if _RW & _DIRTY */ - rlwimi r5,r5,32-1,30,30 /* _PAGE_USER -> PP msb */ - rlwimi r5,r5,32-2,31,31 /* _PAGE_USER -> PP lsb */ - ori r8,r8,0xe04 /* clear out reserved bits */ + bge- 1f /* Kernelspace ? Skip */ + ori r5,r5,3 /* Userspace ? PP = 3 */ +1: ori r8,r8,0xe04 /* clear out reserved bits */ andc r8,r5,r8 /* PP = user? (rw&dirty? 1: 3): 0 */ BEGIN_FTR_SECTION rlwinm r8,r8,0,~_PAGE_COHERENT /* clear M (coherence not required) */ From ceaba662c06598e52cbe4b90fef6b71b7f965cf9 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:49 +0200 Subject: [PATCH 34/76] powerpc/ptdump: Display _PAGE_READ and _PAGE_WRITE Instead of always displaying either 'rw' or 'r ' depending on _PAGE_RW, display 'r' or ' ' for _PAGE_READ and 'w' or ' ' for _PAGE_WRITE. Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/dd8201a0f8fd87ce62a7ff2edc958b604b8ec3c0.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/mm/ptdump/shared.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/mm/ptdump/shared.c b/arch/powerpc/mm/ptdump/shared.c index 5ff101654c45b..39c30c62b7ea7 100644 --- a/arch/powerpc/mm/ptdump/shared.c +++ b/arch/powerpc/mm/ptdump/shared.c @@ -11,10 +11,15 @@ static const struct flag_info flag_array[] = { { - .mask = _PAGE_RW, + .mask = _PAGE_READ, .val = 0, - .set = "r ", - .clear = "rw", + .set = " ", + .clear = "r", + }, { + .mask = _PAGE_WRITE, + .val = 0, + .set = " ", + .clear = "w", }, { .mask = _PAGE_EXEC, .val = _PAGE_EXEC, From 163a72fa89161b57b05d848aedfbd5103fac9dd7 Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:50 +0200 Subject: [PATCH 35/76] powerpc: Finally remove _PAGE_USER _PAGE_USER is now gone on all targets. Remove it completely. 
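With the fallback gone, the nohash pte_read() collapses to a plain _PAGE_READ test. A condensed sketch of the resulting accessor (see the diff below; the #ifndef guard still lets platforms override it):

  #ifndef pte_read
  static inline bool pte_read(pte_t pte)
  {
          /* match all of _PAGE_READ, not just any bit, per the comment above it */
          return (pte_val(pte) & _PAGE_READ) == _PAGE_READ;
  }
  #endif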
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/76ebe74fdaed4297a1d8203a61174650c1d8d278.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/nohash/pgtable.h | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 84b6a160d2fc3..8804bcc260cb9 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -182,18 +182,14 @@ static inline int pte_young(pte_t pte) } /* - * Don't just check for any non zero bits in __PAGE_USER, since for book3e + * Don't just check for any non zero bits in __PAGE_READ, since for book3e * and PTE_64BIT, PAGE_KERNEL_X contains _PAGE_BAP_SR which is also in - * _PAGE_USER. Need to explicitly match _PAGE_BAP_UR bit in that case too. + * _PAGE_READ. Need to explicitly match _PAGE_BAP_UR bit in that case too. */ #ifndef pte_read static inline bool pte_read(pte_t pte) { -#ifdef _PAGE_READ return (pte_val(pte) & _PAGE_READ) == _PAGE_READ; -#else - return (pte_val(pte) & _PAGE_USER) == _PAGE_USER; -#endif } #endif @@ -205,7 +201,7 @@ static inline bool pte_read(pte_t pte) static inline bool pte_access_permitted(pte_t pte, bool write) { /* - * A read-only access is controlled by _PAGE_USER bit. + * A read-only access is controlled by _PAGE_READ bit. * We have _PAGE_READ set for WRITE and EXECUTE */ if (!pte_present(pte) || !pte_read(pte)) return false; From b1fba034a6793e9601d581594a781b46c255471f Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Mon, 25 Sep 2023 20:31:51 +0200 Subject: [PATCH 36/76] powerpc: Support execute-only on all powerpc Introduce the PAGE_EXECONLY_X macro which provides exec-only rights. The _X may be seen as redundant with the EXECONLY but it helps keep consistency: all macros having the EXEC right have _X. And put it next to PAGE_NONE, as PAGE_EXECONLY_X is somehow PAGE_NONE + EXEC, just like all other SOMETHING_X are just SOMETHING + EXEC. On book3s/64 PAGE_EXECONLY becomes PAGE_READONLY_X. On book3s/64, as PAGE_EXECONLY is only valid for Radix, add the VM_READ flag in vm_get_page_prot() for non-Radix. And update access_error() so that a non-exec fault on a VM_EXEC-only mapping is always invalid, even when the underlying layer doesn't always generate a fault for that. For 8xx, set PAGE_EXECONLY_X as _PAGE_NA | _PAGE_EXEC. For the others, set it as just _PAGE_EXEC. With that change, 8xx, e500 and 44x fully honor execute-only protection. On 40x that is a partial implementation of execute-only. The implementation won't be complete because once a TLB has been loaded via the Instruction TLB miss handler, it will be possible to read the page. But at least it can't be read unless it is executed first. On 603 MMU, TLB misses are handled by SW and there are separate DTLB and ITLB. Execute-only is therefore now supported by not loading DTLB when read access is not permitted. On hash (604) MMU it is more tricky because the hash table is common to load/store and execute. Nevertheless it is still possible to check whether _PAGE_READ is set before loading the hash table for a load/store access. At least a page can't be read unless it has been executed first.
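From user space the new protection is reached through a plain PROT_EXEC-only mapping. A minimal sketch of how it can be exercised (illustrative only; as described above, how strictly the read is blocked depends on the MMU family):

  #include <sys/mman.h>

  int main(void)
  {
          /* PROT_EXEC without PROT_READ/PROT_WRITE maps to PAGE_EXECONLY_X */
          char *p = mmap(NULL, 4096, PROT_EXEC,
                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

          if (p == MAP_FAILED)
                  return 1;

          return p[0];    /* data load: faults where execute-only is honored */
  }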
Signed-off-by: Christophe Leroy Signed-off-by: Michael Ellerman Link: https://msgid.link/4283ea9cbef9ff2fbee468904800e1962bc8fc18.1695659959.git.christophe.leroy@csgroup.eu --- arch/powerpc/include/asm/book3s/32/pgtable.h | 2 +- arch/powerpc/include/asm/book3s/64/pgtable.h | 4 +--- arch/powerpc/include/asm/nohash/32/pte-8xx.h | 1 + arch/powerpc/include/asm/nohash/pgtable.h | 2 +- arch/powerpc/include/asm/nohash/pte-e500.h | 1 + arch/powerpc/include/asm/pgtable-masks.h | 2 ++ arch/powerpc/mm/book3s64/pgtable.c | 10 ++++------ arch/powerpc/mm/fault.c | 9 +++++---- arch/powerpc/mm/pgtable.c | 4 ++-- 9 files changed, 18 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/32/pgtable.h b/arch/powerpc/include/asm/book3s/32/pgtable.h index 244621c88510e..52971ee30717f 100644 --- a/arch/powerpc/include/asm/book3s/32/pgtable.h +++ b/arch/powerpc/include/asm/book3s/32/pgtable.h @@ -425,7 +425,7 @@ static inline bool pte_access_permitted(pte_t pte, bool write) { /* * A read-only access is controlled by _PAGE_READ bit. - * We have _PAGE_READ set for WRITE and EXECUTE + * We have _PAGE_READ set for WRITE */ if (!pte_present(pte) || !pte_read(pte)) return false; diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 0fd12bdc7b5eb..751b01227e364 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -18,6 +18,7 @@ #define _PAGE_WRITE 0x00002 /* write access allowed */ #define _PAGE_READ 0x00004 /* read access allowed */ #define _PAGE_NA _PAGE_PRIVILEGED +#define _PAGE_NAX _PAGE_EXEC #define _PAGE_RO _PAGE_READ #define _PAGE_ROX (_PAGE_READ | _PAGE_EXEC) #define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) @@ -141,9 +142,6 @@ #include -/* Radix only, Hash uses PAGE_READONLY_X + execute-only pkey instead */ -#define PAGE_EXECONLY __pgprot(_PAGE_BASE | _PAGE_EXEC) - /* Permission masks used for kernel mappings */ #define PAGE_KERNEL __pgprot(_PAGE_BASE | _PAGE_KERNEL_RW) #define PAGE_KERNEL_NC __pgprot(_PAGE_BASE_NC | _PAGE_KERNEL_RW | _PAGE_TOLERANT) diff --git a/arch/powerpc/include/asm/nohash/32/pte-8xx.h b/arch/powerpc/include/asm/nohash/32/pte-8xx.h index 1ee38befd29a2..137dc3c84e456 100644 --- a/arch/powerpc/include/asm/nohash/32/pte-8xx.h +++ b/arch/powerpc/include/asm/nohash/32/pte-8xx.h @@ -48,6 +48,7 @@ #define _PAGE_HUGE 0x0800 /* Copied to L1 PS bit 29 */ +#define _PAGE_NAX (_PAGE_NA | _PAGE_EXEC) #define _PAGE_ROX (_PAGE_RO | _PAGE_EXEC) #define _PAGE_RW 0 #define _PAGE_RWX _PAGE_EXEC diff --git a/arch/powerpc/include/asm/nohash/pgtable.h b/arch/powerpc/include/asm/nohash/pgtable.h index 8804bcc260cb9..427db14292c9e 100644 --- a/arch/powerpc/include/asm/nohash/pgtable.h +++ b/arch/powerpc/include/asm/nohash/pgtable.h @@ -202,7 +202,7 @@ static inline bool pte_access_permitted(pte_t pte, bool write) { /* * A read-only access is controlled by _PAGE_READ bit. 
- * We have _PAGE_READ set for WRITE and EXECUTE + * We have _PAGE_READ set for WRITE */ if (!pte_present(pte) || !pte_read(pte)) return false; diff --git a/arch/powerpc/include/asm/nohash/pte-e500.h b/arch/powerpc/include/asm/nohash/pte-e500.h index 31d2c3ea7df8f..f516f0b5b7a82 100644 --- a/arch/powerpc/include/asm/nohash/pte-e500.h +++ b/arch/powerpc/include/asm/nohash/pte-e500.h @@ -57,6 +57,7 @@ #define _PAGE_KERNEL_ROX (_PAGE_BAP_SR | _PAGE_BAP_SX) #define _PAGE_NA 0 +#define _PAGE_NAX _PAGE_BAP_UX #define _PAGE_RO _PAGE_READ #define _PAGE_ROX (_PAGE_READ | _PAGE_BAP_UX) #define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) diff --git a/arch/powerpc/include/asm/pgtable-masks.h b/arch/powerpc/include/asm/pgtable-masks.h index 808a3b9e8fc05..6e8e2db26a5a8 100644 --- a/arch/powerpc/include/asm/pgtable-masks.h +++ b/arch/powerpc/include/asm/pgtable-masks.h @@ -4,6 +4,7 @@ #ifndef _PAGE_NA #define _PAGE_NA 0 +#define _PAGE_NAX _PAGE_EXEC #define _PAGE_RO _PAGE_READ #define _PAGE_ROX (_PAGE_READ | _PAGE_EXEC) #define _PAGE_RW (_PAGE_READ | _PAGE_WRITE) @@ -20,6 +21,7 @@ /* Permission masks used to generate the __P and __S table */ #define PAGE_NONE __pgprot(_PAGE_BASE | _PAGE_NA) +#define PAGE_EXECONLY_X __pgprot(_PAGE_BASE | _PAGE_NAX) #define PAGE_SHARED __pgprot(_PAGE_BASE | _PAGE_RW) #define PAGE_SHARED_X __pgprot(_PAGE_BASE | _PAGE_RWX) #define PAGE_COPY __pgprot(_PAGE_BASE | _PAGE_RO) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 8f8a62d3ff4de..be229290a6a77 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -635,12 +635,10 @@ pgprot_t vm_get_page_prot(unsigned long vm_flags) unsigned long prot; /* Radix supports execute-only, but protection_map maps X -> RX */ - if (radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC)) { - prot = pgprot_val(PAGE_EXECONLY); - } else { - prot = pgprot_val(protection_map[vm_flags & - (VM_ACCESS_FLAGS | VM_SHARED)]); - } + if (!radix_enabled() && ((vm_flags & VM_ACCESS_FLAGS) == VM_EXEC)) + vm_flags |= VM_READ; + + prot = pgprot_val(protection_map[vm_flags & (VM_ACCESS_FLAGS | VM_SHARED)]); if (vm_flags & VM_SAO) prot |= _PAGE_SAO; diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index b1723094d464c..9e49ede2bc1c0 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -266,14 +266,15 @@ static bool access_error(bool is_write, bool is_exec, struct vm_area_struct *vma } /* - * VM_READ, VM_WRITE and VM_EXEC all imply read permissions, as - * defined in protection_map[]. Read faults can only be caused by - * a PROT_NONE mapping, or with a PROT_EXEC-only mapping on Radix. + * VM_READ, VM_WRITE and VM_EXEC may imply read permissions, as + * defined in protection_map[]. In that case Read faults can only be + * caused by a PROT_NONE mapping. However a non exec access on a + * VM_EXEC only mapping is invalid anyway, so report it as such. 
*/ if (unlikely(!vma_is_accessible(vma))) return true; - if (unlikely(radix_enabled() && ((vma->vm_flags & VM_ACCESS_FLAGS) == VM_EXEC))) + if ((vma->vm_flags & VM_ACCESS_FLAGS) == VM_EXEC) return true; /* diff --git a/arch/powerpc/mm/pgtable.c b/arch/powerpc/mm/pgtable.c index 781a68c69c2f0..79508c1d15d7f 100644 --- a/arch/powerpc/mm/pgtable.c +++ b/arch/powerpc/mm/pgtable.c @@ -492,7 +492,7 @@ const pgprot_t protection_map[16] = { [VM_READ] = PAGE_READONLY, [VM_WRITE] = PAGE_COPY, [VM_WRITE | VM_READ] = PAGE_COPY, - [VM_EXEC] = PAGE_READONLY_X, + [VM_EXEC] = PAGE_EXECONLY_X, [VM_EXEC | VM_READ] = PAGE_READONLY_X, [VM_EXEC | VM_WRITE] = PAGE_COPY_X, [VM_EXEC | VM_WRITE | VM_READ] = PAGE_COPY_X, @@ -500,7 +500,7 @@ const pgprot_t protection_map[16] = { [VM_SHARED | VM_READ] = PAGE_READONLY, [VM_SHARED | VM_WRITE] = PAGE_SHARED, [VM_SHARED | VM_WRITE | VM_READ] = PAGE_SHARED, - [VM_SHARED | VM_EXEC] = PAGE_READONLY_X, + [VM_SHARED | VM_EXEC] = PAGE_EXECONLY_X, [VM_SHARED | VM_EXEC | VM_READ] = PAGE_READONLY_X, [VM_SHARED | VM_EXEC | VM_WRITE] = PAGE_SHARED_X, [VM_SHARED | VM_EXEC | VM_WRITE | VM_READ] = PAGE_SHARED_X From ff7a60ab1e065257a0e467c13b519f4debcd7fcf Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 11 Oct 2023 16:37:00 +1100 Subject: [PATCH 37/76] powerpc/xive: Fix endian conversion size Sparse reports a size mismatch in the endian swap. The Opal implementation[1] passes the value as a __be64, and the receiving variable out_qsize is a u64, so the use of be32_to_cpu() appears to be an error. [1]: https://github.com/open-power/skiboot/blob/80e2b1dc73/hw/xive.c#L3854 Fixes: 88ec6b93c8e7 ("powerpc/xive: add OPAL extensions for the XIVE native exploitation support") Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20231011053711.93427-2-bgray@linux.ibm.com --- arch/powerpc/sysdev/xive/native.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/xive/native.c b/arch/powerpc/sysdev/xive/native.c index 9f0af4d795d88..f1c0fa6ece21d 100644 --- a/arch/powerpc/sysdev/xive/native.c +++ b/arch/powerpc/sysdev/xive/native.c @@ -802,7 +802,7 @@ int xive_native_get_queue_info(u32 vp_id, u32 prio, if (out_qpage) *out_qpage = be64_to_cpu(qpage); if (out_qsize) - *out_qsize = be32_to_cpu(qsize); + *out_qsize = be64_to_cpu(qsize); if (out_qeoi_page) *out_qeoi_page = be64_to_cpu(qeoi_page); if (out_escalate_irq) From 340a60e3725b9a229eaf03a9b3f8538f22f6ac16 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 11 Oct 2023 16:37:02 +1100 Subject: [PATCH 38/76] powerpc: Explicitly reverse bytes when checking for byte reversal Sparse reports an invalid endian cast here. The code is written for big endian platforms, so le32_to_cpu() acts as a byte reversal. This file is checked by sparse on a little endian build though, so replace the reverse function with the dedicated swab32() function to better express the intent of the code. 
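The difference only matters on little endian builds, where le32_to_cpu() is an identity. A user-space sketch of the two semantics, using the compiler builtin as a stand-in for swab32():

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint32_t v = 0x00ff00ff;

          /* swab32() semantics: unconditional byte reversal on any host */
          printf("%08x\n", __builtin_bswap32(v)); /* ff00ff00 everywhere */

          /* le32_to_cpu() semantics: identity on LE, reversal on BE */
          printf("%08x\n", v);                    /* what an LE host computes */

          return 0;
  }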
Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20231011053711.93427-4-bgray@linux.ibm.com --- arch/powerpc/sysdev/mpic.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/sysdev/mpic.c b/arch/powerpc/sysdev/mpic.c index ba287abcb008b..dabbdd356664c 100644 --- a/arch/powerpc/sysdev/mpic.c +++ b/arch/powerpc/sysdev/mpic.c @@ -355,7 +355,7 @@ static void __init mpic_test_broken_ipi(struct mpic *mpic) mpic_write(mpic->gregs, MPIC_INFO(GREG_IPI_VECTOR_PRI_0), MPIC_VECPRI_MASK); r = mpic_read(mpic->gregs, MPIC_INFO(GREG_IPI_VECTOR_PRI_0)); - if (r == le32_to_cpu(MPIC_VECPRI_MASK)) { + if (r == swab32(MPIC_VECPRI_MASK)) { printk(KERN_INFO "mpic: Detected reversed IPI registers\n"); mpic->flags |= MPIC_BROKEN_IPI; } From ddfb7d9db843fd4fbdff81fb8a8743f865c3dd96 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 11 Oct 2023 16:37:03 +1100 Subject: [PATCH 39/76] powerpc: Use NULL instead of 0 for null pointers Sparse reports several uses of 0 for pointer arguments and comparisons. Replace with NULL to better convey the intent. Where the 0 appears in a comparison, remove the comparison entirely, following the kernel style of implicit boolean conversions. Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20231011053711.93427-5-bgray@linux.ibm.com --- arch/powerpc/kernel/setup_64.c | 2 +- arch/powerpc/kvm/book3s_xive_native.c | 2 +- arch/powerpc/net/bpf_jit_comp.c | 4 ++-- arch/powerpc/perf/imc-pmu.c | 2 +- arch/powerpc/platforms/4xx/soc.c | 2 +- arch/powerpc/platforms/pseries/lpar.c | 8 ++++---- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/setup_64.c b/arch/powerpc/kernel/setup_64.c index 246201d0d879e..2f19d5e944852 100644 --- a/arch/powerpc/kernel/setup_64.c +++ b/arch/powerpc/kernel/setup_64.c @@ -364,7 +364,7 @@ void __init early_setup(unsigned long dt_ptr) */ initialise_paca(&boot_paca, 0); fixup_boot_paca(&boot_paca); - WARN_ON(local_paca != 0); + WARN_ON(local_paca); setup_paca(&boot_paca); /* install the paca into registers */ /* -------- printk is now safe to use ------- */ diff --git a/arch/powerpc/kvm/book3s_xive_native.c b/arch/powerpc/kvm/book3s_xive_native.c index 712ab91ced398..6e2ebbd8aaace 100644 --- a/arch/powerpc/kvm/book3s_xive_native.c +++ b/arch/powerpc/kvm/book3s_xive_native.c @@ -567,7 +567,7 @@ static int kvmppc_xive_native_set_queue_config(struct kvmppc_xive *xive, u8 priority; struct kvm_ppc_xive_eq kvm_eq; int rc; - __be32 *qaddr = 0; + __be32 *qaddr = NULL; struct page *page; struct xive_q *q; gfn_t gfn; diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 37043dfc1addd..471e37cf8e2a4 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -146,9 +146,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) * update ctgtx.idx as it pretends to output instructions, then we can * calculate total size from idx.
*/ - bpf_jit_build_prologue(0, &cgctx); + bpf_jit_build_prologue(NULL, &cgctx); addrs[fp->len] = cgctx.idx * 4; - bpf_jit_build_epilogue(0, &cgctx); + bpf_jit_build_epilogue(NULL, &cgctx); fixup_len = fp->aux->num_exentries * BPF_FIXUP_LEN * 4; extable_len = fp->aux->num_exentries * sizeof(struct exception_table_entry); diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index 9d229ef7f86ef..eba2baabccb03 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -544,7 +544,7 @@ static int nest_imc_event_init(struct perf_event *event) break; } pcni++; - } while (pcni->vbase != 0); + } while (pcni->vbase); if (!flag) return -ENODEV; diff --git a/arch/powerpc/platforms/4xx/soc.c b/arch/powerpc/platforms/4xx/soc.c index b2d940437a662..5412e6b21e106 100644 --- a/arch/powerpc/platforms/4xx/soc.c +++ b/arch/powerpc/platforms/4xx/soc.c @@ -112,7 +112,7 @@ static int __init ppc4xx_l2c_probe(void) } /* Install error handler */ - if (request_irq(irq, l2c_error_handler, 0, "L2C", 0) < 0) { + if (request_irq(irq, l2c_error_handler, 0, "L2C", NULL) < 0) { printk(KERN_ERR "Cannot install L2C error handler" ", cache is not enabled\n"); of_node_put(np); diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c index f2cb62148f36f..b4e86b7ea584d 100644 --- a/arch/powerpc/platforms/pseries/lpar.c +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -192,9 +192,9 @@ static void free_dtl_buffers(unsigned long *time_limit) continue; kmem_cache_free(dtl_cache, pp->dispatch_log); pp->dtl_ridx = 0; - pp->dispatch_log = 0; - pp->dispatch_log_end = 0; - pp->dtl_curr = 0; + pp->dispatch_log = NULL; + pp->dispatch_log_end = NULL; + pp->dtl_curr = NULL; if (time_limit && time_after(jiffies, *time_limit)) { cond_resched(); @@ -223,7 +223,7 @@ static void destroy_cpu_associativity(void) { kfree(vcpu_associativity); kfree(pcpu_associativity); - vcpu_associativity = pcpu_associativity = 0; + vcpu_associativity = pcpu_associativity = NULL; } static __be32 *__get_cpu_associativity(int cpu, __be32 *cpu_assoc, int flag) From 419d5d112c2e1e78beda9c3299f71c35141d8dba Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 11 Oct 2023 16:37:04 +1100 Subject: [PATCH 40/76] powerpc: Remove extern from function implementations Sparse reports several function implementations annotated with extern. This is clearly incorrect, likely just copied from an actual extern declaration in another file. Fix the sparse warnings by removing extern. 
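The convention being restored is the standard C split between declaration and definition. A generic sketch (the function name is illustrative, not taken from the kernel):

  /* header: extern on the declaration is conventional (and implicit) */
  extern long frob(unsigned long entry);

  /* C file: the definition itself carries no extern */
  long frob(unsigned long entry)
  {
          return entry + 1;
  }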
Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20231011053711.93427-6-bgray@linux.ibm.com --- arch/powerpc/kernel/iommu.c | 8 ++++---- arch/powerpc/kernel/traps.c | 4 ++-- arch/powerpc/kvm/book3s_64_vio.c | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c index 14251bc5219eb..3e28579f7c625 100644 --- a/arch/powerpc/kernel/iommu.c +++ b/arch/powerpc/kernel/iommu.c @@ -1074,10 +1074,10 @@ int iommu_tce_check_gpa(unsigned long page_shift, unsigned long gpa) } EXPORT_SYMBOL_GPL(iommu_tce_check_gpa); -extern long iommu_tce_xchg_no_kill(struct mm_struct *mm, - struct iommu_table *tbl, - unsigned long entry, unsigned long *hpa, - enum dma_data_direction *direction) +long iommu_tce_xchg_no_kill(struct mm_struct *mm, + struct iommu_table *tbl, + unsigned long entry, unsigned long *hpa, + enum dma_data_direction *direction) { long ret; unsigned long size = 0; diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index fe3f720c9cd61..5ea2014aff90d 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -157,7 +157,7 @@ static int die_owner = -1; static unsigned int die_nest_count; static int die_counter; -extern void panic_flush_kmsg_start(void) +void panic_flush_kmsg_start(void) { /* * These are mostly taken from kernel/panic.c, but tries to do @@ -170,7 +170,7 @@ extern void panic_flush_kmsg_start(void) bust_spinlocks(1); } -extern void panic_flush_kmsg_end(void) +void panic_flush_kmsg_end(void) { kmsg_dump(KMSG_DUMP_PANIC); bust_spinlocks(0); diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 93b695b289e99..15200d766fc53 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -77,8 +77,8 @@ static void kvm_spapr_tce_liobn_put(struct kref *kref) call_rcu(&stit->rcu, kvm_spapr_tce_iommu_table_free); } -extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, - struct iommu_group *grp) +void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, + struct iommu_group *grp) { int i; struct kvmppc_spapr_tce_table *stt; @@ -105,8 +105,8 @@ extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, rcu_read_unlock(); } -extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, - struct iommu_group *grp) +long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, + struct iommu_group *grp) { struct kvmppc_spapr_tce_table *stt = NULL; bool found = false; From 2b4a6cc9a1a7cf6958c8b11f94e61c8e81b60b88 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 11 Oct 2023 16:37:05 +1100 Subject: [PATCH 41/76] powerpc: Annotate endianness of various variables and functions Sparse reports several endianness warnings on variables and functions that are consistently treated as big endian. There are no multi-endianness shenanigans going on here so fix these low hanging fruit up in one patch. All changes are just type annotations; no endianness switching operations are introduced by this patch. 
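The pattern sparse enforces is that an annotated value only changes type through an explicit conversion. A minimal kernel-style sketch (the struct and field names are illustrative, not from this series):

  #include <linux/types.h>
  #include <asm/byteorder.h>

  struct fw_record {
          __be64 timestamp;       /* always stored big-endian by firmware */
  };

  static inline u64 fw_record_timestamp(const struct fw_record *r)
  {
          /* sparse warns if this conversion is missing or mismatched */
          return be64_to_cpu(r->timestamp);
  }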
Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20231011053711.93427-7-bgray@linux.ibm.com --- arch/powerpc/include/asm/book3s/64/pgtable.h | 2 +- arch/powerpc/include/asm/imc-pmu.h | 16 ++++++++-------- arch/powerpc/kernel/prom_init.c | 2 +- arch/powerpc/kexec/core_64.c | 4 ++-- arch/powerpc/kexec/file_load_64.c | 6 +++--- arch/powerpc/mm/drmem.c | 2 +- arch/powerpc/perf/hv-24x7.c | 2 +- arch/powerpc/perf/imc-pmu.c | 9 +++++---- arch/powerpc/platforms/powermac/feature.c | 3 ++- arch/powerpc/platforms/pseries/hotplug-memory.c | 3 ++- 10 files changed, 26 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 751b01227e364..cb77eddca54b9 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -626,7 +626,7 @@ static inline pte_t pte_mkdevmap(pte_t pte) */ static inline int pte_devmap(pte_t pte) { - u64 mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE); + __be64 mask = cpu_to_be64(_PAGE_DEVMAP | _PAGE_PTE); return (pte_raw(pte) & mask) == mask; } diff --git a/arch/powerpc/include/asm/imc-pmu.h b/arch/powerpc/include/asm/imc-pmu.h index 699a88584ae16..a656635df3861 100644 --- a/arch/powerpc/include/asm/imc-pmu.h +++ b/arch/powerpc/include/asm/imc-pmu.h @@ -74,14 +74,14 @@ struct imc_events { * The following is the data structure to hold trace imc data. */ struct trace_imc_data { - u64 tb1; - u64 ip; - u64 val; - u64 cpmc1; - u64 cpmc2; - u64 cpmc3; - u64 cpmc4; - u64 tb2; + __be64 tb1; + __be64 ip; + __be64 val; + __be64 cpmc1; + __be64 cpmc2; + __be64 cpmc3; + __be64 cpmc4; + __be64 tb2; }; /* Event attribute array index */ diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index d464ba412084d..e67effdba85cc 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -947,7 +947,7 @@ struct option_vector7 { } __packed; struct ibm_arch_vec { - struct { u32 mask, val; } pvrs[14]; + struct { __be32 mask, val; } pvrs[14]; u8 num_vectors; diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index a79e28c91e2be..0bee7ca9a77c6 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -379,8 +379,8 @@ void default_machine_kexec(struct kimage *image) #ifdef CONFIG_PPC_64S_HASH_MMU /* Values we need to export to the second kernel via the device tree. */ -static unsigned long htab_base; -static unsigned long htab_size; +static __be64 htab_base; +static __be64 htab_size; static struct property htab_base_prop = { .name = "linux,htab-base", diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 09187aca3d1f8..961a6dd673656 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -32,7 +32,7 @@ #include struct umem_info { - u64 *buf; /* data buffer for usable-memory property */ + __be64 *buf; /* data buffer for usable-memory property */ u32 size; /* size allocated for the data buffer */ u32 max_entries; /* maximum no. of entries */ u32 idx; /* index of current entry */ @@ -443,10 +443,10 @@ static int locate_mem_hole_bottom_up_ppc64(struct kexec_buf *kbuf, * * Returns buffer on success, NULL on error. 
*/ -static u64 *check_realloc_usable_mem(struct umem_info *um_info, int cnt) +static __be64 *check_realloc_usable_mem(struct umem_info *um_info, int cnt) { u32 new_size; - u64 *tbuf; + __be64 *tbuf; if ((um_info->idx + cnt) <= um_info->max_entries) return um_info->buf; diff --git a/arch/powerpc/mm/drmem.c b/arch/powerpc/mm/drmem.c index 2369d1bf2411e..fde7790277f75 100644 --- a/arch/powerpc/mm/drmem.c +++ b/arch/powerpc/mm/drmem.c @@ -67,7 +67,7 @@ static int drmem_update_dt_v1(struct device_node *memory, struct property *new_prop; struct of_drconf_cell_v1 *dr_cell; struct drmem_lmb *lmb; - u32 *p; + __be32 *p; new_prop = clone_property(prop, prop->length); if (!new_prop) diff --git a/arch/powerpc/perf/hv-24x7.c b/arch/powerpc/perf/hv-24x7.c index 3449be7c0d51f..057ec2e3451d1 100644 --- a/arch/powerpc/perf/hv-24x7.c +++ b/arch/powerpc/perf/hv-24x7.c @@ -1338,7 +1338,7 @@ static int get_count_from_result(struct perf_event *event, for (i = count = 0, element_data = res->elements + data_offset; i < num_elements; i++, element_data += data_size + data_offset) - count += be64_to_cpu(*((u64 *) element_data)); + count += be64_to_cpu(*((__be64 *)element_data)); *countp = count; diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c index eba2baabccb03..2016aee270379 100644 --- a/arch/powerpc/perf/imc-pmu.c +++ b/arch/powerpc/perf/imc-pmu.c @@ -1025,16 +1025,16 @@ static bool is_thread_imc_pmu(struct perf_event *event) return false; } -static u64 * get_event_base_addr(struct perf_event *event) +static __be64 *get_event_base_addr(struct perf_event *event) { u64 addr; if (is_thread_imc_pmu(event)) { addr = (u64)per_cpu(thread_imc_mem, smp_processor_id()); - return (u64 *)(addr + (event->attr.config & IMC_EVENT_OFFSET_MASK)); + return (__be64 *)(addr + (event->attr.config & IMC_EVENT_OFFSET_MASK)); } - return (u64 *)event->hw.event_base; + return (__be64 *)event->hw.event_base; } static void thread_imc_pmu_start_txn(struct pmu *pmu, @@ -1058,7 +1058,8 @@ static int thread_imc_pmu_commit_txn(struct pmu *pmu) static u64 imc_read_counter(struct perf_event *event) { - u64 *addr, data; + __be64 *addr; + u64 data; /* * In-Memory Collection (IMC) counters are free flowing counters. diff --git a/arch/powerpc/platforms/powermac/feature.c b/arch/powerpc/platforms/powermac/feature.c index ae62d432db8bb..81c9fbae88b14 100644 --- a/arch/powerpc/platforms/powermac/feature.c +++ b/arch/powerpc/platforms/powermac/feature.c @@ -2614,7 +2614,8 @@ static void __init probe_one_macio(const char *name, const char *compat, int typ struct device_node* node; int i; volatile u32 __iomem *base; - const u32 *addrp, *revp; + const __be32 *addrp; + const u32 *revp; phys_addr_t addr; u64 size; diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index aa4042dcd6d40..a43bfb01720ae 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -55,7 +55,8 @@ static bool find_aa_index(struct device_node *dr_node, struct property *ala_prop, const u32 *lmb_assoc, u32 *aa_index) { - u32 *assoc_arrays, new_prop_size; + __be32 *assoc_arrays; + u32 new_prop_size; struct property *new_prop; int aa_arrays, aa_array_entries, aa_array_sz; int i, index; From 8577dd00a6ba335bc359313599d6100522a1931c Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 11 Oct 2023 16:37:07 +1100 Subject: [PATCH 42/76] powerpc/opal: Annotate out param endianness Sparse reports an endian mismatch with args to opal_int_get_xirr(). 
Checking the skiboot source[1] shows the function takes a __be32* (as expected), so update the function declaration to reflect this. [1]: https://github.com/open-power/skiboot/blob/80e2b1dc73/hw/xive.c#L3479 Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20231011053711.93427-9-bgray@linux.ibm.com --- arch/powerpc/include/asm/opal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index a9b31cc258fcb..b66b0c615f4f1 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -227,7 +227,7 @@ int64_t opal_pci_set_power_state(uint64_t async_token, uint64_t id, uint64_t data); int64_t opal_pci_poll2(uint64_t id, uint64_t data); -int64_t opal_int_get_xirr(uint32_t *out_xirr, bool just_poll); +int64_t opal_int_get_xirr(__be32 *out_xirr, bool just_poll); int64_t opal_int_set_cppr(uint8_t cppr); int64_t opal_int_eoi(uint32_t xirr); int64_t opal_int_set_mfrr(uint32_t cpu, uint8_t mfrr); From c6519c6df0722e432f330afbc7c00d16d5be5c58 Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 11 Oct 2023 16:37:08 +1100 Subject: [PATCH 43/76] powerpc/uaccess: Cast away __user annotation after verification Sparse reports dereference of a __user pointer. copy_mc_to_user() takes a __user pointer, verifies it, then calls the generic copy routine copy_mc_generic(). As we have verified the pointer, cast out the __user annotation when passing to copy_mc_generic(). Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20231011053711.93427-10-bgray@linux.ibm.com --- arch/powerpc/include/asm/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index fb725ec77926e..f1f9890f50d3e 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -374,7 +374,7 @@ copy_mc_to_user(void __user *to, const void *from, unsigned long n) if (check_copy_size(from, n, true)) { if (access_ok(to, n)) { allow_write_to_user(to, n); - n = copy_mc_generic((void *)to, from, n); + n = copy_mc_generic((void __force *)to, from, n); prevent_write_to_user(to, n); } } From 2c4ce3e65b1a543123ffcec4b021ad6ebd4e4e4e Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 11 Oct 2023 16:37:09 +1100 Subject: [PATCH 44/76] powerpc: Cast away __iomem in low level IO routines Sparse reports dereferencing an __iomem pointer. These routines are clearly low level handlers for IO memory, so force cast away the __iomem annotation to tell sparse the dereferences are safe. 
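The same idiom applies anywhere a routine legitimately strips an address space annotation after taking responsibility for it. A condensed sketch of the pattern used in the diff below:

  /* __force documents an intentional address-space cast to sparse */
  static inline u8 mmio_read8(const volatile u8 __iomem *port)
  {
          return *(const volatile u8 __force *)port;
  }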
Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20231011053711.93427-11-bgray@linux.ibm.com --- arch/powerpc/kernel/io.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/kernel/io.c b/arch/powerpc/kernel/io.c index 2f29b7d432de3..6af535905984a 100644 --- a/arch/powerpc/kernel/io.c +++ b/arch/powerpc/kernel/io.c @@ -33,7 +33,7 @@ void _insb(const volatile u8 __iomem *port, void *buf, long count) return; asm volatile("sync"); do { - tmp = *port; + tmp = *(const volatile u8 __force *)port; eieio(); *tbuf++ = tmp; } while (--count != 0); @@ -49,7 +49,7 @@ void _outsb(volatile u8 __iomem *port, const void *buf, long count) return; asm volatile("sync"); do { - *port = *tbuf++; + *(volatile u8 __force *)port = *tbuf++; } while (--count != 0); asm volatile("sync"); } @@ -64,7 +64,7 @@ void _insw_ns(const volatile u16 __iomem *port, void *buf, long count) return; asm volatile("sync"); do { - tmp = *port; + tmp = *(const volatile u16 __force *)port; eieio(); *tbuf++ = tmp; } while (--count != 0); @@ -80,7 +80,7 @@ void _outsw_ns(volatile u16 __iomem *port, const void *buf, long count) return; asm volatile("sync"); do { - *port = *tbuf++; + *(volatile u16 __force *)port = *tbuf++; } while (--count != 0); asm volatile("sync"); } @@ -95,7 +95,7 @@ void _insl_ns(const volatile u32 __iomem *port, void *buf, long count) return; asm volatile("sync"); do { - tmp = *port; + tmp = *(const volatile u32 __force *)port; eieio(); *tbuf++ = tmp; } while (--count != 0); @@ -111,7 +111,7 @@ void _outsl_ns(volatile u32 __iomem *port, const void *buf, long count) return; asm volatile("sync"); do { - *port = *tbuf++; + *(volatile u32 __force *)port = *tbuf++; } while (--count != 0); asm volatile("sync"); } From 82f635243f209a85d3deb9f64439c3ea84cd4ecb Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 11 Oct 2023 16:37:10 +1100 Subject: [PATCH 45/76] powerpc/eeh: Remove unnecessary cast Sparse reports a warning when casting to an int. There is no need to cast in the first place, so drop them. Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20231011053711.93427-12-bgray@linux.ibm.com --- arch/powerpc/kernel/eeh_driver.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/eeh_driver.c b/arch/powerpc/kernel/eeh_driver.c index 438568a472d03..48773d2d9be3c 100644 --- a/arch/powerpc/kernel/eeh_driver.c +++ b/arch/powerpc/kernel/eeh_driver.c @@ -39,7 +39,7 @@ static int eeh_result_priority(enum pci_ers_result result) case PCI_ERS_RESULT_NEED_RESET: return 6; default: - WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", (int)result); + WARN_ONCE(1, "Unknown pci_ers_result value: %d\n", result); return 0; } }; @@ -60,7 +60,7 @@ static const char *pci_ers_result_name(enum pci_ers_result result) case PCI_ERS_RESULT_NO_AER_DRIVER: return "no AER driver"; default: - WARN_ONCE(1, "Unknown result type: %d\n", (int)result); + WARN_ONCE(1, "Unknown result type: %d\n", result); return "unknown"; } }; From b574b817cc7bfcc87bbc92b4b19525442401ae5e Mon Sep 17 00:00:00 2001 From: Benjamin Gray Date: Wed, 11 Oct 2023 16:37:11 +1100 Subject: [PATCH 46/76] powerpc/fadump: Annotate endianness cast with __force Sparse reports an endianness error with the else case of val = (cpu_endian ? 
be64_to_cpu(reg_entry->reg_val) : (u64)(reg_entry->reg_val)); This is a safe operation because the code is explicitly working with dynamic endianness, so add the __force annotation to tell Sparse to ignore it. Signed-off-by: Benjamin Gray Signed-off-by: Michael Ellerman Link: https://msgid.link/20231011053711.93427-13-bgray@linux.ibm.com --- arch/powerpc/platforms/powernv/opal-fadump.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powernv/opal-fadump.h b/arch/powerpc/platforms/powernv/opal-fadump.h index 3f715efb0aa6e..5eeb794b5eb1e 100644 --- a/arch/powerpc/platforms/powernv/opal-fadump.h +++ b/arch/powerpc/platforms/powernv/opal-fadump.h @@ -135,7 +135,7 @@ static inline void opal_fadump_read_regs(char *bufp, unsigned int regs_cnt, for (i = 0; i < regs_cnt; i++, bufp += reg_entry_size) { reg_entry = (struct hdat_fadump_reg_entry *)bufp; val = (cpu_endian ? be64_to_cpu(reg_entry->reg_val) : - (u64)(reg_entry->reg_val)); + (u64 __force)(reg_entry->reg_val)); opal_fadump_set_regval_regnum(regs, be32_to_cpu(reg_entry->reg_type), be32_to_cpu(reg_entry->reg_num), From 6db51ff905362e6c79593dbda08f61f24b25971c Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 5 Jan 2023 14:41:45 +0800 Subject: [PATCH 47/76] macintosh/macio-adb: add missing iounmap() on error in macio_init() Add missing iounmap(), if request_irq() fails. Signed-off-by: Yang Yingliang Signed-off-by: Michael Ellerman Link: https://msgid.link/20230105064145.3879356-1-yangyingliang@huawei.com --- drivers/macintosh/macio-adb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/macintosh/macio-adb.c b/drivers/macintosh/macio-adb.c index 55a9f8c3a150e..779f1268286eb 100644 --- a/drivers/macintosh/macio-adb.c +++ b/drivers/macintosh/macio-adb.c @@ -123,6 +123,7 @@ int macio_init(void) irq = irq_of_parse_and_map(adbs, 0); of_node_put(adbs); if (request_irq(irq, macio_adb_interrupt, 0, "ADB", (void *)0)) { + iounmap(adb); printk(KERN_ERR "ADB: can't get irq %d\n", irq); return -EAGAIN; } From b28d1ccf921a4333be14017d82066386d419e638 Mon Sep 17 00:00:00 2001 From: Stanislav Kinsburskii Date: Sat, 15 Apr 2023 04:17:53 -0700 Subject: [PATCH 48/76] powerpc/io: Expect immutable pointer in virt_to_phys() prototype MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit virt_to_phys() doesn't need the address pointer to be mutable. 
At the same time allowing it to be mutable leads to the following build warning for constant pointers: warning: passing argument 1 of ‘virt_to_phys’ discards ‘const’ qualifier from pointer target type Signed-off-by: Stanislav Kinsburskii Reviewed-by: Linus Walleij Signed-off-by: Michael Ellerman Link: https://msgid.link/168155747391.13678.10634415747614468991.stgit@skinsburskii.localdomain --- arch/powerpc/include/asm/io.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 0732b743e0996..5220274a62772 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -950,7 +950,7 @@ extern void __iomem *__ioremap_caller(phys_addr_t, unsigned long size, * almost all conceivable cases a device driver should not be using * this function */ -static inline unsigned long virt_to_phys(volatile void * address) +static inline unsigned long virt_to_phys(const volatile void * address) { WARN_ON(IS_ENABLED(CONFIG_DEBUG_VIRTUAL) && !virt_addr_valid(address)); From d45c4b48dafb5820e5cc267ff9a6d7784d13a43c Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 24 Aug 2023 16:42:10 +1000 Subject: [PATCH 49/76] powerpc: Hide empty pt_regs at base of the stack A thread started via eg. user_mode_thread() runs in the kernel to begin with and then may later return to userspace. While it's running in the kernel it has a pt_regs at the base of its kernel stack, but that pt_regs is all zeroes. If the thread oopses in that state, it leads to an ugly stack trace with a big block of zero GPRs, as reported by Joel: Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(0,0) CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.5.0-rc7-00004-gf7757129e3de-dirty #3 Hardware name: IBM PowerNV (emulated by qemu) POWER9 0x4e1200 opal:v7.0 PowerNV Call Trace: [c0000000036afb00] [c0000000010dd058] dump_stack_lvl+0x6c/0x9c (unreliable) [c0000000036afb30] [c00000000013c524] panic+0x178/0x424 [c0000000036afbd0] [c000000002005100] mount_root_generic+0x250/0x324 [c0000000036afca0] [c0000000020057d0] prepare_namespace+0x2d4/0x344 [c0000000036afd20] [c0000000020049c0] kernel_init_freeable+0x358/0x3ac [c0000000036afdf0] [c0000000000111b0] kernel_init+0x30/0x1a0 [c0000000036afe50] [c00000000000debc] ret_from_kernel_user_thread+0x14/0x1c --- interrupt: 0 at 0x0 NIP: 0000000000000000 LR: 0000000000000000 CTR: 0000000000000000 REGS: c0000000036afe80 TRAP: 0000 Not tainted (6.5.0-rc7-00004-gf7757129e3de-dirty) MSR: 0000000000000000 <> CR: 00000000 XER: 00000000 CFAR: 0000000000000000 IRQMASK: 0 GPR00: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR04: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR08: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR12: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR16: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR20: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR24: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 GPR28: 0000000000000000 0000000000000000 0000000000000000 0000000000000000 NIP [0000000000000000] 0x0 LR [0000000000000000] 0x0 --- interrupt: 0 The all-zero pt_regs looks ugly and conveys no useful information, other than its presence. 
So detect that case and just show the presence of the frame by printing the interrupt marker, eg: Kernel panic - not syncing: VFS: Unable to mount root fs on unknown-block(0,0) CPU: 0 PID: 1 Comm: swapper/0 Not tainted 6.5.0-rc3-00126-g18e9506562a0-dirty #301 Hardware name: IBM pSeries (emulated by qemu) POWER9 (raw) 0x4e1202 0xf000005 of:SLOF,HEAD hv:linux,kvm pSeries Call Trace: [c000000003aabb00] [c000000001143db8] dump_stack_lvl+0x6c/0x9c (unreliable) [c000000003aabb30] [c00000000014c624] panic+0x178/0x424 [c000000003aabbd0] [c0000000020050fc] mount_root_generic+0x250/0x324 [c000000003aabca0] [c0000000020057cc] prepare_namespace+0x2d4/0x344 [c000000003aabd20] [c0000000020049bc] kernel_init_freeable+0x358/0x3ac [c000000003aabdf0] [c0000000000111b0] kernel_init+0x30/0x1a0 [c000000003aabe50] [c00000000000debc] ret_from_kernel_user_thread+0x14/0x1c --- interrupt: 0 at 0x0 To avoid ever suppressing a valid pt_regs make sure the pt_regs has a zero MSR and TRAP value, and is located at the very base of the stack. Fixes: 6895dfc04741 ("powerpc: copy_thread fill in interrupt frame marker and back chain") Reported-by: Joel Stanley Reported-by: Nicholas Piggin Reviewed-by: Joel Stanley Signed-off-by: Michael Ellerman Link: https://msgid.link/20230824064210.907266-1-mpe@ellerman.id.au --- arch/powerpc/kernel/process.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index b68898ac07e19..392404688cec3 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -2258,6 +2258,22 @@ unsigned long __get_wchan(struct task_struct *p) return ret; } +static bool empty_user_regs(struct pt_regs *regs, struct task_struct *tsk) +{ + unsigned long stack_page; + + // A non-empty pt_regs should never have a zero MSR or TRAP value. + if (regs->msr || regs->trap) + return false; + + // Check it sits at the very base of the stack + stack_page = (unsigned long)task_stack_page(tsk); + if ((unsigned long)(regs + 1) != stack_page + THREAD_SIZE) + return false; + + return true; +} + static int kstack_depth_to_print = CONFIG_PRINT_STACK_DEPTH; void __no_sanitize_address show_stack(struct task_struct *tsk, @@ -2322,9 +2338,13 @@ void __no_sanitize_address show_stack(struct task_struct *tsk, lr = regs->link; printk("%s--- interrupt: %lx at %pS\n", loglvl, regs->trap, (void *)regs->nip); - __show_regs(regs); - printk("%s--- interrupt: %lx\n", - loglvl, regs->trap); + + // Detect the case of an empty pt_regs at the very base + // of the stack and suppress showing it in full. + if (!empty_user_regs(regs, tsk)) { + __show_regs(regs); + printk("%s--- interrupt: %lx\n", loglvl, regs->trap); + } firstframe = 1; } From e08c43e6c3eb5d805b61d981f1e8286ee0dc6d1a Mon Sep 17 00:00:00 2001 From: Kuan-Wei Chiu Date: Sat, 14 Oct 2023 01:57:14 +0800 Subject: [PATCH 50/76] powerpc/perf: Optimize find_alternatives_list() using binary search This patch improves the performance of event alternative lookup by replacing the previous linear search with a more efficient binary search. This change reduces the time complexity for the search process from O(n) to O(log(n)). A pre-sorted table of event values and their corresponding indices has been introduced to expedite the search process. 
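The replacement below is a textbook binary search over the presorted key table, with each matching entry's original index kept in a parallel table. Its generic shape, independent of the PMU data (sketch):

  /* keys[] must be sorted ascending; returns the hit position or -1 */
  static int bsearch_pos(const unsigned int *keys, int n, unsigned int key)
  {
          int lo = 0, hi = n - 1;

          while (lo <= hi) {
                  int mid = lo + (hi - lo) / 2;   /* avoids (lo + hi) overflow */

                  if (keys[mid] < key)
                          lo = mid + 1;
                  else if (keys[mid] > key)
                          hi = mid - 1;
                  else
                          return mid;
          }
          return -1;
  }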
Signed-off-by: Kuan-Wei Chiu
[mpe: Call the array "presort*ed*_event_table", minor formatting]
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20231013175714.2142775-1-visitorckw@gmail.com
---
 arch/powerpc/perf/power6-pmu.c | 46 +++++++++++++++++++++-------------
 1 file changed, 29 insertions(+), 17 deletions(-)

diff --git a/arch/powerpc/perf/power6-pmu.c b/arch/powerpc/perf/power6-pmu.c
index 5729b6e059de0..9f720b522e171 100644
--- a/arch/powerpc/perf/power6-pmu.c
+++ b/arch/powerpc/perf/power6-pmu.c
@@ -335,26 +335,38 @@ static const unsigned int event_alternatives[][MAX_ALT] = {
 	{ 0x3000fe, 0x400056 },	/* PM_DATA_FROM_L3MISS */
 };
 
-/*
- * This could be made more efficient with a binary search on
- * a presorted list, if necessary
- */
 static int find_alternatives_list(u64 event)
 {
-	int i, j;
-	unsigned int alt;
-
-	for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) {
-		if (event < event_alternatives[i][0])
-			return -1;
-		for (j = 0; j < MAX_ALT; ++j) {
-			alt = event_alternatives[i][j];
-			if (!alt || event < alt)
-				break;
-			if (event == alt)
-				return i;
-		}
+	const unsigned int presorted_event_table[] = {
+		0x0130e8, 0x080080, 0x080088, 0x10000a, 0x10000b, 0x10000d, 0x10000e,
+		0x100010, 0x10001a, 0x100026, 0x100054, 0x100056, 0x1000f0, 0x1000f8,
+		0x1000fc, 0x200008, 0x20000e, 0x200010, 0x200012, 0x200054, 0x2000f0,
+		0x2000f2, 0x2000f4, 0x2000f5, 0x2000f6, 0x2000f8, 0x2000fc, 0x2000fe,
+		0x2d0030, 0x30000a, 0x30000c, 0x300010, 0x300012, 0x30001a, 0x300056,
+		0x3000f0, 0x3000f2, 0x3000f6, 0x3000f8, 0x3000fc, 0x3000fe, 0x400006,
+		0x400007, 0x40000a, 0x40000e, 0x400010, 0x400018, 0x400056, 0x4000f0,
+		0x4000f8, 0x600005
+	};
+	const unsigned int event_index_table[] = {
+		0, 1, 2, 3, 4, 1, 5, 6, 7, 8, 9, 10, 11, 12, 13, 12, 14,
+		7, 15, 2, 9, 16, 3, 4, 0, 17, 10, 18, 19, 20, 1, 17, 15, 19,
+		18, 2, 16, 21, 8, 0, 22, 13, 14, 11, 21, 5, 20, 22, 1, 6, 3
+	};
+	int hi = ARRAY_SIZE(presorted_event_table) - 1;
+	int lo = 0;
+
+	while (lo <= hi) {
+		int mid = lo + (hi - lo) / 2;
+		unsigned int alt = presorted_event_table[mid];
+
+		if (alt < event)
+			lo = mid + 1;
+		else if (alt > event)
+			hi = mid - 1;
+		else
+			return event_index_table[mid];
 	}
+	return -1;
 }

From efce8422dd53fae6cdbd37741034ac4eb4730819 Mon Sep 17 00:00:00 2001
From: Srikar Dronamraju
Date: Thu, 19 Oct 2023 14:44:52 +0530
Subject: [PATCH 51/76] powerpc/paravirt: Improve vcpu_is_preempted

PowerVM Hypervisor dispatches on a whole core basis. In a shared LPAR, a
CPU from a core that is CEDED or preempted may have a larger latency. In
such a scenario, it's preferable to choose a different CPU to run.

If one of the CPUs in the core is active, i.e. neither CEDED nor
preempted, then consider this CPU as not preempted. Also if any of the
CPUs in the core has yielded but the OS has not requested CEDE or
CONFER, then consider this CPU to be preempted.

Correct detection of preempted CPUs is important for detecting idle
CPUs/cores in the task scheduler.
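For background, the patch below keys off the yield count's parity: the
hypervisor bumps the count on each dispatch and preemption/cede transition,
so an even value means the vCPU is currently running on a physical
processor. A minimal sketch of that test (a sketch only, assuming
yield_count_of() reads the count from the target CPU's lppaca as the
existing code does):

	/* Sketch: even yield count => vCPU currently dispatched. */
	static inline bool vcpu_running(int cpu)
	{
		return (yield_count_of(cpu) & 1) == 0;
	}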
Tested-by: Aboorva Devarajan
Reviewed-by: Shrikanth Hegde
Signed-off-by: Srikar Dronamraju
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20231019091452.95260-1-srikar@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/paravirt.h | 47 +++++++++++++++++++++++++++--
 1 file changed, 44 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/paravirt.h b/arch/powerpc/include/asm/paravirt.h
index e08513d731193..ac4279208d633 100644
--- a/arch/powerpc/include/asm/paravirt.h
+++ b/arch/powerpc/include/asm/paravirt.h
@@ -71,6 +71,11 @@ static inline void yield_to_any(void)
 {
 	plpar_hcall_norets_notrace(H_CONFER, -1, 0);
 }
+
+static inline bool is_vcpu_idle(int vcpu)
+{
+	return lppaca_of(vcpu).idle;
+}
 #else
 static inline bool is_shared_processor(void)
 {
@@ -100,6 +105,10 @@ static inline void prod_cpu(int cpu)
 	___bad_prod_cpu(); /* This would be a bug */
 }
 
+static inline bool is_vcpu_idle(int vcpu)
+{
+	return false;
+}
 #endif
 
 #define vcpu_is_preempted vcpu_is_preempted
@@ -121,9 +130,23 @@ static inline bool vcpu_is_preempted(int cpu)
 	if (!is_shared_processor())
 		return false;
 
+	/*
+	 * If the hypervisor has dispatched the target CPU on a physical
+	 * processor, then the target CPU is definitely not preempted.
+	 */
+	if (!(yield_count_of(cpu) & 1))
+		return false;
+
+	/*
+	 * If the target CPU has yielded to Hypervisor but OS has not
+	 * requested idle then the target CPU is definitely preempted.
+	 */
+	if (!is_vcpu_idle(cpu))
+		return true;
+
 #ifdef CONFIG_PPC_SPLPAR
 	if (!is_kvm_guest()) {
-		int first_cpu;
+		int first_cpu, i;
 
 		/*
 		 * The result of vcpu_is_preempted() is used in a
@@ -149,11 +172,29 @@ static inline bool vcpu_is_preempted(int cpu)
 		 */
 		if (cpu_first_thread_sibling(cpu) == first_cpu)
 			return false;
+
+		/*
+		 * If any of the threads of the target CPU's core are not
+		 * preempted or ceded, then consider target CPU to be
+		 * non-preempted.
+		 */
+		first_cpu = cpu_first_thread_sibling(cpu);
+		for (i = first_cpu; i < first_cpu + threads_per_core; i++) {
+			if (i == cpu)
+				continue;
+			if (!(yield_count_of(i) & 1))
+				return false;
+			if (!is_vcpu_idle(i))
+				return true;
+		}
 	}
 #endif
 
-	if (yield_count_of(cpu) & 1)
-		return true;
+	/*
+	 * None of the threads in target CPU's core are running but none of
+	 * them were preempted too. Hence assume the target CPU to be
+	 * non-preempted.
+	 */
 	return false;
 }

From 3bf983e4e93ce8e6d69e9d63f52a66ec0856672e Mon Sep 17 00:00:00 2001
From: Gaurav Batra
Date: Mon, 2 Oct 2023 22:08:02 -0500
Subject: [PATCH 52/76] powerpc/pseries/iommu: enable_ddw incorrectly returns
 direct mapping for SR-IOV device

When a device is initialized, the driver invokes dma_supported() twice -
first for streaming mappings followed by coherent mappings. For an
SR-IOV device, the default window is deleted and a DDW created. With
vPMEM enabled, TCE mappings are dynamically created for both vPMEM and
the SR-IOV device. There are no direct mappings.

The first time dma_supported() is called with a 64-bit mask, the DDW is
created and marked as a dynamic window. The second time dma_supported()
is called, enable_ddw() finds the existing window for the device and
incorrectly returns it as a "direct mapping".

This only happens when the size of the DDW is big enough to map the
maximum LPAR memory.

This results in streaming TCEs not getting dynamically mapped, since
the code incorrectly assumes these are already pre-mapped. The adapter
initially comes up but goes down due to EEH.
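For illustration, the two dma_supported() calls typically come from a
driver probe path like the following sketch (generic names, not taken
from this patch):

	/* Both helpers typically land in the platform's dma_supported()
	 * hook: the first call creates the DDW, the second finds it
	 * again via find_existing_ddw(), which must now report the
	 * stored mapping type rather than re-deriving it from the size. */
	rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(64));	/* streaming */
	if (rc)
		return rc;
	rc = dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64));	/* coherent */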
Fixes: 381ceda88c4c ("powerpc/pseries/iommu: Make use of DDW for indirect mapping")
Cc: stable@vger.kernel.org # v5.15+
Signed-off-by: Gaurav Batra
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20231003030802.47914-1-gbatra@linux.vnet.ibm.com
---
 arch/powerpc/platforms/pseries/iommu.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c
index 16d93b580f61f..496e16c588aaa 100644
--- a/arch/powerpc/platforms/pseries/iommu.c
+++ b/arch/powerpc/platforms/pseries/iommu.c
@@ -914,7 +914,8 @@ static int remove_ddw(struct device_node *np, bool remove_prop, const char *win_
 	return 0;
 }
 
-static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift)
+static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *window_shift,
+			      bool *direct_mapping)
 {
 	struct dma_win *window;
 	const struct dynamic_dma_window_prop *dma64;
@@ -927,6 +928,7 @@ static bool find_existing_ddw(struct device_node *pdn, u64 *dma_addr, int *windo
 			dma64 = window->prop;
 			*dma_addr = be64_to_cpu(dma64->dma_base);
 			*window_shift = be32_to_cpu(dma64->window_shift);
+			*direct_mapping = window->direct;
 			found = true;
 			break;
 		}
@@ -1270,10 +1272,8 @@ static bool enable_ddw(struct pci_dev *dev, struct device_node *pdn)
 
 	mutex_lock(&dma_win_init_mutex);
 
-	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len)) {
-		direct_mapping = (len >= max_ram_len);
+	if (find_existing_ddw(pdn, &dev->dev.archdata.dma_offset, &len, &direct_mapping))
 		goto out_unlock;
-	}
 
 	/*
	 * If we already went through this for a previous function of

From 73b25505ce043b561028e5571d84dc82aa53c2b4 Mon Sep 17 00:00:00 2001
From: Haren Myneni
Date: Thu, 19 Oct 2023 14:50:33 -0700
Subject: [PATCH 53/76] powerpc/vas: Limit open window failure messages in log
 buffer

The VAS open window call prints an error message and returns -EBUSY
after the migration suspend event is initiated and until the resume
event is completed on the destination system. It can cause the log
buffer to be filled with these error messages if user space issues
continuous open window calls. A similar case occurs for the DLPAR CPU
remove event when no credits are available until the credits are freed,
or with the other DLPAR CPU add event.

So change the code to use pr_err_ratelimited() instead of pr_err() to
display the open window failure and credits-not-available error
messages.

Use pr_fmt() and make the corresponding changes to have a consistent
prefix in all pr_*() messages (vas-api.c).

Fixes: 37e6764895ef ("powerpc/pseries/vas: Add VAS migration handler")
Signed-off-by: Haren Myneni
[mpe: Use "vas-api" as the prefix to match the file name.]
Signed-off-by: Michael Ellerman Link: https://msgid.link/20231019215033.1335251-1-haren@linux.ibm.com --- arch/powerpc/platforms/book3s/vas-api.c | 34 ++++++++++++------------- arch/powerpc/platforms/pseries/vas.c | 4 +-- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/platforms/book3s/vas-api.c b/arch/powerpc/platforms/book3s/vas-api.c index 77ea9335fd049..f381b177ea06a 100644 --- a/arch/powerpc/platforms/book3s/vas-api.c +++ b/arch/powerpc/platforms/book3s/vas-api.c @@ -4,6 +4,8 @@ * Copyright (C) 2019 Haren Myneni, IBM Corp */ +#define pr_fmt(fmt) "vas-api: " fmt + #include #include #include @@ -78,7 +80,7 @@ int get_vas_user_win_ref(struct vas_user_win_ref *task_ref) task_ref->mm = get_task_mm(current); if (!task_ref->mm) { put_pid(task_ref->pid); - pr_err("VAS: pid(%d): mm_struct is not found\n", + pr_err("pid(%d): mm_struct is not found\n", current->pid); return -EPERM; } @@ -235,8 +237,7 @@ void vas_update_csb(struct coprocessor_request_block *crb, rc = kill_pid_info(SIGSEGV, &info, pid); rcu_read_unlock(); - pr_devel("%s(): pid %d kill_proc_info() rc %d\n", __func__, - pid_vnr(pid), rc); + pr_devel("pid %d kill_proc_info() rc %d\n", pid_vnr(pid), rc); } void vas_dump_crb(struct coprocessor_request_block *crb) @@ -294,7 +295,7 @@ static int coproc_ioc_tx_win_open(struct file *fp, unsigned long arg) rc = copy_from_user(&uattr, uptr, sizeof(uattr)); if (rc) { - pr_err("%s(): copy_from_user() returns %d\n", __func__, rc); + pr_err("copy_from_user() returns %d\n", rc); return -EFAULT; } @@ -311,7 +312,7 @@ static int coproc_ioc_tx_win_open(struct file *fp, unsigned long arg) txwin = cp_inst->coproc->vops->open_win(uattr.vas_id, uattr.flags, cp_inst->coproc->cop_type); if (IS_ERR(txwin)) { - pr_err("%s() VAS window open failed, %ld\n", __func__, + pr_err_ratelimited("VAS window open failed rc=%ld\n", PTR_ERR(txwin)); return PTR_ERR(txwin); } @@ -405,8 +406,7 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf) * window is not opened. Shouldn't expect this error. */ if (!cp_inst || !cp_inst->txwin) { - pr_err("%s(): Unexpected fault on paste address with TX window closed\n", - __func__); + pr_err("Unexpected fault on paste address with TX window closed\n"); return VM_FAULT_SIGBUS; } @@ -421,8 +421,7 @@ static vm_fault_t vas_mmap_fault(struct vm_fault *vmf) * issue NX request. 
*/ if (txwin->task_ref.vma != vmf->vma) { - pr_err("%s(): No previous mapping with paste address\n", - __func__); + pr_err("No previous mapping with paste address\n"); return VM_FAULT_SIGBUS; } @@ -481,19 +480,19 @@ static int coproc_mmap(struct file *fp, struct vm_area_struct *vma) txwin = cp_inst->txwin; if ((vma->vm_end - vma->vm_start) > PAGE_SIZE) { - pr_debug("%s(): size 0x%zx, PAGE_SIZE 0x%zx\n", __func__, + pr_debug("size 0x%zx, PAGE_SIZE 0x%zx\n", (vma->vm_end - vma->vm_start), PAGE_SIZE); return -EINVAL; } /* Ensure instance has an open send window */ if (!txwin) { - pr_err("%s(): No send window open?\n", __func__); + pr_err("No send window open?\n"); return -EINVAL; } if (!cp_inst->coproc->vops || !cp_inst->coproc->vops->paste_addr) { - pr_err("%s(): VAS API is not registered\n", __func__); + pr_err("VAS API is not registered\n"); return -EACCES; } @@ -510,14 +509,14 @@ static int coproc_mmap(struct file *fp, struct vm_area_struct *vma) */ mutex_lock(&txwin->task_ref.mmap_mutex); if (txwin->status != VAS_WIN_ACTIVE) { - pr_err("%s(): Window is not active\n", __func__); + pr_err("Window is not active\n"); rc = -EACCES; goto out; } paste_addr = cp_inst->coproc->vops->paste_addr(txwin); if (!paste_addr) { - pr_err("%s(): Window paste address failed\n", __func__); + pr_err("Window paste address failed\n"); rc = -EINVAL; goto out; } @@ -533,8 +532,8 @@ static int coproc_mmap(struct file *fp, struct vm_area_struct *vma) rc = remap_pfn_range(vma, vma->vm_start, pfn + vma->vm_pgoff, vma->vm_end - vma->vm_start, prot); - pr_devel("%s(): paste addr %llx at %lx, rc %d\n", __func__, - paste_addr, vma->vm_start, rc); + pr_devel("paste addr %llx at %lx, rc %d\n", paste_addr, + vma->vm_start, rc); txwin->task_ref.vma = vma; vma->vm_ops = &vas_vm_ops; @@ -609,8 +608,7 @@ int vas_register_coproc_api(struct module *mod, enum vas_cop_type cop_type, goto err; } - pr_devel("%s: Added dev [%d,%d]\n", __func__, MAJOR(devno), - MINOR(devno)); + pr_devel("Added dev [%d,%d]\n", MAJOR(devno), MINOR(devno)); return 0; diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index e25ac52acf507..b1f25bac280b4 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -341,7 +341,7 @@ static struct vas_window *vas_allocate_window(int vas_id, u64 flags, if (atomic_inc_return(&cop_feat_caps->nr_used_credits) > atomic_read(&cop_feat_caps->nr_total_credits)) { - pr_err("Credits are not available to allocate window\n"); + pr_err_ratelimited("Credits are not available to allocate window\n"); rc = -EINVAL; goto out; } @@ -424,7 +424,7 @@ static struct vas_window *vas_allocate_window(int vas_id, u64 flags, put_vas_user_win_ref(&txwin->vas_win.task_ref); rc = -EBUSY; - pr_err("No credit is available to allocate window\n"); + pr_err_ratelimited("No credit is available to allocate window\n"); out_free: /* From 007240d59c11f87ac4f6cfc6a1d116630b6b634c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 9 Mar 2023 14:48:31 +0100 Subject: [PATCH 54/76] powerpc/imc-pmu: Use the correct spinlock initializer. The macro __SPIN_LOCK_INITIALIZER() is implementation specific. Users that desire to initialize a spinlock in a struct must use __SPIN_LOCK_UNLOCKED(). Use __SPIN_LOCK_UNLOCKED() for the spinlock_t in imc_global_refc. 
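For illustration, the portable pattern for a spinlock embedded in a
static struct looks like this sketch (the struct and names here are made
up for the example):

	#include <linux/spinlock.h>

	struct demo_ref {
		spinlock_t lock;
		int refc;
	};

	/* __SPIN_LOCK_UNLOCKED() is the supported initializer for a lock
	 * inside a struct; __SPIN_LOCK_INITIALIZER() is an internal
	 * detail whose layout varies between spinlock implementations. */
	static struct demo_ref demo = {
		.lock = __SPIN_LOCK_UNLOCKED(demo.lock),
		.refc = 0,
	};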
Fixes: 76d588dddc459 ("powerpc/imc-pmu: Fix use of mutex in IRQs disabled section")
Signed-off-by: Sebastian Andrzej Siewior
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20230309134831.Nz12nqsU@linutronix.de
---
 arch/powerpc/perf/imc-pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/perf/imc-pmu.c b/arch/powerpc/perf/imc-pmu.c
index 2016aee270379..5d12ca386c1fc 100644
--- a/arch/powerpc/perf/imc-pmu.c
+++ b/arch/powerpc/perf/imc-pmu.c
@@ -51,7 +51,7 @@ static int trace_imc_mem_size;
  * core and trace-imc
  */
 static struct imc_pmu_ref imc_global_refc = {
-	.lock = __SPIN_LOCK_INITIALIZER(imc_global_refc.lock),
+	.lock = __SPIN_LOCK_UNLOCKED(imc_global_refc.lock),
 	.id = 0,
 	.refc = 0,
 };

From 95f1a128cd728a7257d78e868f1f5a145fc43736 Mon Sep 17 00:00:00 2001
From: Wang Yufen
Date: Wed, 14 Dec 2022 15:46:23 +0800
Subject: [PATCH 55/76] powerpc/pseries: fix potential memory leak in
 init_cpu_associativity()

If the vcpu_associativity allocation succeeds but the pcpu_associativity
allocation fails, the vcpu_associativity memory is leaked.

Fixes: d62c8deeb6e6 ("powerpc/pseries: Provide vcpu dispatch statistics")
Signed-off-by: Wang Yufen
Reviewed-by: "Naveen N. Rao"
Signed-off-by: Michael Ellerman
Link: https://msgid.link/1671003983-10794-1-git-send-email-wangyufen@huawei.com
---
 arch/powerpc/platforms/pseries/lpar.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c
index b4e86b7ea584d..4561667832ed4 100644
--- a/arch/powerpc/platforms/pseries/lpar.c
+++ b/arch/powerpc/platforms/pseries/lpar.c
@@ -526,8 +526,10 @@ static ssize_t vcpudispatch_stats_write(struct file *file, const char __user *p,
 
 	if (cmd) {
 		rc = init_cpu_associativity();
-		if (rc)
+		if (rc) {
+			destroy_cpu_associativity();
 			goto out;
+		}
 
 		for_each_possible_cpu(cpu) {
 			disp = per_cpu_ptr(&vcpu_disp_data, cpu);

From 269d79fb30f6b2419b8a67d99d8bdf58ced44d72 Mon Sep 17 00:00:00 2001
From: Nick Child
Date: Tue, 14 Mar 2023 11:44:42 -0500
Subject: [PATCH 56/76] powerpc/boot: Add version to install filenames

Rather than replacing the versionless vmlinux and System.map files, copy
to files with the version info appended. Additionally, since executing
the script is a last resort option, inform the user about the missing
`installkernel` command and the location of the installation.

This work is adapted from `arch/s390/boot/install.sh`, and also matches
the behaviour of arm, arm64 and riscv.

Signed-off-by: Nick Child
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20230314164442.124929-1-nnac123@linux.ibm.com
---
 arch/powerpc/boot/install.sh | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/boot/install.sh b/arch/powerpc/boot/install.sh
index 461902c8a46d4..101fcb397a0fe 100755
--- a/arch/powerpc/boot/install.sh
+++ b/arch/powerpc/boot/install.sh
@@ -21,13 +21,17 @@ set -e
 
 # this should work for both the pSeries zImage and the iSeries vmlinux.sm
 image_name=`basename $2`
-if [ -f $4/$image_name ]; then
-	mv $4/$image_name $4/$image_name.old
+
+echo "Warning: '${INSTALLKERNEL}' command not available...
Copying" \
+	"directly to $4/$image_name-$1" >&2
+
+if [ -f $4/$image_name-$1 ]; then
+	mv $4/$image_name-$1 $4/$image_name-$1.old
 fi
 
-if [ -f $4/System.map ]; then
-	mv $4/System.map $4/System.old
+if [ -f $4/System.map-$1 ]; then
+	mv $4/System.map-$1 $4/System-$1.old
 fi
 
-cat $2 > $4/$image_name
-cp $3 $4/System.map
+cat $2 > $4/$image_name-$1
+cp $3 $4/System.map-$1

From d42f55e8ae741540d0d2ebab8ff02f68fb0c802f Mon Sep 17 00:00:00 2001
From: Naveen N Rao
Date: Tue, 30 May 2023 15:08:21 +0530
Subject: [PATCH 57/76] powerpc/tools: Pass -mabi=elfv2 to
 gcc-check-mprofile-kernel.sh

Toolchains don't always default to the ELFv2 ABI. This is true with at
least the kernel.org toolchains. As such, pass -mabi=elfv2 explicitly to
ensure that we are testing against the correct compiler output.

Signed-off-by: Naveen N Rao
[mpe: Tweak comment wording]
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20230530093821.298590-1-naveen@kernel.org
---
 arch/powerpc/tools/gcc-check-mprofile-kernel.sh | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/tools/gcc-check-mprofile-kernel.sh b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
index a31a56016c094..73e331e7660ef 100755
--- a/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
+++ b/arch/powerpc/tools/gcc-check-mprofile-kernel.sh
@@ -7,21 +7,20 @@ set -o pipefail
 # To debug, uncomment the following line
 # set -x
 
-# -mprofile-kernel is only supported on 64-bit, so this should not be invoked
-# for 32-bit. We pass in -m64 explicitly, and -mbig-endian and -mlittle-endian
-# are passed in from Kconfig, which takes care of toolchains defaulting to
-# other targets.
+# -mprofile-kernel is only supported on 64-bit with ELFv2, so this should not
+# be invoked for other targets. Therefore we can pass in -m64 and -mabi
+# explicitly, to take care of toolchains defaulting to other targets.
 
 # Test whether the compile option -mprofile-kernel exists and generates
 # profiling code (ie. a call to _mcount()).
 echo "int func() { return 0; }" | \
-    $* -m64 -S -x c -O2 -p -mprofile-kernel - -o - \
+    $* -m64 -mabi=elfv2 -S -x c -O2 -p -mprofile-kernel - -o - \
     2> /dev/null | grep -q "_mcount"
 
 # Test whether the notrace attribute correctly suppresses calls to _mcount().
 echo -e "#include \nnotrace int func() { return 0; }" | \
-    $* -m64 -S -x c -O2 -p -mprofile-kernel - -o - \
+    $* -m64 -mabi=elfv2 -S -x c -O2 -p -mprofile-kernel - -o - \
    2> /dev/null | grep -q "_mcount" && \
    exit 1

From f01b0edd562ef5a05bae31fe3a296721f89af7d9 Mon Sep 17 00:00:00 2001
From: Naveen N Rao
Date: Wed, 14 Jun 2023 14:29:26 +0530
Subject: [PATCH 58/76] powerpc/trace: Add support for
 HAVE_FUNCTION_ARG_ACCESS_API

When creating a kprobe on function entry through tracefs, allow
arguments to be recorded by specifying them with the $argN syntax.
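For illustration, with this in place a tracefs kprobe event such as
"p:myprobe kernel_clone $arg1" can record the probed function's first
argument. The same accessor can also be used from a kprobe pre-handler
in C; a hypothetical sketch (the probe name and message are made up):

	#include <linux/kprobes.h>

	/* Hypothetical handler: log the first two integer arguments of
	 * the probed function using the new accessor. */
	static int demo_pre_handler(struct kprobe *p, struct pt_regs *regs)
	{
		pr_info("arg0=%lx arg1=%lx\n",
			regs_get_kernel_argument(regs, 0),
			regs_get_kernel_argument(regs, 1));
		return 0;
	}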
Signed-off-by: Naveen N Rao Signed-off-by: Michael Ellerman Link: https://msgid.link/20230614085926.2176641-1-naveen@kernel.org --- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/ptrace.h | 17 +++++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index d5d5388973ac7..6f105ee4f3cf5 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -237,6 +237,7 @@ config PPC select HAVE_EFFICIENT_UNALIGNED_ACCESS select HAVE_FAST_GUP select HAVE_FTRACE_MCOUNT_RECORD + select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUNCTION_DESCRIPTORS if PPC64_ELF_ABI_V1 select HAVE_FUNCTION_ERROR_INJECTION select HAVE_FUNCTION_GRAPH_TRACER diff --git a/arch/powerpc/include/asm/ptrace.h b/arch/powerpc/include/asm/ptrace.h index 9db8b16567e22..ea8f91fbc62f9 100644 --- a/arch/powerpc/include/asm/ptrace.h +++ b/arch/powerpc/include/asm/ptrace.h @@ -397,6 +397,23 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, return 0; } +/** + * regs_get_kernel_argument() - get Nth function argument in kernel + * @regs: pt_regs of that context + * @n: function argument number (start from 0) + * + * We support up to 8 arguments and assume they are sent in through the GPRs. + * This will fail for fp/vector arguments, but those aren't usually found in + * kernel code. This is expected to be called from kprobes or ftrace with regs. + */ +static inline unsigned long regs_get_kernel_argument(struct pt_regs *regs, unsigned int n) +{ +#define NR_REG_ARGUMENTS 8 + if (n < NR_REG_ARGUMENTS) + return regs_get_register(regs, offsetof(struct pt_regs, gpr[3 + n])); + return 0; +} + #endif /* __ASSEMBLY__ */ #ifndef __powerpc64__ From ea142e590aec55ba40c5affb4d49e68c713c63dc Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Thu, 19 Oct 2023 01:34:23 +1000 Subject: [PATCH 59/76] powerpc/perf: Fix disabling BHRB and instruction sampling When the PMU is disabled, MMCRA is not updated to disable BHRB and instruction sampling. This can lead to those features remaining enabled, which can slow down a real or emulated CPU. Fixes: 1cade527f6e9 ("powerpc/perf: BHRB control to disable BHRB logic when not used") Cc: stable@vger.kernel.org # v5.9+ Signed-off-by: Nicholas Piggin Signed-off-by: Michael Ellerman Link: https://msgid.link/20231018153423.298373-1-npiggin@gmail.com --- arch/powerpc/perf/core-book3s.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 8c1f7def596e4..10b946e9c6e75 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -1371,8 +1371,7 @@ static void power_pmu_disable(struct pmu *pmu) /* * Disable instruction sampling if it was enabled */ - if (cpuhw->mmcr.mmcra & MMCRA_SAMPLE_ENABLE) - val &= ~MMCRA_SAMPLE_ENABLE; + val &= ~MMCRA_SAMPLE_ENABLE; /* Disable BHRB via mmcra (BHRBRD) for p10 */ if (ppmu->flags & PPMU_ARCH_31) @@ -1383,7 +1382,7 @@ static void power_pmu_disable(struct pmu *pmu) * instruction sampling or BHRB. 
*/ if (val != mmcra) { - mtspr(SPRN_MMCRA, mmcra); + mtspr(SPRN_MMCRA, val); mb(); isync(); } From f6568647382c667912245c8d07aa26c9c6d4d0c8 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 16 Oct 2023 22:43:01 +1000 Subject: [PATCH 60/76] powerpc/qspinlock: stop queued waiters trying to set lock sleepy If a queued waiter notices the lock owner or the previous waiter has been preempted, it attempts to mark the lock sleepy, but it does this as a try-set operation using the original lock value it got when queueing, which will become stale as the queue progresses, and the try-set will fail. Drop this and just set the sleepy seen clock. Signed-off-by: Nicholas Piggin Tested-by: Shrikanth Hegde Reviewed-by: "Nysal Jan K.A" Signed-off-by: Michael Ellerman Link: https://msgid.link/20231016124305.139923-3-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 6dd2f46bd3ef6..75608ced14c22 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -247,22 +247,18 @@ static __always_inline void seen_sleepy_lock(void) this_cpu_write(sleepy_lock_seen_clock, sched_clock()); } -static __always_inline void seen_sleepy_node(struct qspinlock *lock, u32 val) +static __always_inline void seen_sleepy_node(void) { if (pv_sleepy_lock) { if (pv_sleepy_lock_interval_ns) this_cpu_write(sleepy_lock_seen_clock, sched_clock()); - if (val & _Q_LOCKED_VAL) { - if (!(val & _Q_SLEEPY_VAL)) - try_set_sleepy(lock, val); - } + /* Don't set sleepy because we likely have a stale val */ } } -static struct qnode *get_tail_qnode(struct qspinlock *lock, u32 val) +static struct qnode *get_tail_qnode(struct qspinlock *lock, int prev_cpu) { - int cpu = decode_tail_cpu(val); - struct qnodes *qnodesp = per_cpu_ptr(&qnodes, cpu); + struct qnodes *qnodesp = per_cpu_ptr(&qnodes, prev_cpu); int idx; /* @@ -381,9 +377,8 @@ static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int } /* Called inside spin_begin() */ -static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode *node, u32 val, bool paravirt) +static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt) { - int prev_cpu = decode_tail_cpu(val); u32 yield_count; int yield_cpu; bool preempted = false; @@ -412,7 +407,7 @@ static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode * spin_end(); preempted = true; - seen_sleepy_node(lock, val); + seen_sleepy_node(); smp_rmb(); @@ -436,7 +431,7 @@ static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode * spin_end(); preempted = true; - seen_sleepy_node(lock, val); + seen_sleepy_node(); smp_rmb(); /* See __yield_to_locked_owner comment */ @@ -587,7 +582,8 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b * head of the waitqueue. */ if (old & _Q_TAIL_CPU_MASK) { - struct qnode *prev = get_tail_qnode(lock, old); + int prev_cpu = decode_tail_cpu(old); + struct qnode *prev = get_tail_qnode(lock, prev_cpu); /* Link @node into the waitqueue. 
*/ WRITE_ONCE(prev->next, node); @@ -597,7 +593,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b while (!READ_ONCE(node->locked)) { spec_barrier(); - if (yield_to_prev(lock, node, old, paravirt)) + if (yield_to_prev(lock, node, prev_cpu, paravirt)) seen_preempted = true; } spec_barrier(); From fd8fae50c9c6117c4e05954deab2cc515508666b Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 16 Oct 2023 22:43:02 +1000 Subject: [PATCH 61/76] powerpc/qspinlock: propagate owner preemptedness rather than CPU number Rather than propagating the CPU number of the preempted lock owner, just propagate whether the owner was preempted. Waiters must read the lock value when yielding to it to prevent races anyway, so might as well always load the owner CPU from the lock. To further simplify the code, also don't propagate the -1 (or sleepy=false in the new scheme) down the queue. Instead, have the waiters clear it themselves when finding the lock owner is not preempted. Signed-off-by: Nicholas Piggin Tested-by: Shrikanth Hegde Reviewed-by: "Nysal Jan K.A" Signed-off-by: Michael Ellerman Link: https://msgid.link/20231016124305.139923-4-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 80 ++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 75608ced14c22..0932d24a6b073 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -16,7 +16,8 @@ struct qnode { struct qnode *next; struct qspinlock *lock; int cpu; - int yield_cpu; + u8 sleepy; /* 1 if the previous vCPU was preempted or + * if the previous node was sleepy */ u8 locked; /* 1 if lock acquired */ }; @@ -349,7 +350,7 @@ static __always_inline bool yield_head_to_locked_owner(struct qspinlock *lock, u return __yield_to_locked_owner(lock, val, paravirt, mustq); } -static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int *set_yield_cpu, bool paravirt) +static __always_inline void propagate_sleepy(struct qnode *node, u32 val, bool *set_sleepy, bool paravirt) { struct qnode *next; int owner; @@ -358,21 +359,17 @@ static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int return; if (!pv_yield_propagate_owner) return; - - owner = get_owner_cpu(val); - if (*set_yield_cpu == owner) + if (*set_sleepy) return; next = READ_ONCE(node->next); if (!next) return; + owner = get_owner_cpu(val); if (vcpu_is_preempted(owner)) { - next->yield_cpu = owner; - *set_yield_cpu = owner; - } else if (*set_yield_cpu != -1) { - next->yield_cpu = owner; - *set_yield_cpu = owner; + next->sleepy = 1; + *set_sleepy = true; } } @@ -380,7 +377,6 @@ static __always_inline void propagate_yield_cpu(struct qnode *node, u32 val, int static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode *node, int prev_cpu, bool paravirt) { u32 yield_count; - int yield_cpu; bool preempted = false; if (!paravirt) @@ -389,36 +385,32 @@ static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode * if (!pv_yield_propagate_owner) goto yield_prev; - yield_cpu = READ_ONCE(node->yield_cpu); - if (yield_cpu == -1) { - /* Propagate back the -1 CPU */ - if (node->next && node->next->yield_cpu != -1) - node->next->yield_cpu = yield_cpu; + if (!READ_ONCE(node->sleepy)) { + /* Propagate back sleepy==false */ + if (node->next && node->next->sleepy) + node->next->sleepy = 0; goto yield_prev; - } - - yield_count = yield_count_of(yield_cpu); - if ((yield_count & 1) 
== 0) - goto yield_prev; /* owner vcpu is running */ - - if (get_owner_cpu(READ_ONCE(lock->val)) != yield_cpu) - goto yield_prev; /* re-sample lock owner */ - - spin_end(); - - preempted = true; - seen_sleepy_node(); - - smp_rmb(); + } else { + u32 val = READ_ONCE(lock->val); + + if (val & _Q_LOCKED_VAL) { + if (node->next && !node->next->sleepy) { + /* + * Propagate sleepy to next waiter. Only if + * owner is preempted, which allows the queue + * to become "non-sleepy" if vCPU preemption + * ceases to occur, even if the lock remains + * highly contended. + */ + if (vcpu_is_preempted(get_owner_cpu(val))) + node->next->sleepy = 1; + } - if (yield_cpu == node->yield_cpu) { - if (node->next && node->next->yield_cpu != yield_cpu) - node->next->yield_cpu = yield_cpu; - yield_to_preempted(yield_cpu, yield_count); - spin_begin(); - return preempted; + preempted = yield_to_locked_owner(lock, val, paravirt); + if (preempted) + return preempted; + } } - spin_begin(); yield_prev: if (!pv_yield_prev) @@ -541,7 +533,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b bool sleepy = false; bool mustq = false; int idx; - int set_yield_cpu = -1; + bool set_sleepy = false; int iters = 0; BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); @@ -565,7 +557,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b node->next = NULL; node->lock = lock; node->cpu = smp_processor_id(); - node->yield_cpu = -1; + node->sleepy = 0; node->locked = 0; tail = encode_tail_cpu(node->cpu); @@ -599,9 +591,9 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b spec_barrier(); spin_end(); - /* Clear out stale propagated yield_cpu */ - if (paravirt && pv_yield_propagate_owner && node->yield_cpu != -1) - node->yield_cpu = -1; + /* Clear out stale propagated sleepy */ + if (paravirt && pv_yield_propagate_owner && node->sleepy) + node->sleepy = 0; smp_rmb(); /* acquire barrier for the mcs lock */ @@ -644,7 +636,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b } } - propagate_yield_cpu(node, val, &set_yield_cpu, paravirt); + propagate_sleepy(node, val, &set_sleepy, paravirt); preempted = yield_head_to_locked_owner(lock, val, paravirt); if (!maybe_stealers) continue; From fcf77d44274b96a55cc74043561a4b3151b9ad24 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 16 Oct 2023 22:43:03 +1000 Subject: [PATCH 62/76] powerpc/qspinlock: don't propagate the not-sleepy state To simplify things, don't propagate the not-sleepy condition back down the queue. Instead, have the waiters clear their own node->sleepy when finding the lock owner is not preempted. 
Signed-off-by: Nicholas Piggin Tested-by: Shrikanth Hegde Reviewed-by: "Nysal Jan K.A" Signed-off-by: Michael Ellerman Link: https://msgid.link/20231016124305.139923-5-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 26 ++++++++------------------ 1 file changed, 8 insertions(+), 18 deletions(-) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 0932d24a6b073..6bb627e90a321 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -350,7 +350,7 @@ static __always_inline bool yield_head_to_locked_owner(struct qspinlock *lock, u return __yield_to_locked_owner(lock, val, paravirt, mustq); } -static __always_inline void propagate_sleepy(struct qnode *node, u32 val, bool *set_sleepy, bool paravirt) +static __always_inline void propagate_sleepy(struct qnode *node, u32 val, bool paravirt) { struct qnode *next; int owner; @@ -359,18 +359,17 @@ static __always_inline void propagate_sleepy(struct qnode *node, u32 val, bool * return; if (!pv_yield_propagate_owner) return; - if (*set_sleepy) - return; next = READ_ONCE(node->next); if (!next) return; + if (next->sleepy) + return; + owner = get_owner_cpu(val); - if (vcpu_is_preempted(owner)) { + if (vcpu_is_preempted(owner)) next->sleepy = 1; - *set_sleepy = true; - } } /* Called inside spin_begin() */ @@ -385,12 +384,7 @@ static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode * if (!pv_yield_propagate_owner) goto yield_prev; - if (!READ_ONCE(node->sleepy)) { - /* Propagate back sleepy==false */ - if (node->next && node->next->sleepy) - node->next->sleepy = 0; - goto yield_prev; - } else { + if (node->sleepy) { u32 val = READ_ONCE(lock->val); if (val & _Q_LOCKED_VAL) { @@ -410,6 +404,7 @@ static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode * if (preempted) return preempted; } + node->sleepy = false; } yield_prev: @@ -533,7 +528,6 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b bool sleepy = false; bool mustq = false; int idx; - bool set_sleepy = false; int iters = 0; BUILD_BUG_ON(CONFIG_NR_CPUS >= (1U << _Q_TAIL_CPU_BITS)); @@ -591,10 +585,6 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b spec_barrier(); spin_end(); - /* Clear out stale propagated sleepy */ - if (paravirt && pv_yield_propagate_owner && node->sleepy) - node->sleepy = 0; - smp_rmb(); /* acquire barrier for the mcs lock */ /* @@ -636,7 +626,7 @@ static __always_inline void queued_spin_lock_mcs_queue(struct qspinlock *lock, b } } - propagate_sleepy(node, val, &set_sleepy, paravirt); + propagate_sleepy(node, val, paravirt); preempted = yield_head_to_locked_owner(lock, val, paravirt); if (!maybe_stealers) continue; From 1e6d5f725738fff83e1c3cbdb8547142eb8820c2 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 16 Oct 2023 22:43:04 +1000 Subject: [PATCH 63/76] powerpc/qspinlock: Propagate sleepy if previous waiter is preempted The sleepy (aka lock-owner-is-preempted) condition is propagated down the queue by each waiter. If a waiter becomes preempted, it can no longer propagate sleepy. To allow subsequent waiters to yield to the lock owner, also check the lock owner in this case. 
Signed-off-by: Nicholas Piggin Tested-by: Shrikanth Hegde Reviewed-by: "Nysal Jan K.A" Signed-off-by: Michael Ellerman Link: https://msgid.link/20231016124305.139923-6-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index 6bb627e90a321..c68c2bd7b8534 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -384,7 +384,11 @@ static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode * if (!pv_yield_propagate_owner) goto yield_prev; - if (node->sleepy) { + /* + * If the previous waiter was preempted it might not be able to + * propagate sleepy to us, so check the lock in that case too. + */ + if (node->sleepy || vcpu_is_preempted(prev_cpu)) { u32 val = READ_ONCE(lock->val); if (val & _Q_LOCKED_VAL) { From b629b541702b082d161c307c9d7d9867911fc933 Mon Sep 17 00:00:00 2001 From: Nicholas Piggin Date: Mon, 16 Oct 2023 22:43:05 +1000 Subject: [PATCH 64/76] powerpc/qspinlock: Rename yield_propagate_owner tunable Rename yield_propagate_owner to yield_sleepy_owner, which better describes what it does (what, not how). Signed-off-by: Nicholas Piggin Tested-by: Shrikanth Hegde Reviewed-by: "Nysal Jan K.A" Signed-off-by: Michael Ellerman Link: https://msgid.link/20231016124305.139923-7-npiggin@gmail.com --- arch/powerpc/lib/qspinlock.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/lib/qspinlock.c b/arch/powerpc/lib/qspinlock.c index c68c2bd7b8534..5de4dd549f6ec 100644 --- a/arch/powerpc/lib/qspinlock.c +++ b/arch/powerpc/lib/qspinlock.c @@ -44,7 +44,7 @@ static bool pv_sleepy_lock_sticky __read_mostly = false; static u64 pv_sleepy_lock_interval_ns __read_mostly = 0; static int pv_sleepy_lock_factor __read_mostly = 256; static bool pv_yield_prev __read_mostly = true; -static bool pv_yield_propagate_owner __read_mostly = true; +static bool pv_yield_sleepy_owner __read_mostly = true; static bool pv_prod_head __read_mostly = false; static DEFINE_PER_CPU_ALIGNED(struct qnodes, qnodes); @@ -357,7 +357,7 @@ static __always_inline void propagate_sleepy(struct qnode *node, u32 val, bool p if (!paravirt) return; - if (!pv_yield_propagate_owner) + if (!pv_yield_sleepy_owner) return; next = READ_ONCE(node->next); @@ -381,7 +381,7 @@ static __always_inline bool yield_to_prev(struct qspinlock *lock, struct qnode * if (!paravirt) goto relax; - if (!pv_yield_propagate_owner) + if (!pv_yield_sleepy_owner) goto yield_prev; /* @@ -934,21 +934,21 @@ static int pv_yield_prev_get(void *data, u64 *val) DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_prev, pv_yield_prev_get, pv_yield_prev_set, "%llu\n"); -static int pv_yield_propagate_owner_set(void *data, u64 val) +static int pv_yield_sleepy_owner_set(void *data, u64 val) { - pv_yield_propagate_owner = !!val; + pv_yield_sleepy_owner = !!val; return 0; } -static int pv_yield_propagate_owner_get(void *data, u64 *val) +static int pv_yield_sleepy_owner_get(void *data, u64 *val) { - *val = pv_yield_propagate_owner; + *val = pv_yield_sleepy_owner; return 0; } -DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_propagate_owner, pv_yield_propagate_owner_get, pv_yield_propagate_owner_set, "%llu\n"); +DEFINE_SIMPLE_ATTRIBUTE(fops_pv_yield_sleepy_owner, pv_yield_sleepy_owner_get, pv_yield_sleepy_owner_set, "%llu\n"); static int pv_prod_head_set(void *data, u64 val) { @@ -980,7 +980,7 @@ static __init int spinlock_debugfs_init(void) 
debugfs_create_file("qspl_pv_sleepy_lock_interval_ns", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_interval_ns);
 		debugfs_create_file("qspl_pv_sleepy_lock_factor", 0600, arch_debugfs_dir, NULL, &fops_pv_sleepy_lock_factor);
 		debugfs_create_file("qspl_pv_yield_prev", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_prev);
-		debugfs_create_file("qspl_pv_yield_propagate_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_propagate_owner);
+		debugfs_create_file("qspl_pv_yield_sleepy_owner", 0600, arch_debugfs_dir, NULL, &fops_pv_yield_sleepy_owner);
 		debugfs_create_file("qspl_pv_prod_head", 0600, arch_debugfs_dir, NULL, &fops_pv_prod_head);
 	}

From ad496f8f83960c211b00fdbd8ae4ac1e55e4f3d5 Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Wed, 11 Oct 2023 08:27:03 +0200
Subject: [PATCH 65/76] powerpc: Remove cpm_dp...() macros

Since commit d3c511ac1d72 ("powerpc/cpm: Remove !CONFIG_PPC_CPM_NEW_BINDING
code") the cpm_dp...() macros have had no added value.

The last users of those macros were fixed by commit 5e6cb39a256d ("net:
fs_enet: Use cpm_muram_xxx() functions instead of cpm_dpxxx() macros").

Remove them.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://msgid.link/3aaa40bf706afeab8fe9a74b8437704a4269a6a2.1697005615.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/include/asm/cpm1.h | 5 -----
 arch/powerpc/include/asm/cpm2.h | 4 ----
 drivers/soc/fsl/qe/qe_common.c  | 4 ++--
 3 files changed, 2 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/include/asm/cpm1.h b/arch/powerpc/include/asm/cpm1.h
index 3bdd74739cb88..e3c6969853ef9 100644
--- a/arch/powerpc/include/asm/cpm1.h
+++ b/arch/powerpc/include/asm/cpm1.h
@@ -49,11 +49,6 @@
  */
 extern cpm8xx_t __iomem *cpmp; /* Pointer to comm processor */
 
-#define cpm_dpalloc cpm_muram_alloc
-#define cpm_dpfree cpm_muram_free
-#define cpm_dpram_addr cpm_muram_addr
-#define cpm_dpram_phys cpm_muram_dma
-
 extern void cpm_setbrg(uint brg, uint rate);
 
 extern void __init cpm_load_patch(cpm8xx_t *cp);

diff --git a/arch/powerpc/include/asm/cpm2.h b/arch/powerpc/include/asm/cpm2.h
index 249d43cc64275..a22acc36eb9b2 100644
--- a/arch/powerpc/include/asm/cpm2.h
+++ b/arch/powerpc/include/asm/cpm2.h
@@ -87,10 +87,6 @@
  */
 extern cpm_cpm2_t __iomem *cpmp; /* Pointer to comm processor */
 
-#define cpm_dpalloc cpm_muram_alloc
-#define cpm_dpfree cpm_muram_free
-#define cpm_dpram_addr cpm_muram_addr
-
 extern void cpm2_reset(void);
 
 /* Baud rate generators.

diff --git a/drivers/soc/fsl/qe/qe_common.c b/drivers/soc/fsl/qe/qe_common.c
index 9729ce86db591..a877347d37d3e 100644
--- a/drivers/soc/fsl/qe/qe_common.c
+++ b/drivers/soc/fsl/qe/qe_common.c
@@ -141,7 +141,7 @@ static s32 cpm_muram_alloc_common(unsigned long size,
 *
 * This function returns a non-negative offset into the muram area, or
 * a negative errno on failure.
- * Use cpm_dpram_addr() to get the virtual address of the area.
+ * Use cpm_muram_addr() to get the virtual address of the area.
 * Use cpm_muram_free() to free the allocation.
 */
 s32 cpm_muram_alloc(unsigned long size, unsigned long align)
@@ -193,7 +193,7 @@ EXPORT_SYMBOL(cpm_muram_free);
 * @size: number of bytes to allocate
 * This function returns @offset if the area was available, a negative
 * errno otherwise.
- * Use cpm_dpram_addr() to get the virtual address of the area.
+ * Use cpm_muram_addr() to get the virtual address of the area.
 * Use cpm_muram_free() to free the allocation.
 */
 s32 cpm_muram_alloc_fixed(unsigned long offset, unsigned long size)

From 89f17016a85280e1b36a6c0305e0191594cbe6ea Mon Sep 17 00:00:00 2001
From: Rob Herring
Date: Fri, 6 Oct 2023 16:45:16 -0500
Subject: [PATCH 66/76] powerpc/fsl_msi: Use device_get_match_data()

Use the preferred device_get_match_data() instead of of_match_device()
to get the driver match data. With this, adjust the includes to
explicitly include the correct headers.

Signed-off-by: Rob Herring
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20231006214516.340589-1-robh@kernel.org
---
 arch/powerpc/sysdev/fsl_msi.c | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/sysdev/fsl_msi.c b/arch/powerpc/sysdev/fsl_msi.c
index 57978a44d55b6..558ec68d768e6 100644
--- a/arch/powerpc/sysdev/fsl_msi.c
+++ b/arch/powerpc/sysdev/fsl_msi.c
@@ -11,9 +11,11 @@
 #include
 #include
 #include
+#include
 #include
 #include
-#include
+#include
+#include
 #include
 #include
 #include
@@ -392,7 +394,6 @@ static int fsl_msi_setup_hwirq(struct fsl_msi *msi, struct platform_device *dev,
 static const struct of_device_id fsl_of_msi_ids[];
 static int fsl_of_msi_probe(struct platform_device *dev)
 {
-	const struct of_device_id *match;
 	struct fsl_msi *msi;
 	struct resource res, msiir;
 	int err, i, j, irq_index, count;
@@ -402,10 +403,7 @@ static int fsl_of_msi_probe(struct platform_device *dev)
 	u32 offset;
 	struct pci_controller *phb;
 
-	match = of_match_device(fsl_of_msi_ids, &dev->dev);
-	if (!match)
-		return -EINVAL;
-	features = match->data;
+	features = device_get_match_data(&dev->dev);
 
 	printk(KERN_DEBUG "Setting up Freescale MSI support\n");

From 74726fda9fe306f848088ef73ec266cae0470d5b Mon Sep 17 00:00:00 2001
From: Christophe Leroy
Date: Sat, 7 Oct 2023 12:46:19 +0200
Subject: [PATCH 67/76] powerpc/code-patching: Perform hwsync in
 __patch_instruction() in case of failure

Commit c28c15b6d28a ("powerpc/code-patching: Use temporary mm for Radix
MMU") added a hwsync for when __patch_instruction() fails, which
results in a quite odd unbalanced logic.

Instead of calling mb() when __patch_instruction() returns an error,
call mb() in __patch_instruction()'s error path directly.

Signed-off-by: Christophe Leroy
Signed-off-by: Michael Ellerman
Link: https://msgid.link/e88b154eaf2efd9ff177d472d3411dcdec8ff4f5.1696675567.git.christophe.leroy@csgroup.eu
---
 arch/powerpc/lib/code-patching.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index b00112d7ad467..7a47a871e6b8e 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -38,6 +38,7 @@ static int __patch_instruction(u32 *exec_addr, ppc_inst_t instr, u32 *patch_addr
 	return 0;
 
 failed:
+	mb(); /* sync */
 	return -EPERM;
 }
 
@@ -309,10 +310,6 @@ static int __do_patch_instruction_mm(u32 *addr, ppc_inst_t instr)
 
 	err = __patch_instruction(addr, instr, patch_addr);
 
-	/* hwsync performed by __patch_instruction (sync) if successful */
-	if (err)
-		mb(); /* sync */
-
 	/* context synchronisation performed by __patch_instruction (isync or exception) */
 	stop_using_temp_mm(patching_mm, orig_mm);

From ca2b746d5f91a37f01baedff54b9315a50ee617d Mon Sep 17 00:00:00 2001
From: Minjie Du
Date: Mon, 17 Jul 2023 17:26:48 +0800
Subject: [PATCH 68/76] powerpc/pseries: use kfree_sensitive() in
 plpks_gen_password()

The password might contain private information, so it is better to use
kfree_sensitive() to free it. Use kfree_sensitive() in
plpks_gen_password().
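For background, kfree_sensitive() behaves roughly like the following
sketch (simplified; the real implementation in mm/slab_common.c works on
the allocated object size):

	/* Zero the buffer with a barrier the compiler cannot elide,
	 * then free it, so the secret never lingers in freed memory. */
	static void kfree_sensitive_sketch(const void *p)
	{
		if (p)
			memzero_explicit((void *)p, ksize(p));
		kfree(p);
	}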
Signed-off-by: Minjie Du Signed-off-by: Michael Ellerman Link: https://msgid.link/20230717092648.9752-1-duminjie@vivo.com --- arch/powerpc/platforms/pseries/plpks.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/pseries/plpks.c b/arch/powerpc/platforms/pseries/plpks.c index 2d40304eb6c16..febe18f251d0c 100644 --- a/arch/powerpc/platforms/pseries/plpks.c +++ b/arch/powerpc/platforms/pseries/plpks.c @@ -150,7 +150,7 @@ static int plpks_gen_password(void) ospasswordlength = maxpwsize; ospassword = kzalloc(maxpwsize, GFP_KERNEL); if (!ospassword) { - kfree(password); + kfree_sensitive(password); return -ENOMEM; } memcpy(ospassword, password, ospasswordlength); @@ -163,7 +163,7 @@ static int plpks_gen_password(void) } } out: - kfree(password); + kfree_sensitive(password); return pseries_status_to_err(rc); } From 3aa4a0edff95c440aed102cb4908c623fa26d866 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 20 Oct 2023 19:43:54 +0530 Subject: [PATCH 69/76] powerpc/code-patching: introduce patch_instructions() patch_instruction() entails setting up pte, patching the instruction, clearing the pte and flushing the tlb. If multiple instructions need to be patched, every instruction would have to go through the above drill unnecessarily. Instead, introduce patch_instructions() function that sets up the pte, clears the pte and flushes the tlb only once per page range of instructions to be patched. Duplicate most of the patch_instruction() code instead of merging with it, to avoid the performance degradation observed on ppc32, for patch_instruction(), with the code path merged. Also, setup poking_init() always as BPF expects poking_init() to be setup even when STRICT_KERNEL_RWX is off. Signed-off-by: Hari Bathini Acked-by: Song Liu Signed-off-by: Michael Ellerman Link: https://msgid.link/20231020141358.643575-2-hbathini@linux.ibm.com --- arch/powerpc/include/asm/code-patching.h | 1 + arch/powerpc/lib/code-patching.c | 141 ++++++++++++++++++++++- 2 files changed, 139 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/code-patching.h b/arch/powerpc/include/asm/code-patching.h index 3f881548fb61c..0e29ccf903d09 100644 --- a/arch/powerpc/include/asm/code-patching.h +++ b/arch/powerpc/include/asm/code-patching.h @@ -74,6 +74,7 @@ int create_cond_branch(ppc_inst_t *instr, const u32 *addr, int patch_branch(u32 *addr, unsigned long target, int flags); int patch_instruction(u32 *addr, ppc_inst_t instr); int raw_patch_instruction(u32 *addr, ppc_inst_t instr); +int patch_instructions(u32 *addr, u32 *code, size_t len, bool repeat_instr); static inline unsigned long patch_site_addr(s32 *site) { diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c index 7a47a871e6b8e..c6ab46156cda5 100644 --- a/arch/powerpc/lib/code-patching.c +++ b/arch/powerpc/lib/code-patching.c @@ -205,9 +205,6 @@ void __init poking_init(void) { int ret; - if (!IS_ENABLED(CONFIG_STRICT_KERNEL_RWX)) - return; - if (mm_patch_enabled()) ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "powerpc/text_poke_mm:online", @@ -375,6 +372,144 @@ int patch_instruction(u32 *addr, ppc_inst_t instr) } NOKPROBE_SYMBOL(patch_instruction); +static int __patch_instructions(u32 *patch_addr, u32 *code, size_t len, bool repeat_instr) +{ + unsigned long start = (unsigned long)patch_addr; + + /* Repeat instruction */ + if (repeat_instr) { + ppc_inst_t instr = ppc_inst_read(code); + + if (ppc_inst_prefixed(instr)) { + u64 val = ppc_inst_as_ulong(instr); + + memset64((u64 *)patch_addr, 
val, len / 8); + } else { + u32 val = ppc_inst_val(instr); + + memset32(patch_addr, val, len / 4); + } + } else { + memcpy(patch_addr, code, len); + } + + smp_wmb(); /* smp write barrier */ + flush_icache_range(start, start + len); + return 0; +} + +/* + * A page is mapped and instructions that fit the page are patched. + * Assumes 'len' to be (PAGE_SIZE - offset_in_page(addr)) or below. + */ +static int __do_patch_instructions_mm(u32 *addr, u32 *code, size_t len, bool repeat_instr) +{ + struct mm_struct *patching_mm, *orig_mm; + unsigned long pfn = get_patch_pfn(addr); + unsigned long text_poke_addr; + spinlock_t *ptl; + u32 *patch_addr; + pte_t *pte; + int err; + + patching_mm = __this_cpu_read(cpu_patching_context.mm); + text_poke_addr = __this_cpu_read(cpu_patching_context.addr); + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); + + pte = get_locked_pte(patching_mm, text_poke_addr, &ptl); + if (!pte) + return -ENOMEM; + + __set_pte_at(patching_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); + + /* order PTE update before use, also serves as the hwsync */ + asm volatile("ptesync" ::: "memory"); + + /* order context switch after arbitrary prior code */ + isync(); + + orig_mm = start_using_temp_mm(patching_mm); + + err = __patch_instructions(patch_addr, code, len, repeat_instr); + + /* context synchronisation performed by __patch_instructions */ + stop_using_temp_mm(patching_mm, orig_mm); + + pte_clear(patching_mm, text_poke_addr, pte); + /* + * ptesync to order PTE update before TLB invalidation done + * by radix__local_flush_tlb_page_psize (in _tlbiel_va) + */ + local_flush_tlb_page_psize(patching_mm, text_poke_addr, mmu_virtual_psize); + + pte_unmap_unlock(pte, ptl); + + return err; +} + +/* + * A page is mapped and instructions that fit the page are patched. + * Assumes 'len' to be (PAGE_SIZE - offset_in_page(addr)) or below. + */ +static int __do_patch_instructions(u32 *addr, u32 *code, size_t len, bool repeat_instr) +{ + unsigned long pfn = get_patch_pfn(addr); + unsigned long text_poke_addr; + u32 *patch_addr; + pte_t *pte; + int err; + + text_poke_addr = (unsigned long)__this_cpu_read(cpu_patching_context.addr) & PAGE_MASK; + patch_addr = (u32 *)(text_poke_addr + offset_in_page(addr)); + + pte = __this_cpu_read(cpu_patching_context.pte); + __set_pte_at(&init_mm, text_poke_addr, pte, pfn_pte(pfn, PAGE_KERNEL), 0); + /* See ptesync comment in radix__set_pte_at() */ + if (radix_enabled()) + asm volatile("ptesync" ::: "memory"); + + err = __patch_instructions(patch_addr, code, len, repeat_instr); + + pte_clear(&init_mm, text_poke_addr, pte); + flush_tlb_kernel_range(text_poke_addr, text_poke_addr + PAGE_SIZE); + + return err; +} + +/* + * Patch 'addr' with 'len' bytes of instructions from 'code'. + * + * If repeat_instr is true, the same instruction is filled for + * 'len' bytes. 
+ */ +int patch_instructions(u32 *addr, u32 *code, size_t len, bool repeat_instr) +{ + while (len > 0) { + unsigned long flags; + size_t plen; + int err; + + plen = min_t(size_t, PAGE_SIZE - offset_in_page(addr), len); + + local_irq_save(flags); + if (mm_patch_enabled()) + err = __do_patch_instructions_mm(addr, code, plen, repeat_instr); + else + err = __do_patch_instructions(addr, code, plen, repeat_instr); + local_irq_restore(flags); + if (err) + return err; + + len -= plen; + addr = (u32 *)((unsigned long)addr + plen); + if (!repeat_instr) + code = (u32 *)((unsigned long)code + plen); + } + + return 0; +} +NOKPROBE_SYMBOL(patch_instructions); + int patch_branch(u32 *addr, unsigned long target, int flags) { ppc_inst_t instr; From c4e3ca071be04cb56ee012d19a817082cff71fcb Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 20 Oct 2023 19:43:55 +0530 Subject: [PATCH 70/76] powerpc/bpf: implement bpf_arch_text_copy bpf_arch_text_copy is used to dump JITed binary to RX page, allowing multiple BPF programs to share the same page. Use the newly introduced patch_instructions() to implement it. Signed-off-by: Hari Bathini Acked-by: Song Liu Signed-off-by: Michael Ellerman Link: https://msgid.link/20231020141358.643575-3-hbathini@linux.ibm.com --- arch/powerpc/net/bpf_jit_comp.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 471e37cf8e2a4..c21cb88b09e43 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -13,9 +13,13 @@ #include #include #include -#include +#include +#include #include +#include +#include + #include "bpf_jit.h" static void bpf_jit_fill_ill_insns(void *area, unsigned int size) @@ -274,3 +278,17 @@ int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, int pass, struct code ctx->exentry_idx++; return 0; } + +void *bpf_arch_text_copy(void *dst, void *src, size_t len) +{ + int err; + + if (WARN_ON_ONCE(core_kernel_text((unsigned long)dst))) + return ERR_PTR(-EINVAL); + + mutex_lock(&text_mutex); + err = patch_instructions(dst, src, len, false); + mutex_unlock(&text_mutex); + + return err ? ERR_PTR(err) : dst; +} From 46aae75aea6a948b7f2fccad6a0dc540bfede042 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 20 Oct 2023 19:43:56 +0530 Subject: [PATCH 71/76] powerpc/bpf: implement bpf_arch_text_invalidate for bpf_prog_pack Implement bpf_arch_text_invalidate and use it to fill unused part of the bpf_prog_pack with trap instructions when a BPF program is freed. Signed-off-by: Hari Bathini Acked-by: Song Liu Signed-off-by: Michael Ellerman Link: https://msgid.link/20231020141358.643575-4-hbathini@linux.ibm.com --- arch/powerpc/net/bpf_jit_comp.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index c21cb88b09e43..0a5a9758903a2 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -292,3 +292,18 @@ void *bpf_arch_text_copy(void *dst, void *src, size_t len) return err ? 
ERR_PTR(err) : dst; } + +int bpf_arch_text_invalidate(void *dst, size_t len) +{ + u32 insn = BREAKPOINT_INSTRUCTION; + int ret; + + if (WARN_ON_ONCE(core_kernel_text((unsigned long)dst))) + return -EINVAL; + + mutex_lock(&text_mutex); + ret = patch_instructions(dst, &insn, len, true); + mutex_unlock(&text_mutex); + + return ret; +} From e068db361ed2752ac5550b65e43250ce8ea3e714 Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 20 Oct 2023 19:43:57 +0530 Subject: [PATCH 72/76] powerpc/bpf: rename powerpc64_jit_data to powerpc_jit_data powerpc64_jit_data is a misnomer as it is meant for both ppc32 and ppc64. Rename it to powerpc_jit_data. Signed-off-by: Hari Bathini Acked-by: Song Liu Signed-off-by: Michael Ellerman Link: https://msgid.link/20231020141358.643575-5-hbathini@linux.ibm.com --- arch/powerpc/net/bpf_jit_comp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 0a5a9758903a2..7a01bbc2513a5 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -43,7 +43,7 @@ int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg, return 0; } -struct powerpc64_jit_data { +struct powerpc_jit_data { struct bpf_binary_header *header; u32 *addrs; u8 *image; @@ -63,7 +63,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) u8 *image = NULL; u32 *code_base; u32 *addrs; - struct powerpc64_jit_data *jit_data; + struct powerpc_jit_data *jit_data; struct codegen_context cgctx; int pass; int flen; From 43064d839ff51c6060c9d80171df2ce3e0b3594c Mon Sep 17 00:00:00 2001 From: Hari Bathini Date: Fri, 20 Oct 2023 19:43:58 +0530 Subject: [PATCH 73/76] powerpc/bpf: use bpf_jit_binary_pack_[alloc|finalize|free] Use bpf_jit_binary_pack_alloc in powerpc jit. The jit engine first writes the program to the rw buffer. When the jit is done, the program is copied to the final location with bpf_jit_binary_pack_finalize. With multiple jit_subprogs, bpf_jit_free is called on some subprograms that haven't got bpf_jit_binary_pack_finalize() yet. Implement custom bpf_jit_free() like in commit 1d5f82d9dd47 ("bpf, x86: fix freeing of not-finalized bpf_prog_pack") to call bpf_jit_binary_pack_finalize(), if necessary. As bpf_flush_icache() is not needed anymore, remove it. Signed-off-by: Hari Bathini Acked-by: Song Liu Signed-off-by: Michael Ellerman Link: https://msgid.link/20231020141358.643575-6-hbathini@linux.ibm.com --- arch/powerpc/net/bpf_jit.h | 18 ++--- arch/powerpc/net/bpf_jit_comp.c | 106 ++++++++++++++++++++++-------- arch/powerpc/net/bpf_jit_comp32.c | 13 ++-- arch/powerpc/net/bpf_jit_comp64.c | 10 +-- 4 files changed, 96 insertions(+), 51 deletions(-) diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h index 72b7bb34fadea..cdea5dccaefe7 100644 --- a/arch/powerpc/net/bpf_jit.h +++ b/arch/powerpc/net/bpf_jit.h @@ -36,9 +36,6 @@ EMIT(PPC_RAW_BRANCH(offset)); \ } while (0) -/* bl (unconditional 'branch' with link) */ -#define PPC_BL(dest) EMIT(PPC_RAW_BL((dest) - (unsigned long)(image + ctx->idx))) - /* "cond" here covers BO:BI fields. 
*/ #define PPC_BCC_SHORT(cond, dest) \ do { \ @@ -147,12 +144,6 @@ struct codegen_context { #define BPF_FIXUP_LEN 2 /* Two instructions => 8 bytes */ #endif -static inline void bpf_flush_icache(void *start, void *end) -{ - smp_wmb(); /* smp write barrier */ - flush_icache_range((unsigned long)start, (unsigned long)end); -} - static inline bool bpf_is_seen_register(struct codegen_context *ctx, int i) { return ctx->seen & (1 << (31 - i)); @@ -169,16 +160,17 @@ static inline void bpf_clear_seen_register(struct codegen_context *ctx, int i) } void bpf_jit_init_reg_mapping(struct codegen_context *ctx); -int bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func); -int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, +int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context *ctx, u64 func); +int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct codegen_context *ctx, u32 *addrs, int pass, bool extra_pass); void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx); void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx); void bpf_jit_realloc_regs(struct codegen_context *ctx); int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg, long exit_addr); -int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, int pass, struct codegen_context *ctx, - int insn_idx, int jmp_off, int dst_reg); +int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, u32 *fimage, int pass, + struct codegen_context *ctx, int insn_idx, + int jmp_off, int dst_reg); #endif diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 7a01bbc2513a5..0f9a217833296 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -44,9 +44,12 @@ int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx, int tmp_reg, } struct powerpc_jit_data { - struct bpf_binary_header *header; + /* address of rw header */ + struct bpf_binary_header *hdr; + /* address of ro final header */ + struct bpf_binary_header *fhdr; u32 *addrs; - u8 *image; + u8 *fimage; u32 proglen; struct codegen_context ctx; }; @@ -67,11 +70,14 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct codegen_context cgctx; int pass; int flen; - struct bpf_binary_header *bpf_hdr; + struct bpf_binary_header *fhdr = NULL; + struct bpf_binary_header *hdr = NULL; struct bpf_prog *org_fp = fp; struct bpf_prog *tmp_fp; bool bpf_blinded = false; bool extra_pass = false; + u8 *fimage = NULL; + u32 *fcode_base; u32 extable_len; u32 fixup_len; @@ -101,9 +107,16 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) addrs = jit_data->addrs; if (addrs) { cgctx = jit_data->ctx; - image = jit_data->image; - bpf_hdr = jit_data->header; + /* + * JIT compiled to a writable location (image/code_base) first. + * It is then moved to the readonly final location (fimage/fcode_base) + * using instruction patching. 
+ */ + fimage = jit_data->fimage; + fhdr = jit_data->fhdr; proglen = jit_data->proglen; + hdr = jit_data->hdr; + image = (void *)hdr + ((void *)fimage - (void *)fhdr); extra_pass = true; /* During extra pass, ensure index is reset before repopulating extable entries */ cgctx.exentry_idx = 0; @@ -123,7 +136,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) cgctx.stack_size = round_up(fp->aux->stack_depth, 16); /* Scouting faux-generate pass 0 */ - if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0, false)) { + if (bpf_jit_build_body(fp, NULL, NULL, &cgctx, addrs, 0, false)) { /* We hit something illegal or unsupported. */ fp = org_fp; goto out_addrs; @@ -138,7 +151,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) */ if (cgctx.seen & SEEN_TAILCALL || !is_offset_in_branch_range((long)cgctx.idx * 4)) { cgctx.idx = 0; - if (bpf_jit_build_body(fp, 0, &cgctx, addrs, 0, false)) { + if (bpf_jit_build_body(fp, NULL, NULL, &cgctx, addrs, 0, false)) { fp = org_fp; goto out_addrs; } @@ -160,17 +173,19 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) proglen = cgctx.idx * 4; alloclen = proglen + FUNCTION_DESCR_SIZE + fixup_len + extable_len; - bpf_hdr = bpf_jit_binary_alloc(alloclen, &image, 4, bpf_jit_fill_ill_insns); - if (!bpf_hdr) { + fhdr = bpf_jit_binary_pack_alloc(alloclen, &fimage, 4, &hdr, &image, + bpf_jit_fill_ill_insns); + if (!fhdr) { fp = org_fp; goto out_addrs; } if (extable_len) - fp->aux->extable = (void *)image + FUNCTION_DESCR_SIZE + proglen + fixup_len; + fp->aux->extable = (void *)fimage + FUNCTION_DESCR_SIZE + proglen + fixup_len; skip_init_ctx: code_base = (u32 *)(image + FUNCTION_DESCR_SIZE); + fcode_base = (u32 *)(fimage + FUNCTION_DESCR_SIZE); /* Code generation passes 1-2 */ for (pass = 1; pass < 3; pass++) { @@ -178,8 +193,10 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) cgctx.idx = 0; cgctx.alt_exit_addr = 0; bpf_jit_build_prologue(code_base, &cgctx); - if (bpf_jit_build_body(fp, code_base, &cgctx, addrs, pass, extra_pass)) { - bpf_jit_binary_free(bpf_hdr); + if (bpf_jit_build_body(fp, code_base, fcode_base, &cgctx, addrs, pass, + extra_pass)) { + bpf_arch_text_copy(&fhdr->size, &hdr->size, sizeof(hdr->size)); + bpf_jit_binary_pack_free(fhdr, hdr); fp = org_fp; goto out_addrs; } @@ -199,17 +216,19 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) #ifdef CONFIG_PPC64_ELF_ABI_V1 /* Function descriptor nastiness: Address + TOC */ - ((u64 *)image)[0] = (u64)code_base; + ((u64 *)image)[0] = (u64)fcode_base; ((u64 *)image)[1] = local_paca->kernel_toc; #endif - fp->bpf_func = (void *)image; + fp->bpf_func = (void *)fimage; fp->jited = 1; fp->jited_len = proglen + FUNCTION_DESCR_SIZE; - bpf_flush_icache(bpf_hdr, (u8 *)bpf_hdr + bpf_hdr->size); if (!fp->is_func || extra_pass) { - bpf_jit_binary_lock_ro(bpf_hdr); + if (bpf_jit_binary_pack_finalize(fp, fhdr, hdr)) { + fp = org_fp; + goto out_addrs; + } bpf_prog_fill_jited_linfo(fp, addrs); out_addrs: kfree(addrs); @@ -219,8 +238,9 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) jit_data->addrs = addrs; jit_data->ctx = cgctx; jit_data->proglen = proglen; - jit_data->image = image; - jit_data->header = bpf_hdr; + jit_data->fimage = fimage; + jit_data->fhdr = fhdr; + jit_data->hdr = hdr; } out: @@ -234,12 +254,13 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) * The caller should check for (BPF_MODE(code) == BPF_PROBE_MEM) before calling * this function, as this only applies to BPF_PROBE_MEM, for now. 
*/ -int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, int pass, struct codegen_context *ctx, - int insn_idx, int jmp_off, int dst_reg) +int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, u32 *fimage, int pass, + struct codegen_context *ctx, int insn_idx, int jmp_off, + int dst_reg) { off_t offset; unsigned long pc; - struct exception_table_entry *ex; + struct exception_table_entry *ex, *ex_entry; u32 *fixup; /* Populate extable entries only in the last pass */ @@ -250,9 +271,16 @@ int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, int pass, struct code WARN_ON_ONCE(ctx->exentry_idx >= fp->aux->num_exentries)) return -EINVAL; + /* + * Program is first written to image before copying to the + * final location (fimage). Accordingly, update in the image first. + * As all offsets used are relative, copying as is to the + * final location should be alright. + */ pc = (unsigned long)&image[insn_idx]; + ex = (void *)fp->aux->extable - (void *)fimage + (void *)image; - fixup = (void *)fp->aux->extable - + fixup = (void *)ex - (fp->aux->num_exentries * BPF_FIXUP_LEN * 4) + (ctx->exentry_idx * BPF_FIXUP_LEN * 4); @@ -263,17 +291,17 @@ int bpf_add_extable_entry(struct bpf_prog *fp, u32 *image, int pass, struct code fixup[BPF_FIXUP_LEN - 1] = PPC_RAW_BRANCH((long)(pc + jmp_off) - (long)&fixup[BPF_FIXUP_LEN - 1]); - ex = &fp->aux->extable[ctx->exentry_idx]; + ex_entry = &ex[ctx->exentry_idx]; - offset = pc - (long)&ex->insn; + offset = pc - (long)&ex_entry->insn; if (WARN_ON_ONCE(offset >= 0 || offset < INT_MIN)) return -ERANGE; - ex->insn = offset; + ex_entry->insn = offset; - offset = (long)fixup - (long)&ex->fixup; + offset = (long)fixup - (long)&ex_entry->fixup; if (WARN_ON_ONCE(offset >= 0 || offset < INT_MIN)) return -ERANGE; - ex->fixup = offset; + ex_entry->fixup = offset; ctx->exentry_idx++; return 0; @@ -307,3 +335,27 @@ int bpf_arch_text_invalidate(void *dst, size_t len) return ret; } + +void bpf_jit_free(struct bpf_prog *fp) +{ + if (fp->jited) { + struct powerpc_jit_data *jit_data = fp->aux->jit_data; + struct bpf_binary_header *hdr; + + /* + * If we fail the final pass of JIT (from jit_subprogs), + * the program may not be finalized yet. Call finalize here + * before freeing it. 
+ */ + if (jit_data) { + bpf_jit_binary_pack_finalize(fp, jit_data->fhdr, jit_data->hdr); + kvfree(jit_data->addrs); + kfree(jit_data); + } + hdr = bpf_jit_binary_pack_hdr(fp); + bpf_jit_binary_pack_free(hdr, NULL); + WARN_ON_ONCE(!bpf_prog_kallsyms_verify_off(fp)); + } + + bpf_prog_unlock_free(fp); +} diff --git a/arch/powerpc/net/bpf_jit_comp32.c b/arch/powerpc/net/bpf_jit_comp32.c index bc7f92ec7f2d7..2f39c50ca729e 100644 --- a/arch/powerpc/net/bpf_jit_comp32.c +++ b/arch/powerpc/net/bpf_jit_comp32.c @@ -200,12 +200,13 @@ void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx) EMIT(PPC_RAW_BLR()); } -int bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func) +/* Relative offset needs to be calculated based on final image location */ +int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context *ctx, u64 func) { - s32 rel = (s32)func - (s32)(image + ctx->idx); + s32 rel = (s32)func - (s32)(fimage + ctx->idx); if (image && rel < 0x2000000 && rel >= -0x2000000) { - PPC_BL(func); + EMIT(PPC_RAW_BL(rel)); } else { /* Load function address into r0 */ EMIT(PPC_RAW_LIS(_R0, IMM_H(func))); @@ -278,7 +279,7 @@ static int bpf_jit_emit_tail_call(u32 *image, struct codegen_context *ctx, u32 o } /* Assemble the body code between the prologue & epilogue */ -int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, +int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct codegen_context *ctx, u32 *addrs, int pass, bool extra_pass) { const struct bpf_insn *insn = fp->insnsi; @@ -997,7 +998,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * jmp_off += 4; } - ret = bpf_add_extable_entry(fp, image, pass, ctx, insn_idx, + ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx, insn_idx, jmp_off, dst_reg); if (ret) return ret; @@ -1053,7 +1054,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * EMIT(PPC_RAW_STW(bpf_to_ppc(BPF_REG_5), _R1, 12)); } - ret = bpf_jit_emit_func_call_rel(image, ctx, func_addr); + ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, func_addr); if (ret) return ret; diff --git a/arch/powerpc/net/bpf_jit_comp64.c b/arch/powerpc/net/bpf_jit_comp64.c index 0f8048f6dad63..79f23974a3204 100644 --- a/arch/powerpc/net/bpf_jit_comp64.c +++ b/arch/powerpc/net/bpf_jit_comp64.c @@ -240,7 +240,7 @@ static int bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, u return 0; } -int bpf_jit_emit_func_call_rel(u32 *image, struct codegen_context *ctx, u64 func) +int bpf_jit_emit_func_call_rel(u32 *image, u32 *fimage, struct codegen_context *ctx, u64 func) { unsigned int i, ctx_idx = ctx->idx; @@ -361,7 +361,7 @@ asm ( ); /* Assemble the body code between the prologue & epilogue */ -int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context *ctx, +int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, u32 *fimage, struct codegen_context *ctx, u32 *addrs, int pass, bool extra_pass) { enum stf_barrier_type stf_barrier = stf_barrier_type_get(); @@ -940,8 +940,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * addrs[++i] = ctx->idx * 4; if (BPF_MODE(code) == BPF_PROBE_MEM) { - ret = bpf_add_extable_entry(fp, image, pass, ctx, ctx->idx - 1, - 4, dst_reg); + ret = bpf_add_extable_entry(fp, image, fimage, pass, ctx, + ctx->idx - 1, 4, dst_reg); if (ret) return ret; } @@ -995,7 +995,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, struct codegen_context * if 
(func_addr_fixed)
 			ret = bpf_jit_emit_func_call_hlp(image, ctx, func_addr);
 		else
-			ret = bpf_jit_emit_func_call_rel(image, ctx, func_addr);
+			ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, func_addr);
 
 		if (ret)
 			return ret;

From ec91227e0244199939a6b6e7bd7d67a6170ee771 Mon Sep 17 00:00:00 2001
From: Sourabh Jain
Date: Sat, 21 Oct 2023 23:47:31 +0530
Subject: [PATCH 74/76] powerpc: make fadump resilient with memory add/remove
 events

Due to changes in memory resources caused by either memory hotplug or
online/offline events, the elfcorehdr, which describes the CPUs and
memory of the crashed kernel to the kernel that collects the dump
(known as second/fadump kernel), becomes outdated. Consequently,
attempting dump collection with an outdated elfcorehdr can lead to
failed or inaccurate dump collection.

Memory hotplug and online/offline events are referred to as memory
add/remove events in the rest of this commit message.

The current solution to address the aforementioned issue is as follows:
Monitor memory add/remove events in userspace using udev rules, and
re-register fadump whenever there are changes in memory resources. This
leads to the creation of a new elfcorehdr with updated system memory
information.

There are several notable issues associated with re-registering fadump
for every memory add/remove event.

1. Bulk memory add/remove events with udev-based fadump
   re-registration can lead to race conditions and, more importantly,
   create a wide window during which fadump is inactive until all
   memory add/remove events are settled.

2. Re-registering fadump for every memory add/remove event is
   inefficient.

3. The memory for elfcorehdr is allocated based on the memblock
   regions available during early boot and remains fixed thereafter.
   However, if the elfcorehdr is later recreated with additional
   memblock regions, its size will increase, potentially leading to
   memory corruption.

Address the aforementioned challenges by shifting the creation of the
elfcorehdr from the first kernel (also referred to as the crashed
kernel), where it was created and frequently recreated for every memory
add/remove event, to the fadump kernel. As a result, the elfcorehdr
only needs to be created once, thus eliminating the necessity to
re-register fadump during memory add/remove events.

At present, the first kernel prepares the elfcorehdr and stores it in
the fadump reserved area. Subsequently, it includes the start address
of the elfcorehdr in the elfcorehdr_addr attribute of the fadump
header. The fadump header holds information like the elfcorehdr
address, crashing CPU details, etc. The first kernel prepares the
fadump header and stores it in the fadump reserved area.

In the event of a first kernel crash, the second kernel boots, accesses
the fadump header prepared by the first kernel, and does the following
in a platform-specific function [rtas|opal]_fadump_process:

1. Sanity check the fadump header
2. Update CPU notes in the elfcorehdr
3. Set the global variable elfcorehdr_addr to the address of the
   fadump header's elfcorehdr, for the vmcore module to process it
   later on.

Along with the above, update setup_fadump()/fadump.c to create the
elfcorehdr in the second/fadump kernel.

The section below outlines the information required to create the
elfcorehdr and the changes made to make it available to the fadump
kernel where it is not already.

To create the elfcorehdr, the following crashed kernel information is
required: CPU notes, vmcoreinfo, and memory ranges.
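In outline, the dump processing added to the second/fadump kernel then
proceeds as below. This is a simplified sketch of the process_fadump()
flow introduced in the diff (all names are from this patch; error
handling omitted), not verbatim code:

	fdh = __va(fw_dump.fadumphdr_addr);
	version = get_fadump_crash_header_version(fdh);
	if (version >= 1) {
		/* Crashed kernel is a 'new kernel': create elfcorehdr now */
		fadump_setup_elfcorehdr_buf(fdh);
		fadump_populate_elfcorehdr(fdh);
	}
	/* Platform hook ([rtas|opal]_fadump_process) updates CPU notes */
	fw_dump.ops->fadump_process(&fw_dump);
	/* Let the vmcore module export it through /proc/vmcore */
	elfcorehdr_addr = fdh->elfcorehdr_addr;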
At present, the CPU notes are already prepared in the fadump kernel, so
no changes are needed in that regard. The fadump kernel has access to
all crashed kernel memory regions, including boot memory regions that
are relocated by firmware to the fadump reserved area, so no changes
are needed there either. However, it is necessary to add new members to
the fadump header, i.e., the 'fadump_crash_info_header' structure, in
order to pass the crashed kernel's vmcoreinfo address and size to the
fadump kernel.

Table 1 below illustrates the kernel's ability to collect a dump if
either the first/crashed kernel or the second/fadump kernel does not
have the changes introduced here. Consider 'old kernel' to be the
kernel without this patch, and 'new kernel' to be the kernel with this
patch included.

+----------+------------------------+------------------------+-------+
| scenario | first/crashed kernel   | second/fadump kernel   | Dump  |
+----------+------------------------+------------------------+-------+
|    1     |       old kernel       |       new kernel       |  Yes  |
+----------+------------------------+------------------------+-------+
|    2     |       new kernel       |       old kernel       |  No   |
+----------+------------------------+------------------------+-------+
                               Table 1

Scenario 1:
-----------
Since the magic number of the fadump header is updated, the second
kernel can determine whether the crashed kernel is a 'new kernel' or an
'old kernel' and act accordingly. In this scenario, since the crashed
kernel is an 'old kernel', the fadump kernel skips elfcorehdr creation
and uses the one prepared in the first kernel itself to collect the
dump.

Scenario 2:
-----------
An 'old kernel' acting as the fadump kernel is NOT capable of
processing a fadump header with the updated magic number from a 'new
kernel', so it gracefully fails with the below error and dump
collection fails in this scenario.

[    0.007365] rtas fadump: Crash info header is not valid.

Add a version field to the fadump_crash_info_header structure to avoid
the need to change its magic number in the future. Adding a version
field to the fadump header was one of the TODO items listed in
Documentation/powerpc/firmware-assisted-dump.rst.

To: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Sourabh Jain
Signed-off-by: Michael Ellerman
Link: https://msgid.link/20231021181733.204311-2-sourabhjain@linux.ibm.com
---
 arch/powerpc/include/asm/fadump-internal.h   |  24 +-
 arch/powerpc/kernel/fadump.c                 | 361 +++++++++++--------
 arch/powerpc/platforms/powernv/opal-fadump.c |  18 +-
 arch/powerpc/platforms/pseries/rtas-fadump.c |  23 +-
 4 files changed, 242 insertions(+), 184 deletions(-)

diff --git a/arch/powerpc/include/asm/fadump-internal.h b/arch/powerpc/include/asm/fadump-internal.h
index 27f9e11eda28c..7be3d8894520d 100644
--- a/arch/powerpc/include/asm/fadump-internal.h
+++ b/arch/powerpc/include/asm/fadump-internal.h
@@ -42,7 +42,25 @@ static inline u64 fadump_str_to_u64(const char *str)
 
 #define FADUMP_CPU_UNKNOWN		(~((u32)0))
 
-#define FADUMP_CRASH_INFO_MAGIC		fadump_str_to_u64("FADMPINF")
+/*
+ * The introduction of new fields in the fadump crash info header has
+ * led to a change in the magic key, from `FADMPINF` to `FADMPSIG`.
+ * This alteration ensures backward compatibility, enabling the kernel
+ * with the updated fadump crash info to handle kernel dumps from older
+ * kernels.
+ *
+ * To prevent the need for further changes to the magic number in the
+ * event of future modifications to the fadump header, a version field
+ * has been introduced to track the fadump crash info header version.
+ * + * Historically, there was no connection between the magic number and + * the fadump crash info header version. However, moving forward, the + * `FADMPINF` magic number in header will be treated as version 0, while + * the `FADMPSIG` magic number in header will include a version field to + * determine its version. + */ +#define FADUMP_CRASH_INFO_MAGIC fadump_str_to_u64("FADMPSIG") +#define FADUMP_VERSION 1 /* fadump crash info structure */ struct fadump_crash_info_header { @@ -51,6 +69,10 @@ struct fadump_crash_info_header { u32 crashing_cpu; struct pt_regs regs; struct cpumask cpu_mask; + u32 version; + u64 elfcorehdr_size; + u64 vmcoreinfo_raddr; + u64 vmcoreinfo_size; }; struct fadump_memory_range { diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c index 3ff2da7b120b5..32d5321361138 100644 --- a/arch/powerpc/kernel/fadump.c +++ b/arch/powerpc/kernel/fadump.c @@ -53,8 +53,6 @@ static struct kobject *fadump_kobj; static atomic_t cpus_in_fadump; static DEFINE_MUTEX(fadump_mutex); -static struct fadump_mrange_info crash_mrange_info = { "crash", NULL, 0, 0, 0, false }; - #define RESERVED_RNGS_SZ 16384 /* 16K - 128 entries */ #define RESERVED_RNGS_CNT (RESERVED_RNGS_SZ / \ sizeof(struct fadump_memory_range)) @@ -373,12 +371,6 @@ static unsigned long __init get_fadump_area_size(void) size = PAGE_ALIGN(size); size += fw_dump.boot_memory_size; size += sizeof(struct fadump_crash_info_header); - size += sizeof(struct elfhdr); /* ELF core header.*/ - size += sizeof(struct elf_phdr); /* place holder for cpu notes */ - /* Program headers for crash memory regions. */ - size += sizeof(struct elf_phdr) * (memblock_num_regions(memory) + 2); - - size = PAGE_ALIGN(size); /* This is to hold kernel metadata on platforms that support it */ size += (fw_dump.ops->fadump_get_metadata_size ? @@ -931,36 +923,6 @@ static inline int fadump_add_mem_range(struct fadump_mrange_info *mrange_info, return 0; } -static int fadump_exclude_reserved_area(u64 start, u64 end) -{ - u64 ra_start, ra_end; - int ret = 0; - - ra_start = fw_dump.reserve_dump_area_start; - ra_end = ra_start + fw_dump.reserve_dump_area_size; - - if ((ra_start < end) && (ra_end > start)) { - if ((start < ra_start) && (end > ra_end)) { - ret = fadump_add_mem_range(&crash_mrange_info, - start, ra_start); - if (ret) - return ret; - - ret = fadump_add_mem_range(&crash_mrange_info, - ra_end, end); - } else if (start < ra_start) { - ret = fadump_add_mem_range(&crash_mrange_info, - start, ra_start); - } else if (ra_end < end) { - ret = fadump_add_mem_range(&crash_mrange_info, - ra_end, end); - } - } else - ret = fadump_add_mem_range(&crash_mrange_info, start, end); - - return ret; -} - static int fadump_init_elfcore_header(char *bufp) { struct elfhdr *elf; @@ -997,52 +959,6 @@ static int fadump_init_elfcore_header(char *bufp) return 0; } -/* - * Traverse through memblock structure and setup crash memory ranges. These - * ranges will be used create PT_LOAD program headers in elfcore header. - */ -static int fadump_setup_crash_memory_ranges(void) -{ - u64 i, start, end; - int ret; - - pr_debug("Setup crash memory ranges.\n"); - crash_mrange_info.mem_range_cnt = 0; - - /* - * Boot memory region(s) registered with firmware are moved to - * different location at the time of crash. Create separate program - * header(s) for this memory chunk(s) with the correct offset. 
- */ - for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { - start = fw_dump.boot_mem_addr[i]; - end = start + fw_dump.boot_mem_sz[i]; - ret = fadump_add_mem_range(&crash_mrange_info, start, end); - if (ret) - return ret; - } - - for_each_mem_range(i, &start, &end) { - /* - * skip the memory chunk that is already added - * (0 through boot_memory_top). - */ - if (start < fw_dump.boot_mem_top) { - if (end > fw_dump.boot_mem_top) - start = fw_dump.boot_mem_top; - else - continue; - } - - /* add this range excluding the reserved dump area. */ - ret = fadump_exclude_reserved_area(start, end); - if (ret) - return ret; - } - - return 0; -} - /* * If the given physical address falls within the boot memory region then * return the relocated address that points to the dump region reserved @@ -1073,13 +989,28 @@ static inline unsigned long fadump_relocate(unsigned long paddr) return raddr; } -static int fadump_create_elfcore_headers(char *bufp) +static void populate_elf_pt_load(struct elf_phdr *phdr, u64 start, + u64 size, unsigned long long offset) { - unsigned long long raddr, offset; - struct elf_phdr *phdr; + phdr->p_align = 0; + phdr->p_memsz = size; + phdr->p_filesz = size; + phdr->p_paddr = start; + phdr->p_offset = offset; + phdr->p_type = PT_LOAD; + phdr->p_flags = PF_R|PF_W|PF_X; + phdr->p_vaddr = (unsigned long)__va(start); +} + +static void __init fadump_populate_elfcorehdr(struct fadump_crash_info_header *fdh) +{ + char *bufp; struct elfhdr *elf; - int i, j; + struct elf_phdr *phdr; + u64 boot_mem_dest_offset; + unsigned long long i, ra_start, ra_end, ra_size, mstart, mend; + bufp = (char *) __va(fdh->elfcorehdr_addr); fadump_init_elfcore_header(bufp); elf = (struct elfhdr *)bufp; bufp += sizeof(struct elfhdr); @@ -1113,54 +1044,68 @@ static int fadump_create_elfcore_headers(char *bufp) phdr->p_vaddr = 0; phdr->p_align = 0; - phdr->p_paddr = fadump_relocate(paddr_vmcoreinfo_note()); - phdr->p_offset = phdr->p_paddr; - phdr->p_memsz = phdr->p_filesz = VMCOREINFO_NOTE_SIZE; + phdr->p_paddr = phdr->p_offset = fdh->vmcoreinfo_raddr; + phdr->p_memsz = phdr->p_filesz = fdh->vmcoreinfo_size; /* Increment number of program headers. */ (elf->e_phnum)++; - /* setup PT_LOAD sections. */ - j = 0; - offset = 0; - raddr = fw_dump.boot_mem_addr[0]; - for (i = 0; i < crash_mrange_info.mem_range_cnt; i++) { - u64 mbase, msize; - - mbase = crash_mrange_info.mem_ranges[i].base; - msize = crash_mrange_info.mem_ranges[i].size; - if (!msize) - continue; - + /* + * setup PT_LOAD sections. + * first include boot memory regions and then add rest of the memory + * regions + */ + boot_mem_dest_offset = fw_dump.boot_mem_dest_addr; + for (i = 0; i < fw_dump.boot_mem_regs_cnt; i++) { phdr = (struct elf_phdr *)bufp; bufp += sizeof(struct elf_phdr); - phdr->p_type = PT_LOAD; - phdr->p_flags = PF_R|PF_W|PF_X; - phdr->p_offset = mbase; - - if (mbase == raddr) { - /* - * The entire real memory region will be moved by - * firmware to the specified destination_address. - * Hence set the correct offset. - */ - phdr->p_offset = fw_dump.boot_mem_dest_addr + offset; - if (j < (fw_dump.boot_mem_regs_cnt - 1)) { - offset += fw_dump.boot_mem_sz[j]; - raddr = fw_dump.boot_mem_addr[++j]; - } + populate_elf_pt_load(phdr, fw_dump.boot_mem_addr[i], + fw_dump.boot_mem_sz[i], + boot_mem_dest_offset); + /* Increment number of program headers. 
*/ + (elf->e_phnum)++; + boot_mem_dest_offset += fw_dump.boot_mem_sz[i]; + } + + /* Memory reserved for fadump in first kernel */ + ra_start = fw_dump.reserve_dump_area_start; + ra_size = get_fadump_area_size(); + ra_end = ra_start + ra_size; + + phdr = (struct elf_phdr *)bufp; + for_each_mem_range(i, &mstart, &mend) { + /* + * Skip the memory regions that already added + * to elfcorehdr. + */ + if (mstart < fw_dump.boot_mem_top) { + if (mend > fw_dump.boot_mem_top) + mstart = fw_dump.boot_mem_top; + else + continue; } - phdr->p_paddr = mbase; - phdr->p_vaddr = (unsigned long)__va(mbase); - phdr->p_filesz = msize; - phdr->p_memsz = msize; - phdr->p_align = 0; + /* Handle memblock regions overlaps with reserved area */ + if ((ra_start < mend) && (ra_end > mstart)) { + if ((mstart < ra_start) && (mend > ra_end)) { + populate_elf_pt_load(phdr, mstart, ra_start - mstart, mstart); + elf->e_phnum++; + bufp += sizeof(struct elf_phdr); + phdr = (struct elf_phdr *)bufp; + populate_elf_pt_load(phdr, ra_end, mend - ra_end, ra_end); + } else if (mstart < ra_start) { + populate_elf_pt_load(phdr, mstart, ra_start - mstart, mstart); + } else if (ra_end < mend) { + populate_elf_pt_load(phdr, ra_end, mend - ra_end, ra_end); + } + } else { + populate_elf_pt_load(phdr, mstart, mend - mstart, mstart); + } - /* Increment number of program headers. */ - (elf->e_phnum)++; + elf->e_phnum++; + bufp += sizeof(struct elf_phdr); + phdr = (struct elf_phdr *) bufp; } - return 0; } static unsigned long init_fadump_header(unsigned long addr) @@ -1175,9 +1120,16 @@ static unsigned long init_fadump_header(unsigned long addr) memset(fdh, 0, sizeof(struct fadump_crash_info_header)); fdh->magic_number = FADUMP_CRASH_INFO_MAGIC; - fdh->elfcorehdr_addr = addr; + /* Elfcorehdr will now be prepared in the second kernel. */ + fdh->elfcorehdr_addr = 0; /* We will set the crashing cpu id in crash_fadump() during crash. */ fdh->crashing_cpu = FADUMP_CPU_UNKNOWN; + fdh->version = FADUMP_VERSION; + + /* Need below vmcoreinfo in second kernel to prepare elfcorehdr. */ + fdh->vmcoreinfo_raddr = fadump_relocate(paddr_vmcoreinfo_note()); + fdh->vmcoreinfo_size = VMCOREINFO_NOTE_SIZE; + /* * When LPAR is terminated by PYHP, ensure all possible CPUs' * register data is processed while exporting the vmcore. @@ -1190,8 +1142,6 @@ static unsigned long init_fadump_header(unsigned long addr) static int register_fadump(void) { unsigned long addr; - void *vaddr; - int ret; /* * If no memory is reserved then we can not register for firmware- @@ -1200,18 +1150,10 @@ static int register_fadump(void) if (!fw_dump.reserve_dump_area_size) return -ENODEV; - ret = fadump_setup_crash_memory_ranges(); - if (ret) - return ret; - addr = fw_dump.fadumphdr_addr; /* Initialize fadump crash info header. */ addr = init_fadump_header(addr); - vaddr = __va(addr); - - pr_debug("Creating ELF core headers at %#016lx\n", addr); - fadump_create_elfcore_headers(vaddr); /* register the future kernel dump with firmware. */ pr_debug("Registering for firmware-assisted kernel dump...\n"); @@ -1230,7 +1172,6 @@ void fadump_cleanup(void) } else if (fw_dump.dump_registered) { /* Un-register Firmware-assisted dump if it was registered. 
*/
 		fw_dump.ops->fadump_unregister(&fw_dump);
-		fadump_free_mem_ranges(&crash_mrange_info);
 	}
 
 	if (fw_dump.ops->fadump_cleanup)
@@ -1416,6 +1357,54 @@ static void fadump_release_memory(u64 begin, u64 end)
 	fadump_release_reserved_area(tstart, end);
 }
 
+/*
+ * To determine the fadump header version, the following conditions apply:
+ * - If magic_number is FADMPINF, return 0.
+ * - If magic_number is FADMPSIG, return the version value.
+ * - Otherwise, return -EINVAL (header is not valid).
+ */
+static u32 get_fadump_crash_header_version(struct fadump_crash_info_header *fdh)
+{
+	if (fdh->magic_number == fadump_str_to_u64("FADMPINF"))
+		return 0;
+	else if (fdh->magic_number == FADUMP_CRASH_INFO_MAGIC)
+		return fdh->version;
+
+	return -EINVAL;
+}
+
+static void fadump_free_elfcorehdr_buf(void)
+{
+	struct fadump_crash_info_header *fdh;
+	unsigned long elfcorehdr_vaddr;
+
+	if (!fw_dump.fadumphdr_addr)
+		return;
+
+	fdh = (struct fadump_crash_info_header *) __va(fw_dump.fadumphdr_addr);
+
+	/*
+	 * For fadump version 1, the memory for the elfcorehdr is
+	 * dynamically allocated in the second kernel, so free it.
+	 */
+	if (get_fadump_crash_header_version(fdh) != 1)
+		return;
+
+	if (fdh->elfcorehdr_addr == 0 || fdh->elfcorehdr_size == 0)
+		return;
+
+	/*
+	 * Reset the global `elfcorehdr_addr` before freeing `elfcorehdr`
+	 * memory so that other kernel modules, like `vmcore`, do not
+	 * access the invalid memory.
+	 */
+	elfcorehdr_addr = ELFCORE_ADDR_MAX;
+	elfcorehdr_vaddr = (unsigned long) __va(fdh->elfcorehdr_addr);
+	fadump_free_buffer(elfcorehdr_vaddr, fdh->elfcorehdr_size);
+	fdh->elfcorehdr_addr = 0;
+	fdh->elfcorehdr_size = 0;
+}
+
 static void fadump_invalidate_release_mem(void)
 {
 	mutex_lock(&fadump_mutex);
@@ -1424,6 +1413,8 @@ static void fadump_invalidate_release_mem(void)
 		return;
 	}
 
+	/* Free elfcorehdr buf before cleaning up the fadump reserved area. */
+	fadump_free_elfcorehdr_buf();
 	fadump_cleanup();
 	mutex_unlock(&fadump_mutex);
 
@@ -1632,6 +1623,93 @@ static void __init fadump_init_files(void)
 	return;
 }
 
+static int __init fadump_setup_elfcorehdr_buf(struct fadump_crash_info_header *fdh)
+{
+	char *bufp;
+	int elf_phdr_cnt;
+	unsigned long elfcorehdr_size;
+
+	/*
+	 * The program header for CPU notes comes first, followed by one
+	 * for vmcoreinfo, and the remaining program headers correspond to
+	 * memory regions.
+	 */
+	elf_phdr_cnt = 2 + fw_dump.boot_mem_regs_cnt + memblock_num_regions(memory);
+	elfcorehdr_size = sizeof(struct elfhdr) + (elf_phdr_cnt * sizeof(struct elf_phdr));
+	elfcorehdr_size = PAGE_ALIGN(elfcorehdr_size);
+	bufp = (char *)fadump_alloc_buffer(elfcorehdr_size);
+	if (!bufp) {
+		pr_err("Failed to allocate %lu bytes for elfcorehdr\n",
+		       elfcorehdr_size);
+		return -ENOMEM;
+	}
+	fdh->elfcorehdr_addr = virt_to_phys(bufp);
+	fdh->elfcorehdr_size = elfcorehdr_size;
+	return 0;
+}
+
+/*
+ * Process an active dump in four steps. First, verify the crash info header
+ * signature/magic number for integrity and accuracy. Second, if the fadump
+ * version is greater than 0, prepare the elfcorehdr; for fadump version 0,
+ * it's already created in the first kernel as part of the fadump reserved
+ * area. Third, let the platform update CPU notes in the elfcorehdr. Finally,
+ * set elfcorehdr_addr so that the vmcore module can export the elfcore
+ * header through '/proc/vmcore'.
+ */
+static void process_fadump(void)
+{
+	int fadump_version;
+	struct fadump_crash_info_header *fdh;
+
+	fdh = (struct fadump_crash_info_header *) __va(fw_dump.fadumphdr_addr);
+	if (!fdh) {
+		pr_err("Crash info header is empty.\n");
+		goto err_out;
+	}
+
+	fadump_version = get_fadump_crash_header_version(fdh);
+
+	/*
+	 * A fadump version below zero indicates that the fadump crash
+	 * info header is corrupted.
+	 */
+	if (fadump_version < 0) {
+		pr_err("Crash info header is not valid.\n");
+		goto err_out;
+	}
+
+	/*
+	 * If the crashed kernel's fadump crash info header version is 1
+	 * or later, then create the elfcorehdr here.
+	 */
+	if (fadump_version >= 1) {
+		if (fadump_setup_elfcorehdr_buf(fdh))
+			goto err_out;
+
+		fadump_populate_elfcorehdr(fdh);
+	}
+
+	/*
+	 * If the dump process fails, then invalidate the registration
+	 * and release memory before proceeding for re-registration.
+	 */
+	if (fw_dump.ops->fadump_process(&fw_dump) < 0)
+		goto err_out;
+
+	/*
+	 * The elfcore header is now ready to be exported.
+	 *
+	 * Set elfcorehdr_addr so that the vmcore module will export the
+	 * elfcore header through '/proc/vmcore'.
+	 */
+	elfcorehdr_addr = fdh->elfcorehdr_addr;
+	return;
+
+err_out:
+	fadump_invalidate_release_mem();
+}
+
 /*
  * Prepare for firmware-assisted dump.
  */
@@ -1651,12 +1729,7 @@ int __init setup_fadump(void)
 	 * saving it to the disk.
 	 */
 	if (fw_dump.dump_active) {
-		/*
-		 * if dump process fails then invalidate the registration
-		 * and release memory before proceeding for re-registration.
-		 */
-		if (fw_dump.ops->fadump_process(&fw_dump) < 0)
-			fadump_invalidate_release_mem();
+		process_fadump();
 	}
 	/* Initialize the kernel dump memory structure and register with f/w */
 	else if (fw_dump.reserve_dump_area_size) {
diff --git a/arch/powerpc/platforms/powernv/opal-fadump.c b/arch/powerpc/platforms/powernv/opal-fadump.c
index 964f464b1b0e3..a44d75cefc9ed 100644
--- a/arch/powerpc/platforms/powernv/opal-fadump.c
+++ b/arch/powerpc/platforms/powernv/opal-fadump.c
@@ -526,12 +526,7 @@ static int __init opal_fadump_process(struct fw_dump *fadump_conf)
 	if (!opal_fdm_active || !fadump_conf->fadumphdr_addr)
 		return rc;
 
-	/* Validate the fadump crash info header */
 	fdh = __va(fadump_conf->fadumphdr_addr);
-	if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) {
-		pr_err("Crash info header is not valid.\n");
-		return rc;
-	}
 
 #ifdef CONFIG_OPAL_CORE
 	/*
@@ -545,18 +540,7 @@ static int __init opal_fadump_process(struct fw_dump *fadump_conf)
 		kernel_initiated = true;
 #endif
 
-	rc = opal_fadump_build_cpu_notes(fadump_conf, fdh);
-	if (rc)
-		return rc;
-
-	/*
-	 * We are done validating dump info and elfcore header is now ready
-	 * to be exported. set elfcorehdr_addr so that vmcore module will
-	 * export the elfcore header through '/proc/vmcore'.
- */ - elfcorehdr_addr = fdh->elfcorehdr_addr; - - return rc; + return opal_fadump_build_cpu_notes(fadump_conf, fdh); } static void opal_fadump_region_show(struct fw_dump *fadump_conf, diff --git a/arch/powerpc/platforms/pseries/rtas-fadump.c b/arch/powerpc/platforms/pseries/rtas-fadump.c index b5853e9fcc3c1..3366082705d5f 100644 --- a/arch/powerpc/platforms/pseries/rtas-fadump.c +++ b/arch/powerpc/platforms/pseries/rtas-fadump.c @@ -394,9 +394,6 @@ static int __init rtas_fadump_build_cpu_notes(struct fw_dump *fadump_conf) */ static int __init rtas_fadump_process(struct fw_dump *fadump_conf) { - struct fadump_crash_info_header *fdh; - int rc = 0; - if (!fdm_active || !fadump_conf->fadumphdr_addr) return -EINVAL; @@ -415,25 +412,7 @@ static int __init rtas_fadump_process(struct fw_dump *fadump_conf) return -EINVAL; } - /* Validate the fadump crash info header */ - fdh = __va(fadump_conf->fadumphdr_addr); - if (fdh->magic_number != FADUMP_CRASH_INFO_MAGIC) { - pr_err("Crash info header is not valid.\n"); - return -EINVAL; - } - - rc = rtas_fadump_build_cpu_notes(fadump_conf); - if (rc) - return rc; - - /* - * We are done validating dump info and elfcore header is now ready - * to be exported. set elfcorehdr_addr so that vmcore module will - * export the elfcore header through '/proc/vmcore'. - */ - elfcorehdr_addr = fdh->elfcorehdr_addr; - - return 0; + return rtas_fadump_build_cpu_notes(fadump_conf); } static void rtas_fadump_region_show(struct fw_dump *fadump_conf, From f134c3dac49a918253104676f07ffb979e8b96e5 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Sat, 21 Oct 2023 23:47:32 +0530 Subject: [PATCH 75/76] powerpc/fadump: add hotplug_ready sysfs interface The elfcorehdr describes the CPUs and memory of the crashed kernel to the kernel that captures the dump, known as the second or fadump kernel. The elfcorehdr needs to be updated if the system's memory changes due to memory hotplug or online/offline events. Currently, memory hotplug events are monitored in userspace by udev rules, and fadump is re-registered, which recreates the elfcorehdr with the latest available memory in the system. However, the previous patch ("powerpc: make fadump resilient with memory add/remove events") moved the creation of elfcorehdr to the second or fadump kernel. This eliminates the need to regenerate the elfcorehdr during memory hotplug or online/offline events. Create a sysfs entry at /sys/kernel/fadump/hotplug_ready to let userspace know that fadump re-registration is not required for memory add/remove events. To: linuxppc-dev@lists.ozlabs.org Signed-off-by: Sourabh Jain Signed-off-by: Michael Ellerman Link: https://msgid.link/20231021181733.204311-3-sourabhjain@linux.ibm.com --- Documentation/ABI/testing/sysfs-kernel-fadump | 12 ++++++++++++ arch/powerpc/kernel/fadump.c | 14 ++++++++++++++ 2 files changed, 26 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-kernel-fadump b/Documentation/ABI/testing/sysfs-kernel-fadump index 8f7a64a81783a..f8ba21da0f712 100644 --- a/Documentation/ABI/testing/sysfs-kernel-fadump +++ b/Documentation/ABI/testing/sysfs-kernel-fadump @@ -38,3 +38,15 @@ Contact: linuxppc-dev@lists.ozlabs.org Description: read only Provide information about the amount of memory reserved by FADump to save the crash dump in bytes. 
+What:		/sys/kernel/fadump/hotplug_ready
+Date:		Sep 2023
+Contact:	linuxppc-dev@lists.ozlabs.org
+Description:	read only
+		The Kdump scripts utilize udev rules to monitor memory add/remove
+		events, ensuring that FADUMP is automatically re-registered when
+		system memory changes occur. This re-registration was necessary
+		to update the elfcorehdr, which describes the system memory to the
+		second kernel. Now, if this sysfs node holds a value of 1, it
+		indicates to userspace that FADUMP does not require re-registration
+		since the elfcorehdr is now generated in the second kernel.
+User:		kexec-tools
diff --git a/arch/powerpc/kernel/fadump.c b/arch/powerpc/kernel/fadump.c
index 32d5321361138..78b0118939cb6 100644
--- a/arch/powerpc/kernel/fadump.c
+++ b/arch/powerpc/kernel/fadump.c
@@ -1475,6 +1475,18 @@ static ssize_t enabled_show(struct kobject *kobj,
 	return sprintf(buf, "%d\n", fw_dump.fadump_enabled);
 }
 
+/*
+ * The /sys/kernel/fadump/hotplug_ready sysfs node only returns 1,
+ * which indicates to userspace that fadump re-registration is not
+ * required on memory hotplug events.
+ */
+static ssize_t hotplug_ready_show(struct kobject *kobj,
+				  struct kobj_attribute *attr,
+				  char *buf)
+{
+	return sprintf(buf, "%d\n", 1);
+}
+
 static ssize_t mem_reserved_show(struct kobject *kobj,
 				 struct kobj_attribute *attr,
 				 char *buf)
@@ -1547,11 +1559,13 @@ static struct kobj_attribute release_attr = __ATTR_WO(release_mem);
 static struct kobj_attribute enable_attr = __ATTR_RO(enabled);
 static struct kobj_attribute register_attr = __ATTR_RW(registered);
 static struct kobj_attribute mem_reserved_attr = __ATTR_RO(mem_reserved);
+static struct kobj_attribute hotplug_ready_attr = __ATTR_RO(hotplug_ready);
 
 static struct attribute *fadump_attrs[] = {
 	&enable_attr.attr,
 	&register_attr.attr,
 	&mem_reserved_attr.attr,
+	&hotplug_ready_attr.attr,
 	NULL,
 };
 
From 5e69d465457215225178b3003d70e556c6c42ea0 Mon Sep 17 00:00:00 2001
From: Sourabh Jain
Date: Sat, 21 Oct 2023 23:47:33 +0530
Subject: [PATCH 76/76] Documentation/powerpc: update fadump implementation
 details

The patch titled ("powerpc: make fadump resilient with memory
add/remove events") has made significant changes to the implementation
of fadump, particularly to elfcorehdr creation and the fadump crash
info header structure. Therefore, update the fadump implementation
documentation to reflect those changes.

The following updates are made to the firmware-assisted dump
documentation:

1. The elfcorehdr is no longer stored after the fadump HDR in the
   reserved dump area. Instead, the second kernel dynamically allocates
   memory for the elfcorehdr within the address range from 0 to the
   boot memory size. Therefore, update figures 1 and 2 of Memory
   Reservation during the first and second kernels to reflect this
   change.

2. A version field has been added to the fadump header to maintain
   backward compatibility without changing the fadump header magic
   number in the future. Therefore, remove the corresponding TODO from
   the document.
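With the elfcorehdr now generated in the second kernel, userspace dump
tooling such as kexec-tools can probe the hotplug_ready node added in
the previous patch instead of re-registering fadump on every memory
add/remove event. A minimal sketch of such a check (a hypothetical
userspace helper, not part of this series):

	#include <stdio.h>

	/* Return 1 if fadump re-registration on memory hotplug is unnecessary. */
	static int fadump_hotplug_ready(void)
	{
		char val = '0';
		FILE *f = fopen("/sys/kernel/fadump/hotplug_ready", "r");

		if (!f)
			return 0;	/* node absent: older kernel, keep re-registering */
		if (fread(&val, 1, 1, f) != 1)
			val = '0';
		fclose(f);
		return val == '1';
	}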
To: linuxppc-dev@lists.ozlabs.org Signed-off-by: Sourabh Jain Signed-off-by: Michael Ellerman Link: https://msgid.link/20231021181733.204311-4-sourabhjain@linux.ibm.com --- .../powerpc/firmware-assisted-dump.rst | 91 +++++++++---------- 1 file changed, 42 insertions(+), 49 deletions(-) diff --git a/Documentation/powerpc/firmware-assisted-dump.rst b/Documentation/powerpc/firmware-assisted-dump.rst index e363fc48529a0..7e37aadd1f772 100644 --- a/Documentation/powerpc/firmware-assisted-dump.rst +++ b/Documentation/powerpc/firmware-assisted-dump.rst @@ -134,12 +134,12 @@ that are run. If there is dump data, then the memory is held. If there is no waiting dump data, then only the memory required to -hold CPU state, HPTE region, boot memory dump, FADump header and -elfcore header, is usually reserved at an offset greater than boot -memory size (see Fig. 1). This area is *not* released: this region -will be kept permanently reserved, so that it can act as a receptacle -for a copy of the boot memory content in addition to CPU state and -HPTE region, in the case a crash does occur. +hold CPU state, HPTE region, boot memory dump, and FADump header is +usually reserved at an offset greater than boot memory size (see Fig. 1). +This area is *not* released: this region will be kept permanently +reserved, so that it can act as a receptacle for a copy of the boot +memory content in addition to CPU state and HPTE region, in the case +a crash does occur. Since this reserved memory area is used only after the system crash, there is no point in blocking this significant chunk of memory from @@ -153,22 +153,22 @@ that were present in CMA region:: o Memory Reservation during first kernel - Low memory Top of memory - 0 boot memory size |<--- Reserved dump area --->| | - | | | Permanent Reservation | | - V V | | V - +-----------+-----/ /---+---+----+-------+-----+-----+----+--+ - | | |///|////| DUMP | HDR | ELF |////| | - +-----------+-----/ /---+---+----+-------+-----+-----+----+--+ - | ^ ^ ^ ^ ^ - | | | | | | - \ CPU HPTE / | | - ------------------------------ | | - Boot memory content gets transferred | | - to reserved area by firmware at the | | - time of crash. | | - FADump Header | - (meta area) | + Low memory Top of memory + 0 boot memory size |<------ Reserved dump area ----->| | + | | | Permanent Reservation | | + V V | | V + +-----------+-----/ /---+---+----+-----------+-------+----+-----+ + | | |///|////| DUMP | HDR |////| | + +-----------+-----/ /---+---+----+-----------+-------+----+-----+ + | ^ ^ ^ ^ ^ + | | | | | | + \ CPU HPTE / | | + -------------------------------- | | + Boot memory content gets transferred | | + to reserved area by firmware at the | | + time of crash. 
| | + FADump Header | + (meta area) | | | Metadata: This area holds a metadata structure whose @@ -186,13 +186,20 @@ that were present in CMA region:: 0 boot memory size | | |<------------ Crash preserved area ------------>| V V |<--- Reserved dump area --->| | - +-----------+-----/ /---+---+----+-------+-----+-----+----+--+ - | | |///|////| DUMP | HDR | ELF |////| | - +-----------+-----/ /---+---+----+-------+-----+-----+----+--+ - | | - V V - Used by second /proc/vmcore - kernel to boot + +----+---+--+-----/ /---+---+----+-------+-----+-----+-------+ + | |ELF| | |///|////| DUMP | HDR |/////| | + +----+---+--+-----/ /---+---+----+-------+-----+-----+-------+ + | | | | | | + ----- ------------------------------ --------------- + \ | | + \ | | + \ | | + \ | ---------------------------- + \ | / + \ | / + \ | / + /proc/vmcore + +---+ |///| -> Regions (CPU, HPTE & Metadata) marked like this in the above @@ -200,6 +207,12 @@ that were present in CMA region:: does not have CPU & HPTE regions while Metadata region is not supported on pSeries currently. + +---+ + |ELF| -> elfcorehdr, it is created in second kernel after crash. + +---+ + + Note: Memory from 0 to the boot memory size is used by second kernel + Fig. 2 @@ -353,26 +366,6 @@ TODO: - Need to come up with the better approach to find out more accurate boot memory size that is required for a kernel to boot successfully when booted with restricted memory. - - The FADump implementation introduces a FADump crash info structure - in the scratch area before the ELF core header. The idea of introducing - this structure is to pass some important crash info data to the second - kernel which will help second kernel to populate ELF core header with - correct data before it gets exported through /proc/vmcore. The current - design implementation does not address a possibility of introducing - additional fields (in future) to this structure without affecting - compatibility. Need to come up with the better approach to address this. - - The possible approaches are: - - 1. Introduce version field for version tracking, bump up the version - whenever a new field is added to the structure in future. The version - field can be used to find out what fields are valid for the current - version of the structure. - 2. Reserve the area of predefined size (say PAGE_SIZE) for this - structure and have unused area as reserved (initialized to zero) - for future field additions. - - The advantage of approach 1 over 2 is we don't need to reserve extra space. Author: Mahesh Salgaonkar