From 2bac6caee94e25f59ee47e2d365d7e07465089ba Mon Sep 17 00:00:00 2001 From: Celeste Liu Date: Thu, 30 May 2024 19:19:52 +0800 Subject: [PATCH 1/4] powerpc/configs: drop RT_GROUP_SCHED=y from ppc6xx_defconfig Commit 6e5f1537833a ("powerpc: Add a 6xx defconfig") said it was copied from fedore's ppc32 defconfig, but at least since 2015-06-10, Fedora has dropped this option.[1] For cgroup v1, if turned on, and there's any cgroup in the "cpu" hierarchy it needs an RT budget assigned, otherwise the processes in it will not be able to get RT at all. The problem with RT group scheduling is that it requires the budget assigned but there's no way we could assign a default budget, since the values to assign are both upper and lower time limits, are absolute, and need to be sum up to < 1 for each individal cgroup. That means we cannot really come up with values that would work by default in the general case.[1] For cgroup v2, it's almost unusable as well. If it turned on, the cpu controller can only be enabled when all RT processes are in the root cgroup. But it will lose the benefits of cgroup v2 if all RT process were placed in the same cgroup. systemd also doesn't support it.[2] [1]: https://bugzilla.redhat.com/show_bug.cgi?id=1229700 [2]: https://github.com/systemd/systemd/issues/13781#issuecomment-549164383 Signed-off-by: Celeste Liu Signed-off-by: Michael Ellerman Link: https://msgid.link/20240530111947.549474-12-CoelacanthusHex@gmail.com --- arch/powerpc/configs/ppc6xx_defconfig | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig index 66c7b28d74504..c06344db0eb37 100644 --- a/arch/powerpc/configs/ppc6xx_defconfig +++ b/arch/powerpc/configs/ppc6xx_defconfig @@ -12,7 +12,6 @@ CONFIG_TASK_XACCT=y CONFIG_TASK_IO_ACCOUNTING=y CONFIG_CGROUPS=y CONFIG_CGROUP_SCHED=y -CONFIG_RT_GROUP_SCHED=y CONFIG_CGROUP_DEVICE=y CONFIG_CGROUP_CPUACCT=y CONFIG_USER_NS=y From 0300a92e96cb393a1891d3b4a0f00b28dde8643b Mon Sep 17 00:00:00 2001 From: Anjali K Date: Tue, 28 May 2024 09:33:56 +0530 Subject: [PATCH 2/4] powerpc/perf: Set cpumode flags using sample address Currently in some cases, when the sampled instruction address register latches to a specific address during sampling, the privilege bits captured in the sampled event register are incorrect. For example, a snippet from the perf report on a power10 system is: Overhead Address Command Shared Object Symbol ........ .................. ............ ................. ....................... 2.41% 0x7fff9f94a02c null_syscall [unknown] [k] 0x00007fff9f94a02c 2.20% 0x7fff9f94a02c null_syscall libc.so.6 [.] syscall perf_get_misc_flags() function looks at the privilege bits to return the corresponding flags to be used for the address symbol and these privilege bit details are read from the sampled event register. In the above snippet, address "0x00007fff9f94a02c" is shown as "k" (kernel) due to the incorrect privilege bits captured in the sampled event register. To address this case check whether the sampled address is in the kernel area. Since this is specific to the latest platform, a new pmu flag is added called "PPMU_P10" and is used to contain the proposed fix. PPMU_P10_DD1 marked events are also included under PPMU_P10, hence remove the code specific to PPMU_P10_DD1 marked events. Signed-off-by: Anjali K Reviewed-by: Athira Rajeev > Signed-off-by: Michael Ellerman Link: https://msgid.link/20240528040356.2722275-1-anjalik@linux.ibm.com --- arch/powerpc/include/asm/perf_event_server.h | 3 +- arch/powerpc/perf/core-book3s.c | 45 +++++++++----------- arch/powerpc/perf/power10-pmu.c | 3 +- 3 files changed, 23 insertions(+), 28 deletions(-) diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h index e2221d29fdf9e..5995614e90629 100644 --- a/arch/powerpc/include/asm/perf_event_server.h +++ b/arch/powerpc/include/asm/perf_event_server.h @@ -89,7 +89,8 @@ struct power_pmu { #define PPMU_NO_SIAR 0x00000100 /* Do not use SIAR */ #define PPMU_ARCH_31 0x00000200 /* Has MMCR3, SIER2 and SIER3 */ #define PPMU_P10_DD1 0x00000400 /* Is power10 DD1 processor version */ -#define PPMU_HAS_ATTR_CONFIG1 0x00000800 /* Using config1 attribute */ +#define PPMU_P10 0x00000800 /* For power10 pmu */ +#define PPMU_HAS_ATTR_CONFIG1 0x00001000 /* Using config1 attribute */ /* * Values for flags to get_alternatives() diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index 6b5f8a94e7d89..42867469752d7 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -266,31 +266,12 @@ static inline u32 perf_flags_from_msr(struct pt_regs *regs) static inline u32 perf_get_misc_flags(struct pt_regs *regs) { bool use_siar = regs_use_siar(regs); - unsigned long mmcra = regs->dsisr; - int marked = mmcra & MMCRA_SAMPLE_ENABLE; + unsigned long siar; + unsigned long addr; if (!use_siar) return perf_flags_from_msr(regs); - /* - * Check the address in SIAR to identify the - * privilege levels since the SIER[MSR_HV, MSR_PR] - * bits are not set for marked events in power10 - * DD1. - */ - if (marked && (ppmu->flags & PPMU_P10_DD1)) { - unsigned long siar = mfspr(SPRN_SIAR); - if (siar) { - if (is_kernel_addr(siar)) - return PERF_RECORD_MISC_KERNEL; - return PERF_RECORD_MISC_USER; - } else { - if (is_kernel_addr(regs->nip)) - return PERF_RECORD_MISC_KERNEL; - return PERF_RECORD_MISC_USER; - } - } - /* * If we don't have flags in MMCRA, rather than using * the MSR, we intuit the flags from the address in @@ -298,19 +279,31 @@ static inline u32 perf_get_misc_flags(struct pt_regs *regs) * results */ if (ppmu->flags & PPMU_NO_SIPR) { - unsigned long siar = mfspr(SPRN_SIAR); + siar = mfspr(SPRN_SIAR); if (is_kernel_addr(siar)) return PERF_RECORD_MISC_KERNEL; return PERF_RECORD_MISC_USER; } /* PR has priority over HV, so order below is important */ - if (regs_sipr(regs)) - return PERF_RECORD_MISC_USER; - - if (regs_sihv(regs) && (freeze_events_kernel != MMCR0_FCHV)) + if (regs_sipr(regs)) { + if (!(ppmu->flags & PPMU_P10)) + return PERF_RECORD_MISC_USER; + } else if (regs_sihv(regs) && (freeze_events_kernel != MMCR0_FCHV)) return PERF_RECORD_MISC_HYPERVISOR; + /* + * Check the address in SIAR to identify the + * privilege levels since the SIER[MSR_HV, MSR_PR] + * bits are not set correctly in power10 sometimes + */ + if (ppmu->flags & PPMU_P10) { + siar = mfspr(SPRN_SIAR); + addr = siar ? siar : regs->nip; + if (!is_kernel_addr(addr)) + return PERF_RECORD_MISC_USER; + } + return PERF_RECORD_MISC_KERNEL; } diff --git a/arch/powerpc/perf/power10-pmu.c b/arch/powerpc/perf/power10-pmu.c index 62a68b6b2d4b1..bb57b7cfe6406 100644 --- a/arch/powerpc/perf/power10-pmu.c +++ b/arch/powerpc/perf/power10-pmu.c @@ -593,7 +593,8 @@ static struct power_pmu power10_pmu = { .get_mem_weight = isa207_get_mem_weight, .disable_pmc = isa207_disable_pmc, .flags = PPMU_HAS_SIER | PPMU_ARCH_207S | - PPMU_ARCH_31 | PPMU_HAS_ATTR_CONFIG1, + PPMU_ARCH_31 | PPMU_HAS_ATTR_CONFIG1 | + PPMU_P10, .n_generic = ARRAY_SIZE(power10_generic_events), .generic_events = power10_generic_events, .cache_events = &power10_cache_events, From 0d3ff067331ef84e7e7f49537d768881042ed5ba Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Fri, 10 May 2024 15:52:34 +0530 Subject: [PATCH 3/4] powerpc/kexec_file: fix extra size calculation for kexec FDT While setting up the FDT for kexec, CPU nodes that are added after the system boots and reserved memory ranges are incorporated into the initial_boot_params (base FDT). However, they are not taken into account when determining the additional size needed for the kexec FDT. As a result, kexec fails to load, generating the following error: [1116.774451] Error updating memory reserve map: FDT_ERR_NOSPACE kexec_file_load failed: No such process Therefore, consider the extra size for CPU nodes added post-system boot and reserved memory ranges while preparing the kexec FDT. While adding a new parameter to the setup_new_fdt_ppc64 function, it was noticed that there were a couple of unused parameters, so they were removed. Signed-off-by: Sourabh Jain Signed-off-by: Michael Ellerman Link: https://msgid.link/20240510102235.2269496-2-sourabhjain@linux.ibm.com --- arch/powerpc/include/asm/kexec.h | 6 ++-- arch/powerpc/kexec/elf_64.c | 12 +++++-- arch/powerpc/kexec/file_load_64.c | 53 +++++++++++++------------------ 3 files changed, 33 insertions(+), 38 deletions(-) diff --git a/arch/powerpc/include/asm/kexec.h b/arch/powerpc/include/asm/kexec.h index 95a98b390d62d..270ee93a0f7d8 100644 --- a/arch/powerpc/include/asm/kexec.h +++ b/arch/powerpc/include/asm/kexec.h @@ -103,10 +103,8 @@ int load_crashdump_segments_ppc64(struct kimage *image, int setup_purgatory_ppc64(struct kimage *image, const void *slave_code, const void *fdt, unsigned long kernel_load_addr, unsigned long fdt_load_addr); -unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image); -int setup_new_fdt_ppc64(const struct kimage *image, void *fdt, - unsigned long initrd_load_addr, - unsigned long initrd_len, const char *cmdline); +unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image, struct crash_mem *rmem); +int setup_new_fdt_ppc64(const struct kimage *image, void *fdt, struct crash_mem *rmem); #endif /* CONFIG_PPC64 */ #endif /* CONFIG_KEXEC_FILE */ diff --git a/arch/powerpc/kexec/elf_64.c b/arch/powerpc/kexec/elf_64.c index 214c071c58edf..5d6d616404cf9 100644 --- a/arch/powerpc/kexec/elf_64.c +++ b/arch/powerpc/kexec/elf_64.c @@ -23,6 +23,7 @@ #include #include #include +#include static void *elf64_load(struct kimage *image, char *kernel_buf, unsigned long kernel_len, char *initrd, @@ -36,6 +37,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf, const void *slave_code; struct elfhdr ehdr; char *modified_cmdline = NULL; + struct crash_mem *rmem = NULL; struct kexec_elf_info elf_info; struct kexec_buf kbuf = { .image = image, .buf_min = 0, .buf_max = ppc64_rma_size }; @@ -102,17 +104,20 @@ static void *elf64_load(struct kimage *image, char *kernel_buf, kexec_dprintk("Loaded initrd at 0x%lx\n", initrd_load_addr); } + ret = get_reserved_memory_ranges(&rmem); + if (ret) + goto out; + fdt = of_kexec_alloc_and_setup_fdt(image, initrd_load_addr, initrd_len, cmdline, - kexec_extra_fdt_size_ppc64(image)); + kexec_extra_fdt_size_ppc64(image, rmem)); if (!fdt) { pr_err("Error setting up the new device tree.\n"); ret = -EINVAL; goto out; } - ret = setup_new_fdt_ppc64(image, fdt, initrd_load_addr, - initrd_len, cmdline); + ret = setup_new_fdt_ppc64(image, fdt, rmem); if (ret) goto out_free_fdt; @@ -146,6 +151,7 @@ static void *elf64_load(struct kimage *image, char *kernel_buf, out_free_fdt: kvfree(fdt); out: + kfree(rmem); kfree(modified_cmdline); kexec_free_elf_info(&elf_info); diff --git a/arch/powerpc/kexec/file_load_64.c b/arch/powerpc/kexec/file_load_64.c index 925a69ad24689..413c76de283dc 100644 --- a/arch/powerpc/kexec/file_load_64.c +++ b/arch/powerpc/kexec/file_load_64.c @@ -803,10 +803,9 @@ static unsigned int cpu_node_size(void) return size; } -static unsigned int kdump_extra_fdt_size_ppc64(struct kimage *image) +static unsigned int kdump_extra_fdt_size_ppc64(struct kimage *image, unsigned int cpu_nodes) { - unsigned int cpu_nodes, extra_size = 0; - struct device_node *dn; + unsigned int extra_size = 0; u64 usm_entries; #ifdef CONFIG_CRASH_HOTPLUG unsigned int possible_cpu_nodes; @@ -826,18 +825,6 @@ static unsigned int kdump_extra_fdt_size_ppc64(struct kimage *image) extra_size += (unsigned int)(usm_entries * sizeof(u64)); } - /* - * Get the number of CPU nodes in the current DT. This allows to - * reserve places for CPU nodes added since the boot time. - */ - cpu_nodes = 0; - for_each_node_by_type(dn, "cpu") { - cpu_nodes++; - } - - if (cpu_nodes > boot_cpu_node_count) - extra_size += (cpu_nodes - boot_cpu_node_count) * cpu_node_size(); - #ifdef CONFIG_CRASH_HOTPLUG /* * Make sure enough space is reserved to accommodate possible CPU nodes @@ -861,16 +848,30 @@ static unsigned int kdump_extra_fdt_size_ppc64(struct kimage *image) * * Returns the estimated extra size needed for kexec/kdump kernel FDT. */ -unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image) +unsigned int kexec_extra_fdt_size_ppc64(struct kimage *image, struct crash_mem *rmem) { - unsigned int extra_size = 0; + struct device_node *dn; + unsigned int cpu_nodes = 0, extra_size = 0; // Budget some space for the password blob. There's already extra space // for the key name if (plpks_is_available()) extra_size += (unsigned int)plpks_get_passwordlen(); - return extra_size + kdump_extra_fdt_size_ppc64(image); + /* Get the number of CPU nodes in the current device tree */ + for_each_node_by_type(dn, "cpu") { + cpu_nodes++; + } + + /* Consider extra space for CPU nodes added since the boot time */ + if (cpu_nodes > boot_cpu_node_count) + extra_size += (cpu_nodes - boot_cpu_node_count) * cpu_node_size(); + + /* Consider extra space for reserved memory ranges if any */ + if (rmem->nr_ranges > 0) + extra_size += sizeof(struct fdt_reserve_entry) * rmem->nr_ranges; + + return extra_size + kdump_extra_fdt_size_ppc64(image, cpu_nodes); } static int copy_property(void *fdt, int node_offset, const struct device_node *dn, @@ -924,18 +925,13 @@ static int update_pci_dma_nodes(void *fdt, const char *dmapropname) * being loaded. * @image: kexec image being loaded. * @fdt: Flattened device tree for the next kernel. - * @initrd_load_addr: Address where the next initrd will be loaded. - * @initrd_len: Size of the next initrd, or 0 if there will be none. - * @cmdline: Command line for the next kernel, or NULL if there will - * be none. + * @rmem: Reserved memory ranges. * * Returns 0 on success, negative errno on error. */ -int setup_new_fdt_ppc64(const struct kimage *image, void *fdt, - unsigned long initrd_load_addr, - unsigned long initrd_len, const char *cmdline) +int setup_new_fdt_ppc64(const struct kimage *image, void *fdt, struct crash_mem *rmem) { - struct crash_mem *umem = NULL, *rmem = NULL; + struct crash_mem *umem = NULL; int i, nr_ranges, ret; #ifdef CONFIG_CRASH_DUMP @@ -991,10 +987,6 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt, goto out; /* Update memory reserve map */ - ret = get_reserved_memory_ranges(&rmem); - if (ret) - goto out; - nr_ranges = rmem ? rmem->nr_ranges : 0; for (i = 0; i < nr_ranges; i++) { u64 base, size; @@ -1014,7 +1006,6 @@ int setup_new_fdt_ppc64(const struct kimage *image, void *fdt, ret = plpks_populate_fdt(fdt); out: - kfree(rmem); kfree(umem); return ret; } From 932bed41217059638c78a75411b7893b121d2162 Mon Sep 17 00:00:00 2001 From: Sourabh Jain Date: Fri, 10 May 2024 15:52:35 +0530 Subject: [PATCH 4/4] powerpc/kexec_file: fix cpus node update to FDT While updating the cpus node, commit 40c753993e3a ("powerpc/kexec_file: Use current CPU info while setting up FDT") first deletes all subnodes under the /cpus node. However, while adding sub-nodes back, it missed adding cpus subnodes whose device_type != "cpu", such as l2-cache*, l3-cache*, ibm,powerpc-cpu-features. Fix this by only deleting cpus sub-nodes of device_type == "cpus" and then adding all available nodes with device_type == "cpu". Fixes: 40c753993e3a ("powerpc/kexec_file: Use current CPU info while setting up FDT") Signed-off-by: Sourabh Jain Signed-off-by: Michael Ellerman Link: https://msgid.link/20240510102235.2269496-3-sourabhjain@linux.ibm.com --- arch/powerpc/kexec/core_64.c | 53 +++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/arch/powerpc/kexec/core_64.c b/arch/powerpc/kexec/core_64.c index 85050be08a23d..2e625c2cb6b9e 100644 --- a/arch/powerpc/kexec/core_64.c +++ b/arch/powerpc/kexec/core_64.c @@ -456,9 +456,15 @@ static int add_node_props(void *fdt, int node_offset, const struct device_node * * @fdt: Flattened device tree of the kernel. * * Returns 0 on success, negative errno on error. + * + * Note: expecting no subnodes under /cpus/ with device_type == "cpu". + * If this changes, update this function to include them. */ int update_cpus_node(void *fdt) { + int prev_node_offset; + const char *device_type; + const struct fdt_property *prop; struct device_node *cpus_node, *dn; int cpus_offset, cpus_subnode_offset, ret = 0; @@ -469,30 +475,44 @@ int update_cpus_node(void *fdt) return cpus_offset; } - if (cpus_offset > 0) { - ret = fdt_del_node(fdt, cpus_offset); + prev_node_offset = cpus_offset; + /* Delete sub-nodes of /cpus node with device_type == "cpu" */ + for (cpus_subnode_offset = fdt_first_subnode(fdt, cpus_offset); cpus_subnode_offset >= 0;) { + /* Ignore nodes that do not have a device_type property or device_type != "cpu" */ + prop = fdt_get_property(fdt, cpus_subnode_offset, "device_type", NULL); + if (!prop || strcmp(prop->data, "cpu")) { + prev_node_offset = cpus_subnode_offset; + goto next_node; + } + + ret = fdt_del_node(fdt, cpus_subnode_offset); if (ret < 0) { - pr_err("Error deleting /cpus node: %s\n", fdt_strerror(ret)); - return -EINVAL; + pr_err("Failed to delete a cpus sub-node: %s\n", fdt_strerror(ret)); + return ret; } +next_node: + if (prev_node_offset == cpus_offset) + cpus_subnode_offset = fdt_first_subnode(fdt, cpus_offset); + else + cpus_subnode_offset = fdt_next_subnode(fdt, prev_node_offset); } - /* Add cpus node to fdt */ - cpus_offset = fdt_add_subnode(fdt, fdt_path_offset(fdt, "/"), "cpus"); - if (cpus_offset < 0) { - pr_err("Error creating /cpus node: %s\n", fdt_strerror(cpus_offset)); + cpus_node = of_find_node_by_path("/cpus"); + /* Fail here to avoid kexec/kdump kernel boot hung */ + if (!cpus_node) { + pr_err("No /cpus node found\n"); return -EINVAL; } - /* Add cpus node properties */ - cpus_node = of_find_node_by_path("/cpus"); - ret = add_node_props(fdt, cpus_offset, cpus_node); - of_node_put(cpus_node); - if (ret < 0) - return ret; + /* Add all /cpus sub-nodes of device_type == "cpu" to FDT */ + for_each_child_of_node(cpus_node, dn) { + /* Ignore device nodes that do not have a device_type property + * or device_type != "cpu". + */ + device_type = of_get_property(dn, "device_type", NULL); + if (!device_type || strcmp(device_type, "cpu")) + continue; - /* Loop through all subnodes of cpus and add them to fdt */ - for_each_node_by_type(dn, "cpu") { cpus_subnode_offset = fdt_add_subnode(fdt, cpus_offset, dn->full_name); if (cpus_subnode_offset < 0) { pr_err("Unable to add %s subnode: %s\n", dn->full_name, @@ -506,6 +526,7 @@ int update_cpus_node(void *fdt) goto out; } out: + of_node_put(cpus_node); of_node_put(dn); return ret; }