ARM-software / tf-issues

Issue tracking for the ARM Trusted Firmware project
37 stars 16 forks source link

Allocate stacks for a single CPU in BL1 and BL2 #76

Closed danh-arm closed 10 years ago

danh-arm commented 10 years ago

On the FVPs, all components currently use the FVP plat_common.c and common plat_helpers.S source to allocate the normal and coherent stacks. These create one stack per CPU, which results in significant additional RAM usage in BL1 and BL2, which only run on the primary CPU.

BL1 and BL2 should only allocate stacks for the primary CPU.

The below patch is a proof-of-concept that showed a 36KB reduction in SRAM requirements (the stacks are now larger and this could be an under-estimate).

Note - the patch is now out of date and will not apply cleanly.

From eb4172858ccbd37daf8996b4e0a8066e91633318 Mon Sep 17 00:00:00 2001
From: Andrew Thoelke <andrew.thoelke@arm.com>
Date: Fri, 24 Jan 2014 14:35:13 +0000
Subject: [PATCH 2/3] Provide just one CPU stack for BL1 and BL2

BL1 and BL2 were allocating memory for per-cpu coherent and normal
stacks, reserving a large amount of memory that is not used. This
patch provides an alternative implementation of the stack API
which provides just a single cpu stack and is used for BL1 and BL2.

This reduces RAM usage in BL1 and BL2 by 16KB and 20KB respectively

---
 bl1/bl1.mk                                  |    3 +-
 bl2/bl2.mk                                  |    3 +-
 bl31/bl31.mk                                |    3 +-
 plat/common/aarch64/platform_helpers.S      |   64 ---------------
 plat/common/aarch64/platform_multi_stack.S  |  116 +++++++++++++++++++++++++++
 plat/common/aarch64/platform_single_stack.S |  100 +++++++++++++++++++++++
 plat/fvp/aarch64/plat_common.c              |    4 -
 7 files changed, 222 insertions(+), 71 deletions(-)
 create mode 100644 plat/common/aarch64/platform_multi_stack.S
 create mode 100644 plat/common/aarch64/platform_single_stack.S

diff --git a/bl1/bl1.mk b/bl1/bl1.mk
index 50a245b..8c41912 100644
--- a/bl1/bl1.mk
+++ b/bl1/bl1.mk
@@ -46,7 +46,8 @@ BL1_OBJS      +=  bl1_arch_setup.o    \
                bl1_entrypoint.o    \
                early_exceptions.o  \
                bl1_main.o      \
-               cpu_helpers.o
+               cpu_helpers.o   \
+               platform_single_stack.o

 BL1_ENTRY_POINT        :=  reset_handler
 BL1_MAPFILE        :=  bl1.map
diff --git a/bl2/bl2.mk b/bl2/bl2.mk
index fb47669..cb7a603 100644
--- a/bl2/bl2.mk
+++ b/bl2/bl2.mk
@@ -44,7 +44,8 @@ BL2_OBJS      +=  bl2_entrypoint.o        \
                bl2_arch_setup.o        \
                bl2_main.o          \
                spinlock.o          \
-               early_exceptions.o
+               early_exceptions.o  \
+               platform_single_stack.o

 BL2_ENTRY_POINT        :=  bl2_entrypoint
 BL2_MAPFILE        :=  bl2.map
diff --git a/bl31/bl31.mk b/bl31/bl31.mk
index 5c374aa..f1a5d7e 100644
--- a/bl31/bl31.mk
+++ b/bl31/bl31.mk
@@ -62,7 +62,8 @@ BL31_OBJS     +=  bl31_arch_setup.o           \
                spinlock.o              \
                gic_v3_sysregs.o            \
                bakery_lock.o               \
-               runtime_svc.o
+               runtime_svc.o   \
+               platform_multi_stack.o

 BL31_ENTRY_POINT   :=  bl31_entrypoint
 BL31_MAPFILE       :=  bl31.map
diff --git a/plat/common/aarch64/platform_helpers.S b/plat/common/aarch64/platform_helpers.S
index 3cea9f6..3d0e42c 100644
--- a/plat/common/aarch64/platform_helpers.S
+++ b/plat/common/aarch64/platform_helpers.S
@@ -32,43 +32,14 @@
 #include <platform.h>

-   .globl  pcpu_dv_mem_stack
    .weak   platform_get_core_pos
-   .weak   platform_set_stack
-   .weak   platform_get_stack
    .weak   platform_is_primary_cpu
-   .weak   platform_set_coherent_stack
    .weak   platform_check_mpidr
    .weak   plat_report_exception

-   /* -----------------------------------------------------
-    * 512 bytes of coherent stack for each cpu
-    * -----------------------------------------------------
-    */
-#define PCPU_DV_MEM_STACK_SIZE 0x200
-

    .section    .text, "ax"; .align 3

-   /* -----------------------------------------------------
-    * unsigned long long platform_set_coherent_stack
-    *                                    (unsigned mpidr);
-    * For a given mpidr, this function returns the stack
-    * pointer allocated in device memory. This stack can
-    * be used by C code which enables/disables the SCTLR.M
-    * SCTLR.C bit e.g. while powering down a cpu
-    * -----------------------------------------------------
-    */
-platform_set_coherent_stack: ; .type platform_set_coherent_stack, %function
-   mov x5, x30 // lr
-   bl  platform_get_core_pos
-   add x0, x0, #1
-   mov x1, #PCPU_DV_MEM_STACK_SIZE
-   mul x0, x0, x1
-   ldr x1, =pcpu_dv_mem_stack
-   add sp, x1, x0
-   ret x5
-

    /* -----------------------------------------------------
     *  int platform_get_core_pos(int mpidr);
@@ -96,29 +67,6 @@ platform_is_primary_cpu: ; .type platform_is_primary_cpu, %function
    cset    x0, eq
    ret

-   /* -----------------------------------------------------
-    * void platform_get_stack (unsigned long mpidr)
-    * -----------------------------------------------------
-    */
-platform_get_stack: ; .type platform_get_stack, %function
-   mov x10, x30 // lr
-   bl  platform_get_core_pos
-   add x0, x0, #1
-   mov x1, #PLATFORM_STACK_SIZE
-   mul x0, x0, x1
-   ldr x1, =platform_normal_stacks
-   add x0, x1, x0
-   ret x10
-
-   /* -----------------------------------------------------
-    * void platform_set_stack (unsigned long mpidr)
-    * -----------------------------------------------------
-    */
-platform_set_stack: ; .type platform_set_stack, %function
-   mov x9, x30 // lr
-   bl  platform_get_stack
-   mov sp, x0
-   ret x9

    /* -----------------------------------------------------
     * Placeholder function which should be redefined by
@@ -137,15 +85,3 @@ platform_check_mpidr: ; .type platform_check_mpidr, %function
 plat_report_exception:
    ret

-   /* -----------------------------------------------------
-    * Per-cpu stacks in device memory.
-    * Used for C code just before power down or right after
-    * power up when the MMU or caches need to be turned on
-    * or off. Each cpu gets a stack of 512 bytes.
-    * -----------------------------------------------------
-    */
-   .section    tzfw_coherent_mem, "aw", %nobits; .align 6
-
-pcpu_dv_mem_stack:
-   /* Zero fill */
-   .space (PLATFORM_CORE_COUNT * PCPU_DV_MEM_STACK_SIZE), 0
diff --git a/plat/common/aarch64/platform_multi_stack.S b/plat/common/aarch64/platform_multi_stack.S
new file mode 100644
index 0000000..ac7ecaa
--- /dev/null
+++ b/plat/common/aarch64/platform_multi_stack.S
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2013-2014, ARM Limited and Contributors. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * Neither the name of ARM nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <arch.h>
+#include <platform.h>
+
+
+   .weak   pcpu_dv_mem_stack
+   .weak platform_normal_stacks
+   .weak   platform_set_stack
+   .weak   platform_get_stack
+   .weak   platform_set_coherent_stack
+
+   /* -----------------------------------------------------
+    * 512 bytes of coherent stack for each cpu
+    * -----------------------------------------------------
+    */
+#define PCPU_DV_MEM_STACK_SIZE 0x200
+
+
+   .section    .text, "ax"; .align 3
+
+   /* -----------------------------------------------------
+    * unsigned long long platform_set_coherent_stack
+    *                                    (unsigned mpidr);
+    * For a given mpidr, this function returns the stack
+    * pointer allocated in device memory. This stack can
+    * be used by C code which enables/disables the SCTLR.M
+    * SCTLR.C bit e.g. while powering down a cpu
+    * -----------------------------------------------------
+    */
+platform_set_coherent_stack: ; .type platform_set_coherent_stack, %function
+   mov x5, x30 // lr
+   bl  platform_get_core_pos
+   add x0, x0, #1
+   mov x1, #PCPU_DV_MEM_STACK_SIZE
+   mul x0, x0, x1
+   ldr x1, =pcpu_dv_mem_stack
+   add sp, x1, x0
+   ret x5
+
+   /* -----------------------------------------------------
+    * void platform_get_stack (unsigned long mpidr)
+    * -----------------------------------------------------
+    */
+platform_get_stack: ; .type platform_get_stack, %function
+   mov x10, x30 // lr
+   bl  platform_get_core_pos
+   add x0, x0, #1
+   mov x1, #PLATFORM_STACK_SIZE
+   mul x0, x0, x1
+   ldr x1, =platform_normal_stacks
+   add x0, x1, x0
+   ret x10
+
+   /* -----------------------------------------------------
+    * void platform_set_stack (unsigned long mpidr)
+    * -----------------------------------------------------
+    */
+platform_set_stack: ; .type platform_set_stack, %function
+   mov x9, x30 // lr
+   bl  platform_get_stack
+   mov sp, x0
+   ret x9
+
+   /* -----------------------------------------------------
+    * Per-cpu stacks in normal memory.
+    * Used for C code during runtime execution (when coherent
+    * stacks are not required. Each cpu gets a stack of
+    * PLATFORM_STACK_SIZE bytes.
+    * -----------------------------------------------------
+    */
+   .section    tzfw_normal_stacks, "aw", %nobits; .align 6
+
+platform_normal_stacks:
+   .space (PLATFORM_CORE_COUNT * PLATFORM_STACK_SIZE), 0
+
+   /* -----------------------------------------------------
+    * Per-cpu stacks in device memory.
+    * Used for C code just before power down or right after
+    * power up when the MMU or caches need to be turned on
+    * or off. Each cpu gets a stack of 512 bytes.
+    * -----------------------------------------------------
+    */
+   .section    tzfw_coherent_mem, "aw", %nobits; .align 6
+
+pcpu_dv_mem_stack:
+   /* Zero fill */
+   .space (PLATFORM_CORE_COUNT * PCPU_DV_MEM_STACK_SIZE), 0
diff --git a/plat/common/aarch64/platform_single_stack.S b/plat/common/aarch64/platform_single_stack.S
new file mode 100644
index 0000000..e69103d
--- /dev/null
+++ b/plat/common/aarch64/platform_single_stack.S
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2013-2014, ARM Limited and Contributors. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * Redistributions of source code must retain the above copyright notice, this
+ * list of conditions and the following disclaimer.
+ *
+ * Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * Neither the name of ARM nor the names of its contributors may be used
+ * to endorse or promote products derived from this software without specific
+ * prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <arch.h>
+#include <platform.h>
+
+
+   .globl  pcpu_dv_mem_stack
+   .globl  platform_normal_stacks
+   .globl  platform_set_stack
+   .globl  platform_get_stack
+   .globl  platform_set_coherent_stack
+
+   /* -----------------------------------------------------
+    * 512 bytes of coherent stack for each cpu
+    * -----------------------------------------------------
+    */
+#define PCPU_DV_MEM_STACK_SIZE 0x200
+
+
+   .section    .text, "ax"; .align 3
+
+   /* -----------------------------------------------------
+    * unsigned long long platform_set_coherent_stack
+    *                                    (unsigned mpidr);
+    * This version of the function is only used during the
+    * UP boot phase, so onyl a single stack is required
+    * -----------------------------------------------------
+    */
+platform_set_coherent_stack: ; .type platform_set_coherent_stack, %function
+   ldr x0, =pcpu_dv_mem_stack
+   mov sp, x0
+   ret
+
+
+   /* -----------------------------------------------------
+    * void platform_get_stack (unsigned long mpidr)
+    * -----------------------------------------------------
+    */
+platform_get_stack: ; .type platform_get_stack, %function
+   ldr x0, =platform_normal_stacks
+   ret
+
+   /* -----------------------------------------------------
+    * void platform_set_stack (unsigned long mpidr)
+    * -----------------------------------------------------
+    */
+platform_set_stack: ; .type platform_set_stack, %function
+   ldr x0, =platform_normal_stacks
+   mov sp, x0
+   ret
+
+   /* -----------------------------------------------------
+    * Single cpu stack in normal memory.
+    * Used for C code during boot, PLATFORM_STACK_SIZE bytes
+    * are allocated
+    * -----------------------------------------------------
+    */
+   .section    tzfw_normal_stacks, "aw", %nobits; .align 6
+
+platform_normal_stacks:
+   .space (PLATFORM_STACK_SIZE), 0
+
+   /* -----------------------------------------------------
+    * Single cpu stack in device/coherent memory.
+    * Used for C code during boot, 512 bytes are allocated
+    * -----------------------------------------------------
+    */
+   .section    tzfw_coherent_mem, "aw", %nobits; .align 6
+
+pcpu_dv_mem_stack:
+   /* Zero fill */
+   .space (PCPU_DV_MEM_STACK_SIZE), 0
diff --git a/plat/fvp/aarch64/plat_common.c b/plat/fvp/aarch64/plat_common.c
index 3b698be..17f81c8 100644
--- a/plat/fvp/aarch64/plat_common.c
+++ b/plat/fvp/aarch64/plat_common.c
@@ -36,10 +36,6 @@
 /* Included only for error codes */
 #include <psci.h>

-unsigned char platform_normal_stacks[PLATFORM_STACK_SIZE][PLATFORM_CORE_COUNT]
-__attribute__ ((aligned(PLATFORM_CACHE_LINE_SIZE),
-       section("tzfw_normal_stacks")));
-
 /*******************************************************************************
  * This array holds the characteristics of the differences between the three
  * FVP platforms (Base, A53_A57 & Foundation). It will be populated during cold
-- 
1.7.9.5
athoelke commented 10 years ago

I've got an updated patch for this - it cuts out 40KB of RAM usage, half from BL1 and half from BL2.

tixy commented 10 years ago

Just to make a note of things we discussed face to face...

Some platforms may still need stacks for each CPU if they have the situation where secondary CPU's are released from reset at the same time as the primary CPUs and they need to boot further than a simple holding pen, and all the way into BL31; e.g. because they need to make use of run-time services like PSCI to power themselves off again.

As Andrew pointed out, even in this case, there is no reason for BL2 to have more than one CPU stack.

And thinking about it more, and I can't remember whether this was discussed, I guess it is possible for secondary CPU to stay in a holding pen in BL1 until BL31 release's then, and they can jump straight into BL3 anyway.

So perhaps there is no legitimate reason for secondary CPUs to have there own stacks in running BL1 or BL2 code after all, and this enhancement issue faces no obstacle to implementation.

athoelke commented 10 years ago

I'm trying to get some internal feedback on this patch before posting it on GitHub for review.

I agree that the holding pen scenario should not need extra stacks in BL1 or BL2 - you probably wouldn't want to release these cores into BL3-1 to issue a PSCI shutdown until PSCI is initialised anyway.

If necessary in future, it wouldn't be difficult to modify for individual BLs or to make the selection more flexible.