tukl-msd / gem5.bare-metal

This shows a simple ARM bare-metal software implementation for gem5
16 stars 9 forks source link

L1, L2 cache enable in the Interrupt example #4

Open hy-easyone opened 7 years ago

hy-easyone commented 7 years ago

Hi, I find your code really helpful for the development base in firmware simulation, but it turns out that L1, L2 cache simulation (such as hit/miss rate) results do not appear in the stats.txt. Is it because the cache was not enabled yet in the kernel code? (I'm a newbie in kernel or system area, so i don't really understand what's going on) If that's the case, is it complicated to write a code for initializing the cache in the boot.S file?

Thank you!

myzinsky commented 7 years ago

how you started gem5 on the command line?

hy-easyone commented 7 years ago

As in your documentation, I run gem5 as below:

./build/ARM/gem5.opt ./configs/example/fs.py --bare-metal --disk-image=/dist/m5/system/gem5.bare-metal/common/fake.iso --kernel=/dist/m5/system/gem5.bare-metal/Interrupt/main.elf --machine-type=RealView_PBX --dtb-filename=none --mem-size=256MB

and also tried with adding --caches

the Interrupt example runs fine, but when I stop it and look at gem5/m5out/stats.txt, values for caches, such as system.cpu.icache.tags.tagsinuse, system.cpu.icache.tags.replacements, etc. are all zero. Plus the stats.txt doesn't have any cache hit/miss rates (or related values), while normal simulation of aarch64-vmlinux (gem5 fs mode example) shows them. I was wondering if it is just supposed to be like that?

asvos commented 5 years ago

Hi,

As I fell to the same issue, it turns out that in order to be able to have cacheable accesses you need to do a proper set up of the system. Use the attached boot.s and you will be ready. (According to http://infocenter.arm.com/help/topic/com.arm.doc.dai0527a/DAI0527A_baremetal_boot_code_for_ARMv8_A_processors.pdf ) boot.s.zip

Boot.s :

// Put a 64-bit value with little endianness.
.macro PUT_64B high, low
.word \low
.word \high
.endm
// Create an entry pointing to a next-level table.
.macro TABLE_ENTRY PA, ATTR
    PUT_64B \ATTR, (\PA) + 0x3
.endm
// Create an entry for a 1GB block.
.macro BLOCK_1GB PA, ATTR_HI, ATTR_LO
    PUT_64B \ATTR_HI, ((\PA) & 0xC0000000) | \ATTR_LO | 0x1
.endm
// Create an entry for a 2MB block.
.macro BLOCK_2MB PA, ATTR_HI, ATTR_LO
    PUT_64B \ATTR_HI, ((\PA) & 0xFFE00000) | \ATTR_LO | 0x1
.endm
.align 12
ttb0_base:
    TABLE_ENTRY level2_pagetable, 0
    BLOCK_1GB 0x40000000, 0, 0x740
    BLOCK_1GB 0x80000000, 0, 0x740
    BLOCK_1GB 0xC0000000, 0, 0x740
.align 12
level2_pagetable:
.set ADDR, 0x000 // The current page address.
.rept 0x200
    BLOCK_2MB (ADDR << 20), 0, 0x74C
.set ADDR, ADDR+2
.endr
.global _Reset

_Reset:
    // Initialize the stack pointer.
    LDR R13, =stack_top
    ADD R13, R13, #4
    MRC P15, 0, R0, C0, C0, 5 // Read MPIDR.
    AND R0, R0, #0xFF // R0 == core number.
    MOV R2, #0x1000
    MUL R1, R0, R2 // Create separate stack spaces
    SUB R13, R13, R1 // for each processor.
    // Initialize SPSR in all modes.
    MOV R0, #0
    MSR SPSR, R0
    MSR SPSR_svc, R0
    // Disable L1 Caches.
    MRC P15, 0, R1, C1, C0, 0 // Read SCTLR.
    BIC R1, R1, #(0x1 << 2) // Disable D Cache.
    MCR P15, 0, R1, C1, C0, 0 // Write SCTLR.
    // Invalidate Data cache to create general-purpose code. Calculate the
    // cache size first and loop through each set + way.
    MOV R0, #0x0 // R0 = 0x0 for L1 dcache 0x2 for L2 dcache.
    MCR P15, 2, R0, C0, C0, 0 // CSSELR Cache Size Selection Register.
    MRC P15, 1, R4, C0, C0, 0 // CCSIDR read Cache Size.
    AND R1, R4, #0x7
    ADD R1, R1, #0x4 // R1 = Cache Line Size.
    LDR R3, =0x7FFF
    AND R2, R3, R4, LSR #13 // R2 = Cache Set Number – 1.
    LDR R3, =0x3FF
    AND R3, R3, R4, LSR #3 // R3 = Cache Associativity Number – 1.
    CLZ R4, R3 // R4 = way position in CISW instruction.
    MOV R5, #0 // R5 = way loop counter.
way_loop:
    MOV R6, #0 // R6 = set loop counter.
set_loop:
    ORR R7, R0, R5, LSL R4 // Set way.
    ORR R7, R7, R6, LSL R1 // Set set.
    MCR P15, 0, R7, C7, C6, 2 // DCCISW R7.
    ADD R6, R6, #1 // Increment set counter.
    CMP R6, R2 // Last set reached yet?
    BLE set_loop // If not, iterate set_loop,
    ADD R5, R5, #1 // else, next way.
    CMP R5, R3 // Last way reached yet?
    BLE way_loop // if not, iterate way_loop.
    // Initialize TTBCR.
    MOV R0, #0 // Use short descriptor.
    MCR P15, 0, R0, C2, C0, 2 // Base address is 16KB aligned.
    // Perform translation table walk for TTBR0.
    // Initialize DACR.
    LDR R1, =0x55555555 // Set all domains as clients.
    MCR P15, 0, R1, C3, C0, 0 // Accesses are checked against the
    // permission bits in the translation tables.
    // Initialize SCTLR.AFE.
    MRC P15, 0, R1, C1, C0, 0 // Read SCTLR.
    BIC R1, R1, #(0x1 <<29) // Set AFE to 0 and disable Access Flag.
    MCR P15, 0, R1, C1, C0, 0 // Write SCTLR.
    // Initialize TTBR0.
    LDR R0, =ttb0_base // ttb0_base must be a 16KB-aligned address.
    MOV R1, #0x2B // The translation table walk is normal, inner
    ORR R1, R0, R1 // and outer cacheable, WB WA, and inner
    MCR P15, 0, R1, C2, C0, 0 // shareable.
    // Set up translation table entries in memory
    LDR R4, =0x00100000 // Increase 1MB address each time.
    LDR R2, =0x00015C06 // Set up translation table descriptor with
    // Secure, global, full accessibility,
    // executable.
    // Domain 0, Shareable, Normal cacheable memory
    LDR R3, =1024 // executes the loop 1024 times to set up
    // 1024 descriptors to cover 0-1GB memory.
loop:
    STR R2, [R0], #4 // Build a page table section entry.
    ADD R2, R2, R4 // Update address part for next descriptor.
    SUBS R3, #1
    BNE loop
    LDR R2, =0x40010C02 // Set up translation table descriptors with
    // secure, global, full accessibility,
    // Domain=0 Shareable Device-nGnRnE Memory.
    LDR R3, =3072 // Executes loop 3072 times to set up 3072
loop2:
    STR R2, [R0], #4 // Build a translation table section entry.
    ADD R2, R2, R4 // Update address part for next descriptor.
    SUBS R3, #1
    BNE loop2
    // SMP is implemented in the CPUECTLR register.
    MRRC P15, 1, R0, R1, C15 // Read CPUECTLR.
    ORR R0, R0, #(0x1 << 6) // Set SMPEN.
    MCRR P15, 1, R0, R1, C15 // Write CPUECTLR.
    // Enable caches and the MMU.
    MRC P15, 0, R1, C1, C0, 0 // Read SCTLR.
    ORR R1, R1, #(0x1 << 2) // The C bit (data cache).
    ORR R1, R1, #(0x1 << 12) // The I bit (instruction cache).
    ORR R1, R1, #0x1 // The M bit (MMU).
    MCR P15, 0, R1, C1, C0, 0 // Write SCTLR.
    DSB
    ISB
    BL main
    B .
myzinsky commented 5 years ago

Can you create a proper example and open a Pull request for it? I would highly appreciate!

HMeuleman commented 2 years ago

Hi, I know it is very late response, but just wanted to add the following: I think the given example has an error (I think the error is in the ARM documentation as well). It says: MOV R1, #0x2B // The translation table walk is normal, inner ORR R1, R0, R1 // and outer cacheable, WB WA, and inner MCR P15, 0, R1, C2, C0, 0 // shareable. But bit 6 and bit 0 should have been swapped (according to this note which exists in both armv7 and armv8 documentation): Note The encoding of the IRGN bits is counter-intuitive, with register bit[6] being IRGN[0] and register bit[0] being IRGN[1]. This encoding is chosen to give a consistent encoding of memory region types and to ensure that software written for ARMv7 without the Multiprocessing Extensions can run unmodified on an implementation that includes the functionality introduced by the ARMv7 Multiprocessing Extensions

So the correct code (given the comment on the code) should be: MOV R1, #0x6A // The translation table walk is normal, inner ORR R1, R0, R1 // and outer cacheable, WB WA, and inner MCR P15, 0, R1, C2, C0, 0 // shareable.

Perhaps I misread the documentation, but both armv7 and armv8 documentation is quite clear about this and the most used configuration would be "Normal memory, Inner Write-Back Write-Allocate Cacheable." . I'm not entirely sure how this setting is used though. Since I would expect that the MMU only reads L1/L2 tables I guess it doesn't matter if you use "Write-Through, no Write-Allocate", since there is simply no write coming from MMU? But then why would there be no bit cacheable/non-cacheable? Or does this give info on how reads are to be performed? If somebody could explain how this configuration (TTBR0) is used then this would be greatly appreciated.

myzinsky commented 2 years ago

It could be that your right. Can you test it and open a PR?

kashish-107 commented 7 months ago

The code provided for enabling the cache results in the following error. Any suggestions on how to troubleshoot and resolve this issue?

Command: build/ARM/gem5.opt configs/example/fs.py --bare-metal --kernel=/home/kashishm/gem5_backup/gem5.bare-metal/Simple/main.elf --machine-type=VExpress_GEM5_V1 --cpu-type=MinorCPU --caches --l1d_size=32kB --l1i_size=32kB

Error:

gem5.opt: build/ARM/dev/arm/pl011.cc:70: virtual gem5::Tick gem5::Pl011::read(gem5::PacketPtr): Assertion `pkt->getSize() <= 4' failed. Program aborted at tick 579247000 --- BEGIN LIBC BACKTRACE --- build/ARM/gem5.opt(+0xff4750)[0x61caef010750] build/ARM/gem5.opt(+0x101918c)[0x61caef03518c] /lib/x86_64-linux-gnu/libc.so.6(+0x42520)[0x7b2a8ac42520] /lib/x86_64-linux-gnu/libc.so.6(pthread_kill+0x12c)[0x7b2a8ac96

derchr commented 7 months ago

@kashish-107

You probably need mark the IO region of gem5 as non-cacheable memory. Marking the lower 2GiB of the address space as non-cacheable device memory region should fix the issue. For a better explanation of the code refer to: https://developer.arm.com/documentation/dai0527/latest/

kashish-107 commented 7 months ago

@derchr I'm new to this. Could you assist me with an updated boot.s file or provide a simple example of how to designate the IO region as uncacheable?

derchr commented 7 months ago

You could use this as a baseline:

.extern LD_STACK_PTR

// Put a 64-bit value with little endianness.
.macro PUT_64B high, low
    .word \low
    .word \high
.endm

// Create an entry pointing to a next-level table.
.macro TABLE_ENTRY PA, ATTR
    PUT_64B \ATTR, (\PA) + 0x3
.endm

// Create an entry for a 1GB block.
.macro BLOCK_1GB PA, ATTR_HI, ATTR_LO
    PUT_64B \ATTR_HI | ((\PA) >> 32), ((\PA) & 0xC0000000) | \ATTR_LO | 0x1
.endm

// Create an entry for a 2MB block.
.macro BLOCK_2MB PA, ATTR_HI, ATTR_LO
    PUT_64B \ATTR_HI, ((\PA) & 0xFFE00000) | \ATTR_LO | 0x1
.endm

.section .init

.align 12
ttb0_base:
.set ADDR, 0x000
.rept 0x02
BLOCK_1GB (ADDR << 29), 0, 0x740
.set ADDR, ADDR+2
.endr

// Cached normal DRAM region
.rept 0x3E
BLOCK_1GB (ADDR << 29), 0, 0x74C
.set ADDR, ADDR+2
.endr

.globl _start
_start:
    ldr     x30, =LD_STACK_PTR
    mov     sp, x30

    // Initialize translation table control registers
    ldr x1, =0x13520 // 64GB space 4KB granularity Inner-shareable. Normal Inner and Outer Cacheable.
    msr tcr_el3, x1

    ldr x1, =0xFF440400
    msr mair_el3, x1 // ATTR0 Device-nGnRnE ATTR1 Device. ATTR2 Normal Non-Cacheable. ATTR3 Normal Cacheable.

    adr x0, ttb0_base
    msr ttbr0_el3, x0

    // Enable MMU and caches
    mrs x0, sctlr_el3
    orr x0, x0, #(0x1 << 2) // The C bit (data cache).
    orr x0, x0, #(0x1 << 12) // The I bit (instruction cache).
    orr x0, x0, #0x1 // The M bit (MMU).
    msr sctlr_el3, x0
    dsb sy
    isb

    bl entry

Make sure to properly designate the DRAM area in the linker script:

MEMORY
{
    bootmem : ORIGIN = 0x0, LENGTH = 0x100000
    dram : ORIGIN = 0x80000000, LENGTH = 0x40000000
    // ...

And place the desired sections either into the bootmem or the DRAM: .text : { KEEP(*(.text)) } > dram.

This would only be an example. The concrete address values probably depend on your system configuration. For that, I would highly recommend the previously cited resource and also this section of the ARM Cortex-A Series Programmer's Guide: https://developer.arm.com/documentation/den0024/a/The-Memory-Management-Unit

kashish-107 commented 6 months ago

@derchr

Here are the assembly, linker, and application code I'm currently utilizing. Could you assist me in ensuring that the configuration is correct?

boot.s

`// Put a 64-bit value with little endianness. .macro PUT_64B high, low .word \low .word \high .endm // Create an entry pointing to a next-level table. .macro TABLE_ENTRY PA, ATTR PUT_64B \ATTR, (\PA) + 0x3 .endm // Create an entry for a 1GB block. .macro BLOCK_1GB PA, ATTR_HI, ATTR_LO PUT_64B \ATTR_HI, ((\PA) & 0xC0000000) | \ATTR_LO | 0x1 .endm // Create an entry for a 2MB block. .macro BLOCK_2MB PA, ATTR_HI, ATTR_LO PUT_64B \ATTR_HI, ((\PA) & 0xFFE00000) | \ATTR_LO | 0x1 .endm .align 12 ttb0_base: TABLE_ENTRY level2_pagetable, 0 BLOCK_1GB 0x40000000, 0, 0x740 BLOCK_1GB 0x80000000, 0, 0x740 BLOCK_1GB 0xC0000000, 0, 0x740 .align 12 level2_pagetable: .set ADDR, 0x000 // The current page address. .rept 0x200 BLOCK_2MB (ADDR << 20), 0, 0x74C .set ADDR, ADDR+2 .endr .global _Reset

_Reset: // Initialize the stack pointer. LDR R13, =stack_top ADD R13, R13, #4 MRC P15, 0, R0, C0, C0, 5 // Read MPIDR. AND R0, R0, #0xFF // R0 == core number. MOV R2, #0x1000 MUL R1, R0, R2 // Create separate stack spaces SUB R13, R13, R1 // for each processor. // Initialize SPSR in all modes. MOV R0, #0 MSR SPSR, R0 MSR SPSR_svc, R0 // Disable L1 Caches. MRC P15, 0, R1, C1, C0, 0 // Read SCTLR. BIC R1, R1, #(0x1 << 2) // Disable D Cache. MCR P15, 0, R1, C1, C0, 0 // Write SCTLR. // Invalidate Data cache to create general-purpose code. Calculate the // cache size first and loop through each set + way. MOV R0, #0x0 // R0 = 0x0 for L1 dcache 0x2 for L2 dcache. MCR P15, 2, R0, C0, C0, 0 // CSSELR Cache Size Selection Register. MRC P15, 1, R4, C0, C0, 0 // CCSIDR read Cache Size. AND R1, R4, #0x7 ADD R1, R1, #0x4 // R1 = Cache Line Size. LDR R3, =0x7FFF AND R2, R3, R4, LSR #13 // R2 = Cache Set Number – 1. LDR R3, =0x3FF AND R3, R3, R4, LSR #3 // R3 = Cache Associativity Number – 1. CLZ R4, R3 // R4 = way position in CISW instruction. MOV R5, #0 // R5 = way loop counter. way_loop: MOV R6, #0 // R6 = set loop counter. set_loop: ORR R7, R0, R5, LSL R4 // Set way. ORR R7, R7, R6, LSL R1 // Set set. MCR P15, 0, R7, C7, C6, 2 // DCCISW R7. ADD R6, R6, #1 // Increment set counter. CMP R6, R2 // Last set reached yet? BLE set_loop // If not, iterate set_loop, ADD R5, R5, #1 // else, next way. CMP R5, R3 // Last way reached yet? BLE way_loop // if not, iterate way_loop. // Initialize TTBCR. MOV R0, #0 // Use short descriptor. MCR P15, 0, R0, C2, C0, 2 // Base address is 16KB aligned. // Perform translation table walk for TTBR0. // Initialize DACR. LDR R1, =0x55555555 // Set all domains as clients. MCR P15, 0, R1, C3, C0, 0 // Accesses are checked against the // permission bits in the translation tables. // Initialize SCTLR.AFE. MRC P15, 0, R1, C1, C0, 0 // Read SCTLR. BIC R1, R1, #(0x1 <<29) // Set AFE to 0 and disable Access Flag. MCR P15, 0, R1, C1, C0, 0 // Write SCTLR. // Initialize TTBR0. LDR R0, =ttb0_base // ttb0_base must be a 16KB-aligned address. MOV R1, #0x6A // The translation table walk is normal, inner ORR R1, R0, R1 // and outer cacheable, WB WA, and inner MCR P15, 0, R1, C2, C0, 0 // shareable. // Set up translation table entries in memory LDR R4, =0x00100000 // Increase 1MB address each time. LDR R2, =0x00015C06 // Set up translation table descriptor with // Secure, global, full accessibility, // executable. // Domain 0, Shareable, Normal cacheable memory LDR R3, =1024 // executes the loop 1024 times to set up // 1024 descriptors to cover 0-1GB memory. loop: STR R2, [R0], #4 // Build a page table section entry. ADD R2, R2, R4 // Update address part for next descriptor. SUBS R3, #1 BNE loop LDR R2, =0x40010C02 // Set up translation table descriptors with // secure, global, full accessibility, // Domain=0 Shareable Device-nGnRnE Memory. LDR R3, =3072 // Executes loop 3072 times to set up 3072 loop2: STR R2, [R0], #4 // Build a translation table section entry. ADD R2, R2, R4 // Update address part for next descriptor. SUBS R3, #1 BNE loop2 // SMP is implemented in the CPUECTLR register. MRRC P15, 1, R0, R1, C15 // Read CPUECTLR. ORR R0, R0, #(0x1 << 6) // Set SMPEN. MCRR P15, 1, R0, R1, C15 // Write CPUECTLR. // Enable caches and the MMU. MRC P15, 0, R1, C1, C0, 0 // Read SCTLR. ORR R1, R1, #(0x1 << 2) // The C bit (data cache). ORR R1, R1, #(0x1 << 12) // The I bit (instruction cache). ORR R1, R1, #0x1 // The M bit (MMU). MCR P15, 0, R1, C1, C0, 0 // Write SCTLR. DSB ISB BL main B .`

boot.ld ENTRY(_Reset) SECTIONS { . = 0x10000; .boot . : { boot.o(.text) } .text : { *(.text) } .data : { *(.data) } .bss : { *(.bss COMMON) } . = ALIGN(8); . = . + 0x1000; /* 4kB of stack memory */ stack_top = .; PROVIDE (end = .) ; }

main.cpp `#include

// // Define macro for UART end-of-transmission address

define UART_EOT_ADDR 0x1C090000

// //Define macro for the value representing session termination

define SESSION_TERMINATION_VALUE 4

int main(int argc, char argv[]) { // // Pointer to the UART end-of-transmission address int p_uart_eot_addr = (int *) UART_EOT_ADDR;

// // Write session terminate value to UART eot
*p_uart_eot_addr = SESSION_TERMINATION_VALUE;

return 0;

}`

derchr commented 6 months ago

You could check with the debug flags of gem5 whether the caches are properly enabled. Otherwise I would suggest to use my provided boot.S as a base to simplify the booting script as some of the real-world initialization steps might not be necessary in a gem5 simulation.