Open hy-easyone opened 7 years ago
how you started gem5 on the command line?
As in your documentation, I run gem5 as below:
./build/ARM/gem5.opt ./configs/example/fs.py --bare-metal --disk-image=/dist/m5/system/gem5.bare-metal/common/fake.iso --kernel=/dist/m5/system/gem5.bare-metal/Interrupt/main.elf --machine-type=RealView_PBX --dtb-filename=none --mem-size=256MB
and also tried with adding --caches
the Interrupt example runs fine, but when I stop it and look at gem5/m5out/stats.txt, values for caches, such as system.cpu.icache.tags.tagsinuse, system.cpu.icache.tags.replacements, etc. are all zero. Plus the stats.txt doesn't have any cache hit/miss rates (or related values), while normal simulation of aarch64-vmlinux (gem5 fs mode example) shows them. I was wondering if it is just supposed to be like that?
Hi,
I ran into the same issue. It turns out that, in order to get cacheable accesses, you need to set up the system properly. Use the attached boot.s and you will be ready. (According to http://infocenter.arm.com/help/topic/com.arm.doc.dai0527a/DAI0527A_baremetal_boot_code_for_ARMv8_A_processors.pdf ) boot.s.zip
// Emit one 64-bit value in little-endian order: the low 32-bit word is
// placed first, the high 32-bit word second.
.macro PUT_64B high, low
        .word   \low, \high
.endm

// Emit a 64-bit entry that points to a next-level table located at PA.
// ATTR becomes the upper word; the lower word is PA with 0x3 added.
.macro TABLE_ENTRY PA, ATTR
        PUT_64B \ATTR, (\PA) + 0x3
.endm

// Emit a 64-bit entry for a 1GB block. Only PA bits [31:30] are kept;
// ATTR_LO and the low bit 0x1 are ORed into the lower word, ATTR_HI is
// the upper word.
.macro BLOCK_1GB PA, ATTR_HI, ATTR_LO
        PUT_64B \ATTR_HI, ((\PA) & 0xC0000000) | \ATTR_LO | 0x1
.endm

// Emit a 64-bit entry for a 2MB block. Only PA bits [31:21] are kept;
// ATTR_LO and the low bit 0x1 are ORed into the lower word, ATTR_HI is
// the upper word.
.macro BLOCK_2MB PA, ATTR_HI, ATTR_LO
        PUT_64B \ATTR_HI, ((\PA) & 0xFFE00000) | \ATTR_LO | 0x1
.endm
// Translation table storage.
//
// _Reset selects the short-descriptor format (TTBCR = 0) and then stores
// 1024 + 3072 = 4096 word-sized section descriptors starting at ttb0_base,
// i.e. 16KB. The area therefore has to be 16KB-aligned (TTBR0 holds only
// address bits [31:14] when TTBCR.N == 0) and 16KB long. With just the two
// 4KB tables below, the second half of that write would run past the tables
// into the code that follows them.
//
// The entries assembled here are only initial contents: _Reset overwrites
// them before the MMU is enabled.
.align 14
ttb0_base:
        TABLE_ENTRY level2_pagetable, 0
        BLOCK_1GB 0x40000000, 0, 0x740
        BLOCK_1GB 0x80000000, 0, 0x740
        BLOCK_1GB 0xC0000000, 0, 0x740
.align 12
level2_pagetable:
        .set ADDR, 0x000                // The current page address.
        .rept 0x200                     // 0x200 entries x 8 bytes = 4KB.
        BLOCK_2MB (ADDR << 20), 0, 0x74C
        .set ADDR, ADDR+2               // ADDR << 20 advances in 2MB steps.
        .endr
// Pad the table area from 8KB (the two 4KB tables above) to the 16KB that
// _Reset fills.
.space 0x2000
// _Reset: AArch32 bare-metal entry point.
//
// Sequence: set up a per-core stack, clear SPSR, disable and invalidate the
// L1 data cache, program TTBCR/DACR/SCTLR.AFE/TTBR0, build 4096 1MB section
// descriptors at ttb0_base, set SMPEN, enable D-cache, I-cache and MMU, then
// call main. Never returns; spins in "B ." if main comes back.
//
// Clobbers R0-R7 and R13. Expects the symbols stack_top, ttb0_base and main
// to be provided elsewhere.
.global _Reset
_Reset:
        // Initialize the stack pointer.
        LDR     R13, =stack_top
        // NOTE(review): the +4 leaves SP one word above stack_top. If
        // stack_top is the 8-byte-aligned end of the stack region, this puts
        // the first pushed word outside that region and breaks the 8-byte SP
        // alignment AAPCS expects at calls. stack_top is not defined in this
        // listing -- confirm against the linker script before removing.
        ADD     R13, R13, #4
        MRC     P15, 0, R0, C0, C0, 5   // Read MPIDR.
        AND     R0, R0, #0xFF           // R0 == core number.
        MOV     R2, #0x1000
        MUL     R1, R0, R2              // Create separate stack spaces
        SUB     R13, R13, R1            // (0x1000 bytes) for each processor.

        // Clear SPSR of the current mode and SPSR_svc.
        MOV     R0, #0
        MSR     SPSR, R0
        MSR     SPSR_svc, R0

        // Disable L1 Caches.
        MRC     P15, 0, R1, C1, C0, 0   // Read SCTLR.
        BIC     R1, R1, #(0x1 << 2)     // Disable D Cache.
        MCR     P15, 0, R1, C1, C0, 0   // Write SCTLR.

        // Invalidate Data cache to create general-purpose code. Calculate the
        // cache size first and loop through each set + way.
        MOV     R0, #0x0                // R0 = 0x0 for L1 dcache 0x2 for L2 dcache.
        MCR     P15, 2, R0, C0, C0, 0   // CSSELR Cache Size Selection Register.
        MRC     P15, 1, R4, C0, C0, 0   // CCSIDR read Cache Size.
        AND     R1, R4, #0x7
        ADD     R1, R1, #0x4            // R1 = log2(cache line size in bytes),
                                        // used as the shift for the set index.
        LDR     R3, =0x7FFF
        AND     R2, R3, R4, LSR #13     // R2 = Cache Set Number - 1.
        LDR     R3, =0x3FF
        AND     R3, R3, R4, LSR #3      // R3 = Cache Associativity Number - 1.
        CLZ     R4, R3                  // R4 = way position in the set/way operand.
        MOV     R5, #0                  // R5 = way loop counter.
way_loop:
        MOV     R6, #0                  // R6 = set loop counter.
set_loop:
        ORR     R7, R0, R5, LSL R4      // Set way.
        ORR     R7, R7, R6, LSL R1      // Set set.
        MCR     P15, 0, R7, C7, C6, 2   // DCISW R7 (invalidate by set/way;
                                        // c7, c6, 2 is DCISW, not DCCISW).
        ADD     R6, R6, #1              // Increment set counter.
        CMP     R6, R2                  // Last set reached yet?
        BLE     set_loop                // If not, iterate set_loop,
        ADD     R5, R5, #1              // else, next way.
        CMP     R5, R3                  // Last way reached yet?
        BLE     way_loop                // if not, iterate way_loop.

        // Initialize TTBCR.
        MOV     R0, #0                  // Use short descriptor.
        MCR     P15, 0, R0, C2, C0, 2   // Base address is 16KB aligned.
                                        // Perform translation table walk for TTBR0.

        // Initialize DACR.
        LDR     R1, =0x55555555         // Set all domains as clients.
        MCR     P15, 0, R1, C3, C0, 0   // Accesses are checked against the
                                        // permission bits in the translation tables.

        // Initialize SCTLR.AFE.
        MRC     P15, 0, R1, C1, C0, 0   // Read SCTLR.
        BIC     R1, R1, #(0x1 << 29)    // Set AFE to 0 and disable Access Flag.
        MCR     P15, 0, R1, C1, C0, 0   // Write SCTLR.

        // Initialize TTBR0.
        // Table walks: Normal memory, inner and outer cacheable WB WA, inner
        // shareable. IRGN is split across the register: IRGN[0] is bit 6 and
        // IRGN[1] is bit 0, so inner WB WA (IRGN = 0b01) sets bit 6, not
        // bit 0. 0x6A = IRGN[0] (bit 6) | NOS (bit 5) | RGN = 0b01 (bit 3)
        // | S (bit 1). The previous value 0x2B selected inner Write-Through.
        LDR     R0, =ttb0_base          // ttb0_base must be a 16KB-aligned address;
                                        // the loops below write 16KB starting there.
        MOV     R1, #0x6A
        ORR     R1, R0, R1
        MCR     P15, 0, R1, C2, C0, 0

        // Set up translation table entries in memory
        LDR     R4, =0x00100000         // Increase 1MB address each time.
        LDR     R2, =0x00015C06         // Set up translation table descriptor with
                                        // Secure, global, full accessibility,
                                        // executable.
                                        // Domain 0, Shareable, Normal cacheable memory
        LDR     R3, =1024               // executes the loop 1024 times to set up
                                        // 1024 descriptors to cover 0-1GB memory.
loop:
        STR     R2, [R0], #4            // Build a page table section entry.
        ADD     R2, R2, R4              // Update address part for next descriptor.
        SUBS    R3, R3, #1
        BNE     loop

        LDR     R2, =0x40010C02         // Set up translation table descriptors with
                                        // secure, global, full accessibility,
                                        // Domain=0 Shareable Device-nGnRnE Memory.
        LDR     R3, =3072               // Executes loop 3072 times to set up 3072
                                        // descriptors covering 1GB-4GB.
loop2:
        STR     R2, [R0], #4            // Build a translation table section entry.
        ADD     R2, R2, R4              // Update address part for next descriptor.
        SUBS    R3, R3, #1
        BNE     loop2

        // SMP is implemented in the CPUECTLR register.
        MRRC    P15, 1, R0, R1, C15     // Read CPUECTLR.
        ORR     R0, R0, #(0x1 << 6)     // Set SMPEN.
        MCRR    P15, 1, R0, R1, C15     // Write CPUECTLR.

        // Enable caches and the MMU.
        MRC     P15, 0, R1, C1, C0, 0   // Read SCTLR.
        ORR     R1, R1, #(0x1 << 2)     // The C bit (data cache).
        ORR     R1, R1, #(0x1 << 12)    // The I bit (instruction cache).
        ORR     R1, R1, #0x1            // The M bit (MMU).
        MCR     P15, 0, R1, C1, C0, 0   // Write SCTLR.
        DSB
        ISB

        BL      main
        B       .                       // Park the core if main returns.
Can you create a proper example and open a pull request for it? I would highly appreciate it!
Hi, I know it is very late response, but just wanted to add the following: I think the given example has an error (I think the error is in the ARM documentation as well). It says: MOV R1, #0x2B // The translation table walk is normal, inner ORR R1, R0, R1 // and outer cacheable, WB WA, and inner MCR P15, 0, R1, C2, C0, 0 // shareable. But bit 6 and bit 0 should have been swapped (according to this note which exists in both armv7 and armv8 documentation): Note The encoding of the IRGN bits is counter-intuitive, with register bit[6] being IRGN[0] and register bit[0] being IRGN[1]. This encoding is chosen to give a consistent encoding of memory region types and to ensure that software written for ARMv7 without the Multiprocessing Extensions can run unmodified on an implementation that includes the functionality introduced by the ARMv7 Multiprocessing Extensions
So the correct code (given the comment on the code) should be: MOV R1, #0x6A // The translation table walk is normal, inner ORR R1, R0, R1 // and outer cacheable, WB WA, and inner MCR P15, 0, R1, C2, C0, 0 // shareable.
Perhaps I misread the documentation, but both armv7 and armv8 documentation is quite clear about this and the most used configuration would be "Normal memory, Inner Write-Back Write-Allocate Cacheable." . I'm not entirely sure how this setting is used though. Since I would expect that the MMU only reads L1/L2 tables I guess it doesn't matter if you use "Write-Through, no Write-Allocate", since there is simply no write coming from MMU? But then why would there be no bit cacheable/non-cacheable? Or does this give info on how reads are to be performed? If somebody could explain how this configuration (TTBR0) is used then this would be greatly appreciated.
It could be that you're right. Can you test it and open a PR?
The code provided for enabling the cache results in the following error. Any suggestions on how to troubleshoot and resolve this issue?
Command: build/ARM/gem5.opt configs/example/fs.py --bare-metal --kernel=/home/kashishm/gem5_backup/gem5.bare-metal/Simple/main.elf --machine-type=VExpress_GEM5_V1 --cpu-type=MinorCPU --caches --l1d_size=32kB --l1i_size=32kB
Error:
gem5.opt: build/ARM/dev/arm/pl011.cc:70: virtual gem5::Tick gem5::Pl011::read(gem5::PacketPtr): Assertion `pkt->getSize() <= 4' failed. Program aborted at tick 579247000 --- BEGIN LIBC BACKTRACE --- build/ARM/gem5.opt(+0xff4750)[0x61caef010750] build/ARM/gem5.opt(+0x101918c)[0x61caef03518c] /lib/x86_64-linux-gnu/libc.so.6(+0x42520)[0x7b2a8ac42520] /lib/x86_64-linux-gnu/libc.so.6(pthread_kill+0x12c)[0x7b2a8ac96
@kashish-107
You probably need to mark the I/O region of gem5 as non-cacheable memory. Marking the lower 2GiB of the address space as a non-cacheable device memory region should fix the issue. For a better explanation of the code, refer to: https://developer.arm.com/documentation/dai0527/latest/
@derchr I'm new to this. Could you assist me with an updated boot.s file or provide a simple example of how to designate the IO region as uncacheable?
You could use this as a baseline:
.extern LD_STACK_PTR
// Emit one 64-bit value in little-endian order: the low 32-bit word is
// placed first, the high 32-bit word second.
.macro PUT_64B high, low
        .word   \low, \high
.endm

// Emit a 64-bit entry that points to a next-level table located at PA.
// ATTR becomes the upper word; the lower word is PA with 0x3 added.
.macro TABLE_ENTRY PA, ATTR
        PUT_64B \ATTR, (\PA) + 0x3
.endm

// Emit a 64-bit entry for a 1GB block. PA bits above bit 31 are ORed into
// the upper word together with ATTR_HI; the lower word keeps PA bits
// [31:30] and ORs in ATTR_LO and the low bit 0x1.
.macro BLOCK_1GB PA, ATTR_HI, ATTR_LO
        PUT_64B \ATTR_HI | ((\PA) >> 32), ((\PA) & 0xC0000000) | \ATTR_LO | 0x1
.endm

// Emit a 64-bit entry for a 2MB block. Only PA bits [31:21] are kept;
// ATTR_LO and the low bit 0x1 are ORed into the lower word, ATTR_HI is
// the upper word.
.macro BLOCK_2MB PA, ATTR_HI, ATTR_LO
        PUT_64B \ATTR_HI, ((\PA) & 0xFFE00000) | \ATTR_LO | 0x1
.endm
// Translation table of 1GB block entries, 4KB-aligned, placed in .init.
// ADDR is a running counter shared by both .rept blocks below: it advances
// by 2 per entry, so (ADDR << 29) advances in 1GB steps.
.section .init
.align 12
ttb0_base:
.set ADDR, 0x000
// First two entries (0-2GB): low attributes 0x740. Bits [4:2] are 0, which
// selects MAIR attribute 0 (programmed as Device-nGnRnE in _start).
.rept 0x02
BLOCK_1GB (ADDR << 29), 0, 0x740
.set ADDR, ADDR+2
.endr
// Cached normal DRAM region
// Remaining 0x3E entries (2GB-64GB): low attributes 0x74C. Bits [4:2] are
// 0b011, which selects MAIR attribute 3 (programmed as Normal Cacheable in
// _start).
.rept 0x3E
BLOCK_1GB (ADDR << 29), 0, 0x74C
.set ADDR, ADDR+2
.endr
// _start: AArch64 entry point, written for EL3 (it programs the *_el3
// system registers).
//
// Sets SP from LD_STACK_PTR, programs TCR_EL3 / MAIR_EL3 / TTBR0_EL3 for the
// ttb0_base table, enables D-cache, I-cache and MMU in SCTLR_EL3, then calls
// entry. Does not return; spins in "b ." if entry comes back.
//
// Clobbers x0, x1, x30 and sp.
.globl _start
_start:
        // x30 is only a scratch register here; the bl below overwrites it.
        ldr     x30, =LD_STACK_PTR
        mov     sp, x30

        // Initialize translation table control registers
        // TCR_EL3 = 0x13520:
        //   PS    = 0b001  -> 36-bit (64GB) physical address size
        //   TG0   = 0b00   -> 4KB granule
        //   SH0   = 0b11   -> Inner-shareable
        //   ORGN0 = IRGN0 = 0b01 -> table walks Normal, WB WA cacheable
        //   T0SZ  = 32     -> 4GB of virtual address space, so only the
        //                     first four 1GB entries of ttb0_base are
        //                     reachable through translation.
        ldr     x1, =0x13520
        msr     tcr_el3, x1

        // MAIR_EL3: ATTR0 Device-nGnRnE, ATTR1 Device, ATTR2 Normal
        // Non-Cacheable, ATTR3 Normal Cacheable.
        ldr     x1, =0xFF440400
        msr     mair_el3, x1

        adr     x0, ttb0_base
        msr     ttbr0_el3, x0

        // Enable MMU and caches
        mrs     x0, sctlr_el3
        orr     x0, x0, #(0x1 << 2)     // The C bit (data cache).
        orr     x0, x0, #(0x1 << 12)    // The I bit (instruction cache).
        orr     x0, x0, #0x1            // The M bit (MMU).
        msr     sctlr_el3, x0
        dsb     sy
        isb

        bl      entry
        // Park the core if entry ever returns instead of running into
        // whatever bytes follow this code.
        b       .
Make sure to properly designate the DRAM area in the linker script:
MEMORY
{
bootmem : ORIGIN = 0x0, LENGTH = 0x100000
dram : ORIGIN = 0x80000000, LENGTH = 0x40000000
// ...
And place the desired sections either into the bootmem or the DRAM: .text : { KEEP(*(.text)) } > dram
.
This would only be an example. The concrete address values probably depend on your system configuration. For that, I would highly recommend the previously cited resource and also this section of the ARM Cortex-A Series Programmer's Guide: https://developer.arm.com/documentation/den0024/a/The-Memory-Management-Unit
@derchr
Here are the assembly, linker, and application code I'm currently utilizing. Could you assist me in ensuring that the configuration is correct?
boot.s
`// Put a 64-bit value with little endianness.
.macro PUT_64B high, low
        .word   \low
        .word   \high
.endm

// Create an entry pointing to a next-level table.
.macro TABLE_ENTRY PA, ATTR
        PUT_64B \ATTR, (\PA) + 0x3
.endm

// Create an entry for a 1GB block.
.macro BLOCK_1GB PA, ATTR_HI, ATTR_LO
        PUT_64B \ATTR_HI, ((\PA) & 0xC0000000) | \ATTR_LO | 0x1
.endm

// Create an entry for a 2MB block.
.macro BLOCK_2MB PA, ATTR_HI, ATTR_LO
        PUT_64B \ATTR_HI, ((\PA) & 0xFFE00000) | \ATTR_LO | 0x1
.endm

// Translation table storage.
//
// _Reset selects the short-descriptor format (TTBCR = 0) and stores
// 1024 + 3072 = 4096 word-sized section descriptors from ttb0_base, i.e.
// 16KB. The area must therefore be 16KB-aligned and 16KB long; with only
// the two 4KB tables the stores would run on into the _Reset code placed
// right behind them. The entries assembled here are just initial contents
// and are overwritten before the MMU is enabled.
.align 14
ttb0_base:
        TABLE_ENTRY level2_pagetable, 0
        BLOCK_1GB 0x40000000, 0, 0x740
        BLOCK_1GB 0x80000000, 0, 0x740
        BLOCK_1GB 0xC0000000, 0, 0x740
.align 12
level2_pagetable:
        .set ADDR, 0x000                // The current page address.
        .rept 0x200
        BLOCK_2MB (ADDR << 20), 0, 0x74C
        .set ADDR, ADDR+2
        .endr
// Pad the table area from 8KB to the 16KB that _Reset fills.
.space 0x2000

// _Reset: AArch32 bare-metal entry point. Sets up a per-core stack, clears
// SPSR, disables and invalidates the L1 data cache, programs the MMU
// registers, builds 4096 1MB section descriptors at ttb0_base, sets SMPEN,
// enables D-cache, I-cache and MMU, then calls main. Never returns.
// Clobbers R0-R7 and R13.
.global _Reset
_Reset:
        // Initialize the stack pointer.
        // SP starts exactly at stack_top. boot.ld defines stack_top as the
        // 8-byte-aligned end of the 4kB stack area, and the stack grows
        // downwards, so no offset is needed. The former "+4" left SP
        // misaligned (4 mod 8) and made the first push land outside the
        // reserved area.
        LDR     R13, =stack_top
        MRC     P15, 0, R0, C0, C0, 5   // Read MPIDR.
        AND     R0, R0, #0xFF           // R0 == core number.
        MOV     R2, #0x1000
        MUL     R1, R0, R2              // Create separate stack spaces
        SUB     R13, R13, R1            // (0x1000 bytes) for each processor.

        // Clear SPSR of the current mode and SPSR_svc.
        MOV     R0, #0
        MSR     SPSR, R0
        MSR     SPSR_svc, R0

        // Disable L1 Caches.
        MRC     P15, 0, R1, C1, C0, 0   // Read SCTLR.
        BIC     R1, R1, #(0x1 << 2)     // Disable D Cache.
        MCR     P15, 0, R1, C1, C0, 0   // Write SCTLR.

        // Invalidate Data cache to create general-purpose code. Calculate the
        // cache size first and loop through each set + way.
        MOV     R0, #0x0                // R0 = 0x0 for L1 dcache 0x2 for L2 dcache.
        MCR     P15, 2, R0, C0, C0, 0   // CSSELR Cache Size Selection Register.
        MRC     P15, 1, R4, C0, C0, 0   // CCSIDR read Cache Size.
        AND     R1, R4, #0x7
        ADD     R1, R1, #0x4            // R1 = log2(cache line size in bytes),
                                        // used as the shift for the set index.
        LDR     R3, =0x7FFF
        AND     R2, R3, R4, LSR #13     // R2 = Cache Set Number - 1.
        LDR     R3, =0x3FF
        AND     R3, R3, R4, LSR #3      // R3 = Cache Associativity Number - 1.
        CLZ     R4, R3                  // R4 = way position in the set/way operand.
        MOV     R5, #0                  // R5 = way loop counter.
way_loop:
        MOV     R6, #0                  // R6 = set loop counter.
set_loop:
        ORR     R7, R0, R5, LSL R4      // Set way.
        ORR     R7, R7, R6, LSL R1      // Set set.
        MCR     P15, 0, R7, C7, C6, 2   // DCISW R7 (invalidate by set/way;
                                        // c7, c6, 2 is DCISW, not DCCISW).
        ADD     R6, R6, #1              // Increment set counter.
        CMP     R6, R2                  // Last set reached yet?
        BLE     set_loop                // If not, iterate set_loop,
        ADD     R5, R5, #1              // else, next way.
        CMP     R5, R3                  // Last way reached yet?
        BLE     way_loop                // if not, iterate way_loop.

        // Initialize TTBCR.
        MOV     R0, #0                  // Use short descriptor.
        MCR     P15, 0, R0, C2, C0, 2   // Base address is 16KB aligned.
                                        // Perform translation table walk for TTBR0.

        // Initialize DACR.
        LDR     R1, =0x55555555         // Set all domains as clients.
        MCR     P15, 0, R1, C3, C0, 0   // Accesses are checked against the
                                        // permission bits in the translation tables.

        // Initialize SCTLR.AFE.
        MRC     P15, 0, R1, C1, C0, 0   // Read SCTLR.
        BIC     R1, R1, #(0x1 << 29)    // Set AFE to 0 and disable Access Flag.
        MCR     P15, 0, R1, C1, C0, 0   // Write SCTLR.

        // Initialize TTBR0.
        // 0x6A = IRGN[0] (bit 6) | NOS (bit 5) | RGN = 0b01 (bit 3) | S (bit 1):
        // table walks are Normal, inner and outer cacheable WB WA, inner
        // shareable.
        LDR     R0, =ttb0_base          // ttb0_base must be a 16KB-aligned address.
        MOV     R1, #0x6A
        ORR     R1, R0, R1
        MCR     P15, 0, R1, C2, C0, 0

        // Set up translation table entries in memory
        // NOTE(review): this maps 0-1GB as Normal cacheable and 1GB-4GB as
        // device memory. On a platform whose peripherals sit below 1GB they
        // would be accessed as cacheable memory; the I/O region has to be
        // mapped as device memory instead. Adjust the two loop counts and
        // attributes to the memory map of the simulated machine.
        LDR     R4, =0x00100000         // Increase 1MB address each time.
        LDR     R2, =0x00015C06         // Set up translation table descriptor with
                                        // Secure, global, full accessibility,
                                        // executable.
                                        // Domain 0, Shareable, Normal cacheable memory
        LDR     R3, =1024               // executes the loop 1024 times to set up
                                        // 1024 descriptors to cover 0-1GB memory.
loop:
        STR     R2, [R0], #4            // Build a page table section entry.
        ADD     R2, R2, R4              // Update address part for next descriptor.
        SUBS    R3, R3, #1
        BNE     loop

        LDR     R2, =0x40010C02         // Set up translation table descriptors with
                                        // secure, global, full accessibility,
                                        // Domain=0 Shareable Device-nGnRnE Memory.
        LDR     R3, =3072               // Executes loop 3072 times to set up 3072
                                        // descriptors covering 1GB-4GB.
loop2:
        STR     R2, [R0], #4            // Build a translation table section entry.
        ADD     R2, R2, R4              // Update address part for next descriptor.
        SUBS    R3, R3, #1
        BNE     loop2

        // SMP is implemented in the CPUECTLR register.
        MRRC    P15, 1, R0, R1, C15     // Read CPUECTLR.
        ORR     R0, R0, #(0x1 << 6)     // Set SMPEN.
        MCRR    P15, 1, R0, R1, C15     // Write CPUECTLR.

        // Enable caches and the MMU.
        MRC     P15, 0, R1, C1, C0, 0   // Read SCTLR.
        ORR     R1, R1, #(0x1 << 2)     // The C bit (data cache).
        ORR     R1, R1, #(0x1 << 12)    // The I bit (instruction cache).
        ORR     R1, R1, #0x1            // The M bit (MMU).
        MCR     P15, 0, R1, C1, C0, 0   // Write SCTLR.
        DSB
        ISB

        BL      main
        B       .`
boot.ld
/* Execution starts at the _Reset label. */
ENTRY(_Reset)

SECTIONS
{
    /* Output starts at address 0x10000. */
    . = 0x10000;

    /* The .text of boot.o is placed first, at the current location. */
    .boot . : { boot.o(.text) }

    .text : { *(.text) }
    .data : { *(.data) }
    .bss  : { *(.bss COMMON) }

    /* Stack: align to 8 bytes, then reserve the area below stack_top. */
    . = ALIGN(8);
    . += 0x1000;            /* 4kB of stack memory */
    stack_top = .;

    PROVIDE (end = .) ;
}
main.cpp
`#include
// // Define macro for UART end-of-transmission address
// //Define macro for the value representing session termination
int main(int argc, char argv[]) { // // Pointer to the UART end-of-transmission address int p_uart_eot_addr = (int *) UART_EOT_ADDR;
// // Write session terminate value to UART eot
*p_uart_eot_addr = SESSION_TERMINATION_VALUE;
return 0;
}`
You could check with the debug flags of gem5 whether the caches are properly enabled. Otherwise I would suggest to use my provided boot.S as a base to simplify the booting script as some of the real-world initialization steps might not be necessary in a gem5 simulation.
Hi, I find your code really helpful as a development base for firmware simulation, but it turns out that the L1 and L2 cache simulation results (such as hit/miss rates) do not appear in stats.txt. Is that because the cache has not been enabled yet in the kernel code? (I'm a newbie in the kernel/system area, so I don't really understand what's going on.) If that's the case, is it complicated to write code that initializes the cache in the boot.S file?
Thank you!