Hello,
I ran tuned ARM cpu full system simulation for a large workload (that takes 8 hours on a supercomputer) using gem5-dev branch code.
However, with this commit, sometimes my job is killed which I think is due to excessive memory requirement. If I run a small workload then it works fine. Could you identify or estimate, where can we improve memory management in terms of the implementation of fdp and/or the associative BTB?
I changed associative BTB to simpleBTB_v2 like below:
namespace branch_prediction
{
SimpleBTB::SimpleBTB(const SimpleBTBParams &p)
: BranchTargetBuffer(p),
numEntries(p.numEntries),
tagBits(p.tagBits),
instShiftAmt(p.instShiftAmt),
log2NumThreads(floorLog2(p.numThreads))
{
DPRINTF(BTB, "BTB: Creating BTB object.\n");
if (!isPowerOf2(numEntries)) {
fatal("BTB entries is not a power of 2!");
}
btb.resize(numEntries);
for (unsigned i = 0; i < numEntries; ++i) {
btb[i].valid = false;
}
idxMask = numEntries - 1;
tagMask = (1 << tagBits) - 1;
tagShiftAmt = instShiftAmt + floorLog2(numEntries);
}
void
SimpleBTB::memInvalidate()
{
for (unsigned i = 0; i < numEntries; ++i) {
btb[i].valid = false;
}
}
inline
unsigned
SimpleBTB::getIndex(Addr instPC, ThreadID tid)
{
// Need to shift PC over by the word offset.
return ((instPC >> instShiftAmt)
^ (tid << (tagShiftAmt - instShiftAmt - log2NumThreads)))
& idxMask;
}
inline
Addr
SimpleBTB::getTag(Addr instPC)
{
return (instPC >> tagShiftAmt) & tagMask;
}
SimpleBTB::BTBEntry *
SimpleBTB::findEntry(Addr instPC, ThreadID tid)
{
unsigned btb_idx = getIndex(instPC, tid);
Addr inst_tag = getTag(instPC);
assert(btb_idx < numEntries);
if (btb[btb_idx].valid
&& inst_tag == btb[btb_idx].tag
&& btb[btb_idx].tid == tid) {
return &btb[btb_idx];
}
return nullptr;
}
bool
SimpleBTB::valid(ThreadID tid, Addr instPC)
{
BTBEntry *entry = findEntry(instPC, tid);
return entry != nullptr;
}
// @todo Create some sort of return struct that has both whether or not the
// address is valid, and also the address. For now will just use addr = 0 to
// represent invalid entry.
const PCStateBase *
SimpleBTB::lookup(ThreadID tid, Addr instPC, BranchType type)
{
stats.lookups[type]++;
BTBEntry *entry = findEntry(instPC, tid);
if (entry) {
return entry->target.get();
}
stats.misses[type]++;
return nullptr;
}
const StaticInstPtr
SimpleBTB::lookupInst(ThreadID tid, Addr instPC)
{
BTBEntry *entry = findEntry(instPC, tid);
if (entry) {
return entry->inst;
}
return nullptr;
}
void
SimpleBTB::update(ThreadID tid, Addr instPC,
const PCStateBase &target,
BranchType type, StaticInstPtr inst)
{
unsigned btb_idx = getIndex(instPC, tid);
assert(btb_idx < numEntries);
stats.updates[type]++;
btb[btb_idx].tid = tid;
btb[btb_idx].valid = true;
set(btb[btb_idx].target, target);
btb[btb_idx].tag = getTag(instPC);
btb[btb_idx].inst = inst;
}
} // namespace branch_prediction
} // namespace gem5
Hello, I ran tuned ARM cpu full system simulation for a large workload (that takes 8 hours on a supercomputer) using gem5-dev branch code. However, with this commit, sometimes my job is killed which I think is due to excessive memory requirement. If I run a small workload then it works fine. Could you identify or estimate, where can we improve memory management in terms of the implementation of fdp and/or the associative BTB?
I changed associative BTB to simpleBTB_v2 like below: