// Builds a cache packet that is part of a coherence transaction.
//
// reqNode   - stored as requesterID
// from, to  - node IDs, wrapped in Coord objects for the base packet
// flits     - forwarded to the base constructor
// _class    - stored as m_class
// _vcclass  - translated by mapClass() and stored as m_VCclass
// _cb       - stored as cb
// txn, critical - forwarded unchanged to the base constructor
public CachePacket(int reqNode, int from, int to, int flits,
                   int _class, int _vcclass, Simulator.Ready _cb,
                   CmpCache_Txn txn, bool critical)
    : base(null, 0, flits, new Coord(from), new Coord(to), txn, critical)
{
    this.cb = _cb;
    this.m_class = _class;

    // assign network VC class here.
    this.m_VCclass = mapClass(_vcclass);

    this.requesterID = reqNode;
}
// Builds a cache packet without transaction information.
//
// reqNode   - stored as requesterID
// from, to  - node IDs, wrapped in Coord objects for the base packet
// flits     - forwarded to the base constructor
// _class    - stored as m_class
// _vcclass  - translated by mapClass() and stored as m_VCclass
// _cb       - stored as cb
public CachePacket(int reqNode, int from, int to, int flits,
                   int _class, int _vcclass, Simulator.Ready _cb)
    : base(null, 0, flits, new Coord(from), new Coord(to))
{
    this.cb = _cb;
    this.m_class = _class;
    this.m_VCclass = mapClass(_vcclass);
    this.requesterID = reqNode;
}
// TODO (Rachata): here is the problem -- cache packets lose everything related
// to the GPU, which matters if we need to use the shared CPU-GPU cache.
//
// Same as the 7-argument constructor, but additionally records whether the
// packet originates from the GPU (is_GPU is stored in from_GPU).
public CachePacket(int reqNode, int from, int to, int flits,
                   int _class, int _vcclass, Simulator.Ready _cb,
                   bool is_GPU)
    : base(null, 0, flits, new Coord(from), new Coord(to))
{
    this.cb = _cb;
    this.m_class = _class;
    this.m_VCclass = mapClass(_vcclass);
    this.requesterID = reqNode;

    // cache requests -- TODO: this is a hack; once we start using a shared
    // CPU-GPU cache this needs to be fixed
    this.from_GPU = is_GPU;
}
// Issues a memory access for `addr` on behalf of `requestor`.
// The target node is chosen by map_addr_mem(); the new Request and the
// completion callback `cb` are handed to that node's `mem` object.
void access_mem(int requestor, ulong addr, bool write, Simulator.Ready cb)
{
    Request memReq = new Request(requestor, addr, write);
    int targetNode = map_addr_mem(requestor, addr);

    Simulator.network.nodes[targetNode].mem.access(memReq, cb);
}
// Wraps a protocol message in a CachePacket and queues it at node `from`.
//
// The packet class (used for split queues) is derived here:
//   0 = ctl       (single-flit packet)
//   1 = data      (more than one flit)
//   2 = off-crit  (writebacks; selected whenever off_crit is set)
// `vc` is passed to the CachePacket constructor as its VC-class argument.
void send_noc(int reqNode, int from, int to, int flits, Simulator.Ready cb, bool off_crit, int vc)
{
    int pktClass;
    if (off_crit)
    {
        pktClass = 2;
    }
    else if (flits > 1)
    {
        pktClass = 1;
    }
    else
    {
        pktClass = 0;
    }

    CachePacket pkt = new CachePacket(reqNode, from, to, flits, pktClass, vc, cb);
    Simulator.network.nodes[from].queuePacket(pkt);
}
// Performs one cache access for `node` at `addr`: probes the private and shared
// caches, updates the functional coherence state, builds the DAG of protocol
// packets (if any protocol interaction is needed), and schedules its start.
//
// Parameters:
//   node  - index of the requesting node (indexes m_prv)
//   addr  - address being accessed
//   write - true for a write, false for a read
//   cb    - completion callback; attached to the transaction (txn.cb) when one is
//           created, otherwise deferred directly by m_prvdelay cycles
//
// Out-parameters (all assigned on every path):
//   L1hit    - block was found in the private cache
//   L1upgr   - private hit without exclusive ownership (L1hit && !prv_excl)
//   L1ev     - inserting into the private cache evicted a block
//   L1wb     - set by do_evict() for a private-cache eviction
//   L2access - data is supplied by the shared slice (CASE 2, no private owners)
//              or the access goes to memory (CASE 3)
//   L2hit    - block was found in the shared cache (always true when m_sh_perfect)
//   L2ev     - inserting into the shared cache evicted a block
//   L2wb     - the evicted shared block was dirty and a memory writeback was added
//   c2c      - data is transferred from another private cache
public void access(int node, ulong addr, bool write, Simulator.Ready cb,
        out bool L1hit, out bool L1upgr, out bool L1ev, out bool L1wb,
        out bool L2access, out bool L2hit, out bool L2ev, out bool L2wb, out bool c2c)
{
    CmpCache_Txn txn = null;
    int sh_slice = map_addr(node, addr);

    // ------------- first, we probe the cache (private, and shared if necessary) to
    //               determine current state.

    // probe private cache
    CmpCache_State state;
    bool prv_state;
    bool prv_hit = m_prv[node].probe(addr, out prv_state);

    bool sh_hit = false;

    if (m_sh_perfect)
    {
        // perfect shared cache: every block hits; per-block state is created
        // on first touch and kept in the m_perf_sh dictionary
        ulong blk = addr >> m_blkshift;
        sh_hit = true;
        if (m_perf_sh.ContainsKey(blk))
            state = m_perf_sh[blk];
        else
        {
            state = new CmpCache_State();
            m_perf_sh[blk] = state;
        }
    }
    else
        sh_hit = m_sh.probe(addr, out state);

    // exclusivity is only meaningful when the shared cache holds the block's state
    bool prv_excl = sh_hit ? (state.excl == node) : false;

    if (prv_hit)
        // we always update the timestamp on the private cache
        m_prv[node].update(addr, Simulator.CurrentRound);

    // out-params
    L1hit = prv_hit;
    L1upgr = L1hit && !prv_excl;
    L2hit = sh_hit;
    c2c = false; // will be set below for appropriate cases
    L1ev = false; // will be set below
    L1wb = false; // will be set below
    L2ev = false; // will be set below
    L2wb = false; // will be set below
    L2access = false; // will be set below

    // ----------------- now, we execute one of four cases:
    // 1a. present in private cache, with appropriate ownership.
    // 1b. present in private cache, but not excl (for a write)
    // 2. not present in private cache, but in shared cache.
    // 3. not present in private or shared cache.
    //
    // in each case, we update functional state and generate the packet DAG as we go.

    if (prv_hit && (!write || prv_excl)) // CASE 1a: present in prv cache, have excl if write
    {
        // just set modified-bit in state, then we're done (no protocol interaction)
        if (write) state.modified = true;
    }
    else if (prv_hit && write && !prv_excl) // CASE 1b: present in prv cache, need upgr
    {
        // NOTE(review): this branch dereferences `state` without checking sh_hit;
        // presumably the inclusive hierarchy guarantees a shared hit whenever the
        // private cache hits -- confirm.
        txn = new CmpCache_Txn();
        txn.node = node;

        // request packet
        CmpCache_Pkt req_pkt = add_ctl_pkt(txn, node, sh_slice, false);
        CmpCache_Pkt done_pkt = null;

        // present in others?
        if (state.owners.others_set(node))
        {
            done_pkt = do_inval(txn, state, req_pkt, node, addr);
        }
        else
        {
            // not present in others, but we didn't have excl -- send empty grant
            // (could happen if others have evicted and we are the only one left)
            done_pkt = add_ctl_pkt(txn, sh_slice, node, true);
            done_pkt.delay = m_shdelay;
            add_dep(req_pkt, done_pkt);
        }

        // requester becomes the sole, exclusive, modified owner
        state.owners.reset();
        state.owners.set(node);
        state.excl = node;
        state.modified = true;
    }
    else if (!prv_hit && sh_hit) // CASE 2: not in prv cache, but in sh cache
    {
        txn = new CmpCache_Txn();
        txn.node = node;

        // update functional shared state
        if (!m_sh_perfect)
            m_sh.update(addr, Simulator.CurrentRound);

        // request packet
        CmpCache_Pkt req_pkt = add_ctl_pkt(txn, node, sh_slice, false);
        // done_pkt ends up as the packet that delivers data to the requester;
        // a private-cache eviction (below) is chained off it
        CmpCache_Pkt done_pkt = null;

        if (state.owners.any_set()) // in other caches?
        {
            if (write) // need to invalidate?
            {
                if (state.excl != -1) // someone else has exclusive -- c-to-c xfer
                {
                    c2c = true; // out-param

                    CmpCache_Pkt xfer_req = add_ctl_pkt(txn, sh_slice, state.excl, false);
                    CmpCache_Pkt xfer_dat = add_data_pkt(txn, state.excl, node, true);
                    done_pkt = xfer_dat;

                    xfer_req.delay = m_shdelay;
                    xfer_dat.delay = m_prvdelay;

                    add_dep(req_pkt, xfer_req);
                    add_dep(xfer_req, xfer_dat);

                    // functionally drop the block from the previous exclusive owner
                    bool evicted_state;
                    m_prv[state.excl].inval(addr, out evicted_state);
                }
                else // others have it -- inval to all, c-to-c from closest
                {
                    int close = closest(node, state.owners);
                    if (close != -1) c2c = true; // out-param

                    done_pkt = do_inval(txn, state, req_pkt, node, addr, close);
                }

                // for a write, we need exclusive -- update state
                state.owners.reset();
                state.owners.set(node);
                state.excl = node;
                state.modified = true;
            }
            else // just a read -- joining sharer set, c-to-c from closest
            {
                if (state.excl != -1)
                {
                    CmpCache_Pkt xfer_req = add_ctl_pkt(txn, sh_slice, state.excl, false);
                    CmpCache_Pkt xfer_dat = add_data_pkt(txn, state.excl, node, true);
                    done_pkt = xfer_dat;

                    c2c = true; // out-param

                    xfer_req.delay = m_shdelay;
                    xfer_dat.delay = m_prvdelay;

                    add_dep(req_pkt, xfer_req);
                    add_dep(xfer_req, xfer_dat);

                    // downgrade must also trigger writeback
                    if (state.modified)
                    {
                        CmpCache_Pkt wb_dat = add_data_pkt(txn, state.excl, sh_slice, false);
                        add_dep(xfer_req, wb_dat);
                        state.modified = false;
                        state.sh_dirty = true;
                    }
                }
                else
                {
                    int close = closest(node, state.owners);
                    if (close != -1) c2c = true; // out-param

                    // NOTE(review): `close` is used as a node ID below even if it is -1;
                    // presumably closest() cannot return -1 when owners.any_set() -- confirm.
                    CmpCache_Pkt xfer_req = add_ctl_pkt(txn, sh_slice, close, false);
                    CmpCache_Pkt xfer_dat = add_data_pkt(txn, close, node, true);
                    done_pkt = xfer_dat;

                    xfer_req.delay = m_shdelay;
                    xfer_dat.delay = m_prvdelay;

                    add_dep(req_pkt, xfer_req);
                    add_dep(xfer_req, xfer_dat);
                }

                // requester joins the sharer set; nobody is exclusive any more
                state.owners.set(node);
                state.excl = -1;
            }
        }
        else
        {
            // not in other prv caches, need to get from shared slice
            L2access = true;

            CmpCache_Pkt dat_resp = add_data_pkt(txn, sh_slice, node, true);
            done_pkt = dat_resp;

            add_dep(req_pkt, done_pkt);

            dat_resp.delay = m_shdelay;

            state.owners.reset();
            state.owners.set(node);
            state.excl = node;
            state.modified = write;
        }

        // insert into private cache, get evicted block (if any)
        ulong evict_addr;
        bool evict_data;
        bool evicted = m_prv[node].insert(addr, true, out evict_addr, out evict_data, Simulator.CurrentRound);

        // add either a writeback or a release packet
        if (evicted)
        {
            L1ev = true;
            do_evict(txn, done_pkt, node, evict_addr, out L1wb);
        }
    }
    else if (!prv_hit && !sh_hit) // CASE 3: not in prv or shared cache
    {
        // here, we need to go to memory
        Debug.Assert(!m_sh_perfect);

        txn = new CmpCache_Txn();
        txn.node = node;

        L2access = true;

        // request packet
        CmpCache_Pkt req_pkt = add_ctl_pkt(txn, node, sh_slice, false);

        // cache response packet
        CmpCache_Pkt resp_pkt = add_data_pkt(txn, sh_slice, node, true);
        resp_pkt.delay = m_opdelay; // req already active -- just a pass-through op delay here

        // memory request packet
        int mem_slice = map_addr_mem(node, addr);
        CmpCache_Pkt memreq_pkt = add_ctl_pkt(txn, sh_slice, mem_slice, false);
        memreq_pkt.delay = m_shdelay;

        // memory-access virtual node (send = false: not sent as a network packet;
        // mem = true: marks it as a memory operation)
        CmpCache_Pkt mem_access = add_ctl_pkt(txn, 0, 0, false);
        mem_access.send = false;
        mem_access.mem = true;
        mem_access.mem_addr = addr;
        mem_access.mem_write = false; // cache-line fill
        mem_access.mem_requestor = node;

        // memory response packet
        CmpCache_Pkt memresp_pkt = add_data_pkt(txn, mem_slice, sh_slice, false);

        // connect up the critical path first
        add_dep(req_pkt, memreq_pkt);
        add_dep(memreq_pkt, mem_access);
        add_dep(mem_access, memresp_pkt);
        add_dep(memresp_pkt, resp_pkt);

        // now, handle replacement in the shared cache...
        CmpCache_State new_state = new CmpCache_State();

        new_state.owners.reset();
        new_state.owners.set(node);
        new_state.excl = node;
        new_state.modified = write;
        new_state.sh_dirty = false;

        ulong sh_evicted_addr;
        CmpCache_State sh_evicted_state;
        bool evicted = m_sh.insert(addr, new_state, out sh_evicted_addr, out sh_evicted_state, Simulator.CurrentRound);

        if (evicted)
        {
            // shared-cache eviction (different from the private-cache evictions elsewhere):
            // we must evict any private-cache copies, because we model an inclusive hierarchy.

            L2ev = true;

            // join point: all private-cache invalidations feed into it, and the
            // memory writeback (if any) depends on it
            CmpCache_Pkt prv_evict_join = add_joinpt(txn, false);

            if (sh_evicted_state.excl != -1) // evicted block lives only in one prv cache
            {
                // invalidate request to prv cache before sh cache does eviction
                CmpCache_Pkt prv_invl = add_ctl_pkt(txn, sh_slice, sh_evicted_state.excl, false);
                add_dep(memresp_pkt, prv_invl);
                CmpCache_Pkt prv_wb;

                prv_invl.delay = m_opdelay;

                if (sh_evicted_state.modified)
                {
                    // writeback
                    prv_wb = add_data_pkt(txn, sh_evicted_state.excl, sh_slice, false);
                    prv_wb.delay = m_prvdelay;
                    // the written-back data makes the shared copy dirty, so the
                    // memory-writeback check below will fire
                    sh_evicted_state.sh_dirty = true;
                }
                else
                {
                    // simple ACK
                    prv_wb = add_ctl_pkt(txn, sh_evicted_state.excl, sh_slice, false);
                    prv_wb.delay = m_prvdelay;
                }

                add_dep(prv_invl, prv_wb);
                add_dep(prv_wb, prv_evict_join);

                bool prv_evicted_dat;
                m_prv[sh_evicted_state.excl].inval(sh_evicted_addr, out prv_evicted_dat);
            }
            else if (sh_evicted_state.owners.any_set()) // evicted block has greater-than-one sharer set
            {
                // one invalidate/ack pair per sharer, all joining at prv_evict_join
                for (int i = 0; i < m_N; i++)
                    if (sh_evicted_state.owners.is_set(i))
                    {
                        CmpCache_Pkt prv_invl = add_ctl_pkt(txn, sh_slice, i, false);
                        CmpCache_Pkt prv_ack = add_ctl_pkt(txn, i, sh_slice, false);

                        prv_invl.delay = m_opdelay;
                        prv_ack.delay = m_prvdelay;

                        add_dep(memresp_pkt, prv_invl);
                        add_dep(prv_invl, prv_ack);
                        add_dep(prv_ack, prv_evict_join);

                        bool prv_evicted_dat;
                        m_prv[i].inval(sh_evicted_addr, out prv_evicted_dat);
                    }
            }
            else // evicted block has no owners (was only in shared cache)
            {
                add_dep(memresp_pkt, prv_evict_join);
            }

            // now writeback to memory, if we were dirty
            if (sh_evicted_state.sh_dirty)
            {
                CmpCache_Pkt mem_wb = add_data_pkt(txn, sh_slice, mem_slice, false);
                mem_wb.delay = m_opdelay;
                add_dep(prv_evict_join, mem_wb);
                // virtual node for the memory write itself (not sent on the network)
                CmpCache_Pkt mem_wb_op = add_ctl_pkt(txn, 0, 0, false);
                mem_wb_op.send = false;
                mem_wb_op.mem = true;
                mem_wb_op.mem_addr = sh_evicted_addr;
                mem_wb_op.mem_write = true;
                mem_wb_op.mem_requestor = node;
                add_dep(mem_wb, mem_wb_op);
                L2wb = true;
            }
        }

        // ...and insert and handle replacement in the private cache
        ulong evict_addr;
        bool evict_data;
        bool prv_evicted = m_prv[node].insert(addr, true, out evict_addr, out evict_data, Simulator.CurrentRound);

        // add either a writeback or a release packet
        if (prv_evicted)
        {
            L1ev = true;
            do_evict(txn, resp_pkt, node, evict_addr, out L1wb);
        }
    }
    else // shouldn't happen.
        Debug.Assert(false);

    // now start the transaction, if one was needed
    if (txn != null)
    {
        txn.cb = cb;

        assignVCclasses(txn.pkts);

        // start running the protocol DAG, deferred by the private-cache delay.
        // It may be an empty graph (for a silent upgr).
        Simulator.Defer(delegate() { start_pkts(txn); }, Simulator.CurrentRound + m_prvdelay);
    }
    // no transaction -- just the cache access delay. schedule deferred callback.
    else
    {
        Simulator.Defer(cb, Simulator.CurrentRound + m_prvdelay);
    }
}
// Schedules a deferred callback: `cb` is placed on m_deferQueue together
// with the cycle value `cyc`.
public static void Defer(Simulator.Ready cb, ulong cyc)
{
    m_deferQueue.Enqueue(cb, cyc);
}
// Wraps `req` in a memory request and marks it as having been to memory.
//
// req - the originating request; its beenToMemory flag is set here, and its
//       write / from_GPU flags are copied into this object
// cb  - completion callback, stored as-is
//
// The address-derived fields (shift_row, mem_index, channel_index, rank_index,
// bank_index, row_index) are filled in by mapAddr().
public MemoryRequest(Request req, Simulator.Ready cb)
{
    this.cb = cb;
    this.request = req;
    this.isWrite = req.write;
    req.beenToMemory = true;

    // older form, without the requester ID:
    // mapAddr(req.blockAddress, out shift_row, out mem_index, out channel_index,
    //         out rank_index, out bank_index, out row_index);
    mapAddr(req.requesterID, req.blockAddress,
            out shift_row,
            out mem_index,
            out channel_index,
            out rank_index,
            out bank_index,
            out row_index);
    // Console.WriteLine("Address:{0:x}, shift_row:{1:x}", req.address, shift_row );

    // scheduling related
    this.isMarked = false;

    /* HWA CODE */
    // Bug Fix??
    this.from_GPU = req.from_GPU;
}
// Wraps `req` in a memory request and marks it as having been to memory.
//
// req - the originating request; its beenToMemory flag is set here
// cb  - completion callback, stored as-is
//
// m_index, b_index, r_index and glob_b_index are filled in by mapAddr() from
// the request's block address. No scheduler is attached (sched is null).
public MemoryRequest(Request req, Simulator.Ready cb)
{
    this.cb = cb;
    this.request = req;
    req.beenToMemory = true;

    mapAddr(req.blockAddress,
            out m_index,
            out b_index,
            out r_index,
            out glob_b_index);

    // scheduling related
    //sched = Config.memory.mem[m_index].sched;
    this.sched = null;
    this.isMarked = false;
}
/*
public void receivePacket(MemoryPacket p)
{
    Simulator.Ready cb;

    //receive WB or request from memory
    if(p.type == MemoryRequestType.RD)
    {
        cb = delegate()
        {
            MemoryPacket mp = new MemoryPacket(
                p.request, p.block,
                MemoryRequestType.DAT, p.dest, p.src);

            node.queuePacket(mp);
        };
    }
    else
    {
        // WB don't need a callback
        cb = delegate(){};
    }

    access(p.request, cb);
}
*/

// Turns `req` into a MemoryRequest, hands it to the scheduler, and then bumps
// the outstanding-request counters (per requester and total) of the bank the
// request maps to.
public void access(Request req, Simulator.Ready cb)
{
    MemoryRequest mreq = new MemoryRequest(req, cb);
    sched.issue_req(mreq);

    // the bank index is read only after the request has been issued
    int bankIdx = mreq.b_index;
    bank[bankIdx].outstandingReqs_perapp[req.requesterID] += 1;
    bank[bankIdx].outstandingReqs += 1;
}
// Called by CPU.cs to issue a request to the MemoryCoalescing unit.
// With Config.useMemoryCoalescing set, the request is simply placed in the
// client queue selected by (client, read/write); otherwise it goes straight
// to the cache hierarchy.
// This cannot be used when we model the network.
public void issueReq(int targetID, Request req, Simulator.Ready cb)
{
    // Console.WriteLine("In MemoryCoalescing, issueReq is called requester {0}, addr = {1} at cycle {2}", req.requesterID, req.address, Simulator.CurrentRound);
    MemoryRequest mreq = new MemoryRequest(req, cb);
    mreq.from_GPU = true;
    //Console.WriteLine("Get a GPU Request {0}", req.from_GPU);

    int clientIdx = (int)req.client;
    int rwIdx;
    if (req.write)
    {
        rwIdx = 1;
    }
    else
    {
        rwIdx = 0;
    }

    if (!Config.useMemoryCoalescing)
    {
        // direct path: the hit/miss outcome flags are required by the cache
        // API but are not used here
        bool l1hit, l1upgr, l1ev, l1wb;
        bool l2access, l2hit, l2ev, l2wb, c2c;
        Simulator.network.cache.access(
            req.requesterID, req.address, req.write, cb,
            out l1hit, out l1upgr, out l1ev, out l1wb,
            out l2access, out l2hit, out l2ev, out l2wb, out c2c);
        return;
    }

    // Console.WriteLine("In MemoryCoalescing, enqueue to the client queue requester {0}, addr = {1} at cycle {2}", req.requesterID, req.address, Simulator.CurrentRound);
    clientQueue[clientIdx, rwIdx].Enqueue(new Tuple3(targetID, Simulator.CurrentRound, mreq));
}
// Advances this router by one step.
//
// Order of operations:
//   1. stepNacks() and ejectLocal() are called, and the input[] array is cleared.
//   2. Non-head flits (from the four input links and from the inject slot) are
//      forwarded along the output recorded in wormRouting[] for their worm.
//   3. Remaining (head) flits are collected and arbitrated for outputs:
//      candidate outputs -> least-contended choice -> highest retransmit count
//      -> round-robin winner.
//   4. Losers are NACKed and their worm is marked as dropping; winners get a
//      NACK wire allocated and are placed on the output link.
//   5. Finally, a waiting head flit in the inject slot tries its x direction,
//      then its y direction.
//
// wormRouting[] conventions visible in this method: indices 0..3 are the input
// directions and index 4 is the local inject slot; -1 means "no worm routed",
// -2 means "worm is being dropped", any other value is the output direction.
protected override void _doStep()
{
    stepNacks();
    /* bool ejectedThisCycle = */ ejectLocal();

    for (int i = 0; i < 4; i++)
    {
        input[i] = null;
    }

    // first, propagate the non-head flits along their worm paths
    // (no truncation, so this is very simple)
    for (int dir = 0; dir < 4; dir++)
    {
        if (linkIn[dir] != null && linkIn[dir].Out != null &&
            !linkIn[dir].Out.isHeadFlit)
        {
#if DEBUG
            Console.WriteLine("non-head flit: {0}", linkIn[dir].Out);
#endif
            Flit f = linkIn[dir].Out; // grab the input flit from the link
            linkIn[dir].Out = null;

            if (wormRouting[dir] == -1)
            {
                // AGH: worm not routed
                throw new Exception("SHOULDN'T HAPPEN!");
            }

            if (wormRouting[dir] != -2) // if not dropping, propagate the flit
            {
                linkOut[wormRouting[dir]].In = f;
            }
            if (f.isTailFlit) // if last flit, close the wormhole
            {
                wormRouting[dir] = -1;
            }
        }
    }
    // same for a body/tail flit waiting in the local inject slot (worm index 4)
    if (m_injectSlot != null && !m_injectSlot.isHeadFlit)
    {
        linkOut[wormRouting[4]].In = m_injectSlot;
        if (m_injectSlot.isTailFlit)
        {
            wormRouting[4] = -1;
        }
        m_injectSlot = null;
    }

    // grab inputs into a local array
    // (input[] is packed from index 0, so later loops can stop at the first null)
    int c = 0;
    for (int dir = 0; dir < 4; dir++)
    {
        if (linkIn[dir] != null && linkIn[dir].Out != null)
        {
            linkIn[dir].Out.inDir = dir; // record this for below
            input[c++] = linkIn[dir].Out;
            linkIn[dir].Out = null;
        }
    }

    // step 1: get possible-output vectors for each input
    // (an output is a candidate only if its link is free and a NACK wire is available)
    bool[,] possible = new bool[4, 4]; // (input,direction)
    int[] possible_count = new int[4];
    for (int i = 0; i < 4 && input[i] != null; i++)
    {
        PreferredDirection pd = determineDirection(input[i].dest);

        if (pd.xDir != Simulator.DIR_NONE && linkOut[pd.xDir].In == null)
        {
            if (nackAvailable(pd.xDir))
            {
                possible[i, pd.xDir] = true;
            }
            else
            {
                Simulator.stats.nack_unavail.Add();
                Simulator.stats.nack_unavail_by_src[ID].Add();
            }
        }
        if (pd.yDir != Simulator.DIR_NONE && linkOut[pd.yDir].In == null)
        {
            if (nackAvailable(pd.yDir))
            {
                possible[i, pd.yDir] = true;
            }
            else
            {
                Simulator.stats.nack_unavail.Add();
                Simulator.stats.nack_unavail_by_src[ID].Add();
            }
        }
    }

    // step 2: count possible requests per output
    for (int i = 0; i < 4; i++)
    {
        for (int dir = 0; dir < 4; dir++)
        {
            if (possible[i, dir])
            {
                possible_count[dir]++;
            }
        }
    }

    // step 3: if more than one possible for a given request, pick one with least
    // requests.
    // NOTE: the original comment said ties are broken randomly, but the strict '<'
    // below means a tie goes to the lowest-numbered direction.
    for (int i = 0; i < 4; i++)
    {
        int min_req = 10, min_req_j = -1;
        for (int j = 0; j < 4; j++)
        {
            if (possible[i, j])
            {
                if (possible_count[j] < min_req)
                {
                    min_req_j = j;
                    min_req = possible_count[j];
                }
            }
        }

        // keep only the chosen direction (if any) for this input
        for (int j = 0; j < 4; j++)
        {
            possible[i, j] = false;
        }
        if (min_req_j != -1)
        {
            possible[i, min_req_j] = true;
        }
    }

    // step 4,5: compute maximum priority requesting each output; set everyone
    // below this prio to false
    // (priority here is the packet's scarab_retransmit_count)
    for (int dir = 0; dir < 4; dir++)
    {
        int max_prio = -1;
        for (int i = 0; i < 4; i++)
        {
            if (possible[i, dir])
            {
                if (input[i].packet.scarab_retransmit_count > max_prio)
                {
                    max_prio = input[i].packet.scarab_retransmit_count;
                }
            }
        }

        for (int i = 0; i < 4; i++)
        {
            if (possible[i, dir] && input[i].packet.scarab_retransmit_count < max_prio)
            {
                possible[i, dir] = false;
            }
        }
    }

    // step 6: select a winner in round-robin fashion
    // (inputs are scanned starting at the offset returned by getRR(); the first
    // input to claim an output removes it from everyone else's candidates)
    int offset = getRR();
    int[] assignments = new int[4];
    for (int i = 0; i < 4; i++)
    {
        assignments[i] = -1;
    }
    for (int i_ = 0; i_ < 4; i_++)
    {
        int i = (i_ + offset) % 4;

        for (int dir = 0; dir < 4; dir++)
        {
            if (possible[i, dir])
            {
                assignments[i] = dir;
                for (int j = 0; j < 4; j++)
                {
                    possible[j, dir] = false;
                }
            }
        }
    }

    //Flit oppBufferable = null;

    // assign outputs, choose a flit to opp. buffer if appropriate
    for (int i = 0; i < 4 && input[i] != null; i++)
    {
        int dir = assignments[i];
        if (dir == -1)
        {
            // drop! NACK the head flit and mark the worm so that its remaining
            // flits are discarded as they arrive
            sendNack(input[i]);
            wormRouting[input[i].inDir] = -2;
            Simulator.stats.drop.Add();
            Simulator.stats.drop_by_src[ID].Add();
        }
        else
        {
            // running average of the priority seen on this output
            // (the new sample is weighted by `decay`, the old average by 1 - decay)
            double decay = 0.875; //TODO parameterize
            avgOutPriority[dir] = avgOutPriority[dir] * (1 - decay) + input[i].packet.scarab_retransmit_count * decay;

            /*
             * if (Config.opp_buffering
             *     && !ejectedThisCycle
             *     && input[i].packet.nrOfFlits == 1
             *     && myProcessor.msh.hasOppBufferSpace()
             *     && input[i].packet.scarab_retransmit_count < avgOutPriority[dir]
             *     )
             * {
             *     // buffer opportunistically! (choose highest priority packet)
             *     if (oppBufferable == null || input[i].packet.scarab_retransmit_count > oppBufferable.packet.scarab_retransmit_count)
             *         oppBufferable = input[i];
             * }
             */
        }
    }

    // forward the winners: allocate a NACK wire on the chosen output, send the
    // head flit, and remember the output so the rest of the worm follows it
    for (int i = 0; i < 4 && input[i] != null; i++)
    {
        int dir = assignments[i];
        if (dir == -1)
        {
            continue;
        }

        int nackWire;
        ulong due = Simulator.CurrentRound + 4 * (1 + Simulator.distance(coord, input[i].dest));
        //nack_due[input[i].packet] = due;

        /*
         * if (input[i] == oppBufferable)
         * {
         *     Console.WriteLine("Opp Buffering flit!");
         *     sendTeardown(oppBufferable);
         *     myProcessor.ejectFlit(oppBufferable);
         *
         *     nackWire = allocateNack(dir, -2, due);
         * }
         * else
         */
        nackWire = allocateNack(dir, nackNr(input[i].inDir, input[i].nackWire), due);
        // step 1 only admitted outputs for which nackAvailable() was true,
        // so allocation is expected to succeed here
        if (nackWire == -1)
        {
            throw new Exception("shouldn't happen");
        }

        input[i].nackWire = nackWire;
        linkOut[dir].In = input[i];
        wormRouting[input[i].inDir] = dir;
    }

    // now try to inject
    // (only a head flit can still be in the slot here; body/tail flits were
    // forwarded above)
    if (m_injectSlot != null)
    {
        PreferredDirection pd = determineDirection(m_injectSlot.dest);
        ulong due = Simulator.CurrentRound + 4 * (1 + Simulator.distance(coord, m_injectSlot.dest));
        //nack_due[m_injectSlot.packet] = due;

        if (pd.xDir != Simulator.DIR_NONE && linkOut[pd.xDir].In == null)
        {
            int nackWire = allocateNack(pd.xDir, -2, due);
            if (nackWire != -1)
            {
                linkOut[pd.xDir].In = m_injectSlot;
                m_injectSlot.nackWire = nackWire;
                m_injectSlot = null;
                wormRouting[4] = pd.xDir;
            }
        }
        if (m_injectSlot != null && // check this again: only try y if x didn't work
            pd.yDir != Simulator.DIR_NONE && linkOut[pd.yDir].In == null)
        {
            int nackWire = allocateNack(pd.yDir, -2, due);
            if (nackWire != -1)
            {
                linkOut[pd.yDir].In = m_injectSlot;
                m_injectSlot.nackWire = nackWire;
                m_injectSlot = null;
                wormRouting[4] = pd.yDir;
            }
        }
    }
}
// Wraps `req` and its completion callback in a MemoryRequest and passes it
// to ReceivePacket().
public void access(Request req, Simulator.Ready cb)
{
    ReceivePacket(new MemoryRequest(req, cb));
}