Only in ptlsim-stable: .depend
Only in ptlsim-asf/: .hgtags
Only in ptlsim-asf/: LICENSE
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/Makefile ptlsim-asf/Makefile
--- ptlsim-stable/Makefile	2010-03-02 13:20:35.936788000 +0100
+++ ptlsim-asf/Makefile	2009-12-22 14:57:33.985339000 +0100
@@ -32,7 +32,7 @@
 #
 PTLSIM_HYPERVISOR=1
 
-CC = g++
+CC = g++-4.2
 
 GCCVER_SPECIFIC =
 
@@ -68,7 +68,7 @@
 CFLAGS += -fpic -mno-red-zone
 endif
 
-CFLAGS += -fno-trapping-math -fno-stack-protector -fno-exceptions -fno-rtti -funroll-loops -mpreferred-stack-boundary=4 -fno-strict-aliasing -fno-stack-protector -Wreturn-type $(GCCVER_SPECIFIC)
+CFLAGS += -fno-trapping-math -fno-stack-protector -fno-exceptions -fno-rtti -funroll-loops -mpreferred-stack-boundary=4 -fno-strict-aliasing -Wreturn-type $(GCCVER_SPECIFIC)
 
 
 BASEOBJS = superstl.o config.o mathlib.o syscalls.o
@@ -76,28 +76,28 @@
 
 ifdef __x86_64__
 ifdef PTLSIM_HYPERVISOR
-COMMONOBJS = linkstart.o lowlevel-64bit-xen.o ptlsim.o ptlxen.o ptlxen-memory.o ptlxen-events.o ptlxen-common.o perfctrs.o mm.o superstl.o config.o mathlib.o klibc.o ptlhwdef.o datastore.o decode-core.o decode-fast.o decode-complex.o decode-x87.o decode-sse.o uopimpl.o seqcore.o ptlsim.dst.o linkend.o
+COMMONOBJS = linkstart.o lowlevel-64bit-xen.o ptlsim.o ptlxen.o ptlxen-memory.o ptlxen-events.o ptlxen-common.o perfctrs.o mm.o superstl.o config.o mathlib.o klibc.o ptlhwdef.o datastore.o decode-core.o decode-fast.o decode-complex.o decode-x87.o decode-sse.o uopimpl.o seqcore.o ptlsim.dst.o linkend.o decode-asf.o
 else
-COMMONOBJS = linkstart.o lowlevel-64bit.o ptlsim.o kernel.o mm.o ptlhwdef.o decode-core.o decode-fast.o decode-complex.o decode-x87.o decode-sse.o uopimpl.o datastore.o injectcode-64bit.o seqcore.o $(BASEOBJS) klibc.o ptlsim.dst.o linkend.o
+COMMONOBJS = linkstart.o lowlevel-64bit.o ptlsim.o kernel.o mm.o ptlhwdef.o decode-core.o decode-fast.o decode-complex.o decode-x87.o decode-sse.o uopimpl.o datastore.o injectcode-64bit.o seqcore.o $(BASEOBJS) klibc.o ptlsim.dst.o linkend.o decode-asf.o
 endif
 else
 # 32-bit PTLsim32 only:
-COMMONOBJS = linkstart.o lowlevel-32bit.o ptlsim.o kernel.o mm.o ptlhwdef.o decode-core.o decode-fast.o decode-complex.o decode-x87.o decode-sse.o uopimpl.o seqcore.o datastore.o injectcode-32bit.o $(BASEOBJS) klibc.o ptlsim.dst.o linkend.o
+COMMONOBJS = linkstart.o lowlevel-32bit.o ptlsim.o kernel.o mm.o ptlhwdef.o decode-core.o decode-fast.o decode-complex.o decode-x87.o decode-sse.o uopimpl.o seqcore.o datastore.o injectcode-32bit.o $(BASEOBJS) klibc.o ptlsim.dst.o linkend.o decode-asf.o
 endif
 
-OOOOBJS = branchpred.o dcache.o ooocore.o ooopipe.o oooexec.o 
+OOOOBJS = branchpred.o dcache.o ooocore.o ooopipe.o oooexec.o asf.o
 OBJFILES = $(COMMONOBJS) $(OOOOBJS)
 
-COMMONINCLUDES = logic.h ptlhwdef.h decode.h seqexec.h dcache.h dcache-amd-k8.h config.h ptlsim.h datastore.h superstl.h globals.h kernel.h mm.h ptlcalls.h loader.h mathlib.h klibc.h syscalls.h ptlxen.h stats.h xen-types.h
+COMMONINCLUDES = logic.h ptlhwdef.h decode.h seqexec.h dcache.h dcache-amd-k8.h config.h ptlsim.h datastore.h superstl.h globals.h kernel.h mm.h ptlcalls.h loader.h mathlib.h klibc.h syscalls.h ptlxen.h stats.h xen-types.h trace_event.h
 OOOINCLUDES = branchpred.h ooocore.h ooocore-amd-k8.h
 INCLUDEFILES = $(COMMONINCLUDES) $(OOOINCLUDES)
 
-COMMONCPPFILES = ptlsim.cpp kernel.cpp mm.cpp superstl.cpp ptlhwdef.cpp decode-core.cpp decode-fast.cpp decode-complex.cpp decode-x87.cpp decode-sse.cpp lowlevel-64bit.S lowlevel-32bit.S linkstart.S linkend.S uopimpl.cpp dcache.cpp config.cpp datastore.cpp injectcode.cpp ptlcalls.c cpuid.cpp ptlstats.cpp klibc.cpp glibc.cpp mathlib.cpp syscalls.cpp makeusage.cpp
+COMMONCPPFILES = ptlsim.cpp kernel.cpp mm.cpp superstl.cpp ptlhwdef.cpp decode-core.cpp decode-fast.cpp decode-complex.cpp decode-x87.cpp decode-sse.cpp lowlevel-64bit.S lowlevel-32bit.S linkstart.S linkend.S uopimpl.cpp dcache.cpp config.cpp datastore.cpp injectcode.cpp ptlcalls.c cpuid.cpp ptlstats.cpp klibc.cpp glibc.cpp mathlib.cpp syscalls.cpp makeusage.cpp decode-asf.cpp
 
 ifdef PTLSIM_HYPERVISOR
 COMMONCPPFILES += lowlevel-64bit-xen.S ptlxen.cpp ptlxen-memory.cpp ptlxen-events.cpp ptlxen-common.cpp perfctrs.cpp ptlmon.cpp ptlctl.cpp
 endif
-OOOCPPFILES = ooocore.cpp ooopipe.cpp oooexec.cpp seqcore.cpp branchpred.cpp
+OOOCPPFILES = ooocore.cpp ooopipe.cpp oooexec.cpp seqcore.cpp branchpred.cpp asf.cpp
 
 CPPFILES = $(COMMONCPPFILES) $(OOOCPPFILES)
 
@@ -179,14 +179,17 @@
 ifdef __x86_64__
 ifdef PTLSIM_HYPERVISOR
 ptlsim: ptlmon.o ptlxen.bin.o usage.o $(BASEOBJS) $(STDOBJS) ptlxen-common.o ptlhwdef.o ptlmon.lds Makefile
-	$(CC) $(CFLAGS) ptlmon.o ptlxen.bin.o usage.o $(BASEOBJS) $(STDOBJS) ptlxen-common.o ptlhwdef.o -lxenctrl -lxenguest -lxenstore -lstdc++ -lpthread -Wl,-T,ptlmon.lds -static -o ptlsim
+	$(CC) $(CFLAGS) ptlmon.o ptlxen.bin.o usage.o $(BASEOBJS) $(STDOBJS) ptlxen-common.o ptlhwdef.o -lxenctrl -lxenguest -lxenstore -lstdc++ -lpthread -Wl,-T,ptlmon.lds -static -o ptlsim.xen
+	ln -sf ptlsim.xen ptlsim 
 else
 ptlsim: $(OBJFILES) Makefile ptlsim.lds
-	ld -g -O2 $(OBJFILES) -o ptlsim $(LIBPERFCTR) -static --allow-multiple-definition -T ptlsim.lds -e ptlsim_preinit_entry `gcc -print-libgcc-file-name`
+	ld -g -O2 $(OBJFILES) -o ptlsim.usr $(LIBPERFCTR) -static --allow-multiple-definition -T ptlsim.lds -e ptlsim_preinit_entry `gcc -print-libgcc-file-name`
+	ln -sf ptlsim.usr ptlsim
 endif # PTLSIM_HYPERVISOR
 else
 ptlsim: $(OBJFILES) Makefile ptlsim32.lds
-	ld -g -O2 $(OBJFILES) -o ptlsim $(LIBPERFCTR) -static --allow-multiple-definition -T ptlsim32.lds -e ptlsim_preinit_entry `gcc -print-libgcc-file-name`
+	ld -g -O2 $(OBJFILES) -o ptlsim.usr $(LIBPERFCTR) -static --allow-multiple-definition -T ptlsim32.lds -e ptlsim_preinit_entry `gcc -print-libgcc-file-name`
+	ln -sf ptlsim.usr ptlsim
 endif
 
 ptlctl: ptlctl.o $(BASEOBJS) $(STDOBJS)
Only in ptlsim-asf/: RELEASE_NOTES
Only in ptlsim-asf/: asf-highlevel.h
Only in ptlsim-asf/: asf-opcodes.h
Only in ptlsim-asf/: asf.cpp
Only in ptlsim-asf/: asf.h
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/config.h ptlsim-asf/config.h
--- ptlsim-stable/config.h	2009-10-30 19:40:24.955182000 +0100
+++ ptlsim-asf/config.h	2010-03-02 12:00:55.041050000 +0100
@@ -12,6 +12,12 @@
 #include <superstl.h>
 #include <stdarg.h>
 
+//
+// Enable ASF support, for cores where supported
+//
+#define ENABLE_ASF
+//#define ENABLE_ASF_CACHE_BASED
+
 static const W64 infinity = limits<W64s>::max;
 
 struct ConfigurationOption {
Only in ptlsim-stable: cpuid
Only in ptlsim-asf/: dcache-amd-barcelona-asf.h
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/dcache-amd-k8.h ptlsim-asf/dcache-amd-k8.h
--- ptlsim-stable/dcache-amd-k8.h	2010-03-02 13:20:36.009716000 +0100
+++ ptlsim-asf/dcache-amd-k8.h	2010-03-02 12:00:55.104987000 +0100
@@ -6,9 +6,6 @@
 // Copyright 2007 Matt T. Yourst <yourst@yourst.com>
 //
 
-#ifndef _DCACHE_H_
-#define _DCACHE_H_
-
 #include <ptlsim.h>
 //#include <datastore.h>
 
@@ -473,7 +470,7 @@
 
     inline LoadFillReq() { }
   
-    LoadFillReq(W64 addr, W64 data, byte mask, LoadStoreInfo lsi);
+    LoadFillReq(W64 addr, W64 virtaddr, W64 data, byte mask, LoadStoreInfo lsi);
     ostream& print(ostream& os) const;
   };
 
@@ -623,12 +620,18 @@
     bool covered_by_sfr(W64 addr, SFR* sfr, int sizeshift);
     void annul_lfrq_slot(int lfrqslot);
     int issueload_slowpath(Waddr physaddr, SFR& sfra, LoadStoreInfo lsi, bool& L2hit);
+    int issueload_slowpath(Waddr physaddr, Waddr virtaddr, SFR& sfra, LoadStoreInfo lsi, bool& L2hit);
 
     int issueload_slowpath(Waddr physaddr, SFR& sfra, LoadStoreInfo lsi) {
       bool L2hit = 0;
       return issueload_slowpath(physaddr, sfra, lsi, L2hit);
     }
 
+    int issueload_slowpath(Waddr physaddr, Waddr virtaddr, SFR& sfra, LoadStoreInfo lsi) {
+      bool L2hit = 0;
+      return issueload_slowpath(physaddr, virtaddr, sfra, lsi, L2hit);
+    }
+
     int get_lfrq_mb(int lfrqslot) const;
     int get_lfrq_mb_state(int lfrqslot) const;
     bool lfrq_or_missbuf_full() const { return lfrq.full() | missbuf.full(); }
@@ -766,5 +769,3 @@
   PerContextDataCacheStats vcpu30;
   PerContextDataCacheStats vcpu31;
 };
-
-#endif // _DCACHE_H_
Only in ptlsim-stable: dcache-amd-k8.h.orig
Only in ptlsim-asf/: dcache-generic.h
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/dcache.cpp ptlsim-asf/dcache.cpp
--- ptlsim-stable/dcache.cpp	2009-10-30 19:40:24.964179000 +0100
+++ ptlsim-asf/dcache.cpp	2010-03-02 12:00:55.127964000 +0100
@@ -3,6 +3,8 @@
 // L1 and L2 Data Caches
 //
 // Copyright 2005-2008 Matt T. Yourst <yourst@yourst.com>
+// Copyright (c) 2007-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <dcache.h>
@@ -62,6 +64,7 @@
       if (logable(6)) logfile << "[vcpu ", threadid, "] reset lfrq slot ", i, ": ", req, endl;
       waiting[i] = 0;
       ready[i] = 0;
+      retry[i] = 0;
       freemap[i] = 1;
       count--;
       assert(count >= 0);
@@ -76,10 +79,14 @@
   LoadFillReq& req = reqs[lfrqslot];
   if (logable(6)) logfile << "  Annul LFRQ slot ", lfrqslot, endl;
   stats.dcache.lfrq.annuls++;
+#ifdef ENABLE_ASF_CACHE_BASED
+  annul_asf_spec_lfr(req.mbidx, lfrqslot);
+#endif
   hierarchy.missbuf.annul_lfrq(lfrqslot);
   reqs[lfrqslot].mbidx = -1;
   assert(!freemap[lfrqslot]);
   changestate(lfrqslot, ready, freemap);
+  retry[lfrqslot] = 0;
   count--;
   assert(count >= 0);
 }
@@ -120,9 +127,10 @@
 // miss buffer can be freed.
 // 
 template <int size>
-void LoadFillReqQueue<size>::wakeup(W64 address, const bitvec<LFRQ_SIZE>& lfrqmask) {
+void LoadFillReqQueue<size>::wakeup(W64 address, const bitvec<LFRQ_SIZE>& lfrqmask, bool need_retry) {
   if (logable(6)) logfile << "LFRQ.wakeup(", (void*)(Waddr)address, ", ", lfrqmask, ")", endl;
   //assert(L2.probe(address));
+  if unlikely(need_retry) retry |= lfrqmask;
   waiting &= ~lfrqmask;
   ready |= lfrqmask;
 }
@@ -150,10 +158,12 @@
   foreach (i, MAX_WAKEUPS_PER_CYCLE) {
     if unlikely (!ready) break;
 
-    int idx = ready.lsb();
+    int idx = ready.lsb(); // SD NOTE: This can cause deadlocks, thanks to loads with high entries not being woken up!
     LoadFillReq& req = reqs[idx];
     
-    if (logable(6)) logfile << "[vcpu ", req.lsi.threadid, "] at cycle ", sim_cycle, ": wakeup LFRQ slot ", idx, ": ", req, endl;
+    if (logable(6)) logfile << "[vcpu ", req.lsi.threadid, "] at cycle ",
+      sim_cycle, ": ", retry[idx] ? "retry" : "wakeup", " LFRQ slot ", idx, ": ",
+       req, endl;
 
     W64 delta = LO32(sim_cycle) - LO32(req.initcycle);
     if unlikely (delta >= 65536) {
@@ -165,10 +175,11 @@
         
     stats.dcache.lfrq.wakeups++;
     wakeupcount++;
-    if likely (hierarchy.callback) hierarchy.callback->dcache_wakeup(req.lsi, req.addr);
+    if likely (hierarchy.callback) hierarchy.callback->dcache_wakeup(req.lsi, req.addr, retry[idx]);
 
     assert(!freemap[idx]);
     changestate(idx, ready, freemap);
+    retry[idx] = 0;
     count--;
     assert(count >= 0);
   }
@@ -176,8 +187,9 @@
   stats.dcache.lfrq.width[wakeupcount]++;
 }
 
-LoadFillReq::LoadFillReq(W64 addr, W64 data, byte mask, LoadStoreInfo lsi) {
+LoadFillReq::LoadFillReq(W64 addr, W64 virtaddr, W64 data, byte mask, LoadStoreInfo lsi) {
   this->addr = addr;
+  this->virtaddr = virtaddr;
   this->data = data;
   this->mask = mask;
   this->lsi = lsi;
@@ -208,6 +220,49 @@
   return os;
 }
 
+#ifdef ENABLE_ASF_CACHE_BASED
+/**
+ * Patches up the condensed asf_spec-ness of this MissBufferEntry, whenever
+ * a load that was receiving data through this MBE is annulled.
+ * This is complicated, because normal and asf-spec loads may use the same
+ * MBE to handle the cache miss. Annulling one of them then necessitates the
+ * complicated reconstruction logic in here to ensure that the asf_spec flag
+ * is disabled, once all asf_spec loads referencing this MBE are annulled.
+ *
+ * @param mbidx Index of the MissBufferEntry that is currently looked at.
+ * @param lfrqslot Index of the LoadFillReq slot of the annulled load.
+ */
+template <int size>
+void LoadFillReqQueue<size>::annul_asf_spec_lfr(int mbidx, int lfrqslot) {
+  assert(reqs[lfrqslot].mbidx == mbidx);
+  //assert(!hierarchy.missbuf.freemap[mbidx]);
+  /* NOTE: This can happen as the wake-up of the LFR and the MBE occur asynchro-
+     nously with each other. The entry is then already present in the cache. */
+  if unlikely (hierarchy.missbuf.freemap[mbidx]) return;
+
+  MissBuffer<MISSBUF_COUNT>::Entry &mb = hierarchy.missbuf.missbufs[mbidx];
+  if likely (!mb.asf_spec) return;
+
+  if (logable(5))
+    logfile << __FILE__, __LINE__, "Annul spec lfr @ MB: ", mbidx, " (addr: ",
+      (void*)mb.virtaddr, "/",(void*)mb.addr, ") LFR: ", lfrqslot, endl;
+
+  // Traverse all LFRs in the map and reconstruct asf_spec bit.
+  bool rebuild_asf_spec = false;
+  int  other_slot       = -1;
+  while (true) {
+    other_slot = mb.lfrqmap.nextlsb(other_slot);
+    if (other_slot == -1) break;
+    if (other_slot == lfrqslot) continue;
+    rebuild_asf_spec |= reqs[other_slot].lsi.asf_spec;
+
+    if (logable(6))
+      logfile << __FILE__, __LINE__, "LFR: ", other_slot, "ASF-spec: ",
+        reqs[other_slot].lsi.asf_spec, " rebuild_asf_spec: ", rebuild_asf_spec, endl;
+  }
+  mb.asf_spec = rebuild_asf_spec;
+}
+#endif
 //
 // Miss Buffer
 //
@@ -274,7 +329,7 @@
 // caches and needs service from below.
 //
 template <int SIZE>
-int MissBuffer<SIZE>::initiate_miss(W64 addr, bool hit_in_L2, bool icache, int rob, int threadid) {
+int MissBuffer<SIZE>::initiate_miss(W64 addr, W64 virtaddr, bool hit_in_L2, bool icache, int rob, int threadid) {
   bool DEBUG = logable(6);
 
   addr = floor(addr, L1_LINE_SIZE);
@@ -307,6 +362,7 @@
   stats.dcache.missbuf.inserts++;
   Entry& mb = missbufs[idx];
   mb.addr = addr;
+  mb.virtaddr = virtaddr;
   mb.lfrqmap = 0;
   mb.icache = icache;
   mb.dcache = (!icache);
@@ -335,11 +392,18 @@
 
   if (DEBUG) logfile << "[vcpu ", mb.threadid, "] mb", idx, ": enter state deliver to L3 on ", (void*)(Waddr)addr, " (iter ", iterations, ")", endl;
   mb.state = STATE_DELIVER_TO_L3;
-  mb.cycles = MAIN_MEM_LATENCY;
 #else
   // L3 cache disabled
   if (DEBUG) logfile << "[vcpu ", mb.threadid, "] mb", idx, ": enter state deliver to L2 on ", (void*)(Waddr)addr, " (iter ", iterations, ")", endl;
   mb.state = STATE_DELIVER_TO_L2;
+#endif
+
+#ifdef POOR_MANS_MESI
+  if (hierarchy.probe_other_caches(addr, virtaddr, false))
+    mb.cycles = CROSS_CACHE_LATENCY;
+  else
+    mb.cycles = MAIN_MEM_LATENCY;
+#else
   mb.cycles = MAIN_MEM_LATENCY;
 #endif
   if unlikely (icache) per_context_dcache_stats_update(mb.threadid, fetch.hit.mem++); else per_context_dcache_stats_update(mb.threadid, load.hit.mem++);
@@ -355,7 +419,7 @@
 
   if unlikely (lfrqslot < 0) return -1;
   
-  int mbidx = initiate_miss(req.addr, hit_in_L2, 0, rob, req.lsi.threadid);
+  int mbidx = initiate_miss(req.addr, req.virtaddr, hit_in_L2, 0, rob, req.lsi.threadid);
   if unlikely (mbidx < 0) {
     hierarchy.lfrq.free(lfrqslot);
     return -1;
@@ -365,7 +429,11 @@
   missbuf.lfrqmap[lfrqslot] = 1;
   hierarchy.lfrq[lfrqslot].mbidx = mbidx;
   // missbuf.threadid = req.lsi.threadid;
-
+#ifdef ENABLE_ASF_CACHE_BASED
+  // For multiple loads the ASF speculative bit is dominant, as the speculative
+  // region has to be aborted always if there is a probe hit
+  missbuf.asf_spec |= req.lsi.asf_spec;
+#endif
   return lfrqslot;
 }
 
@@ -415,7 +483,11 @@
           if (DEBUG) logfile << "[vcpu ", mb.threadid, "] mb", i, ": delivered ", (void*)(Waddr)mb.addr, " to L1 dcache (map ", mb.lfrqmap, ")", endl;
           // If the L2 line size is bigger than the L1 line size, this will validate multiple lines in the L1 when an L2 line arrives:
           // foreach (i, L2_LINE_SIZE / L1_LINE_SIZE) L1.validate(mb.addr + i*L1_LINE_SIZE, bitvec<L1_LINE_SIZE>().setall());
-          hierarchy.L1.validate(mb.addr, bitvec<L1_LINE_SIZE>().setall());
+#ifdef ENABLE_ASF_CACHE_BASED
+          hierarchy.L1.validate(mb.addr, mb.virtaddr, bitvec<L1_LINE_SIZE>().setall(), mb.asf_spec);
+#else
+          hierarchy.L1.validate(mb.addr, mb.virtaddr, bitvec<L1_LINE_SIZE>().setall());
+#endif
           stats.dcache.missbuf.deliver.L2_to_L1D++;
           hierarchy.lfrq.wakeup(mb.addr, mb.lfrqmap);
         }
@@ -431,7 +503,9 @@
           lsi.threadid = mb.threadid;
           if likely (hierarchy.callback) hierarchy.callback->icache_wakeup(lsi, mb.addr);
         }
-
+        /* NOTE: The LFRs may still hold a reference to this MB-entry at this
+           point as they wake up asynchronously *and* clear their mbidx
+           lazily! :-/ */
         assert(!freemap[i]);
         freemap[i] = 1;
         mb.reset();
@@ -440,6 +514,22 @@
       }
       break;
     }
+    case STATE_INVALIDATED: {
+      // This entry has been hit by an external invalidating probe. Notify the
+      // consumers to retry and free the entry
+      // Other options:
+      // TODO: Option 1: Reprobe directly in here -> notify ASF from here, too
+      // TODO: Option 2: Send tainted data to the core -> let it retry the load
+      hierarchy.lfrq.wakeup(mb.addr, mb.lfrqmap, true);
+
+      // TODO: Stats
+      assert(!freemap[i]);
+      freemap[i] = 1;
+      mb.reset();
+      count--;
+      assert(count >= 0);
+      break;
+    }
     }
   }
 }
@@ -452,6 +542,31 @@
   }
 }
 
+/**
+ * Notify the miss-buffer of external probes to cache-line sized objects.
+ * @param physaddr Physical address of the probed cache-line, properly aligned!
+ * @param inv Invalidating probe?
+ */
+template <int SIZE>
+void MissBuffer<SIZE>::external_probe(W64 physaddr, bool inv) {
+  // NonInv probes do not affect the MissBuffer that just contains loads
+  if (!inv) return;
+
+  physaddr = floor(physaddr, L1_LINE_SIZE);
+
+  int idx = find(physaddr);
+
+  // No matching in flight request.
+  if (idx == -1) return;
+
+  Entry& mbe = missbufs[idx];
+  mbe.state = STATE_INVALIDATED;
+
+  if (logable(5)) logfile << "mb", idx, ": hit by inv(", inv,
+    ") probe. ", endl;
+  // The mb.clock() function will pick up the state change and act accordingly
+}
+
 template <int SIZE>
 ostream& MissBuffer<SIZE>::print(ostream& os) const {
  
@@ -489,13 +604,37 @@
 #endif
   return os;
 }
-
+#ifdef ENABLE_ASF_CACHE_BASED
+template <int linesize>
+ostream& CacheLineWithValidMaskSpecRead<linesize>::print(ostream& os, W64 tag) const {
+#if 0
+  const byte* data = (const byte*)(W64)tag;
+  foreach (i, linesize/8) {
+    os << "    ", bytemaskstring(data + i*8, base_t::valid(i*8, 8).integer(), 8, 8), " ";
+    os << endl;
+  }
+  os << sr;
+#endif
+  return os;
+}
+template class CacheSubsystem::CacheLineWithValidMaskSpecRead<L1_LINE_SIZE>;
+#endif
+/**
+ * For virtually indexed caches, set the bits in the index differing
+ * between physical frame and virtual page number to zero, as we can't map
+ * some physical address back to any virtual one necessarily.
+ * HACKALERT: In a real CPU, we quite likely would get the data directly from
+ * L2, but this is fairly difficult to model in PTLsim.
+ */
 int CacheHierarchy::issueload_slowpath(Waddr physaddr, SFR& sfra, LoadStoreInfo lsi, bool& L2hit) {
+  return issueload_slowpath(physaddr, physaddr & (~PAGE_MASK), sfra, lsi, L2hit);
+}
+int CacheHierarchy::issueload_slowpath(Waddr physaddr, Waddr virtaddr, SFR& sfra, LoadStoreInfo lsi, bool& L2hit) {
   static const bool DEBUG = 0;
 
   starttimer(load_slowpath_timer);
 
-  L1CacheLine* L1line = L1.probe(physaddr);
+  L1CacheLine* L1line = L1.probe(physaddr, virtaddr);
 
   //
   // Loads and stores that also miss the L2 Stores that
@@ -560,12 +699,15 @@
   // Slap a lock on the L2 line it so it can't get evicted.
   // Once it's locked up, we can move it into the L1 later.
   //
+  // SD: I doubt that this is true. L1 and L2 lines are allocated when the data
+  // arrives. See mb.clock() and the call to validate!
+  //
   // If we did have a hit, but either the L1 or L2 lines
   // were still missing bytes, initiate prefetches to fill
   // them in.
   //
 
-  LoadFillReq req(physaddr, lsi.sfrused ? sfra.data : 0, lsi.sfrused ? sfra.bytemask : 0, lsi);
+  LoadFillReq req(physaddr, virtaddr, lsi.sfrused ? sfra.data : 0, lsi.sfrused ? sfra.bytemask : 0, lsi);
 
   int lfrqslot = missbuf.initiate_miss(req, L2hit, lsi.rob);
 
@@ -603,16 +745,26 @@
   return ((sframask & reqmask) == reqmask);
 }
 
-bool CacheHierarchy::probe_cache_and_sfr(W64 addr, const SFR* sfr, int sizeshift) {
+/**
+ * For virtually indexed caches, set the bits in the index differing
+ * between physical frame and virtual page number to zero, as we can't map
+ * some physical address back to any virtual one necessarily.
+ * HACKALERT: In a real CPU, we quite likely would get the data directly from
+ * L2, but this is fairly difficult to model in PTLsim.
+ */
+bool CacheHierarchy::probe_cache_and_sfr(W64 physaddr, const SFR* sfr, int sizeshift) {
+  return probe_cache_and_sfr(physaddr, physaddr & (~PAGE_MASK), sfr, sizeshift);
+}
+bool CacheHierarchy::probe_cache_and_sfr(W64 physaddr, W64 virtaddr, const SFR* sfr, int sizeshift) {
   bitvec<L1_LINE_SIZE> sframask, reqmask;
-  prep_sframask_and_reqmask(sfr, addr, sizeshift, sframask, reqmask);
+  prep_sframask_and_reqmask(sfr, physaddr, sizeshift, sframask, reqmask);
 
   //
   // Short circuit if the SFR covers the entire load: no need for cache probe
   //
   if unlikely ((sframask & reqmask) == reqmask) return true;
 
-  L1CacheLine* L1line = L1.probe(addr);
+  L1CacheLine* L1line = L1.probe(physaddr, virtaddr);
 
   if unlikely (!L1line) return false;
 
@@ -638,28 +790,41 @@
 //
 static const int PREFETCH_STOPS_AT_L2 = 0;
   
-void CacheHierarchy::initiate_prefetch(W64 addr, int cachelevel) {
+void CacheHierarchy::initiate_prefetch(W64 physaddr, W64 virtaddr, int cachelevel, bool invalidating) {
   static const bool DEBUG = 0;
 
-  addr = floor(addr, L1_LINE_SIZE);
-    
-  L1CacheLine* L1line = L1.probe(addr);
+  physaddr = floor(physaddr, L1_LINE_SIZE);
+  virtaddr = floor(virtaddr, L1_LINE_SIZE);
+  L1CacheLine* L1line = L1.probe(physaddr, virtaddr);
     
   if unlikely (L1line) {
     stats.dcache.prefetch.in_L1++;
+#ifdef POOR_MANS_MESI
+    if (invalidating) probe_other_caches(physaddr, virtaddr, true);
+#endif
     return;
   }
     
-  L2CacheLine* L2line = L2.probe(addr);
+  L2CacheLine* L2line = L2.probe(physaddr);
     
   if unlikely (L2line) {
     stats.dcache.prefetch.in_L2++;
-    if (PREFETCH_STOPS_AT_L2) return; // only move up to L2 level, and it's already there
+    if (PREFETCH_STOPS_AT_L2) {
+#ifdef POOR_MANS_MESI
+      if (invalidating) probe_other_caches(physaddr, virtaddr, true);
+#endif
+      return; // only move up to L2 level, and it's already there
+    }
   }
     
-  if (DEBUG) logfile << "Prefetch requested for ", (void*)(Waddr)addr, " to cache level ", cachelevel, endl;
+  if (DEBUG) logfile << "Prefetch requested for ", (void*)(Waddr)physaddr, " to cache level ", cachelevel, endl;
     
-  missbuf.initiate_miss(addr, L2line);
+  // NB: This might actually get the line from another cache, i.e. with fewer cycles than the full memory latency.
+  missbuf.initiate_miss(physaddr, virtaddr, L2line);
+  // NB(cont'd): hence we will just invalidate after initiating the miss!
+#ifdef POOR_MANS_MESI
+  if (invalidating) probe_other_caches(physaddr, (W64)virtaddr, true);
+#endif
   stats.dcache.prefetch.required++;
 }
 
@@ -677,7 +842,9 @@
 int CacheHierarchy::initiate_icache_miss(W64 addr, int rob, int threadid) {
   addr = floor(addr, L1I_LINE_SIZE);
   bool line_in_L2 = (L2.probe(addr) != null);
-  int mb = missbuf.initiate_miss(addr, L2.probe(addr), true, rob, threadid);
+  /* SD: ignore the virtual address, as L1I is not virtually indexed;
+     if it were, getting the virtual address in here would be a TODO! */
+  int mb = missbuf.initiate_miss(addr, 0, L2.probe(addr), true, rob, threadid);
     
   if (logable(6))
     logfile << "[vcpu ", threadid, "] Initiate icache miss on ", (void*)(Waddr)addr, " to missbuf ", mb, " (", (line_in_L2 ? "in L2" : "not in L2"), ")", endl;
@@ -690,7 +857,7 @@
 // any cache lines. The store must have already been checked
 // to have no exceptions.
 //
-W64 CacheHierarchy::commitstore(const SFR& sfr, int threadid, bool perform_actual_write) {
+W64 CacheHierarchy::commitstore(const SFR& sfr, W64 virtaddr, bool internal, int threadid, bool perform_actual_write) {
   if unlikely (sfr.invalid | (sfr.bytemask == 0)) return 0;
 
   static const bool DEBUG = 0;
@@ -699,18 +866,27 @@
 
   W64 addr = sfr.physaddr << 3;
 
+  // internal stores do not hit the caches
+  if unlikely (internal && perform_actual_write) {
+    storemask(addr, sfr.data, sfr.bytemask);
+    return 0;
+  }
+
   L2CacheLine* L2line = L2.select(addr);
 
   if likely (perform_actual_write) storemask(addr, sfr.data, sfr.bytemask);
+#ifdef POOR_MANS_MESI
+  probe_other_caches(addr, virtaddr, true);
+#endif
 
-  L1CacheLine* L1line = L1.select(addr);
+  L1CacheLine* L1line = L1.select(addr, virtaddr);
 
   L1line->valid |= ((W64)sfr.bytemask << lowbits(addr, 6));
   L2line->valid |= ((W64)sfr.bytemask << lowbits(addr, 6));
 
   if unlikely (!L1line->valid.allset()) {
     per_context_dcache_stats_update(threadid, store.prefetches++);
-    missbuf.initiate_miss(addr, L2line->valid.allset(), false, 0xffff, threadid);
+    missbuf.initiate_miss(addr, virtaddr, L2line->valid.allset(), false, 0xffff, threadid);
   }
 
   stoptimer(store_flush_timer);
@@ -723,8 +899,8 @@
 // so they can be immediately forwarded to loads, but do not actually
 // write to the cache itself.
 //
-W64 CacheHierarchy::speculative_store(const SFR& sfr, int threadid) {
-  return commitstore(sfr, threadid, false);
+W64 CacheHierarchy::speculative_store(const SFR& sfr, W64 virtaddr, int threadid) {
+  return commitstore(sfr, virtaddr, false, threadid, false);
 }
 
 void CacheHierarchy::clock() {
@@ -743,6 +919,58 @@
   missbuf.clock();
 }
 
+#ifdef POOR_MANS_MESI
+
+bool CacheHierarchy::external_probe(W64 addr, W64 virtaddr, bool inv) {
+
+  missbuf.external_probe(addr, inv);
+
+  if unlikely (inv) {
+    // Invalidations remove the entry from all cache levels
+    L1.invalidate(addr, virtaddr);
+    L1I.invalidate(addr);
+    L2.invalidate(addr);
+#ifdef ENABLE_L3_CACHE
+    L3.invalidate(addr);
+#endif
+    return true;
+  } else {
+    // NonInv probes just probe the outermost cache as those are inclusive
+    // in PTLsim!
+#ifdef ENABLE_L3_CACHE
+    return (L3.probe(addr) != null);
+#else
+    return (L2.probe(addr) != null);
+#endif
+  }
+}
+//
+// Probes all other cache hierarchies in the system and checks whether any cache
+// in them contains the line with the specified address. Invalidates the line,
+// if necessary.
+//
+bool CacheHierarchy::probe_other_caches(W64 addr, W64 virtaddr, bool inv) {
+  CacheHierarchy *other_hier;
+  int            other_id;
+  int  this_id   = (int) coreid;
+  bool crosshit  = false;
+
+  foreach (other_id, MAX_HIERARCHIES) {
+    //TODO: Add statistics!
+    if (other_id == this_id) continue;
+
+    other_hier = hierarchies[other_id];
+    if (!other_hier) continue;
+
+    if (logable(6)) logfile << "[vcpu ", this_id, "] Sending ", inv ? "" : "Non",
+      "Inv probe @ ", (void*) virtaddr, " / ", (void*) addr, " to core ", other_id, endl;
+    crosshit |= other_hier->external_probe(addr, virtaddr, inv);
+    if (!inv && crosshit) break;  //Short-cut read probes
+  }
+  return crosshit;
+}
+#endif
+
 void CacheHierarchy::complete() {
   lfrq.restart();
   missbuf.restart();
@@ -764,6 +992,10 @@
   L1I.reset();
   itlb.reset();
   dtlb.reset();
+#ifdef USE_L2_TLB
+  l2itlb.reset();
+  l2dtlb.reset();
+#endif
 }
 
 ostream& CacheHierarchy::print(ostream& os) {
@@ -778,7 +1011,7 @@
 //
 // Make sure the templates and vtables get instantiated:
 //
-void PerCoreCacheCallbacks::dcache_wakeup(LoadStoreInfo lsi, W64 physaddr) { }
+void PerCoreCacheCallbacks::dcache_wakeup(LoadStoreInfo lsi, W64 physaddr, bool retry) { }
 void PerCoreCacheCallbacks::icache_wakeup(LoadStoreInfo lsi, W64 physaddr) { }
 
 template struct LoadFillReqQueue<LFRQ_SIZE>;
@@ -802,3 +1035,4 @@
 }
 */
 
+CacheHierarchy* CacheHierarchy::hierarchies[MAX_HIERARCHIES] = {null};
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/dcache.h ptlsim-asf/dcache.h
--- ptlsim-stable/dcache.h	2010-03-02 13:20:36.023701000 +0100
+++ ptlsim-asf/dcache.h	2010-03-02 12:00:55.134957000 +0100
@@ -3,770 +3,43 @@
 // PTLsim: Cycle Accurate x86-64 Simulator
 // Data Cache
 //
-// Copyright 2003-2008 Matt T. Yourst <yourst@yourst.com>
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+// Copyright 2000-2008 Matt T. Yourst <yourst@yourst.com>
+// Copyright (c) 2008-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #ifndef _DCACHE_H_
 #define _DCACHE_H_
 
 #include <ptlsim.h>
-//#include <datastore.h>
 
-struct LoadStoreInfo {
-  W16 rob;
-  W8  threadid;
-  W8  sizeshift:2, aligntype:2, sfrused:1, internal:1, signext:1, pad1:1;
-  W32 pad32;
-  RawDataAccessors(LoadStoreInfo, W64);
-};
-
-#define per_context_dcache_stats_ref(vcpuid) (*(((PerContextDataCacheStats*)&stats.dcache.vcpu0) + (vcpuid)))
-#define per_context_dcache_stats_update(vcpuid, expr) stats.dcache.total.expr, per_context_dcache_stats_ref(vcpuid).expr
-
-namespace CacheSubsystem {
-  // How many load wakeups can be driven into the core each cycle:
-  const int MAX_WAKEUPS_PER_CYCLE = 2;
-
-#ifndef STATS_ONLY
-
-// non-debugging only:
-//#define __RELEASE__
-#ifdef __RELEASE__
-#undef assert
-#define assert(x) (x)
-#endif
-
-  //#define CACHE_ALWAYS_HITS
-  //#define L2_ALWAYS_HITS
-  
-  // 16 KB L1 at 2 cycles       // increase to 32 KB to match Core 2
-  const int L1_LINE_SIZE = 64;
-  const int L1_SET_COUNT = 64;
-  const int L1_WAY_COUNT = 4;
-  // #define ENFORCE_L1_DCACHE_BANK_CONFLICTS
-  const int L1_DCACHE_BANKS = 8; // 8 banks x 8 bytes/bank = 64 bytes/line
-
-  // 32 KB L1I
-  const int L1I_LINE_SIZE = 64;
-  const int L1I_SET_COUNT = 128;
-  const int L1I_WAY_COUNT = 4;
-
-  // 256 KB L2 at 6 cycles
-  const int L2_LINE_SIZE = 64;
-  const int L2_SET_COUNT = 256; // 256 KB
-  const int L2_WAY_COUNT = 16;
-  const int L2_LATENCY   = 5; // don't include the extra wakeup cycle (waiting->ready state transition) in the LFRQ
-
-#define ENABLE_L3_CACHE
-#ifdef ENABLE_L3_CACHE
-  // 4 MB L3 cache (2048 sets, 32 ways) with 64-byte lines, latency 16 cycles
-  const int L3_SET_COUNT = 2048;
-  const int L3_WAY_COUNT = 32;
-  const int L3_LINE_SIZE = 64;
-  const int L3_LATENCY   = 8; // Core 2 Duo 2.0 GHz has 14 cycle total L2 latency
-#endif
-  // Load Fill Request Queue (maximum number of missed loads)
-  // const int LFRQ_SIZE = 63;
-  const int LFRQ_SIZE = 64;
-  
-  // Allow up to 32 outstanding lines in the L2 awaiting service:
-  const int MISSBUF_COUNT = 64;
-  // const int MISSBUF_COUNT = 4;
-
-  // Main memory latency
-  const int MAIN_MEM_LATENCY = 140; // Core 2 Duo 2.4 GHz has 160 cycle total L2 latency
-
-  // TLBs
-#ifdef PTLSIM_HYPERVISOR
-#define USE_TLB
-#endif
-  const int ITLB_SIZE = 32;
-  const int DTLB_SIZE = 32;
-
-//#define ISSUE_LOAD_STORE_DEBUG
-//#define CHECK_LOADS_AND_STORES
-
-// Line Usage Statistics
-
-//#define TRACK_LINE_USAGE
-
-#ifdef TRACK_LINE_USAGE
-#define DCACHE_L1_LINE_LIFETIME_INTERVAL   1
-#define DCACHE_L1_LINE_DEADTIME_INTERVAL   1
-#define DCACHE_L1_LINE_HITCOUNT_INTERVAL   1
-#define DCACHE_L1_LINE_LIFETIME_SLOTS      8192
-#define DCACHE_L1_LINE_DEADTIME_SLOTS      8192
-#define DCACHE_L1_LINE_HITCOUNT_SLOTS      64
-
-#define DCACHE_L1I_LINE_LIFETIME_INTERVAL  16
-#define DCACHE_L1I_LINE_DEADTIME_INTERVAL  16
-#define DCACHE_L1I_LINE_HITCOUNT_INTERVAL  1
-#define DCACHE_L1I_LINE_LIFETIME_SLOTS     8192
-#define DCACHE_L1I_LINE_DEADTIME_SLOTS     8192
-#define DCACHE_L1I_LINE_HITCOUNT_SLOTS     1024
-
-#define DCACHE_L2_LINE_LIFETIME_INTERVAL   4
-#define DCACHE_L2_LINE_DEADTIME_INTERVAL   4
-#define DCACHE_L2_LINE_HITCOUNT_INTERVAL   1
-#define DCACHE_L2_LINE_LIFETIME_SLOTS      65536
-#define DCACHE_L2_LINE_DEADTIME_SLOTS      65536
-#define DCACHE_L2_LINE_HITCOUNT_SLOTS      256
-
-#define DCACHE_L3_LINE_LIFETIME_INTERVAL   64
-#define DCACHE_L3_LINE_DEADTIME_INTERVAL   64
-#define DCACHE_L3_LINE_HITCOUNT_INTERVAL   1
-#define DCACHE_L3_LINE_LIFETIME_SLOTS      16384
-#define DCACHE_L3_LINE_DEADTIME_SLOTS      16384
-#define DCACHE_L3_LINE_HITCOUNT_SLOTS      256
-#endif
-
-  //
-  // Cache Line Types
-  //
-  template <int linesize>
-  struct CacheLine {
-#ifdef TRACK_LINE_USAGE
-    W32 filltime;
-    W32 lasttime;
-    W32 hitcount;
+#ifdef CORE_GENERIC
+#include <dcache-generic.h>
 #else
-    byte dummy;
-#endif
-    void reset() { clearstats(); }
-    void invalidate() { reset(); }
-    void fill(W64 tag, const bitvec<linesize>& valid) { }
-
-    void clearstats() {
-#ifdef TRACK_LINE_USAGE
-      filltime = sim_cycle;
-      lasttime = sim_cycle;
-      hitcount = 0;
-#endif
-    }
-
-    ostream& print(ostream& os, W64 tag) const;
-  };
-
-  template <int linesize>
-  static inline ostream& operator <<(ostream& os, const CacheLine<linesize>& line) {
-    return line.print(os, 0);
-  }
-
-  template <int linesize>
-  struct CacheLineWithValidMask {
-    bitvec<linesize> valid;
-#ifdef TRACK_LINE_USAGE
-    W32 filltime;
-    W32 lasttime;
-    W32 hitcount;
-#endif
-
-    void clearstats() {
-#ifdef TRACK_LINE_USAGE
-      filltime = sim_cycle;
-      lasttime = sim_cycle;
-      hitcount = 0;
-#endif
-    }
-
-    void reset() { valid = 0; clearstats(); }
-    void invalidate() { reset(); }
-    void fill(W64 tag, const bitvec<linesize>& valid) { this->valid |= valid; }
-    ostream& print(ostream& os, W64 tag) const;
-  };
-
-  template <int linesize>
-  static inline ostream& operator <<(ostream& os, const CacheLineWithValidMask<linesize>& line) {
-    return line.print(os, 0);
-  }
-
-  typedef CacheLineWithValidMask<L1_LINE_SIZE> L1CacheLine;
-  typedef CacheLine<L1I_LINE_SIZE> L1ICacheLine;
-  typedef CacheLineWithValidMask<L2_LINE_SIZE> L2CacheLine;
-#ifdef ENABLE_L3_CACHE
-  typedef CacheLine<L3_LINE_SIZE> L3CacheLine;
-#endif
-
-  //
-  // L1 data cache
-  //
-#ifdef TRACK_LINE_USAGE
-  static const char* cache_names[4] = {"L1", "I1", "L2", "L3"};
-
-  template <int uniq, typename V, int LIFETIME_INTERVAL, int LIFETIME_SLOTS, int DEADTIME_INTERVAL, int DEADTIME_SLOTS, int HITCOUNT_INTERVAL, int HITCOUNT_SLOTS>
-  struct HistogramAssociativeArrayStatisticsCollector {
-    static W64 line_lifetime_histogram[LIFETIME_SLOTS];
-    static W64 line_deadtime_histogram[DEADTIME_SLOTS];
-    static W64 line_hitcount_histogram[HITCOUNT_SLOTS];
-
-    static const bool FORCE_DEBUG = 0;
-
-    HistogramAssociativeArrayStatisticsCollector() {
-      reset();
-    }
-
-    static void reset() {
-      setzero(line_lifetime_histogram);
-      setzero(line_deadtime_histogram);
-      setzero(line_hitcount_histogram);
-    }
-
-    static void evicted(const V& line, W64 tag) {
-      // Line has been evicted: update statistics
-      W64s lifetime = line.lasttime - line.filltime;
-      assert(lifetime >= 0);
-      int lifetimeslot = clipto(lifetime / LIFETIME_INTERVAL, 0, LIFETIME_SLOTS-1);
-      line_lifetime_histogram[lifetimeslot]++;
-
-      W64s deadtime = sim_cycle - line.lasttime;
-      int deadtimeslot = clipto(deadtime / DEADTIME_INTERVAL, 0, DEADTIME_SLOTS-1);
-      line_deadtime_histogram[deadtimeslot]++;
-
-      W64 hitcount = line.hitcount;
-      int hitcountslot = clipto(hitcount / HITCOUNT_INTERVAL, 0, HITCOUNT_SLOTS-1);
-      line_hitcount_histogram[hitcountslot]++;
-
-      if (logable(6) | FORCE_DEBUG) logfile << "[", cache_names[uniq], "] ", sim_cycle, ": evicted(", (void*)tag, "): lifetime ", lifetime, ", deadtime ", deadtime, ", hitcount ", hitcount, " (line addr ", &line, ")", endl;
-    }
-
-    static void filled(V& line, W64 tag) {
-      line.filltime = sim_cycle;
-      line.lasttime = sim_cycle;
-      line.hitcount = 1;
-
-      if (logable(6) | FORCE_DEBUG) logfile << "[", cache_names[uniq], "] ", sim_cycle, ": filled(", (void*)tag, ")", " (line addr ", &line, ")", endl;
-    }
-
-    static void inserted(V& line, W64 newtag, int way) {
-      filled(line, newtag);
-    }
-
-    static void replaced(V& line, W64 oldtag, W64 newtag, int way) {
-      evicted(line, oldtag);
-      filled(line, newtag);
-    }
-
-    static void probed(V& line, W64 tag, int way, bool hit) { 
-      if (logable(6) | FORCE_DEBUG) logfile << "[", cache_names[uniq], "] ", sim_cycle, ": probe(", (void*)tag, "): ", (hit ? "HIT" : "miss"), " way ", way, ": hitcount ", line.hitcount, ", filltime ", line.filltime, ", lasttime ", line.lasttime, " (line addr ", &line, ")", endl;
-      if (hit) {
-        line.hitcount++;
-        line.lasttime = sim_cycle;
-      }
-    }
-
-    static void overflow(W64 tag) { }
-
-    static void locked(V& slot, W64 tag, int way) { }
-    static void unlocked(V& slot, W64 tag, int way) { }
-
-    static void invalidated(V& line, W64 oldtag, int way) { evicted(line, oldtag); }
-
-    static void savestats(DataStoreNode& ds) {
-      ds.add("lifetime", (W64s*)line_lifetime_histogram, LIFETIME_SLOTS, 0, ((LIFETIME_SLOTS-1) * LIFETIME_INTERVAL), LIFETIME_INTERVAL);
-      ds.add("deadtime", (W64s*)line_deadtime_histogram, DEADTIME_SLOTS, 0, ((DEADTIME_SLOTS-1) * DEADTIME_INTERVAL), DEADTIME_INTERVAL);
-      ds.add("hitcount", (W64s*)line_hitcount_histogram, HITCOUNT_SLOTS, 0, ((HITCOUNT_SLOTS-1) * HITCOUNT_INTERVAL), HITCOUNT_INTERVAL);
-    }
-  };
-
-  typedef HistogramAssociativeArrayStatisticsCollector<0, L1CacheLine,
-    DCACHE_L1_LINE_LIFETIME_INTERVAL, DCACHE_L1_LINE_LIFETIME_SLOTS, 
-    DCACHE_L1_LINE_DEADTIME_INTERVAL, DCACHE_L1_LINE_DEADTIME_SLOTS, 
-    DCACHE_L1_LINE_HITCOUNT_INTERVAL, DCACHE_L1_LINE_HITCOUNT_SLOTS> L1StatsCollectorBase;
-
-  typedef HistogramAssociativeArrayStatisticsCollector<1, L1ICacheLine,
-    DCACHE_L1I_LINE_LIFETIME_INTERVAL, DCACHE_L1I_LINE_LIFETIME_SLOTS, 
-    DCACHE_L1I_LINE_DEADTIME_INTERVAL, DCACHE_L1I_LINE_DEADTIME_SLOTS, 
-    DCACHE_L1I_LINE_HITCOUNT_INTERVAL, DCACHE_L1I_LINE_HITCOUNT_SLOTS> L1IStatsCollectorBase;
-
-  typedef HistogramAssociativeArrayStatisticsCollector<2, L2CacheLine,
-    DCACHE_L2_LINE_LIFETIME_INTERVAL, DCACHE_L2_LINE_LIFETIME_SLOTS, 
-    DCACHE_L2_LINE_DEADTIME_INTERVAL, DCACHE_L2_LINE_DEADTIME_SLOTS, 
-    DCACHE_L2_LINE_HITCOUNT_INTERVAL, DCACHE_L2_LINE_HITCOUNT_SLOTS> L2StatsCollectorBase;
-
-#ifdef ENABLE_L3_CACHE
-  typedef HistogramAssociativeArrayStatisticsCollector<3, L3CacheLine,
-    DCACHE_L3_LINE_LIFETIME_INTERVAL, DCACHE_L3_LINE_LIFETIME_SLOTS, 
-    DCACHE_L3_LINE_DEADTIME_INTERVAL, DCACHE_L3_LINE_DEADTIME_SLOTS, 
-    DCACHE_L3_LINE_HITCOUNT_INTERVAL, DCACHE_L3_LINE_HITCOUNT_SLOTS> L3StatsCollectorBase;
-#endif
-
-  struct L1StatsCollector: public L1StatsCollectorBase { };
-  struct L1IStatsCollector: public L1IStatsCollectorBase { };
-  struct L2StatsCollector: public L2StatsCollectorBase { };
-#ifdef ENABLE_L3_CACHE
-  struct L3StatsCollector: public L3StatsCollectorBase { };
-#endif
-
+#ifdef CORE_AMD_K8
+#include <dcache-amd-k8.h>
+#else
+#ifdef CORE_AMD_BARCELONA_ASF
+#include <dcache-amd-barcelona-asf.h>
 #else
-  typedef NullAssociativeArrayStatisticsCollector<W64, L1CacheLine> L1StatsCollector;
-  typedef NullAssociativeArrayStatisticsCollector<W64, L1ICacheLine> L1IStatsCollector;
-  typedef NullAssociativeArrayStatisticsCollector<W64, L2CacheLine> L2StatsCollector;
-#ifdef ENABLE_L3_CACHE
-  typedef NullAssociativeArrayStatisticsCollector<W64, L3CacheLine> L3StatsCollector;
+#error Please specify a core flavour by defining CORE_XXX in ptlsim.h!
 #endif
 #endif
-
-  template <typename V, int setcount, int waycount, int linesize, typename stats = NullAssociativeArrayStatisticsCollector<W64, V> > 
-  struct DataCache: public AssociativeArray<W64, V, setcount, waycount, linesize, stats> {
-    typedef AssociativeArray<W64, V, setcount, waycount, linesize, stats> base_t;
-    void clearstats() {
-#ifdef TRACK_LINE_USAGE
-      foreach (set, L1_SET_COUNT) {
-        foreach (way, waycount) {
-          base_t::sets[set][way].clearstats();
-        }
-      }
 #endif
-    }
-  };
-
-  struct L1Cache: public DataCache<L1CacheLine, L1_SET_COUNT, L1_WAY_COUNT, L1_LINE_SIZE, L1StatsCollector> {
-    L1CacheLine* validate(W64 addr, const bitvec<L1_LINE_SIZE>& valid) {
-      addr = tagof(addr);
-      L1CacheLine* line = select(addr);
-      line->fill(addr, valid);
-      return line;
-    }
-  };
-
-  static inline ostream& operator <<(ostream& os, const L1Cache& cache) {
-    return os;
-  }
-
-  //
-  // L1 instruction cache
-  //
-
-  struct L1ICache: public DataCache<L1ICacheLine, L1I_SET_COUNT, L1I_WAY_COUNT, L1I_LINE_SIZE, L1IStatsCollector> {
-    L1ICacheLine* validate(W64 addr, const bitvec<L1I_LINE_SIZE>& valid) {
-      addr = tagof(addr);
-      L1ICacheLine* line = select(addr);
-      line->fill(addr, valid);
-      return line;
-    }
-  };
-
-  static inline ostream& operator <<(ostream& os, const L1ICache& cache) {
-    return os;
-  }
-
-  //
-  // L2 cache
-  //
-
-  typedef DataCache<L2CacheLine, L2_SET_COUNT, L2_WAY_COUNT, L2_LINE_SIZE, L2StatsCollector> L2CacheBase;
-
-  struct L2Cache: public L2CacheBase {
-    void validate(W64 addr) {
-      L2CacheLine* line = select(addr);
-      if (!line) return;
-      line->valid.setall();
-    }
-
-    void deliver(W64 address);
-  };
-
-  //
-  // L3 cache
-  //
-#ifdef ENABLE_L3_CACHE
-  static inline ostream& operator <<(ostream& os, const L3CacheLine& line) {
-    return line.print(os, 0);
-  }
-
-  struct L3Cache: public DataCache<L3CacheLine, L3_SET_COUNT, L3_WAY_COUNT, L3_LINE_SIZE, L3StatsCollector> {
-    L3CacheLine* validate(W64 addr) {
-      W64 oldaddr;
-      L3CacheLine* line = select(addr, oldaddr);
-      return line;
-    }
-  };
-#endif
-
-  static inline void prep_sframask_and_reqmask(const SFR* sfr, W64 addr, int sizeshift, bitvec<L1_LINE_SIZE>& sframask, bitvec<L1_LINE_SIZE>& reqmask) {
-    sframask = (sfr) ? (bitvec<L1_LINE_SIZE>(sfr->bytemask) << 8*lowbits(sfr->physaddr, log2(L1_LINE_SIZE)-3)) : 0;
-    reqmask = bitvec<L1_LINE_SIZE>(bitmask(1 << sizeshift)) << lowbits(addr, log2(L1_LINE_SIZE));
-  }
-
-  static inline void prep_L2_sframask_and_reqmask(const SFR* sfr, W64 addr, int sizeshift, bitvec<L2_LINE_SIZE>& sframask, bitvec<L2_LINE_SIZE>& reqmask) {
-    sframask = (sfr) ? (bitvec<L2_LINE_SIZE>(sfr->bytemask) << 8*lowbits(sfr->physaddr, log2(L2_LINE_SIZE)-3)) : 0;
-    reqmask = bitvec<L2_LINE_SIZE>(bitmask(1 << sizeshift)) << lowbits(addr, log2(L2_LINE_SIZE));
-  }
-
-  //
-  // TLB class with one-hot semantics. 36 bit tags are required since
-  // virtual addresses are 48 bits, so 48 - 12 (2^12 bytes per page)
-  // is 36 bits.
-  //
-  template <int tlbid, int size>
-  struct TranslationLookasideBuffer: public FullyAssociativeTagsNbitOneHot<size, 40> {
-    typedef FullyAssociativeTagsNbitOneHot<size, 40> base_t;
-    TranslationLookasideBuffer(): base_t() { }
-
-    void reset() {
-      base_t::reset();
-    }
-
-    // Get the 40-bit TLB tag (36 bit virtual page ID plus 4 bit threadid)
-    static W64 tagof(W64 addr, W64 threadid) {
-      return bits(addr, 12, 36) | (threadid << 36);
-    }
-
-    bool probe(W64 addr, int threadid = 0) {
-      W64 tag = tagof(addr, threadid);
-      return (base_t::probe(tag) >= 0);
-    }
-
-    bool insert(W64 addr, int threadid = 0) {
-      addr = floor(addr, PAGE_SIZE);
-      W64 tag = tagof(addr, threadid);
-      W64 oldtag;
-      int way = base_t::select(tag, oldtag);
-      W64 oldaddr = lowbits(oldtag, 36) << 12;
-      if (logable(6)) {
-        logfile << "TLB insertion of virt page ", (void*)(Waddr)addr, " (virt addr ", 
-          (void*)(Waddr)(addr), ") into way ", way, ": ",
-          ((oldtag != tag) ? "evicted old entry" : "already present"), endl;
-      }
-      return (oldtag != tag);
-    }
-
-    int flush_all() {
-      reset();
-      return size;
-    }
-
-    int flush_thread(W64 threadid) {
-      W64 tag = threadid << 36;
-      W64 tagmask = 0xfULL << 36;
-      bitvec<size> slotmask = base_t::masked_match(tag, tagmask);
-      int n = slotmask.popcount();
-      base_t::masked_invalidate(slotmask);
-      return n;
-    }
-
-    int flush_virt(Waddr virtaddr, W64 threadid) {
-      return invalidate(tagof(virtaddr, threadid));
-    }
-  };
-
-  template <int tlbid, int size>
-  static inline ostream& operator <<(ostream& os, const TranslationLookasideBuffer<tlbid, size>& tlb) {
-    return tlb.print(os);
-  }
-
-  typedef TranslationLookasideBuffer<0, DTLB_SIZE> DTLB;
-  typedef TranslationLookasideBuffer<1, ITLB_SIZE> ITLB;
-
-  struct CacheHierarchy;
-
-  //
-  // Load fill request queue (LFRQ) contains any requests for outstanding
-  // loads from both the L2 or L1. 
-  //
-  struct LoadFillReq {
-    W64 addr;       // physical address
-    W64 data;       // data already known so far (e.g. from SFR)
-    LoadStoreInfo lsi;
-    W32  initcycle;
-    byte mask;
-    byte fillL1:1, fillL2:1;
-    W8s  mbidx;
-
-    inline LoadFillReq() { }
-  
-    LoadFillReq(W64 addr, W64 data, byte mask, LoadStoreInfo lsi);
-    ostream& print(ostream& os) const;
-  };
-
-  static inline ostream& operator <<(ostream& os, const LoadFillReq& req) {
-    return req.print(os);
-  }
-
-  template <int size>
-  struct LoadFillReqQueue {
-    CacheHierarchy& hierarchy;
-    bitvec<size> freemap;                    // Slot is free
-    bitvec<size> waiting;                    // Waiting for the line to arrive in the L1
-    bitvec<size> ready;                      // Wait to extract/signext and write into register
-    LoadFillReq reqs[size];
-    int count;
-
-    static const int SIZE = size;
-
-    LoadFillReqQueue(): hierarchy(*((CacheHierarchy*)null)) { reset(); }
-    LoadFillReqQueue(CacheHierarchy& hierarchy_): hierarchy(hierarchy_) { reset(); }
-
-    // Clear entries belonging to one thread
-    void reset(int threadid);
-
-    // Reset all threads
-    void reset() {
-      freemap.setall();
-      ready = 0;
-      waiting = 0;
-      count = 0;
-    }
-
-    void changestate(int idx, bitvec<size>& oldstate, bitvec<size>& newstate) {
-      oldstate[idx] = 0;
-      newstate[idx] = 1;
-    }
-
-    void free(int lfrqslot) {
-      changestate(lfrqslot, waiting, freemap);
-    }
-
-    bool full() const {
-      return (!freemap);
-    }
-
-    int remaining() const {
-      return (size - count);
-    }
-
-    void annul(int lfrqslot);
-
-    void restart();
-
-    int add(const LoadFillReq& req);
-
-    void wakeup(W64 address, const bitvec<LFRQ_SIZE>& lfrqmask);
-
-    void clock();
-
-    LoadFillReq& operator [](int idx) { return reqs[idx]; }
-    const LoadFillReq& operator [](int idx) const { return reqs[idx]; }
-
-    ostream& print(ostream& os) const;
-  };
-
-  template <int size>
-  static inline ostream& operator <<(ostream& os, const LoadFillReqQueue<size>& lfrq) {
-    return lfrq.print(os);
-  }
-
-  enum { STATE_IDLE, STATE_DELIVER_TO_L3, STATE_DELIVER_TO_L2, STATE_DELIVER_TO_L1 };
-  static const char* missbuf_state_names[] = {"idle", "mem->L3", "L3->L2", "L2->L1"};
-
-  template <int SIZE>
-  struct MissBuffer {
-    struct Entry {
-      W64 addr;     // physical line address we are waiting for
-      W16 state;
-      W16 dcache:1, icache:1;    // L1I vs L1D
-      W32 cycles;
-      W16 rob;
-      W8 threadid;
-
-      bitvec<LFRQ_SIZE> lfrqmap;  // which LFRQ entries should this load wake up?
-      void reset() {
-        lfrqmap = 0;
-        addr = 0xffffffffffffffffULL;
-        state = STATE_IDLE;
-        cycles = 0;
-        icache = 0;
-        dcache = 0;
-        rob = 0xffff;
-        threadid = 0xff;
-      }
-    };
-
-    MissBuffer(): hierarchy(*((CacheHierarchy*)null)) { reset(); }
-    MissBuffer(CacheHierarchy& hierarchy_): hierarchy(hierarchy_) { reset(); }
-
-    CacheHierarchy& hierarchy;
-    Entry missbufs[SIZE];
-    bitvec<SIZE> freemap;
-    int count;
-
-    void reset();
-    void reset(int threadid);
-    void restart();
-    bool full() const { return (!freemap); }
-    int remaining() const { return (SIZE - count); }
-    int find(W64 addr);
-    int initiate_miss(W64 addr, bool hit_in_L2, bool icache = 0, int rob = 0xffff, int threadid = 0xfe);
-    int initiate_miss(LoadFillReq& req, bool hit_in_L2, int rob = 0xffff);
-    void annul_lfrq(int slot);
-    void annul_lfrq(int slot, int threadid);
-    void clock();
-
-    ostream& print(ostream& os) const;
-  };
-
-  template <int size>
-  static inline ostream& operator <<(ostream& os, const MissBuffer<size>& missbuf) {
-    return missbuf.print(os);
-  }
-
-  struct PerCoreCacheCallbacks {
-    virtual void dcache_wakeup(LoadStoreInfo lsi, W64 physaddr);
-    virtual void icache_wakeup(LoadStoreInfo lsi, W64 physaddr);
-  };
-
-  struct CacheHierarchy {
-    LoadFillReqQueue<LFRQ_SIZE> lfrq;
-    MissBuffer<MISSBUF_COUNT> missbuf;
-    L1Cache L1;
-    L1ICache L1I;
-    L2Cache L2;
-#ifdef ENABLE_L3_CACHE
-    L3Cache L3;
-#endif
-    DTLB dtlb;
-    ITLB itlb;
-
-    PerCoreCacheCallbacks* callback;
-
-    CacheHierarchy(): lfrq(*this), missbuf(*this) { callback = null; }
-
-    bool probe_cache_and_sfr(W64 addr, const SFR* sfra, int sizeshift);
-    bool covered_by_sfr(W64 addr, SFR* sfr, int sizeshift);
-    void annul_lfrq_slot(int lfrqslot);
-    int issueload_slowpath(Waddr physaddr, SFR& sfra, LoadStoreInfo lsi, bool& L2hit);
-
-    int issueload_slowpath(Waddr physaddr, SFR& sfra, LoadStoreInfo lsi) {
-      bool L2hit = 0;
-      return issueload_slowpath(physaddr, sfra, lsi, L2hit);
-    }
-
-    int get_lfrq_mb(int lfrqslot) const;
-    int get_lfrq_mb_state(int lfrqslot) const;
-    bool lfrq_or_missbuf_full() const { return lfrq.full() | missbuf.full(); }
-
-    W64 commitstore(const SFR& sfr, int threadid = 0xff, bool perform_actual_write = true);
-    W64 speculative_store(const SFR& sfr, int threadid = 0xff);
-
-    void initiate_prefetch(W64 addr, int cachelevel);
-
-    bool probe_icache(Waddr virtaddr, Waddr physaddr);
-    int initiate_icache_miss(W64 addr, int rob = 0xffff, int threadid = 0xff);
-
-    void reset();
-    void clock();
-    void complete();
-    void complete(int threadid);
-    ostream& print(ostream& os);
-  };
-#endif // STATS_ONLY
-};
-
-struct PerContextDataCacheStats { // rootnode:
-  struct load {
-    struct hit { // node: summable
-      W64 L1;
-      W64 L2;
-      W64 L3;
-      W64 mem;
-    } hit;
-        
-    struct dtlb { // node: summable
-      W64 hits;
-      W64 misses;
-    } dtlb;
-
-    struct tlbwalk { // node: summable
-      W64 L1_dcache_hit;
-      W64 L1_dcache_miss;
-      W64 no_lfrq_mb;
-    } tlbwalk;
-  } load;
- 
-  struct fetch {
-    struct hit { // node: summable
-      W64 L1;
-      W64 L2;
-      W64 L3;
-      W64 mem;
-    } hit;
-    
-    struct itlb { // node: summable
-      W64 hits;
-      W64 misses;
-    } itlb;
-
-    struct tlbwalk { // node: summable
-      W64 L1_dcache_hit;
-      W64 L1_dcache_miss;
-      W64 no_lfrq_mb;      
-    } tlbwalk;
-  } fetch;
-  
-  struct store {
-    W64 prefetches;
-  } store;
-};
-
-struct DataCacheStats { // rootnode:
-  struct load {
-    struct transfer { // node: summable
-      W64 L2_to_L1_full;
-      W64 L2_to_L1_partial;
-      W64 L2_L1I_full;
-    } transfer;
-  } load;
-
-  struct missbuf {
-    W64 inserts;
-    struct deliver { // node: summable
-      W64 mem_to_L3;
-      W64 L3_to_L2;
-      W64 L2_to_L1D;
-      W64 L2_to_L1I;
-    } deliver;
-  } missbuf;
-
-  struct prefetch { // node: summable
-    W64 in_L1;
-    W64 in_L2;
-    W64 required;
-  } prefetch;
-
-  struct lfrq {
-    W64 inserts;
-    W64 wakeups;
-    W64 annuls;
-    W64 resets;
-    W64 total_latency;
-    double average_latency;
-    W64 width[CacheSubsystem::MAX_WAKEUPS_PER_CYCLE+1]; // histo: 0, CacheSubsystem::MAX_WAKEUPS_PER_CYCLE+1, 1
-  } lfrq;
-
-  PerContextDataCacheStats total;
-  // IMPORTANT: This list MUST be equal in length to the number of active VCPUs (at most MAX_CONTEXTS):
-  PerContextDataCacheStats vcpu0;
-  PerContextDataCacheStats vcpu1;
-  PerContextDataCacheStats vcpu2;
-  PerContextDataCacheStats vcpu3;
-  PerContextDataCacheStats vcpu4;
-  PerContextDataCacheStats vcpu5;
-  PerContextDataCacheStats vcpu6;
-  PerContextDataCacheStats vcpu7;
-  PerContextDataCacheStats vcpu8;
-  PerContextDataCacheStats vcpu9;
-  PerContextDataCacheStats vcpu10;
-  PerContextDataCacheStats vcpu11;
-  PerContextDataCacheStats vcpu12;
-  PerContextDataCacheStats vcpu13;
-  PerContextDataCacheStats vcpu14;
-  PerContextDataCacheStats vcpu15;
-  PerContextDataCacheStats vcpu16;
-  PerContextDataCacheStats vcpu17;
-  PerContextDataCacheStats vcpu18;
-  PerContextDataCacheStats vcpu19;
-  PerContextDataCacheStats vcpu20;
-  PerContextDataCacheStats vcpu21;
-  PerContextDataCacheStats vcpu22;
-  PerContextDataCacheStats vcpu23;
-  PerContextDataCacheStats vcpu24;
-  PerContextDataCacheStats vcpu25;
-  PerContextDataCacheStats vcpu26;
-  PerContextDataCacheStats vcpu27;
-  PerContextDataCacheStats vcpu28;
-  PerContextDataCacheStats vcpu29;
-  PerContextDataCacheStats vcpu30;
-  PerContextDataCacheStats vcpu31;
-};
 
 #endif // _DCACHE_H_
Only in ptlsim-stable: dcache.h.orig
Only in ptlsim-asf/: decode-asf.cpp
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/decode-complex.cpp ptlsim-asf/decode-complex.cpp
--- ptlsim-stable/decode-complex.cpp	2010-03-02 13:20:36.029694000 +0100
+++ ptlsim-asf/decode-complex.cpp	2010-03-02 12:00:55.160931000 +0100
@@ -113,11 +113,11 @@
   // REG_rip is filled out for us
 }
 
-static const char cpuid_vendor[12+1] = "GenuineIntel";
-static const char cpuid_description[48+1] = "Intel(R) Xeon(TM) CPU 2.00 GHz                  ";
+//static const char cpuid_vendor[12+1] = "GenuineIntel";
+//static const char cpuid_description[48+1] = "Intel(R) Xeon(TM) CPU 2.00 GHz                  ";
 
-//static const char cpuid_vendor[12+1] = "PTLsimCPUx64";
-//static const char cpuid_description[48+1] = "PTLsim Cycle Accurate x86-64 Simulator Model    ";
+static const char cpuid_vendor[12+1] = "PTLsimCPUx64";
+static const char cpuid_description[48+1] = "PTLsim Cycle Accurate x86-64 Simulator Model    ";
 
 
 //
@@ -796,35 +796,10 @@
   }
 
   case 0x61: {
-    // popa
-    if (use64) {
-      // popa is invalid in 64-bit mode
+    // popa: always decoded as invalid here [not used by gcc]
       MakeInvalid();
       break;
     }
-    EndOfDecode();
-
-    int sizeshift = (opsize_prefix) ? 1 : 2;
-    int size = (1 << sizeshift);
-    int offset = 0;
-
-#define POP(reg) \
-    this << TransOp(OP_ld, reg, REG_rsp, REG_imm, REG_zero, sizeshift, offset);
-
-    POP(REG_rdi);   offset += size;
-    POP(REG_rsi);   offset += size;
-    POP(REG_rbp);   offset += size;
-    /* skip rsp */  offset += size;
-    POP(REG_rbx);   offset += size;
-    POP(REG_rdx);   offset += size;
-    POP(REG_rcx);   offset += size;
-    POP(REG_rax);   offset += size;
-#undef POP
-
-    this << TransOp(OP_add, REG_rsp, REG_rsp, REG_imm, REG_zero, sizeshift, offset);
-
-    break;
-  }
 
   case 0x62: {
     // bound [not used by gcc]
@@ -1768,13 +1743,16 @@
   }
 
   case 0x10d: {
-    // prefetchw [eform] (NOTE: this is an AMD-only insn from K6 onwards)
+    // prefetch(w) [eform] (NOTE: this is an AMD-only insn from K6 onwards)
+    /* Let decode_asf handle the locked versions of prefetchw */
+    if (prefixes & PFX_LOCK) return false;
+
     DECODE(eform, ra, b_mode);
     EndOfDecode();
 
     int level = 2;
-    prefixes &= ~PFX_LOCK;
-    operand_load(REG_temp0, ra, OP_ld_pre, DATATYPE_INT, level);
+    assert(!(prefixes & PFX_LOCK));
+    operand_load(REG_temp0, ra, OP_ld_pre, DATATYPE_INT, level, (modrm.reg == 1));
     break;
   }
 
@@ -2367,7 +2345,8 @@
   }
 
   default: {
-    MakeInvalid();
+    // S.D.: Return false instead of calling MakeInvalid() so the ASF decoder gets a chance to run first.
+    return false;
     break;
   }
   }
Only in ptlsim-stable: decode-complex.cpp~
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/decode-core.cpp ptlsim-asf/decode-core.cpp
--- ptlsim-stable/decode-core.cpp	2010-03-02 13:20:36.056668000 +0100
+++ ptlsim-asf/decode-core.cpp	2010-03-02 12:00:55.177916000 +0100
@@ -3,6 +3,8 @@
 // Decoder for x86 and x86-64 to PTL transops
 //
 // Copyright 1999-2008 Matt T. Yourst <yourst@yourst.com>
+// Copyright (c) 2007-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -109,6 +111,9 @@
   // I/O and legacy
   assist_ioport_in,
   assist_ioport_out,
+#ifdef ENABLE_ASF
+  assist_asf_abort,
+#endif
 };
 
 int assist_index(assist_func_t assist) {
@@ -157,6 +162,7 @@
   ag.cond = 0;
   ag.eom = 0;
   ag.internal = 0;
+  ag.is_asf = 0; // SD: The address generation is not ASF related.
   ag.unaligned = 0;
   ag.rd = REG_temp9;
   ag.rc = REG_zero;
@@ -211,7 +217,11 @@
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, PFX_FS, PFX_GS, PFX_DATA, PFX_ADDR, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#ifdef DECODE_XOP
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PFX_XOP1,
+#else
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#endif
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PFX_FWAIT, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -230,7 +240,11 @@
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, PFX_FS, PFX_GS, PFX_DATA, PFX_ADDR, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#ifdef DECODE_XOP
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PFX_XOP1,
+#else
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+#endif
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PFX_FWAIT, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
   0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -240,7 +254,11 @@
   PFX_LOCK, 0, PFX_REPNZ, PFX_REPZ, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
 };
 
-const char* prefix_names[PFX_count] = {"repz", "repnz", "lock", "cs", "ss", "ds", "es", "fs", "gs", "datasz", "addrsz", "rex", "fwait"};
+const char* prefix_names[PFX_count] = {"repz", "repnz", "lock", "cs", "ss", "ds", "es", "fs", "gs", "datasz", "addrsz", "rex", "fwait",
+#ifdef DECODE_XOP
+    "xop1",
+#endif
+    };
 
 const char* uniform_arch_reg_names[APR_COUNT] = {
   // 64-bit
@@ -333,6 +351,31 @@
   /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
 };
 
+#ifdef DECODE_XOP
+// XOP counterpart of twobyte_has_modrm (presumably: nonzero = opcode takes a ModRM byte); only 0x0f and 0x12 are marked 1 so far. TODO: Complete this table for proper XOP decoding
+static const byte xop_has_modrm[256] = {
+  /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
+  /*       -------------------------------        */
+  /* 00 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,1, /* 00 */
+  /* 10 */ _,_,1,_,_,_,_,_,_,_,_,_,_,_,_,_, /* 10 */
+  /* 20 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, /* 20 */
+  /* 30 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, /* 30 */
+  /* 40 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, /* 40 */
+  /* 50 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, /* 50 */
+  /* 60 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, /* 60 */
+  /* 70 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, /* 70 */
+  /* 80 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, /* 80 */
+  /* 90 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, /* 90 */
+  /* a0 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, /* a0 */
+  /* b0 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, /* b0 */
+  /* c0 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, /* c0 */
+  /* d0 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, /* d0 */
+  /* e0 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_, /* e0 */
+  /* f0 */ _,_,_,_,_,_,_,_,_,_,_,_,_,_,_,_  /* f0 */
+  /*       -------------------------------        */
+  /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
+};
+#endif
 static const byte twobyte_has_modrm[256] = {
   /*       0 1 2 3 4 5 6 7 8 9 a b c d e f        */
   /*       -------------------------------        */
@@ -913,7 +956,7 @@
   return basereg;
 }
 
-void TraceDecoder::address_generate_and_load_or_store(int destreg, int srcreg, const DecodedOperand& memref, int opcode, int datatype, int cachelevel, bool force_seg_bias, bool rmw) {
+void TraceDecoder::address_generate_and_load_or_store(int destreg, int srcreg, const DecodedOperand& memref, int opcode, int datatype, int cachelevel, bool force_seg_bias, bool invalidating, bool rmw) {
   //
   // In the address generation form used by internally generated
   // uops, we need the full virtual address, including the segment base
@@ -977,6 +1020,7 @@
       ldst.datatype = datatype;
       ldst.cachelevel = cachelevel;
       ldst.locked = locked;
+      ldst.invalidating = invalidating;
       ldst.extshift = 0;
       this << ldst;
     } else {
@@ -997,6 +1041,7 @@
       ldst.datatype = datatype;
       ldst.cachelevel = cachelevel;
       ldst.locked = locked;
+      ldst.invalidating = invalidating;
       ldst.extshift = 0; // rmw;
     }
     this << ldst;
@@ -1021,6 +1066,7 @@
       ldst.cachelevel = cachelevel;
       ldst.locked = locked;
       ldst.extshift = 0; // rmw;
+      ldst.invalidating = invalidating;
     }
     this << ldst;
   } else {
@@ -1041,18 +1087,19 @@
       ldst.datatype = datatype;
       ldst.cachelevel = cachelevel;
       ldst.locked = locked;
+      ldst.invalidating = invalidating;
       ldst.extshift = 0; // rmw;
     }
     this << ldst;
   }
 }
 
-void TraceDecoder::operand_load(int destreg, const DecodedOperand& memref, int opcode, int datatype, int cachelevel, bool rmw) {
-  address_generate_and_load_or_store(destreg, REG_zero, memref, opcode, datatype, cachelevel, false, rmw);
+void TraceDecoder::operand_load(int destreg, const DecodedOperand& memref, int opcode, int datatype, int cachelevel, bool invalidating, bool rmw) {
+  address_generate_and_load_or_store(destreg, REG_zero, memref, opcode, datatype, cachelevel, false, invalidating, rmw);
 }
 
 void TraceDecoder::result_store(int srcreg, int tempreg, const DecodedOperand& memref, int datatype, bool rmw) {
-  address_generate_and_load_or_store(REG_mem, srcreg, memref, OP_st, datatype, 0, 0, rmw);
+  address_generate_and_load_or_store(REG_mem, srcreg, memref, OP_st, datatype, 0, false, true, rmw);
 }
 
 void TraceDecoder::alu_reg_or_mem(int opcode, const DecodedOperand& rd, const DecodedOperand& ra, W32 setflags, int rcreg, 
@@ -1141,7 +1188,8 @@
 
     bool isimm = (ra.type == OPTYPE_IMM);
     int srcreg = (isimm) ? REG_imm : arch_pseudo_reg_to_arch_reg[ra.reg.reg];
-    operand_load(REG_temp0, rd, OP_ld, 0, 0, 1);
+    // Loads of RMW instructions invalidate in other caches (read for ownership), if they produce a result!
+    operand_load(REG_temp0, rd, OP_ld, 0, 0, !flagsonly, 1);
 
     int sizeshift = rd.mem.size;
     bool rahigh = (isimm) ? 0 : reginfo[ra.reg.reg].hibyte;
@@ -1321,6 +1369,13 @@
       rex = 0;
       prefixes &= ~PFX_REX;
     }
+#ifdef DECODE_XOP
+    if (prefix == PFX_XOP1) {
+      // Check that this is not a 'pop' instruction -- by peeking ahead
+      if (XopByte2(insnbytes[byteoffset+1]).xopmapselect < 8)
+        break;
+    }
+#endif
     prefixes |= prefix;
     if (prefix == PFX_REX) { rex = b; }
     byteoffset++; rip++;
@@ -1858,9 +1913,25 @@
 
   if (prefixes & PFX_ADDR) addrsize_prefix = 1;
 
+#ifdef DECODE_XOP
+  bool xop = false;
+  if (prefixes & PFX_XOP1) {
+    xop2 = XopByte2(fetch1());
+    xop3 = XopByte3(fetch1());
+    xop  = true;
+  }
+#endif
+
   bool uses_sse = 0;
   op = fetch1();
   bool need_modrm = onebyte_has_modrm[op];
+#ifdef DECODE_XOP
+  if (xop) {
+    need_modrm = xop_has_modrm[op];
+    op |= 0x700;
+  }
+#endif
+
   if (op == 0x0f) {
     op = fetch1();
     need_modrm = twobyte_has_modrm[op];
@@ -1933,11 +2004,20 @@
   case 6:
     stats.decoder.x86_decode_type[DECODE_TYPE_X87]++;
     rc = decode_x87(); break;
+#ifdef DECODE_XOP
+  case 7:
+    stats.decoder.x86_decode_type[DECODE_TYPE_SSE]++;
+    rc = decode_xop(); break;
+#endif
   default: {
     assert(false);
   }
   } // switch
 
+#ifdef ENABLE_ASF /* decode_asf() is only declared in decode.h when ENABLE_ASF is defined */
+  if (!rc && !invalid) rc = decode_asf(); /* rejected but not marked invalid: check for possible ASF instructions */
+#endif
+
   if (!rc) return rc;
 
   user_insn_count++;
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/decode-fast.cpp ptlsim-asf/decode-fast.cpp
--- ptlsim-stable/decode-fast.cpp	2010-03-02 13:20:36.090634000 +0100
+++ ptlsim-asf/decode-fast.cpp	2010-03-02 12:00:55.188903000 +0100
@@ -3,6 +3,8 @@
 // Decoder for simple x86 instructions
 //
 // Copyright 1999-2008 Matt T. Yourst <yourst@yourst.com>
+// Copyright (c) 2007-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <decode.h>
@@ -216,6 +218,9 @@
 
   case 0x88 ... 0x8b: {
     // moves
+    /* Handle LOCKed loads with ASF! */
+    if (prefixes & PFX_LOCK) return false;
+
     int bytemode = bit(op, 0) ? v_mode : b_mode;
     switch (bit(op, 1)) {
     case 0: DECODE(eform, rd, bytemode); DECODE(gform, ra, bytemode); break;
@@ -526,12 +531,11 @@
   case 0xeb: {
     bool iscall = (op == 0xe8);
     // CALL or JMP rel16/rel32/rel64
-    // near unconditional branches with 8-bit displacement:
+    // near unconditional branches with 8-bit displacement:
     bool longform = (op != 0xeb);
     DECODE(iform, ra, (longform ? v_mode : b_mode));
-    W64 target = (Waddr)(rip + ra.imm.imm);
-    bb.rip_taken = target;
-    bb.rip_not_taken = target;
+    bb.rip_taken = (Waddr)rip + (W64s)ra.imm.imm;
+    bb.rip_not_taken = bb.rip_taken;
     bb.brtype = (longform) ? BRTYPE_BRU_IMM32 : BRTYPE_BRU_IMM8;
     end_of_block = true;
     EndOfDecode();
@@ -548,8 +552,8 @@
       this << TransOp(OP_collcc, REG_temp0, REG_zf, REG_cf, REG_of, 3, 0, 0, FLAGS_DEFAULT_ALU);
     TransOp transop(OP_bru, REG_rip, REG_zero, REG_zero, REG_zero, 3);
     transop.extshift = (iscall) ? BRANCH_HINT_PUSH_RAS : 0;
-    transop.riptaken = target;
-    transop.ripseq = target;
+    transop.riptaken = (Waddr)rip + (W64s)ra.imm.imm;
+    transop.ripseq = (Waddr)rip + (W64s)ra.imm.imm;
     this << transop;
     break;
   }
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/decode-sse.cpp ptlsim-asf/decode-sse.cpp
--- ptlsim-stable/decode-sse.cpp	2010-03-02 13:20:36.105619000 +0100
+++ ptlsim-asf/decode-sse.cpp	2010-03-02 12:00:55.211880000 +0100
@@ -3,6 +3,8 @@
 // Decoder for SSE/SSE2/SSE3/MMX and misc instructions
 //
 // Copyright 1999-2008 Matt T. Yourst <yourst@yourst.com>
+// Copyright (c) 2007-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <decode.h>
@@ -14,7 +16,6 @@
   DecodedOperand ra;
 
   is_sse = 1;
-  prefixes &= ~PFX_LOCK;
 
   switch (op) {
     /*
@@ -516,7 +517,7 @@
     lowop.datatype = DATATYPE_VEC_DOUBLE;
     this << lowop;
 
-    TransOp highop(uop, rdreg+1, rareg+0, rareg+1, REG_zero, 3);
+    TransOp highop(uop, rdreg+1, rareg+1, rareg+1, REG_zero, 3);
     highop.datatype = DATATYPE_VEC_DOUBLE;
     this << highop;
 
@@ -818,12 +819,15 @@
     break;
   }
 
+  case 0x56f: // movdqa load
+  case 0x26f: { // movdqu load
+    /* Let decode_asf handle LOCKed movdqa and movdqu */
+    if (prefixes & PFX_LOCK) return false;
   case 0x328: // movaps load 
   case 0x528: // movapd load
   case 0x310: // movups load
   case 0x510: // movupd load
-  case 0x56f: // movdqa load
-  case 0x26f: { // movdqu load
+
     DECODE(gform, rd, x_mode);
     DECODE(eform, ra, x_mode);
     EndOfDecode();
@@ -844,15 +848,17 @@
     break;
   }
 
+  case 0x57f: // movdqa store
+  case 0x27f: { // movdqu store
+    /* Let decode_asf handle LOCKed movdqa and movdqu */
+    if (prefixes & PFX_LOCK) return false;
   case 0x329: // movaps store
   case 0x529: // movapd store
   case 0x311: // movups store
   case 0x511: // movupd store
-  case 0x57f: // movdqa store
-  case 0x27f: // movdqu store
   case 0x5e7: // movntdq store
   case 0x52b: // movntpd store
-  case 0x32b: { // movntps store
+  case 0x32b: // movntps store
     DECODE(eform, rd, x_mode);
     DECODE(gform, ra, x_mode);
     EndOfDecode();
@@ -1214,6 +1220,9 @@
   }
 
   case 0x56e: { // movd xmm,rm32/rm64
+    /* Let decode_asf handle the locked versions of movd xmm,rm32/rm64 */
+    if (prefixes & PFX_LOCK) return false;
+
     DECODE(gform, rd, x_mode);
     DECODE(eform, ra, v_mode);
     EndOfDecode();
@@ -1249,6 +1258,9 @@
   }
 
   case 0x27e: { // movq xmm,xmmlo|mem64 with zero extension
+    /* Let decode_asf handle the locked versions of movq xmm,xmmlo|mem64 */
+    if (prefixes & PFX_LOCK) return false;
+
     DECODE(gform, rd, x_mode);
     DECODE(eform, ra, x_mode);
     EndOfDecode();
Only in ptlsim-stable: decode-x87.cpp.orig
Only in ptlsim-stable: decode-x87.cpp.rej
Only in ptlsim-stable: decode-x87.cpp~
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/decode.h ptlsim-asf/decode.h
--- ptlsim-stable/decode.h	2009-10-30 19:40:25.020180000 +0100
+++ ptlsim-asf/decode.h	2010-03-02 12:00:55.232859000 +0100
@@ -4,6 +4,8 @@
 // Decoder for x86 and x86-64 to PTL uops
 //
 // Copyright 2003-2008 Matt T. Yourst <yourst@yourst.com>
+// Copyright (c) 2007-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #ifndef _DECODE_H_
@@ -13,6 +15,10 @@
 #include <ptlsim.h>
 #include <datastore.h>
 
+#ifdef ENABLE_ASF
+#define DECODE_XOP
+#endif
+
 struct RexByte { 
   // a.k.a., b, x, r, w
   byte extbase:1, extindex:1, extreg:1, mode64:1, insnbits:4; 
@@ -36,6 +42,23 @@
   operator byte() const { return (*((byte*)this)); }
 };
 
+#ifdef DECODE_XOP
+struct XopByte2 { // bit-field view of one XOP encoding byte; decode-core reads it right after the XOP prefix
+  // Layout from most to least significant bit: !R.!X.!B.mmmmm (three inverted extension bits, then a 5-bit map select)
+  byte xopmapselect:5, extbaseinv:1, extindexinv:1, extreginv:1; // (lowest bit first in bit field)
+  XopByte2() { } // leaves the fields uninitialized
+  XopByte2(const byte& b) { *((byte*)this) = b; } // overlays the raw byte onto the bit fields
+  operator byte() const { return (*((byte*)this)); } // converts back to the raw byte
+};
+struct XopByte3 { // bit-field view of the XOP encoding byte fetched after XopByte2
+  // Layout from most to least significant bit: W.vvvv.L.pp
+  byte opex:2, veclen:1, vreg:4, mode64:1; // (lowest bit first in bit field)
+  XopByte3() { } // leaves the fields uninitialized
+  XopByte3(const byte& b) { *((byte*)this) = b; } // overlays the raw byte onto the bit fields
+  operator byte() const { return (*((byte*)this)); } // converts back to the raw byte
+};
+#endif
+
 static const int PFX_REPZ      = (1 << 0);
 static const int PFX_REPNZ     = (1 << 1);
 static const int PFX_LOCK      = (1 << 2);
@@ -49,7 +72,13 @@
 static const int PFX_ADDR      = (1 << 10);
 static const int PFX_REX       = (1 << 11);
 static const int PFX_FWAIT     = (1 << 12);
+#ifdef DECODE_XOP
+static const int PFX_XOP1      = (1 << 13);
+static const int PFX_count     = 14;
+#else
 static const int PFX_count     = 13;
+#endif
+
 
 extern const char* prefix_names[PFX_count];
 
@@ -166,6 +195,10 @@
   W32 prefixes;
   ModRMByte modrm;
   RexByte rex;
+#ifdef DECODE_XOP
+  XopByte2 xop2;
+  XopByte3 xop3;
+#endif
   W64 user_insn_count;
   bool last_flags_update_was_atomic;
   bool invalid;
@@ -201,8 +234,8 @@
   void immediate(int rdreg, int sizeshift, W64s imm, bool issigned = true);
   void abs_code_addr_immediate(int rdreg, int sizeshift, W64 imm);
   int bias_by_segreg(int basereg);
-  void address_generate_and_load_or_store(int destreg, int srcreg, const DecodedOperand& memref, int opcode, int datatype = DATATYPE_INT, int cachelevel = 0, bool force_seg_bias = false, bool rmw = false);
-  void operand_load(int destreg, const DecodedOperand& memref, int loadop = OP_ld, int datatype = 0, int cachelevel = 0, bool rmw = false);
+  void address_generate_and_load_or_store(int destreg, int srcreg, const DecodedOperand& memref, int opcode, int datatype = DATATYPE_INT, int cachelevel = 0, bool force_seg_bias = false, bool invalidating = false, bool rmw = false);
+  void operand_load(int destreg, const DecodedOperand& memref, int loadop = OP_ld, int datatype = 0, int cachelevel = 0, bool invalidating = false, bool rmw = false);
   void result_store(int srcreg, int tempreg, const DecodedOperand& memref, int datatype = 0, bool rmw = false);
   void alu_reg_or_mem(int opcode, const DecodedOperand& rd, const DecodedOperand& ra, W32 setflags, int rcreg, 
                       bool flagsonly = false, bool isnegop = false, bool ra_rb_imm_form = false, W64s ra_rb_imm_form_rbimm = 0);
@@ -227,6 +260,13 @@
   bool decode_complex();
   bool decode_sse();
   bool decode_x87();
+#ifdef ENABLE_ASF
+  bool decode_asf();
+#endif
+#ifdef DECODE_XOP
+  bool decode_xop();
+#endif
+  void scan_transb_and_flag_asf(byte opcode);
 
   typedef int rep_and_size_to_assist_t[3][4];
 
@@ -265,6 +305,7 @@
 
 #define MakeInvalid() { invalid |= true; EndOfDecode(); }
 
+// NOTE: Keep these ordered, as asf.cpp relies on ranges staying intact.
 enum {
   // Forced assists based on decode context
   ASSIST_INVALID_OPCODE,
@@ -331,7 +372,10 @@
   // Interrupts and I/O
   ASSIST_IOPORT_IN,
   ASSIST_IOPORT_OUT,
-  ASSIST_COUNT,
+#ifdef ENABLE_ASF
+  ASSIST_ASF_ABORT,
+#endif
+  ASSIST_COUNT
 };
 
 
@@ -409,6 +453,9 @@
   // I/O and legacy
   "ioport_in",
   "ioport_out",
+#ifdef ENABLE_ASF
+  "asf_abort",
+#endif
 };
 
 int propagate_exception_during_assist(Context& ctx, byte exception, W32 errorcode, Waddr virtaddr = 0, bool intN = 0);
@@ -479,7 +526,9 @@
 // I/O and legacy
 void assist_ioport_in(Context& ctx);
 void assist_ioport_out(Context& ctx);
-
+#ifdef ENABLE_ASF
+void assist_asf_abort(Context& ctx);
+#endif
 //
 // Global functions
 //
Only in ptlsim-stable: dstbuild.temp.cpp
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/globals.h ptlsim-asf/globals.h
--- ptlsim-stable/globals.h	2009-10-30 19:40:25.024181000 +0100
+++ ptlsim-asf/globals.h	2010-03-02 12:00:55.250841000 +0100
@@ -532,11 +532,15 @@
 #include <mathlib.h>
 #include <klibc.h>
 
+#ifdef PAGE_SHIFT
+#undef PAGE_SHIFT
+#endif
 #ifdef PAGE_SIZE
 #undef PAGE_SIZE
-// We're on x86 or x86-64, so pages are always 4096 bytes:
-#define PAGE_SIZE 4096
 #endif
+// We're on x86 or x86-64, so pages are always 4096 bytes:
+#define PAGE_SHIFT (12)
+#define PAGE_SIZE (1 << (PAGE_SHIFT))
 
 // e.g., head (a, b, c) => a
 // e.g., if list = (a, b, c), head list => a
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/kernel.cpp ptlsim-asf/kernel.cpp
--- ptlsim-stable/kernel.cpp	2010-03-02 13:20:36.217509000 +0100
+++ ptlsim-asf/kernel.cpp	2010-03-02 12:00:55.275816000 +0100
@@ -3,6 +3,8 @@
 // Linux Kernel Interface
 //
 // Copyright 2000-2008 Matt T. Yourst <yourst@yourst.com>
+// Copyright (c) 2007-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -1113,6 +1115,25 @@
 #endif
     break;
   }
+
+// Verbosity for debugging futex-related issues
+#if (0)
+  case __NR_futex: {
+    // S.D.: Mask the sys_futex / futex_wait to immediately return zero. This turns any futex based
+    // lock into a spinlock, which should not make any difference in terms of correctness. It seems
+    // like the split core model has some problems regarding futex wakeup. This is (hopefully) a quick
+    // hack.
+    if ((arg2 & 127) == 0 /* FUTEX_WAIT*/) {
+      logfile << "handle_syscall: sys_futex/FUTEX_WAIT on futex ", (void*)arg1, " masked!",endl;
+      ctx.commitarf[REG_rax] = 0;
+    } else 
+      ctx.commitarf[REG_rax] = do_syscall_64bit(syscallid, arg1, arg2, arg3, arg4, arg5, arg6);
+    break;
+  }
+#endif
+  case __NR_clone: {
+    //TODO: Handle the clone syscall by spawning another VCPU; until then it falls through to the default handler.
+  }
   default:
     ctx.commitarf[REG_rax] = do_syscall_64bit(syscallid, arg1, arg2, arg3, arg4, arg5, arg6);
     break;
@@ -1523,8 +1544,7 @@
 int ptlsim_inject(int argc, char** argv) {
   static const bool DEBUG = 0;
 
-  int filename_arg = configparser.parse(config, argc - 1, argv + 1);
-  const char* filename = argv[filename_arg];
+  const char* filename = argv[1];
 
   int x86_64_mode = is_elf_64bit(filename);
 
@@ -1541,7 +1561,7 @@
     if (DEBUG) cerr << "ptlsim[", sys_gettid(), "]: Executing ", filename, endl, flush;
     sys_ptrace(PTRACE_TRACEME, 0, 0, 0);
     // Child process stops after execve() below:
-    int rc = sys_execve(filename, (const char**)(argv + filename_arg), (const char**)environ);
+    int rc = sys_execve(filename, (const char**)argv+1, (const char**)environ);
 
     if (rc < 0) {
       cerr << "ptlsim: rc ", rc, ": unable to exec ", filename, endl, flush;
@@ -1728,11 +1748,7 @@
   int status;
   int rc;
 
-  //
-  // Find the argv index of the filename to execute and its arguments:
-  //
-  int filename_arg = configparser.parse(config, argc - 1, argv + 1);
-  const char* filename = argv[filename_arg];
+  const char* filename = argv[1];
 
   if (!is_elf_valid(filename)) {
     cerr << "ptlsim: cannot open ", filename, endl, flush;
@@ -1747,7 +1763,7 @@
     if (DEBUG) cerr << "ptlsim[", sys_gettid(), "]: Executing ", filename, endl, flush;
     sys_ptrace(PTRACE_TRACEME, 0, 0, 0);
     // Child process stops after execve() below:
-    int rc = sys_execve(filename, (const char**)(argv + filename_arg), (const char**)environ);
+    int rc = sys_execve(filename, (const char**)argv+1, (const char**)environ);
 
     if (rc < 0) {
       cerr << "ptlsim: rc ", rc, ": unable to exec ", filename, ": rc = ", rc, endl, flush;
@@ -1880,6 +1896,7 @@
     break;
   }
   default:
+    /*S.D.*/ cerr << "Received signal ", si->si_signo," ignoring it!", endl, flush;
     if (logfile) logfile << "Warning: unknown signal ", si->si_signo, "; ignoring", endl, flush; break;
   }
 }
@@ -1895,6 +1912,11 @@
   sa.k_sa_handler = external_signal_callback;
   sa.sa_flags = SA_SIGINFO;
   assert(sys_rt_sigaction(SIGXCPU, &sa, NULL, sizeof(W64)) == 0);
+
+  /*S.D.: try to fetch all signals within PTLsim!*/
+  sa.sa_flags = SA_SIGINFO;
+  assert(sys_rt_sigaction(SIGUSR1, &sa, NULL, sizeof(W64)) == 0);
+
 }
 
 bool check_for_async_sim_break() {
@@ -1940,51 +1962,52 @@
 int init_config(int argc, char** argv) {
   collect_sysinfo(stats, argc, argv);
 
-  //
-  // argv[] is a suffix of the parent argv[] of length argc.
-  // If the parent has some configuration between the initial ptlsim 
-  // executable in argv[0] and the argv[X] that starts the suffix (noting 
-  // that argv[X-1] will be "--"), then send that to configparser.parse().
-  //
+  char confroot[1024] = "";
+  stringbuf sb;
 
-  pid_t parent = sys_getppid();
-  stringbuf cmdline;
-  cmdline << "/proc/", parent, "/cmdline";
+  char* homedir = getenv("HOME");
 
-  //
-  // Load p_argc and p_argv for the parent, analogous to argc/argv
-  // /proc/<pid>/cmdline terminates each argument with a null character.
-  //
-  istream is(cmdline);
-  if (unlikely (!is)) {
-    cerr << "PTLsim error: cannot open /proc/<parent>/cmdline", endl, flush;
-    abort();
+  const char* execname = get_full_exec_filename();
+
+  sb << (homedir ? homedir : "/etc"), "/.ptlsim", execname, ".conf";
+
+  char args[4096] = ""; // start empty: a missing or empty config file only warns, and the tokenizer below still reads args
+  istream is(sb);
+  if (!is) {
+    cerr << "ptlsim: Warning: could not find '", sb, "', using defaults", endl;
   }
 
-  dynarray<char*> parent_args;
-  stringbuf line;
+  const char* simname = "ptlsim";
   
   for (;;) {
-    line.reset();
-    is >> line;
+    is >> readline(args, sizeof(args));
     if (!is) break;
-    parent_args.push(strdup(line));
+    char* p = args;
+    while (*p && (*p != '#')) p++;
+    if (*p == '#') *p = 0;
+    if (args[0]) break;
   }
-  is.close();
   
-  unsigned p_argc = parent_args.length;
-
-  //
-  // ConfigurationParser.parse() will automatically stop parsing at
-  // the first non-option (i.e. not starting with "-xxx") argument
-  // it finds (conveniently, this is always the target program name).
-  //  
-  int ptlsim_arg_count = configparser.parse(config, parent_args.length-1, parent_args+1);
-
-  foreach (i, parent_args.length) delete parent_args[i];
+  is.close();
 
-  handle_config_change(config, ptlsim_arg_count, parent_args+1);
+  char* ptlargs[1024];
 
+  ptlargs[0] = strdup(simname);
+  int ptlargc = 0;
+  char* p = args;
+  while (*p && (ptlargc < (lengthof(ptlargs)-1))) {
+    char* pbase = p;
+    while ((*p != 0) && (*p != ' ')) p++;
+    ptlargc++;
+    ptlargs[ptlargc] = strndup(pbase, p - pbase);
+    if (*p == 0) break;
+    *p++;
+    while ((*p != 0) && (*p == ' ')) p++;
+  }
+
+  // skip the leading argv[0]; just parse the options:
+  configparser.parse(config, ptlargc, ptlargs+1);
+  handle_config_change(config, argc, argv);
   logfile << config;
 
   return 0;
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/kernel.h ptlsim-asf/kernel.h
--- ptlsim-stable/kernel.h	2009-10-30 19:40:25.030182000 +0100
+++ ptlsim-asf/kernel.h	2010-03-02 12:00:55.284807000 +0100
@@ -380,6 +380,10 @@
   return data;
 }
 
+static inline W64 storephys(Waddr physaddr, W64 data) { // convenience wrapper: storemask() with a fixed byte mask
+  return storemask(physaddr, data, 0xff); // NOTE(review): 0xff presumably selects all 8 bytes of data; confirm against storemask()
+}
+
 // In userspace PTLsim, virtual == physical:
 inline RIPVirtPhys& RIPVirtPhys::update(Context& ctx, int bytes) {
   use64 = ctx.use64;
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/klibc.cpp ptlsim-asf/klibc.cpp
--- ptlsim-stable/klibc.cpp	2009-10-30 19:40:25.033180000 +0100
+++ ptlsim-asf/klibc.cpp	2010-03-02 12:00:55.299792000 +0100
@@ -431,11 +431,7 @@
 }
 
 #define __HAVE_ARCH_STRCHR
-#ifdef __CORRECT_ISO_CPP_STRING_H_PROTO
-const char * strchr(const char * s, int c)
-#else
 char * strchr(const char * s, int c)
-#endif
 {
 int d0;
 register char * __res;
@@ -456,11 +452,7 @@
 }
 
 #define __HAVE_ARCH_STRRCHR
-#ifdef __CORRECT_ISO_CPP_STRING_H_PROTO
-const char * strrchr(const char * s, int c)
-#else
 char * strrchr(const char * s, int c)
-#endif
 {
 int d0, d1;
 register char * __res;
@@ -517,11 +509,7 @@
 }
 
 #define __HAVE_ARCH_MEMCHR
-#ifdef __CORRECT_ISO_CPP_STRING_H_PROTO
-const void* memchr(const void * cs,int c,size_t count)
-#else
 void* memchr(const void * cs,int c,size_t count)
-#endif
 {
 int d0;
 register void * __res;
@@ -851,11 +839,7 @@
  * @s: The string to be searched
  * @c: The character to search for
  */
-#ifdef __CORRECT_ISO_CPP_STRING_H_PROTO
-const char *strchr(const char *s, int c)
-#else
 char *strchr(const char *s, int c)
-#endif
 {
 	for (; *s != (char)c; ++s)
 		if (*s == '\0')
@@ -871,11 +855,7 @@
  * @s: The string to be searched
  * @c: The character to search for
  */
-#ifdef __CORRECT_ISO_CPP_STRING_H_PROTO
-const char *strrchr(const char *s, int c)
-#else
 char *strrchr(const char *s, int c)
-#endif
 {
        const char *p = s + strlen(s);
        do {
@@ -993,11 +973,7 @@
  * @cs: The string to be searched
  * @ct: The characters to search for
  */
-#ifdef __CORRECT_ISO_CPP_STRING_H_PROTO
-const char *strpbrk(const char *cs, const char *ct)
-#else
 char *strpbrk(const char *cs, const char *ct)
-#endif
 {
 	const char *sc1, *sc2;
 
@@ -1167,11 +1143,7 @@
  * @s1: The string to be searched
  * @s2: The string to search for
  */
-#ifdef __CORRECT_ISO_CPP_STRING_H_PROTO
-const char *strstr(const char *s1, const char *s2)
-#else
 char *strstr(const char *s1, const char *s2)
-#endif
 {
 	int l1, l2;
 
@@ -1200,11 +1172,7 @@
  * returns the address of the first occurrence of @c, or %NULL
  * if @c is not found
  */
-#ifdef __CORRECT_ISO_CPP_STRING_H_PROTO
-const void *memchr(const void *s, int c, size_t n)
-#else
 void *memchr(const void *s, int c, size_t n)
-#endif
 {
 	const unsigned char *p = (unsigned char*)s;
 	while (n-- != 0) {
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/klibc.h ptlsim-asf/klibc.h
--- ptlsim-stable/klibc.h	2009-10-30 19:40:25.034182000 +0100
+++ ptlsim-asf/klibc.h	2010-03-02 12:00:55.307784000 +0100
@@ -54,5 +54,6 @@
 //
 
 void call_global_constuctors();
-
+#undef RAND_MAX
+#define RAND_MAX (32767)
 #endif // _BASELIBC_H
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/logic.h ptlsim-asf/logic.h
--- ptlsim-stable/logic.h	2009-10-30 19:40:25.039182000 +0100
+++ ptlsim-asf/logic.h	2010-03-02 12:00:55.330761000 +0100
@@ -871,6 +872,40 @@
     }
     return os;
   }
+
+  class iterator { // walks every (set, way) slot of the array, all ways of one set before the next set
+  private:
+    int s, w; // current set index and way index
+    Set *sets; // set array being walked; also part of the equality comparison
+    bool at_end() {return ((s == setcount) && (w == 0 )); } // the end position is encoded as (setcount, 0)
+
+  public:
+    V& operator*() { return sets[s][w]; } // no bounds check: must not be called on the end position
+    iterator& operator++() {
+      if (at_end()) return *this; // incrementing the end position is a no-op
+      ++w;
+      if (w == waycount) { ++s; w = 0; } // past the last way: move to the first way of the next set
+      return *this;
+    }
+    bool operator==(const iterator& other) const {
+      return ((sets == other.sets) && (s == other.s) && (w == other.w));
+    }
+    bool operator!=(const iterator& other) const {
+      return !(*this == other);
+    }
+    iterator& to_end() { // turns this iterator into the end position and returns it
+      s = setcount; w = 0;
+      return *this;
+    }
+    iterator(Set* sets_) : s(0), w(0), sets(sets_) {} // starts at set 0, way 0
+    ostream& print(ostream& os) const { // prints the current position as "Set: s/setcount Way: w/waycount"
+      os << "Set: ", s, "/", setcount, " Way: ", w, "/", waycount;
+      return os;
+    }
+  };
+  iterator begin() { return iterator(sets); } // position (0, 0)
+  iterator end() { return iterator(sets).to_end(); } // position (setcount, 0)
+
 };
 
 template <typename T, typename V, int size, int ways, int linesize>
@@ -1147,7 +1182,7 @@
 
   int unlock(T tag) {
     int way = tags.probe(tag);
-    if (way < 0) return;
+    if (way < 0) return -1;
     unlock_way(way);
     if (tags.islocked(way)) stats::unlocked(data[way], tags[way], way);
     return way;
@@ -1226,6 +1261,10 @@
 
   V* select_and_lock(T addr) { bool dummy; return select_and_lock(addr, dummy); }
 
+  V* unlock(T addr) { // forwards to the set chosen by setof(addr), keyed by tagof(addr)
+    return sets[setof(addr)].unlock(tagof(addr)); // NOTE(review): confirm Set::unlock() yields a V*, as this return type assumes
+  }
+
   ostream& print(ostream& os) const {
     os << "LockableAssociativeArray<", setcount, " sets, ", waycount, " ways, ", linesize, "-byte lines>:", endl;
     foreach (set, setcount) {
@@ -1241,6 +1280,136 @@
   return aa.print(os);
 }
 
+// Wraps associative container B so that probe/select/invalidate/reset can fire optional callbacks.
+template <class B, class T, class V> class NotifyAssociativeWrapper : private B {
+public:
+  typedef B base_t;
+  typedef typename base_t::Set Set;
+  typedef void (*LineNotify)(void*, V*, T); // (user data, affected line, its tag or address)
+  typedef void (*Notify)(void*);            // (user data)
+
+  class iterator { // thin forwarding wrapper around B::iterator
+  private:
+    typename B::iterator base_it;
+  public:
+    V& operator*() { return *base_it; }
+    iterator& operator++() { ++base_it; return *this;}
+    bool operator==(const iterator& other) const { return (base_it == other.base_it); }
+    bool operator!=(const iterator& other) const { return (base_it != other.base_it); }
+    iterator(typename B::iterator b) : base_it(b) {}
+    ostream& print(ostream& os) const { return base_it.print(os); }
+  };
+  iterator begin() { return iterator(base_t::begin()); }
+  iterator end() { return iterator(base_t::end()); }
+
+  class NotifyingSet { // proxy for one Set of the wrapped container; holds a copy of the callbacks
+  private:
+    Set &set;
+    void *notify_data;
+    LineNotify notify_evict, notify_inv, notify_probe;
+  public:
+    NotifyingSet(Set &s, void* d, LineNotify e, LineNotify i, LineNotify p) :
+      set(s), notify_data(d), notify_evict(e), notify_inv(i), notify_probe(p) {}
+
+    /* These functions cause notifications; a NULL callback is simply skipped */
+    int invalidate(T tag) {
+      V *l = set.probe(tag);
+      if (notify_inv && l) notify_inv(notify_data, l, tag); // only lines that are present are reported
+      return set.invalidate(tag);
+    }
+    V* probe(T tag)  {
+      V *l = set.probe(tag);
+      if (notify_probe && l) notify_probe(notify_data, l, tag);
+      return l;
+    }
+    V* select(T tag) {
+      T dummy;
+      return select(tag, dummy);
+    }
+    V* select(T tag, T &oldtag) {
+      V *l = set.select(tag, oldtag);
+      if (tag == oldtag) { if (notify_probe) notify_probe(notify_data, l, oldtag); } // old tag equals requested tag: report a probe
+      else               { if (notify_evict) notify_evict(notify_data, l, oldtag); } // otherwise report oldtag as evicted
+      return l;
+    }
+    // TODO: Add proxies for other functions used
+  };
+  /* This wrapper class allows direct access to the sets of the original
+   * container structure. By overloading the array subscript, it essentially
+   * adds the NotifyingSet class as an interposer. */
+  class NotifyingSets {
+  private:
+    Set *sets;
+    void* notify_data;
+    LineNotify notify_evict, notify_inv, notify_probe;
+  public:
+    NotifyingSets(Set *s, void* d, LineNotify e, LineNotify i, LineNotify p) :
+      sets(s), notify_data(d), notify_evict(e), notify_inv(i), notify_probe(p) {}
+
+    NotifyingSet operator[](int i) { // returns a temporary proxy by value, built from the current callbacks
+      return NotifyingSet(sets[i], notify_data, notify_evict, notify_inv, notify_probe);
+    }
+    void set_notifications(void* data, LineNotify evict_line, LineNotify invalidate_line,
+        LineNotify probe_line) {
+      notify_data  = data;
+      notify_evict = evict_line;
+      notify_inv   = invalidate_line;
+      notify_probe = probe_line;
+    }
+  };
+
+  NotifyingSets sets; // declared before the notify_* members below, hence constructed before them
+
+  NotifyAssociativeWrapper() :
+    sets(base_t::sets, NULL, NULL, NULL, NULL), // built first: must not read the still-uninitialized notify_* members
+    notify_data(NULL), notify_evict(NULL), notify_inv(NULL), notify_probe(NULL), notify_reset(NULL) {}
+  NotifyAssociativeWrapper(void* data, LineNotify evict_line, LineNotify invalidate_line,
+      LineNotify probe_line, Notify reset_cache) :
+    sets(base_t::sets, data, evict_line, invalidate_line, probe_line), // pass the arguments, not the members (see above)
+    notify_data(data), notify_evict(evict_line), notify_inv(invalidate_line),
+    notify_probe(probe_line), notify_reset(reset_cache) {}
+
+  void set_notifications(void* data, LineNotify evict_line, LineNotify invalidate_line,
+      LineNotify probe_line, Notify reset_cache) {
+    sets.set_notifications(data, evict_line, invalidate_line, probe_line); // keep the per-set proxy source in sync
+    notify_data  = data;
+    notify_evict = evict_line;
+    notify_inv   = invalidate_line;
+    notify_probe = probe_line;
+    notify_reset = reset_cache;
+  }
+  int setof(T addr) const { return base_t::setof(addr); }
+  T   tagof(T addr) const { return base_t::tagof(addr); }
+
+  /* These functions cause notifications; a NULL callback is simply skipped. */
+  V* select(T addr) {
+    T oldaddr;
+    V *l = base_t::select(addr, oldaddr);
+    if (addr == oldaddr) { if (notify_probe) notify_probe(notify_data, l, oldaddr); } // old address equals requested: report a probe
+    else                 { if (notify_evict) notify_evict(notify_data, l, oldaddr); } // otherwise report oldaddr as evicted
+    return l;
+  }
+  void invalidate(T addr) {
+    V *l = base_t::probe(addr);
+    if (notify_inv && l) notify_inv(notify_data, l, addr); // callback runs before the line is invalidated
+    base_t::invalidate(addr);
+  }
+  V* probe(T addr)  {
+    V *l = base_t::probe(addr);
+    if (notify_probe && l) notify_probe(notify_data, l, addr);
+    return l;
+  }
+  void reset() {
+    if (notify_reset) notify_reset(notify_data); // callback runs before the base container is reset
+    base_t::reset();
+  }
+  // TODO: Add proxies for other functions used
+private:
+  void*      notify_data;
+  LineNotify notify_evict, notify_inv, notify_probe;
+  Notify     notify_reset;
+};
+
 template <typename T, int setcount, int linesize>
 struct DefaultCacheIndexingFunction {
   static inline Waddr setof(T address) { return bits(address, log2(linesize), log2(setcount)); }
Only in ptlsim-asf/: ooocore-amd-barcelona-asf.h
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/ooocore-amd-k8.h ptlsim-asf/ooocore-amd-k8.h
--- ptlsim-stable/ooocore-amd-k8.h	2009-10-30 19:40:25.121179000 +0100
+++ ptlsim-asf/ooocore-amd-k8.h	2010-03-02 12:00:55.442649000 +0100
@@ -8,13 +8,6 @@
 // Copyright 2006-2008 Hui Zeng <hzeng@cs.binghamton.edu>
 //
 
-#ifndef _OOOCORE_H_
-#define _OOOCORE_H_
-
-// With these disabled, simulation is faster
-// #define ENABLE_CHECKS
-// #define ENABLE_LOGGING
-
 //
 // Enable SMT operation:
 //
@@ -205,10 +198,10 @@
     {OP_divs,          32, ALUC},
     {OP_rems,          32, ALUC},
     // Minimum and maximum
-    {OP_min,            1, ANYALU},
-    {OP_max,            1, ANYALU},
-    {OP_min_s,          1, ANYALU},
-    {OP_max_s,          1, ANYALU},
+    {OP_min,            1, ALU1|ALU2|ALU3},
+    {OP_max,            1, ALU1|ALU2|ALU3},
+    {OP_min_s,          1, ALU1|ALU2|ALU3},
+    {OP_max_s,          1, ALU1|ALU2|ALU3},
     // Floating point
     // uop.size bits have following meaning:
     // 00 = single precision, scalar (preserve high 32 bits of ra)
@@ -274,10 +267,10 @@
     {OP_vmull,          4, FMUL},
     {OP_vmulh,          4, FMUL},
     {OP_vmulhu,         4, FMUL},
-    {OP_vmaddp,         4, ANYFPU},
-    {OP_vsad,           4, ANYFPU},
-    {OP_vpack_us,       2, ANYFPU},
-    {OP_vpack_ss,       2, ANYFPU},
+    {OP_vmaddp,         4, FADD|FMUL},
+    {OP_vsad,           4, FADD|FMUL},
+    {OP_vpack_us,       2, FADD|FMUL},
+    {OP_vpack_ss,       2, FADD|FMUL},
   };
 
 #undef A
@@ -1629,7 +1622,7 @@
     void check_rob();
   };
 
-#define MAX_SMT_CORES 1
+#define MAX_SMT_CORES 32
 
   struct OutOfOrderMachine: public PTLsimMachine {
     OutOfOrderCore* cores[MAX_SMT_CORES];
@@ -2001,5 +1994,3 @@
     } cputime;
   } simulator;
 };
-
-#endif // _OOOCORE_H_
Only in ptlsim-asf/: ooocore-generic.h
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/ooocore.cpp ptlsim-asf/ooocore.cpp
--- ptlsim-stable/ooocore.cpp	2009-10-30 19:40:25.124180000 +0100
+++ ptlsim-asf/ooocore.cpp	2010-03-02 12:00:55.490603000 +0100
@@ -5,6 +5,8 @@
 //
 // Copyright 2003-2008 Matt T. Yourst <yourst@yourst.com>
 // Copyright 2006-2008 Hui Zeng <hzeng@cs.binghamton.edu>
+// Copyright (c) 2007-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -97,6 +99,7 @@
   current_basic_block = null;
   current_basic_block_transop_index = -1;
   stall_frontend = false;
+  stall_on_eom    = false;
   waiting_for_icache_fill = false;
   waiting_for_icache_fill_physaddr = 0;
   fetch_uuid = 0;
@@ -275,1388 +278,1508 @@
     
     return os;
   }
-};
-
-ostream& RegisterRenameTable::print(ostream& os) const {
-  foreach (i, TRANSREG_COUNT) {
-    if ((i % 8) == 0) os << " ";
-    os << " ", padstring(arch_reg_names[i], -6), " r", intstring((*this)[i]->index(), -3), " | ";
-    if (((i % 8) == 7) || (i == TRANSREG_COUNT-1)) os << endl;
-  }
-  return os;
-}
-
-//
-// Get the thread priority, with lower numbers receiving higher priority.
-// This is used to regulate the order in which fetch, rename, frontend
-// and dispatch slots are filled in each cycle.
-//
-// The well known ICOUNT algorithm adds up the number of uops in
-// the frontend pipeline stages and gives highest priority to
-// the thread with the lowest number, since this thread is moving
-// uops through very quickly and can make more progress.
-//
-int ThreadContext::get_priority() const {
-  int priority =
-    fetchq.count +
-    rob_frontend_list.count +
-    rob_ready_to_dispatch_list.count;
-
-  for_each_cluster (cluster) {
-    priority +=
-      rob_dispatched_list[cluster].count +
-      rob_ready_to_issue_list[cluster].count +
-      rob_ready_to_store_list[cluster].count +
-      rob_ready_to_load_list[cluster].count;
-  }
-
-  return priority;
-}
 
-//
-// Execute one cycle of the entire core state machine
-//
-bool OutOfOrderCore::runcycle() {
-  bool exiting = 0;
-  //
-  // Detect edge triggered transition from 0->1 for
-  // pending interrupt events, then wait for current
-  // x86 insn EOM uop to commit before redirecting
-  // to the interrupt handler.
-  //
+  template <class T>
+  bool GenericEventLog<T>::init(size_t bufsize) {
+    reset();
+    size_t bytes = bufsize * sizeof(T);
+    start = (T*)ptl_mm_alloc_private_pages(bytes);
+    if unlikely (!start) return false;
+    end = start + bufsize;
+    tail = start;
 
-#ifdef PTLSIM_HYPERVISOR
-  foreach (i, threadcount) {
-    ThreadContext* thread = threads[i];
-    bool current_interrupts_pending = thread->ctx.check_events();
-    bool edge_triggered = ((!thread->prev_interrupts_pending) & current_interrupts_pending);
-    thread->handle_interrupt_at_next_eom |= edge_triggered;
-    thread->prev_interrupts_pending = current_interrupts_pending;
+    foreach (i, bufsize) start[i].type = EVENT_INVALID;
+    return true;
   }
-#endif
-
-  //
-  // Compute reserved issue queue entries to avoid starvation:
-  //
-#ifdef ENABLE_CHECKS
-  int total_issueq_count = 0;
-  int total_issueq_reserved_free = 0;
 
-  foreach (i, MAX_THREADS_PER_CORE) {
-    ThreadContext* thread = threads[i];
+  template <class T>
+  void GenericEventLog<T>::reset() {
+    if (!start) return;
 
-    if unlikely (!thread) {
-      total_issueq_reserved_free += reserved_iq_entries;
-    } else {
-      total_issueq_count += thread->issueq_count;
-      if(thread->issueq_count < reserved_iq_entries){
-        total_issueq_reserved_free += reserved_iq_entries - thread->issueq_count;
-      }
-    }
+    size_t bytes = (end - start) * sizeof(T);
+    ptl_mm_free_private_pages(start, bytes);
+    start = null;
+    end = null;
+    tail = null;
   }
 
-  // assert (total_issueq_count == issueq_all.count);
-  // assert((ISSUE_QUEUE_SIZE - issueq_all.count) == (issueq_all.shared_entries + total_issueq_reserved_free));
-#endif
-
-  foreach (i, threadcount) threads[i]->loads_in_this_cycle = 0;
-
-  fu_avail = bitmask(FU_COUNT);
-  caches.clock();
+  template <>
+  ostream& EventLog::print(ostream& os, bool only_to_tail) {
+    if (tail >= end) tail = start;
+    if (tail < start) tail = end;
 
-  //
-  // Backend and issue pipe stages run with round robin priority
-  //
-  int commitrc[MAX_THREADS_PER_CORE];
-  commitcount = 0;
-  writecount = 0;
+    OutOfOrderCoreEvent* p = (only_to_tail) ? start : tail;
 
-  foreach (permute, threadcount) {
-    int tid = add_index_modulo(round_robin_tid, +permute, threadcount);
-    ThreadContext* thread = threads[tid];
-    if unlikely (!thread->ctx.running) continue;
+    W64 cycle = limits<W64>::max;
+    size_t bufsize = end - start;
 
-    commitrc[tid] = thread->commit();
-    for_each_cluster(j) thread->writeback(j);
-    for_each_cluster(j) thread->transfer(j);
-  }
+    if (!config.flush_event_log_every_cycle) os << "#-------- Start of event log --------", endl;
 
-  //
-  // Clock the TLB miss page table walk state machine
-  // This may use up load ports, so do it before other
-  // loads can issue 
-  //
-#ifdef PTLSIM_HYPERVISOR
-  foreach (i, threadcount) {
-    threads[i]->tlbwalk();
+    foreach (i, (only_to_tail ? (tail - start) : bufsize)) {
+      if unlikely (p >= end) p = start;
+      if unlikely (p < start) p = end-1;
+      if unlikely (p->type == EVENT_INVALID) {
+        p++;
+        continue;
   }
-#endif
-
-  //
-  // Issue whatever is ready
-  //
-  for_each_cluster(i) { issue(i); }
-
-  //
-  // Most of the frontend (except fetch!) also works with round robin priority
-  //
-  int dispatchrc[MAX_THREADS_PER_CORE];
-  dispatchcount = 0;
-  foreach (permute, threadcount) {
-    int tid = add_index_modulo(round_robin_tid, +permute, threadcount);
-    ThreadContext* thread = threads[tid];
-    if unlikely (!thread->ctx.running) continue;
-
-    for_each_cluster(j) { thread->complete(j); }
-
-    dispatchrc[tid] = thread->dispatch();
 
-    if likely (dispatchrc[tid] >= 0) {
-      thread->frontend();
-      thread->rename();
-    }
+      if unlikely (p->cycle != cycle) {
+        cycle = p->cycle;
+        os << "[core ", coreid, "] Cycle ", cycle, ":", endl;
   }
 
-  //
-  // Compute fetch priorities (default is ICOUNT algorithm)
-  //
-  // This means we sort in ascending order, with any unused threads
-  // (if any) given the lowest priority.
-  //
-
-  int priority_value[MAX_THREADS_PER_CORE];
-  int priority_index[MAX_THREADS_PER_CORE];
-
-  if likely (threadcount == 1) {
-    priority_value[0] = 0;
-    priority_index[0] = 0;
-  } else {
-    foreach (i, threadcount) {
-      priority_index[i] = i;
-      ThreadContext* thread = threads[i];
-      priority_value[i] = thread->get_priority();
-      if unlikely (!thread->ctx.running) priority_value[i] = limits<int>::max;
+      p->print(os);
+      p++;
     }
     
-    sort(priority_index, threadcount, SortPrecomputedIndexListComparator<int, false>(priority_value));
-  }
+    if (!config.flush_event_log_every_cycle) os << "#-------- End of event log --------", endl;
 
-  //
-  // Fetch in thread priority order
-  //
-  // NOTE: True ICOUNT only fetches the highest priority
-  // thread per cycle, since there is usually only one
-  // instruction cache port. In a banked i-cache, we can
-  // fetch from multiple threads every cycle.
-  //
-  foreach (j, threadcount) {
-    int i = priority_index[j];
-    ThreadContext* thread = threads[i];
-    assert(thread);
-    if unlikely (!thread->ctx.running) {
-      continue;
+    return os;
     }
 
-    if likely (dispatchrc[i] >= 0) {
-      thread->fetch();
-    }
+  template <>
+  void EventLog::flush(bool only_to_tail) {
+    if likely (!logable(6)) return;
+    if unlikely (!logfile) return;
+    if unlikely (!logfile->ok()) return;
+    print(*logfile, only_to_tail);
+    tail = start;
   }
 
-  //
-  // Always clock the issue queues: they're independent of all threads
-  //
-  foreach_issueq(clock());
+  ostream& OutOfOrderCoreEvent::print(ostream& os) const {
+    bool ld = isload(uop.opcode);
+    bool st = isstore(uop.opcode);
+    bool br = isbranch(uop.opcode);
+    W32 exception = LO32(commit.state.reg.rddata);
+    W32 error_code = HI32(commit.state.reg.rddata);
 
-  //
-  // Advance the round robin priority index
-  //
-  round_robin_tid = add_index_modulo(round_robin_tid, +1, threadcount);
+    stringbuf uopname;
+    nameof(uopname, uop);
 
+    os << intstring(uuid, 20), " t", threadid, " ";
+    switch (type) {
   //
-  // Flush event log ring buffer
+      // Fetch Events
   //
-  if unlikely (config.event_log_enabled) {
-    // logfile << "[cycle ", sim_cycle, "] Miss buffer contents:", endl;
-    // logfile << caches.missbuf;
-    if unlikely (config.flush_event_log_every_cycle) {
-      eventlog.flush(true);
+    case EVENT_FETCH_STALLED:
+      os <<  "fetch  frontend stalled"; break;
+    case EVENT_FETCH_ICACHE_WAIT:
+      os <<  "fetch  rip ", rip, ": wait for icache fill"; break;
+    case EVENT_FETCH_FETCHQ_FULL:
+      os <<  "fetch  rip ", rip, ": fetchq full"; break;
+    case EVENT_FETCH_IQ_QUOTA_FULL:
+      os <<  "fetch  rip ", rip, ": issue queue quota full = ", issueq_count, " "; break;
+    case EVENT_FETCH_BOGUS_RIP:
+      os <<  "fetch  rip ", rip, ": bogus RIP or decode failed"; break;
+    case EVENT_FETCH_ICACHE_MISS:
+      os <<  "fetch  rip ", rip, ": wait for icache fill of phys ", (void*)(Waddr)((rip.mfnlo << 12) + lowbits(rip.rip, 12)), " on missbuf ", fetch.missbuf; break;
+    case EVENT_FETCH_SPLIT:
+      os <<  "fetch  rip ", rip, ": split unaligned load or store ", uop; break;
+    case EVENT_FETCH_ASSIST:
+      os <<  "fetch  rip ", rip, ": branch into assist microcode: ", uop; break;
+    case EVENT_FETCH_TRANSLATE:
+      os <<  "xlate  rip ", rip, ": ", fetch.bb_uop_count, " uops"; break;
+    case EVENT_FETCH_OK: {
+      os <<  "fetch  rip ", rip, ": ", uop,
+        " (uopid ", uop.bbindex;
+      if (uop.som) os << "; SOM";
+      if (uop.eom) os << "; EOM ", uop.bytes, " bytes";
+      os << ")";
+      if (uop.eom && fetch.predrip) os << " -> pred ", (void*)fetch.predrip;
+      if (isload(uop.opcode) | isstore(uop.opcode)) {
+        os << "; unaligned pred slot ", OutOfOrderCore::hash_unaligned_predictor_slot(rip), " -> ", uop.unaligned;
     }
+      break;
   }
-
-#ifdef ENABLE_CHECKS
-  // This significantly slows down simulation; only enable it if absolutely needed:
-  // check_refcounts();
-#endif
-
-  foreach (i, threadcount) {
-    ThreadContext* thread = threads[i];
-    if unlikely (!thread->ctx.running) continue;
-    int rc = commitrc[i];
-    if likely ((rc == COMMIT_RESULT_OK) | (rc == COMMIT_RESULT_NONE)) continue;
-
-    switch (rc) {
-    case COMMIT_RESULT_SMC: {
-      if (logable(3)) logfile << "Potentially cross-modifying SMC detected: global flush required (cycle ", sim_cycle, ", ", total_user_insns_committed, " commits)", endl, flush;
-      //
-      // DO NOT GLOBALLY FLUSH! It will cut off the other thread(s) in the
-      // middle of their currently committing x86 instruction, causing massive
-      // internal corruption on any VCPUs that happen to be straddling the
-      // instruction boundary.
       //
-      // BAD: machine.flush_all_pipelines();
-      //
-      // This is a temporary fix: in the *extremely* rare case where both
-      // threads have the same basic block in their pipelines and that
-      // BB is being invalidated, the BB cache will forbid us from
-      // freeing it (and will print a warning to that effect).
-      //
-      // I'm working on a solution to this, to put some BBs on an
-      // "invisible" list, where they cannot be looked up anymore,
-      // but their memory is not freed until the lock is released.
+      // Rename Events
       //
-      foreach (i, threadcount) {
-        ThreadContext* t = threads[i];
-        if unlikely (!t) continue;
-        if (logable(3)) {
-          logfile << "  [vcpu ", i, "] current_basic_block = ", t->current_basic_block;  ": ";
-          if (t->current_basic_block) logfile << t->current_basic_block->rip;
-          logfile << endl;
-        }
+    case EVENT_RENAME_FETCHQ_EMPTY:
+      os << "rename fetchq empty"; break;
+    case EVENT_RENAME_ROB_FULL:
+      os <<  "rename ROB full"; break;
+    case EVENT_RENAME_PHYSREGS_FULL:
+      os <<  "rename physical register file full"; break;
+    case EVENT_RENAME_LDQ_FULL:
+      os <<  "rename load queue full"; break;
+    case EVENT_RENAME_STQ_FULL:
+      os <<  "rename store queue full"; break;
+    case EVENT_RENAME_MEMQ_FULL:
+      os <<  "rename memory queue full"; break;
+    case EVENT_RENAME_OK: {
+      os <<  "rename rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " r", intstring(physreg, -3), "@", phys_reg_file_names[rfid];
+      if (ld|st) os << " lsq", lsq;
+      os << " = ";
+      foreach (i, MAX_OPERANDS) os << rename.opinfo[i], ((i < MAX_OPERANDS-1) ? " " : "");
+      os << "; renamed";
+      os << " ", arch_reg_names[uop.rd], " (old r", rename.oldphys, ")";
+      if unlikely (!uop.nouserflags) {
+        if likely (uop.setflags & SETFLAG_ZF) os << " zf (old r", rename.oldzf, ")";
+        if likely (uop.setflags & SETFLAG_CF) os << " cf (old r", rename.oldcf, ")";
+        if likely (uop.setflags & SETFLAG_OF) os << " of (old r", rename.oldof, ")";
       }
-
-      thread->flush_pipeline();
-      thread->invalidate_smc();
       break;
     }
-    case COMMIT_RESULT_EXCEPTION: {
-      exiting = !thread->handle_exception();
+    case EVENT_FRONTEND:
+      os <<  "front  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " frontend stage ", (FRONTEND_STAGES - frontend.cycles_left), " of ", FRONTEND_STAGES;
       break;
-    }
-    case COMMIT_RESULT_BARRIER: {
-      exiting = !thread->handle_barrier();
+    case EVENT_CLUSTER_NO_CLUSTER:
+    case EVENT_CLUSTER_OK: {
+      os << ((type == EVENT_CLUSTER_OK) ? "clustr" : "noclus"), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " allowed FUs = ",
+        bitstring(fuinfo[uop.opcode].fu, FU_COUNT, true), " -> clusters ",
+        bitstring(select_cluster.allowed_clusters, MAX_CLUSTERS, true), " avail";
+      foreach (i, MAX_CLUSTERS) os << " ", select_cluster.iq_avail[i];
+      os << "-> ";
+      if (type == EVENT_CLUSTER_OK) os << "cluster ", clusters[cluster].name; else os << "-> none"; break;
       break;
     }
-    case COMMIT_RESULT_INTERRUPT: {
-      thread->handle_interrupt();
+    case EVENT_DISPATCH_NO_CLUSTER:
+    case EVENT_DISPATCH_OK: {
+      os << ((type == EVENT_DISPATCH_OK) ? "disptc" : "nodisp"),  " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " operands ";
+      foreach (i, MAX_OPERANDS) os << dispatch.opinfo[i], ((i < MAX_OPERANDS-1) ? " " : "");
+      if (type == EVENT_DISPATCH_OK) os << " -> cluster ", clusters[cluster].name; else os << " -> none";
       break;
     }
-    case COMMIT_RESULT_STOP: {
-      thread->flush_pipeline();
-      thread->stall_frontend = 1;
-      machine.stopped[thread->ctx.vcpuid] = 1;
-      // Wait for other cores to sync up, so don't exit right away
+    case EVENT_ISSUE_NO_FU: {
+      os << "issue  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")";
+      os << "no FUs available in cluster ", clusters[cluster].name, ": ",
+        "fu_avail = ", bitstring(issue.fu_avail, FU_COUNT, true), ", ",
+        "op_fu = ", bitstring(fuinfo[uop.opcode].fu, FU_COUNT, true), ", "
+        "fu_cl_mask = ", bitstring(clusters[cluster].fu_mask, FU_COUNT, true);
       break;
     }
+    case EVENT_ISSUE_OK: {
+      stringbuf sb;
+      sb << "issue  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")";
+      sb << " on ", padstring(fu_names[fu], -4), " in ", padstring(cluster_names[cluster], -4), ": r", intstring(physreg, -3), "@", phys_reg_file_names[rfid];
+      sb << " "; print_value_and_flags(sb, issue.state.reg.rddata, issue.state.reg.rdflags); sb << " =";
+      sb << " "; print_value_and_flags(sb, issue.operand_data[RA], issue.operand_flags[RA]); sb << ", ";
+      sb << " "; print_value_and_flags(sb, issue.operand_data[RB], issue.operand_flags[RB]); sb << ", ";
+      sb << " "; print_value_and_flags(sb, issue.operand_data[RC], issue.operand_flags[RC]);
+      sb << " (", issue.cycles_left, " cycles left)";
+      if (issue.mispredicted) sb << "; mispredicted (real ", (void*)(Waddr)issue.state.reg.rddata, " vs expected ", (void*)(Waddr)issue.predrip, ")";
+      os << sb;
+      break;
     }
+    case EVENT_REPLAY: {
+      os << "replay rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " r", intstring(physreg, -3), "@", phys_reg_file_names[rfid],
+        " on cluster ", clusters[cluster].name, ": waiting on";
+      foreach (i, MAX_OPERANDS) {
+        if (!bit(replay.ready, i)) os << " ", replay.opinfo[i];
   }
-
-#ifdef PTLSIM_HYPERVISOR
-  if unlikely (vcpu_online_map_changed) {
-    vcpu_online_map_changed = 0;
-    foreach (i, contextcount) {
-      Context& vctx = contextof(i);
-      if likely (!vctx.dirty) continue;
-      //
-      // The VCPU is coming up for the first time after booting or being
-      // taken offline by the user.
-      //
-      // Force the active core model to flush any cached (uninitialized)
-      // internal state (like register file copies) it might have, since
-      // it did not know anything about this VCPU prior to now: if it
-      // suddenly gets marked as running without this, the core model
-      // will try to execute from bogus state data.
-      //
-      logfile << "VCPU ", vctx.vcpuid, " context was dirty: update core model internal state", endl;
-
-      ThreadContext* tc = threads[vctx.vcpuid];
-      assert(tc);
-      assert(&tc->ctx == &vctx);
-      tc->flush_pipeline();
-      vctx.dirty = 0;
+      break;
     }
+    case EVENT_STORE_WAIT: {
+      os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+      os << "wait on ";
+      if (!loadstore.rcready) os << " rc";
+      if (loadstore.inherit_sfr_used) {
+        os << ((loadstore.rcready) ? "" : " and "), loadstore.inherit_sfr,
+          " (uuid ", loadstore.inherit_sfr_uuid, ", stq ", loadstore.inherit_sfr_lsq,
+          ", rob ", loadstore.inherit_sfr_rob, ", r", loadstore.inherit_sfr_physreg, ")";
   }
-#endif
-
-  foreach (i, threadcount) {
-    ThreadContext* thread = threads[i];
-    if unlikely (!thread->ctx.running) break;
-
-    if unlikely ((sim_cycle - thread->last_commit_at_cycle) > 4096) {
-      stringbuf sb;
-      sb << "[vcpu ", thread->ctx.vcpuid, "] thread ", thread->threadid, ": WARNING: At cycle ",
-        sim_cycle, ", ", total_user_insns_committed,  " user commits: no instructions have committed for ",
-        (sim_cycle - thread->last_commit_at_cycle), " cycles; the pipeline could be deadlocked", endl;
-      logfile << sb, flush;
-      cerr << sb, flush;
-      exiting = 1;
+      break;
     }
+    case EVENT_STORE_PARALLEL_FORWARDING_MATCH: {
+      os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+      os << "ignored parallel forwarding match with ldq ", loadstore.inherit_sfr_lsq,
+        " (uuid ", loadstore.inherit_sfr_uuid, " rob", loadstore.inherit_sfr_rob,
+        " r", loadstore.inherit_sfr_physreg, ")";
+      break;
   }
-
-  return exiting;
+    case EVENT_STORE_ALIASED_LOAD: {
+      os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+      os << "aliased with ldbuf ", loadstore.inherit_sfr_lsq, " (uuid ", loadstore.inherit_sfr_uuid,
+        " rob", loadstore.inherit_sfr_rob, " r", loadstore.inherit_sfr_physreg, ");",
+        " (add colliding load rip ", (void*)(Waddr)loadstore.inherit_sfr_rip, "; replay from rip ", rip, ")";
+      break;
 }
-
-//
-// ReorderBufferEntry
-//
-void ReorderBufferEntry::init(int idx) {
-  this->idx = idx;
-  entry_valid = 0;
-  selfqueuelink::reset();
-  current_state_list = null;
-  reset();
+    case EVENT_STORE_ISSUED: {
+      os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+      if (loadstore.inherit_sfr_used) {
+        os << "inherit from ", loadstore.inherit_sfr, " (uuid ", loadstore.inherit_sfr_uuid,
+          ", rob", loadstore.inherit_sfr_rob, ", lsq ", loadstore.inherit_sfr_lsq,
+          ", r", loadstore.inherit_sfr_physreg, ");";
 }
-
-//
-// Clean out various fields from the ROB entry that are 
-// expected to be zero when allocating a new ROB entry.
-//
-void ReorderBufferEntry::reset() {
-  int latency, operand;
-  // Deallocate ROB entry
-  entry_valid = false;
-  cycles_left = 0;
-  physreg = (PhysicalRegister*)null;
-  lfrqslot = -1;
-  lsq = 0;
-  load_store_second_phase = 0;
-  lock_acquired = 0;
-  consumer_count = 0;
-  executable_on_cluster_mask = 0;
-  pteupdate = 0;
-  cluster = -1;
-#ifdef ENABLE_TRANSIENT_VALUE_TRACKING
-  dest_renamed_before_writeback = 0;
-  no_branches_between_renamings = 0;
-#endif
-  issued = 0;
+      os << " <= ", hexstring(loadstore.data_to_store, 8*(1<<uop.size)), " = ", loadstore.sfr;
+      break;
 }
-
-bool ReorderBufferEntry::ready_to_issue() const {
-  bool raready = operands[0]->ready();
-  bool rbready = operands[1]->ready();
-  bool rcready = operands[2]->ready();
-  bool rsready = operands[3]->ready();
-  
-  if (isstore(uop.opcode)) {
-    return (load_store_second_phase) ? (raready & rbready & rcready & rsready) : (raready & rbready);
-  } else if (isload(uop.opcode)) {
-    return (load_store_second_phase) ? (raready & rbready & rcready & rsready) : (raready & rbready & rcready);
-  } else {
-    return (raready & rbready & rcready & rsready);
+    case EVENT_STORE_LOCK_RELEASED: {
+      os << "lk-rel", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
+        "lock released (original ld.acq uuid ", loadstore.locking_uuid, " rob ", loadstore.locking_rob, " on vcpu ", loadstore.locking_vcpuid, ")";
+      break;
   }
+    case EVENT_STORE_LOCK_ANNULLED: {
+      os << "lk-anl", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
+        "lock annulled (original ld.acq uuid ", loadstore.locking_uuid, " rob ", loadstore.locking_rob, " on vcpu ", loadstore.locking_vcpuid, ")";
+      break;
 }
-
-bool ReorderBufferEntry::ready_to_commit() const {
-  return (current_state_list == &getthread().rob_ready_to_commit_queue);
+    case EVENT_STORE_LOCK_REPLAY: {
+      os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
+        "replay because vcpuid ", loadstore.locking_vcpuid, " uop uuid ", loadstore.locking_uuid, " has lock";
+      break;
 }
 
-StateList& ReorderBufferEntry::get_ready_to_issue_list() const {
-  OutOfOrderCore& core = getcore();
-  ThreadContext& thread = getthread();
-  return 
-    isload(uop.opcode) ? thread.rob_ready_to_load_list[cluster] :
-    isstore(uop.opcode) ? thread.rob_ready_to_store_list[cluster] :
-    thread.rob_ready_to_issue_list[cluster];
+    case EVENT_LOAD_WAIT: {
+      os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+      os << "wait on sfr ", loadstore.inherit_sfr,
+        " (uuid ", loadstore.inherit_sfr_uuid, ", stq ", loadstore.inherit_sfr_lsq,
+        ", rob ", loadstore.inherit_sfr_rob, ", r", loadstore.inherit_sfr_physreg, ")";
+      if (loadstore.predicted_alias) os << "; stalled by predicted aliasing";
+      break;
 }
+    case EVENT_LOAD_HIT:
+    case EVENT_LOAD_MISS: {
+      if (type == EVENT_LOAD_HIT)
+        os << (loadstore.load_store_second_phase ? "load2 " : "load  ");
+      else os << (loadstore.load_store_second_phase ? "ldmis2" : "ldmiss");
 
-//
-// Reorder Buffer
-//
-stringbuf& ReorderBufferEntry::get_operand_info(stringbuf& sb, int operand) const {
-  PhysicalRegister& physreg = *operands[operand];
-  ReorderBufferEntry& sourcerob = *physreg.rob;
-
-  sb << "r", physreg.index();
-  if (PHYS_REG_FILE_COUNT > 1) sb << "@", getcore().physregfiles[physreg.rfid].name;
-
-  switch (physreg.state) {
-  case PHYSREG_WRITTEN:
-    sb << " (written)"; break;
-  case PHYSREG_BYPASS:
-    sb << " (ready)"; break;
-  case PHYSREG_WAITING:
-    sb << " (wait rob ", sourcerob.index(), " uuid ", sourcerob.uop.uuid, ")"; break;
-  case PHYSREG_ARCH: break;
-    if (physreg.index() == PHYS_REG_NULL)  sb << " (zero)"; else sb << " (arch ", arch_reg_names[physreg.archreg], ")"; break;
-  case PHYSREG_PENDINGFREE:
-    sb << " (pending free for ", arch_reg_names[physreg.archreg], ")"; break;
-  default:
-    // Cannot be in free state!
-    sb << " (FREE)"; break;
+      os << " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+      if (loadstore.inherit_sfr_used) {
+        os << "inherit from ", loadstore.inherit_sfr, " (uuid ", loadstore.inherit_sfr_uuid,
+          ", rob", loadstore.inherit_sfr_rob, ", lsq ", loadstore.inherit_sfr_lsq,
+          ", r", loadstore.inherit_sfr_physreg, "); ";
   }
-
-  return sb;
+      if (type == EVENT_LOAD_HIT)
+        os << "hit L1: value 0x", hexstring(loadstore.sfr.data, 64);
+      else os << "missed L1 (lfrqslot ", lfrqslot, ") [value would be 0x", hexstring(loadstore.sfr.data, 64), "]";
+      break;
 }
-
-ThreadContext& ReorderBufferEntry::getthread() const { return *getcore().threads[threadid]; }
-
-issueq_tag_t ReorderBufferEntry::get_tag() {
-  int mask = ((1 << MAX_THREADS_BIT) - 1) << MAX_ROB_IDX_BIT;
-  if (logable(100)) logfile << " get_tag() thread ", (void*) threadid, " rob idx ", (void*)idx, " mask ", (void*)mask, endl;
-
-  assert(!(idx & mask)); 
-  assert(!(threadid >> MAX_THREADS_BIT));
-  //  int threadid = 1;  
-  issueq_tag_t rc = (idx | (threadid << MAX_ROB_IDX_BIT));
-  if (logable(100)) logfile <<  " tag ", (void*) rc, endl;
-  return rc;
+    case EVENT_LOAD_BANK_CONFLICT: {
+      os << "ldbank", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
+        "L1 bank conflict over bank ", lowbits(loadstore.sfr.physaddr, log2(CacheSubsystem::L1_DCACHE_BANKS));
+      break;
 }
-
-ostream& ReorderBufferEntry::print_operand_info(ostream& os, int operand) const {
-  stringbuf sb;
-  get_operand_info(sb, operand);
-  os << sb;
-  return os;
+    case EVENT_LOAD_TLB_MISS: {
+      os << (loadstore.load_store_second_phase ? "ldtlb2" : "ldtlb ");
+      os << " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+      if (loadstore.inherit_sfr_used) {
+        os << "inherit from ", loadstore.inherit_sfr, " (uuid ", loadstore.inherit_sfr_uuid,
+          ", rob", loadstore.inherit_sfr_rob, ", lsq ", loadstore.inherit_sfr_lsq,
+          ", r", loadstore.inherit_sfr_physreg, "); ";
 }
-
-ostream& ReorderBufferEntry::print(ostream& os) const {
-  stringbuf name, rainfo, rbinfo, rcinfo;
-  nameof(name, uop);
-  get_operand_info(rainfo, 0);
-  get_operand_info(rbinfo, 1);
-  get_operand_info(rcinfo, 2);
-
-  os << "rob ", intstring(index(), -3), " uuid ", intstring(uop.uuid, 16), " rip 0x", hexstring(uop.rip, 48), " ",
-    padstring(current_state_list->name, -24), " ", (uop.som ? "SOM" : "   "), " ", (uop.eom ? "EOM" : "   "), 
-    " @ ", padstring((cluster >= 0) ? clusters[cluster].name : "???", -4), " ",
-    padstring(name, -12), " r", intstring(physreg->index(), -3), " ", padstring(arch_reg_names[uop.rd], -6);
-  if (isload(uop.opcode)) 
-    os << " ld", intstring(lsq->index(), -3);
-  else if (isstore(uop.opcode))
-    os << " st", intstring(lsq->index(), -3);
-  else os << "      ";
-
-  os << " = ";
-  os << padstring(rainfo, -30);
-  os << padstring(rbinfo, -30);
-  os << padstring(rcinfo, -30);
-
-  return os;
+      else os << "DTLB miss", " [value would be 0x", hexstring(loadstore.sfr.data, 64), "]";
+      break;
 }
-
-void ThreadContext::print_rob(ostream& os) {
-  os << "ROB head ", ROB.head, " to tail ", ROB.tail, " (", ROB.count, " entries):", endl;
-  foreach_forward(ROB, i) {
-    ReorderBufferEntry& rob = ROB[i];
-    os << "  ", rob, endl;
+    case EVENT_LOAD_LOCK_REPLAY: {
+      os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
+        "replay because vcpuid ", loadstore.locking_vcpuid, " uop uuid ", loadstore.locking_uuid, " has lock";
+      break;
   }
+    case EVENT_LOAD_LOCK_OVERFLOW: {
+      os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
+        "replay because locking required but no free interlock buffers", endl;
+      break;
 }
-
-void ThreadContext::print_lsq(ostream& os) {
-  os << "LSQ head ", LSQ.head, " to tail ", LSQ.tail, " (", LSQ.count, " entries):", endl;
-  foreach_forward(LSQ, i) {
-    LoadStoreQueueEntry& lsq = LSQ[i];
-    os << "  ", lsq, endl;
+    case EVENT_LOAD_LOCK_ACQUIRED: {
+      os << "lk-acq", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
+        "lock acquired";
+      break;
   }
+    case EVENT_LOAD_LFRQ_FULL:
+      os << "load   rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), ": LFRQ or miss buffer full; replaying"; break;
+    case EVENT_LOAD_HIGH_ANNULLED: {
+      os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+      os << "load was annulled (high unaligned load)";
+      break;
 }
-
-void ThreadContext::print_rename_tables(ostream& os) {
-  os << "SpecRRT:", endl;
-  os << specrrt;
-  os << "CommitRRT:", endl;
-  os << commitrrt;
+    case EVENT_LOAD_WAKEUP:
+      os << "ldwake rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " wakeup load via lfrq slot ", lfrqslot; break;
+    case EVENT_TLBWALK_HIT: {
+      os << "wlkhit rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
+        loadstore.tlb_walk_level, "): hit for PTE at phys ", (void*)loadstore.virtaddr; break;
+      break;
 }
-
-void OutOfOrderCore::print_smt_state(ostream& os) {
-  os << "Print SMT statistics:", endl;
-
-  foreach (i, threadcount) {
-    ThreadContext* thread = threads[i];
-    os << "Thread ", i, ":", endl,
-      "  total_uops_committed ", thread->total_uops_committed, endl,
-      "  uipc ", double(thread->total_uops_committed) / double(iterations), endl,
-      "  total_insns_committed ",  thread->total_insns_committed,
-      "  ipc ", double(thread->total_insns_committed) / double(iterations), endl;
+    case EVENT_TLBWALK_MISS: {
+      os << "wlkmis rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
+        loadstore.tlb_walk_level, "): miss for PTE at phys ", (void*)loadstore.virtaddr, ": lfrq ", lfrqslot; break;
+      break;
   }
+    case EVENT_TLBWALK_WAKEUP: {
+      os << "wlkwak rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
+        loadstore.tlb_walk_level, "): wakeup from cache miss for phys ", (void*)loadstore.virtaddr, ": lfrq ", lfrqslot; break;
+      break;
 }
-
-void ThreadContext::dump_smt_state(ostream& os) {
-  os << "SMT per-thread state for t", threadid, ":", endl;
-
-  print_rename_tables(os);
-  print_rob(os);
-  print_lsq(os);
-  os << flush;
+    case EVENT_TLBWALK_NO_LFRQ_MB: {
+      os << "wlknml rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
+        loadstore.tlb_walk_level, "): no LFRQ or MB for PTE at phys ", (void*)loadstore.virtaddr, ": lfrq ", lfrqslot; break;
+      break;
+    }
+    case EVENT_TLBWALK_COMPLETE: {
+      os << "wlkhit rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
+        loadstore.tlb_walk_level, "): complete!"; break;
+      break;
+    }
+    case EVENT_LOAD_EXCEPTION: {
+      os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, ": exception ", exception_name(exception), ", pfec ", PageFaultErrorCode(error_code);
+      break;
+    }
+    case EVENT_STORE_EXCEPTION: {
+      os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+        " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+        (void*)(Waddr)loadstore.virtaddr, ": exception ", exception_name(exception), ", pfec ", PageFaultErrorCode(error_code);
+      break;
+    }
+    case EVENT_ALIGNMENT_FIXUP:
+      os << "algnfx", " rip ", rip, ": set unaligned bit for uop ", uop.bbindex, " (unaligned predictor slot ", OutOfOrderCore::hash_unaligned_predictor_slot(rip), ") and refetch"; break;
+    case EVENT_FENCE_ISSUED:
+      os << "mfence rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " lsq ", lsq, " r", intstring(physreg, -3), ": memory fence (", uop, ")"; break;
+    case EVENT_ANNUL_NO_FUTURE_UOPS:
+      os << "misspc rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": SOM rob ", annul.somidx, ", EOM rob ", annul.eomidx, ": no future uops to annul"; break;
+    case EVENT_ANNUL_MISSPECULATION: {
+      os << "misspc rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": SOM rob ", annul.somidx,
+        ", EOM rob ", annul.eomidx, ": annul from rob ", annul.startidx, " to rob ", annul.endidx;
+      break;
+    }
+    case EVENT_ANNUL_EACH_ROB: {
+      os << "annul  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": annul rip ", rip;
+      os << (uop.som ? " SOM" : "    "); os << (uop.eom ? " EOM" : "    ");
+      os << ": free";
+      os << " r", physreg;
+      if (ld|st) os << " lsq", lsq;
+      if (lfrqslot >= 0) os << " lfrq", lfrqslot;
+      if (annul.annulras) os << " ras";
+      break;
+    }
+    case EVENT_ANNUL_PSEUDOCOMMIT: {
+      os << "pseucm rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": r", physreg, " rebuild rrt:";
+      os << " arch ", arch_reg_names[uop.rd];
+      if likely (!uop.nouserflags) {
+        if (uop.setflags & SETFLAG_ZF) os << " zf";
+        if (uop.setflags & SETFLAG_CF) os << " cf";
+        if (uop.setflags & SETFLAG_OF) os << " of";
+      }
+      os << " = r", physreg;
+      break;
+    }
+    case EVENT_ANNUL_FETCHQ_RAS:
+      os << "anlras rip ", rip, ": annul RAS update still in fetchq"; break;
+    case EVENT_ANNUL_FLUSH:
+      os << "flush  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " rip ", rip; break;
+    case EVENT_REDISPATCH_DEPENDENTS:
+      os << "redisp rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " find all dependents"; break;
+    case EVENT_REDISPATCH_DEPENDENTS_DONE:
+      os << "redisp rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " redispatched ", (redispatch.count - 1), " dependent uops"; break;
+    case EVENT_REDISPATCH_EACH_ROB: {
+      os << "redisp rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " from state ", redispatch.current_state_list->name, ": dep on ";
+      if (!redispatch.dependent_operands) {
+        os << " [self]";
+      } else {
+        foreach (i, MAX_OPERANDS) {
+          if (bit(redispatch.dependent_operands, i)) os << " ", redispatch.opinfo[i];
 }
-
-void OutOfOrderCore::dump_smt_state(ostream& os) {
-  os << "SMT common structures:", endl;
-
-  print_list_of_state_lists<PhysicalRegister>(os, physreg_states, "Physical register states");
-  foreach (i, PHYS_REG_FILE_COUNT) {
-    os << physregfiles[i];
   }
 
-  print_list_of_state_lists<ReorderBufferEntry>(os, rob_states, "ROB entry states");
-  os << "Issue Queues:", endl;
-  foreach_issueq(print(os));
-  caches.print(os);
-
-  os << "Unaligned predictor:", endl;
-  os << "  ", unaligned_predictor.popcount(), " unaligned bits out of ", UNALIGNED_PREDICTOR_SIZE, " bits", endl;
-  os << "  Raw data: ", unaligned_predictor, endl;
+      os << "; redispatch ";
+      os << " [rob ", rob, "]";
+      os << " [physreg ", physreg, "]";
+      if (ld|st) os << " [lsq ", lsq, "]";
+      if (redispatch.iqslot) os << " [iqslot]";
+      if (lfrqslot >= 0) os << " [lfrqslot ", lfrqslot, "]";
+      if (redispatch.opinfo[RS].physreg != PHYS_REG_NULL) os << " [inheritsfr ", redispatch.opinfo[RS], "]";
 
-  foreach (i, threadcount) {
-    ThreadContext* thread = threads[i];
-    thread->dump_smt_state(os);
+      break;
   }
+    case EVENT_COMPLETE:
+      os << "complt rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " on ", padstring(fu_names[fu], -4), ": r", intstring(physreg, -3); break;
+    case EVENT_FORWARD: {
+      os << "forwd", forwarding.forward_cycle, " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")",
+        " (", clusters[cluster].name, ") r", intstring(physreg, -3),
+        " => ", "uuid ", forwarding.target_uuid, " rob ", forwarding.target_rob,
+        " (", clusters[forwarding.target_cluster].name, ") r", forwarding.target_physreg,
+        " operand ", forwarding.operand;
+      if (forwarding.target_st) os << " => st", forwarding.target_lsq;
+      os << " [still waiting?";
+      foreach (i, MAX_OPERANDS) { if (!bit(forwarding.target_operands_ready, i)) os << " r", (char)('a' + i); }
+      if (forwarding.target_all_operands_ready) os << " READY";
+      os << "]";
+      break;
+    }
+    case EVENT_BROADCAST: {
+      os << "brcst", forwarding.forward_cycle, " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")",
+        " from cluster ", clusters[cluster].name, " to cluster ", clusters[forwarding.target_cluster].name,
+        " on forwarding cycle ", forwarding.forward_cycle;
+      break;
+    }
+    case EVENT_WRITEBACK: {
+      os << "write  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " (cluster ", clusters[cluster].name, ") r", intstring(physreg, -3), "@", phys_reg_file_names[rfid], " = 0x", hexstring(writeback.data, 64), " ", flagstring(writeback.flags);
+      if (writeback.transient) os << " (transient)";
+      os << " (", writeback.consumer_count, " consumers";
+      if (writeback.all_consumers_sourced_from_bypass) os << ", all from bypass";
+      if (writeback.no_branches_between_renamings) os << ", no intervening branches";
+      if (writeback.dest_renamed_before_writeback) os << ", dest renamed before writeback";
+      os << ")";
+      break;
 }
+    case EVENT_COMMIT_FENCE_COMPLETED:
+      os << "mfcmit rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " fence committed: wake up waiting memory uops"; break;
+    case EVENT_COMMIT_EXCEPTION_DETECTED:
+      os << "detect rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " exception ", exception_name(exception), " (", exception, "), error code ", hexstring(error_code, 16), ", origvirt ", (void*)(Waddr)commit.origvirt; break;
+    case EVENT_COMMIT_EXCEPTION_ACKNOWLEDGED:
+      os << "except rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " exception ", exception_name(exception), " [EOM #", commit.total_user_insns_committed, "]"; break;
+    case EVENT_COMMIT_SKIPBLOCK:
+      os << "skipbk rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " skip block: advance rip by ", uop.bytes, " to ", (void*)(Waddr)(rip.rip + uop.bytes), " [EOM #", commit.total_user_insns_committed, "]"; break;
+    case EVENT_COMMIT_SMC_DETECTED:
+      os << "smcdet rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " self-modifying code at rip ", rip, " detected (mfn was dirty); invalidate and retry [EOM #", commit.total_user_insns_committed, "]"; break;
+    case EVENT_COMMIT_MEM_LOCKED:
+      os << "waitlk rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " wait for lock on physaddr ", (void*)(commit.state.st.physaddr << 3), " to be released"; break;
+    case EVENT_COMMIT_OK: {
+      os << "commit rob ", intstring(rob, -3), "(",padstring(uopname,-5),")";
+      if likely (archdest_can_commit[uop.rd])
+                  os << " [rrt ", arch_reg_names[uop.rd], " = r", physreg, " 0x", hexstring(commit.state.reg.rddata, 64), "]";
 
-//
-// Validate the physical register reference counters against what
-// is really accessible from the various tables and operand fields.
-//
-// This is for debugging only.
-//
-void OutOfOrderCore::check_refcounts() {
-  // this should be for each thread instead of whole core:
-  // for now, we just work on thread[0];
-  ThreadContext& thread = *threads[0];
-  Queue<ReorderBufferEntry, ROB_SIZE>& ROB = thread.ROB;
-  RegisterRenameTable& specrrt = thread.specrrt;
-  RegisterRenameTable& commitrrt = thread.commitrrt;
+      if ((!uop.nouserflags) && uop.setflags) {
+        os << " [flags ", ((uop.setflags & SETFLAG_ZF) ? "z" : ""),
+          ((uop.setflags & SETFLAG_CF) ? "c" : ""), ((uop.setflags & SETFLAG_OF) ? "o" : ""),
+          " -> ", flagstring(commit.state.reg.rdflags), "]";
+      }
 
-  int refcounts[PHYS_REG_FILE_COUNT][MAX_PHYS_REG_FILE_SIZE];
-  memset(refcounts, 0, sizeof(refcounts));
+      if (uop.eom) os << " [rip = ", (void*)(Waddr)commit.target_rip, commit.krn ? " krn" : "", "]";
 
-  foreach (rfid, PHYS_REG_FILE_COUNT) {
-    // Null physreg in each register file is special and can never be freed:
-    refcounts[rfid][PHYS_REG_NULL]++;
-  }
+      if unlikely (st && (commit.state.st.bytemask != 0))
+                    os << " [mem ", (void*)(Waddr)(commit.state.st.physaddr << 3), " = ", bytemaskstring((const byte*)&commit.state.st.data, commit.state.st.bytemask, 8), " mask ", bitstring(commit.state.st.bytemask, 8, true), "]";
 
-  foreach_forward(ROB, i) {
-    ReorderBufferEntry& rob = ROB[i];
-    foreach (j, MAX_OPERANDS) {
-      refcounts[rob.operands[j]->rfid][rob.operands[j]->index()]++;
-    }
+      if unlikely (commit.pteupdate.a | commit.pteupdate.d | commit.pteupdate.ptwrite) {
+        os << " [pte:";
+        if (commit.pteupdate.a) os << " a";
+        if (commit.pteupdate.d) os << " d";
+        if (commit.pteupdate.ptwrite) os << " w";
+        os << "]";
   }
 
-  foreach (i, TRANSREG_COUNT) {
-    refcounts[commitrrt[i]->rfid][commitrrt[i]->index()]++;
-    refcounts[specrrt[i]->rfid][specrrt[i]->index()]++;
+      if unlikely (ld|st) {
+        os << " [lsq ", lsq, "]";
+        os << " [upslot ", OutOfOrderCore::hash_unaligned_predictor_slot(rip), " = ", commit.ld_st_truly_unaligned, "]";
   }
 
-  bool errors = 0;
+      if likely (commit.oldphysreg > 0) {
+        if unlikely (commit.oldphysreg_refcount) {
+          os << " [pending free old r", commit.oldphysreg, " ref by";
+          os << " refcount ", commit.oldphysreg_refcount;
+          os << "]";
+        } else {
+          os << " [free old r", commit.oldphysreg, "]";
+        }
+      }
 
-  foreach (rfid, PHYS_REG_FILE_COUNT) {
-    PhysicalRegisterFile& physregs = physregfiles[rfid];
-    foreach (i, physregs.size) {
-      if unlikely (physregs[i].refcount != refcounts[rfid][i]) {
-        logfile << "ERROR: r", i, " refcount is ", physregs[i].refcount, " but should be ", refcounts[rfid][i], endl;
+      os << " [commit r", physreg, "]";
         
-        foreach_forward(ROB, r) {
-          ReorderBufferEntry& rob = ROB[r];
-          foreach (j, MAX_OPERANDS) {
-            if ((rob.operands[j]->index() == i) & (rob.operands[j]->rfid == rfid)) logfile << "  ROB ", r, " operand ", j, endl;
-          }
+      foreach (i, MAX_OPERANDS) {
+        if unlikely (commit.operand_physregs[i] != PHYS_REG_NULL) os << " [unref r", commit.operand_physregs[i], "]";
         }
         
-        foreach (j, TRANSREG_COUNT) {
-          if ((commitrrt[j]->index() == i) & (commitrrt[j]->rfid == rfid)) logfile << "  CommitRRT ", arch_reg_names[j], endl;
-          if ((specrrt[j]->index() == i) & (specrrt[j]->rfid == rfid)) logfile << "  SpecRRT ", arch_reg_names[j], endl;
+      if unlikely (br) {
+        os << " [brupdate", (commit.taken ? " tk" : " nt"), (commit.predtaken ? " pt" : " np"), ((commit.taken == commit.predtaken) ? " ok" : " MP"), "]";
         }
         
-        errors = 1;
+      if (uop.eom) os << " [EOM #", commit.total_user_insns_committed, "]";
+      break;
       }
+    case EVENT_COMMIT_ASSIST: {
+      assist_func_t assist_func = assistid_to_func[rip.rip];
+      os << "assist rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " calling assist ", (void*)assist_func, " (#",
+        assist_index(assist_func), ": ", assist_name(assist_func), ")";
+      break;
     }
+    case EVENT_RECLAIM_PHYSREG:
+      os << "free   r", physreg, " no longer referenced; moving to free state"; break;
+    case EVENT_RELEASE_MEM_LOCK: {
+      os << "unlkcm", " phys ", (void*)(loadstore.sfr.physaddr << 3), ": lock release committed";
+      break;
+    }
+    default:
+      os << "?????? unknown event type ", type;
+      break;
   }
 
-  if (errors) assert(false);
+    os << endl;
+    return os;
 }
 
-void OutOfOrderCore::check_rob() {
-  // this should be for each thread instead of whole core:
-  // for now, we just work on thread[0];
-  ThreadContext& thread = *threads[0];
-  Queue<ReorderBufferEntry, ROB_SIZE>& ROB = thread.ROB;
+  #include <trace_event.h>
+  ostream& OutOfOrderCoreBinaryEvent::print_binary(ostream& os, W8 coreid) const { // writes this event to os as a raw TraceEvent record (EOM commits only); returns os
+    bool ld = isload(uop.opcode); // NOTE(review): ld, st, br, exception and error_code are not read anywhere below
+    bool st = isstore(uop.opcode);
+    bool br = isbranch(uop.opcode);
+    W32 exception = LO32(commit.state.reg.rddata);
+    W32 error_code = HI32(commit.state.reg.rddata);
 
-  foreach (i, ROB_SIZE) {
-    ReorderBufferEntry& rob = ROB[i];
-    if (!rob.entry_valid) continue;
-    assert(inrange((int)rob.forward_cycle, 0, (MAX_FORWARDING_LATENCY+1)-1));
+    switch (type) {
+    // TODO: Maybe we need the EVENT_LOAD_HIT/MISS event, too?
+    case EVENT_COMMIT_OK: {
+      // Values that could be recorded (only coreid, threadid, cycle and rip are emitted below):
+      //   uop.rd                  - Destination arch-register
+      //   commit.state.reg.rddata - Data in destination arch-register
+      //   - Do we need flags?
+      //   Core + cycle generated from outer code!
+      //   rip                      - Own RIP
+      //   uop.eom                  - Destination register is RIP
+      //   commit.target_rip        - Target RIP (for branches interesting!)
+      //   commit.state.st.physaddr - Physical address for stores
+      //   commit.state.st.data     - Data for stores
+      //   commit.state.st.bytemask - Bytemask for stores
+      //   commit.state.reg.addr    - Physical address for loads
+      //   commit.origvirt          - virtual address (incl. byte offset!)
+      //   uop.size                 - Size for loads and stores (2^n bytes)
+      //   uop.is_asf               - ASF flag for these instructions
+      //   uop.opcode               - differentiate between ASF start / end
+      //   threadid                 - thread ID
+      //   cycle                    - Simulation cycle
+
+      // Very simple version for now: emit only coreid, threadid, cycle and rip, and only for EOM uops
+      if (!uop.eom) break;
+
+      TraceEvent rep = TraceEvent(); // value-initialize: the struct is written raw below, so for a plain struct any field not assigned here is zero rather than stack garbage
+      rep.coreid   = coreid;
+      rep.threadid = threadid;
+      rep.cycle    = cycle;
+      rep.rip      = rip.rip;
+      os.write(&rep, sizeof(rep));
+      break;
   }
-
-  foreach (i, threadcount) {
-    ThreadContext* thread = threads[i];
-    foreach (i, rob_states.count) {
-      StateList& list = *(thread->rob_states[i]);
-      ReorderBufferEntry* rob;
-      foreach_list_mutable(list, rob, entry, nextentry) {
-        assert(inrange(rob->index(), 0, ROB_SIZE-1));
-        assert(rob->current_state_list == &list);
-        if (!((rob->current_state_list != &thread->rob_free_list) ? rob->entry_valid : (!rob->entry_valid))) {
-          logfile << "ROB ", rob->index(), " list = ", rob->current_state_list->name, " entry_valid ", rob->entry_valid, endl, flush;
-          dump_smt_state(logfile);
-        assert(false);
+    default:
+      break;
         }
+
+    return os;
       }
+  ostream& operator<<(ostream& os, const OutOfOrderCoreBinaryEvent& e ) { // stream-insertion wrapper around print_binary
+    return e.print_binary(os, e.coreid); // the event's own coreid member is passed as the core id
     }
+};
+
+ostream& RegisterRenameTable::print(ostream& os) const { // prints all TRANSREG_COUNT entries, eight per output row; returns os
+  foreach (i, TRANSREG_COUNT) {
+    if ((i % 8) == 0) os << " "; // extra leading space at the start of each row of eight
+    os << " ", padstring(arch_reg_names[i], -6), " r", intstring((*this)[i]->index(), -3), " | "; // register name, then the index() of the entry stored for it
+    if (((i % 8) == 7) || (i == TRANSREG_COUNT-1)) os << endl; // end the row after eight entries or after the last register
   }
+  return os;
 }
 
-ostream& LoadStoreQueueEntry::print(ostream& os) const {
-  os << (store ? "st" : "ld"), intstring(index(), -3), " ";
-  os << "uuid ", intstring(rob->uop.uuid, 10), " ";
-  os << "rob ", intstring(rob->index(), -3), " ";
-  os << "r", intstring(rob->physreg->index(), -3);
-  if (PHYS_REG_FILE_COUNT > 1) os << "@", getcore().physregfiles[rob->physreg->rfid].name;
-  os << " ";
-  if (invalid) {
-    os << "< Invalid: fault 0x", hexstring(data, 8), " > ";
-  } else {
-    if (datavalid)
-      os << bytemaskstring((const byte*)&data, bytemask, 8);
-    else os << "<    Data Invalid     >";
-    os << " @ ";
-    if (addrvalid)
-      os << "0x", hexstring(physaddr << 3, 48);
-    else os << "< Addr Inval >";
+//
+// Get the thread priority, with lower numbers receiving higher priority.
+// This is used to regulate the order in which fetch, rename, frontend
+// and dispatch slots are filled in each cycle.
+//
+// The well known ICOUNT algorithm adds up the number of uops in
+// the frontend pipeline stages and gives highest priority to
+// the thread with the lowest number, since this thread is moving
+// uops through very quickly and can make more progress.
+//
+int ThreadContext::get_priority() const { // returns the sum computed below; per the comment above, a lower number means higher priority
+  int priority =
+    fetchq.count +
+    rob_frontend_list.count +
+    rob_ready_to_dispatch_list.count;
+
+  for_each_cluster (cluster) { // add the counts of the four per-cluster ROB state lists
+    priority +=
+      rob_dispatched_list[cluster].count +
+      rob_ready_to_issue_list[cluster].count +
+      rob_ready_to_store_list[cluster].count +
+      rob_ready_to_load_list[cluster].count;
   }    
-  return os;
+
+  return priority;
 }
 
 //
-// Barriers must flush the fetchq and stall the frontend until
-// after the barrier is consumed. Execution resumes at the address
-// in internal register nextrip (rip after the instruction) after
-// handling the barrier in microcode.
+// Execute one cycle of the entire core state machine
+//
+bool OutOfOrderCore::runcycle() {
+  bool exiting = 0;
+  //
+  // Detect edge triggered transition from 0->1 for
+  // pending interrupt events, then wait for current
+  // x86 insn EOM uop to commit before redirecting
+  // to the interrupt handler.
 //
-bool ThreadContext::handle_barrier() {
-  // Release resources of everything in the pipeline:
 
-  core_to_external_state();
-  flush_pipeline();
+#ifdef PTLSIM_HYPERVISOR
+  foreach (i, threadcount) {
+    ThreadContext* thread = threads[i];
+    bool current_interrupts_pending = thread->ctx.check_events();
+    bool edge_triggered = ((!thread->prev_interrupts_pending) & current_interrupts_pending);
+    thread->handle_interrupt_at_next_eom |= edge_triggered;
+    thread->prev_interrupts_pending = current_interrupts_pending;
+  }
+#endif
 
-  int assistid = ctx.commitarf[REG_rip];
-  assist_func_t assist = (assist_func_t)(Waddr)assistid_to_func[assistid];
+  //
+  // Compute reserved issue queue entries to avoid starvation:
+  //
+#ifndef MULTI_IQ
+#ifdef ENABLE_CHECKS
+  int total_issueq_count = 0;
+  int total_issueq_reserved_free = 0;
   
-  if (logable(4)) {
-    logfile << "[vcpu ", ctx.vcpuid, "] Barrier (#", assistid, " -> ", (void*)assist, " ", assist_name(assist), " called from ",
-      (RIPVirtPhys(ctx.commitarf[REG_selfrip]).update(ctx)), "; return to ", (void*)(Waddr)ctx.commitarf[REG_nextrip],
-      ") at ", sim_cycle, " cycles, ", total_user_insns_committed, " commits", endl, flush;
+  foreach (i, MAX_THREADS_PER_CORE) {
+    ThreadContext* thread = threads[i];
+
+    if unlikely (!thread) {
+      total_issueq_reserved_free += reserved_iq_entries;
+    } else {
+      total_issueq_count += thread->issueq_count;
+      if(thread->issueq_count < reserved_iq_entries){
+        total_issueq_reserved_free += reserved_iq_entries - thread->issueq_count;
+      }
+    }
   }
   
-  if (logable(6)) logfile << "Calling assist function at ", (void*)assist, "...", endl, flush; 
+  assert (total_issueq_count == issueq_all.count);
+  assert((ISSUE_QUEUE_SIZE - issueq_all.count) == (issueq_all.shared_entries + total_issueq_reserved_free));
+#endif /* ENABLE_CHECKS */
+#endif /* MULTI_IQ */
   
-  update_assist_stats(assist);
-  if (logable(6)) {
-    logfile << "Before assist:", endl, ctx, endl;
-#ifdef PTLSIM_HYPERVISOR
-    logfile << sshinfo, endl;
-#endif
-  }
+  foreach (i, threadcount) threads[i]->loads_in_this_cycle = 0;
   
-  assist(ctx);
+  fu_avail = bitmask(FU_COUNT);
+  caches.clock();
   
-  if (logable(6)) {
-    logfile << "Done with assist", endl;
-    logfile << "New state:", endl;
-    logfile << ctx;
-#ifdef PTLSIM_HYPERVISOR
-    logfile << sshinfo;
-#endif
-  }
+  //
+  // Backend and issue pipe stages run with round robin priority
+  //
+  int commitrc[MAX_THREADS_PER_CORE];
+  commitcount = 0;
+  writecount = 0;
 
-  // Flush again, but restart at possibly modified rip
-  flush_pipeline();
+  foreach (permute, threadcount) {
+    int tid = add_index_modulo(round_robin_tid, +permute, threadcount);
+    ThreadContext* thread = threads[tid];
+    if unlikely (!thread->ctx.running) continue;
 
-#ifndef PTLSIM_HYPERVISOR
-  if (requested_switch_to_native) {
-    logfile << "PTL call requested switch to native mode at rip ", (void*)(Waddr)ctx.commitarf[REG_rip], endl;
-    return false;
-  }
+    commitrc[tid] = thread->commit();
+    for_each_cluster(j) thread->writeback(j);
+    for_each_cluster(j) thread->transfer(j);
+#ifdef ENABLE_ASF
+    commitrc[tid] = thread->asf_pipeline_intercept.post_commit(thread->ctx, commitrc[tid]);
 #endif
-  return true;
 }
 
-bool ThreadContext::handle_exception() {
-  // Release resources of everything in the pipeline:
-  core_to_external_state();
-  flush_pipeline();
-
-  if (logable(4)) {
-    logfile << "[vcpu ", ctx.vcpuid, "] Exception ", exception_name(ctx.exception), " called from rip ", (void*)(Waddr)ctx.commitarf[REG_rip], 
-      " at ", sim_cycle, " cycles, ", total_user_insns_committed, " commits", endl, flush;
+  //
+  // Clock the TLB miss page table walk state machine
+  // This may use up load ports, so do it before other
+  // loads can issue
+  //
+#ifdef PTLSIM_HYPERVISOR
+  foreach (i, threadcount) {
+    threads[i]->tlbwalk();
   }
+#endif
 
   //
-  // CheckFailed and SkipBlock exceptions are raised by the chk uop.
-  // This uop is used at the start of microcoded instructions to assert
-  // that certain conditions are true so complex corrective actions can
-  // be taken if the check fails.
+  // Always clock the issue queues: they're independent of all threads
   //
-  // SkipBlock is a special case used for checks at the top of REP loops.
-  // Specifically, if the %rcx register is zero on entry to the REP, no
-  // action at all is to be taken; the rip should simply advance to
-  // whatever is in chk_recovery_rip and execution should resume.
+  // SD: Moved between forward and issue in the same cycle, so that a 0-cycle
+  // forwarding delay would actually be equivalent to a direct bypass!
+  foreach_issueq(clock());
+
   //
-  // CheckFailed exceptions usually indicate the processor needs to take
-  // evasive action to avoid a user visible exception. For instance, 
-  // CheckFailed is raised when an inlined floating point operand is
-  // denormal or otherwise cannot be handled by inlined fastpath uops,
-  // or when some unexpected segmentation or page table conditions
-  // arise.
+  // Issue whatever is ready
   //
-  if (ctx.exception == EXCEPTION_SkipBlock) {
-    ctx.commitarf[REG_rip] = chk_recovery_rip;
-    if (logable(6)) logfile << "SkipBlock pseudo-exception: skipping to ", (void*)(Waddr)ctx.commitarf[REG_rip], endl, flush;
-    flush_pipeline();
-    return true;
-  }
+  for_each_cluster(i) { issue(i); }
 
-#ifdef PTLSIM_HYPERVISOR
-  //
-  // Map PTL internal hardware exceptions to their x86 equivalents,
-  // depending on the context. The error_code field should already
-  // be filled out.
   //
-  // Exceptions not listed here are propagated by microcode
-  // rather than the processor itself.
+  // Most of the frontend (except fetch!) also works with round robin priority
   //
-  switch (ctx.exception) {
-  case EXCEPTION_PageFaultOnRead:
-  case EXCEPTION_PageFaultOnWrite:
-  case EXCEPTION_PageFaultOnExec:
-    ctx.x86_exception = EXCEPTION_x86_page_fault; break;
-  case EXCEPTION_FloatingPointNotAvailable:
-    ctx.x86_exception = EXCEPTION_x86_fpu_not_avail; break;
-  case EXCEPTION_FloatingPoint:
-    ctx.x86_exception = EXCEPTION_x86_fpu; break;
-  default:
-    logfile << "Unsupported internal exception type ", exception_name(ctx.exception), endl, flush;
-    assert(false);
-  }
+  int dispatchrc[MAX_THREADS_PER_CORE];
+  dispatchcount = 0;
+  foreach (permute, threadcount) {
+    int tid = add_index_modulo(round_robin_tid, +permute, threadcount);
+    ThreadContext* thread = threads[tid];
+    if unlikely (!thread->ctx.running) continue;
 
-  if (logable(4)) {
-    logfile << ctx;
-    logfile << sshinfo;
-  }
+    for_each_cluster(j) { thread->complete(j); }
 
-  ctx.propagate_x86_exception(ctx.x86_exception, ctx.error_code, ctx.cr2);
+    dispatchrc[tid] = thread->dispatch();
 
-  // Flush again, but restart at modified rip
-  flush_pipeline();
+    if likely (dispatchrc[tid] >= 0) {
+      thread->frontend();
+      thread->rename();
+    }
+  }
 
-  return true;
-#else
-  if (logable(6)) 
-    logfile << "Exception (", exception_name(ctx.exception), " called from ", (void*)(Waddr)ctx.commitarf[REG_rip], 
-      ") at ", sim_cycle, " cycles, ", total_user_insns_committed, " commits", endl, flush;
+  //
+  // Compute fetch priorities (default is ICOUNT algorithm)
+  //
+  // This means we sort in ascending order, with threads that are not
+  // running given the lowest priority.
+  //
 
-  stringbuf sb;
-  logfile << exception_name(ctx.exception), " detected at fault rip ", (void*)(Waddr)ctx.commitarf[REG_rip], " @ ", 
-    total_user_insns_committed, " commits (", total_uops_committed, " uops): genuine user exception (",
-    exception_name(ctx.exception), "); aborting", endl;
-  logfile << ctx, endl;
-  logfile << flush;
+  int priority_value[MAX_THREADS_PER_CORE];
+  int priority_index[MAX_THREADS_PER_CORE];
 
-  logfile << "Aborting...", endl, flush;
-  cerr << "Aborting...", endl, flush;
+  if likely (threadcount == 1) {
+    priority_value[0] = 0;
+    priority_index[0] = 0;
+  } else {
+    foreach (i, threadcount) {
+      priority_index[i] = i;
+      ThreadContext* thread = threads[i];
+      priority_value[i] = thread->get_priority();
+      if unlikely (!thread->ctx.running) priority_value[i] = limits<int>::max;
+    }
 
-  assert(false);
-  return false;
-#endif
+    sort(priority_index, threadcount, SortPrecomputedIndexListComparator<int, false>(priority_value));
 }
 
-bool ThreadContext::handle_interrupt() {
-#ifdef PTLSIM_HYPERVISOR
-  // Release resources of everything in the pipeline:
-  core_to_external_state();
-  flush_pipeline();
+  //
+  // Fetch in thread priority order
+  //
+  // NOTE: True ICOUNT only fetches the highest priority
+  // thread per cycle, since there is usually only one
+  // instruction cache port. In a banked i-cache, we can
+  // fetch from multiple threads every cycle.
+  //
+  foreach (j, threadcount) {
+    int i = priority_index[j];
+    ThreadContext* thread = threads[i];
+    assert(thread);
+    if unlikely (!thread->ctx.running) {
+      continue;
+    }
 
-  if (logable(6)) {
-    logfile << "[vcpu ", threadid, "] interrupts pending at ", sim_cycle, " cycles, ", total_user_insns_committed, " commits", endl, flush;
-    logfile << "Context at interrupt:", endl;
-    logfile << ctx;
-    logfile << sshinfo;
-    logfile.flush();
+    if likely (dispatchrc[i] >= 0) {
+      thread->fetch();
+    }
   }
 
-  ctx.event_upcall();
+  //
+  // Always clock the issue queues: they're independent of all threads
+  //
+  // SD: Moved between forward and issue in the same cycle, so that a 0-cycle
+  // forwarding delay would actually be equivalent to a direct bypass!
+  //foreach_issueq(clock());
 
-  if (logable(6)) {
-    logfile <<  "[vcpu ", threadid, "] after interrupt redirect:", endl;
-    logfile << ctx;
-    logfile << sshinfo;
-    logfile.flush();
+  //
+  // Advance the round robin priority index
+  //
+  round_robin_tid = add_index_modulo(round_robin_tid, +1, threadcount);
+
+  //
+  // Flush event log ring buffer
+  //
+  if unlikely (config.event_log_enabled) {
+    // logfile << "[cycle ", sim_cycle, "] Miss buffer contents:", endl;
+    // logfile << caches.missbuf;
+    if unlikely (config.flush_event_log_every_cycle) {
+      eventlog.flush(true);
+    }
   }
 
-  // Flush again, but restart at modified rip
-  flush_pipeline();
+#ifdef ENABLE_CHECKS
+  // This significantly slows down simulation; only enable it if absolutely needed:
+  // check_refcounts();
 #endif
-  return true;
-}
 
+  foreach (i, threadcount) {
+    ThreadContext* thread = threads[i];
+    if unlikely (!thread->ctx.running) continue;
+    int rc = commitrc[i];
+    if likely ((rc == COMMIT_RESULT_OK) | (rc == COMMIT_RESULT_NONE)) continue;
+
+    switch (rc) {
+    case COMMIT_RESULT_SMC: {
+      if (logable(3)) logfile << "Potentially cross-modifying SMC detected: global flush required (cycle ", sim_cycle, ", ", total_user_insns_committed, " commits)", endl, flush;
 //
-// Event Formatting
+      // DO NOT GLOBALLY FLUSH! It will cut off the other thread(s) in the
+      // middle of their currently committing x86 instruction, causing massive
+      // internal corruption on any VCPUs that happen to be straddling the
+      // instruction boundary.
 //
-void PhysicalRegister::fill_operand_info(PhysicalRegisterOperandInfo& opinfo) {
-  opinfo.physreg = index();
-  opinfo.state = state;
-  opinfo.rfid = rfid;
-  opinfo.archreg = archreg;
-  if (rob) {
-    opinfo.rob = rob->index();
-    opinfo.uuid = rob->uop.uuid;
+      // BAD: machine.flush_all_pipelines();
+      //
+      // This is a temporary fix: in the *extremely* rare case where both
+      // threads have the same basic block in their pipelines and that
+      // BB is being invalidated, the BB cache will forbid us from
+      // freeing it (and will print a warning to that effect).
+      //
+      // I'm working on a solution to this, to put some BBs on an
+      // "invisible" list, where they cannot be looked up anymore,
+      // but their memory is not freed until the lock is released.
+      //
+      foreach (i, threadcount) {  // diagnostic dump: one log line per thread
+        ThreadContext* t = threads[i];
+        if unlikely (!t) continue;
+        if (logable(3)) {
+          logfile << "  [vcpu ", i, "] current_basic_block = ", t->current_basic_block, ": ";  // separator was dropped by a stray ';'
+          if (t->current_basic_block) logfile << t->current_basic_block->rip;
+          logfile << endl;
   }
 }
 
-ostream& OutOfOrderModel::operator <<(ostream& os, const PhysicalRegisterOperandInfo& opinfo) {
-  os << "[r", opinfo.physreg, " ", short_physreg_state_names[opinfo.state], " ";
-  switch (opinfo.state) {
-  case PHYSREG_WAITING:
-  case PHYSREG_BYPASS:
-  case PHYSREG_WRITTEN:
-    os << "rob ", opinfo.rob, " uuid ", opinfo.uuid; break;
-  case PHYSREG_ARCH:
-  case PHYSREG_PENDINGFREE:
-    os << arch_reg_names[opinfo.archreg]; break;
-  };
-  os << "]";
-  return os;
+      assert(thread->flush_pipeline());
+      thread->invalidate_smc();
+      break;
 }
-
-bool EventLog::init(size_t bufsize) {
-  reset();
-  size_t bytes = bufsize * sizeof(OutOfOrderCoreEvent);
-  start = (OutOfOrderCoreEvent*)ptl_mm_alloc_private_pages(bytes);
-  if unlikely (!start) return false;
-  end = start + bufsize;
-  tail = start;
-  
-  foreach (i, bufsize) start[i].type = EVENT_INVALID;
-  return true;
+    case COMMIT_RESULT_EXCEPTION: {
+      exiting = !thread->handle_exception();
+      break;
 }
-
-void EventLog::reset() {
-  if (!start) return;
-
-  size_t bytes = (end - start) * sizeof(OutOfOrderCoreEvent);
-  ptl_mm_free_private_pages(start, bytes);
-  start = null;
-  end = null;
-  tail = null;
+    case COMMIT_RESULT_BARRIER: {
+      exiting = !thread->handle_barrier();
+      break;
 }
-
-void EventLog::flush(bool only_to_tail) {
-  if likely (!logable(6)) return;
-  if unlikely (!logfile) return;
-  if unlikely (!logfile->ok()) return;
-  print(*logfile, only_to_tail);
-  tail = start;
+    case COMMIT_RESULT_INTERRUPT: {
+      thread->handle_interrupt();
+      break;
+    }
+    case COMMIT_RESULT_STOP: {
+      assert(thread->flush_pipeline());
+      thread->stall_frontend = 1;
+      machine.stopped[thread->ctx.vcpuid] = 1;
+      // Wait for other cores to sync up, so don't exit right away
+      break;
 }
+    case COMMIT_RESULT_OK_FLUSH: {
+      if (logable(5))  // "]" and __FILE__ are streamed separately: "]"__FILE__ lexes as a literal suffix in C++11
+        logfile << "[vcpu ", thread->ctx.vcpuid,"]", __FILE__,":",__LINE__,"@",sim_cycle,
+          ": Flushing the pipeline.", endl, flush;
 
-ostream& EventLog::print(ostream& os, bool only_to_tail) {
-  if (tail >= end) tail = start;
-  if (tail < start) tail = end;
+      assert(thread->flush_pipeline());
+      break;
+    }
+    }
+  }
 
-  OutOfOrderCoreEvent* p = (only_to_tail) ? start : tail;
+#ifdef PTLSIM_HYPERVISOR
+  if unlikely (vcpu_online_map_changed) {
+    vcpu_online_map_changed = 0;
+    foreach (i, contextcount) {
+      Context& vctx = contextof(i);
+      if likely (!vctx.dirty) continue;
+      //
+      // The VCPU is coming up for the first time after booting or being
+      // taken offline by the user.
+      //
+      // Force the active core model to flush any cached (uninitialized)
+      // internal state (like register file copies) it might have, since
+      // it did not know anything about this VCPU prior to now: if it
+      // suddenly gets marked as running without this, the core model
+      // will try to execute from bogus state data.
+      //
+      logfile << "VCPU ", vctx.vcpuid, " context was dirty: update core model internal state", endl;
 
-  W64 cycle = limits<W64>::max;
-  size_t bufsize = end - start;
+      ThreadContext* tc = threads[vctx.vcpuid];
+      assert(tc);
+      if unlikely (&tc->ctx != &vctx) {
+        logfile << "Context mismatch detected:",endl;
+        logfile << "  &contextof(",i,")= ", &vctx, " &threads[", vctx.vcpuid,
+          "]->ctx= ",&tc->ctx, endl;
+        logfile << "  i= ", i, " contextof(",i,").vcpuid= ", vctx.vcpuid,
+          " threads[", vctx.vcpuid, "]->threadid= ", tc->threadid,
+          " threads[", vctx.vcpuid, "]->core.coreid= ", tc->core.coreid, endl;
+
+        logfile << "Mismatching contexts:", endl;
+        logfile << "  contextof(",i,"):", vctx, endl;
+        if (&tc->ctx)
+          logfile << "  threads[",vctx.vcpuid,"]->ctx= ", tc->ctx, endl;
+
+        logfile << "Looping through all other contexts:", endl;
+        foreach (j, contextcount) {
+          Context& vctx     = contextof(j);             // NOTE: These hide
+          ThreadContext* tc = threads[vctx.vcpuid];     //       the outer ones!
+          logfile << "  j= ", j, " contextof(",j,").vcpuid= ", vctx.vcpuid,
+            " threads[", vctx.vcpuid, "]->threadid= ", tc->threadid,
+            " threads[", vctx.vcpuid, "]->core.coreid= ", tc->core.coreid, endl;
+          logfile << "  &contextof(",j,")= ", &vctx, " &threads[", vctx.vcpuid,
+            "]->ctx= ",&tc->ctx, endl;
+        }
+      }
+      assert(&tc->ctx == &vctx);
+      assert(tc->flush_pipeline());
+      vctx.dirty = 0;
+    }
+  }
+#endif
 
-  if (!config.flush_event_log_every_cycle) os << "#-------- Start of event log --------", endl;
+  foreach (i, threadcount) {  // per-thread commit watchdog
+    ThreadContext* thread = threads[i];
+    if unlikely (!thread->ctx.running) continue;  // skip idle threads, but still check the remaining ones
 
-  foreach (i, (only_to_tail ? (tail - start) : bufsize)) {
-    if unlikely (p >= end) p = start;
-    if unlikely (p < start) p = end-1;
-    if unlikely (p->type == EVENT_INVALID) {
-      p++;
-      continue;
+    if unlikely ((sim_cycle - thread->last_commit_at_cycle) > 4096) {
+      stringbuf sb;
+      sb << "[vcpu ", thread->ctx.vcpuid, "] thread ", thread->threadid, ": WARNING: At cycle ",
+        sim_cycle, ", ", total_user_insns_committed,  " user commits: no instructions have committed for ",
+        (sim_cycle - thread->last_commit_at_cycle), " cycles; the pipeline could be deadlocked", endl;
+      logfile << sb, flush;
+      cerr << sb, flush;
+      logfile << thread->ROB, endl, flush;
+      exiting = 1;
     }
-
-    if unlikely (p->cycle != cycle) {
-      cycle = p->cycle;
-      os << "Cycle ", cycle, ":", endl;
     }
 
-    p->print(os);
-    p++;
+  return exiting;
   }
 
-  if (!config.flush_event_log_every_cycle) os << "#-------- End of event log --------", endl;
+//
+// ReorderBufferEntry
+//
+void ReorderBufferEntry::init(int idx) {
+  this->idx = idx;
+  entry_valid = 0;
+  selfqueuelink::reset();
+  current_state_list = null;
+  reset();
+}
 
-  return os;
+//
+// Clean out various fields from the ROB entry that are
+// expected to be zero when allocating a new ROB entry.
+//
+void ReorderBufferEntry::reset() {
+  // Takes no arguments and returns nothing; it only assigns idle values to the fields below.
+  // Deallocate ROB entry
+  entry_valid = false;
+  cycles_left = 0;
+  physreg = (PhysicalRegister*)null;
+  lfrqslot = -1;
+  lsq = 0;
+  load_store_second_phase = 0;
+  lock_acquired = 0;
+  consumer_count = 0;
+  executable_on_cluster_mask = 0;
+  pteupdate = 0;
+  cluster = -1;
+#ifdef ENABLE_TRANSIENT_VALUE_TRACKING
+  dest_renamed_before_writeback = 0;
+  no_branches_between_renamings = 0;
+#endif
+  issued = 0;
+#ifdef ENABLE_ASF
+  llbline = (LLBLine*)null;
+#endif
 }
 
-ostream& OutOfOrderCoreEvent::print(ostream& os) const {
-  bool ld = isload(uop.opcode);
-  bool st = isstore(uop.opcode);
-  bool br = isbranch(uop.opcode);
-  W32 exception = LO32(commit.state.reg.rddata);
-  W32 error_code = HI32(commit.state.reg.rddata);
+bool ReorderBufferEntry::ready_to_issue() const {
+  bool raready = operands[0]->ready();
+  bool rbready = operands[1]->ready();
+  bool rcready = operands[2]->ready();
+  bool rsready = operands[3]->ready();
 
-  stringbuf uopname;
-  nameof(uopname, uop);
+  if (isstore(uop.opcode)) {
+    return (load_store_second_phase) ? (raready & rbready & rcready & rsready) : (raready & rbready);
+  } else if (isload(uop.opcode)) {
+    return (load_store_second_phase) ? (raready & rbready & rcready & rsready) : (raready & rbready & rcready);
+  } else {
+    return (raready & rbready & rcready & rsready);
+  }
+}
 
-  os << intstring(uuid, 20), " t", threadid, " ";
-  switch (type) {
-    //
-    // Fetch Events
-    //
-  case EVENT_FETCH_STALLED:
-    os <<  "fetch  frontend stalled"; break;
-  case EVENT_FETCH_ICACHE_WAIT:
-    os <<  "fetch  rip ", rip, ": wait for icache fill"; break;
-  case EVENT_FETCH_FETCHQ_FULL:
-    os <<  "fetch  rip ", rip, ": fetchq full"; break;
-  case EVENT_FETCH_IQ_QUOTA_FULL:
-    os <<  "fetch  rip ", rip, ": issue queue quota full = ", issueq_count, " "; break;
-  case EVENT_FETCH_BOGUS_RIP:
-    os <<  "fetch  rip ", rip, ": bogus RIP or decode failed"; break;
-  case EVENT_FETCH_ICACHE_MISS:
-    os <<  "fetch  rip ", rip, ": wait for icache fill of phys ", (void*)(Waddr)((rip.mfnlo << 12) + lowbits(rip.rip, 12)), " on missbuf ", fetch.missbuf; break;
-  case EVENT_FETCH_SPLIT:
-    os <<  "fetch  rip ", rip, ": split unaligned load or store ", uop; break;
-  case EVENT_FETCH_ASSIST:
-    os <<  "fetch  rip ", rip, ": branch into assist microcode: ", uop; break;
-  case EVENT_FETCH_TRANSLATE:
-    os <<  "xlate  rip ", rip, ": ", fetch.bb_uop_count, " uops"; break;
-  case EVENT_FETCH_OK: {
-    os <<  "fetch  rip ", rip, ": ", uop, 
-      " (uopid ", uop.bbindex;
-    if (uop.som) os << "; SOM";
-    if (uop.eom) os << "; EOM ", uop.bytes, " bytes";
-    os << ")";
-    if (uop.eom && fetch.predrip) os << " -> pred ", (void*)fetch.predrip;
-    if (isload(uop.opcode) | isstore(uop.opcode)) {
-      os << "; unaligned pred slot ", OutOfOrderCore::hash_unaligned_predictor_slot(rip), " -> ", uop.unaligned;
+bool ReorderBufferEntry::ready_to_commit() const {
+  return (current_state_list == &getthread().rob_ready_to_commit_queue);
     }
-    break;
+
+StateList& ReorderBufferEntry::get_ready_to_issue_list() const {
+  OutOfOrderCore& core = getcore();
+  ThreadContext& thread = getthread();
+  return
+    isload(uop.opcode) ? thread.rob_ready_to_load_list[cluster] :
+    isstore(uop.opcode) ? thread.rob_ready_to_store_list[cluster] :
+    thread.rob_ready_to_issue_list[cluster];
   }
+
     //
-    // Rename Events
+// Reorder Buffer
     //
-  case EVENT_RENAME_FETCHQ_EMPTY:
-    os << "rename fetchq empty"; break;
-  case EVENT_RENAME_ROB_FULL:
-    os <<  "rename ROB full"; break;
-  case EVENT_RENAME_PHYSREGS_FULL:
-    os <<  "rename physical register file full"; break;
-  case EVENT_RENAME_LDQ_FULL:
-    os <<  "rename load queue full"; break;
-  case EVENT_RENAME_STQ_FULL:
-    os <<  "rename store queue full"; break;
-  case EVENT_RENAME_MEMQ_FULL:
-    os <<  "rename memory queue full"; break;
-  case EVENT_RENAME_OK: {
-    os <<  "rename rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " r", intstring(physreg, -3), "@", phys_reg_file_names[rfid];
-    if (ld|st) os << " lsq", lsq;
-    os << " = ";
-    foreach (i, MAX_OPERANDS) os << rename.opinfo[i], ((i < MAX_OPERANDS-1) ? " " : "");
-    os << "; renamed";
-    os << " ", arch_reg_names[uop.rd], " (old r", rename.oldphys, ")";
-    if unlikely (!uop.nouserflags) {
-      if likely (uop.setflags & SETFLAG_ZF) os << " zf (old r", rename.oldzf, ")";
-      if likely (uop.setflags & SETFLAG_CF) os << " cf (old r", rename.oldcf, ")";
-      if likely (uop.setflags & SETFLAG_OF) os << " of (old r", rename.oldof, ")";
-    }
-    break;
-  }
-  case EVENT_FRONTEND:
-    os <<  "front  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " frontend stage ", (FRONTEND_STAGES - frontend.cycles_left), " of ", FRONTEND_STAGES;
-    break;
-  case EVENT_CLUSTER_NO_CLUSTER:
-  case EVENT_CLUSTER_OK: {
-    os << ((type == EVENT_CLUSTER_OK) ? "clustr" : "noclus"), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " allowed FUs = ", 
-      bitstring(fuinfo[uop.opcode].fu, FU_COUNT, true), " -> clusters ",
-      bitstring(select_cluster.allowed_clusters, MAX_CLUSTERS, true), " avail";
-    foreach (i, MAX_CLUSTERS) os << " ", select_cluster.iq_avail[i];
-    os << "-> ";
-    if (type == EVENT_CLUSTER_OK) os << "cluster ", clusters[cluster].name; else os << "-> none"; break;
-    break;
+stringbuf& ReorderBufferEntry::get_operand_info(stringbuf& sb, int operand) const {
+  PhysicalRegister& physreg = *operands[operand];
+  ReorderBufferEntry& sourcerob = *physreg.rob;
+
+  sb << "r", physreg.index();
+  if (PHYS_REG_FILE_COUNT > 1) sb << "@", getcore().physregfiles[physreg.rfid].name;
+
+  switch (physreg.state) {  // append a human-readable annotation for the operand's register state
+  case PHYSREG_WRITTEN:
+    sb << " (written)"; break;
+  case PHYSREG_BYPASS:
+    sb << " (ready)"; break;
+  case PHYSREG_WAITING:
+    sb << " (wait rob ", sourcerob.index(), " uuid ", sourcerob.uop.uuid, ")"; break;
+  case PHYSREG_ARCH:  // NOTE(review): a stray 'break;' here made the next line unreachable - confirm it was not a deliberate mute
+    if (physreg.index() == PHYS_REG_NULL)  sb << " (zero)"; else sb << " (arch ", arch_reg_names[physreg.archreg], ")"; break;
+  case PHYSREG_PENDINGFREE:
+    sb << " (pending free for ", arch_reg_names[physreg.archreg], ")"; break;
+  default:
+    // Cannot be in free state!
+    sb << " (FREE)"; break;
   }
-  case EVENT_DISPATCH_NO_CLUSTER:
-  case EVENT_DISPATCH_OK: {
-    os << ((type == EVENT_DISPATCH_OK) ? "disptc" : "nodisp"),  " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " operands ";
-    foreach (i, MAX_OPERANDS) os << dispatch.opinfo[i], ((i < MAX_OPERANDS-1) ? " " : "");
-    if (type == EVENT_DISPATCH_OK) os << " -> cluster ", clusters[cluster].name; else os << " -> none";
-    break;
+
+  return sb;
   }
-  case EVENT_ISSUE_NO_FU: {
-    os << "issue  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")";
-    os << "no FUs available in cluster ", clusters[cluster].name, ": ",
-      "fu_avail = ", bitstring(issue.fu_avail, FU_COUNT, true), ", ",
-      "op_fu = ", bitstring(fuinfo[uop.opcode].fu, FU_COUNT, true), ", "
-      "fu_cl_mask = ", bitstring(clusters[cluster].fu_mask, FU_COUNT, true);
-    break;
+
+ThreadContext& ReorderBufferEntry::getthread() const { return *getcore().threads[threadid]; }
+
+issueq_tag_t ReorderBufferEntry::get_tag() {
+  int mask = ((1 << MAX_THREADS_BIT) - 1) << MAX_ROB_IDX_BIT;
+  if (logable(100)) logfile << " get_tag() thread ", (void*) threadid, " rob idx ", (void*)idx, " mask ", (void*)mask, endl;
+
+  assert(!(idx & mask));
+  assert(!(threadid >> MAX_THREADS_BIT));
+  //  int threadid = 1;
+  issueq_tag_t rc = (idx | (threadid << MAX_ROB_IDX_BIT));
+  if (logable(100)) logfile <<  " tag ", (void*) rc, endl;
+  return rc;
   }
-  case EVENT_ISSUE_OK: {
+
+ostream& ReorderBufferEntry::print_operand_info(ostream& os, int operand) const {
     stringbuf sb;
-    sb << "issue  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")";
-    sb << " on ", padstring(fu_names[fu], -4), " in ", padstring(cluster_names[cluster], -4), ": r", intstring(physreg, -3), "@", phys_reg_file_names[rfid];
-    sb << " "; print_value_and_flags(sb, issue.state.reg.rddata, issue.state.reg.rdflags); sb << " =";
-    sb << " "; print_value_and_flags(sb, issue.operand_data[RA], issue.operand_flags[RA]); sb << ", ";
-    sb << " "; print_value_and_flags(sb, issue.operand_data[RB], issue.operand_flags[RB]); sb << ", ";
-    sb << " "; print_value_and_flags(sb, issue.operand_data[RC], issue.operand_flags[RC]);
-    sb << " (", issue.cycles_left, " cycles left)";
-    if (issue.mispredicted) sb << "; mispredicted (real ", (void*)(Waddr)issue.state.reg.rddata, " vs expected ", (void*)(Waddr)issue.predrip, ")";
+  get_operand_info(sb, operand);
     os << sb;
-    break;
+  return os;
   }
-  case EVENT_REPLAY: {
-    os << "replay rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " r", intstring(physreg, -3), "@", phys_reg_file_names[rfid],
-      " on cluster ", clusters[cluster].name, ": waiting on";
-    foreach (i, MAX_OPERANDS) {
-      if (!bit(replay.ready, i)) os << " ", replay.opinfo[i];
+
+ostream& ReorderBufferEntry::print(ostream& os) const {
+  stringbuf name, rainfo, rbinfo, rcinfo;
+  nameof(name, uop);
+  get_operand_info(rainfo, 0);
+  get_operand_info(rbinfo, 1);
+  get_operand_info(rcinfo, 2);
+
+  os << "rob ", intstring(index(), -3), " uuid ", intstring(uop.uuid, 16), " rip 0x", hexstring(uop.rip, 48), " ",
+    padstring(current_state_list->name, -24), " ", (uop.som ? "SOM" : "   "), " ", (uop.eom ? "EOM" : "   "),
+    " @ ", padstring((cluster >= 0) ? clusters[cluster].name : "???", -4), " ",
+    padstring(name, -12), " r", intstring(physreg->index(), -3), " ", padstring(arch_reg_names[uop.rd], -6);
+  if (isload(uop.opcode))
+    os << " ld", intstring(lsq->index(), -3);
+  else if (isstore(uop.opcode))
+    os << " st", intstring(lsq->index(), -3);
+  else os << "      ";
+
+  os << " = ";
+  os << padstring(rainfo, -30);
+  os << padstring(rbinfo, -30);
+  os << padstring(rcinfo, -30);
+
+#ifdef ENABLE_ASF
+  if (llbline)
+    os << " llb: ", llbline;
+#endif
+
+  return os;
     }
-    break;
+
+void ThreadContext::print_rob(ostream& os) {
+  os << "ROB head ", ROB.head, " to tail ", ROB.tail, " (", ROB.count, " entries):", endl;
+  foreach_forward(ROB, i) {
+    ReorderBufferEntry& rob = ROB[i];
+    os << "  ", rob, endl;
   }
-  case EVENT_STORE_WAIT: {
-    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
-    os << "wait on ";
-    if (!loadstore.rcready) os << " rc";
-    if (loadstore.inherit_sfr_used) {
-      os << ((loadstore.rcready) ? "" : " and "), loadstore.inherit_sfr,
-        " (uuid ", loadstore.inherit_sfr_uuid, ", stq ", loadstore.inherit_sfr_lsq,
-        ", rob ", loadstore.inherit_sfr_rob, ", r", loadstore.inherit_sfr_physreg, ")";
     }
-    break;
+
+void ThreadContext::print_lsq(ostream& os) {
+  os << "LSQ head ", LSQ.head, " to tail ", LSQ.tail, " (", LSQ.count, " entries):", endl;
+  foreach_forward(LSQ, i) {
+    LoadStoreQueueEntry& lsq = LSQ[i];
+    os << "  ", lsq, endl;
   }
-  case EVENT_STORE_PARALLEL_FORWARDING_MATCH: {
-    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
-    os << "ignored parallel forwarding match with ldq ", loadstore.inherit_sfr_lsq,
-      " (uuid ", loadstore.inherit_sfr_uuid, " rob", loadstore.inherit_sfr_rob,
-      " r", loadstore.inherit_sfr_physreg, ")";
-    break;
   }
-  case EVENT_STORE_ALIASED_LOAD: {
-    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
-    os << "aliased with ldbuf ", loadstore.inherit_sfr_lsq, " (uuid ", loadstore.inherit_sfr_uuid,
-      " rob", loadstore.inherit_sfr_rob, " r", loadstore.inherit_sfr_physreg, ");",
-      " (add colliding load rip ", (void*)(Waddr)loadstore.inherit_sfr_rip, "; replay from rip ", rip, ")";
-    break;
+
+void ThreadContext::print_rename_tables(ostream& os) {
+  os << "SpecRRT:", endl;
+  os << specrrt;
+  os << "CommitRRT:", endl;
+  os << commitrrt;
   }
-  case EVENT_STORE_ISSUED: {
-    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
-    if (loadstore.inherit_sfr_used) {
-      os << "inherit from ", loadstore.inherit_sfr, " (uuid ", loadstore.inherit_sfr_uuid,
-        ", rob", loadstore.inherit_sfr_rob, ", lsq ", loadstore.inherit_sfr_lsq,
-        ", r", loadstore.inherit_sfr_physreg, ");";
+
+void OutOfOrderCore::print_smt_state(ostream& os) {
+  os << "Print SMT statistics:", endl;
+
+  foreach (i, threadcount) {
+    ThreadContext* thread = threads[i];
+    os << "Thread ", i, ":", endl,
+      "  total_uops_committed ", thread->total_uops_committed, endl,
+      "  uipc ", double(thread->total_uops_committed) / double(iterations), endl,
+      "  total_insns_committed ",  thread->total_insns_committed,
+      "  ipc ", double(thread->total_insns_committed) / double(iterations), endl;
     }
-    os << " <= ", hexstring(loadstore.data_to_store, 8*(1<<uop.size)), " = ", loadstore.sfr;
-    break;
   }
-  case EVENT_STORE_LOCK_RELEASED: {
-    os << "lk-rel", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
-      "lock released (original ld.acq uuid ", loadstore.locking_uuid, " rob ", loadstore.locking_rob, " on vcpu ", loadstore.locking_vcpuid, ")";
-    break;
+
+void ThreadContext::dump_smt_state(ostream& os) {
+  os << "SMT per-thread state for t", threadid, ":", endl;
+
+  print_rename_tables(os);
+  print_rob(os);
+  print_lsq(os);
+  os << flush;
   }
-  case EVENT_STORE_LOCK_ANNULLED: {
-    os << "lk-anl", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
-      "lock annulled (original ld.acq uuid ", loadstore.locking_uuid, " rob ", loadstore.locking_rob, " on vcpu ", loadstore.locking_vcpuid, ")";
-    break;
+
+void OutOfOrderCore::dump_smt_state(ostream& os) {
+  os << "SMT common structures:", endl;
+
+  print_list_of_state_lists<PhysicalRegister>(os, physreg_states, "Physical register states");
+  foreach (i, PHYS_REG_FILE_COUNT) {
+    os << physregfiles[i];
   }
-  case EVENT_STORE_LOCK_REPLAY: {
-    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
-      "replay because vcpuid ", loadstore.locking_vcpuid, " uop uuid ", loadstore.locking_uuid, " has lock";
-    break;
+
+  print_list_of_state_lists<ReorderBufferEntry>(os, rob_states, "ROB entry states");
+  os << "Issue Queues:", endl;
+  foreach_issueq(print(os));
+  caches.print(os);
+
+  os << "Unaligned predictor:", endl;
+  os << "  ", unaligned_predictor.popcount(), " unaligned bits out of ", UNALIGNED_PREDICTOR_SIZE, " bits", endl;
+  os << "  Raw data: ", unaligned_predictor, endl;
+
+  foreach (i, threadcount) {
+    ThreadContext* thread = threads[i];
+    thread->dump_smt_state(os);
   }
+}
+
+//
+// Validate the physical register reference counters against what
+// is really accessible from the various tables and operand fields.
+//
+// This is for debugging only.
+//
+void OutOfOrderCore::check_refcounts() {
+  // this should be for each thread instead of whole core:
+  // for now, we just work on thread[0];
+  ThreadContext& thread = *threads[0];
+  Queue<ReorderBufferEntry, ROB_SIZE>& ROB = thread.ROB;
+  RegisterRenameTable& specrrt = thread.specrrt;
+  RegisterRenameTable& commitrrt = thread.commitrrt;
 
-  case EVENT_LOAD_WAIT: {
-    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
-    os << "wait on sfr ", loadstore.inherit_sfr,
-      " (uuid ", loadstore.inherit_sfr_uuid, ", stq ", loadstore.inherit_sfr_lsq,
-      ", rob ", loadstore.inherit_sfr_rob, ", r", loadstore.inherit_sfr_physreg, ")";
-    if (loadstore.predicted_alias) os << "; stalled by predicted aliasing";
-    break;
+  int refcounts[PHYS_REG_FILE_COUNT][MAX_PHYS_REG_FILE_SIZE];
+  memset(refcounts, 0, sizeof(refcounts));
+
+  foreach (rfid, PHYS_REG_FILE_COUNT) {
+    // Null physreg in each register file is special and can never be freed:
+    refcounts[rfid][PHYS_REG_NULL]++;
   }
-  case EVENT_LOAD_HIT: 
-  case EVENT_LOAD_MISS: {
-    if (type == EVENT_LOAD_HIT)
-      os << (loadstore.load_store_second_phase ? "load2 " : "load  ");
-    else os << (loadstore.load_store_second_phase ? "ldmis2" : "ldmiss");
 
-    os << " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
-    if (loadstore.inherit_sfr_used) {
-      os << "inherit from ", loadstore.inherit_sfr, " (uuid ", loadstore.inherit_sfr_uuid,
-        ", rob", loadstore.inherit_sfr_rob, ", lsq ", loadstore.inherit_sfr_lsq,
-        ", r", loadstore.inherit_sfr_physreg, "); ";
+  foreach_forward(ROB, i) {
+    ReorderBufferEntry& rob = ROB[i];
+    foreach (j, MAX_OPERANDS) {
+      refcounts[rob.operands[j]->rfid][rob.operands[j]->index()]++;
     }
-    if (type == EVENT_LOAD_HIT)
-      os << "hit L1: value 0x", hexstring(loadstore.sfr.data, 64);
-    else os << "missed L1 (lfrqslot ", lfrqslot, ") [value would be 0x", hexstring(loadstore.sfr.data, 64), "]";
-    break;
   }
-  case EVENT_LOAD_BANK_CONFLICT: {
-    os << "ldbank", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
-      "L1 bank conflict over bank ", lowbits(loadstore.sfr.physaddr, log2(CacheSubsystem::L1_DCACHE_BANKS));
-    break;
+
+  foreach (i, TRANSREG_COUNT) {
+    refcounts[commitrrt[i]->rfid][commitrrt[i]->index()]++;
+    refcounts[specrrt[i]->rfid][specrrt[i]->index()]++;
   }
-  case EVENT_LOAD_TLB_MISS: {
-    os << (loadstore.load_store_second_phase ? "ldtlb2" : "ldtlb ");  
-    os << " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
-    if (loadstore.inherit_sfr_used) {
-      os << "inherit from ", loadstore.inherit_sfr, " (uuid ", loadstore.inherit_sfr_uuid,
-        ", rob", loadstore.inherit_sfr_rob, ", lsq ", loadstore.inherit_sfr_lsq,
-        ", r", loadstore.inherit_sfr_physreg, "); ";
+
+  bool errors = 0;
+
+  foreach (rfid, PHYS_REG_FILE_COUNT) {
+    PhysicalRegisterFile& physregs = physregfiles[rfid];
+    foreach (i, physregs.size) {
+      if unlikely (physregs[i].refcount != refcounts[rfid][i]) {
+        logfile << "ERROR: r", i, " refcount is ", physregs[i].refcount, " but should be ", refcounts[rfid][i], endl;
+
+        foreach_forward(ROB, r) {
+          ReorderBufferEntry& rob = ROB[r];
+          foreach (j, MAX_OPERANDS) {
+            if ((rob.operands[j]->index() == i) & (rob.operands[j]->rfid == rfid)) logfile << "  ROB ", r, " operand ", j, endl;
     }
-    else os << "DTLB miss", " [value would be 0x", hexstring(loadstore.sfr.data, 64), "]";
-    break;
   }
-  case EVENT_LOAD_LOCK_REPLAY: {
-    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
-      "replay because vcpuid ", loadstore.locking_vcpuid, " uop uuid ", loadstore.locking_uuid, " has lock";
-    break;
+
+        foreach (j, TRANSREG_COUNT) {
+          if ((commitrrt[j]->index() == i) & (commitrrt[j]->rfid == rfid)) logfile << "  CommitRRT ", arch_reg_names[j], endl;
+          if ((specrrt[j]->index() == i) & (specrrt[j]->rfid == rfid)) logfile << "  SpecRRT ", arch_reg_names[j], endl;
   }
-  case EVENT_LOAD_LOCK_OVERFLOW: {
-    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
-      "replay because locking required but no free interlock buffers", endl;
-    break;
+
+        errors = 1;
   }
-  case EVENT_LOAD_LOCK_ACQUIRED: {
-    os << "lk-acq", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
-      "lock acquired";
-    break;
   }
-  case EVENT_LOAD_LFRQ_FULL:
-    os << "load   rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), ": LFRQ or miss buffer full; replaying"; break;
-  case EVENT_LOAD_HIGH_ANNULLED: {
-    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
-    os << "load was annulled (high unaligned load)";
-    break;
   }
-  case EVENT_LOAD_WAKEUP:
-    os << "ldwake rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " wakeup load via lfrq slot ", lfrqslot; break;
-  case EVENT_TLBWALK_HIT: {
-    os << "wlkhit rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
-      loadstore.tlb_walk_level, "): hit for PTE at phys ", (void*)loadstore.virtaddr; break;
-    break;
+
+  if (errors) assert(false);
   }
-  case EVENT_TLBWALK_MISS: {
-    os << "wlkmis rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
-      loadstore.tlb_walk_level, "): miss for PTE at phys ", (void*)loadstore.virtaddr, ": lfrq ", lfrqslot; break;
-    break;
+
+void OutOfOrderCore::check_rob() { // Debug consistency check of ROB entries and ROB state lists; asserts on the first violation found
+  // TODO: the forward_cycle check below should cover every thread of the core;
+  // for now it only inspects threads[0]:
+  ThreadContext& thread = *threads[0];
+  Queue<ReorderBufferEntry, ROB_SIZE>& ROB = thread.ROB;
+
+  foreach (i, ROB_SIZE) {
+    ReorderBufferEntry& rob = ROB[i];
+    if (!rob.entry_valid) continue; // only valid entries are range-checked
+    assert(inrange((int)rob.forward_cycle, 0, (MAX_FORWARDING_LATENCY+1)-1));
   }
-  case EVENT_TLBWALK_WAKEUP: {
-    os << "wlkwak rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
-      loadstore.tlb_walk_level, "): wakeup from cache miss for phys ", (void*)loadstore.virtaddr, ": lfrq ", lfrqslot; break;
-    break;
+
+  foreach (i, threadcount) {
+    ThreadContext* thread = threads[i];
+    foreach (i, rob_states.count) { // NOTE(review): this inner 'i' shadows the thread index above, and the bound is the unqualified rob_states.count (not thread->rob_states.count) -- confirm both are intended
+      StateList& list = *(thread->rob_states[i]);
+      ReorderBufferEntry* rob;
+      foreach_list_mutable(list, rob, entry, nextentry) {
+        assert(inrange(rob->index(), 0, ROB_SIZE-1));
+        assert(rob->current_state_list == &list); // each entry must point back at the list it is linked on
+        if (!((rob->current_state_list != &thread->rob_free_list) ? rob->entry_valid : (!rob->entry_valid))) { // entries on the free list must be invalid, all others valid
+          logfile << "ROB ", rob->index(), " list = ", rob->current_state_list->name, " entry_valid ", rob->entry_valid, endl, flush;
+          dump_smt_state(logfile); // log the full state before aborting
+        assert(false);
   }
-  case EVENT_TLBWALK_NO_LFRQ_MB: {
-    os << "wlknml rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
-      loadstore.tlb_walk_level, "): no LFRQ or MB for PTE at phys ", (void*)loadstore.virtaddr, ": lfrq ", lfrqslot; break;
-    break;
   }
-  case EVENT_TLBWALK_COMPLETE: {
-    os << "wlkhit rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
-      loadstore.tlb_walk_level, "): complete!"; break;
-    break;
   }
-  case EVENT_LOAD_EXCEPTION: {
-    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, ": exception ", exception_name(exception), ", pfec ", PageFaultErrorCode(error_code);
-    break;
   }
-  case EVENT_STORE_EXCEPTION: {
-    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
-      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
-      (void*)(Waddr)loadstore.virtaddr, ": exception ", exception_name(exception), ", pfec ", PageFaultErrorCode(error_code);
-    break;
   }
-  case EVENT_ALIGNMENT_FIXUP:
-    os << "algnfx", " rip ", rip, ": set unaligned bit for uop ", uop.bbindex, " (unaligned predictor slot ", OutOfOrderCore::hash_unaligned_predictor_slot(rip), ") and refetch"; break;
-  case EVENT_FENCE_ISSUED:
-    os << "mfence rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " lsq ", lsq, " r", intstring(physreg, -3), ": memory fence (", uop, ")"; break;
-  case EVENT_ANNUL_NO_FUTURE_UOPS:
-    os << "misspc rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": SOM rob ", annul.somidx, ", EOM rob ", annul.eomidx, ": no future uops to annul"; break;
-  case EVENT_ANNUL_MISSPECULATION: {
-    os << "misspc rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": SOM rob ", annul.somidx, 
-      ", EOM rob ", annul.eomidx, ": annul from rob ", annul.startidx, " to rob ", annul.endidx;
-    break;
+
+ostream& LoadStoreQueueEntry::print(ostream& os) const { // Writes a one-line summary of this LSQ entry to os and returns os
+  os << (store ? "st" : "ld"), intstring(index(), -3), " "; // entry kind and LSQ index
+  os << "uuid ", intstring(rob->uop.uuid, 10), " ";
+  os << "rob ", intstring(rob->index(), -3), " ";
+  os << "r", intstring(rob->physreg->index(), -3);
+  if (PHYS_REG_FILE_COUNT > 1) os << "@", getcore().physregfiles[rob->physreg->rfid].name; // register file name is only shown when there is more than one file
+  os << " ";
+  if (invalid) {
+    os << "< Invalid: fault 0x", hexstring(data, 8), " > "; // for invalid entries the data field is printed as the fault value
+  } else {
+    if (datavalid)
+      os << bytemaskstring((const byte*)&data, bytemask, 8);
+    else os << "<    Data Invalid     >";
+    os << " @ ";
+    if (addrvalid)
+      os << "0x", hexstring(physaddr << 3, 48); // NOTE(review): presumably physaddr is kept in 8-byte units, hence the shift -- confirm
+    else os << "< Addr Inval >";
   }
-  case EVENT_ANNUL_EACH_ROB: {
-    os << "annul  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": annul rip ", rip;
-    os << (uop.som ? " SOM" : "    "); os << (uop.eom ? " EOM" : "    ");
-    os << ": free";
-    os << " r", physreg;
-    if (ld|st) os << " lsq", lsq;
-    if (lfrqslot >= 0) os << " lfrq", lfrqslot;
-    if (annul.annulras) os << " ras";
-    break;
+  return os;
   }
-  case EVENT_ANNUL_PSEUDOCOMMIT: {
-    os << "pseucm rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": r", physreg, " rebuild rrt:";
-    os << " arch ", arch_reg_names[uop.rd];
-    if likely (!uop.nouserflags) {
-      if (uop.setflags & SETFLAG_ZF) os << " zf";
-      if (uop.setflags & SETFLAG_CF) os << " cf";
-      if (uop.setflags & SETFLAG_OF) os << " of";
+
+//
+// Barriers must flush the fetchq and stall the frontend until
+// after the barrier is consumed. Execution resumes at the address
+// in internal register nextrip (rip after the instruction) after
+// handling the barrier in microcode.
+// Returns false only when a switch to native mode was requested (non-hypervisor builds); true otherwise.
+bool ThreadContext::handle_barrier() {
+  // Release resources of everything in the pipeline:
+
+  core_to_external_state();
+  assert(flush_pipeline()); // NOTE(review): the flush is a side effect inside assert(); confirm the project's assert macro always evaluates its argument
+
+  int assistid = ctx.commitarf[REG_rip]; // the assist id is read from the committed rip register
+  assist_func_t assist = (assist_func_t)(Waddr)assistid_to_func[assistid]; // NOTE(review): assistid is used as an index without a range check -- confirm callers guarantee validity
+
+  // SD: Print event log for undefined opcodes.
+  if unlikely (assistid == ASSIST_INVALID_OPCODE) {
+    if unlikely (config.event_log_enabled)
+      getcore().eventlog.print(logfile);
     }
-    os << " = r", physreg;
-    break;
+
+  if (logable(4)) {
+    logfile << "[vcpu ", ctx.vcpuid, "] Barrier (#", assistid, " -> ", (void*)assist, " ", assist_name(assist), " called from ",
+      (RIPVirtPhys(ctx.commitarf[REG_selfrip]).update(ctx)), "; return to ", (void*)(Waddr)ctx.commitarf[REG_nextrip],
+      ") at ", sim_cycle, " cycles, ", total_user_insns_committed, " commits", endl, flush;
   }
-  case EVENT_ANNUL_FETCHQ_RAS:
-    os << "anlras rip ", rip, ": annul RAS update still in fetchq"; break;
-  case EVENT_ANNUL_FLUSH:
-    os << "flush  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " rip ", rip; break;
-  case EVENT_REDISPATCH_DEPENDENTS:
-    os << "redisp rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " find all dependents"; break;
-  case EVENT_REDISPATCH_DEPENDENTS_DONE:
-    os << "redisp rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " redispatched ", (redispatch.count - 1), " dependent uops"; break;
-  case EVENT_REDISPATCH_EACH_ROB: {
-    os << "redisp rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " from state ", redispatch.current_state_list->name, ": dep on ";
-    if (!redispatch.dependent_operands) {
-      os << " [self]";
-    } else {
-      foreach (i, MAX_OPERANDS) {
-        if (bit(redispatch.dependent_operands, i)) os << " ", redispatch.opinfo[i];
+
+  if (logable(6)) logfile << "Calling assist function at ", (void*)assist, "...", endl, flush;
+
+  update_assist_stats(assist);
+  if (logable(6)) {
+    logfile << "Before assist:", endl, ctx, endl;
+#ifdef PTLSIM_HYPERVISOR
+    logfile << sshinfo, endl;
+#endif
       }
+
+  assist(ctx); // run the assist handler on this thread's context
+
+  if (logable(6)) {
+    logfile << "Done with assist", endl;
+    logfile << "New state:", endl;
+    logfile << ctx;
+#ifdef PTLSIM_HYPERVISOR
+    logfile << sshinfo;
+#endif
     }
 
-    os << "; redispatch ";
-    os << " [rob ", rob, "]";
-    os << " [physreg ", physreg, "]";
-    if (ld|st) os << " [lsq ", lsq, "]";
-    if (redispatch.iqslot) os << " [iqslot]";
-    if (lfrqslot >= 0) os << " [lfrqslot ", lfrqslot, "]";
-    if (redispatch.opinfo[RS].physreg != PHYS_REG_NULL) os << " [inheritsfr ", redispatch.opinfo[RS], "]";
+  // Flush again, but restart at possibly modified rip
+  assert(flush_pipeline());
 
-    break;
+#ifndef PTLSIM_HYPERVISOR
+  if (requested_switch_to_native) {
+    logfile << "PTL call requested switch to native mode at rip ", (void*)(Waddr)ctx.commitarf[REG_rip], endl;
+    return false; // the only path that returns false
   }
-  case EVENT_COMPLETE:
-    os << "complt rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " on ", padstring(fu_names[fu], -4), ": r", intstring(physreg, -3); break;
-  case EVENT_FORWARD: {
-    os << "forwd", forwarding.forward_cycle, " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", 
-      " (", clusters[cluster].name, ") r", intstring(physreg, -3), 
-      " => ", "uuid ", forwarding.target_uuid, " rob ", forwarding.target_rob,
-      " (", clusters[forwarding.target_cluster].name, ") r", forwarding.target_physreg,
-      " operand ", forwarding.operand;
-    if (forwarding.target_st) os << " => st", forwarding.target_lsq;
-    os << " [still waiting?";
-    foreach (i, MAX_OPERANDS) { if (!bit(forwarding.target_operands_ready, i)) os << " r", (char)('a' + i); }
-    if (forwarding.target_all_operands_ready) os << " READY";
-    os << "]";
-    break;
+#endif
+  return true;
   }
-  case EVENT_BROADCAST: {
-    os << "brcst", forwarding.forward_cycle, " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", 
-      " from cluster ", clusters[cluster].name, " to cluster ", clusters[forwarding.target_cluster].name,
-      " on forwarding cycle ", forwarding.forward_cycle;
-    break;
+
+bool ThreadContext::handle_exception() { // Handles the exception recorded in ctx.exception; returns true when execution can resume
+  // Release resources of everything in the pipeline:
+  core_to_external_state();
+  assert(flush_pipeline()); // NOTE(review): side effect inside assert(); confirm the assert macro always evaluates its argument
+
+  if (logable(4)) {
+    logfile << "[vcpu ", ctx.vcpuid, "] Exception ", exception_name(ctx.exception), " called from rip ", (void*)(Waddr)ctx.commitarf[REG_rip],
+      " at ", sim_cycle, " cycles, ", total_user_insns_committed, " commits", endl, flush;
   }
-  case EVENT_WRITEBACK: {
-    os << "write  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " (cluster ", clusters[cluster].name, ") r", intstring(physreg, -3), "@", phys_reg_file_names[rfid], " = 0x", hexstring(writeback.data, 64), " ", flagstring(writeback.flags);
-    if (writeback.transient) os << " (transient)";
-    os << " (", writeback.consumer_count, " consumers";
-    if (writeback.all_consumers_sourced_from_bypass) os << ", all from bypass";
-    if (writeback.no_branches_between_renamings) os << ", no intervening branches";
-    if (writeback.dest_renamed_before_writeback) os << ", dest renamed before writeback";
-    os << ")";
+
+  //
+  // CheckFailed and SkipBlock exceptions are raised by the chk uop.
+  // This uop is used at the start of microcoded instructions to assert
+  // that certain conditions are true so complex corrective actions can
+  // be taken if the check fails.
+  //
+  // SkipBlock is a special case used for checks at the top of REP loops.
+  // Specifically, if the %rcx register is zero on entry to the REP, no
+  // action at all is to be taken; the rip should simply advance to
+  // whatever is in chk_recovery_rip and execution should resume.
+  //
+  // CheckFailed exceptions usually indicate the processor needs to take
+  // evasive action to avoid a user visible exception. For instance,
+  // CheckFailed is raised when an inlined floating point operand is
+  // denormal or otherwise cannot be handled by inlined fastpath uops,
+  // or when some unexpected segmentation or page table conditions
+  // arise.
+  //
+  if (ctx.exception == EXCEPTION_SkipBlock) {
+    ctx.commitarf[REG_rip] = chk_recovery_rip;
+    if (logable(6)) logfile << "SkipBlock pseudo-exception: skipping to ", (void*)(Waddr)ctx.commitarf[REG_rip], endl, flush;
+    assert(flush_pipeline());
+    return true;
+  }
+
+#ifdef PTLSIM_HYPERVISOR
+  //
+  // Map PTL internal hardware exceptions to their x86 equivalents,
+  // depending on the context. The error_code field should already
+  // be filled out.
+  //
+  // Exceptions not listed here are propagated by microcode
+  // rather than the processor itself.
+  //
+  switch (ctx.exception) {
+  case EXCEPTION_PageFaultOnRead:
+  case EXCEPTION_PageFaultOnWrite:
+  case EXCEPTION_PageFaultOnExec:
+    ctx.x86_exception = EXCEPTION_x86_page_fault; break;
+  case EXCEPTION_FloatingPointNotAvailable:
+    ctx.x86_exception = EXCEPTION_x86_fpu_not_avail; break;
+  case EXCEPTION_FloatingPoint:
+    ctx.x86_exception = EXCEPTION_x86_fpu; break;
+  case EXCEPTION_ASF_Abort:
+    // SD NOTE: These set the proper x86_exception field already in asf.cpp!
     break;
+  default:
+    logfile << "Unsupported internal exception type ", exception_name(ctx.exception), endl, flush;
+    assert(false); // any other internal exception type is treated as fatal here
   }
-  case EVENT_COMMIT_FENCE_COMPLETED:
-    os << "mfcmit rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " fence committed: wake up waiting memory uops"; break;
-  case EVENT_COMMIT_EXCEPTION_DETECTED:
-    os << "detect rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " exception ", exception_name(exception), " (", exception, "), error code ", hexstring(error_code, 16), ", origvirt ", (void*)(Waddr)commit.origvirt; break;
-  case EVENT_COMMIT_EXCEPTION_ACKNOWLEDGED:
-    os << "except rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " exception ", exception_name(exception), " [EOM #", commit.total_user_insns_committed, "]"; break;
-  case EVENT_COMMIT_SKIPBLOCK:
-    os << "skipbk rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " skip block: advance rip by ", uop.bytes, " to ", (void*)(Waddr)(rip.rip + uop.bytes), " [EOM #", commit.total_user_insns_committed, "]"; break;
-  case EVENT_COMMIT_SMC_DETECTED:
-    os << "smcdet rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " self-modifying code at rip ", rip, " detected (mfn was dirty); invalidate and retry [EOM #", commit.total_user_insns_committed, "]"; break;
-  case EVENT_COMMIT_MEM_LOCKED:
-    os << "waitlk rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " wait for lock on physaddr ", (void*)(commit.state.st.physaddr << 3), " to be released"; break;
-  case EVENT_COMMIT_OK: {
-    os << "commit rob ", intstring(rob, -3), "(",padstring(uopname,-5),")";
-    if likely (archdest_can_commit[uop.rd])
-                os << " [rrt ", arch_reg_names[uop.rd], " = r", physreg, " 0x", hexstring(commit.state.reg.rddata, 64), "]";
 
-    if ((!uop.nouserflags) && uop.setflags) {
-      os << " [flags ", ((uop.setflags & SETFLAG_ZF) ? "z" : ""), 
-        ((uop.setflags & SETFLAG_CF) ? "c" : ""), ((uop.setflags & SETFLAG_OF) ? "o" : ""),
-        " -> ", flagstring(commit.state.reg.rdflags), "]";
+  if (logable(4)) {
+    logfile << ctx;
+    logfile << sshinfo;
     }
 
-    if (uop.eom) os << " [rip = ", (void*)(Waddr)commit.target_rip, "]";
+  ctx.propagate_x86_exception(ctx.x86_exception, ctx.error_code, ctx.cr2); // hand the mapped x86 exception to the context
 
-    if unlikely (st && (commit.state.st.bytemask != 0))
-                  os << " [mem ", (void*)(Waddr)(commit.state.st.physaddr << 3), " = ", bytemaskstring((const byte*)&commit.state.st.data, commit.state.st.bytemask, 8), " mask ", bitstring(commit.state.st.bytemask, 8, true), "]";
+  // Flush again, but restart at modified rip
+  assert(flush_pipeline());
 
-    if unlikely (commit.pteupdate.a | commit.pteupdate.d | commit.pteupdate.ptwrite) {
-      os << " [pte:";
-      if (commit.pteupdate.a) os << " a";
-      if (commit.pteupdate.d) os << " d";
-      if (commit.pteupdate.ptwrite) os << " w";
-      os << "]";
-    }
+  return true;
+#else
+  if (logable(6))
+    logfile << "Exception (", exception_name(ctx.exception), " called from ", (void*)(Waddr)ctx.commitarf[REG_rip],
+      ") at ", sim_cycle, " cycles, ", total_user_insns_committed, " commits", endl, flush;
         
-    if unlikely (ld|st) {
-      os << " [lsq ", lsq, "]";
-      os << " [upslot ", OutOfOrderCore::hash_unaligned_predictor_slot(rip), " = ", commit.ld_st_truly_unaligned, "]";
-    }
+  // Non-hypervisor builds: the exception is reported in the log together with the context, then the simulator aborts.
+  logfile << exception_name(ctx.exception), " detected at fault rip ", (void*)(Waddr)ctx.commitarf[REG_rip], " @ ",
+    total_user_insns_committed, " commits (", total_uops_committed, " uops): genuine user exception (",
+    exception_name(ctx.exception), "); aborting", endl;
+  logfile << ctx, endl;
+  logfile << flush;
         
-    if likely (commit.oldphysreg > 0) {
-      if unlikely (commit.oldphysreg_refcount) {
-        os << " [pending free old r", commit.oldphysreg, " ref by";
-        os << " refcount ", commit.oldphysreg_refcount;
-        os << "]";
-      } else {
-        os << " [free old r", commit.oldphysreg, "]";
-      }
+  logfile << "Aborting...", endl, flush;
+  cerr << "Aborting...", endl, flush;
+
+  assert(false);
+  return false; // only reached if assert(false) does not terminate
+#endif
     }
 
-    os << " [commit r", physreg, "]";
+bool ThreadContext::handle_interrupt() { // Delivers pending events via ctx.event_upcall() in hypervisor builds; always returns true
+#ifdef PTLSIM_HYPERVISOR
+  // Release resources of everything in the pipeline:
+  core_to_external_state();
+  assert(flush_pipeline()); // NOTE(review): side effect inside assert(); confirm the assert macro always evaluates its argument
 
-    foreach (i, MAX_OPERANDS) {
-      if unlikely (commit.operand_physregs[i] != PHYS_REG_NULL) os << " [unref r", commit.operand_physregs[i], "]";
+  if (logable(6)) {
+    logfile << "[vcpu ", threadid, "] interrupts pending at ", sim_cycle, " cycles, ", total_user_insns_committed, " commits", endl, flush; // NOTE(review): prints threadid under the "vcpu" label, whereas handle_barrier/handle_exception print ctx.vcpuid -- confirm
+    logfile << "Context at interrupt:", endl;
+    logfile << ctx;
+    logfile << sshinfo;
+    logfile.flush();
     }
 
-    if unlikely (br) {
-      os << " [brupdate", (commit.taken ? " tk" : " nt"), (commit.predtaken ? " pt" : " np"), ((commit.taken == commit.predtaken) ? " ok" : " MP"), "]";
-    }
+  ctx.event_upcall(); // presumably redirects the context to the guest's event handler (see the "after interrupt redirect" log below) -- confirm
         
-    if (uop.eom) os << " [EOM #", commit.total_user_insns_committed, "]";
-    break;
+  if (logable(6)) {
+    logfile <<  "[vcpu ", threadid, "] after interrupt redirect:", endl;
+    logfile << ctx;
+    logfile << sshinfo;
+    logfile.flush();
   }
-  case EVENT_COMMIT_ASSIST: {
-    os << "assist rob ", intstring(rob, -3), " calling assist ", (void*)rip.rip, " (#",
-      assist_index((assist_func_t)rip.rip), ": ", assist_name((assist_func_t)rip.rip), ")";
-    break;
+
+  // Flush again, but restart at modified rip
+  assert(flush_pipeline());
+#endif
+  return true;
   }
-  case EVENT_RECLAIM_PHYSREG:
-    os << "free   r", physreg, " no longer referenced; moving to free state"; break;
-  case EVENT_RELEASE_MEM_LOCK: {
-    os << "unlkcm", " phys ", (void*)(loadstore.sfr.physaddr << 3), ": lock release committed";
-    break;
+
+//
+// Event Formatting
+//
+void PhysicalRegister::fill_operand_info(PhysicalRegisterOperandInfo& opinfo) { // copies this register's bookkeeping fields into opinfo
+  opinfo.archreg = archreg;
+  opinfo.rfid = rfid;
+  opinfo.state = state;
+  opinfo.physreg = index();
+  if (rob) { // rob and uuid are only written when a ROB entry is attached
+    opinfo.uuid = rob->uop.uuid;
+    opinfo.rob = rob->index();
   }
-  default:
-    os << "?????? unknown event type ", type;
-    break;
   }
 
-  os << endl;
+ostream& OutOfOrderModel::operator <<(ostream& os, const PhysicalRegisterOperandInfo& opinfo) {
+  // Bracketed summary: "[r<physreg> <state> <detail>]", where the detail depends on the state.
+  os << "[r", opinfo.physreg, " ", short_physreg_state_names[opinfo.state], " ";
+  const int st = opinfo.state;
+  if ((st == PHYSREG_WAITING) || (st == PHYSREG_BYPASS) || (st == PHYSREG_WRITTEN)) {
+    // these states print the recorded ROB index and uop uuid
+    os << "rob ", opinfo.rob, " uuid ", opinfo.uuid;
+  } else if ((st == PHYSREG_ARCH) || (st == PHYSREG_PENDINGFREE)) {
+    // these states print the architectural register name; any other state prints no detail
+    os << arch_reg_names[opinfo.archreg];
+  }
+  os << "]";
   return os;
 }
 
@@ -1672,16 +1795,28 @@
 //
 
 bool OutOfOrderMachine::init(PTLsimConfig& config) {
+#ifdef ENABLE_SMT
   // Note: we only create a single core for all contexts for now.
   cores[0] = new OutOfOrderCore(0, *this);
+  corecount = 1; // SMT: every context becomes a thread of core 0
+#else
+  corecount = 0; // non-SMT: one core is created per context in the loop below
+#endif
 
   foreach (i, contextcount) {
+#ifdef ENABLE_SMT
     OutOfOrderCore& core = *cores[0];
+#else
+    cores[corecount] = new OutOfOrderCore(corecount, *this);
+    OutOfOrderCore& core    = *cores[corecount];
+    corecount++;
+#endif
+
+    ThreadContext* thread = new ThreadContext(core, core.threadcount, contextof(i)); // thread id is the next free slot on the chosen core
+    core.threads[core.threadcount] = thread;
     core.threadcount++;
-    ThreadContext* thread = new ThreadContext(core, i, contextof(i));
-    core.threads[i] = thread;
     thread->init();
-
+    logfile << "New ThreadContext: Core ", core.coreid, " (", core.threadcount," threads) Thread ", thread->threadid, " &thread=", thread, " &core=", &core, " total cores ", corecount, endl, flush;
     //
     // Note: in a multi-processor model, config may
     // specify various ways of slicing contextcount up
@@ -1691,8 +1826,9 @@
     //
   }
 
-  cores[0]->init();
+  foreach (i, corecount) cores[i]->init(); // initialize every core created above
   init_luts();
+
   return true;
 }
 
@@ -1713,14 +1849,17 @@
     logenable = 1;
   }
 
-  cores[0]->reset();
-  cores[0]->flush_pipeline_all();
+  foreach (i, corecount) {
+    cores[i]->reset();
+    cores[i]->flush_pipeline_all();
+
 
   logfile << "IssueQueue states:", endl;
 
-  if unlikely (config.event_log_enabled && (!cores[0]->eventlog.start)) {
-    cores[0]->eventlog.init(config.event_log_ring_buffer_size);
-    cores[0]->eventlog.logfile = &logfile;
+    if unlikely (config.event_log_enabled && (!cores[i]->eventlog.start)) {
+      cores[i]->eventlog.init(config.event_log_ring_buffer_size);
+      cores[i]->eventlog.logfile = &logfile;
+    }
   }
 
   bool exiting = false;
@@ -1735,8 +1874,9 @@
     update_progress();
     inject_events();
 
-    OutOfOrderCore& core =* cores[0]; // only one core for now
     int running_thread_count = 0;
+    foreach (j, corecount) {
+      OutOfOrderCore& core =* cores[j];
     foreach (i, core.threadcount) {
       ThreadContext* thread = core.threads[i];
 #ifdef PTLSIM_HYPERVISOR
@@ -1749,18 +1889,21 @@
         } else {
           if (thread->ctx.check_events()) thread->handle_interrupt();
         }
-        continue;
+          continue; /* NB, SD: Back to foreach (i, core.threadcount), that doesn't make much sense in the original impl, either! */
       }
 #endif
     }
 
     exiting |= core.runcycle();
+    }
 
     if unlikely (check_for_async_sim_break() && (!stopping)) {
       logfile << "Waiting for all VCPUs to reach stopping point, starting at cycle ", sim_cycle, endl;
       // force_logging_enabled();
-      OutOfOrderCore& core =* cores[0];
+      foreach (j, corecount) {
+        OutOfOrderCore& core = *cores[j];
       foreach (i, core.threadcount) core.threads[i]->stop_at_next_eom = 1;
+      }
       if (config.abort_at_end) {
         config.abort_at_end = 0;
         logfile << "Abort immediately: do not wait for next x86 boundary nor flush pipelines", endl;
@@ -1786,7 +1929,8 @@
 
   logfile << "Exiting out-of-order core at ", total_user_insns_committed, " commits, ", total_uops_committed, " uops and ", iterations, " iterations (cycles)", endl;
 
-  OutOfOrderCore& core =* cores[0]; /// only one core for now.
+  foreach (j, corecount) {
+    OutOfOrderCore& core =* cores[j];
 
   foreach (i, core.threadcount) {
     ThreadContext* thread = core.threads[i];
@@ -1798,6 +1942,7 @@
       logfile << thread->ctx;
     }
   }
+  }
 
   config.dump_state_now = 0;
 
@@ -1823,9 +1968,17 @@
   if unlikely (selective) {
     dn = caches.dtlb.flush_virt(virtaddr, threadid);
     in = caches.itlb.flush_virt(virtaddr, threadid);
+#ifdef USE_L2_TLB
+    dn += caches.l2dtlb.flush_virt(virtaddr, threadid);
+    in += caches.l2itlb.flush_virt(virtaddr, threadid);
+#endif
   } else {
     dn = caches.dtlb.flush_thread(threadid);
     in = caches.itlb.flush_thread(threadid);
+#ifdef USE_L2_TLB
+    dn += caches.l2dtlb.flush_thread(threadid);
+    in += caches.l2itlb.flush_thread(threadid);
+#endif
   }
   if (logable(5)) {
     logfile << "Flushed ", dn, " DTLB slots and ", in, " ITLB slots", endl;
@@ -1835,16 +1988,26 @@
 }
 
 void OutOfOrderMachine::flush_tlb(Context& ctx) {
-  // This assumes all VCPUs are mapped as threads in a single SMT core
+  // VCPUs are mapped statically: with ENABLE_SMT each VCPU is a thread of core 0, otherwise each VCPU has its own core (thread 0)
+#ifdef ENABLE_SMT
   int coreid = 0;
   int threadid = ctx.vcpuid;
+#else
+  int coreid = ctx.vcpuid;
+  int threadid = 0;
+#endif
   cores[coreid]->flush_tlb(ctx, threadid);
 }
 
 void OutOfOrderMachine::flush_tlb_virt(Context& ctx, Waddr virtaddr) {
-  // This assumes all VCPUs are mapped as threads in a single SMT core
+  // VCPUs are mapped statically: with ENABLE_SMT each VCPU is a thread of core 0, otherwise each VCPU has its own core (thread 0)
+#ifdef ENABLE_SMT
   int coreid = 0;
   int threadid = ctx.vcpuid;
+#else
+  int coreid = ctx.vcpuid;
+  int threadid = 0;
+#endif
   cores[coreid]->flush_tlb(ctx, threadid, true, virtaddr);
 }
 
@@ -1950,7 +2113,8 @@
   }  
 }
 
-OutOfOrderMachine ooomodel("ooo");
+/* This build includes ASF support, so the model is registered under the name "asfooo" instead of "ooo". */
+OutOfOrderMachine ooomodel("asfooo");
 
 OutOfOrderCore& OutOfOrderModel::coreof(int coreid) {
   return *ooomodel.cores[coreid];
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/ooocore.h ptlsim-asf/ooocore.h
--- ptlsim-stable/ooocore.h	2009-10-30 19:40:25.126182000 +0100
+++ ptlsim-asf/ooocore.h	2010-03-02 12:00:55.498593000 +0100
@@ -1,2055 +1,53 @@
 // -*- c++ -*-
 //
 // PTLsim: Cycle Accurate x86-64 Simulator
-// Out-of-Order Core Simulator
+// Out-of-Order Core Simulator Configuration
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+//
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
 //
 // Copyright 2003-2008 Matt T. Yourst <yourst@yourst.com>
 // Copyright 2006-2008 Hui Zeng <hzeng@cs.binghamton.edu>
+// Copyright (c) 2008-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #ifndef _OOOCORE_H_
 #define _OOOCORE_H_
 
+#include <ptlsim.h>
+
 // With these disabled, simulation is faster
 #define ENABLE_CHECKS
 #define ENABLE_LOGGING
 
-//
-// Enable SMT operation:
-//
-// Note that this limits some configurations of resources and
-// issue queues that would normally be possible in single
-// threaded mode.
-//
-
-#ifdef PTLSIM_HYPERVISOR
-#define ENABLE_SMT
-#endif
-
-static const int MAX_THREADS_BIT = 4; // up to 16 threads
-static const int MAX_ROB_IDX_BIT = 12; // up to 4096 ROB entries
-
-#ifdef ENABLE_SMT
-static const int MAX_THREADS_PER_CORE = 2;
-#else
-static const int MAX_THREADS_PER_CORE = 1;
-#endif
-
-//#define ENABLE_SIM_TIMING
-#ifdef ENABLE_SIM_TIMING
-#define time_this_scope(ct) CycleTimerScope ctscope(ct)
-#define start_timer(ct) ct.start()
-#define stop_timer(ct) ct.stop()
-#else
-#define time_this_scope(ct) (0)
-#define start_timer(ct) (0)
-#define stop_timer(ct) (0)
-#endif
-
-#define per_context_ooocore_stats_ref(vcpuid) (*(((PerContextOutOfOrderCoreStats*)&stats.ooocore.vcpu0) + (vcpuid)))
-#define per_context_ooocore_stats_update(vcpuid, expr) stats.ooocore.total.expr, per_context_ooocore_stats_ref(vcpuid).expr
-
-namespace OutOfOrderModel {
-  //
-  // Operand formats
-  //
-  static const int MAX_OPERANDS = 4;
-  static const int RA = 0;
-  static const int RB = 1;
-  static const int RC = 2;
-  static const int RS = 3; // (for stores only)
-
-  //
-  // Uop to functional unit mappings
-  //
-  static const int FU_COUNT = 8;
-  static const int LOADLAT = 2;
-
-  enum {
-    FU_LDU0       = (1 << 0),
-    FU_STU0       = (1 << 1),
-    FU_LDU1       = (1 << 2),
-    FU_STU1       = (1 << 3),
-    FU_ALU0       = (1 << 4),
-    FU_FPU0       = (1 << 5),
-    FU_ALU1       = (1 << 6),
-    FU_FPU1       = (1 << 7),
-  };
-
-  static const int LOAD_FU_COUNT = 2;
-
-  const char* fu_names[FU_COUNT] = {
-    "ldu0",
-    "stu0",
-    "ldu1",
-    "stu1",
-    "alu0",
-    "fpu0",
-    "alu1",
-    "fpu1",
-  };
-
-  //
-  // Opcodes and properties
-  //
-#define ALU0 FU_ALU0
-#define ALU1 FU_ALU1
-#define STU0 FU_STU0
-#define STU1 FU_STU1
-#define LDU0 FU_LDU0
-#define LDU1 FU_LDU1
-#define FPU0 FU_FPU0
-#define FPU1 FU_FPU1
-#define A 1 // ALU latency, assuming fast bypass
-#define L LOADLAT
-
-#define ANYALU ALU0|ALU1
-#define ANYLDU LDU0|LDU1
-#define ANYSTU STU0|STU1
-#define ANYFPU FPU0|FPU1
-#define ANYINT ANYALU|ANYSTU|ANYLDU
-
-  struct FunctionalUnitInfo {
-    byte opcode;   // Must match definition in ptlhwdef.h and ptlhwdef.cpp! 
-    byte latency;  // Latency in cycles, assuming ideal bypass
-    W16  fu;       // Map of functional units on which this uop can issue
-  };
-
-  //
-  // WARNING: This table MUST be kept in sync with the table
-  // in ptlhwdef.cpp and the uop enum in ptlhwdef.h!
-  //
-  const FunctionalUnitInfo fuinfo[OP_MAX_OPCODE] = {
-    // name, latency, fumask
-    {OP_nop,            A, ANYINT|ANYFPU},
-    {OP_mov,            A, ANYINT|ANYFPU},
-    // Logical
-    {OP_and,            A, ANYINT|ANYFPU},
-    {OP_andnot,         A, ANYINT|ANYFPU},
-    {OP_xor,            A, ANYINT|ANYFPU},
-    {OP_or,             A, ANYINT|ANYFPU},
-    {OP_nand,           A, ANYINT|ANYFPU},
-    {OP_ornot,          A, ANYINT|ANYFPU},
-    {OP_eqv,            A, ANYINT|ANYFPU},
-    {OP_nor,            A, ANYINT|ANYFPU},
-    // Mask, insert or extract bytes
-    {OP_maskb,          A, ANYINT},
-    // Add and subtract
-    {OP_add,            A, ANYINT},
-    {OP_sub,            A, ANYINT},
-    {OP_adda,           A, ANYINT},
-    {OP_suba,           A, ANYINT},
-    {OP_addm,           A, ANYINT},
-    {OP_subm,           A, ANYINT},
-    // Condition code logical ops
-    {OP_andcc,          A, ANYINT},
-    {OP_orcc,           A, ANYINT},
-    {OP_xorcc,          A, ANYINT},
-    {OP_ornotcc,        A, ANYINT},
-    // Condition code movement and merging
-    {OP_movccr,         A, ANYINT},
-    {OP_movrcc,         A, ANYINT},
-    {OP_collcc,         A, ANYINT},
-    // Simple shifting (restricted to small immediate 1..8)
-    {OP_shls,           A, ANYINT},
-    {OP_shrs,           A, ANYINT},
-    {OP_bswap,          A, ANYINT},
-    {OP_sars,           A, ANYINT},
-    // Bit testing
-    {OP_bt,             A, ANYALU},
-    {OP_bts,            A, ANYALU},
-    {OP_btr,            A, ANYALU},
-    {OP_btc,            A, ANYALU},
-    // Set and select
-    {OP_set,            A, ANYINT},
-    {OP_set_sub,        A, ANYINT},
-    {OP_set_and,        A, ANYINT},
-    {OP_sel,            A, ANYINT},
-    {OP_sel_cmp,        A, ANYINT},
-    // Branches
-    {OP_br,             A, ANYINT},
-    {OP_br_sub,         A, ANYINT},
-    {OP_br_and,         A, ANYINT},
-    {OP_jmp,            A, ANYINT},
-    {OP_bru,            A, ANYINT},
-    {OP_jmpp,           A, ANYALU|ANYLDU},
-    {OP_brp,            A, ANYALU|ANYLDU},
-    // Checks
-    {OP_chk,            A, ANYINT},
-    {OP_chk_sub,        A, ANYINT},
-    {OP_chk_and,        A, ANYINT},
-    // Loads and stores
-    {OP_ld,             L, ANYLDU},
-    {OP_ldx,            L, ANYLDU},
-    {OP_ld_pre,         1, ANYLDU},
-    {OP_st,             1, ANYSTU},
-    {OP_mf,             1, STU0  },
-    // Shifts, rotates and complex masking
-    {OP_shl,            A, ANYALU},
-    {OP_shr,            A, ANYALU},
-    {OP_mask,           A, ANYALU},
-    {OP_sar,            A, ANYALU},
-    {OP_rotl,           A, ANYALU},  
-    {OP_rotr,           A, ANYALU},   
-    {OP_rotcl,          A, ANYALU},
-    {OP_rotcr,          A, ANYALU},  
-    // Multiplication
-    {OP_mull,           4, ANYFPU},
-    {OP_mulh,           4, ANYFPU},
-    {OP_mulhu,          4, ANYFPU},
-    {OP_mulhl,          4, ANYFPU},
-    // Bit scans
-    {OP_ctz,            3, ANYFPU},
-    {OP_clz,            3, ANYFPU},
-    {OP_ctpop,          3, ANYFPU},  
-    {OP_permb,          4, ANYFPU},
-    // Integer divide and remainder step
-    {OP_div,           32, ALU0},
-    {OP_rem,           32, ALU0},
-    {OP_divs,          32, ALU0},
-    {OP_rems,          32, ALU0},
-    // Minimum and maximum
-    {OP_min,            A, ANYALU},
-    {OP_max,            A, ANYALU},
-    {OP_min_s,          A, ANYALU},
-    {OP_max_s,          A, ANYALU},
-    // Floating point
-    // uop.size bits have following meaning:
-    // 00 = single precision, scalar (preserve high 32 bits of ra)
-    // 01 = single precision, packed (two 32-bit floats)
-    // 1x = double precision, scalar or packed (use two uops to process 128-bit xmm)
-    {OP_fadd,           6, ANYFPU},
-    {OP_fsub,           6, ANYFPU},
-    {OP_fmul,           6, ANYFPU},
-    {OP_fmadd,          6, ANYFPU},
-    {OP_fmsub,          6, ANYFPU},
-    {OP_fmsubr,         6, ANYFPU},
-    {OP_fdiv,           6, ANYFPU},
-    {OP_fsqrt,          6, ANYFPU},
-    {OP_frcp,           6, ANYFPU},
-    {OP_frsqrt,         6, ANYFPU},
-    {OP_fmin,           6, ANYFPU},
-    {OP_fmax,           6, ANYFPU},
-    {OP_fcmp,           6, ANYFPU},
-    // For fcmpcc, uop.size bits have following meaning:
-    // 00 = single precision ordered compare
-    // 01 = single precision unordered compare
-    // 10 = double precision ordered compare
-    // 11 = double precision unordered compare
-    {OP_fcmpcc,         4, ANYFPU},
-    // and/andn/or/xor are done using integer uops
-    // For these conversions, uop.size bits select truncation mode:
-    // x0 = normal IEEE-style rounding
-    // x1 = truncate to zero
-    {OP_fcvt_i2s_ins,   6, ANYFPU},
-    {OP_fcvt_i2s_p,     6, ANYFPU},
-    {OP_fcvt_i2d_lo,    6, ANYFPU},
-    {OP_fcvt_i2d_hi,    6, ANYFPU},
-    {OP_fcvt_q2s_ins,   6, ANYFPU},
-    {OP_fcvt_q2d,       6, ANYFPU},
-    {OP_fcvt_s2i,       6, ANYFPU},
-    {OP_fcvt_s2q,       6, ANYFPU},
-    {OP_fcvt_s2i_p,     6, ANYFPU},
-    {OP_fcvt_d2i,       6, ANYFPU},
-    {OP_fcvt_d2q,       6, ANYFPU},
-    {OP_fcvt_d2i_p,     6, ANYFPU},
-    {OP_fcvt_d2s_ins,   6, ANYFPU},
-    {OP_fcvt_d2s_p,     6, ANYFPU},
-    {OP_fcvt_s2d_lo,    6, ANYFPU},
-    {OP_fcvt_s2d_hi,    6, ANYFPU},
-    // Vector integer uops
-    // uop.size defines element size: 00 = byte, 01 = W16, 10 = W32, 11 = W64 (i.e. same as normal ALU uops)
-    {OP_vadd,           1, ANYFPU},
-    {OP_vsub,           1, ANYFPU},
-    {OP_vadd_us,        1, ANYFPU},
-    {OP_vsub_us,        1, ANYFPU},
-    {OP_vadd_ss,        1, ANYFPU},
-    {OP_vsub_ss,        1, ANYFPU},
-    {OP_vshl,           1, ANYFPU},
-    {OP_vshr,           1, ANYFPU},
-    {OP_vbt,            1, ANYFPU},
-    {OP_vsar,           1, ANYFPU},
-    {OP_vavg,           1, ANYFPU},
-    {OP_vcmp,           1, ANYFPU},
-    {OP_vmin,           1, ANYFPU},
-    {OP_vmax,           1, ANYFPU},
-    {OP_vmin_s,         1, ANYFPU},
-    {OP_vmax_s,         1, ANYFPU},
-    {OP_vmull,          4, ANYFPU},
-    {OP_vmulh,          4, ANYFPU},
-    {OP_vmulhu,         4, ANYFPU},
-    {OP_vmaddp,         4, ANYFPU},
-    {OP_vsad,           4, ANYFPU},
-    {OP_vpack_us,       2, ANYFPU},
-    {OP_vpack_ss,       2, ANYFPU},
-  };
-
-#undef A
-#undef L
-#undef F
-
-#undef ALU0
-#undef ALU1
-#undef STU0
-#undef STU1
-#undef LDU0
-#undef LDU1
-#undef FPU0
-#undef FPU1
-#undef L
-
-#undef ANYALU
-#undef ANYLDU
-#undef ANYSTU
-#undef ANYFPU
-#undef ANYINT
-  
-  //
-  // Global limits
-  //
-  
-  const int MAX_ISSUE_WIDTH = 4;
-  
-  // Largest size of any physical register file or the store queue:
-  const int MAX_PHYS_REG_FILE_SIZE = 256;
-  const int PHYS_REG_FILE_SIZE = 256;
-  const int PHYS_REG_NULL = 0;
-  
-  //
-  // IMPORTANT! If you change this to be greater than 256, you MUST
-  // #define BIG_ROB below to use the correct associative search logic
-  // (16-bit tags vs 8-bit tags).
-  //
-  // SMT always has BIG_ROB enabled: high 4 bits are used for thread id
-  //
-#define BIG_ROB
-
-  const int ROB_SIZE = 128;
-  
-  // Maximum number of branches in the pipeline at any given time
-  const int MAX_BRANCHES_IN_FLIGHT = 16;
-
-  // Set this to combine the integer and FP phys reg files:
-  // #define UNIFIED_INT_FP_PHYS_REG_FILE
-  
-#ifdef UNIFIED_INT_FP_PHYS_REG_FILE
-  // unified, br, st
-  const int PHYS_REG_FILE_COUNT = 3;
-#else
-  // int, fp, br, st
-  const int PHYS_REG_FILE_COUNT = 4;
-#endif
-  
-  //
-  // Load and Store Queues
-  //
-  const int LDQ_SIZE = 48;
-  const int STQ_SIZE = 32;
-
-  //
-  // Fetch
-  //
-  const int FETCH_QUEUE_SIZE = 32;
-  const int FETCH_WIDTH = 4;
-
-  //
-  // Frontend (Rename and Decode)
-  //
-  const int FRONTEND_WIDTH = 4;
-  const int FRONTEND_STAGES = 5;
-
-  //
-  // Dispatch
-  //
-  const int DISPATCH_WIDTH = 4;
-
-  //
-  // Writeback
-  //
-  const int WRITEBACK_WIDTH = 4;
-
-  //
-  // Commit
-  //
-  const int COMMIT_WIDTH = 4;
-
-  //
-  // Clustering, Issue Queues and Bypass Network
-  //
-  const int MAX_FORWARDING_LATENCY = 2;
-
-#define MULTI_IQ
-
-#ifdef ENABLE_SMT
-  //
-  // Multiple issue queues are currently only supported in
-  // the non-SMT configuration, due to ambiguities in the
-  // ICOUNT SMT heuristic when multiple queues are active.
-  //
-#undef MULTI_IQ
-#endif
-
-#ifdef MULTI_IQ
-  const int MAX_CLUSTERS = 4;
-#else
-  const int MAX_CLUSTERS = 1;
-#endif
-
-  enum { PHYSREG_NONE, PHYSREG_FREE, PHYSREG_WAITING, PHYSREG_BYPASS, PHYSREG_WRITTEN, PHYSREG_ARCH, PHYSREG_PENDINGFREE, MAX_PHYSREG_STATE };
-  static const char* physreg_state_names[MAX_PHYSREG_STATE] = {"none", "free", "waiting", "bypass", "written", "arch", "pendingfree"};
-  static const char* short_physreg_state_names[MAX_PHYSREG_STATE] = {"-", "free", "wait", "byps", "wrtn", "arch", "pend"};
-
-#ifdef INSIDE_OOOCORE
-
-  struct OutOfOrderCore;
-  OutOfOrderCore& coreof(int coreid);
-
-  struct ReorderBufferEntry;
-
-  //
-  // Issue queue based scheduler with broadcast
-  //
-#ifdef BIG_ROB
-  typedef W16 issueq_tag_t;
-#else
-  typedef byte issueq_tag_t;
-#endif
-
-  template <int size, int operandcount = MAX_OPERANDS>
-  struct IssueQueue {
-#ifdef BIG_ROB
-    typedef FullyAssociativeTags16bit<size, size> assoc_t;
-    typedef vec8w vec_t;
-#else
-    typedef FullyAssociativeTags8bit<size, size> assoc_t;
-    typedef vec16b vec_t;
-#endif
-
-    typedef issueq_tag_t tag_t;
-
-    static const int SIZE = size;
-
-    assoc_t uopids;
-    assoc_t tags[operandcount];
-
-    // States:
-    //             V I
-    // free        0 0
-    // dispatched  1 0
-    // issued      1 1
-    // complete    0 1
-
-    bitvec<size> valid;
-    bitvec<size> issued;
-    bitvec<size> allready;
-    int count;
-    byte coreid;
-    int shared_entries;
-    int reserved_entries;
-
-    void set_reserved_entries(int num) { reserved_entries = num; }
-    bool reset_shared_entries() { 
-      shared_entries = size - reserved_entries; 
-      return true;
-    }
-    bool alloc_reserved_entry() {
-      assert(shared_entries > 0);
-      shared_entries--;
-      return true;
-    }
-    bool free_shared_entry() {
-      assert(shared_entries < size - reserved_entries);
-      shared_entries++;
-      return true;
-    }    
-    bool shared_empty() {
-      return (shared_entries == 0);
-    }
-
-    bool remaining() const { return (size - count); }
-    bool empty() const { return (!count); }
-    bool full() const { return (!remaining()); }
-
-    int uopof(int slot) const {
-      return uopids[slot];
-    }
-
-    int slotof(int uopid) const {
-      return uopids.search(uopid);
-    }
-
-    void reset(int coreid);
-    void reset(int coreid, int threadid);
-    void clock();
-    bool insert(tag_t uopid, const tag_t* operands, const tag_t* preready);
-    bool broadcast(tag_t uopid);
-    int issue();
-    bool replay(int slot, const tag_t* operands, const tag_t* preready);
-    bool switch_to_end(int slot, const tag_t* operands, const tag_t* preready);
-    bool remove(int slot);
-
-    ostream& print(ostream& os) const;
-    void tally_broadcast_matches(tag_t sourceid, const bitvec<size>& mask, int operand) const;
-
-    //
-    // Replay a uop that has already issued once.
-    // The caller may add or reset dependencies here as needed.
-    //
-    bool replay(int slot) {
-      issued[slot] = 0;
-      return true;
-    }
-
-    //
-    // Remove an entry from the issue queue after it has completed,
-    // or in the process of annulment.
-    //
-    bool release(int slot) {
-      remove(slot);
-      return true;
-    }
-
-    bool annul(int slot) {
-      remove(slot);
-      return true;
-    }
-
-    bool annuluop(int uopid) {
-      int slot = slotof(uopid);
-      if (slot < 0) return false;
-      remove(slot);
-      return true;
-    }
-
-    OutOfOrderCore& getcore() const { return coreof(coreid); }
-  };
-
-  template <int size, int operandcount>
-  static inline ostream& operator <<(ostream& os, const IssueQueue<size, operandcount>& issueq) {
-    return issueq.print(os);
-  }
-
-  //
-  // Iterate through a linked list of objects where each object directly inherits
-  // only from the selfqueuelink class or otherwise has a selfqueuelink object
-  // as the first member.
-  //
-  // This iterator supports mutable lists, meaning the current entry (obj) may
-  // be safely removed from the list and/or moved to some other list without
-  // affecting the next object processed.
-  //
-  // This does NOT mean you can remove any object from the list other than the
-  // current object obj - to do this, copy the list of pointers to an array and
-  // then process that instead.
-  //
-#define foreach_list_mutable_linktype(L, obj, entry, nextentry, linktype) \
-  linktype* entry; \
-  linktype* nextentry; \
-  for (entry = (L).next, nextentry = entry->next, prefetch(entry->next), obj = (typeof(obj))entry; \
-    entry != &(L); entry = nextentry, nextentry = entry->next, prefetch(nextentry), obj = (typeof(obj))entry)
-
-#define foreach_list_mutable(L, obj, entry, nextentry) foreach_list_mutable_linktype(L, obj, entry, nextentry, selfqueuelink)
-
-  struct StateList;
-
-  struct ListOfStateLists: public array<StateList*, 64> {
-    int count;
-
-    ListOfStateLists() { count = 0; }
-
-    int add(StateList* list);
-    void reset();
-  };
-
-  struct StateList: public selfqueuelink {
-    char* name;
-    int count;
-    int listid;
-    W64 dispatch_source_counter;
-    W64 issue_source_counter;
-    W32 flags;
-
-    StateList() { count = 0; listid = 0; }
-
-    void init(const char* name, ListOfStateLists& lol, W32 flags = 0);
-
-    StateList(const char* name, ListOfStateLists& lol, W32 flags = 0) {  
-      init(name, lol, flags);
-    }
-
-    // simulated asymmetric c++ array constructor:
-    StateList& operator ()(const char* name, ListOfStateLists& lol, W32 flags = 0) {
-      init(name, lol, flags);
-      return *this;
-    }
-
-    void reset();
-
-    selfqueuelink* dequeue() {
-      if (empty())
-        return null;
-      count--;
-      assert(count >=0);
-      selfqueuelink* obj = removehead(); 
-      return obj;
-    }
-
-    selfqueuelink* enqueue(selfqueuelink* entry) {
-      entry->addtail(this);
-      count++;
-      return entry;
-    }
-
-    selfqueuelink* enqueue_after(selfqueuelink* entry, selfqueuelink* preventry) {
-      if (preventry) entry->addhead(preventry); else entry->addhead(this);
-      count++;
-      return entry;
-    }
-
-    selfqueuelink* remove(selfqueuelink* entry) {
-      assert(entry->linked());
-      entry->unlink();
-      count--;
-      assert(count >=0);
-      return entry;
-    }
-
-    selfqueuelink* peek() {
-      return (empty()) ? null : head();
-    }
-
-    void checkvalid();
-  };
-
-  template <typename T> 
-  static void print_list_of_state_lists(ostream& os, const ListOfStateLists& lol, const char* title);
-
-  //
-  // Fetch Buffers
-  //
-  struct BranchPredictorUpdateInfo: public PredictorUpdate {
-    int stack_recover_idx;
-    int bptype;
-    W64 ripafter;
-  };
-
-  struct FetchBufferEntry: public TransOp {
-    RIPVirtPhys rip;
-    W64 uuid;
-    uopimpl_func_t synthop;
-    BranchPredictorUpdateInfo predinfo;
-    W16 index;
-    W8 threadid;
-    byte ld_st_truly_unaligned;
-
-    int init(int index) { this->index = index; return 0; }
-    void validate() { }
-
-    FetchBufferEntry() { }
-    
-    FetchBufferEntry(const TransOp& transop) {
-      *((TransOp*)this) = transop;
-    }
-  };
-
-  //
-  // ReorderBufferEntry
-  struct ThreadContext;
-  struct OutOfOrderCore;
-  struct PhysicalRegister;
-  struct LoadStoreQueueEntry;
-  struct OutOfOrderCoreEvent;
-  //
-  // Reorder Buffer (ROB) structure, used for tracking all uops in flight.
-  // This same structure is used to represent both dispatched but not yet issued 
-  // uops as well as issued uops.
-  //
-  struct ReorderBufferEntry: public selfqueuelink {
-    FetchBufferEntry uop;
-    struct StateList* current_state_list;
-    PhysicalRegister* physreg;
-    PhysicalRegister* operands[MAX_OPERANDS];
-    LoadStoreQueueEntry* lsq;
-    W16s idx;
-    W16s cycles_left; // execution latency counter, decremented every cycle when executing
-    W16s forward_cycle; // forwarding cycle after completion
-    W16s lfrqslot;
-    W16s iqslot;
-    W16  executable_on_cluster_mask;
-    W8s  cluster;
-    W8   coreid;
-
-    W8   threadid;
-    byte fu;
-    byte consumer_count;
-    PTEUpdate pteupdate;
-    Waddr origvirt; // original virtual address, with low bits
-    Waddr virtpage; // virtual page number actually accessed by the load or store
-    byte entry_valid:1, load_store_second_phase:1, all_consumers_off_bypass:1, dest_renamed_before_writeback:1, no_branches_between_renamings:1, transient:1, lock_acquired:1, issued:1;
-    byte tlb_walk_level;
-
-    int index() const { return idx; }
-    void validate() { entry_valid = true; }
-
-    void changestate(StateList& newqueue, bool place_at_head = false, ReorderBufferEntry* prevrob = null) {
-      if (current_state_list)
-        current_state_list->remove(this);
-      current_state_list = &newqueue;
-      if (place_at_head) newqueue.enqueue_after(this, prevrob); else newqueue.enqueue(this);
-    }
-
-    void init(int idx);
-    void reset();
-    bool ready_to_issue() const;
-    bool ready_to_commit() const;
-    StateList& get_ready_to_issue_list() const;
-    bool find_sources();
-    int forward();
-    int select_cluster();
-    int issue();
-    Waddr addrgen(LoadStoreQueueEntry& state, Waddr& origaddr, Waddr& virtpage, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate, Waddr& addr, int& exception, PageFaultErrorCode& pfec, bool& annul);
-    bool handle_common_load_store_exceptions(LoadStoreQueueEntry& state, Waddr& origaddr, Waddr& addr, int& exception, PageFaultErrorCode& pfec);
-    int issuestore(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, bool rcready, PTEUpdate& pteupdate);
-    int issueload(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate);
-    void issueprefetch(IssueState& state, W64 ra, W64 rb, W64 rc, int cachelevel);
-    int probecache(Waddr addr, LoadStoreQueueEntry* sfra);
-    void tlbwalk();
-    int issuefence(LoadStoreQueueEntry& state);
-    void release();
-    W64 annul(bool keep_misspec_uop, bool return_first_annulled_rip = false);
-    W64 annul_after() { return annul(true); }
-    W64 annul_after_and_including() { return annul(false); }
-    int commit();
-    void replay();
-    void replay_locked();
-    int pseudocommit();
-    void redispatch(const bitvec<MAX_OPERANDS>& dependent_operands, ReorderBufferEntry* prevrob);
-    void redispatch_dependents(bool inclusive = true);
-    void loadwakeup();
-    void fencewakeup();
-    LoadStoreQueueEntry* find_nearest_memory_fence();
-    bool release_mem_lock(bool forced = false);
-    ostream& print(ostream& os) const;
-    stringbuf& get_operand_info(stringbuf& sb, int operand) const;
-    ostream& print_operand_info(ostream& os, int operand) const;
-
-    OutOfOrderCore& getcore() const { return coreof(coreid); }
-
-    ThreadContext& getthread() const;
-    issueq_tag_t get_tag();
-  };
-
-  void decode_tag(issueq_tag_t tag, int& threadid, int& idx) {
-    threadid = tag >> MAX_ROB_IDX_BIT;
-    int mask = ((1 << (MAX_ROB_IDX_BIT + MAX_THREADS_BIT)) - 1) >> MAX_THREADS_BIT;
-    idx = tag & mask;
-  }
-
-  static inline ostream& operator <<(ostream& os, const ReorderBufferEntry& rob) {
-    return rob.print(os);
-  }
-
-  //
-  // Load/Store Queue
-  //
-#define LSQ_SIZE (LDQ_SIZE + STQ_SIZE)
-
-  // Define this to allow speculative issue of loads before unresolved stores
-#define SMT_ENABLE_LOAD_HOISTING
-
-  struct LoadStoreQueueEntry: public SFR {
-    ReorderBufferEntry* rob;
-    W16 idx;
-    byte coreid;
-    W8s mbtag;
-    W8 store:1, lfence:1, sfence:1, entry_valid:1;
-    W32 padding;
-
-    LoadStoreQueueEntry() { }
-
-    int index() const { return idx; }
-
-    void reset() {
-      int oldidx = idx;
-      setzero(*this);
-      idx = oldidx;
-      mbtag = -1;
-    }
-
-    void init(int idx) {
-      this->idx = idx;
-      reset();
-    }
-
-    void validate() { entry_valid = 1; }
-  
-    ostream& print(ostream& os) const;
-
-    LoadStoreQueueEntry& operator =(const SFR& sfr) {
-      *((SFR*)this) = sfr;
-      return *this;
-    }
-
-    OutOfOrderCore& getcore() const { return coreof(coreid); }
-  };
-
-  static inline ostream& operator <<(ostream& os, const LoadStoreQueueEntry& lsq) {
-    return lsq.print(os);
-  }
-
-  struct PhysicalRegisterOperandInfo {
-    W32 uuid;
-    W16 physreg;
-    W16 rob;
-    byte state;
-    byte rfid;
-    byte archreg;
-    byte pad1;
-  };
-
-  ostream& operator <<(ostream& os, const PhysicalRegisterOperandInfo& opinfo);
-
-  //
-  // Physical Register File
-  //
- 
-  struct PhysicalRegister: public selfqueuelink {
-    ReorderBufferEntry* rob;
-    W64 data;
-    W16 flags;
-    W16 idx;
-    W8  coreid;
-    W8  rfid;
-    W8  state;
-    W8  archreg;
-    W8  all_consumers_sourced_from_bypass:1;
-    W16s refcount;
-    W8 threadid;
-
-    StateList& get_state_list(int state) const;
-    StateList& get_state_list() const { return get_state_list(this->state); }
-
-    void changestate(int newstate) {
-      if likely (state != PHYSREG_NONE) get_state_list(state).remove(this);
-      state = newstate;
-      get_state_list(state).enqueue(this);
-    }
-
-    void init(int coreid, int rfid, int idx) {
-      this->coreid = coreid;
-      this->rfid = rfid;
-      this->idx = idx;
-      reset();
-    }
-
-  private:
-    void addref() { refcount++; }
-    void unref() {
-      refcount--;
-      assert((idx == 0) || (refcount >= 0));
-    }
-
-  public:
-
-    void addref(const ReorderBufferEntry& rob, W8 threadid) { addref(); }
-    void unref(const ReorderBufferEntry& rob, W8 threadid) { unref(); }
-    void addspecref(int archreg, W8 threadid) { addref(); }
-    void unspecref(int archreg, W8 threadid) { unref(); }
-    void addcommitref(int archreg, W8 threadid) { addref(); }
-    void uncommitref(int archreg, W8 threadid) { unref();  }
-
-    bool referenced() const { return (refcount > 0); }
-    bool nonnull() const { return (index() != PHYS_REG_NULL); }
-    bool allocated() const { return (state != PHYSREG_FREE); }
-    void commit() { changestate(PHYSREG_ARCH); }
-    void complete() { changestate(PHYSREG_BYPASS); }
-    void writeback() { changestate(PHYSREG_WRITTEN); }
-
-    void free() {      
-      changestate(PHYSREG_FREE);
-      rob = 0;
-      refcount = 0;
-      threadid = 0xff;
-      all_consumers_sourced_from_bypass = 1;
-    }
-
-  private:
-    void reset() {
-      selfqueuelink::reset();
-      state = PHYSREG_NONE;
-      free();
-    }
-
-  public:
-    void reset(W8 threadid, bool check_id = true) {
-      if (check_id && this->threadid != threadid) return;
-
-      if (!check_id) {
-        selfqueuelink::reset();
-        state = PHYSREG_NONE;
-      }
-      free();
-    }
-
-    int index() const { return idx; }
-    bool valid() const { return ((flags & FLAG_INV) == 0); }
-    bool ready() const { return ((flags & FLAG_WAIT) == 0); }
-
-    void fill_operand_info(PhysicalRegisterOperandInfo& opinfo);
-
-    OutOfOrderCore& getcore() const { return coreof(coreid); }
-  };
-
-  ostream& operator <<(ostream& os, const PhysicalRegister& physreg);
-
-  struct PhysicalRegisterFile: public array<PhysicalRegister, MAX_PHYS_REG_FILE_SIZE> {
-    byte coreid;
-    byte rfid;
-    W16 size;
-    const char* name;
-    StateList states[MAX_PHYSREG_STATE];
-    W64 allocations;
-    W64 frees;
-
-    PhysicalRegisterFile() { }
-
-    PhysicalRegisterFile(const char* name, int coreid, int rfid, int size) {
-      init(name, coreid, rfid, size); reset();
-    }
-
-    PhysicalRegisterFile& operator ()(const char* name, int coreid, int rfid, int size) {
-      init(name, coreid, rfid, size); reset(); return *this;
-    }
-
-    void init(const char* name, int coreid, int rfid, int size);
-    bool remaining() const { return (!states[PHYSREG_FREE].empty()); }
-   
-    PhysicalRegister* alloc(W8 threadid, int r = -1);
-    void reset(W8 threadid);
-    ostream& print(ostream& os) const;
-
-    OutOfOrderCore& getcore() const { return coreof(coreid); }
-
-  private:
-    void reset();
-  };
-
-  static inline ostream& operator <<(ostream& os, const PhysicalRegisterFile& physregs) {
-    return physregs.print(os);
-  }
-
-  //
-  // Register Rename Table
-  //
-  struct RegisterRenameTable: public array<PhysicalRegister*, TRANSREG_COUNT> {
-#ifdef ENABLE_TRANSIENT_VALUE_TRACKING
-    bitvec<TRANSREG_COUNT> renamed_in_this_basic_block;
-#endif
-    ostream& print(ostream& os) const;
-  };
-
-  static inline ostream& operator <<(ostream& os, const RegisterRenameTable& rrt) {
-    return rrt.print(os);
-  }
-
-  enum {
-    ISSUE_COMPLETED = 1,      // issued correctly
-    ISSUE_NEEDS_REPLAY = 0,   // fast scheduling replay
-    ISSUE_MISSPECULATED = -1, // mis-speculation: redispatch dependent slice
-    ISSUE_NEEDS_REFETCH = -2, // refetch from RIP of bad insn
-  };
-
-  enum {
-    COMMIT_RESULT_NONE = 0,   // no instructions committed: some uops not ready
-    COMMIT_RESULT_OK = 1,     // committed
-    COMMIT_RESULT_EXCEPTION = 2, // exception
-    COMMIT_RESULT_BARRIER = 3,// barrier; branch to microcode (brp uop)
-    COMMIT_RESULT_SMC = 4,    // self modifying code detected
-    COMMIT_RESULT_INTERRUPT = 5, // interrupt pending
-    COMMIT_RESULT_STOP = 6    // stop processor model (shutdown)
-  };
-
-  // Branch predictor outcomes:
-  enum { MISPRED = 0, CORRECT = 1 };
-
-  //
-  // Lookup tables (LUTs):
-  //
-  struct Cluster {
-    const char* name;
-    W16 issue_width;
-    W32 fu_mask;
-  };
-
-  extern const Cluster clusters[MAX_CLUSTERS];
-  extern byte uop_executable_on_cluster[OP_MAX_OPCODE];
-  extern W32 forward_at_cycle_lut[MAX_CLUSTERS][MAX_FORWARDING_LATENCY+1];
-  extern const byte archdest_can_commit[TRANSREG_COUNT];
-  extern const byte archdest_is_visible[TRANSREG_COUNT];
-
-  struct OutOfOrderMachine;
-
-  struct OutOfOrderCoreCacheCallbacks: public CacheSubsystem::PerCoreCacheCallbacks {
-    OutOfOrderCore& core;
-    OutOfOrderCoreCacheCallbacks(OutOfOrderCore& core_): core(core_) { }
-    virtual void dcache_wakeup(LoadStoreInfo lsi, W64 physaddr);
-    virtual void icache_wakeup(LoadStoreInfo lsi, W64 physaddr);
-  };
-
-  struct MemoryInterlockEntry {
-    W64 uuid;
-    W16 rob;
-    byte vcpuid;
-    W8 threadid;
-
-    void reset() { uuid = 0; rob = 0; vcpuid = 0; threadid = 0;}
- 
-    ostream& print(ostream& os, W64 physaddr) const {
-      os << "phys ", (void*)physaddr, ": vcpu ", vcpuid, ", threadid ", threadid, ", uuid ", uuid, ", rob ", rob;
-      return os;
-    }
-  };
- 
-  struct MemoryInterlockBuffer: public LockableAssociativeArray<W64, MemoryInterlockEntry, 16, 4, 8> { };
- 
-  extern MemoryInterlockBuffer interlocks;
- 
-  //
-  // Event Tracing
-  //
-  enum {
-    EVENT_INVALID = 0,
-    EVENT_FETCH_STALLED,
-    EVENT_FETCH_ICACHE_WAIT,
-    EVENT_FETCH_FETCHQ_FULL,
-    EVENT_FETCH_IQ_QUOTA_FULL,
-    EVENT_FETCH_BOGUS_RIP,
-    EVENT_FETCH_ICACHE_MISS,
-    EVENT_FETCH_SPLIT,
-    EVENT_FETCH_ASSIST,
-    EVENT_FETCH_TRANSLATE,
-    EVENT_FETCH_OK,
-    EVENT_RENAME_FETCHQ_EMPTY,
-    EVENT_RENAME_ROB_FULL,
-    EVENT_RENAME_PHYSREGS_FULL,
-    EVENT_RENAME_LDQ_FULL,
-    EVENT_RENAME_STQ_FULL,
-    EVENT_RENAME_MEMQ_FULL,
-    EVENT_RENAME_OK,
-    EVENT_FRONTEND,
-    EVENT_CLUSTER_NO_CLUSTER,
-    EVENT_CLUSTER_OK,
-    EVENT_DISPATCH_NO_CLUSTER,
-    EVENT_DISPATCH_DEADLOCK,
-    EVENT_DISPATCH_OK,
-    EVENT_ISSUE_NO_FU,
-    EVENT_ISSUE_OK,
-    EVENT_REPLAY,
-    EVENT_STORE_EXCEPTION,
-    EVENT_STORE_WAIT,
-    EVENT_STORE_PARALLEL_FORWARDING_MATCH,
-    EVENT_STORE_ALIASED_LOAD,
-    EVENT_STORE_ISSUED,
-    EVENT_STORE_LOCK_RELEASED,
-    EVENT_STORE_LOCK_ANNULLED,
-    EVENT_STORE_LOCK_REPLAY,
-    EVENT_LOAD_EXCEPTION,
-    EVENT_LOAD_WAIT,
-    EVENT_LOAD_HIGH_ANNULLED,
-    EVENT_LOAD_HIT,
-    EVENT_LOAD_MISS,
-    EVENT_LOAD_BANK_CONFLICT,
-    EVENT_LOAD_TLB_MISS,
-    EVENT_LOAD_LOCK_REPLAY,
-    EVENT_LOAD_LOCK_OVERFLOW,
-    EVENT_LOAD_LOCK_ACQUIRED,
-    EVENT_LOAD_LFRQ_FULL,
-    EVENT_LOAD_WAKEUP,
-    EVENT_TLBWALK_HIT,
-    EVENT_TLBWALK_MISS,
-    EVENT_TLBWALK_WAKEUP,
-    EVENT_TLBWALK_NO_LFRQ_MB,
-    EVENT_TLBWALK_COMPLETE,
-    EVENT_FENCE_ISSUED,
-    EVENT_ALIGNMENT_FIXUP,
-    EVENT_ANNUL_NO_FUTURE_UOPS,
-    EVENT_ANNUL_MISSPECULATION,
-    EVENT_ANNUL_EACH_ROB,
-    EVENT_ANNUL_PSEUDOCOMMIT,
-    EVENT_ANNUL_FETCHQ_RAS,
-    EVENT_ANNUL_FETCHQ,
-    EVENT_ANNUL_FLUSH,
-    EVENT_REDISPATCH_DEPENDENTS,
-    EVENT_REDISPATCH_DEPENDENTS_DONE,
-    EVENT_REDISPATCH_EACH_ROB,
-    EVENT_COMPLETE,
-    EVENT_BROADCAST,
-    EVENT_FORWARD,
-    EVENT_WRITEBACK,
-    EVENT_COMMIT_FENCE_COMPLETED,
-    EVENT_COMMIT_EXCEPTION_DETECTED,
-    EVENT_COMMIT_EXCEPTION_ACKNOWLEDGED,
-    EVENT_COMMIT_SKIPBLOCK,
-    EVENT_COMMIT_SMC_DETECTED,
-    EVENT_COMMIT_MEM_LOCKED,
-    EVENT_COMMIT_ASSIST,
-    EVENT_COMMIT_OK,
-    EVENT_RECLAIM_PHYSREG,
-    EVENT_RELEASE_MEM_LOCK,
-  };
-
-  //
-  // Event that gets written to the trace buffer
-  //
-  // In the interest of minimizing space, the cycle counters
-  // and uuids are only 32-bits; in practice wraparound is
-  // not likely to be a problem.
-  //
-  struct OutOfOrderCoreEvent {
-    W32 cycle;
-    W32 uuid;
-    RIPVirtPhysBase rip;
-    TransOpBase uop;
-    W16 rob;
-    W16 physreg;
-    W16 lsq;
-    W16 type;
-    W16s lfrqslot;
-    byte rfid;
-    byte cluster;
-    byte fu;
-    W8 threadid;
-    W32 issueq_count;
-
-    OutOfOrderCoreEvent* fill(int type) {
-      this->type = type;
-      cycle = sim_cycle;
-      uuid = 0;
-      threadid = 0xff;
-      return this;
-    }
-
-    OutOfOrderCoreEvent* fill(int type, const FetchBufferEntry& uop) {
-      fill(type);
-      uuid = uop.uuid;
-      rip = uop.rip;
-      threadid = uop.threadid;
-      this->uop = uop;
-      return this;
-    }
-
-    OutOfOrderCoreEvent* fill(int type, const RIPVirtPhys& rvp) {
-      fill(type);
-      rip = rvp;
-      return this;
-    }
-
-    OutOfOrderCoreEvent* fill(int type, const ReorderBufferEntry* rob) {
-      fill(type, rob->uop);
-      this->rob = rob->index();
-      physreg = rob->physreg->index();
-      lsq = (rob->lsq) ? rob->lsq->index() : 0;
-      rfid = rob->physreg->rfid;
-      cluster = rob->cluster;
-      fu = rob->fu;
-      lfrqslot = rob->lfrqslot;
-      return this;
-    }
-
-    OutOfOrderCoreEvent* fill_commit(int type, const ReorderBufferEntry* rob) {
-      fill(type, rob);
-      if unlikely (isstore(rob->uop.opcode)) {
-        commit.state.st = *rob->lsq;
-      } else {
-        commit.state.reg.rddata = rob->physreg->data;
-        commit.state.reg.rdflags = rob->physreg->flags;
-      }
-      // taken, predtaken only for branches
-      commit.ld_st_truly_unaligned = rob->uop.ld_st_truly_unaligned;
-      commit.pteupdate = rob->pteupdate;
-      // oldphysreg filled in later
-      // oldphysreg_refcount filled in later
-      commit.origvirt = rob->origvirt;
-      commit.total_user_insns_committed = total_user_insns_committed;
-      // target_rip filled in later
-      foreach (i, MAX_OPERANDS) commit.operand_physregs[i] = rob->operands[i]->index();
-      return this;
-    }
-
-    OutOfOrderCoreEvent* fill_load_store(int type, const ReorderBufferEntry* rob, LoadStoreQueueEntry* inherit_sfr, Waddr virtaddr) {
-      fill(type, rob);
-      loadstore.sfr = *rob->lsq;
-      loadstore.virtaddr = virtaddr;
-      loadstore.load_store_second_phase = rob->load_store_second_phase;
-      loadstore.inherit_sfr_used = (inherit_sfr != null);
-      if unlikely (inherit_sfr) {
-        loadstore.inherit_sfr = *inherit_sfr;
-        loadstore.inherit_sfr_lsq = inherit_sfr->rob->lsq->index();
-        loadstore.inherit_sfr_uuid = inherit_sfr->rob->uop.uuid;
-        loadstore.inherit_sfr_rob = inherit_sfr->rob->index();
-        loadstore.inherit_sfr_physreg = inherit_sfr->rob->physreg->index();
-        loadstore.inherit_sfr_rip = inherit_sfr->rob->uop.rip;
-      }
-      loadstore.tlb_walk_level = rob->tlb_walk_level;
-      return this;
-    }
-
-    union {
-      struct {
-        W16s missbuf;
-        W64 predrip;
-        W16 bb_uop_count;
-      } fetch;
-      struct {
-        W16  oldphys;
-        W16  oldzf;
-        W16  oldcf;
-        W16  oldof;
-        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
-      } rename;
-      struct {
-        W16 cycles_left;
-      } frontend;
-      struct {
-        W16 allowed_clusters;
-        W16 iq_avail[MAX_CLUSTERS];
-      } select_cluster;
-      struct {
-        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
-      } dispatch;
-      struct {
-        byte mispredicted:1;
-        IssueState state;
-        W16 cycles_left;
-        W64 operand_data[MAX_OPERANDS];
-        W16 operand_flags[MAX_OPERANDS];
-        W64 predrip;
-        W32 fu_avail;
-      } issue;
-      struct {
-        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
-        byte ready;
-      } replay;
-      struct {
-        W64 virtaddr; 
-        W64 data_to_store;
-        SFR sfr;
-        SFR inherit_sfr;
-        W64 inherit_sfr_uuid;        
-        W64 inherit_sfr_rip;
-        W16 inherit_sfr_lsq;
-        W16 inherit_sfr_rob;
-        W16 inherit_sfr_physreg;
-        W16 cycles_left;
-        W64 locking_uuid;
-        byte inherit_sfr_used:1, rcready:1, load_store_second_phase:1, predicted_alias:1;
-        byte locking_vcpuid;
-        W16 locking_rob;
-        W8 threadid;
-        W8 tlb_walk_level;
-      } loadstore;
-      struct {
-        W16 somidx;
-        W16 eomidx;
-        W16 startidx;
-        W16 endidx;
-        byte annulras;
-      } annul;
-      struct {
-        StateList* current_state_list;
-        W16 iqslot;
-        W16 count;
-        byte dependent_operands;
-        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
-      } redispatch;
-      struct {
-        W8  forward_cycle;
-        W8  operand;
-        W8  target_operands_ready;
-        W8  target_all_operands_ready;
-        W16 target_rob;
-        W16 target_physreg;
-        W8  target_rfid;
-        W8  target_cluster;
-        W64 target_uuid;
-        W16 target_lsq;
-        W8  target_st;
-      } forwarding;
-      struct {
-        W16 consumer_count;
-        W16 flags;
-        W64 data;
-        byte transient:1, all_consumers_sourced_from_bypass:1, no_branches_between_renamings:1, dest_renamed_before_writeback:1;
-      } writeback;
-      struct {
-        IssueState state;
-        byte taken:1, predtaken:1, ld_st_truly_unaligned:1;
-        PTEUpdateBase pteupdate;
-        W16s oldphysreg;
-        W16 oldphysreg_refcount;
-        W64 origvirt;
-        W64 total_user_insns_committed;
-        W64 target_rip;
-        W16 operand_physregs[MAX_OPERANDS];
-      } commit;
-    };
-
-    ostream& print(ostream& os) const;
-  };
-
-  struct EventLog {
-    OutOfOrderCoreEvent* start;
-    OutOfOrderCoreEvent* end;
-    OutOfOrderCoreEvent* tail;
-    ostream* logfile;
-
-    EventLog() { start = null; end = null; tail = null; logfile = null; }
-
-    bool init(size_t bufsize);
-    void reset();
-
-    OutOfOrderCoreEvent* add() {
-      if unlikely (tail >= end) {
-        tail = start;
-        flush();
-      }
-      OutOfOrderCoreEvent* event = tail;
-      tail++;
-      return event;
-    }
-
-    void flush(bool only_to_tail = false);
-
-    OutOfOrderCoreEvent* add(int type) {
-      return add()->fill(type);
-    }
-
-    OutOfOrderCoreEvent* add(int type, const RIPVirtPhys& rvp) {
-      return add()->fill(type, rvp);
-    }
-
-    OutOfOrderCoreEvent* add(int type, const FetchBufferEntry& uop) {
-      return add()->fill(type, uop);
-    }
-
-    OutOfOrderCoreEvent* add(int type, const ReorderBufferEntry* rob) {
-      return add()->fill(type, rob);
-    }
-
-    OutOfOrderCoreEvent* add_commit(int type, const ReorderBufferEntry* rob) {
-      return add()->fill_commit(type, rob);
-    }
-
-    OutOfOrderCoreEvent* add_load_store(int type, const ReorderBufferEntry* rob, LoadStoreQueueEntry* inherit_sfr = null, Waddr addr = 0) {
-      return add()->fill_load_store(type, rob, inherit_sfr, addr);
-    }
-
-    ostream& print(ostream& os, bool only_to_tail = false);
-  };
-
-  struct LoadStoreAliasPredictor: public FullyAssociativeTags<W64, 8> { };
-
-  enum {
-    ROB_STATE_READY = (1 << 0),
-    ROB_STATE_IN_ISSUE_QUEUE = (1 << 1),
-    ROB_STATE_PRE_READY_TO_DISPATCH = (1 << 2)
-  };
-
-#ifdef MULTI_IQ
-#define InitClusteredROBList(name, description, flags) \
-  name[0](description "-int0", rob_states, flags); \
-  name[1](description "-int1", rob_states, flags); \
-  name[2](description "-ld", rob_states, flags); \
-  name[3](description "-fp", rob_states, flags)
+#ifdef CORE_GENERIC
+#include <ooocore-generic.h>
+//#warning Using the generic PTLsim core.
+#else
+#ifdef CORE_AMD_K8
+#include <ooocore-amd-k8.h>
+//#warning Using the AMD K8 core.
+#else
+#ifdef CORE_AMD_BARCELONA_ASF
+#include <ooocore-amd-barcelona-asf.h>
+//#warning Using the AMD Barcelona ASF core.
 #else
-#define InitClusteredROBList(name, description, flags) \
-  name[0](description "-all", rob_states, flags);
+#error Please specify a core flavour by defining CORE_XXX in ptlsim.h!
 #endif
-
-  static const int ISSUE_QUEUE_SIZE = 16;
-
-  // How many bytes of x86 code to fetch into decode buffer at once
-  static const int ICACHE_FETCH_GRANULARITY = 16;
-  // Deadlock timeout: if nothing dispatches for this many cycles, flush the pipeline
-  static const int DISPATCH_DEADLOCK_COUNTDOWN_CYCLES = 256;
-  // Size of unaligned predictor Bloom filter
-  static const int UNALIGNED_PREDICTOR_SIZE = 4096;
-
-  struct ThreadContext {
-    OutOfOrderCore& core;
-    OutOfOrderCore& getcore() const { return core; }
-
-    int threadid;
-    Context& ctx;
-    BranchPredictorInterface branchpred;
-
-    Queue<FetchBufferEntry, FETCH_QUEUE_SIZE> fetchq;
-
-    ListOfStateLists rob_states;
-    ListOfStateLists lsq_states;
-    //
-    // Each ROB's state can be linked into at most one of the
-    // following rob_xxx_list lists at any given time; the ROB's
-    // current_state_list points back to the list it belongs to.
-    //
-    StateList rob_free_list;                             // Free ROB entyry
-    StateList rob_frontend_list;                         // Frontend in progress (artificial delay)
-    StateList rob_ready_to_dispatch_list;                // Ready to dispatch
-    StateList rob_dispatched_list[MAX_CLUSTERS];         // Dispatched but waiting for operands
-    StateList rob_ready_to_issue_list[MAX_CLUSTERS];     // Ready to issue (all operands ready)
-    StateList rob_ready_to_store_list[MAX_CLUSTERS];     // Ready to store (all operands except possibly rc are ready)
-    StateList rob_ready_to_load_list[MAX_CLUSTERS];      // Ready to load (all operands ready)
-    StateList rob_issued_list[MAX_CLUSTERS];             // Issued and in progress (or for loads, returned here after address is generated)
-    StateList rob_completed_list[MAX_CLUSTERS];          // Completed and result in transit for local and global forwarding
-    StateList rob_ready_to_writeback_list[MAX_CLUSTERS]; // Completed; result ready to writeback in parallel across all cluster register files
-    StateList rob_cache_miss_list;                       // Loads only: wait for cache miss to be serviced
-    StateList rob_tlb_miss_list;                         // TLB miss waiting to be serviced on one or more levels
-    StateList rob_memory_fence_list;                     // mf uops only: wait for memory fence to reach head of LSQ before completing
-    StateList rob_ready_to_commit_queue;                 // Ready to commit
-
-    Queue<ReorderBufferEntry, ROB_SIZE> ROB;
-
-    Queue<LoadStoreQueueEntry, LSQ_SIZE> LSQ;
-    RegisterRenameTable specrrt;
-    RegisterRenameTable commitrrt;
-
-    // Fetch-related structures
-    RIPVirtPhys fetchrip;
-    BasicBlock* current_basic_block;
-    int current_basic_block_transop_index;
-    bool stall_frontend;
-    bool waiting_for_icache_fill;
-    Waddr waiting_for_icache_fill_physaddr;
-
-    // Last block in icache we fetched into our buffer
-    W64 current_icache_block;
-    W64 fetch_uuid;
-    int loads_in_flight;
-    int stores_in_flight;
-    bool prev_interrupts_pending;
-    bool handle_interrupt_at_next_eom;
-    bool stop_at_next_eom;
-
-    W64 last_commit_at_cycle;
-    bool smc_invalidate_pending;
-    RIPVirtPhys smc_invalidate_rvp;
-    W64 chk_recovery_rip;
-
-    TransOpBuffer unaligned_ldst_buf;
-    LoadStoreAliasPredictor lsap;
-    int loads_in_this_cycle;
-    W64 load_to_store_parallel_forwarding_buffer[LOAD_FU_COUNT];
-
-    W64 consecutive_commits_inside_spinlock;
-
-    // statistics:
-    W64 total_uops_committed;
-    W64 total_insns_committed;
-    int dispatch_deadlock_countdown;    
-    int issueq_count;
-
-    //
-    // List of memory locks that will be removed from
-    // the lock controller when the macro-op commits.
-    //
-    // At most 4 chunks are allowed, to ensure
-    // cmpxchg16b works even with unaligned data.
-    //
-    byte queued_mem_lock_release_count;
-    W64 queued_mem_lock_release_list[4];
-
-    ThreadContext(OutOfOrderCore& core_, int threadid_, Context& ctx_): core(core_), threadid(threadid_), ctx(ctx_) {
-      reset();
-    }
-
-    int commit();
-    int writeback(int cluster);
-    int transfer(int cluster);
-    int complete(int cluster);
-    int dispatch();
-    void frontend();
-    void rename();
-    bool fetch();
-    void tlbwalk();
-
-    bool handle_barrier();
-    bool handle_exception();
-    bool handle_interrupt();
-    void reset_fetch_unit(W64 realrip);
-    void flush_pipeline();
-    void invalidate_smc();
-    void external_to_core_state();
-    void core_to_external_state() { }
-    void annul_fetchq();
-    BasicBlock* fetch_or_translate_basic_block(const RIPVirtPhys& rvp);
-    void redispatch_deadlock_recovery();
-    void flush_mem_lock_release_list(int start = 0);
-    int get_priority() const;
-
-    void dump_smt_state(ostream& os);
-    void print_smt_state(ostream& os);
-    void print_rob(ostream& os);
-    void print_lsq(ostream& os);
-    void print_rename_tables(ostream& os);
-
-    void reset();
-    void init();
-  };
-
-  //
-  // checkpointed core
-  //
-  struct OutOfOrderCore {
-    OutOfOrderMachine& machine;
-    int coreid;
-    OutOfOrderCore& getcore() const { return coreof(coreid); }
-
-    int threadcount;
-    ThreadContext* threads[MAX_THREADS_PER_CORE];
-
-    ListOfStateLists rob_states;
-    ListOfStateLists lsq_states;
-
-    EventLog eventlog;
-    ListOfStateLists physreg_states;
-    // Bandwidth counters:
-    int commitcount;
-    int writecount;
-    int dispatchcount;
-
-    byte round_robin_tid;
-
-    //
-    // Issue Queues (one per cluster)
-    //
-    int reserved_iq_entries;
-#define declare_issueq_templates template struct IssueQueue<ISSUE_QUEUE_SIZE>
-#ifdef MULTI_IQ
-    IssueQueue<ISSUE_QUEUE_SIZE> issueq_int0;
-    IssueQueue<ISSUE_QUEUE_SIZE> issueq_int1;
-    IssueQueue<ISSUE_QUEUE_SIZE> issueq_ld;
-    IssueQueue<ISSUE_QUEUE_SIZE> issueq_fp;
-
-    // Instantiate any issueq sizes used above:
-
-
-#define foreach_issueq(expr) { OutOfOrderCore& core = getcore(); core.issueq_int0.expr; core.issueq_int1.expr; core.issueq_ld.expr; core.issueq_fp.expr; }
-  
-    void sched_get_all_issueq_free_slots(int* a) {
-      a[0] = issueq_int0.remaining();
-      a[1] = issueq_int1.remaining();
-      a[2] = issueq_ld.remaining();
-      a[3] = issueq_fp.remaining();
-    }
-
-#define issueq_operation_on_cluster_with_result(core, cluster, rc, expr) \
-  switch (cluster) { \
-  case 0: rc = core.issueq_int0.expr; break; \
-  case 1: rc = core.issueq_int1.expr; break; \
-  case 2: rc = core.issueq_ld.expr; break; \
-  case 3: rc = core.issueq_fp.expr; break; \
-  }
-
-#define per_cluster_stats_update(prefix, cluster, expr) \
-  switch (cluster) { \
-  case 0: prefix.int0 expr; break; \
-  case 1: prefix.int1 expr; break; \
-  case 2: prefix.ld expr; break; \
-  case 3: prefix.fp expr; break; \
-  }
-
-#else
-    IssueQueue<ISSUE_QUEUE_SIZE> issueq_all;
-#define foreach_issueq(expr) { getcore().issueq_all.expr; }
-    void sched_get_all_issueq_free_slots(int* a) {
-      a[0] = issueq_all.remaining();
-    }
-#define issueq_operation_on_cluster_with_result(core, cluster, rc, expr) rc = core.issueq_all.expr;
-#define per_cluster_stats_update(prefix, cluster, expr) prefix.all expr;
-
-#endif
-
-#define per_physregfile_stats_update(prefix, rfid, expr) \
-  switch (rfid) { \
-  case 0: prefix.integer expr; break; \
-  case 1: prefix.fp expr; break; \
-  case 2: prefix.st expr; break; \
-  case 3: prefix.br expr; break; \
-  }
-
-#define issueq_operation_on_cluster(core, cluster, expr) { int dummyrc; issueq_operation_on_cluster_with_result(core, cluster, dummyrc, expr); }
-
-#define for_each_cluster(iter) foreach (iter, MAX_CLUSTERS)
-#define for_each_operand(iter) foreach (iter, MAX_OPERANDS)
-
-    OutOfOrderCore(int coreid_, OutOfOrderMachine& machine_): coreid(coreid_), machine(machine_), cache_callbacks(*this) {
-      threadcount = 0;
-      setzero(threads);
-    }
-    
-    ~OutOfOrderCore(){};
-
-    // 
-    // Initialize structures independent of the core parameters
-    //
-    void init_generic();
-    void reset();
-
-    //
-    // Initialize all structures for the first time
-    //
-    void init() {
-      init_generic();
-      //
-      // Physical register files
-      //
-      physregfiles[0]("int", coreid, 0, PHYS_REG_FILE_SIZE);
-      physregfiles[1]("fp", coreid, 1, PHYS_REG_FILE_SIZE);
-      physregfiles[2]("st", coreid, 2, STQ_SIZE * MAX_THREADS_PER_CORE);
-      physregfiles[3]("br", coreid, 3, MAX_BRANCHES_IN_FLIGHT * MAX_THREADS_PER_CORE);
-    }
-
-    //
-    // Physical Registers
-    //
-
-    enum { PHYS_REG_FILE_INT, PHYS_REG_FILE_FP, PHYS_REG_FILE_ST, PHYS_REG_FILE_BR };
-
-    enum {  
-      PHYS_REG_FILE_MASK_INT = (1 << 0),
-      PHYS_REG_FILE_MASK_FP  = (1 << 1),
-      PHYS_REG_FILE_MASK_ST  = (1 << 2),
-      PHYS_REG_FILE_MASK_BR  = (1 << 3)
-    };
-
-    // Major core structures
-    PhysicalRegisterFile physregfiles[PHYS_REG_FILE_COUNT];
-    int round_robin_reg_file_offset;
-    W32 fu_avail;
-    ReorderBufferEntry* robs_on_fu[FU_COUNT];
-    CacheSubsystem::CacheHierarchy caches;
-    OutOfOrderCoreCacheCallbacks cache_callbacks;
-
-    // Unaligned load/store predictor
-    bitvec<UNALIGNED_PREDICTOR_SIZE> unaligned_predictor;
-    static int hash_unaligned_predictor_slot(const RIPVirtPhysBase& rvp);
-    bool get_unaligned_hint(const RIPVirtPhysBase& rvp) const;
-    void set_unaligned_hint(const RIPVirtPhysBase& rvp, bool value);
-
-    // Pipeline Stages
-    bool runcycle();
-    void flush_pipeline_all();
-    bool fetch();
-    void rename();
-    void frontend();
-    int dispatch();
-    int issue(int cluster);
-    int complete(int cluster);
-    int transfer(int cluster);
-    int writeback(int cluster);
-    int commit();
-
-    // Callbacks
-    void flush_tlb(Context& ctx, int threadid, bool selective = false, Waddr virtaddr = 0);
-
-    // Debugging
-    void dump_smt_state(ostream& os);
-    void print_smt_state(ostream& os);
-    void check_refcounts();
-    void check_rob();
-  };
-
-#define MAX_SMT_CORES 1
-
-  struct OutOfOrderMachine: public PTLsimMachine {
-    OutOfOrderCore* cores[MAX_SMT_CORES];
-    bitvec<MAX_CONTEXTS> stopped;
-    OutOfOrderMachine(const char* name);
-    virtual bool init(PTLsimConfig& config);
-    virtual int run(PTLsimConfig& config);
-    virtual void dump_state(ostream& os);
-    virtual void update_stats(PTLsimStats& stats);
-    virtual void flush_tlb(Context& ctx);
-    virtual void flush_tlb_virt(Context& ctx, Waddr virtaddr);
-    void flush_all_pipelines();
-  };
-
-  extern CycleTimer cttotal;
-  extern CycleTimer ctfetch;
-  extern CycleTimer ctdecode;
-  extern CycleTimer ctrename;
-  extern CycleTimer ctfrontend;
-  extern CycleTimer ctdispatch;
-  extern CycleTimer ctissue;
-  extern CycleTimer ctissueload;
-  extern CycleTimer ctissuestore;
-  extern CycleTimer ctcomplete;
-  extern CycleTimer cttransfer;
-  extern CycleTimer ctwriteback;
-  extern CycleTimer ctcommit;
-
-#ifdef DECLARE_STRUCTURES
-  //
-  // The following configuration has two integer/store clusters with a single cycle
-  // latency between them, but both clusters can access the load pseudo-cluster with
-  // no extra cycle. The floating point cluster is two cycles from everything else.
-  //
-#ifdef MULTI_IQ
-  const Cluster clusters[MAX_CLUSTERS] = {
-    {"int0",  2, (FU_ALU0|FU_STU0)},
-    {"int1",  2, (FU_ALU1|FU_STU1)},
-    {"ld",    2, (FU_LDU0|FU_LDU1)},
-    {"fp",    2, (FU_FPU0|FU_FPU1)},
-  };
-
-  const byte intercluster_latency_map[MAX_CLUSTERS][MAX_CLUSTERS] = {
-    // I0 I1 LD FP <-to
-    {0, 1, 0, 2}, // from I0
-    {1, 0, 0, 2}, // from I1
-    {0, 0, 0, 2}, // from LD
-    {2, 2, 2, 0}, // from FP
-  };
-
-  const byte intercluster_bandwidth_map[MAX_CLUSTERS][MAX_CLUSTERS] = {
-    // I0 I1 LD FP <-to
-    {2, 2, 1, 1}, // from I0
-    {2, 2, 1, 1}, // from I1
-    {1, 1, 2, 2}, // from LD
-    {1, 1, 1, 2}, // from FP
-  };
-
-#else // single issueq
-  const Cluster clusters[MAX_CLUSTERS] = {
-    {"all",  4, (FU_ALU0|FU_ALU1|FU_STU0|FU_STU1|FU_LDU0|FU_LDU1|FU_FPU0|FU_FPU1)},
-   };
-  const byte intercluster_latency_map[MAX_CLUSTERS][MAX_CLUSTERS] = {{0}};
-  const byte intercluster_bandwidth_map[MAX_CLUSTERS][MAX_CLUSTERS] = {{64}};
-#endif // multi_issueq
-
-#endif // DECLARE_STRUCTURES
-
-#endif // INSIDE_OOOCORE
-
-  //
-  // This part is used when parsing stats.h to build the
-  // data store template; these must be in sync with the
-  // corresponding definitions elsewhere.
-  //
-#ifdef MULTI_IQ
-  static const char* cluster_names[MAX_CLUSTERS] = {"int0", "int1", "ld", "fp"};
-#else
-  static const char* cluster_names[MAX_CLUSTERS] = {"all"};
-#endif
-
-  static const char* phys_reg_file_names[PHYS_REG_FILE_COUNT] = {"int", "fp", "st", "br"};
-};
-
-struct PerContextOutOfOrderCoreStats { // rootnode:
-  struct fetch {
-    struct stop { // node: summable
-      W64 stalled;
-      W64 icache_miss;
-      W64 fetchq_full;
-      W64 issueq_quota_full;
-      W64 bogus_rip;
-      W64 microcode_assist;
-      W64 branch_taken;
-      W64 full_width;
-    } stop;
-    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
-    W64 width[OutOfOrderModel::FETCH_WIDTH+1]; // histo: 0, OutOfOrderModel::FETCH_WIDTH, 1
-    W64 blocks;
-    W64 uops;
-    W64 user_insns;
-  } fetch;
-
-  struct frontend {
-    struct status { // node: summable
-      W64 complete;
-      W64 fetchq_empty;
-      W64 rob_full;
-      W64 physregs_full;
-      W64 ldq_full;
-      W64 stq_full;
-    } status;
-    W64 width[OutOfOrderModel::FRONTEND_WIDTH+1]; // histo: 0, OutOfOrderModel::FRONTEND_WIDTH, 1
-    struct renamed {
-      W64 none;
-      W64 reg;
-      W64 flags;
-      W64 reg_and_flags;
-    } renamed;
-    struct alloc {
-      W64 reg;
-      W64 ldreg;
-      W64 sfr;
-      W64 br;
-    } alloc;
-    // NOTE: This is capped at 255 consumers to keep the size reasonable:
-    W64 consumer_count[256]; // histo: 0, 255, 1
-  } frontend;
-
-  struct dispatch {
-    W64 cluster[OutOfOrderModel::MAX_CLUSTERS]; // label: OutOfOrderModel::cluster_names
-    struct redispatch {
-      W64 trigger_uops;
-      W64 deadlock_flushes;
-      W64 deadlock_uops_flushed;
-      W64 dependent_uops[OutOfOrderModel::ROB_SIZE+1]; // histo: 0, OutOfOrderModel::ROB_SIZE, 1
-    } redispatch;
-  } dispatch;
-
-  struct issue {
-    W64 uops;
-    double uipc;
-    struct result { // node: summable
-      W64 no_fu;
-      W64 replay;
-      W64 misspeculated;
-      W64 refetch;
-      W64 branch_mispredict;
-      W64 exception;
-      W64 complete;
-    } result;
-    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
-  } issue;
-
-  struct writeback {
-    W64 writebacks[OutOfOrderModel::PHYS_REG_FILE_COUNT]; // label: OutOfOrderModel::phys_reg_file_names
-  } writeback;
-
-  struct commit {
-    W64 uops;
-    W64 insns;
-    double uipc;
-    double ipc;
-
-    struct result { // node: summable
-      W64 none;
-      W64 ok;
-      W64 exception;
-      W64 skipblock;
-      W64 barrier;
-      W64 smc;
-      W64 memlocked;
-      W64 stop;
-    } result;
-
-    struct setflags { // node: summable
-      W64 yes;
-      W64 no;
-    } setflags;
-
-    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
-  } commit;
-
-  struct branchpred {
-    W64 predictions;
-    W64 updates;
-
-    // These counters are [0] = mispred, [1] = correct
-    W64 cond[2]; // label: branchpred_outcome_names
-    W64 indir[2]; // label: branchpred_outcome_names
-    W64 ret[2]; // label: branchpred_outcome_names
-    W64 summary[2]; // label: branchpred_outcome_names
-    struct ras { // node: summable
-      W64 pushes;
-      W64 overflows;
-      W64 pops;
-      W64 underflows;
-      W64 annuls;
-    } ras;
-  } branchpred;
-
-  struct dcache {
-    struct load {
-      struct issue { // node: summable
-        W64 complete;
-        W64 miss;
-        W64 exception;
-        W64 ordering;
-        W64 unaligned;
-        struct replay { // node: summable
-          W64 sfr_addr_and_data_not_ready;
-          W64 sfr_addr_not_ready;
-          W64 sfr_data_not_ready;
-          W64 missbuf_full;
-          W64 interlocked;
-          W64 interlock_overflow;
-          W64 fence;
-          W64 bank_conflict;
-        } replay;
-      } issue;
-
-      struct forward { // node: summable
-        W64 cache;
-        W64 sfr;
-        W64 sfr_and_cache;
-      } forward;
-        
-      struct dependency { // node: summable
-        W64 independent;
-        W64 predicted_alias_unresolved;
-        W64 stq_address_match;
-        W64 stq_address_not_ready;
-        W64 fence;
-      } dependency;
-        
-      struct type { // node: summable
-        W64 aligned;
-        W64 unaligned;
-        W64 internal;
-      } type;
-        
-      W64 size[4]; // label: sizeshift_names
-
-      W64 datatype[DATATYPE_COUNT]; // label: datatype_names
-    } load;
-
-    struct store {
-      struct issue { // node: summable
-        W64 complete;
-        W64 exception;
-        W64 ordering;
-        W64 unaligned;
-        struct replay { // node: summable
-          W64 sfr_addr_and_data_not_ready;
-          W64 sfr_addr_not_ready;
-          W64 sfr_data_not_ready;
-          W64 sfr_addr_and_data_and_data_to_store_not_ready;
-          W64 sfr_addr_and_data_to_store_not_ready;
-          W64 sfr_data_and_data_to_store_not_ready;
-          W64 interlocked;
-          W64 fence;
-          W64 parallel_aliasing;
-          W64 bank_conflict;
-        } replay;
-      } issue;
-
-      struct forward { // node: summable
-        W64 zero;
-        W64 sfr;
-      } forward;
-        
-      struct type { // node: summable
-        W64 aligned;
-        W64 unaligned;
-        W64 internal;
-      } type;
-        
-      W64 size[4]; // label: sizeshift_names
-
-      W64 datatype[DATATYPE_COUNT]; // label: datatype_names
-    } store;
-
-    struct fence { // node: summable
-      W64 lfence;
-      W64 sfence;
-      W64 mfence;
-    } fence;
-  } dcache;
-};
-
-//
-// Out-of-Order Core
-//
-struct OutOfOrderCoreStats { // rootnode:
-  W64 cycles;
-
-  struct dispatch {
-    struct source { // node: summable
-      W64 integer[OutOfOrderModel::MAX_PHYSREG_STATE]; // label: OutOfOrderModel::physreg_state_names
-      W64 fp[OutOfOrderModel::MAX_PHYSREG_STATE]; // label: OutOfOrderModel::physreg_state_names
-      W64 st[OutOfOrderModel::MAX_PHYSREG_STATE]; // label: OutOfOrderModel::physreg_state_names
-      W64 br[OutOfOrderModel::MAX_PHYSREG_STATE]; // label: OutOfOrderModel::physreg_state_names
-    } source;
-    W64 width[OutOfOrderModel::DISPATCH_WIDTH+1]; // histo: 0, OutOfOrderModel::DISPATCH_WIDTH, 1
-  } dispatch;
-
-  struct issue {
-    struct source { // node: summable
-      W64 integer[OutOfOrderModel::MAX_PHYSREG_STATE]; // label: OutOfOrderModel::physreg_state_names
-      W64 fp[OutOfOrderModel::MAX_PHYSREG_STATE]; // label: OutOfOrderModel::physreg_state_names
-      W64 st[OutOfOrderModel::MAX_PHYSREG_STATE]; // label: OutOfOrderModel::physreg_state_names
-      W64 br[OutOfOrderModel::MAX_PHYSREG_STATE]; // label: OutOfOrderModel::physreg_state_names
-    } source;
-    struct width {
-#ifdef MULTI_IQ
-      W64 int0[OutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, OutOfOrderModel::MAX_ISSUE_WIDTH, 1
-      W64 int1[OutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, OutOfOrderModel::MAX_ISSUE_WIDTH, 1
-      W64 ld[OutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, OutOfOrderModel::MAX_ISSUE_WIDTH, 1
-      W64 fp[OutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, OutOfOrderModel::MAX_ISSUE_WIDTH, 1
-#else
-      W64 all[OutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, OutOfOrderModel::MAX_ISSUE_WIDTH, 1
 #endif
-    } width;
-  } issue;
-
-  struct writeback {
-    struct width {
-#ifdef MULTI_IQ
-      W64 int0[OutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, OutOfOrderModel::MAX_ISSUE_WIDTH, 1
-      W64 int1[OutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, OutOfOrderModel::MAX_ISSUE_WIDTH, 1
-      W64 ld[OutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, OutOfOrderModel::MAX_ISSUE_WIDTH, 1
-      W64 fp[OutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, OutOfOrderModel::MAX_ISSUE_WIDTH, 1
-#else
-      W64 all[OutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, OutOfOrderModel::MAX_ISSUE_WIDTH, 1
 #endif
-    } width;
-  } writeback;
-
-  struct commit {
-    struct freereg { // node: summable
-      W64 pending;
-      W64 free;
-    } freereg;
-
-    W64 free_regs_recycled;
-
-    W64 width[OutOfOrderModel::COMMIT_WIDTH+1]; // histo: 0, OutOfOrderModel::COMMIT_WIDTH, 1
-  } commit;
-
-  struct branchpred {
-    W64 predictions;
-    W64 updates;
-
-    // These counters are [0] = mispred, [1] = correct
-    W64 cond[2]; // label: branchpred_outcome_names
-    W64 indir[2]; // label: branchpred_outcome_names
-    W64 ret[2]; // label: branchpred_outcome_names
-    W64 summary[2]; // label: branchpred_outcome_names
-    struct ras { // node: summable
-      W64 pushes;
-      W64 overflows;
-      W64 pops;
-      W64 underflows;
-      W64 annuls;
-    } ras;
-  } branchpred;
-
-  PerContextOutOfOrderCoreStats total;
-  PerContextOutOfOrderCoreStats vcpu0;
-  PerContextOutOfOrderCoreStats vcpu1;
-  PerContextOutOfOrderCoreStats vcpu2;
-  PerContextOutOfOrderCoreStats vcpu3;
-
-  struct simulator {
-    double total_time;
-    struct cputime { // node: summable
-      double fetch;
-      double decode;
-      double rename;
-      double frontend;
-      double dispatch;
-      double issue;
-      double issueload;
-      double issuestore;
-      double complete;
-      double transfer;
-      double writeback;
-      double commit;
-    } cputime;
-  } simulator;
-};
 
 #endif // _OOOCORE_H_
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/oooexec.cpp ptlsim-asf/oooexec.cpp
--- ptlsim-stable/oooexec.cpp	2009-10-30 19:40:25.130183000 +0100
+++ ptlsim-asf/oooexec.cpp	2010-03-02 12:00:55.517574000 +0100
@@ -5,6 +5,8 @@
 //
 // Copyright 2003-2008 Matt T. Yourst <yourst@yourst.com>
 // Copyright 2006-2008 Hui Zeng <hzeng@cs.binghamton.edu>
+// Copyright (c) 2007-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -326,6 +328,10 @@
   bool ld = isload(uop.opcode);
   bool st = isstore(uop.opcode);
   bool br = isbranch(uop.opcode);
+  bool pf  = isprefetch(uop.opcode);
+#ifdef ENABLE_ASF
+  bool asf = isasf(uop.opcode);
+#endif
 
   assert(operands[RA]->ready());
   assert(rb.ready());
@@ -383,13 +389,23 @@
 
       state.reg.rddata = lsq->data;
       state.reg.rdflags = (lsq->invalid << log2(FLAG_INV)) | ((!lsq->datavalid) << log2(FLAG_WAIT));
+      //This is never used: state.reg.addr = lsq->physaddr;
       if unlikely (completed == ISSUE_NEEDS_REPLAY) {
         per_context_ooocore_stats_update(threadid, issue.result.replay++);
         return ISSUE_NEEDS_REPLAY;
       }
-    } else if unlikely (uop.opcode == OP_ld_pre) {
-      issueprefetch(state, radata, rbdata, rcdata, uop.cachelevel);
-    } else {
+    } else if unlikely (pf) {
+      issueprefetch(state, radata, rbdata, rcdata, uop.cachelevel, pteupdate);
+    }
+#ifdef ENABLE_ASF
+    else if unlikely (asf) {
+      if unlikely (thread.asf_pipeline_intercept.issue(*this, state, radata, rbdata, rcdata ) == ISSUE_NEEDS_REPLAY) {
+        per_context_ooocore_stats_update(threadid, issue.result.replay++);
+        return ISSUE_NEEDS_REPLAY;
+      }
+    }
+#endif
+    else {
       if unlikely (br) {
         state.brreg.riptaken = uop.riptaken;
         state.brreg.ripseq = uop.ripseq;
@@ -753,6 +769,14 @@
   
   Waddr physaddr = addrgen(state, origaddr, virtpage, ra, rb, rc, pteupdate, addr, exception, pfec, annul);
 
+#ifdef ENABLE_ASF
+  if unlikely (uop.is_asf) {
+    // SD HACKALERT: Drop the is_asf flag on an annulled instruction, i.e. one without side effects.
+    if unlikely (annul) uop.is_asf = 0;
+    if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,"@",sim_cycle,": Issueing ", *this, "@", uop.rip, endl;
+  }
+#endif
+
   if unlikely (exception) {
     return (handle_common_load_store_exceptions(state, origaddr, addr, exception, pfec)) ? ISSUE_COMPLETED : ISSUE_MISSPECULATED;
   }
@@ -936,6 +960,8 @@
         return ISSUE_NEEDS_REPLAY;
       }
 
+      // NOTE: This exception is never found at commit time thanks to
+      // redispatching this load further down
       state.invalid = 1;
       state.data = EXCEPTION_LoadStoreAliasing;
       state.datavalid = 1;
@@ -961,6 +987,16 @@
     }
   }
 
+#ifdef ENABLE_ASF
+  if unlikely (uop.is_asf) {
+    int res = thread.asf_pipeline_intercept.issue_store(*this, state);
+    if (res != ISSUE_COMPLETED)
+      return res;
+  }
+#endif
+
+  // FIXME: Stores must check the TLB, too!
+
   //
   // Cache coherent interlocking
   //
@@ -1072,12 +1108,28 @@
   int aligntype = uop.cond;
   bool signext = (uop.opcode == OP_ldx);
 
+  /* SD: There are quite a few addresses in this function:
+     state.physaddr - physical address aligned to 8 bytes, shifted right by 3
+     physaddr       - physical address, not shifted
+     addr           - effective virtual address, adjusted for unaligned
+                      loads of halves
+     origaddr       - effective virtual address, not adjusted for unaligned
+                      accesses -> value in the ROB entry
+     virtpage       - same as addr, but stored inside the ROB entry
+   */
   Waddr addr;
   int exception = 0;
   PageFaultErrorCode pfec;
   bool annul;
 
   Waddr physaddr = addrgen(state, origaddr, virtpage, ra, rb, rc, pteupdate, addr, exception, pfec, annul);
+#ifdef ENABLE_ASF
+  if unlikely (uop.is_asf) {
+    // SD HACKALERT: Drop the is_asf flag on an annulled instruction, i.e. one without side effects.
+    if unlikely (annul) uop.is_asf = 0;
+    if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,"@",sim_cycle,": Issueing ", *this, "@", uop.rip, endl;
+  }
+#endif
 
   if unlikely (exception) {
     return (handle_common_load_store_exceptions(state, origaddr, addr, exception, pfec)) ? ISSUE_COMPLETED : ISSUE_MISSPECULATED;
@@ -1096,7 +1148,13 @@
   // only arrives later, but it saves us from having to copy
   // cache lines around...
   //
+  // NOTE: This might read speculatively modified data of an ASF CS on our core, but the correct values are reread on replay.
   W64 data = (annul) ? 0 : loadphys(physaddr);
+#ifdef ENABLE_ASF
+  // NOTE: This relies on issue_probe_and_merge to leave the data alone in case
+  //       nothing has been updated
+  thread.asf_pipeline_intercept.issue_probe_and_merge(physaddr, uop.invalidating, data);
+#endif
 
   LoadStoreQueueEntry* sfra = null;
 
@@ -1136,6 +1194,10 @@
     } else {
       // Address is unknown: is it a memory fence that hasn't committed?
       if unlikely (stbuf.lfence) {
+#ifdef ENABLE_ASF
+        // SD: Special asf-lfences separate locked-lds from the next ASF CS
+        if (stbuf.rob->uop.is_asf && !this->uop.is_asf) continue;
+#endif
         per_context_ooocore_stats_update(threadid, dcache.load.dependency.fence++);
         sfra = &stbuf;
         break;
@@ -1210,6 +1272,14 @@
     return ISSUE_NEEDS_REPLAY;
   }
 
+#ifdef ENABLE_ASF
+  if unlikely (uop.is_asf) {
+    int res = thread.asf_pipeline_intercept.issue_load(*this, state, sfra);
+	if (res != ISSUE_COMPLETED)
+	  return res;
+  }
+#endif
+
 #ifdef ENFORCE_L1_DCACHE_BANK_CONFLICTS
   foreach (i, thread.loads_in_this_cycle) {
     W64 prevaddr = thread.load_to_store_parallel_forwarding_buffer[i];
@@ -1220,7 +1290,17 @@
     // allowed since the chunk has been loaded anyway, so we might
     // as well use it.
     //
-    if unlikely ((prevaddr != state.physaddr) && (lowbits(prevaddr, log2(CacheSubsystem::L1_DCACHE_BANKS)) == lowbits(state.physaddr, log2(CacheSubsystem::L1_DCACHE_BANKS)))) {
+    // S.D.: More flexible bank layout: banks may span cachelines now.
+    // S.D.: Barcelona has 128 bits cache width (fitting with the banksize ;))
+    //       so we will not conflict on the same 16 byte chunk! Simply assume
+    //       that such a chunksize is equal to the BANK_SIZE, which is actually
+    //       sane!
+    int bank_idx_start = log2(CacheSubsystem::L1_DCACHE_BANKSIZE) - 3;
+    int bank_idx_bits  = log2(CacheSubsystem::L1_DCACHE_BANKS);
+
+    if unlikely (((prevaddr >> bank_idx_start) != (state.physaddr >> bank_idx_start)) && (
+                  bits(prevaddr,       bank_idx_start, bank_idx_bits) ==
+                  bits(state.physaddr, bank_idx_start, bank_idx_bits))) {
       if unlikely (config.event_log_enabled) core.eventlog.add_load_store(EVENT_LOAD_BANK_CONFLICT, this, null, addr);
       per_context_ooocore_stats_update(threadid, dcache.load.issue.replay.bank_conflict++);
 
@@ -1279,8 +1359,10 @@
       }
  
       // Double-locking within a thread is NOT allowed!
-      assert(lock->vcpuid != thread.ctx.vcpuid);
-      assert(lock->threadid != threadid);
+      // S.D. This assertion is checked already in if's condition
+      //assert(lock->vcpuid != thread.ctx.vcpuid);
+      // S.D. This assertion breaks with threadids being local to cores
+      //assert(lock->threadid != threadid);
 
       per_context_ooocore_stats_update(threadid, dcache.load.issue.replay.interlocked++);
       replay_locked();
@@ -1288,11 +1370,11 @@
     }
 
     // Issuing more than one ld.acq on the same block is not allowed:
-    if (lock) {
+    if (lock) { //&& (lock->vcpuid == thread.ctx.vcpuid)
       logfile << "ERROR: thread ", thread.ctx.vcpuid, " uuid ", uop.uuid, " over physaddr ", (void*)physaddr, ": lock was already acquired by vcpuid ", lock->vcpuid, " uuid ", lock->uuid, " rob ", lock->rob, endl;
       assert(false);
     }
- 
+    // S.D. Location NOT locked
     if unlikely (uop.locked) {
       //
       // Attempt to acquire an exclusive lock on the block via ld.acq,
@@ -1409,6 +1491,7 @@
   assert(thread.loads_in_this_cycle < LOAD_FU_COUNT);
   thread.load_to_store_parallel_forwarding_buffer[thread.loads_in_this_cycle++] = state.physaddr;
 
+  // Internal loads don't hit the cache hierarchy, but rather complete in two cycles.
   if unlikely (uop.internal) {
     cycles_left = LOADLAT;
 
@@ -1416,8 +1499,10 @@
 
     load_store_second_phase = 1;
     state.datavalid = 1;
-    physreg->flags &= ~FLAG_WAIT;
-    physreg->complete();
+    // SD: Make the destination available at the complete time, when the load
+    //     actually produces the data, not now, at the beginning of issue...
+    // physreg->flags &= ~FLAG_WAIT;
+    // physreg->complete();
     changestate(thread.rob_issued_list[cluster]);
     lfrqslot = -1;
     forward_cycle = 0;
@@ -1431,22 +1516,35 @@
     // TLB miss: 
     //
     if unlikely (config.event_log_enabled) event = core.eventlog.add_load_store(EVENT_LOAD_TLB_MISS, this, sfra, addr);
+
+#ifdef USE_L2_TLB
+    // S.D.: A quick hack just sets the walklevel to zero, but makes the rob
+    // stay in the tlbmiss-state for the L2-DTLB latency.
+    if likely (core.caches.l2dtlb.probe(addr, threadid)) {
+      cycles_left = CacheSubsystem::L2_DTLB_LATENCY;
+      tlb_walk_level = 0;
+      per_context_dcache_stats_update(threadid, load.dtlb.l2hits++);
+    } else
+#endif /* USE_L2_TLB */
+    {
     cycles_left = 0;
     tlb_walk_level = thread.ctx.page_table_level_count();
-    changestate(thread.rob_tlb_miss_list);
     per_context_dcache_stats_update(threadid, load.dtlb.misses++);
-    
+    }
+    changestate(thread.rob_tlb_miss_list);
     return ISSUE_COMPLETED;
   }
 
-  per_context_dcache_stats_update(threadid, load.dtlb.hits++);
-#endif
+  per_context_dcache_stats_update(threadid, load.dtlb.l1hits++);
+#endif /* USE_TLB */
 
-  return probecache(physaddr, sfra);
+  return probecache(addr, sfra);
 }
 
 //
 // Probe the cache and initiate a miss if required
+// Parameters: addr - effective virtual address, adjusted for unaligned loads!
+//             sfra - LSQ-entry of aliasing load
 //
 int ReorderBufferEntry::probecache(Waddr addr, LoadStoreQueueEntry* sfra) {
   OutOfOrderCore& core = getcore();
@@ -1459,7 +1557,7 @@
   LoadStoreQueueEntry& state = *lsq;
   W64 physaddr = state.physaddr << 3;
 
-  bool L1hit = (config.perfect_cache) ? 1 : core.caches.probe_cache_and_sfr(physaddr, sfra, sizeshift);
+  bool L1hit = (config.perfect_cache) ? 1 : core.caches.probe_cache_and_sfr(physaddr, addr, sfra, sizeshift);
 
   if likely (L1hit) {    
     cycles_left = LOADLAT;
@@ -1468,16 +1566,27 @@
     
     load_store_second_phase = 1;
     state.datavalid = 1;
-    physreg->flags &= ~FLAG_WAIT;
-    physreg->complete();
+    // SD: Make the destination available at the complete time, when the load
+    //     actually produces the data, not now, at the beginning of issue...
+    // physreg->flags &= ~FLAG_WAIT;
+    // physreg->complete();
     changestate(thread.rob_issued_list[cluster]);
     lfrqslot = -1;
     forward_cycle = 0;
 
+    // If we have an ASF invalidating probe, this will be decoded as a load, but will also invalidate the line in all other caches!
+    if unlikely (uop.invalidating) core.caches.probe_other_caches(physaddr, addr, true);
+
     per_context_ooocore_stats_update(threadid, dcache.load.issue.complete++);
     per_context_dcache_stats_update(threadid, load.hit.L1++);
     return ISSUE_COMPLETED;
   }
+  /* in case we miss, no additional probe has to be issued (eg waiting for a PROBE_ACK, in case we received a PROBE_WAIT above, as
+     the rollback can't take longer than the cache miss: the core could either forward the original value from its LLB, when serving
+     the cache miss (that'd be MOESI) or write back the value from the LLB / ignore the changes made in L1 when using MESI!)
+     Hence, we can be sure that the data we're reading was valid at some point in time, i.e. we can find a linearisation point for
+     this read. If this read is actually part of an ASF spec. region, we don't have to worry either, as the address is already in our
+     LLB, and in case the other guy restarts his transaction after his rollback, our transaction will be aborted anyways! */
 
   per_context_ooocore_stats_update(threadid, dcache.load.issue.miss++);
 
@@ -1492,12 +1601,18 @@
   lsi.sfrused = 0;
   lsi.internal = uop.internal;
   lsi.signext = signext;
+#ifdef ENABLE_ASF_CACHE_BASED
+  lsi.asf_spec = uop.is_asf;
+#endif
 
   SFR dummysfr;
   setzero(dummysfr);
-  lfrqslot = core.caches.issueload_slowpath(physaddr, dummysfr, lsi);
+  lfrqslot = core.caches.issueload_slowpath(physaddr, addr, dummysfr, lsi);
   assert(lfrqslot >= 0);
 
+  // If we have an ASF invalidating probe, this will be decoded as a load, but will also invalidate the line in all other caches!
+  if unlikely (uop.invalidating) core.caches.probe_other_caches(physaddr,addr, true);
+
   if unlikely (config.event_log_enabled) event = core.eventlog.add_load_store(EVENT_LOAD_MISS, this, sfra, addr);
 
   return ISSUE_COMPLETED;
@@ -1514,6 +1629,18 @@
   OutOfOrderCoreEvent* event;
   W64 virtaddr = virtpage;
 
+#ifdef USE_L2_TLB
+  //
+  // Hits in the L2 TLB are tlb-walks with an initial level of 0 and cycles_left
+  // set to the latency of the L2. Hence we check if there are any cycles left and
+  // just decrement them.
+  //
+  if likely(cycles_left) {
+    cycles_left--;
+    return;
+  }
+#endif
+
   if unlikely (!tlb_walk_level) {
     // End of walk sequence: try to probe cache
     if unlikely (core.caches.lfrq_or_missbuf_full()) {
@@ -1529,10 +1656,14 @@
 
     if unlikely (config.event_log_enabled) event = core.eventlog.add_load_store(EVENT_TLBWALK_COMPLETE, this, null, virtaddr);
     core.caches.dtlb.insert(virtaddr, threadid);
-
+#ifdef USE_L2_TLB
+    core.caches.l2dtlb.insert(virtaddr, threadid);
+#endif
     if unlikely (isprefetch(uop.opcode)) {
-      physreg->flags &= ~FLAG_WAIT;
-      physreg->complete();
+      // SD: Make the destination available at the complete time, not so much
+      //     an issue for prefetches, but for sake of consistency...
+      // physreg->flags &= ~FLAG_WAIT;
+      // physreg->complete();
       changestate(thread.rob_issued_list[cluster]);
       forward_cycle = 0;
       int exception;
@@ -1540,7 +1671,7 @@
       PTEUpdate pteupdate;
       Context& ctx = getthread().ctx;
       Waddr physaddr = ctx.check_and_translate(virtaddr, 1, 0, 0, exception, pfec, pteupdate);
-      core.caches.initiate_prefetch(physaddr, uop.cachelevel);
+      core.caches.initiate_prefetch(physaddr, virtaddr, uop.cachelevel);
     } else {
       probecache(virtaddr, null);
     }
@@ -1676,8 +1807,9 @@
 
 //
 // Issues a prefetch on the given memory address into the specified cache level.
+// TODO: Check for interplay with locked memory regions!
 //
-void ReorderBufferEntry::issueprefetch(IssueState& state, W64 ra, W64 rb, W64 rc, int cachelevel) {
+void ReorderBufferEntry::issueprefetch(IssueState& state, W64 ra, W64 rb, W64 rc, int cachelevel, PTEUpdate& pteupdate) {
   OutOfOrderCore& core = getcore();
   ThreadContext& thread = getthread();
 
@@ -1687,13 +1819,18 @@
   int exception = 0;
   Waddr addr;
   Waddr origaddr;
-  PTEUpdate pteupdate;
+  PTEUpdate dummy_pteu;
   PageFaultErrorCode pfec;
   bool annul;
 
   LoadStoreQueueEntry dummy;
   setzero(dummy);
-  Waddr physaddr = addrgen(dummy, origaddr, virtpage, ra, rb, rc, pteupdate, addr, exception, pfec, annul);
+  Waddr physaddr = addrgen(dummy, origaddr, virtpage, ra, rb, rc, uop.is_asf ? pteupdate : dummy_pteu, addr, exception, pfec, annul);
+
+#ifdef ENABLE_ASF
+  // LOCKed PREFETCHes are decoded into loads!
+  assert(!uop.is_asf);
+#endif
 
   // Ignore bogus prefetches:
   if unlikely (exception) return;
@@ -1712,42 +1850,74 @@
     // a TLB miss, so this is disabled by default.
     //
     if unlikely (config.event_log_enabled) OutOfOrderCoreEvent* event = core.eventlog.add_load_store(EVENT_LOAD_TLB_MISS, this, null, addr);
+#ifdef USE_L2_TLB
+    // S.D.: A quick hack just sets the walklevel to zero, but makes the rob
+    // stay in the tlbmiss for the L2-DTLB latency.
+    if likely (core.caches.l2dtlb.probe(addr, threadid)) {
+      cycles_left = CacheSubsystem::L2_DTLB_LATENCY;
+      tlb_walk_level = 0;
+      per_context_dcache_stats_update(threadid, load.dtlb.l2hits++);
+    } else
+#endif /* USE_L2_TLB */
+    {
     cycles_left = 0;
     tlb_walk_level = thread.ctx.page_table_level_count();
+      per_context_dcache_stats_update(threadid, load.dtlb.misses++);
+    }
     changestate(thread.rob_tlb_miss_list);
-    per_context_dcache_stats_update(thread.threadid, load.dtlb.misses++);
-#endif
     return;
-  }
-
-  per_context_dcache_stats_update(threadid, load.dtlb.hits++);
 #endif
+  }
 
-  core.caches.initiate_prefetch(physaddr, cachelevel);
+  per_context_dcache_stats_update(threadid, load.dtlb.l1hits++);
+#endif /* USE_TLB */
+  core.caches.initiate_prefetch(physaddr, origaddr, cachelevel, uop.invalidating);
 }
 
 //
 // Data cache has delivered a load: wake up corresponding ROB/LSQ/physreg entries
 //
-void OutOfOrderCoreCacheCallbacks::dcache_wakeup(LoadStoreInfo lsi, W64 physaddr) {
+void OutOfOrderCoreCacheCallbacks::dcache_wakeup(LoadStoreInfo lsi, W64 physaddr, bool retry) {
   int idx = lsi.rob;
   ThreadContext* thread = core.threads[lsi.threadid];
   assert(inrange(idx, 0, ROB_SIZE-1));
   ReorderBufferEntry& rob = thread->ROB[idx];
 
   if(logable(100)) logfile << " dcache_wakeup ", rob, endl;
+  if (rob.current_state_list != &thread->rob_cache_miss_list) {
+    logfile << "[vcpu ", core.coreid, "]", __FILE__,__LINE__,
+    "dcache_wakeup with wrong list: ", endl,
+    "rob.current_state_list = ", rob.current_state_list,
+    " (", rob.current_state_list->name, " ", rob.current_state_list->listid,")", endl,
+    "&thread->rob_cache_miss_list = ", &thread->rob_cache_miss_list,
+    " (", thread->rob_cache_miss_list.name, " ", thread->rob_cache_miss_list.listid,")", endl,
+    " Core: ", core.coreid, " Thread: ", lsi.threadid, " Rob: ", &rob, " Idx: ", idx,
+    " Rob.coreid: ", rob.coreid, "Rob.threadid: ", rob.threadid, endl;
+  }
   assert(rob.current_state_list == &thread->rob_cache_miss_list);
 
-  rob.loadwakeup();
+  rob.loadwakeup(retry);
 }
 
-void ReorderBufferEntry::loadwakeup() {
+void ReorderBufferEntry::loadwakeup(bool retry) {
   if (tlb_walk_level) {
     // Wake up from TLB walk wait and move to next level
+    // NOTE SD: TLB walks will implicitly retry by not finding the data in L1
     if unlikely (config.event_log_enabled) getcore().eventlog.add_load_store(EVENT_TLBWALK_WAKEUP, this);
     lfrqslot = -1;
     changestate(getthread().rob_tlb_miss_list);
   } else {
+    // Reprobe the load, if the cache-line was invalidated during the probe
+    if unlikely (retry) {
+      /* TODO: One could also just remember this condition and call
+               redispatch-depends on this uop when it hits commit/retire stage. */
+      if unlikely (!reprobe_load()) {
+        // Just clean up the LFRQ slot
+        lfrqslot = -1;
+        return;
+      }
+    }
+
     // Actually wake up the load
     if unlikely (config.event_log_enabled) getcore().eventlog.add_load_store(EVENT_LOAD_WAKEUP, this);
 
@@ -1764,6 +1934,27 @@
   }
 }
 
+/**
+ * Called by loadwakeup() when the wakeup carries retry=true, i.e. the original
+ * request has been invalidated while being in-flight and may need to be reprobed.
+ * @return true if the caller may still wake up the load; false for ASF uops (ENABLE_ASF_CACHE_BASED only), which are handed to the ASF pipeline intercept.
+ */
+bool ReorderBufferEntry::reprobe_load() {
+#ifdef ENABLE_ASF_CACHE_BASED
+  if unlikely (uop.is_asf) {
+    // An in-flight ASF uop has been hit by an external probe -> abort the speculative region
+    getthread().asf_pipeline_intercept.reprobe_load(*this);
+    return false;
+  }
+#endif
+  // TODO: What would be the right retry mechanism? redispatch_dependents vs replay vs in-loop reprobing
+  // TODO: We could also wait until this load instruction hits the commit/retire stage and cause redispatch from there
+  //redispatch_dependents(true); // NOTE: This causes issues as it frees the LFRQ slot that is currently used to notify this load
+  //replay(); // NOTE: This breaks for loads that miss in the cache: they have their issue completed and cannot replay!
+
+  return true; // non-ASF loads: no retry is actually performed yet (see TODOs above); the caller proceeds with the normal wakeup
+}
+
 void ReorderBufferEntry::fencewakeup() {
   ThreadContext& thread = getthread();
 
@@ -1829,6 +2020,7 @@
   issueq_tag_t uopids[MAX_OPERANDS];
   issueq_tag_t preready[MAX_OPERANDS];
 
+  // Refresh information of operands this uop is waiting for
   foreach (operand, MAX_OPERANDS) {
     PhysicalRegister& source_physreg = *operands[operand];
     ReorderBufferEntry& source_rob = *source_physreg.rob;
@@ -1843,6 +2035,10 @@
       preready[operand] = 1;
     }
   }
+#ifdef ENABLE_ASF
+  if unlikely (uop.is_asf)
+    thread.asf_pipeline_intercept.annul_replay_redispatch(*this);
+#endif
 
   if unlikely (operands_still_needed) {
     changestate(thread.rob_dispatched_list[cluster]);
@@ -2193,6 +2389,17 @@
       branchpred.annulras(annulrob.uop.predinfo);
     }
 
+#ifdef ENABLE_ASF
+    if (annulrob.uop.is_asf) {
+      if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Annulling ", annulrob,endl;
+      thread.asf_pipeline_intercept.annul_replay_redispatch(annulrob);
+      if unlikely (annulrob.uop.opcode == OP_rel) {
+        loads_in_flight--;
+        annulrob.lsq->reset();
+        LSQ.annul(annulrob.lsq);
+      }
+    }
+#endif
     annulrob.reset();
 
     ROB.annul(annulrob);
@@ -2293,6 +2500,13 @@
   physreg->flags = FLAG_WAIT;
   physreg->changestate(PHYSREG_WAITING);
 
+#ifdef ENABLE_ASF
+  if (uop.is_asf) {
+    if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Redispatching ", uop,endl;
+    thread.asf_pipeline_intercept.annul_replay_redispatch(*this);
+  }
+#endif
+
   // Force ROB to be re-dispatched in program order
   cycles_left = 0;
   forward_cycle = 0;
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/ooopipe.cpp ptlsim-asf/ooopipe.cpp
--- ptlsim-stable/ooopipe.cpp	2009-10-30 19:40:25.133183000 +0100
+++ ptlsim-asf/ooopipe.cpp	2010-03-02 12:00:55.536555000 +0100
@@ -5,6 +5,8 @@
 //
 // Copyright 2003-2008 Matt T. Yourst <yourst@yourst.com>
 // Copyright 2006-2008 Hui Zeng <hzeng@cs.binghamton.edu>
+// Copyright (c) 2007-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -93,13 +95,13 @@
   // Clear per-thread state:
   foreach (i, threadcount) {
     ThreadContext* thread = threads[i];
-    thread->flush_pipeline();
+    assert(thread->flush_pipeline());
   }
   // Clear out everything global:
   setzero(robs_on_fu);
 }
 
-void ThreadContext::flush_pipeline() {
+bool ThreadContext::flush_pipeline() {
   // SD: I wonder if flush_pipeline should really be able to flush halfway
   // through a partially committed x86 instruction. This is dangerous,
   // especially if the instruction has already partially updated
@@ -115,6 +117,7 @@
       logfile <<"  ", rob, endl;
       if (rob.uop.eom) break;
     }
+	//return false;
   }
 
   core.caches.complete(threadid);
@@ -178,6 +181,7 @@
   dispatch_deadlock_countdown = DISPATCH_DEADLOCK_COUNTDOWN_CYCLES;
   last_commit_at_cycle = sim_cycle;
   external_to_core_state();
+  return true;
 }
 
 //
@@ -192,7 +196,9 @@
 
   fetchrip = realrip;
   fetchrip.update(ctx);
+
   stall_frontend = 0;
+  stall_on_eom   = 0;
   waiting_for_icache_fill = 0;
   fetchq.reset();
   current_basic_block_transop_index = 0;
@@ -297,7 +303,7 @@
   per_context_ooocore_stats_update(threadid, dispatch.redispatch.deadlock_flushes++);
   // don't want to reset the counter for no commit in this case
   W64 previous_last_commit_at_cycle = last_commit_at_cycle;
-  flush_pipeline();
+  assert(flush_pipeline());
   last_commit_at_cycle = previous_last_commit_at_cycle; /// so we can exit after no commit after deadlock recovery a few times in a roll
   logfile << "[vcpu ", ctx.vcpuid, "] thread ", threadid, ": reset thread.last_commit_at_cycle to be before redispatch_deadlock_recovery() ", previous_last_commit_at_cycle, endl;
   /*
@@ -408,7 +414,7 @@
     return true;
   }
 
-  while ((fetchcount < FETCH_WIDTH) && (taken_branch_count == 0)) {
+  while ((fetchcount < FETCH_WIDTH) && (taken_branch_count == 0) && !stall_frontend) {
     if unlikely (!fetchq.remaining()) {
       if unlikely (config.event_log_enabled) {
         if (!fetchcount) {
@@ -546,6 +552,9 @@
       stall_frontend = 1;
     }
 
+    // SD: Try w/o the pipeline stalls!
+    if unlikely (stall_on_eom && transop.eom) {stall_frontend = true; stall_on_eom = false;}
+
     per_context_ooocore_stats_update(threadid, fetch.uops++);
 
     Waddr predrip = 0;
@@ -727,6 +736,9 @@
     bool st = isstore(fetchbuf.opcode);
     bool br = isbranch(fetchbuf.opcode);
 
+#ifdef ENABLE_ASF
+    ld |= (fetchbuf.opcode == OP_rel);
+#endif
     if unlikely (ld && (loads_in_flight >= LDQ_SIZE)) {
       if unlikely (config.event_log_enabled) { if likely (!prepcount) core.eventlog.add(EVENT_RENAME_LDQ_FULL)->threadid = threadid; }
       per_context_ooocore_stats_update(threadid, frontend.status.ldq_full++);
@@ -1143,6 +1155,97 @@
   return operands_still_needed;
 }
 
+// SD: This is an attempt at an improved dispatcher: returns the index of the executable cluster with the lowest penalty, or -1 if no applicable issue queue has a free slot.
+int ReorderBufferEntry::select_cluster_penalty() {
+  OutOfOrderCoreEvent* event; // only assigned and dereferenced when config.event_log_enabled is set
+  OutOfOrderCore& core = getcore();
+  ThreadContext& thread = getthread();
+
+  if (MAX_CLUSTERS == 1) { // single cluster: only the free slots of issue queue 0 decide
+    int cluster_issue_queue_avail_count[MAX_CLUSTERS];
+    getcore().sched_get_all_issueq_free_slots(cluster_issue_queue_avail_count);
+    return (cluster_issue_queue_avail_count[0] > 0) ? 0 : -1;
+  }
+
+  W32 executable_on_cluster = executable_on_cluster_mask;
+
+  static const int F = 8;   // fixed point arithmetic; NOTE(review): used as a shift (<< F) for operand penalties but as a multiplier (F *) for FU penalties below - confirm this is intended
+  int cluster_operand_penalty[MAX_CLUSTERS];
+  foreach (i, MAX_CLUSTERS) { cluster_operand_penalty[i] = 0; }
+
+  // SD: Instead of computing bonuses to run on the same cluster as one of the
+  //     operands, we will compute penalties which correspond to distances
+  //     between us and the operand.
+  foreach (i, MAX_OPERANDS) {
+    PhysicalRegister& r = *operands[i];
+    if ((&r) && ((r.state == PHYSREG_WAITING) || (r.state == PHYSREG_BYPASS)) && (r.rob->cluster >= 0))
+      foreach (c, MAX_CLUSTERS)
+        cluster_operand_penalty[c] += intercluster_latency_map[r.rob->cluster][c] << F;
+  }
+  // and then also take the FUs into account. Try to spread the uops evenly
+  // amongst them!
+  ReorderBufferEntry* rob;
+  foreach (c, MAX_CLUSTERS) {
+    foreach_list_mutable(thread.rob_dispatched_list[c], rob, entry, nextentry) {
+      // SD: The idea is to minimise the collision probability of uops when
+      //     assuming random selection if multiple FUs are available in a
+      //     cluster.
+      W32 FU_can     = fuinfo[rob->uop.opcode].fu & clusters[c].fu_mask;
+      W32 FU_can_we  = fuinfo[uop.opcode].fu      & clusters[c].fu_mask;
+      W32 FU_overlap = FU_can & FU_can_we;
+      if unlikely(FU_overlap)
+        cluster_operand_penalty[c] += (F * popcount(FU_overlap)) /
+                                      (popcount(FU_can) * popcount(FU_can_we));
+    }
+  }
+
+  assert(executable_on_cluster);
+
+  // If a given cluster's issue queue is full, try another cluster:
+  int cluster_issue_queue_avail_count[MAX_CLUSTERS];
+  W32 cluster_issue_queue_avail_mask = 0;
+
+  getcore().sched_get_all_issueq_free_slots(cluster_issue_queue_avail_count);
+
+  foreach (i, MAX_CLUSTERS) {
+    cluster_issue_queue_avail_mask |= ((cluster_issue_queue_avail_count[i] > 0) << i);
+  }
+
+  executable_on_cluster &= cluster_issue_queue_avail_mask;
+
+  if unlikely (config.event_log_enabled) {
+    event = getcore().eventlog.add(EVENT_CLUSTER_OK, this);
+    event->select_cluster.allowed_clusters = executable_on_cluster_mask;
+    foreach (i, MAX_CLUSTERS) event->select_cluster.iq_avail[i] = cluster_issue_queue_avail_count[i];
+  }
+
+  if unlikely (!executable_on_cluster) {
+    if unlikely (config.event_log_enabled) event->type = EVENT_CLUSTER_NO_CLUSTER;
+    return -1;
+  }
+
+  int n = 0;
+  // SD: Using the sim_cycle as a source of randomness is utterly stupid,
+  //     because it can put two similar uops on the same cluster, whereas
+  //     spreading them might be much more useful!
+  int ticker = sim_cycle*DISPATCH_WIDTH + core.dispatchcount;
+  int cluster = find_random_set_bit(executable_on_cluster, ticker); // starting candidate only; replaced below by any executable cluster with a strictly lower penalty
+  n = cluster_operand_penalty[cluster];
+  foreach (i, MAX_CLUSTERS) {
+    if ((cluster_operand_penalty[i] < n) && bit(executable_on_cluster, i)) {
+      n = cluster_operand_penalty[i];
+      cluster = i;
+    }
+  }
+
+  per_context_ooocore_stats_update(threadid, dispatch.cluster[cluster]++);
+
+  if unlikely (config.event_log_enabled) event->cluster = cluster;
+
+  return cluster;
+
+}
+
 int ReorderBufferEntry::select_cluster() {
   OutOfOrderCoreEvent* event;
 
@@ -1218,8 +1321,11 @@
 
     // All operands start out as valid, then get put on wait queues if they are not actually ready.
 
+#ifndef PENALTY_DISPATCHER
     rob->cluster = rob->select_cluster();
-
+#else
+    rob->cluster = rob->select_cluster_penalty();
+#endif
     //
     // An available cluster could not be found. This only happens
     // when all applicable cluster issue queues are full. Since
@@ -1284,13 +1390,25 @@
   } else if unlikely (!rob_ready_to_dispatch_list.empty()) {
     dispatch_deadlock_countdown--;
 
-    /* SD: Give outstanding cache and tlb-misses a chance to tickle in first and
-     * commit everything that is ready to do so! */
+    /* SD: Give outstanding cache and tlb-misses a chance to trickle in first */
     if ( !dispatch_deadlock_countdown &&
-         (rob_cache_miss_list.count || rob_tlb_miss_list.count ||
-          ( rob_ready_to_commit_queue.count && ROB.peekhead()->ready_to_commit())) )
+         (rob_cache_miss_list.count || rob_tlb_miss_list.count ))
       dispatch_deadlock_countdown = DISPATCH_DEADLOCK_COUNTDOWN_CYCLES;
 
+    /* and commit everything that is ready to do so! */
+    if ( !dispatch_deadlock_countdown && rob_ready_to_commit_queue.count) {
+      bool head_x86_ready = true;
+      /* Check if the *full* x86 instruction can actually commit */
+      foreach_forward(ROB, i) {
+        if (!ROB[i].ready_to_commit()) {
+          head_x86_ready = false;
+          break;
+        }
+        if (ROB[i].uop.eom) break;
+      }
+      if (head_x86_ready) dispatch_deadlock_countdown = DISPATCH_DEADLOCK_COUNTDOWN_CYCLES;
+    }
+
     if (!dispatch_deadlock_countdown) {
       redispatch_deadlock_recovery();
       dispatch_deadlock_countdown = DISPATCH_DEADLOCK_COUNTDOWN_CYCLES;
@@ -1335,6 +1453,9 @@
     if unlikely (rob->cycles_left <= 0) {
       if unlikely (config.event_log_enabled) core.eventlog.add(EVENT_COMPLETE, rob);
       rob->changestate(rob_completed_list[cluster]);
+      // SD: Make the register ready here, instead of at issue time. This should be more correct!
+      //     Together with the 0 cycle fwd = bypass fix elsewhere, this works as expected!
+      rob->physreg->flags &= ~FLAG_WAIT;
       rob->physreg->complete();
       rob->forward_cycle = 0;
       rob->fu = 0;
@@ -1534,6 +1655,11 @@
   //
   int rc = COMMIT_RESULT_OK;
 
+#ifdef ENABLE_ASF
+  rc = asf_pipeline_intercept.pre_commit(ctx, rc);
+  if unlikely (rc != COMMIT_RESULT_OK ) return rc;
+#endif
+
   foreach_forward(ROB, i) {
     ReorderBufferEntry& rob = ROB[i];
 
@@ -1637,7 +1763,7 @@
   // fence immediately after an locked RMW instruction,
   // as the lock is just added to the flush list at the
   // commit of the load (the R part), which will definitely
-  // happen after the commit of the preceeding fence.
+  // happen after the commit of the preceding fence.
   //
 
   if unlikely ((uop.opcode == OP_mf) && ready_to_commit() && (!load_store_second_phase)) {
@@ -1682,6 +1808,11 @@
 #endif
 
     if unlikely (subrob.physreg->flags & FLAG_INV) {
+      // The load was invalidated by another core while being in-flight; this path is not expected to be taken yet, so trap it.
+      if unlikely (subrob.physreg->data == EXCEPTION_RetryLoad) {
+        assert(!"SD: EXCEPTION_RetryLoad should not be used for now!"); // negated: a bare string literal is non-null (always true), so the un-negated assert could never fire
+      }
+
       //
       // The exception is definitely going to happen, since the
       // excepting instruction is at the head of the ROB. However,
@@ -1728,6 +1859,9 @@
   PhysicalRegister* oldphysreg = thread.commitrrt[uop.rd];
 
   bool ld = isload(uop.opcode);
+#ifdef ENABLE_ASF
+  ld  |= (uop.opcode == OP_rel);
+#endif
   bool st = isstore(uop.opcode);
   bool br = isbranch(uop.opcode);
 
@@ -1870,9 +2004,21 @@
 
   if likely (uop.som) assert(ctx.commitarf[REG_rip] == uop.rip);
 
+#ifdef ENABLE_ASF
+  if (uop.is_asf)
+    if (!thread.asf_pipeline_intercept.commit(ctx, *this))
+      return COMMIT_RESULT_NONE;
+#endif
+
   //
   // The commit of all uops in the x86 macro-op is guaranteed to happen after this point
   //
+  if unlikely (config.commitlog_filename) {
+    OutOfOrderCoreBinaryEvent e(coreid);
+    e.fill_commit(EVENT_COMMIT_OK, this);
+    commitlogfile << e;
+  }
+
   if unlikely (config.event_log_enabled) event = core.eventlog.add_commit(EVENT_COMMIT_OK, this);
 
   if unlikely (config.event_log_enabled) {
@@ -1901,7 +2047,14 @@
       assert(!isbranch(uop.opcode));
       ctx.commitarf[REG_rip] += uop.bytes;
     }
-    if unlikely (config.event_log_enabled) event->commit.target_rip = ctx.commitarf[REG_rip];
+    if unlikely (config.event_log_enabled) {
+      event->commit.target_rip = ctx.commitarf[REG_rip];
+#ifdef PTLSIM_HYPERVISOR
+      event->commit.krn = ctx.kernel_mode;
+#else
+      event->commit.krn = false;
+#endif
+    }
   }
 
   if likely ((!ld) & (!st) & (!uop.nouserflags)) {
@@ -1934,7 +2087,11 @@
     Waddr mfn = (lsq->physaddr << 3) >> 12;
     smc_setdirty(mfn);
 
-    if (lsq->bytemask) assert(core.caches.commitstore(*lsq, thread.threadid) == 0);
+#ifdef ENABLE_ASF
+    W64 dummy_merge;
+    thread.asf_pipeline_intercept.issue_probe_and_merge(lsq->physaddr << 3, true, dummy_merge);
+#endif
+    if (lsq->bytemask) assert(core.caches.commitstore(*lsq, virtpage, uop.internal, thread.threadid) == 0);
   }
 
   if unlikely (pteupdate) {
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/patches/linux-2.6.20-xen-self-checkpointing.diff ptlsim-asf/patches/linux-2.6.20-xen-self-checkpointing.diff
--- ptlsim-stable/patches/linux-2.6.20-xen-self-checkpointing.diff	2009-10-30 19:40:25.136180000 +0100
+++ ptlsim-asf/patches/linux-2.6.20-xen-self-checkpointing.diff	2009-12-22 14:57:34.459980000 +0100
@@ -1,26 +1,6 @@
-diff -urN linux-2.6.20-mtyrel-old/drivers/xen/core/machine_reboot.c linux-2.6.20-mtyrel-new/drivers/xen/core/machine_reboot.c
---- linux-2.6.20-mtyrel-old/drivers/xen/core/machine_reboot.c	2007-03-17 02:53:40.000000000 -0400
-+++ linux-2.6.20-mtyrel-new/drivers/xen/core/machine_reboot.c	2007-05-18 00:08:42.168470000 -0400
-@@ -130,6 +130,8 @@
- 
- #endif
- 
-+unsigned long long resume_started_at_tsc = 0;
-+
- int __xen_suspend(void)
- {
- 	int err, suspend_cancelled;
-@@ -168,6 +170,7 @@
- 	 * merely checkpointed, and 0 if it is resuming in a new domain.
- 	 */
- 	suspend_cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info));
-+	rdtscll(resume_started_at_tsc);
- 
- 	post_suspend(suspend_cancelled);
- 
 diff -urN linux-2.6.20-mtyrel-old/drivers/xen/core/reboot.c linux-2.6.20-mtyrel-new/drivers/xen/core/reboot.c
 --- linux-2.6.20-mtyrel-old/drivers/xen/core/reboot.c	2007-03-17 02:53:41.000000000 -0400
-+++ linux-2.6.20-mtyrel-new/drivers/xen/core/reboot.c	2007-05-18 02:15:11.208715000 -0400
++++ linux-2.6.20-mtyrel-new/drivers/xen/core/reboot.c	2007-05-12 23:14:19.477405000 -0400
 @@ -8,6 +8,9 @@
  #include <asm/hypervisor.h>
  #include <xen/xenbus.h>
@@ -49,89 +29,16 @@
  	return 0;
  }
  
-@@ -97,6 +106,89 @@
+@@ -97,6 +106,16 @@
  	}
  }
  
-+//
-+// PTLsim hypercalls
-+//
-+typedef unsigned char byte;
-+typedef unsigned short W16;
-+typedef unsigned int W32;
-+typedef unsigned long long W64;
-+
-+enum {
-+  PTLCALL_NOP = 0,
-+  PTLCALL_VERSION = 1,
-+  PTLCALL_ENQUEUE = 2,
-+  PTLCALL_FLUSH_QUEUE = 3,
-+};
-+
-+struct PTLsimCommandDescriptor {
-+  W64 command; // pointer to command string
-+  W64 length;       // length of command string
-+};
-+
-+static int ptlcall_not_available = 0;
-+
-+W64 ptlcall(W64 op, W64 arg1, W64 arg2, W64 arg3, W64 arg4) {
-+#if 0
-+  /*
-+   * This is disabled since subsequent calls may be running
-+   * under PTLsim and therefore should succeed.
-+   */
-+  if (ptlcall_not_available)
-+    return (W64)-ENOSYS;
-+#endif
-+
-+  asm volatile("1: .byte 0x0f,0x37\n"
-+               "2:\n"
-+               ".section .fixup,\"ax\"\n" \
-+               "3:\n"
-+               "  movq %[enosys],%%rax\n" \
-+               "  jmp 2b\n" \
-+               ".previous\n" \
-+               ".section __ex_table,\"a\"\n" \
-+               "  .align 8\n" \
-+               "  .quad 1b,3b\n" \
-+               ".previous"
-+               : "+a" (op)
-+               : "c" (arg1), "d" (arg2), "S" (arg3), "D" (arg4), [enosys] "i" (-ENOSYS)
-+               : "memory");
-+
-+  if (op == -ENOSYS)
-+    ptlcall_not_available = 1;
-+  return op;
-+}
-+
-+static inline W64 ptlcall_single_flush(const char* command) {
-+  int flush = 1;
-+  struct PTLsimCommandDescriptor desc;
-+  desc.command = (W64)command;
-+  desc.length = strlen(command);
-+
-+  return ptlcall(PTLCALL_ENQUEUE, (W64)&desc, 1, flush, 0);
-+}
-+
-+extern unsigned long long resume_started_at_tsc;
-+
-+char ptlcall_after_resume[1024];
-+
 +static void checkpoint_or_suspend_triggered(void)
 +{
 +	int err;
-+  unsigned long long resume_ended_at_tsc;
 +	shutting_down = SHUTDOWN_SUSPEND;
 +	err = kthread_create_on_cpu(xen_suspend, &suspend_waitq, "suspend", 0);
 +	wait_event_interruptible(suspend_waitq, (shutting_down < 0));
-+  rdtscll(resume_ended_at_tsc);
-+  printk("Resume took %llu cycles\n", resume_ended_at_tsc - resume_started_at_tsc);
-+  if (strlen(ptlcall_after_resume)) {
-+    printk("Executing PTLsim action after resume: '%s'\n", ptlcall_after_resume);
-+    err = ptlcall_single_flush(ptlcall_after_resume);
-+    if (err <= 0) printk("PTLcall returned error %d\n", err);
-+  }
 +}
 +
 +static int shutdown_handler_disabled = 0;
@@ -139,7 +46,7 @@
  static void shutdown_handler(struct xenbus_watch *watch,
  			     const char **vec, unsigned int len)
  {
-@@ -107,6 +199,11 @@
+@@ -107,6 +126,11 @@
  	if (shutting_down != SHUTDOWN_INVALID)
  		return;
  
@@ -151,7 +58,7 @@
   again:
  	err = xenbus_transaction_start(&xbt);
  	if (err)
-@@ -186,6 +283,143 @@
+@@ -186,6 +210,64 @@
  	.callback = sysrq_handler
  };
  
@@ -213,89 +120,10 @@
 +	return len;
 +}
 +
-+static struct proc_dir_entry *ptlcall_pde = NULL;
-+
-+static int ptlcall_write(struct file *file, const char __user *userbuf,
-+                         unsigned long count, void *data)
-+{
-+	int n = 0;
-+  int rc;
-+  char* buf = NULL;
-+
-+	if (count >= PAGE_SIZE)
-+		return -EFBIG;   /* too long */
-+
-+  buf = (char*)__get_free_page(GFP_KERNEL);
-+  if (!buf)
-+    return -ENOMEM;
-+
-+	if (copy_from_user(buf, userbuf, count)) {
-+    free_page((unsigned long)buf);
-+		return -EFAULT;
-+  }
-+
-+	buf[count] = '\0';
-+
-+	n = strlen(buf);
-+	if ((n > 0) && (buf[n-1] == '\n')) buf[n-1] = '\0';
-+
-+  /* printk("Sending PTLsim command '%s' to hypervisor\n", buf); */
-+
-+  rc = ptlcall_single_flush(buf);
-+
-+  if ((int)rc == -ENOSYS) {
-+    printk("ptlcall('%s'): Warning: PTLsim is not running\n", buf);
-+  }
-+
-+  free_page((unsigned long)buf);
-+
-+	return count;
-+}
-+
-+static int ptlcall_read(char *page, char **start, off_t off,
-+                                   int count, int *eof, void *data)
-+{
-+	*eof = 1;
-+	return 0;
-+}
-+
-+static struct proc_dir_entry *ptlcall_after_resume_pde = NULL;
-+
-+static int ptlcall_after_resume_write(struct file *file, const char __user *buffer,
-+                                    unsigned long count, void *data)
-+{
-+	int n = 0;
-+
-+	if (count >= sizeof(ptlcall_after_resume))
-+		return -EFBIG;   /* too long */
-+
-+	if (copy_from_user(ptlcall_after_resume, buffer, count))
-+		return -EFAULT;
-+	ptlcall_after_resume[count] = '\0';
-+
-+	n = strlen(ptlcall_after_resume);
-+	if ((n > 0) && (ptlcall_after_resume[n-1] == '\n')) ptlcall_after_resume[n-1] = '\0';
-+
-+  /* printk("Set ptlcall_after_resume = '%s'\n", ptlcall_after_resume); */
-+
-+	return count;
-+}
-+
-+static int ptlcall_after_resume_read(char *page, char **start, off_t off,
-+                                   int count, int *eof, void *data)
-+{
-+	int len;
-+
-+  len = sprintf(page, "%s", ptlcall_after_resume);
-+
-+	*eof = 1;
-+	return len;
-+}
-+
  static int setup_shutdown_watcher(struct notifier_block *notifier,
  				  unsigned long event,
  				  void *data)
-@@ -204,6 +438,30 @@
+@@ -204,6 +286,14 @@
  	else
  		xenbus_write(XBT_NIL, "control", "feature-sysrq", "1");
  
@@ -307,22 +135,6 @@
 +		printk(KERN_ERR "Failed to create /proc/xen/checkpoint\n");
 +	}
 +
-+	/* Use "chmod a+rw /proc/xen/checkpoint" to enable ordinary users to do this */
-+	if ((ptlcall_after_resume_pde = create_xen_proc_entry("ptlcall-after-resume", 0644))) {
-+		ptlcall_after_resume_pde->read_proc  = ptlcall_after_resume_read;
-+		ptlcall_after_resume_pde->write_proc = ptlcall_after_resume_write;
-+	} else {
-+		printk(KERN_ERR "Failed to create /proc/xen/ptlcall-after-resume\n");
-+	}
-+
-+	/* Use "chmod a+rw /proc/xen/checkpoint" to enable ordinary users to do this */
-+	if ((ptlcall_pde = create_xen_proc_entry("ptlcall", 0644))) {
-+		ptlcall_pde->read_proc  = ptlcall_read;
-+		ptlcall_pde->write_proc = ptlcall_write;
-+	} else {
-+		printk(KERN_ERR "Failed to create /proc/xen/ptlcall\n");
-+	}
-+
  	return NOTIFY_DONE;
  }
  
Only in ptlsim-stable/patches: linux-2.6.22-xen-self-checkpointing.diff
Only in ptlsim-stable/patches: ptlsim-xen-hypervisor-cset-14201-2007-03-02.diff
Only in ptlsim-stable/patches: ptlsim-xen-hypervisor-cset-15417-2007-06-21.diff
Only in ptlsim-stable/patches: ptlsim-xen-hypervisor.cset-12489.diff
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/patches/ptlsim-xen-tools.diff ptlsim-asf/patches/ptlsim-xen-tools.diff
--- ptlsim-stable/patches/ptlsim-xen-tools.diff	2009-10-30 19:40:25.147182000 +0100
+++ ptlsim-asf/patches/ptlsim-xen-tools.diff	2009-12-22 14:57:34.498940000 +0100
@@ -1,6 +1,6 @@
 diff -r f0ba459065d3 tools/python/xen/xend/XendCheckpoint.py
 --- a/tools/python/xen/xend/XendCheckpoint.py	Fri Nov 17 02:47:28 2006
-+++ b/tools/python/xen/xend/XendCheckpoint.py	Sun Jan  7 00:59:46 2007
++++ b/tools/python/xen/xend/XendCheckpoint.py	Sun Jan  7 00:59:54 2007
 @@ -108,7 +108,7 @@
          raise Exception, exn
  
@@ -22,7 +22,7 @@
          
 diff -r f0ba459065d3 tools/python/xen/xend/XendConfig.py
 --- a/tools/python/xen/xend/XendConfig.py	Fri Nov 17 02:47:28 2006
-+++ b/tools/python/xen/xend/XendConfig.py	Sun Jan  7 00:59:46 2007
++++ b/tools/python/xen/xend/XendConfig.py	Sun Jan  7 00:59:54 2007
 @@ -154,6 +154,7 @@
      ('bootloader_args', str),
      ('features', str),
@@ -41,7 +41,7 @@
      ('cpu_cap',      lambda info: 0),
 diff -r f0ba459065d3 tools/python/xen/xend/XendDomain.py
 --- a/tools/python/xen/xend/XendDomain.py	Fri Nov 17 02:47:28 2006
-+++ b/tools/python/xen/xend/XendDomain.py	Sun Jan  7 00:59:46 2007
++++ b/tools/python/xen/xend/XendDomain.py	Sun Jan  7 00:59:54 2007
 @@ -915,7 +915,7 @@
          # !!!
          raise XendError("Unsupported")
@@ -87,7 +87,7 @@
              if not dominfo:
 diff -r f0ba459065d3 tools/python/xen/xend/XendDomainInfo.py
 --- a/tools/python/xen/xend/XendDomainInfo.py	Fri Nov 17 02:47:28 2006
-+++ b/tools/python/xen/xend/XendDomainInfo.py	Sun Jan  7 00:59:46 2007
++++ b/tools/python/xen/xend/XendDomainInfo.py	Sun Jan  7 00:59:54 2007
 @@ -89,6 +89,7 @@
      ('bootloader_args', str),
      ('features',        str),
@@ -108,7 +108,7 @@
                  # persistent
 diff -r f0ba459065d3 tools/python/xen/xend/server/XMLRPCServer.py
 --- a/tools/python/xen/xend/server/XMLRPCServer.py	Fri Nov 17 02:47:28 2006
-+++ b/tools/python/xen/xend/server/XMLRPCServer.py	Sun Jan  7 00:59:46 2007
++++ b/tools/python/xen/xend/server/XMLRPCServer.py	Sun Jan  7 00:59:54 2007
 @@ -64,8 +64,8 @@
      info = XendDomain.instance().domain_create(config)
      return fixup_sxpr(info.sxpr())
@@ -122,7 +122,7 @@
  def get_log():
 diff -r f0ba459065d3 tools/python/xen/xm/create.py
 --- a/tools/python/xen/xm/create.py	Fri Nov 17 02:47:28 2006
-+++ b/tools/python/xen/xm/create.py	Sun Jan  7 00:59:46 2007
++++ b/tools/python/xen/xm/create.py	Sun Jan  7 00:59:54 2007
 @@ -721,6 +721,8 @@
          config.append(['backend', ['tpmif']])
      if vals.localtime:
@@ -134,7 +134,7 @@
      if vals.bootloader:
 diff -r f0ba459065d3 tools/python/xen/xm/main.py
 --- a/tools/python/xen/xm/main.py	Fri Nov 17 02:47:28 2006
-+++ b/tools/python/xen/xm/main.py	Sun Jan  7 00:59:46 2007
++++ b/tools/python/xen/xm/main.py	Sun Jan  7 00:59:54 2007
 @@ -84,7 +84,7 @@
                       'Migrate a domain to another machine.'),
      'pause'       : ('<Domain>', 'Pause execution of a domain.'),
Only in ptlsim-stable/patches: xen-intel-cpufreq-msrs.diff
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/perfctrs.cpp ptlsim-asf/perfctrs.cpp
--- ptlsim-stable/perfctrs.cpp	2009-10-30 19:40:25.153182000 +0100
+++ ptlsim-asf/perfctrs.cpp	2010-03-02 12:00:55.545546000 +0100
@@ -3,6 +3,8 @@
 // Performance counters
 //
 // Copyright 1999-2008 Matt T. Yourst <yourst@yourst.com>
+// Copyright (c) 2007-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/ptlhwdef.cpp ptlsim-asf/ptlhwdef.cpp
--- ptlsim-stable/ptlhwdef.cpp	2009-10-30 19:40:25.166180000 +0100
+++ ptlsim-asf/ptlhwdef.cpp	2010-03-02 12:00:55.572519000 +0100
@@ -3,6 +3,8 @@
 // Hardware Definitions
 //
 // Copyright 1999-2008 Matt T. Yourst <yourst@yourst.com>
+// Copyright (c) 2007-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <ptlsim.h>
@@ -15,13 +17,13 @@
 const char* opclass_names[OPCLASS_COUNT] = {
   "logic", "addsub", "addsubc", "addshift", "sel", "cmp", "br.cc", "jmp", "bru", 
   "assist", "mf", "ld", "st", "ld.pre", "shiftsimple", "shift", "mul", "bitscan", "flags",  "chk", 
-  "fpu", "fp-div-sqrt", "fp-cmp", "fp-perm", "fp-cvt-i2f", "fp-cvt-f2i", "fp-cvt-f2f", "vec",
+  "fpu", "fp-div-sqrt", "fp-cmp", "fp-perm", "fp-cvt-i2f", "fp-cvt-f2i", "fp-cvt-f2f", "vec", "asf",
 };
 
 //
 // Micro-operation (uop) definitions
 //
-
+// SD: What is the third field good for? It is never used for anything?!
 const OpcodeInfo opinfo[OP_MAX_OPCODE] = {
   // name, opclass, latency, fu
   {"nop",            OPCLASS_LOGIC,         opNOSIZE   },
@@ -185,6 +187,12 @@
   {"vsad",           OPCLASS_VEC_ALU,       opAB }, // sum of absolute differences
   {"vpack.us",       OPCLASS_VEC_ALU,       opAB }, // pack larger to smaller (unsigned saturation)
   {"vpack.ss",       OPCLASS_VEC_ALU,       opAB }, // pack larger to smaller (signed saturation)
+#ifdef ENABLE_ASF
+  {"asf.spec",       OPCLASS_ASF,           opB },
+  {"asf.com",        OPCLASS_ASF,           opB },
+  {"asf.val",        OPCLASS_ASF,           opB },
+  {"asf.rel",        OPCLASS_ASF,           opB },
+#endif
 };
 
 const char* exception_names[EXCEPTION_COUNT] = {
@@ -198,12 +206,15 @@
   "PageExec",
   "StStAlias",
   "LdStAlias",
+  "RetryLd",
   "CheckFailed",
   "SkipBlock",
   "LFRQFull",
   "Float",
   "FloatNotAvail",
   "DivideOverflow",
+  "ASFAbort",
+  "ASFTesting",
 };
 
 const char* x86_exception_names[256] = {
@@ -424,6 +435,7 @@
 
   if ((ld|st) && (op.cachelevel > 0)) sbname << ".L", (char)('1' + op.cachelevel);
   if ((ld|st) && (op.locked)) sbname << ((ld) ? ".acq" : ".rel");
+  if (op.invalidating) sbname << ".inv";
   if (op.internal) sbname << ".p";
   if (op.eom) sbname << ".", (op.any_flags_in_insn ? "+" : "-");
 
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/ptlhwdef.h ptlsim-asf/ptlhwdef.h
--- ptlsim-stable/ptlhwdef.h	2009-10-30 19:40:25.168180000 +0100
+++ ptlsim-asf/ptlhwdef.h	2010-03-02 12:00:55.587504000 +0100
@@ -4,6 +4,8 @@
 // Hardware Definitions
 //
 // Copyright 1999-2008 Matt T. Yourst <yourst@yourst.com>
+// Copyright (c) 2007-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #ifndef _PTLHWDEF_H
@@ -169,7 +171,9 @@
 extern W64 sim_cycle;
 #include <logic.h>
 #include <config.h>
-
+#ifdef ENABLE_ASF
+class ASFContext;
+#endif
 //
 // Exceptions:
 // These are PTL internal exceptions, NOT x86 exceptions:
@@ -184,12 +188,15 @@
   EXCEPTION_PageFaultOnExec,
   EXCEPTION_StoreStoreAliasing,
   EXCEPTION_LoadStoreAliasing,
+  EXCEPTION_RetryLoad,
   EXCEPTION_CheckFailed,
   EXCEPTION_SkipBlock,
   EXCEPTION_LFRQFull,
   EXCEPTION_FloatingPoint,
   EXCEPTION_FloatingPointNotAvailable,
   EXCEPTION_DivideOverflow,
+  EXCEPTION_ASF_Abort,
+  EXCEPTION_ASF_Testing, /*S.D.*/
   EXCEPTION_COUNT
 };
 
@@ -198,8 +205,10 @@
 static const int MAX_BB_UOPS = 63;
 static const int MAX_BB_PER_PAGE = 4096;
 
-static const int MAX_TRANSOPS_PER_USER_INSN = 16;
-
+/* BLAME S.D.!
+ *static const int MAX_TRANSOPS_PER_USER_INSN = 16;
+ */
+static const int MAX_TRANSOPS_PER_USER_INSN = 32;
 extern const char* exception_names[EXCEPTION_COUNT];
 
 static inline const char* exception_name(W64 exception) {
@@ -835,6 +844,10 @@
   byte running;
 #endif
 
+#ifdef ENABLE_ASF
+  ASFContext* asf_context;
+#endif
+
   inline void reset() {
     setzero(commitarf);
 #ifdef PTLSIM_HYPERVISOR
@@ -1002,7 +1015,9 @@
 
 #define OPCLASS_VEC_ALU                 (1 << 27)
 
-#define OPCLASS_COUNT                   28
+#define OPCLASS_ASF                     (1 << 28)
+
+#define OPCLASS_COUNT                   29
 
 #define OPCLASS_USECOND                 (OPCLASS_COND_BRANCH|OPCLASS_SELECT|OPCLASS_CHECK)
 
@@ -1161,6 +1176,13 @@
   OP_vsad,
   OP_vpack_us,
   OP_vpack_ss,
+#ifdef ENABLE_ASF
+  // ASF
+  OP_spec,
+  OP_com,
+  OP_val,
+  OP_rel,
+#endif
   OP_MAX_OPCODE,
 };
 
@@ -1205,6 +1227,7 @@
 inline bool iscondbranch(int opcode) { return isclass(opcode, OPCLASS_COND_BRANCH|OPCLASS_INDIR_BRANCH); }
 inline bool isbranch(int opcode) { return isclass(opcode, OPCLASS_BRANCH); }
 inline bool isbarrier(int opcode) { return isclass(opcode, OPCLASS_BARRIER); }
+inline bool isasf(int opcode) { return isclass(opcode, OPCLASS_ASF); }
 inline const char* nameof(int opcode) { return (opcode < OP_MAX_OPCODE) ? opinfo[opcode].name : "INVALID"; }
 
 union MaskControlInfo {
@@ -1324,7 +1347,9 @@
   // Index in basic block
   byte bbindex;
   // Misc info (terminal writer of targets in this insn, etc)
-  byte final_insn_in_bb:1, final_arch_in_insn:1, final_flags_in_insn:1, any_flags_in_insn:1, pad:3, marked:1;
+  // SD-TODO-MERGE: What is the marked flag used for?
+  // Nothing! => Request on ML
+  byte final_insn_in_bb:1, final_arch_in_insn:1, final_flags_in_insn:1, any_flags_in_insn:1, is_asf:1, invalidating: 1, pad:1, marked:1;
   // Immediates
   W64s rbimm;
   W64s rcimm;
Only in ptlsim-stable: ptlsim
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/ptlsim.cpp ptlsim-asf/ptlsim.cpp
--- ptlsim-stable/ptlsim.cpp	2009-10-30 19:40:25.177180000 +0100
+++ ptlsim-asf/ptlsim.cpp	2010-03-02 12:00:55.614477000 +0100
@@ -23,6 +23,7 @@
 PTLsimStats stats;
 
 ostream logfile;
+ostream commitlogfile;
 bool logenable = 0;
 W64 sim_cycle = 0;
 W64 unhalted_cycle_count = 0;
@@ -47,6 +48,7 @@
   quiet = 0;
   core_name = "ooo";
   log_filename = "ptlsim.log";
+  //commitlog_filename = "";
   loglevel = 0;
   start_log_at_iteration = 0;
   start_log_at_rip = INVALIDRIP;
@@ -147,6 +149,7 @@
   add(quiet,                        "quiet",                "Do not print PTLsim system information banner");
   add(log_filename,                 "logfile",              "Log filename (use /dev/fd/1 for stdout, /dev/fd/2 for stderr)");
   add(loglevel,                     "loglevel",             "Log level (0 to 99)");
+  add(commitlog_filename,           "commitlog",            "Commit-Log filename (use /dev/fd/1 for stdout, /dev/fd/2 for stderr)");
   add(start_log_at_iteration,       "startlog",             "Start logging after iteration <startlog>");
   add(start_log_at_rip,             "startlogrip",          "Start logging after first translation of basic block starting at rip");
   add(log_on_console,               "consolelog",           "Replicate log file messages to console");
@@ -302,16 +305,17 @@
 
 stringbuf current_stats_filename;
 stringbuf current_log_filename;
+stringbuf current_commitlog_filename;
 stringbuf current_bbcache_dump_filename;
 
-void backup_and_reopen_logfile() {
-  if (config.log_filename) {
+void backup_and_reopen_logfile(stringbuf& name, ostream& logfile) {
+  if (name) {
     if (logfile) logfile.close();
     stringbuf oldname;
-    oldname << config.log_filename, ".backup";
+    oldname << name, ".backup";
     sys_unlink(oldname);
-    sys_rename(config.log_filename, oldname);
-    logfile.open(config.log_filename);
+    sys_rename(name, oldname);
+    logfile.open(name);
   }
 }
 
@@ -361,9 +365,14 @@
 
   if (config.log_filename.set() && (config.log_filename != current_log_filename)) {
     // Can also use "-logfile /dev/fd/1" to send to stdout (or /dev/fd/2 for stderr):
-    backup_and_reopen_logfile();
+    backup_and_reopen_logfile(config.log_filename, logfile);
     current_log_filename = config.log_filename;
   }
+  if (config.commitlog_filename.set() && (config.commitlog_filename != current_commitlog_filename)) {
+    // Can also use "-commitlog /dev/fd/1" to send to stdout (or /dev/fd/2 for stderr):
+    backup_and_reopen_logfile(config.commitlog_filename, commitlogfile);
+    current_commitlog_filename = config.commitlog_filename;
+  }
 
   logfile.setchain((config.log_on_console) ? &cout : null);
 
@@ -587,6 +596,7 @@
 
   logfile << sb, flush;
   cerr << sb, flush;
+  if(!config.commitlog_filename.empty()) commitlogfile << flush;
 
   if (config.dumpcode_filename.set()) {
     byte insnbuf[256];
Only in ptlsim-stable: ptlsim.dst
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/ptlsim.h ptlsim-asf/ptlsim.h
--- ptlsim-stable/ptlsim.h	2009-10-30 19:40:25.178182000 +0100
+++ ptlsim-asf/ptlsim.h	2010-03-02 12:00:55.623468000 +0100
@@ -20,6 +20,10 @@
 #include <config.h>
 #include <datastore.h>
 
+//#define CORE_GENERIC
+//#define CORE_AMD_K8
+#define CORE_AMD_BARCELONA_ASF
+
 extern W64 sim_cycle;
 extern W64 unhalted_cycle_count;
 extern W64 total_uops_committed;
@@ -115,6 +119,7 @@
 void print_banner(ostream& os, const PTLsimStats& stats, int argc = 0, char** argv = null);
 
 extern ostream logfile;
+extern ostream commitlogfile;
 extern W64 user_insn_commits;
 extern W64 iterations;
 extern W64 total_uops_executed;
@@ -143,6 +148,7 @@
   // Logging
   bool quiet;
   stringbuf log_filename;
+  stringbuf commitlog_filename;
   W64 loglevel;
   W64 start_log_at_iteration;
   W64 start_log_at_rip;
Only in ptlsim-stable: ptlsim.log
Only in ptlsim-stable: ptlsim.log.backup
Only in ptlsim-stable: ptlstats
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/ptlxen-memory.cpp ptlsim-asf/ptlxen-memory.cpp
--- ptlsim-stable/ptlxen-memory.cpp	2009-10-30 19:40:25.191180000 +0100
+++ ptlsim-asf/ptlxen-memory.cpp	2010-03-02 12:00:55.675418000 +0100
@@ -1313,6 +1313,10 @@
   }
 }
 
+W64 storephys(Waddr physaddr, W64 data) {
+  return storemask(physaddr, data, 0xff);
+}
+
 void Context::print_tlb(ostream& os) {
   os << "VCPU ", vcpuid, " mini-TLB:", endl;
   foreach (i, lengthof(cached_pte_virt)) {
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/ptlxen.cpp ptlsim-asf/ptlxen.cpp
--- ptlsim-stable/ptlxen.cpp	2009-10-30 19:40:25.195180000 +0100
+++ ptlsim-asf/ptlxen.cpp	2010-03-02 12:00:55.694399000 +0100
@@ -3,6 +3,8 @@
 // Toplevel control and kernel interface to Xen inside the user domain
 //
 // Copyright 1999-2008 Matt T. Yourst <yourst@yourst.com>
+// Copyright (c) 2007-2010 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -1506,7 +1508,7 @@
   }
 
   default:
-    if (debug) logfile << "Cannot handle hypercall ", hypercallid, "!", endl, flush;
+    /*if (debug)*/ logfile << "Cannot handle hypercall ", hypercallid, "!", endl, flush;
     assert(false);
   }
 
@@ -1893,6 +1895,13 @@
   // This only works when the guest OS is Linux and the program is 64 bit.
   //
   if (ctx.use64) {
+    W64 arg1 = ctx.commitarf[REG_rdi];
+    W64 arg2 = ctx.commitarf[REG_rsi];
+    W64 arg3 = ctx.commitarf[REG_rdx];
+    W64 arg4 = ctx.commitarf[REG_r10];
+    W64 arg5 = ctx.commitarf[REG_r8];
+    W64 arg6 = ctx.commitarf[REG_r9];
+
     switch (ctx.commitarf[REG_rax]) {
     case __NR_execve: {
       char filename[256];
@@ -1902,6 +1911,48 @@
       logfile << "syscall: execve('", filename, "', ...)", endl;
       break;
     }
+    case __NR_futex: { // Log a decoded futex() call; purely diagnostic, enabled from log level 1
+      if likely (!logable(1)) break;
+      W64 stack[64];
+      int n = ctx.copy_from_user(stack, ctx.commitarf[REG_rsp], sizeof(stack));
+      const char *futex_names[]={
+        "FUTEX_WAIT",
+        "FUTEX_WAKE",
+        "FUTEX_FD",
+        "FUTEX_REQUEUE",
+        "FUTEX_CMP_REQUEUE",
+        "FUTEX_WAKE_OP",
+        "FUTEX_LOCK_PI",
+        "FUTEX_UNLOCK_PI",
+        "FUTEX_TRYLOCK_PI" 
+      };
+      // Zero-initialise: the user copies below may fill nothing (e.g. a NULL timeout pointer)
+      W32 futex = 0, val, op;
+      W64 time[2] = {0, 0};
+      // arg1 = futex word address, arg2 = operation, arg3 = value, arg4 = timeout pointer
+      ctx.copy_from_user(&futex, arg1, sizeof(futex));
+      op  = (W32)arg2;
+      val = (W32)arg3;
+      ctx.copy_from_user(&time, arg4, sizeof(time));
+      // Commands without a futex_names entry are logged as unknown instead of indexing past the table
+      W64 physaddr = mapped_virt_to_phys(pte_to_mapped_virt(arg1, ctx.virt_to_pte(arg1)));
+      logfile << "syscall: ", (((op & 127) < (sizeof(futex_names) / sizeof(futex_names[0]))) ? futex_names[op & 127] : "FUTEX_<unknown>"), (op & 128) ? " (private)":"",
+                 " futex @ ",(void*)arg1,"(",(void*)physaddr ,")= ", (void*)futex, " val: ", (void*)val,
+                 " time: ", time[0], "s", time[1], "ns"
+                 " stack:", endl;
+      if (logable(5))
+        for (int i = 0; i < n / sizeof(stack[0]); i++)
+          logfile << "  ",hexstring (stack[i],64),endl;
+
+      break;
+    }
+    case __NR_write: { // Log the payload of a write() call; purely diagnostic, enabled from log level 5
+      if likely (!logable(5)) break;
+      char message[512] = {0}; // zero-filled, so the text stays NUL-terminated however many bytes get copied
+      // Copy at most the byte count passed in arg3, capped so the final NUL is never overwritten
+      ctx.copy_from_user(message, arg2, (arg3 < sizeof(message)-1) ? arg3 : (sizeof(message)-1));
+      logfile << "sys_write to fd ", arg1, " message: ", message, endl;
+      break;
+    }
     }
   }
 
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/ptlxen.h ptlsim-asf/ptlxen.h
--- ptlsim-stable/ptlxen.h	2009-10-30 19:40:25.197180000 +0100
+++ ptlsim-asf/ptlxen.h	2010-03-02 12:00:55.705386000 +0100
@@ -712,6 +712,7 @@
 //
 W64 loadphys(Waddr physaddr);
 W64 storemask(Waddr physaddr, W64 data, byte bytemask);
+W64 storephys(Waddr physaddr, W64 data);
 
 int copy_from_user_phys_prechecked(void* target, Waddr source, int bytes, Level1PTE ptelo, Level1PTE ptehi, Waddr& faultaddr);
 
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/stats.h ptlsim-asf/stats.h
--- ptlsim-stable/stats.h	2009-10-30 19:40:25.508870000 +0100
+++ ptlsim-asf/stats.h	2010-03-02 12:00:55.788303000 +0100
@@ -14,12 +14,13 @@
 #include <datastore.h>
 #include <ptlsim.h>
 
-#define STATS_ONLY
+//#define STATS_ONLY
 #include <decode.h>
 #include <ooocore.h>
+
 #include <dcache.h>
 #include <branchpred.h>
-#undef STATS_ONLY
+//#undef STATS_ONLY
 
 
 #define increment_clipped_histogram(h, slot, incr) h[clipto(W64(slot), W64(0), W64(lengthof(h)-1))] += incr;
@@ -40,7 +41,7 @@
 // need to replicate the vcpu0,vcpu1,... structures in several
 // places below.
 //
-static const int MAX_SIMULATED_VCPUS = 4;
+static const int MAX_SIMULATED_VCPUS = 8;
 
 struct EventsInMode { // rootnode: summable
   W64 user64;
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/superstl.cpp ptlsim-asf/superstl.cpp
--- ptlsim-stable/superstl.cpp	2009-10-30 19:40:25.511867000 +0100
+++ ptlsim-asf/superstl.cpp	2010-03-02 12:00:55.829264000 +0100
@@ -714,7 +714,7 @@
       char c;
       bool ok = getc(c);
 
-      if ((!ok) | (c == '\n') | (c == '\r') | (c == '\0')) {
+      if ((!ok) | (c == '\n') | (c == '\r')) {
         v[i] = 0;
         return i;
       }
@@ -733,7 +733,7 @@
       char c;
       bool ok = getc(c);
 
-      if ((!ok) | (c == '\n') | (c == '\r') | (c == '\0')) return sb.size();
+      if ((!ok) | (c == '\n') | (c == '\r')) return sb.size();
 
       sb << c;
     }
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/superstl.h ptlsim-asf/superstl.h
--- ptlsim-stable/superstl.h	2009-10-30 19:40:25.514864000 +0100
+++ ptlsim-asf/superstl.h	2010-03-02 12:00:55.853238000 +0100
@@ -3769,6 +3769,12 @@
   template <typename T> bool div_rem(T& quotient, T& remainder, T dividend_hi, T dividend_lo, T divisor);
   template <typename T> bool div_rem_s(T& quotient, T& remainder, T dividend_hi, T dividend_lo, T divisor);
 
+  //
+  // Safe divide and remainder functions that return true iff operation did not generate an exception:
+  //
+  template <typename T> bool div_rem(T& quotient, T& remainder, T dividend_hi, T dividend_lo, T divisor);
+  template <typename T> bool div_rem_s(T& quotient, T& remainder, T dividend_hi, T dividend_lo, T divisor);
+
   template <typename T>
   struct ScopedLock {
     T& lock;
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/syscalls.cpp ptlsim-asf/syscalls.cpp
--- ptlsim-stable/syscalls.cpp	2009-10-30 19:40:25.516861000 +0100
+++ ptlsim-asf/syscalls.cpp	2010-03-02 12:00:55.861230000 +0100
@@ -63,7 +63,6 @@
 declare_syscall3(__NR_execve, int, sys_execve, const char*, filename, const char**, argv, const char**, envp);
 
 declare_syscall0(__NR_getpid, pid_t, sys_getpid);
-declare_syscall0(__NR_getppid, pid_t, sys_getppid);
 declare_syscall0(__NR_gettid, pid_t, sys_gettid);
 declare_syscall1(__NR_uname, int, sys_uname, struct utsname*, buf);
 declare_syscall3(__NR_readlink, int, sys_readlink, const char*, path, char*, buf, size_t, bufsiz);
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/syscalls.h ptlsim-asf/syscalls.h
--- ptlsim-stable/syscalls.h	2009-10-30 19:40:25.518858000 +0100
+++ ptlsim-asf/syscalls.h	2010-03-02 12:00:55.871220000 +0100
@@ -35,7 +35,6 @@
   int sys_execve(const char* filename, const char** argv, const char** envp);
   
   pid_t sys_gettid();
-  pid_t sys_getppid();
   pid_t sys_getpid();
   void sys_exit(int code);
   void* sys_brk(void* newbrk);
Only in ptlsim-stable: test_cmpxchg.log
Only in ptlsim-stable: test_cmpxchg.log.backup
Only in ptlsim-stable: test_neg.log
Only in ptlsim-stable: test_neg.log.backup
Only in ptlsim-stable: test_xlat.log
Only in ptlsim-stable: test_xlat.log.backup
Only in ptlsim-stable: tests
Only in ptlsim-stable: tmp.patch
Only in ptlsim-asf/: trace_event.h
diff -rubwB -x .hg -x .git -x '*.o' -x Documentation ptlsim-stable/uopimpl.cpp ptlsim-asf/uopimpl.cpp
--- ptlsim-stable/uopimpl.cpp	2009-10-30 19:40:25.521855000 +0100
+++ ptlsim-asf/uopimpl.cpp	2010-03-02 12:00:55.895196000 +0100
@@ -1582,6 +1582,13 @@
   case OP_ld_pre:
   case OP_st:
   case OP_mf:
+#ifdef ENABLE_ASF
+  /* ASF instructions too ;) */
+  case OP_com:
+  case OP_spec:
+  case OP_rel:
+  case OP_val:
+#endif
     func = uop_impl_nop; break;
 
   case OP_bt:
Only in ptlsim-asf/: utils

