diff -r 10448c053ad6 CHANGELOG
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/CHANGELOG	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,5 @@
+1.1
+Copyrights changed to AMD, Stephan Diestelhorst added as contributor.
+
+1.0
+Initial release
diff -r 10448c053ad6 LICENSE
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/LICENSE	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,339 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	    How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) <year>  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License along
+    with this program; if not, write to the Free Software Foundation, Inc.,
+    51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) year name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff -r 10448c053ad6 Makefile
--- a/Makefile	Thu May 31 15:36:20 2007 +0200
+++ b/Makefile	Wed Nov 05 14:15:51 2008 +0100
@@ -3,10 +3,20 @@
 # PTLsim: Cycle Accurate x86-64 Simulator
 # Makefile
 #
-# Copyright 2000-2006 Matt T. Yourst <yourst@yourst.com>
+# Copyright 2000-2008 Matt T. Yourst <yourst@yourst.com>
 #
 
-ifeq ($(findstring x86_64,$(MACHTYPE)),x86_64)
+#
+# If you are running on a 64-bit distro but want to build
+# a 32-bit PTLsim binary, and your distro doesn't provide
+# the "linux32" or "32bit" uname-changing commands, you
+# will need to manually override the checks below:
+#
+ifndef MACHTYPE
+	MACHTYPE = "$(shell uname -a)"
+endif
+
+ifneq (,$(findstring x86_64,"$(MACHTYPE)"))
 __x86_64__=1
 endif
 
@@ -20,14 +30,21 @@
 # but can still run 32-bit code and guest operating
 # systems. See the manual and FAQ for details.
 #
-# PTLSIM_HYPERVISOR=1
+#PTLSIM_HYPERVISOR=1
 
 CC = g++
 
 GCCVER_SPECIFIC =
 
-SVNREV=`svn info | grep "Last Changed Rev" | cut -d " " -f4`
-SVNDATE=`svn info | grep "Last Changed Date" | cut -d " " -f4`
+SVNREV=$(shell svn info | grep "Last Changed Rev" | cut -d " " -f4)
+SVNDATE=$(shell svn info | grep "Last Changed Date" | cut -d " " -f4)
+
+ifeq (,$(SVNREV))
+# Subversion is either not installed or the current directory isn't a PTLsim repository:
+	SVNREV=0
+	SVNDATE=unknown
+endif
+
 INCFLAGS = -I. -DBUILDHOST="`hostname -f`" -DSVNREV="$(SVNREV)" -DSVNDATE="$(SVNDATE)"
 
 ifdef PTLSIM_HYPERVISOR
@@ -38,8 +55,6 @@
 CFLAGS = -O99 -g -fomit-frame-pointer -pipe -march=k8 -falign-functions=16 -funroll-loops -funit-at-a-time -minline-all-stringops
 #CFLAGS = -O2 -g3 -march=k8 -falign-functions=16 -minline-all-stringops
 # -O1 doesn't work
-#CFLAGS = -O2 -g -fomit-frame-pointer -march=k8 -falign-functions=16 -minline-all-stringops
-#CFLAGS = -O1 -g3 -march=k8 -falign-functions=16
 CFLAGS32BIT = $(CFLAGS) -m32
 else
 # 32-bit PTLsim32 only, on a Pentium 4:
@@ -53,39 +68,39 @@
 CFLAGS += -fpic -mno-red-zone
 endif
 
-CFLAGS += -fno-trapping-math -fno-exceptions -fno-rtti -funroll-loops -mpreferred-stack-boundary=4 -fno-strict-aliasing -Wreturn-type $(GCCVER_SPECIFIC)
+CFLAGS += -fno-trapping-math -fno-stack-protector -fno-exceptions -fno-rtti -funroll-loops -mpreferred-stack-boundary=4 -fno-strict-aliasing -Wreturn-type $(GCCVER_SPECIFIC)
 
 
-BASEOBJS = superstl.o config.o mathlib.o syscalls.o
+BASEOBJS = superstl.o config.o mathlib.o syscalls.o random_inject.o
 STDOBJS = glibc.o
 
 ifdef __x86_64__
 ifdef PTLSIM_HYPERVISOR
-COMMONOBJS = linkstart.o lowlevel-64bit-xen.o ptlsim.o ptlxen.o ptlxen-memory.o ptlxen-events.o ptlxen-common.o perfctrs.o mm.o superstl.o config.o mathlib.o klibc.o ptlhwdef.o datastore.o decode-core.o decode-fast.o decode-complex.o decode-x87.o decode-sse.o uopimpl.o seqcore.o ptlsim.dst.o linkend.o
+COMMONOBJS = linkstart.o lowlevel-64bit-xen.o ptlsim.o ptlxen.o ptlxen-memory.o ptlxen-events.o ptlxen-common.o perfctrs.o mm.o superstl.o config.o mathlib.o klibc.o ptlhwdef.o datastore.o decode-core.o decode-fast.o decode-complex.o decode-x87.o decode-sse.o uopimpl.o seqcore.o ptlsim.dst.o linkend.o decode-asf.o random_inject.o
 else
-COMMONOBJS = linkstart.o lowlevel-64bit.o lowlevel-32bit.o ptlsim.o kernel.o mm.o ptlhwdef.o decode-core.o decode-fast.o decode-complex.o decode-x87.o decode-sse.o uopimpl.o datastore.o injectcode-32bit.o injectcode-64bit.o seqcore.o $(BASEOBJS) klibc.o ptlsim.dst.o linkend.o
+COMMONOBJS = linkstart.o lowlevel-64bit.o ptlsim.o kernel.o mm.o ptlhwdef.o decode-core.o decode-fast.o decode-complex.o decode-x87.o decode-sse.o uopimpl.o datastore.o injectcode-64bit.o seqcore.o $(BASEOBJS) klibc.o ptlsim.dst.o linkend.o decode-asf.o random_inject.o
 endif
 else
 # 32-bit PTLsim32 only:
-COMMONOBJS = linkstart.o lowlevel-32bit.o ptlsim.o kernel.o mm.o ptlhwdef.o decode-core.o decode-fast.o decode-complex.o decode-x87.o decode-sse.o uopimpl.o seqcore.o datastore.o injectcode-32bit.o $(BASEOBJS) klibc.o ptlsim.dst.o linkend.o
+COMMONOBJS = linkstart.o lowlevel-32bit.o ptlsim.o kernel.o mm.o ptlhwdef.o decode-core.o decode-fast.o decode-complex.o decode-x87.o decode-sse.o uopimpl.o seqcore.o datastore.o injectcode-32bit.o $(BASEOBJS) klibc.o ptlsim.dst.o linkend.o decode-asf.o random_inject.o
 endif
 
-OOOOBJS = ooocore.o ooopipe.o oooexec.o branchpred.o dcache.o
+OOOOBJS = ooocore.o ooopipe.o oooexec.o branchpred.o dcache.o asfooocore.o asfooopipe.o asfoooexec.o
 ifdef PTLSIM_HYPERVISOR
 OOOOBJS += smtcore.o smtpipe.o smtexec.o 
 endif
 OBJFILES = $(COMMONOBJS) $(OOOOBJS)
 
-COMMONINCLUDES = logic.h ptlhwdef.h decode.h seqexec.h dcache.h dcache-amd-k8.h config.h ptlsim.h datastore.h superstl.h globals.h kernel.h mm.h ptlcalls.h loader.h mathlib.h klibc.h syscalls.h ptlxen.h stats.h xen-types.h
-OOOINCLUDES = branchpred.h ooocore.h smtcore.h smtcore-amd-k8.h
+COMMONINCLUDES = logic.h ptlhwdef.h decode.h seqexec.h dcache.h dcache-amd-k8.h config.h ptlsim.h datastore.h superstl.h globals.h kernel.h mm.h ptlcalls.h loader.h mathlib.h klibc.h syscalls.h ptlxen.h stats.h xen-types.h random_inject.h
+OOOINCLUDES = branchpred.h ooocore.h smtcore.h smtcore-amd-k8.h asfooocore.h
 INCLUDEFILES = $(COMMONINCLUDES) $(OOOINCLUDES)
 
-COMMONCPPFILES = ptlsim.cpp kernel.cpp mm.cpp superstl.cpp ptlhwdef.cpp decode-core.cpp decode-fast.cpp decode-complex.cpp decode-x87.cpp decode-sse.cpp lowlevel-64bit.S lowlevel-32bit.S linkstart.S linkend.S uopimpl.cpp dcache.cpp config.cpp datastore.cpp injectcode.cpp ptlcalls.c cpuid.cpp ptlstats.cpp klibc.cpp glibc.cpp mathlib.cpp syscalls.cpp makeusage.cpp
+COMMONCPPFILES = ptlsim.cpp kernel.cpp mm.cpp superstl.cpp ptlhwdef.cpp decode-core.cpp decode-fast.cpp decode-complex.cpp decode-x87.cpp decode-sse.cpp lowlevel-64bit.S lowlevel-32bit.S linkstart.S linkend.S uopimpl.cpp dcache.cpp config.cpp datastore.cpp injectcode.cpp ptlcalls.c cpuid.cpp ptlstats.cpp klibc.cpp glibc.cpp mathlib.cpp syscalls.cpp makeusage.cpp decode-asf.cpp random_inject.cpp
 
 ifdef PTLSIM_HYPERVISOR
 COMMONCPPFILES += lowlevel-64bit-xen.S ptlxen.cpp ptlxen-memory.cpp ptlxen-events.cpp ptlxen-common.cpp perfctrs.cpp ptlmon.cpp ptlctl.cpp
 endif
-OOOCPPFILES = ooocore.cpp ooopipe.cpp oooexec.cpp smtcore.cpp smtpipe.cpp smtexec.cpp seqcore.cpp branchpred.cpp
+OOOCPPFILES = ooocore.cpp ooopipe.cpp oooexec.cpp smtcore.cpp smtpipe.cpp smtexec.cpp seqcore.cpp branchpred.cpp asfooocore.cpp asfooopipe.cpp asfoooexec.cpp
 
 CPPFILES = $(COMMONCPPFILES) $(OOOCPPFILES)
 
@@ -140,20 +155,30 @@
 makeusage: makeusage.o $(BASEOBJS) $(STDOBJS)
 	$(CC) $(CFLAGS) $(INCFLAGS) $(BASEOBJS) $(STDOBJS) makeusage.o -o makeusage
 
+ifdef __x86_64__
+DATA_OBJ_TYPE = elf64-x86-64
+else
+DATA_OBJ_TYPE = elf32-i386
+endif
+
 usage.o: makeusage Makefile
 	./makeusage > usage.txt
-	objcopy -I binary -O elf32-i386 -B i386 --rename-section .data=.usage,alloc,load,readonly,data,contents usage.txt usage.o
+	objcopy -I binary -O $(DATA_OBJ_TYPE) -B i386 --rename-section .data=.usage,alloc,load,readonly,data,contents usage.txt usage.o
 
 ptlsim.dst.o: ptlsim.dst
-	objcopy -I binary -O elf32-i386 -B i386 --rename-section .data=.dst,alloc,load,readonly,data,contents ptlsim.dst ptlsim.dst.o
+	objcopy -I binary -O $(DATA_OBJ_TYPE) -B i386 --rename-section .data=.dst,alloc,load,readonly,data,contents ptlsim.dst ptlsim.dst.o
 
 ifdef PTLSIM_HYPERVISOR
 ifdef __x86_64__
-ptlxen.bin: $(OBJFILES) Makefile ptlxen.lds
-	ld -v -g -O2 $(OBJFILES) -o ptlxen.bin -static --allow-multiple-definition -T ptlxen.lds `gcc -print-libgcc-file-name`
 
-ptlxen.bin.o: ptlxen.bin Makefile
-	objcopy -I binary -O elf64-x86-64 -B i386 --rename-section .data=.ptlxen,alloc,load,readonly,data,contents ptlxen.bin ptlxen.bin.o
+ptlxen.bin.debug: $(OBJFILES) Makefile ptlxen.lds
+	ld -v -g -O2 $(OBJFILES) -o ptlxen.bin.debug -static --allow-multiple-definition -T ptlxen.lds `gcc -print-libgcc-file-name`
+
+ptlxen.bin.o: ptlxen.bin.debug Makefile
+	strip -o ptlxen.bin ptlxen.bin.debug
+	objcopy -I binary -O $(DATA_OBJ_TYPE) -B i386 --rename-section .data=.ptlxen,alloc,load,readonly,data,contents ptlxen.bin ptlxen.bin.o
+#	rm -f ptlxen.bin
+
 endif
 endif
 
diff -r 10448c053ad6 RELEASE_NOTES
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/RELEASE_NOTES	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,200 @@
+               PTLsim-ASF 219.asf.1.1 
+	       Release Notes
+
+               Stephan Diestelhorst
+	       AMD Operating System Research Center
+
+	       August 2008
+
+Overview
+========
+
+PTLsim is a near-cycle-accurate AMD64 simulator with in-order and
+out-of-order execution-core models.
+
+PTLsim-ASF is a version of PTLsim that implements an experimental
+extension of the AMD64 architecture: the Advanced Synchronization
+Facility (ASF).
+
+The following paper describes ASF and its PTLsim implementation:
+  Hardware acceleration for lock-free data structures and software
+  transactional memory
+  Stephan Diestelhorst, Michael Hohmuth.  In the proceedings of the
+  Workshop on Exploiting Parallelism with Transactional Memory and
+  other Hardware Assisted Methods (EPHAM), April 2008.  Boston, MA
+  http://www.amd64.org/fileadmin/user_upload/pub/epham08-asf-eval.pdf
+
+Please find more information on ASF and PTLsim on the following web
+site:
+  http://www.amd64.org/research/multi-and-manycore-systems.html
+
+
+Disclaimer
+==========
+
+PTLsim-ASF introduces support for the experimental ASF extension.  AMD
+provides this implementation of ASF without any intent and commitment
+to release such functionality in any future microprocessor product.
+
+Furthermore, the experimental status of this extension manifests in
+the possibility that functionality, implementation, and interface of
+ASF may change in the future and deviate from previous versions (such
+as the one presented in the publication cited above).
+
+Having said that, you are welcome and invited to experiment with the
+Advanced Synchronization Facility and to let us know if you have any
+suggestions for improvement, discover bugs, or have any other
+comments.  Use the contact provided at the end of this document.
+
+We would also be glad to learn about interesting uses of ASF.  If you
+want to refer to ASF in a publication, please use the paper cited
+above.
+
+While the simulation core of PTLsim-ASF has been tweaked to behave
+similarly to an AMD Opteron(tm) family 10h (Barcelona) processor, this
+model is not accurate in functionality, performance, or
+internal implementation.  Do not use this model to assess, project, or
+derive any properties of a real AMD Opteron(tm) processor.
+
+
+License
+=======
+
+PTLsim-ASF is released under GPLv2.  See LICENSE for details.
+
+
+Changes to baseline PTLsim-219
+==============================
+
+PTLsim-ASF contains the following enhancements over the baseline
+PTLsim release:
+
+* An implementation of ASF in the out-of-order core model
+
+* Support for simulating multicore configurations with a simple
+  cache-coherence performance model
+
+* Updated microarchitecture for AMD Opteron(tm) processors (family
+  10h, Barcelona core)
+
+* Various bugfixes back-ported from current PTLsim releases
+
+
+Configuration
+=============
+
+The following configurations are known to work:
+
+* Out-of-order core model "asfsmt" with ASF and multicore support
+
+  This core is a modification of PTLsim's original "smtcore"
+  simulation core with extensions to simulate a true multicore system.
+  The core has been tested in full-system simulation mode only (using
+  PTLsim/X), as it relies on support for multithreaded simulation
+  which is not available in the user-space version of PTLsim.
+
+  The modified core still resides in the smt*.{h,cpp} files, but has
+  been renamed to "asfsmt" and replaces the original "smt" core.  Use
+  "-core asfsmt -run" as parameters to PTLsim/X to make use of this
+  core model.
+
+  In contrast to the original "smt" simulation model, this enhanced
+  version contains multiple truly independent cores that do not
+  compete for functional units or ROB entries, and each core has a
+  private cache hierarchy.
+
+  These cache hierarchies are kept coherent with a simplified
+  coherence protocol that models first-order performance effects of
+  coherent caches.
+
+* Out-of-order core model "asfooo" with ASF support
+
+  In order to allow experiments in user space, an additional core
+  model, named "asfooo" has been created that adds ASF to the existing
+  "ooo" single core model with out-of-order execution.  All
+  modifications have been made to a new clone of the "ooo" core, and
+  hence both versions ("ooo" and "asfooo") can be selected with the
+  appropriate command-line option, such as "-core asfooo -run".
+
+  ASF support in this simulation core is functionally limited, as
+  PTLsim's user-space simulation infrastructure does not support
+  multithreaded applications and thus concurrent threads cannot cause
+  ASF critical section aborts due to contention on data.  However,
+  this core can still be used for quick user-space prototyping of
+  applications using ASF, as various reasons for interference with ASF
+  can be simulated using a newly added random-injection framework.
+  Several random predicates have been defined that are evaluated at
+  various stages in the pipeline.  In a configurable percentage of
+  evaluations (0 % by default), these predicates evaluate to true and,
+  for example, trigger an ASF roll-back due to data contention.
+
+  In the recent upstream version of PTLsim the "ooo" core has been
+  replaced by the "smt" core (renamed to "ooo"), so that a single core
+  model is used in both user-space and full system simulation.  As the
+  PTLsim-ASF project is currently preparing for a merge with that most
+  recent version, the "asfooo" core is not tested as much as the
+  "asfsmt" core and is slated for removal.  In that future version, the
+  "asfsmt" core will be available for user-space testing and rapid
+  prototyping.
+
+* In-order core model "seq" without ASF support
+
+  The sequential in-order simulation core "seq" has not been changed
+  and is still present in the release.  It can be used for fast
+  forwarding into the simulation, but care has to be taken as it does
+  not have support for ASF.
+
+
+Usage
+=====
+
+In order to use ASF from within C / C++, include asf-highlevel.h (in
+the root of the package), which provides convenient wrappers around the
+ASF primitives for loading and prefetching values, and for starting and
+ending ASF critical sections.  Refer to the publication above for details
+on ASF primitives and section layout.
+
+Due to special treatment of the frame-pointer register within GCC (it
+cannot be clobbered), two different flavours of the macro for the
+ACQUIRE instruction exist.  If you want to compile your application
+with _enabled_ frame pointers, specify "-DASF_PUSH
+-fno-omit-frame-pointer" on GCC's command-line.  If you want to
+_disable_ frame pointers, use "-DASF_STACK -fomit-frame-pointer".
+
+This behaviour is an artifact of GCC not directly supporting ASF and
+its inability to clobber RBP, even if frame pointers are disabled.  As
+ASF does not restore any of the GPRs (except the stack pointer) after
+an abort, RBP has to be saved on the stack manually.  The ACQUIRE
+macros hence expect a local 64-bit storage on the stack that should be
+declared as follows: "volatile unsigned long acq_state;".
+
+Please note that the ASF extension is only supported for 64-bit
+applications.  Support for (legacy) 32-bit environments is not tested
+and will likely result in fancy errors anywhere in the tool chain.
+
+
+Known issues
+============
+
+* PTLsim/X sometimes crashes when rapidly switching between
+  native-execution mode and simulation mode.  The workaround is to
+  avoid switching back from simulation mode to native-execution mode,
+  or to resort to user-mode-only support.
+  Note that this is an issue introduced by the original PTLsim version.
+
+* This modification to PTLsim is based on an old release of PTLsim.
+  Several of the improvements made to upstream PTLsim have been
+  contributed by us, and hence are included in this tree as well.
+  Various others have been "back-ported".  We are currently working on
+  a merge with the current upstream version of PTLsim.
+
+Contact
+=======
+
+This version of PTLsim is maintained by
+Stephan Diestelhorst (AMD OSRC) stephan.diestelhorst@amd.com
+
+Any question specific to the extensions of PTLsim-ASF should be
+directed to him.  The mailing list for general PTLsim
+questions is ptlsim-devel@ptlsim.org.  Its archives can be found at:
+http://www.ptlsim.org/pipermail/ptlsim-devel/
diff -r 10448c053ad6 asf-highlevel.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/asf-highlevel.h	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,275 @@
+/**
+ * Definitions of high-level helper functions which can
+ * be helpful to use traditional C / C++ code for ASF
+ * critical sections.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * Copyright 2007-2008 Advanced Micro Devices, Inc.
+ * Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
+ *
+ * @author stephan.diestelhorst@amd.com
+ * @date 27.7.2007
+ */
+
+#ifndef __ASF_HIGHLEVEL_H_
+#define __ASF_HIGHLEVEL_H_
+
+#include "asf-opcodes.h"
+
+#define unlikely(x) (__builtin_expect(!!(x), 0))
+#define likely(x) (__builtin_expect(!!(x), 1))
+
+/* Maximum number of speculative cache-lines for ASF */
+#define ASF_ENTRIES (8)
+/* Size of lines for ASF */
+#define ASF_LINE_SIZE (64L)
+#define ASF_LINE_MASK (~(ASF_LINE_SIZE - 1))
+
+/**
+ * Specifies a line to be used in ASF, w/o reading the data.
+ * @param m Pointer to data to be fetched.
+ */
+static inline void lock_prefetchw(void *m) {
+	// NOTE: using "m"(*ptr) allows also immediate addressed locations
+	//       but sometimes uses an additional register :(
+	asm volatile(
+		LOCK
+		"prefetchw %0"
+		:
+		:"m"(*(long*)m));
+}
+
+/**
+ * Reads a byte with a LOCK-prefixed MOV, specifying its subsequent usage in
+ * an ASF transaction; volatile keeps the compiler from dropping the load.
+ * @param m Pointer to the byte to read.
+ * @return Data at m.
+ */
+static inline unsigned char lock_load8(unsigned char* m) {
+	unsigned char t;
+	asm volatile ( LOCK "movb %1,%0"
+		:"=r"(t):"m"(*m));
+	return t;
+}
+
+/**
+ * Reads a word with a LOCK-prefixed MOV, specifying its subsequent usage in
+ * an ASF transaction; volatile keeps the compiler from dropping the load.
+ * @param m Pointer to the 16-bit word to read.
+ * @return Data at m.
+ */
+static inline unsigned short lock_load16(unsigned short* m) {
+	unsigned short t;
+	asm volatile ( LOCK "movw %1,%0"
+		:"=r"(t):"m"(*m));
+	return t;
+}
+
+/**
+ * Reads a double-word with a LOCK-prefixed MOV, specifying its subsequent usage
+ * in an ASF transaction; volatile keeps the compiler from dropping the load.
+ * @param m Pointer to the 32-bit double-word to read.
+ * @return Data at m.
+ */
+static inline unsigned int lock_load32(unsigned int* m) {
+	unsigned int t;
+	asm volatile ( LOCK "movl %1,%0"
+		:"=r"(t):"m"(*m));
+	return t;
+}
+
+/**
+ * Reads a quad-word with a LOCK-prefixed MOV, specifying its subsequent usage
+ * in an ASF transaction; volatile keeps the compiler from dropping the load.
+ * @param m Pointer to the 64-bit quad-word to read.
+ * @return Data at m.
+ */
+static inline unsigned long lock_load64(unsigned long* m) {
+	unsigned long t;
+	asm volatile ( LOCK "movq %1,%0"
+		:"=r"(t)
+		:"m"(*m));
+	return t;
+}
+
+/**
+ * Acquires the previously specified cache-lines, and enters the
+ * critical section.
+ * @param nloc Number of locations specified.
+ * @return Did the acquire fail?
+ */
+#if(0)
+static inline long acquire(unsigned char nloc) {
+	long fail;
+	/* Do the ACQUIRE and the mandatory check on the zero flag */
+	asm volatile(
+		".byte 0x0F,0xC7," MOD_RM( MOD_REG, "2", REG_AX) "\n\t"
+		".byte %1 										\n\t"
+		"jz 1f											\n\t"
+		"#failed... let C code handle this!				\n\t"
+		"1:												\n\t"
+		"#success										\n\t"
+		:"=a"(fail)
+		:"N"(nloc));
+	return fail;
+}
+#endif
+
+/* Validate a certain number of read locations, returns =0 on success. */
+#define validate(result, nloc)			\
+	asm volatile(										\
+		"8: "ACQUIREB(REG_AX, -(nloc))						\
+		:"=a"(result))
+
+#define validate_reg(result, nloc)					\
+	asm volatile(										\
+		"8: "ACQUIRER(REG_AX)							\
+		:"=a"(result)					\
+		:"0"(-(nloc)))
+
+#define acquire_push(fail, _ign_, nloc)						\
+	asm volatile(										\
+		"push %%rbp \n\t"								\
+		"8: "ACQUIREB(REG_AX, nloc)						\
+		:"=a"(fail))
+
+#define acquire_reg_push(fail, _ign_, nloc)					\
+	asm volatile(										\
+		"push %%rbp \n\t"								\
+		"8: "ACQUIRER(REG_AX)							\
+		:"=a"(fail):"0"(nloc))
+
+#define acquire_fail_clobber_pop(_ign_)\
+	asm volatile("pop %%rbp":::"memory","rax","rbx","rcx","rdx","rsi", "rdi",	\
+		"r8", "r9","r10","r11","r12","r13", "r14","r15" )
+
+
+//NOTE: This needs state_save to be a 64-bit local variable on the stack,
+//      something like "volatile unsigned long acq_state;"
+#define acquire_stack(fail, state_save, nloc)			\
+	asm volatile(										\
+		"mov  %%rbp, %1 \n\t"							\
+		"8: "ACQUIREB(REG_AX, nloc)						\
+		:"=a"(fail),"=m"(state_save))
+
+#define acquire_reg_stack(fail, state_save, nloc)					\
+	asm volatile(										\
+		"mov  %%rbp, %1 \n\t"							\
+		"8: "ACQUIRER(REG_AX)							\
+		:"=a"(fail),"=m"(state_save)					\
+		:"0"(nloc))
+#define acquire_fail_clobber_stack(state_save)\
+	asm volatile("mov %0,%%rbp"							\
+		:												\
+		:"m"(state_save)								\
+		:"memory","rax","rbx","rcx","rdx","rsi", "rdi",	\
+		 "r8", "r9","r10","r11","r12","r13", "r14","r15" )
+
+
+/**
+ * Commits all modifications made to the cachelines.
+ */
+static inline void commit_() {asm volatile("9: "COMMIT : : :"memory");}
+static inline void commit_pop() {
+	asm volatile("9: "COMMIT "\n\t"
+	             "add $8,%%rsp": : :"memory");
+}
+#if !defined(ASF_PUSH) && !defined(ASF_STACK)
+#error Specify either "-DASF_STACK -fomit-frame-pointer" or "-DASF_PUSH -fno-omit-frame-pointer".
+#elif defined(ASF_PUSH) && defined(ASF_STACK)
+#error ASF_PUSH and ASF_STACK are mutually exclusive: define exactly one of them.
+#endif
+
+#ifdef ASF_PUSH
+  #warning "ASF uses the push based interface. Make sure to ENABLE frame-pointers."
+  #define acquire acquire_push
+  #define acquire_reg acquire_reg_push
+  #define acquire_fail_clobber acquire_fail_clobber_pop
+  #define commit commit_pop
+#endif
+#ifdef ASF_STACK
+  #warning "ASF uses the stack based interface. Make sure to DISABLE frame-pointers."
+  #define acquire acquire_stack
+  #define acquire_reg acquire_reg_stack
+  #define acquire_fail_clobber acquire_fail_clobber_stack
+  #define commit commit_
+#endif
+
+/**
+ * Touches the given arguments: each is a "g" input to an empty volatile asm.
+ */
+#define touch1(arg1) \
+	asm volatile (""::"g"(arg1))
+#define touch2(arg1,arg2) \
+	do { touch1(arg1);    \
+	     touch1(arg2); } while (0)
+#define touch3(arg1,arg2,arg3) \
+	do { touch1(arg1);         \
+	     touch2(arg2, arg3); } while (0)
+#define touch4(arg1,arg2,arg3,arg4) \
+	do { touch1(arg1);              \
+	     touch3(arg2, arg3, arg4); } while (0)
+#define touch5(arg1,arg2,arg3,arg4,arg5) \
+	do { touch1(arg1);                   \
+	     touch4(arg2, arg3, arg4, arg5); } while (0)
+#define touch6(arg1,arg2,arg3,arg4,arg5,arg6) \
+	do { touch1(arg1);                        \
+	     touch5(arg2, arg3, arg4, arg5, arg6); } while (0)
+#define touch7(arg1,arg2,arg3,arg4,arg5,arg6,arg7) \
+	do { touch1(arg1);                             \
+	     touch6(arg2, arg3, arg4, arg5, arg6, arg7); } while (0)
+#define touch8(arg1,arg2,arg3,arg4,arg5,arg6,arg7,arg8) \
+	do { touch1(arg1);                                  \
+	     touch7(arg2, arg3, arg4, arg5, arg6, arg7, arg8); } while (0)
+
+/**
+ * Returns the start / end of the next / previous
+ * transaction. Relies on the fact, that nobody uses
+ * the local labels 8 and 9, between these functions
+ * and the transactions themselves.
+ */
+static inline long start_prev_txn() {
+	long start;
+	asm volatile ("movq $8b, %0":"=r"(start));
+	return start;
+}
+static inline long end_prev_txn() {
+	long start;
+	asm volatile ("movq $9b, %0":"=r"(start));
+	return start;
+}
+static inline long start_next_txn() {
+	long start;
+	asm volatile ("movq $8f, %0":"=r"(start));
+	return start;
+}
+static inline long end_next_txn() {
+	long start;
+	asm volatile ("movq $9f, %0":"=r"(start));
+	return start;
+}
+static inline long start_this_txn() {
+	long start;
+	asm volatile ("movq $8b, %0":"=r"(start));
+	return start;
+}
+static inline long end_this_txn() {
+	long start;
+	asm volatile ("movq $9f, %0":"=r"(start));
+	return start;
+}
+#endif
diff -r 10448c053ad6 asf-opcodes.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/asf-opcodes.h	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,77 @@
+/**
+ * Opcodes definitions for AMD's ASF (Advanced Synchronization Facility)
+ * 
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ * 02110-1301, USA.
+ *
+ * Copyright 2007-2008 Advanced Micro Devices, Inc.
+ * Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
+ *
+ * @author stephan.diestelhorst@amd.com
+ * @date 21.06.2007
+ */
+
+/* First: The registers for ModR/M.rm field */
+#define REG_AX "(0x0)"
+#define REG_CX "(0x1)"
+#define REG_DX "(0x2)"
+#define REG_BX "(0x3)"
+
+#define REG_SP "(0x4)"
+#define REG_BP "(0x5)"
+#define REG_SI "(0x6)"
+#define REG_DI "(0x7)"
+
+/* ModRM byte helpers */
+/* ModRM mod = 3: direct registers in ModR/M.rm */
+#define MOD_REG "(0x3)"
+#define MOD_RM(mod, reg, rm)\
+	"(((" mod ")<<6)+((" reg ")<<3)+(" rm "))"
+
+/* REX Prefix */
+#define REX(w,r,x,b)\
+	".byte 0x40+(" w "<<3)+(" r "<<2)+(" x "<<1)+" b "\n\t"
+#define REX64 REX("1","0","0","0")
+
+/* Operand Size prefix */
+#define DATA16\
+	".byte 0x66\n\t"
+
+/* LOCK prefix */
+#define LOCK\
+	".byte 0xF0\n\t"
+
+/* ASF instructions */
+#define ACQUIREB(reg, imm8)\
+	"#acquire " #reg "," #imm8 "\n\t"\
+	".byte 0x0F,0xC7," MOD_RM( MOD_REG, "2", reg) "\n\t"\
+	".byte " #imm8 "\n\t"
+#define ACQUIREW(reg, imm16)\
+	"#acquire " #reg "," #imm16 "\n\t"\
+	".byte 0x0F,0xC7," MOD_RM( MOD_REG, "2", reg) "\n\t"\
+	".word " #imm16 "\n\t"
+#define ACQUIREDW(reg, imm32)\
+	"#acquire " #reg "," #imm32 "\n\t"\
+	".byte 0x0F,0xC7," MOD_RM( MOD_REG, "2", reg) "\n\t"\
+	".long " #imm32 "\n\t"
+
+/* Extension: Give line-count in register reg. */
+#define ACQUIRER(reg)\
+	"#acquire " #reg "\n\t"\
+	".byte 0x0F,0xC7," MOD_RM( MOD_REG, "4", reg) "\n\t"
+
+#define COMMIT\
+	"#commit\n\t"\
+	".byte 0x0F,0xC7," MOD_RM( "0", "3","0") "\n\t"
diff -r 10448c053ad6 asfooocore.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/asfooocore.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,1643 @@
+//
+// PTLsim: Cycle Accurate x86-64 Simulator
+// Out-of-Order Core Simulator With Experimental AMD64 ASF Extension
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+// Copyright 2003-2005 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
+//
+
+#include <globals.h>
+#include <elf.h>
+#include <ptlsim.h>
+#include <branchpred.h>
+#include <datastore.h>
+#include <logic.h>
+#include <dcache.h>
+
+#define INSIDE_OOOCORE
+#define DECLARE_STRUCTURES
+#include <asfooocore.h>
+#include <stats.h>
+
+#ifndef ENABLE_CHECKS
+#undef assert
+#define assert(x) (x)
+#endif
+
+#ifndef ENABLE_LOGGING
+#undef logable
+#define logable(level) (0)
+#endif
+
+using namespace ASFOutOfOrderModel;
+
+namespace ASFOutOfOrderModel {
+  byte uop_executable_on_cluster[OP_MAX_OPCODE];
+  W32 forward_at_cycle_lut[MAX_CLUSTERS][MAX_FORWARDING_LATENCY+1];
+};
+
+//
+// Initialize lookup tables used by the simulation
+//
+static void init_luts() {
+  // Initialize opcode maps
+  foreach (i, OP_MAX_OPCODE) {
+    W32 allowedfu = fuinfo[i].fu;
+    W32 allowedcl = 0;
+    foreach (cl, MAX_CLUSTERS) {
+      if (clusters[cl].fu_mask & allowedfu) setbit(allowedcl, cl);
+    }
+    uop_executable_on_cluster[i] = allowedcl;
+  }
+  
+  // Initialize forward-at-cycle LUTs
+  foreach (srcc, MAX_CLUSTERS) {
+    foreach (destc, MAX_CLUSTERS) {
+      foreach (lat, MAX_FORWARDING_LATENCY+1) {
+        if (lat == intercluster_latency_map[srcc][destc]) {
+          setbit(forward_at_cycle_lut[srcc][lat], destc);
+        }
+      }
+    }
+  }
+}
+
+void OutOfOrderCore::init_generic() {
+  //
+  // ROB states: set up every ROB state list (name, owning rob_states
+  // list-of-lists, flags). Each statement is terminated by its own ';'.
+  rob_free_list("free", rob_states, 0);
+  rob_frontend_list("frontend", rob_states, ROB_STATE_PRE_READY_TO_DISPATCH);
+  rob_ready_to_dispatch_list("ready-to-dispatch", rob_states, 0);
+  InitClusteredROBList(rob_dispatched_list, "dispatched", ROB_STATE_IN_ISSUE_QUEUE);
+  InitClusteredROBList(rob_ready_to_issue_list, "ready-to-issue", ROB_STATE_IN_ISSUE_QUEUE);
+  InitClusteredROBList(rob_ready_to_store_list, "ready-to-store", ROB_STATE_IN_ISSUE_QUEUE);
+  InitClusteredROBList(rob_ready_to_load_list, "ready-to-load", ROB_STATE_IN_ISSUE_QUEUE);
+  InitClusteredROBList(rob_issued_list, "issued", 0);
+  InitClusteredROBList(rob_completed_list, "completed", ROB_STATE_READY);
+  InitClusteredROBList(rob_ready_to_writeback_list, "ready-to-write", ROB_STATE_READY);
+  rob_cache_miss_list("cache-miss", rob_states, 0);
+  rob_ready_to_commit_queue("ready-to-commit", rob_states, ROB_STATE_READY);
+  //
+  // Miscellaneous: reset front-end bookkeeping, the cache hierarchy
+  // and the interrupt-tracking flags to their initial values.
+  branchpred.init();
+  fetch_uuid = 0;
+  current_basic_block = null;
+  current_basic_block_transop_index = 0;
+  current_icache_block = 0;
+  round_robin_reg_file_offset = 0;
+  smc_invalidate_pending = 0;
+  caches.reset();
+  caches.callback = &cache_callbacks;
+  setzero(robs_on_fu);
+  prev_interrupts_pending = 0;
+  handle_interrupt_at_next_eom = 0;
+}
+
+template <typename T> 
+static void ASFOutOfOrderModel::print_list_of_state_lists(ostream& os, const ListOfStateLists& lol, const char* title) {
+  os << title, ":", endl;
+  foreach (i, lol.count) {
+    StateList& list = *lol[i];
+    if (!list.count) continue;
+    os << list.name, " (", list.count, " entries):", endl;
+    int n = 0;
+    T* obj;
+    foreach_list_mutable(list, obj, entry, nextentry) {
+      if ((n % 16) == 0) os << " ";
+      os << " ", intstring(obj->index(), -3);
+      if (((n % 16) == 15) || (n == list.count-1)) os << endl;
+      n++;
+    }
+    os << endl;
+    // list.validate();
+  }
+}
+
+void StateList::checkvalid() {
+#if 0
+  int realcount = 0;
+  selfqueuelink* obj;
+  foreach_list_mutable(*this, obj, entry, nextentry) {
+    realcount++;
+  }
+  assert(count == realcount);
+#endif
+}
+
+void PhysicalRegisterFile::init(const char* name, int coreid, int rfid, int size) {
+  assert(rfid < PHYS_REG_FILE_COUNT);
+  assert(size <= MAX_PHYS_REG_FILE_SIZE);
+  this->size = size;
+  this->coreid = coreid;
+  this->rfid = rfid;
+  this->name = name;
+  this->allocations = 0;
+  this->frees = 0;
+
+  foreach (i, MAX_PHYSREG_STATE) {
+    states[i].init(physreg_state_names[i], getcore().physreg_states);
+  }
+
+  foreach (i, size) {
+    (*this)[i].init(coreid, rfid, i);
+  }
+}
+
+PhysicalRegister* PhysicalRegisterFile::alloc(int r) {
+  PhysicalRegister* physreg = (PhysicalRegister*)((r >= 0) ? states[PHYSREG_FREE].remove(&(*this)[r]) : states[PHYSREG_FREE].dequeue());
+  if unlikely (!physreg) return null;
+  physreg->state = PHYSREG_NONE;
+  physreg->changestate(PHYSREG_WAITING);
+  physreg->flags = FLAG_WAIT;
+  allocations++;
+  return physreg;
+}
+
+ostream& PhysicalRegisterFile::print(ostream& os) const {
+  os << "PhysicalRegisterFile<", name, ", rfid ", rfid, ", size ", size, ">:", endl;
+  foreach (i, size) {
+    os << (*this)[i], endl;
+  }
+  return os;
+}
+
+void PhysicalRegisterFile::reset() {
+  foreach (i, MAX_PHYSREG_STATE) {
+    states[i].reset();
+  }
+
+  foreach (i, size) {
+    (*this)[i].reset();
+  }
+}
+
+StateList& PhysicalRegister::get_state_list(int s) const {
+  return getcore().physregfiles[rfid].states[s];
+}
+
+namespace ASFOutOfOrderModel {
+  ostream& operator <<(ostream& os, const PhysicalRegister& physreg) {
+    stringbuf sb;
+    print_value_and_flags(sb, physreg.data, physreg.flags);
+    
+    os << "  r", intstring(physreg.index(), -3), " state ", padstring(physreg.get_state_list().name, -12), " ", sb;
+    if (physreg.rob) os << " rob ", physreg.rob->index(), " (uuid ", physreg.rob->uop.uuid, ")";
+    os << " refcount ", physreg.refcount;
+    
+    return os;
+  }
+};
+
+ostream& RegisterRenameTable::print(ostream& os) const {
+  foreach (i, TRANSREG_COUNT) {
+    if ((i % 8) == 0) os << " ";
+    os << " ", padstring(arch_reg_names[i], -6), " r", intstring((*this)[i]->index(), -3), " | ";
+    if (((i % 8) == 7) || (i == TRANSREG_COUNT-1)) os << endl;
+  }
+  return os;
+}
+
+//
+// Execute one cycle of the entire core state machine
+//
+bool OutOfOrderCore::runcycle() {
+  bool exiting = 0;
+
+  //
+  // Detect edge triggered transition from 0->1 for
+  // pending interrupt events, then wait for current
+  // x86 insn EOM uop to commit before redirecting
+  // to the interrupt handler.
+  //
+#ifdef PTLSIM_HYPERVISOR
+  bool current_interrupts_pending = ctx.check_events();
+  bool edge_triggered = ((!prev_interrupts_pending) & current_interrupts_pending);
+  handle_interrupt_at_next_eom |= edge_triggered;
+  prev_interrupts_pending = current_interrupts_pending;
+#endif
+
+  // All FUs are available at top of cycle:
+  fu_avail = bitmask(FU_COUNT);
+  loads_in_this_cycle = 0;
+  caches.clock();
+
+  int commitrc = commit();
+
+  for_each_cluster(i) { writeback(i); }
+  for_each_cluster(i) { transfer(i); }
+  for_each_cluster(i) { issue(i); complete(i); }
+
+  int dispatchrc = dispatch();
+
+  if likely (dispatchrc >= 0) {
+    frontend();
+    rename();
+    fetch();
+  }
+
+  if likely (dispatchrc >= 0) { foreach_issueq(clock()); }
+
+  commitrc = asf_runcycle(commitrc);
+
+  if unlikely (config.event_log_enabled) {
+    if unlikely (config.flush_event_log_every_cycle) {
+      eventlog.flush(true);
+    }
+  }
+
+#ifdef ENABLE_CHECKS
+  // This significantly slows down simulation; only enable it if absolutely needed:
+  //check_refcounts();
+#endif
+
+  if unlikely (commitrc == COMMIT_RESULT_SMC) {
+    if (logable(3)) logfile << "Potentially cross-modifying SMC detected: global flush required (cycle ", sim_cycle, ", ", total_user_insns_committed, " commits)", endl, flush;
+    flush_pipeline();
+    invalidate_smc();
+    exiting = 0;
+  } else if unlikely (commitrc == COMMIT_RESULT_EXCEPTION) {
+    exiting = !handle_exception();
+  } else if unlikely (commitrc == COMMIT_RESULT_BARRIER) {
+    exiting = !handle_barrier();
+  } else if unlikely (commitrc == COMMIT_RESULT_INTERRUPT) {
+    handle_interrupt();
+  } else if unlikely (commitrc == COMMIT_RESULT_STOP) {
+    exiting = 1;
+  }
+
+  if unlikely ((sim_cycle - last_commit_at_cycle) > 1024) {
+    stringbuf sb;
+    sb << "WARNING: At cycle ", sim_cycle, ", ", total_user_insns_committed, 
+      " user commits: no instructions have committed for ", (sim_cycle - last_commit_at_cycle),
+      " cycles; the pipeline could be deadlocked", endl;
+    logfile << sb, flush;
+    cerr << sb, flush;
+    exiting = 1;
+  }
+
+  return exiting;
+}
+
+//
+// ReorderBufferEntry
+//
+void ReorderBufferEntry::init(int idx) {
+  this->idx = idx;
+  entry_valid = 0;
+  selfqueuelink::reset();
+  current_state_list = null;
+  reset();
+}
+
+//
+// Clean out various fields from the ROB entry that are 
+// expected to be zero when allocating a new ROB entry.
+//
+void ReorderBufferEntry::reset() {
+  // Deallocate ROB entry
+  entry_valid = false;
+  cycles_left = 0;
+  physreg = (PhysicalRegister*)null;
+  lfrqslot = -1;
+  lsq = 0;
+  issued = 0;
+  load_store_second_phase = 0;
+  lock_acquired = 0;
+  consumer_count = 0;
+  executable_on_cluster_mask = 0;
+#ifdef ENABLE_TRANSIENT_VALUE_TRACKING
+  dest_renamed_before_writeback = 0;
+  no_branches_between_renamings = 0;
+#endif
+  // Clear the LLB line pointer: a freshly reset entry has no line attached.
+  llbline = (LLBLine*)null;
+}
+
+bool ReorderBufferEntry::ready_to_issue() const {
+  bool raready = operands[0]->ready();
+  bool rbready = operands[1]->ready();
+  bool rcready = operands[2]->ready();
+  bool rsready = operands[3]->ready();
+  
+  if (isstore(uop.opcode)) {
+    return (load_store_second_phase) ? (raready & rbready & rcready & rsready) : (raready & rbready);
+  } else if (isload(uop.opcode)) {
+    return (load_store_second_phase) ? (raready & rbready & rcready & rsready) : (raready & rbready & rcready);
+  } else {
+    return (raready & rbready & rcready & rsready);
+  }
+}
+
+bool ReorderBufferEntry::ready_to_commit() const {
+  return (current_state_list == &getcore().rob_ready_to_commit_queue);
+}
+
+StateList& ReorderBufferEntry::get_ready_to_issue_list() const {
+  OutOfOrderCore& core = getcore();
+  return 
+    isload(uop.opcode) ? core.rob_ready_to_load_list[cluster] :
+    isstore(uop.opcode) ? core.rob_ready_to_store_list[cluster] :
+    core.rob_ready_to_issue_list[cluster];
+}
+
+//
+// Reorder Buffer
+//
+stringbuf& ReorderBufferEntry::get_operand_info(stringbuf& sb, int operand) const {
+  // Appends "r<index>", the register file name if there are several, and a state suffix to sb.
+  PhysicalRegister& physreg = *operands[operand];
+
+  sb << "r", physreg.index();
+  if (PHYS_REG_FILE_COUNT > 1) sb << "@", getcore().physregfiles[physreg.rfid].name;
+
+  switch (physreg.state) {
+  case PHYSREG_WRITTEN:
+    sb << " (written)"; break;
+  case PHYSREG_BYPASS:
+    sb << " (ready)"; break;
+  case PHYSREG_WAITING: // physreg.rob is dereferenced only here; it may be null in other states
+    sb << " (wait rob ", physreg.rob->index(), " uuid ", physreg.rob->uop.uuid, ")"; break;
+  case PHYSREG_ARCH:
+    break; // No suffix printed; the former "(zero)" / "(arch ...)" code after this break never ran.
+  case PHYSREG_PENDINGFREE:
+    sb << " (pending free for ", arch_reg_names[physreg.archreg], ")"; break;
+  default:
+    // Cannot be in free state!
+    sb << " (FREE)"; assert(false); break;
+  }
+
+  return sb;
+}
+
+ostream& ReorderBufferEntry::print_operand_info(ostream& os, int operand) const {
+  stringbuf sb;
+  get_operand_info(sb, operand);
+  os << sb;
+  return os;
+}
+
+ostream& ReorderBufferEntry::print(ostream& os) const {
+  stringbuf name, rainfo, rbinfo, rcinfo;
+  nameof(name, uop);
+  get_operand_info(rainfo, 0);
+  get_operand_info(rbinfo, 1);
+  get_operand_info(rcinfo, 2);
+
+  os << "rob ", intstring(index(), -3), " uuid ", intstring(uop.uuid, 16), " ", padstring(current_state_list->name, -24), " @ ", padstring((cluster >= 0) ? clusters[cluster].name : "???", -4), " ", padstring(name, -12), " r", 
+    intstring(physreg->index(), -3), " ", padstring(arch_reg_names[uop.rd], -6);
+  if (isload(uop.opcode)) 
+    os << " ld", intstring(lsq->index(), -3);
+  else if (isstore(uop.opcode))
+    os << " st", intstring(lsq->index(), -3);
+  else os << "      ";
+
+  os << " = ";
+  os << padstring(rainfo, -30);
+  os << padstring(rbinfo, -30);
+  os << padstring(rcinfo, -30);
+
+  if (llbline)
+    os << " llb: ", llbline;
+
+  return os;
+}
+
+void OutOfOrderCore::print_rob(ostream& os) {
+  os << "ROB head ", ROB.head, " to tail ", ROB.tail, " (", ROB.count, " entries):", endl;
+  foreach_forward(ROB, i) {
+    ReorderBufferEntry& rob = ROB[i];
+    os << "  ", rob, endl;
+  }
+}
+
+void OutOfOrderCore::print_lsq(ostream& os) {
+  os << "LSQ head ", LSQ.head, " to tail ", LSQ.tail, " (", LSQ.count, " entries):", endl;
+  foreach_forward(LSQ, i) {
+    LoadStoreQueueEntry& lsq = LSQ[i];
+    os << "  ", lsq, endl;
+  }
+}
+
+void OutOfOrderCore::print_rename_tables(ostream& os) {
+  os << "SpecRRT:", endl;
+  os << specrrt;
+  os << "CommitRRT:", endl;
+  os << commitrrt;
+}
+
+void OutOfOrderCore::dump_ooo_state(ostream& os) {
+  print_rename_tables(os);
+  print_rob(os);
+  print_list_of_state_lists<PhysicalRegister>(os, physreg_states, "Physical register states");
+  print_list_of_state_lists<ReorderBufferEntry>(os, rob_states, "ROB entry states");
+  print_lsq(os);
+  os << "Issue Queues:", endl;
+  foreach_issueq(print(os));
+  foreach (i, PHYS_REG_FILE_COUNT) {
+    os << physregfiles[i];
+  }
+  caches.print(os);
+  os << flush;
+}
+
+//
+// Validate the physical register reference counters against what
+// is really accessible from the various tables and operand fields.
+//
+// This is for debugging only.
+//
+void OutOfOrderCore::check_refcounts() {
+  int refcounts[PHYS_REG_FILE_COUNT][MAX_PHYS_REG_FILE_SIZE];
+  memset(refcounts, 0, sizeof(refcounts));
+
+  foreach (rfid, PHYS_REG_FILE_COUNT) {
+    // Null physreg in each register file is special and can never be freed:
+    refcounts[rfid][PHYS_REG_NULL]++;
+  }
+
+  foreach_forward(ROB, i) {
+    ReorderBufferEntry& rob = ROB[i];
+    foreach (j, MAX_OPERANDS) {
+      refcounts[rob.operands[j]->rfid][rob.operands[j]->index()]++;
+    }
+  }
+
+  foreach (i, TRANSREG_COUNT) {
+    refcounts[commitrrt[i]->rfid][commitrrt[i]->index()]++;
+    refcounts[specrrt[i]->rfid][specrrt[i]->index()]++;
+  }
+
+  bool errors = 0;
+
+  foreach (rfid, PHYS_REG_FILE_COUNT) {
+    PhysicalRegisterFile& physregs = physregfiles[rfid];
+    foreach (i, physregs.size) {
+      if unlikely (physregs[i].refcount != refcounts[rfid][i]) {
+        logfile << "ERROR: r", i, " refcount is ", physregs[i].refcount, " but should be ", refcounts[rfid][i], endl;
+        
+        foreach_forward(ROB, r) {
+          ReorderBufferEntry& rob = ROB[r];
+          foreach (j, MAX_OPERANDS) {
+            if ((rob.operands[j]->index() == i) & (rob.operands[j]->rfid == rfid)) logfile << "  ROB ", r, " operand ", j, endl;
+          }
+        }
+        
+        foreach (j, TRANSREG_COUNT) {
+          if ((commitrrt[j]->index() == i) & (commitrrt[j]->rfid == rfid)) logfile << "  CommitRRT ", arch_reg_names[j], endl;
+          if ((specrrt[j]->index() == i) & (specrrt[j]->rfid == rfid)) logfile << "  SpecRRT ", arch_reg_names[j], endl;
+        }
+        
+        errors = 1;
+      }
+    }
+  }
+
+  if (errors) assert(false);
+}
+
+void OutOfOrderCore::check_rob() {
+  foreach (i, ROB_SIZE) {
+    ReorderBufferEntry& rob = ROB[i];
+    if (!rob.entry_valid) continue;
+    assert(inrange((int)rob.forward_cycle, 0, (MAX_FORWARDING_LATENCY+1)-1));
+  }
+
+  foreach (i, rob_states.count) {
+    StateList& list = *rob_states[i];
+    ReorderBufferEntry* rob;
+    foreach_list_mutable(list, rob, entry, nextentry) {
+      assert(inrange(rob->index(), 0, ROB_SIZE-1));
+      assert(rob->current_state_list == &list);
+      if (!((rob->current_state_list != &rob_free_list) ? rob->entry_valid : (!rob->entry_valid))) {
+        logfile << "ROB ", rob->index(), " list = ", rob->current_state_list->name, " entry_valid ", rob->entry_valid, endl, flush;
+        dump_ooo_state(logfile);
+        assert(false);
+      }
+    }
+  }
+}
+
+ostream& LoadStoreQueueEntry::print(ostream& os) const {
+  os << (store ? "st" : "ld"), intstring(index(), -3), " ";
+  os << "uuid ", intstring(rob->uop.uuid, 10), " ";
+  os << "rob ", intstring(rob->index(), -3), " ";
+  os << "r", intstring(rob->physreg->index(), -3);
+  if (PHYS_REG_FILE_COUNT > 1) os << "@", getcore().physregfiles[rob->physreg->rfid].name;
+  os << " ";
+  if (invalid) {
+    os << "< Invalid: fault 0x", hexstring(data, 8), " > ";
+  } else {
+    if (datavalid)
+      os << bytemaskstring((const byte*)&data, bytemask, 8);
+    else os << "<    Data Invalid     >";
+    os << " @ ";
+    if (addrvalid)
+      os << "0x", hexstring(physaddr << 3, 48);
+    else os << "< Addr Inval >";
+  }    
+  return os;
+}
+
+//
+// Barriers must flush the fetchq and stall the frontend until
+// after the barrier is consumed. Execution resumes at the address
+// in internal register nextrip (rip after the instruction) after
+// handling the barrier in microcode.
+//
+bool OutOfOrderCore::handle_barrier() {
+  // Release resources of everything in the pipeline:
+  core_to_external_state();
+  flush_pipeline();
+
+  int assistid = ctx.commitarf[REG_rip];
+  assist_func_t assist = (assist_func_t)(Waddr)assistid_to_func[assistid];
+  
+  if (logable(4)) {
+    logfile << "[vcpu ", ctx.vcpuid, "] Barrier (#", assistid, " -> ", (void*)assist, " ", assist_name(assist), " called from ",
+      (RIPVirtPhys(ctx.commitarf[REG_selfrip]).update(ctx)), "; return to ", (void*)(Waddr)ctx.commitarf[REG_nextrip],
+      ") at ", sim_cycle, " cycles, ", total_user_insns_committed, " commits", endl, flush;
+  }
+
+  if (logable(6)) logfile << "Calling assist function at ", (void*)assist, "...", endl, flush; 
+  
+  update_assist_stats(assist);
+  if (logable(6)) {
+    logfile << "Before assist:", endl, ctx, endl;
+#ifdef PTLSIM_HYPERVISOR
+    logfile << sshinfo, endl;
+#endif
+  }
+  
+  assist(ctx);
+  
+  if (logable(6)) {
+    logfile << "Done with assist", endl;
+    logfile << "New state:", endl;
+    logfile << ctx;
+#ifdef PTLSIM_HYPERVISOR
+    logfile << sshinfo;
+#endif
+  }
+
+  // Flush again, but restart at possibly modified rip
+  flush_pipeline();
+
+#ifndef PTLSIM_HYPERVISOR
+  if (requested_switch_to_native) {
+    logfile << "PTL call requested switch to native mode at rip ", (void*)(Waddr)ctx.commitarf[REG_rip], endl;
+    return false;
+  }
+#endif
+  return true;
+}
+
+bool OutOfOrderCore::handle_exception() {
+  // Release resources of everything in the pipeline:
+  core_to_external_state();
+  flush_pipeline();
+
+  if (logable(4)) {
+    logfile << "Exception ", exception_name(ctx.exception), " called from rip ", (void*)(Waddr)ctx.commitarf[REG_rip], 
+      " at ", sim_cycle, " cycles, ", total_user_insns_committed, " commits", endl, flush;
+  }
+
+  //
+  // CheckFailed and SkipBlock exceptions are raised by the chk uop.
+  // This uop is used at the start of microcoded instructions to assert
+  // that certain conditions are true so complex corrective actions can
+  // be taken if the check fails.
+  //
+  // SkipBlock is a special case used for checks at the top of REP loops.
+  // Specifically, if the %rcx register is zero on entry to the REP, no
+  // action at all is to be taken; the rip should simply advance to
+  // whatever is in chk_recovery_rip and execution should resume.
+  //
+  // CheckFailed exceptions usually indicate the processor needs to take
+  // evasive action to avoid a user visible exception. For instance, 
+  // CheckFailed is raised when an inlined floating point operand is
+  // denormal or otherwise cannot be handled by inlined fastpath uops,
+  // or when some unexpected segmentation or page table conditions
+  // arise.
+  //
+  if (ctx.exception == EXCEPTION_SkipBlock) {
+    ctx.commitarf[REG_rip] = chk_recovery_rip;
+    if (logable(6)) logfile << "SkipBlock pseudo-exception: skipping to ", (void*)(Waddr)ctx.commitarf[REG_rip], endl, flush;
+    flush_pipeline();
+    return true;
+  }
+
+  /* S.D.: Ignore ASF Testing exceptions! -> Reexecute Instruction*/
+  if (ctx.exception == EXCEPTION_ASF_Testing) {
+    cerr << __FILE__,":",__LINE__,"@",sim_cycle,": Ignoring ASF testing exception, re-executing instruction @ ",
+      (void*)(Waddr)ctx.commitarf[REG_rip], endl, flush;
+    flush_pipeline();
+    return true;
+  }
+#ifdef PTLSIM_HYPERVISOR
+  //
+  // Map PTL internal hardware exceptions to their x86 equivalents,
+  // depending on the context. The error_code field should already
+  // be filled out.
+  //
+  // Exceptions not listed here are propagated by microcode
+  // rather than the processor itself.
+  //
+  switch (ctx.exception) {
+  case EXCEPTION_PageFaultOnRead:
+  case EXCEPTION_PageFaultOnWrite:
+  case EXCEPTION_PageFaultOnExec:
+    ctx.x86_exception = EXCEPTION_x86_page_fault; break;
+  case EXCEPTION_FloatingPointNotAvailable:
+    ctx.x86_exception = EXCEPTION_x86_fpu_not_avail; break;
+  case EXCEPTION_FloatingPoint:
+    ctx.x86_exception = EXCEPTION_x86_fpu; break;
+  default:
+    logfile << "Unsupported internal exception type ", exception_name(ctx.exception), endl, flush;
+    assert(false);
+  }
+
+  if (logable(4)) {
+    logfile << ctx;
+    logfile << sshinfo;
+  }
+
+  ctx.propagate_x86_exception(ctx.x86_exception, ctx.error_code, ctx.cr2);
+
+  // Flush again, but restart at modified rip
+  flush_pipeline();
+
+  return true;
+#else
+  if (logable(6)) 
+    logfile << "Exception (", exception_name(ctx.exception), " called from ", (void*)(Waddr)ctx.commitarf[REG_rip], 
+      ") at ", sim_cycle, " cycles, ", total_user_insns_committed, " commits", endl, flush;
+
+  stringbuf sb;
+  logfile << exception_name(ctx.exception), " detected at fault rip ", (void*)(Waddr)ctx.commitarf[REG_rip], " @ ", 
+    total_user_insns_committed, " commits (", total_uops_committed, " uops): genuine user exception (",
+    exception_name(ctx.exception), "); aborting", endl;
+  logfile << ctx, endl;
+  machine.dump_state(logfile);
+  logfile << flush;
+
+  logfile << "Aborting...", endl, flush;
+  cerr << "Aborting...", endl, flush;
+
+  assert(false);
+  return false;
+#endif
+}
+
+bool OutOfOrderCore::handle_interrupt() { // Delivers pending events via ctx.event_upcall(); a no-op without PTLSIM_HYPERVISOR. Always returns true.
+#ifdef PTLSIM_HYPERVISOR
+  // Release resources of everything in the pipeline:
+  core_to_external_state();
+  flush_pipeline();
+  // At log level 6, record the context as it looks before the upcall.
+  if (logable(6)) {
+    logfile << "Interrupts pending at ", sim_cycle, " cycles, ", total_user_insns_committed, " commits", endl, flush;
+    logfile << "Context at interrupt:", endl;
+    logfile << ctx;
+    logfile << sshinfo;
+    logfile.flush();
+  }
+  // Presumably redirects the context to the guest's event handler (see the "modified rip" note below).
+  ctx.event_upcall();
+  // At log level 6, record the context again so the redirect is visible in the log.
+  if (logable(6)) {
+    logfile << "After interrupt redirect:", endl;
+    logfile << ctx;
+    logfile << sshinfo;
+    logfile.flush();
+  }
+
+  // Flush again, but restart at modified rip
+  flush_pipeline();
+#endif
+  return true;
+}
+
+//
+// Event Formatting
+//
+void PhysicalRegister::fill_operand_info(PhysicalRegisterOperandInfo& opinfo) {
+  // Copy this register's identity and current state into the event record.
+  opinfo.physreg = index();
+  opinfo.archreg = archreg;
+  opinfo.rfid = rfid;
+  opinfo.state = state;
+  if (!rob) return; // no owning ROB entry: opinfo.rob and opinfo.uuid are left untouched
+  opinfo.rob = rob->index();
+  opinfo.uuid = rob->uop.uuid;
+}
+
+ostream& ASFOutOfOrderModel::operator <<(ostream& os, const PhysicalRegisterOperandInfo& opinfo) {
+  // Emits "[r<physreg> <short state name> <detail>]"; the detail depends on the state.
+  const bool in_flight = (opinfo.state == PHYSREG_WAITING) || (opinfo.state == PHYSREG_BYPASS) || (opinfo.state == PHYSREG_WRITTEN);
+  const bool architectural = (opinfo.state == PHYSREG_ARCH) || (opinfo.state == PHYSREG_PENDINGFREE);
+  os << "[r", opinfo.physreg, " ", short_physreg_state_names[opinfo.state], " ";
+  if (in_flight) {
+    // Waiting, bypass and written registers are identified by their ROB entry.
+    os << "rob ", opinfo.rob, " uuid ", opinfo.uuid;
+  } else if (architectural) {
+    os << arch_reg_names[opinfo.archreg];
+  }
+  os << "]"; // any other state prints no detail at all
+  return os;
+}
+
+bool EventLog::init(size_t bufsize) {
+  // Drops any existing ring buffer, then allocates room for bufsize events.
+  // Returns false if the allocation fails, true otherwise.
+  reset();
+  start = (OutOfOrderCoreEvent*)ptl_mm_alloc_private_pages(bufsize * sizeof(OutOfOrderCoreEvent));
+  if unlikely (!start) return false;
+  tail = start;
+  end = start + bufsize;
+  for (OutOfOrderCoreEvent* slot = start; slot != end; slot++) slot->type = EVENT_INVALID; // print() skips EVENT_INVALID slots
+  return true;
+}
+
+void EventLog::reset() {
+  // Frees the ring buffer, if one is allocated, and clears all three pointers.
+  if (start) {
+    ptl_mm_free_private_pages(start, (end - start) * sizeof(OutOfOrderCoreEvent));
+    start = null;
+    end = null;
+    tail = null;
+  }
+}
+
+void EventLog::flush(bool only_to_tail) {
+  // Prints the buffered events to the attached stream and rewinds the tail.
+  // Does nothing unless log level 6 is active and a usable stream is attached.
+  if (!logable(6) || !logfile || !logfile->ok()) return;
+  print(*logfile, only_to_tail);
+  tail = start;
+}
+
+ostream& EventLog::print(ostream& os, bool only_to_tail) {
+  // Pull a stray tail pointer back to a buffer boundary before walking.
+  if (tail >= end) tail = start;
+  if (tail < start) tail = end;
+
+  // only_to_tail: visit the (tail - start) slots at the front of the buffer.
+  // Otherwise: visit every slot once, beginning at tail and wrapping around.
+  size_t remaining = only_to_tail ? (size_t)(tail - start) : (size_t)(end - start);
+  OutOfOrderCoreEvent* ev = only_to_tail ? start : tail;
+
+  // Cycle number of the last header printed; starts at the largest W64 value.
+  W64 last_cycle = limits<W64>::max;
+
+  for (; remaining > 0; remaining--, ev++) {
+    // Wrap the cursor so it always points inside [start, end).
+    if unlikely (ev >= end) ev = start;
+    if unlikely (ev < start) ev = end-1;
+    if unlikely (ev->type == EVENT_INVALID) continue; // unused slot: nothing to print
+
+    // Group events under a "Cycle N:" header whenever the cycle changes.
+    if unlikely (ev->cycle != last_cycle) {
+      last_cycle = ev->cycle;
+      os << "Cycle ", last_cycle, ":", endl;
+    }
+    ev->print(os);
+  }
+
+  return os;
+}
+
+ostream& OutOfOrderCoreEvent::print(ostream& os) const { // Writes this event as one text line (uuid, stage tag, details, newline) and returns os.
+  bool ld = isload(uop.opcode);
+  bool st = isstore(uop.opcode);
+  bool br = isbranch(uop.opcode);
+  W32 exception = LO32(commit.state.reg.rddata); // low half of rddata; used by the exception events below
+  W32 error_code = HI32(commit.state.reg.rddata); // high half of rddata
+
+  stringbuf uopname;
+  nameof(uopname, uop);
+
+  os << intstring(uuid, 20), " ";
+  switch (type) {
+    //
+    // Fetch Events
+    //
+  case EVENT_FETCH_STALLED:
+    os << "fetch  frontend stalled"; break;
+  case EVENT_FETCH_ICACHE_WAIT:
+    os << "fetch  rip ", rip, ": wait for icache fill"; break;
+  case EVENT_FETCH_FETCHQ_FULL:
+    os << "fetch  rip ", rip, ": fetchq full"; break;
+  case EVENT_FETCH_BOGUS_RIP:
+    os << "fetch  rip ", rip, ": bogus RIP or decode failed"; break;
+  case EVENT_FETCH_ICACHE_MISS:
+    os << "fetch  rip ", rip, ": wait for icache fill of phys ", (void*)(Waddr)((rip.mfnlo << 12) + lowbits(rip.rip, 12)), " on missbuf ", fetch.missbuf; break;
+  case EVENT_FETCH_SPLIT:
+    os << "fetch  rip ", rip, ": split unaligned load or store ", uop; break;
+  case EVENT_FETCH_ASSIST:
+    os << "fetch  rip ", rip, ": branch into assist microcode: ", uop; break;
+  case EVENT_FETCH_TRANSLATE:
+    os << "xlate  rip ", rip, ": BB ", fetch.bb, " of ", fetch.bb_uop_count, " uops"; break;
+  case EVENT_FETCH_OK: {
+    os << "fetch  rip ", rip, ": ", uop, 
+      " (BB ", fetch.bb, " uopid ", uop.bbindex;
+    if (uop.som) os << "; SOM";
+    if (uop.eom) os << "; EOM ", uop.bytes, " bytes";
+    os << ")";
+    if (uop.eom && fetch.predrip) os << " -> pred ", (void*)fetch.predrip;
+    break;
+  }
+    //
+    // Rename Events
+    //
+  case EVENT_RENAME_FETCHQ_EMPTY:
+    os << "rename fetchq empty"; break;
+  case EVENT_RENAME_ROB_FULL:
+    os << "rename ROB full"; break;
+  case EVENT_RENAME_PHYSREGS_FULL:
+    os << "rename physical register file full"; break;
+  case EVENT_RENAME_LDQ_FULL:
+    os << "rename load queue full"; break;
+  case EVENT_RENAME_STQ_FULL:
+    os << "rename store queue full"; break;
+  case EVENT_RENAME_MEMQ_FULL:
+    os << "rename memory queue full"; break;
+  case EVENT_RENAME_OK: {
+    os << "rename rob ", intstring(rob, -3), "(",padstring(uopname,-5),")"," r", intstring(physreg, -3), "@", phys_reg_file_names[rfid];
+    if (ld|st) os << " lsq", lsq;
+    os << " = ";
+    foreach (i, MAX_OPERANDS) os << rename.opinfo[i], ((i < MAX_OPERANDS-1) ? " " : "");
+    os << "; renamed";
+    os << " ", arch_reg_names[uop.rd], " (old r", rename.oldphys, ")";
+    if unlikely (!uop.nouserflags) {
+      if likely (uop.setflags & SETFLAG_ZF) os << " zf (old r", rename.oldzf, ")";
+      if likely (uop.setflags & SETFLAG_CF) os << " cf (old r", rename.oldcf, ")";
+      if likely (uop.setflags & SETFLAG_OF) os << " of (old r", rename.oldof, ")";
+    }
+    break;
+  }
+  case EVENT_FRONTEND:
+    os << "front  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " frontend stage ", (FRONTEND_STAGES - frontend.cycles_left), " of ", FRONTEND_STAGES;
+    break;
+  case EVENT_CLUSTER_NO_CLUSTER:
+  case EVENT_CLUSTER_OK: {
+    os << "clustr rob ", intstring(rob, -3), "(",padstring(uopname,-5),")"," allowed FUs = ", 
+      bitstring(fuinfo[uop.opcode].fu, FU_COUNT, true), " -> clusters ",
+      bitstring(select_cluster.allowed_clusters, MAX_CLUSTERS, true), " avail";
+    foreach (i, MAX_CLUSTERS) os << " ", select_cluster.iq_avail[i];
+    os << "-> ";
+    if (type == EVENT_CLUSTER_OK) os << "cluster ", clusters[cluster].name; else os << "none"; // the arrow was already printed above
+    break;
+  }
+  case EVENT_DISPATCH_NO_CLUSTER:
+  case EVENT_DISPATCH_OK: {
+    os << "disptc rob ", intstring(rob, -3), "(",padstring(uopname,-5),")"," operands ";
+    foreach (i, MAX_OPERANDS) os << dispatch.opinfo[i], ((i < MAX_OPERANDS-1) ? " " : "");
+    if (type == EVENT_DISPATCH_OK) os << " -> cluster ", clusters[cluster].name; else os << " -> none";
+    break;
+  }
+  case EVENT_ISSUE_NO_FU: {
+    os << "issue  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")";
+    os << " no FUs available in cluster ", clusters[cluster].name, ": ", // leading space separates the message from the uop name
+      "fu_avail = ", bitstring(issue.fu_avail, FU_COUNT, true), ", ",
+      "op_fu = ", bitstring(fuinfo[uop.opcode].fu, FU_COUNT, true), ", "
+      "fu_cl_mask = ", bitstring(clusters[cluster].fu_mask, FU_COUNT, true);
+    break;
+  }
+  case EVENT_ISSUE_OK: {
+    stringbuf sb;
+    sb << "issue  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")";
+    sb << " on ", padstring(fu_names[fu], -4), " in ", padstring(cluster_names[cluster], -4), ": r", intstring(physreg, -3), "@", phys_reg_file_names[rfid];
+    sb << " "; print_value_and_flags(sb, issue.state.reg.rddata, issue.state.reg.rdflags); sb << " =";
+    sb << " "; print_value_and_flags(sb, issue.operand_data[RA], issue.operand_flags[RA]); sb << ", ";
+    sb << " "; print_value_and_flags(sb, issue.operand_data[RB], issue.operand_flags[RB]); sb << ", ";
+    sb << " "; print_value_and_flags(sb, issue.operand_data[RC], issue.operand_flags[RC]);
+    sb << " (", issue.cycles_left, " cycles left)";
+    if (issue.mispredicted) sb << "; mispredicted (real ", (void*)(Waddr)issue.state.reg.rddata, " vs expected ", (void*)(Waddr)issue.predrip, ")";
+    os << sb;
+    break;
+  }
+  case EVENT_REPLAY: {
+    os << "replay rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " r", intstring(physreg, -3), "@", phys_reg_file_names[rfid],
+      " on cluster ", clusters[cluster].name, ": waiting on";
+    foreach (i, MAX_OPERANDS) {
+      if (!bit(replay.ready, i)) os << " ", replay.opinfo[i];
+    }
+    break;
+  }
+  case EVENT_STORE_WAIT: {
+    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+    os << "wait on ";
+    if (!loadstore.rcready) os << " rc";
+    if (loadstore.inherit_sfr_used) {
+      os << ((loadstore.rcready) ? "" : " and "), loadstore.inherit_sfr,
+        " (uuid ", loadstore.inherit_sfr_uuid, ", stq ", loadstore.inherit_sfr_lsq,
+        ", rob ", loadstore.inherit_sfr_rob, ", r", loadstore.inherit_sfr_physreg, ")";
+    }
+    break;
+  }
+  case EVENT_STORE_PARALLEL_FORWARDING_MATCH: {
+    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+    os << "ignored parallel forwarding match with ldq ", loadstore.inherit_sfr_lsq,
+      " (uuid ", loadstore.inherit_sfr_uuid, " rob", loadstore.inherit_sfr_rob,
+      " r", loadstore.inherit_sfr_physreg, ")";
+    break;
+  }
+  case EVENT_STORE_ALIASED_LOAD: {
+    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+    os << "aliased with ldbuf ", loadstore.inherit_sfr_lsq, " (uuid ", loadstore.inherit_sfr_uuid,
+      " rob", loadstore.inherit_sfr_rob, " r", loadstore.inherit_sfr_physreg, ");",
+      " (add colliding load rip ", (void*)(Waddr)loadstore.inherit_sfr_rip, "; replay from rip ", rip, ")";
+    break;
+  }
+  case EVENT_STORE_ISSUED: {
+    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+    if (loadstore.inherit_sfr_used) {
+      os << "inherit from ", loadstore.inherit_sfr, " (uuid ", loadstore.inherit_sfr_uuid,
+        ", rob", loadstore.inherit_sfr_rob, ", lsq ", loadstore.inherit_sfr_lsq,
+        ", r", loadstore.inherit_sfr_physreg, ");";
+    }
+    os << " <= ", hexstring(loadstore.data_to_store, 8*(1<<uop.size)), " = ", loadstore.sfr;
+    break;
+  }
+  case EVENT_STORE_LOCK_RELEASED: {
+    os << "lk-rel", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
+      "lock released (original ld.acq uuid ", loadstore.locking_uuid, " rob ", loadstore.locking_rob, " on vcpu ", loadstore.locking_vcpuid, ")";
+    break;
+  }
+  case EVENT_STORE_LOCK_ANNULLED: {
+    os << "lk-anl", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
+      "lock annulled (original ld.acq uuid ", loadstore.locking_uuid, " rob ", loadstore.locking_rob, " on vcpu ", loadstore.locking_vcpuid, ")";
+    break;
+  }
+  case EVENT_STORE_LOCK_REPLAY: {
+    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
+      "replay because vcpuid ", loadstore.locking_vcpuid, " uop uuid ", loadstore.locking_uuid, " has lock";
+    break;
+  }
+  case EVENT_LOAD_WAIT: {
+    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+    os << "wait on sfr ", loadstore.inherit_sfr,
+      " (uuid ", loadstore.inherit_sfr_uuid, ", stq ", loadstore.inherit_sfr_lsq,
+      ", rob ", loadstore.inherit_sfr_rob, ", r", loadstore.inherit_sfr_physreg, ")";
+    if (loadstore.predicted_alias) os << "; stalled by predicted aliasing";
+    break;
+  }
+  case EVENT_LOAD_HIT: 
+  case EVENT_LOAD_MISS: {
+    if (type == EVENT_LOAD_HIT)
+      os << (loadstore.load_store_second_phase ? "load2 " : "load  ");
+    else os << (loadstore.load_store_second_phase ? "ldmis2" : "ldmiss");
+
+    os << " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+    if (loadstore.inherit_sfr_used) {
+      os << "inherit from ", loadstore.inherit_sfr, " (uuid ", loadstore.inherit_sfr_uuid,
+        ", rob", loadstore.inherit_sfr_rob, ", lsq ", loadstore.inherit_sfr_lsq,
+        ", r", loadstore.inherit_sfr_physreg, "); ";
+    }
+    if (type == EVENT_LOAD_HIT)
+      os << "hit L1: value 0x", hexstring(loadstore.sfr.data, 64);
+    else os << "missed L1 (lfrqslot ", lfrqslot, ") [value would be 0x", hexstring(loadstore.sfr.data, 64), "]";
+    break;
+  }
+  case EVENT_LOAD_LOCK_REPLAY: {
+    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
+      "replay because vcpuid ", loadstore.locking_vcpuid, " uop uuid ", loadstore.locking_uuid, " has lock";
+    break;
+  }
+  case EVENT_LOAD_LOCK_OVERFLOW: {
+    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
+      "replay because locking required but no free interlock buffers"; // the newline is written once, after the switch
+    break;
+  }
+  case EVENT_LOAD_LOCK_ACQUIRED: {
+    os << "lk-acq", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
+      "lock acquired";
+    break;
+  }
+  case EVENT_LOAD_LFRQ_FULL:
+    os << "load   rob ", intstring(rob, -3), " ldq ", lsq, " r", intstring(physreg, -3), "(",padstring(uopname,-5),")", ": LFRQ or miss buffer full; replaying"; break;
+  case EVENT_LOAD_HIGH_ANNULLED: {
+    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
+    os << "load was annulled (high unaligned load)";
+    break;
+  }
+  case EVENT_LOAD_WAKEUP:
+    os << "ldwake rob ", intstring(rob, -3), " ldq ", lsq, " r", intstring(physreg, -3), "(",padstring(uopname,-5),")", ": wakeup load via lfrq slot ", lfrqslot; break;
+  case EVENT_LOAD_EXCEPTION: {
+    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, // "ldq", as in every other load event
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, ": exception ", exception_name(exception), ", pfec ", PageFaultErrorCode(error_code);
+    break;
+  }
+  case EVENT_STORE_EXCEPTION: {
+    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
+      " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
+      (void*)(Waddr)loadstore.virtaddr, ": exception ", exception_name(exception), ", pfec ", PageFaultErrorCode(error_code);
+    break;
+  }
+  case EVENT_ALIGNMENT_FIXUP:
+    os << "algnfx", " rip ", rip, ": set unaligned bit for uop ", uop.bbindex, " and refetch"; break;
+  case EVENT_ANNUL_NO_FUTURE_UOPS:
+    os << "misspc rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": SOM rob ", annul.somidx, ", EOM rob ", annul.eomidx, ": no future uops to annul"; break;
+  case EVENT_ANNUL_MISSPECULATION: {
+    os << "misspc rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": SOM rob ", annul.somidx, 
+      ", EOM rob ", annul.eomidx, ": annul from rob ", annul.startidx, " to rob ", annul.endidx;
+    break;
+  }
+  case EVENT_ANNUL_EACH_ROB: {
+    os << "annul  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": annul rip ", rip;
+    os << (uop.som ? " SOM" : "    "); os << (uop.eom ? " EOM" : "    ");
+    os << ": free";
+    os << " r", physreg;
+    if (ld|st) os << " lsq", lsq;
+    if (lfrqslot >= 0) os << " lfrq", lfrqslot;
+    if (annul.annulras) os << " ras";
+    os << " bb ", annul.bb, " (", annul.bb->refcount, " refs)"; // NOTE(review): dereferences annul.bb at print time; the other annul events print annul.bb_refcount instead - confirm the block is still valid here
+    break;
+  }
+  case EVENT_ANNUL_PSEUDOCOMMIT: {
+    os << "pseucm rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": r", physreg, " rebuild rrt:";
+    os << " arch ", arch_reg_names[uop.rd];
+    if likely (!uop.nouserflags) {
+      if (uop.setflags & SETFLAG_ZF) os << " zf";
+      if (uop.setflags & SETFLAG_CF) os << " cf";
+      if (uop.setflags & SETFLAG_OF) os << " of";
+    }
+    os << " = r", physreg;
+    break;
+  }
+  case EVENT_ANNUL_FETCHQ_RAS:
+    os << "anlras rip ", rip, ": annul RAS update still in fetchq"; break;
+  case EVENT_ANNUL_FETCHQ:
+    os << "anlbbc rip ", rip, ": annul bb ", annul.bb, " (", annul.bb_refcount, " refs)"; break;
+  case EVENT_ANNUL_FLUSH:
+    os << "flush  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " rip ", rip, " bb ", annul.bb, " (", annul.bb_refcount, " refs)"; break;
+  case EVENT_REDISPATCH_DEPENDENTS:
+    os << "redisp rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " find all dependents"; break;
+  case EVENT_REDISPATCH_DEPENDENTS_DONE:
+    os << "redisp rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " redispatched ", (redispatch.count - 1), " dependent uops"; break;
+  case EVENT_REDISPATCH_EACH_ROB: {
+    os << "redisp rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " from state ", redispatch.current_state_list->name, ": dep on ";
+    if (!redispatch.dependent_operands) {
+      os << " [self]";
+    } else {
+      foreach (i, MAX_OPERANDS) {
+        if (bit(redispatch.dependent_operands, i)) os << " ", redispatch.opinfo[i];
+      }
+    }
+
+    os << "; redispatch ";
+    os << " [rob ", rob, "]";
+    os << " [physreg ", physreg, "]";
+    if (ld|st) os << " [lsq ", lsq, "]";
+    if (redispatch.iqslot) os << " [iqslot]";
+    if (lfrqslot >= 0) os << " [lfrqslot ", lfrqslot, "]";
+    if (redispatch.opinfo[RS].physreg != PHYS_REG_NULL) os << " [inheritsfr ", redispatch.opinfo[RS], "]";
+
+    break;
+  }
+  case EVENT_COMPLETE:
+    os << "complt rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " on ", padstring(fu_names[fu], -4), ": r", intstring(physreg, -3); break;
+  case EVENT_FORWARD: {
+    os << "forwd", forwarding.forward_cycle, " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", 
+      " (", clusters[cluster].name, ") r", intstring(physreg, -3), 
+      " => ", "uuid ", forwarding.target_uuid, " rob ", forwarding.target_rob,
+      " (", clusters[forwarding.target_cluster].name, ") r", forwarding.target_physreg,
+      " operand ", forwarding.operand;
+    if (forwarding.target_st) os << " => st", forwarding.target_lsq;
+    os << " [still waiting?";
+    foreach (i, MAX_OPERANDS) { if (!bit(forwarding.target_operands_ready, i)) os << " r", (char)('a' + i); }
+    if (forwarding.target_all_operands_ready) os << " READY";
+    os << "]";
+    break;
+  }
+  case EVENT_BROADCAST: {
+    os << "brcst", forwarding.forward_cycle, " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", 
+      " from cluster ", clusters[cluster].name, " to cluster ", clusters[forwarding.target_cluster].name,
+      " on forwarding cycle ", forwarding.forward_cycle;
+    break;
+  }
+  case EVENT_WRITEBACK: {
+    os << "write  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " (cluster ", clusters[cluster].name, ") r", intstring(physreg, -3), "@", phys_reg_file_names[rfid], " = 0x", hexstring(writeback.data, 64), " ", flagstring(writeback.flags);
+    if (writeback.transient) os << " (transient)";
+    os << " (", writeback.consumer_count, " consumers";
+    if (writeback.all_consumers_sourced_from_bypass) os << ", all from bypass";
+    if (writeback.no_branches_between_renamings) os << ", no intervening branches";
+    if (writeback.dest_renamed_before_writeback) os << ", dest renamed before writeback";
+    os << ")";
+    break;
+  }
+  case EVENT_COMMIT_EXCEPTION_DETECTED:
+    os << "detect rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " exception ", exception_name(exception), " (", exception, "), error code ", hexstring(error_code, 16), ", origvirt ", (void*)(Waddr)commit.origvirt; break;
+  case EVENT_COMMIT_EXCEPTION_ACKNOWLEDGED:
+    os << "except rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " exception ", exception_name(exception), " [EOM #", commit.total_user_insns_committed, "]"; break;
+  case EVENT_COMMIT_SKIPBLOCK:
+    os << "skipbk rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " skip block: advance rip by ", uop.bytes, " to ", (void*)(Waddr)(rip.rip + uop.bytes), " [EOM #", commit.total_user_insns_committed, "]"; break;
+  case EVENT_COMMIT_SMC_DETECTED:
+    os << "smcdet rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": self-modifying code at rip ", rip, " detected (mfn was dirty); invalidate and retry [EOM #", commit.total_user_insns_committed, "]"; break;
+  case EVENT_COMMIT_OK: {
+    os << "commit rob ", intstring(rob, -3), "(",padstring(uopname,-5),")";
+    if likely (archdest_can_commit[uop.rd])
+                os << " [rrt ", arch_reg_names[uop.rd], " = r", physreg, " 0x", hexstring(commit.state.reg.rddata, 64), "]";
+
+    if ((!uop.nouserflags) && uop.setflags) {
+      os << " [flags ", ((uop.setflags & SETFLAG_ZF) ? "z" : ""), 
+        ((uop.setflags & SETFLAG_CF) ? "c" : ""), ((uop.setflags & SETFLAG_OF) ? "o" : ""),
+        " -> ", flagstring(commit.state.reg.rdflags), "]";
+    }
+
+    if (uop.eom) os << " [rip = ", (void*)(Waddr)commit.target_rip, "]";
+
+    if unlikely (st && commit.state.st.bytemask)
+                  os << " [mem ", (void*)(Waddr)(commit.state.st.physaddr << 3), " = ", bytemaskstring((const byte*)&commit.state.st.data, commit.state.st.bytemask, 8), "]";
+
+    if unlikely (commit.pteupdate.a | commit.pteupdate.d | commit.pteupdate.ptwrite) {
+      os << " [pte:";
+      if (commit.pteupdate.a) os << " a";
+      if (commit.pteupdate.d) os << " d";
+      if (commit.pteupdate.ptwrite) os << " w";
+      os << "]";
+    }
+        
+    if unlikely (ld|st) {
+      os << " [lsq ", lsq, "]";
+    }
+        
+    if likely (commit.oldphysreg > 0) {
+      if unlikely (commit.oldphysreg_refcount) {
+        os << " [pending free old r", commit.oldphysreg, " ref by";
+        os << " refcount ", commit.oldphysreg_refcount;
+        os << "]";
+      } else {
+        os << " [free old r", commit.oldphysreg, "]";
+      }
+    }
+
+    os << " [commit r", physreg, "]";
+
+    foreach (i, MAX_OPERANDS) {
+      if unlikely (commit.operand_physregs[i] != PHYS_REG_NULL) os << " [unref r", commit.operand_physregs[i], "]";
+    }
+
+    if unlikely (br) {
+      os << " [brupdate", (commit.taken ? " tk" : " nt"), (commit.predtaken ? " pt" : " np"), ((commit.taken == commit.predtaken) ? " ok" : " MP"), "]";
+    }
+        
+    os << " [bb ", commit.bb, ", ", commit.bb_refcount, " refs]";    
+    if (uop.eom) os << " [EOM #", commit.total_user_insns_committed, "]";
+    break;
+  }
+  case EVENT_COMMIT_ASSIST: {
+    os << "assist rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " calling assist ", (void*)rip.rip, " (#",
+      assist_index((assist_func_t)rip.rip), ": ", assist_name((assist_func_t)rip.rip), ")";
+    break;
+  }
+  case EVENT_RECLAIM_PHYSREG:
+    os << "free   r", physreg, " no longer referenced; moving to free state"; break;
+  default:
+    os << "?????? unknown event type ", type;
+    break;
+  }
+
+  os << endl; // the single line terminator shared by every event type
+  return os;
+}
+
+OutOfOrderMachine::OutOfOrderMachine(const char* name) {
+  // Register this machine under the given name in the list of available core types
+  addmachine(name, this);
+}
+
+//
+// Construct all the structures necessary to configure
+// the cores. This function is only called once, after
+// all other PTLsim subsystems are brought up.
+//
+bool OutOfOrderMachine::init(PTLsimConfig& config) {
+  // One core is built per context. In a multi-processor model, config may
+  // specify various ways of slicing contextcount up into threads, cores and
+  // sockets; the appropriate interconnect and cache hierarchy parameters
+  // may be specified here.
+  foreach (i, contextcount) {
+    OutOfOrderCore* core = new OutOfOrderCore(i, contextof(i), *this);
+    cores[i] = core;
+    core->init();
+  }
+
+  // Machine-wide setup, performed once every core has been constructed.
+  init_luts();
+  init_random(true);
+
+  // config is not consulted here; this function always reports success.
+  return true;
+}
+
+//
+// Run the processor model, until a stopping point
+// is hit (as configured elsewhere in config).
+//
+int OutOfOrderMachine::run(PTLsimConfig& config) {
+  time_this_scope(cttotal); // presumably accumulates the time spent in this call into cttotal (reported as total_time by update_stats)
+
+  logfile << "Starting out-of-order core toplevel loop", endl, flush;
+
+  foreach (i, contextcount) {
+    OutOfOrderCore& core =* cores[i];
+    Context& ctx = contextof(i);
+    // Create the event ring buffer only if event logging is enabled and this core has none yet.
+    if unlikely (config.event_log_enabled && (!core.eventlog.start)) {
+      core.eventlog.init(config.event_log_ring_buffer_size);
+      core.eventlog.logfile = &logfile;
+    }
+
+    core.flush_pipeline();
+
+    if (logable(6)) {
+      logfile << "VCPU ", i, " initial state:", endl;
+      logfile << ctx;
+      logfile << endl;
+    }
+  }
+
+  bool exiting = false;
+  // Main loop: each pass advances sim_cycle and iterations by one.
+  for (;;) {
+    if unlikely (iterations >= config.start_log_at_iteration) { // switch logging on once the configured iteration is reached
+      if unlikely (!logenable) logfile << "Start logging at level ", config.loglevel, " in cycle ", iterations, endl, flush;
+      logenable = 1;
+    }
+
+    update_progress();
+    inject_events();
+
+    foreach (i, contextcount) {
+      OutOfOrderCore& core =* cores[i];
+      Context& ctx = contextof(i);
+      // Hypervisor builds: a context that is not running executes no cycle; it is only checked for pending events.
+#ifdef PTLSIM_HYPERVISOR
+      if unlikely (!ctx.running) {
+        if (ctx.check_events()) core.handle_interrupt();
+        continue;
+      }
+#endif
+      exiting |= core.runcycle(); // an exit request from one core does not stop the remaining cores this cycle
+    }
+
+    exiting |= check_for_async_sim_break();
+    // The counters advance on the exiting iteration as well; the loop is left only afterwards.
+    stats.summary.cycles++;
+    stats.ooocore.cycles++;
+    sim_cycle++;
+    iterations++;
+
+    if unlikely (exiting) break;
+  }
+
+  logfile << "Exiting out of order mode at ", total_user_insns_committed, " commits, ", total_uops_committed, " uops and ", iterations, " iterations (cycles)", endl;
+
+  foreach (i, contextcount) {
+    OutOfOrderCore& core =* cores[i];
+    Context& ctx = contextof(i);
+
+    core.core_to_external_state();
+    // Dump at log level 6, when dump_state_now is set, or when this core's last commit is more than 1024 cycles old.
+    if (logable(6) | ((sim_cycle - core.last_commit_at_cycle) > 1024) | config.dump_state_now) {
+      logfile << "VCPU ", i, " core state at end:", endl;
+      logfile << ctx;
+      core.dump_ooo_state(logfile);
+      if (config.event_log_enabled) core.eventlog.print(logfile);
+    }
+  }
+
+  config.dump_state_now = 0;
+
+  // Flush everything to remove any remaining refs to basic blocks
+  flush_all_pipelines();
+
+  return exiting; // the loop above is only left once exiting is set, so this is always nonzero
+}
+
+void OutOfOrderMachine::dump_state(ostream& os) { // Writes every existing core's state, then the memory interlock buffer, to os.
+  foreach (i, contextcount) {
+    if (!cores[i]) continue; // skip slots that hold no core
+    OutOfOrderCore& core =* cores[i];
+    // Everything, including the optional event log, goes to the caller's stream.
+    os << "Core ", i, ":", endl;
+    if unlikely (config.event_log_enabled) core.eventlog.print(os);
+    core.dump_ooo_state(os);
+  }
+  os << "Memory interlock buffer:", endl;
+  interlocks.print(os);
+}
+
+namespace ASFOutOfOrderModel { // Definitions of the timers whose totals update_stats() reports.
+  CycleTimer cttotal, // passed to time_this_scope() in OutOfOrderMachine::run()
+    ctfetch,
+    ctdecode,
+    ctrename,
+    ctfrontend,
+    ctdispatch,
+    ctissue,
+    ctissueload,
+    ctissuestore,
+    ctcomplete,
+    cttransfer,
+    ctwriteback,
+    ctcommit;
+};
+
+void OutOfOrderMachine::update_stats(PTLsimStats& stats) {
+  const double cycles = (double)stats.ooocore.cycles; // common denominator of the three per-cycle rates
+  stats.ooocore.commit.ipc = (double)stats.ooocore.commit.insns / cycles;
+  stats.ooocore.commit.uipc = (double)stats.ooocore.commit.uops / cycles;
+  stats.ooocore.issue.uipc = (double)stats.ooocore.issue.uops / cycles;
+  stats.ooocore.simulator.total_time = cttotal.seconds(); // the remaining lines copy the timers' totals in seconds
+  stats.ooocore.simulator.cputime.commit = ctcommit.seconds();
+  stats.ooocore.simulator.cputime.writeback = ctwriteback.seconds();
+  stats.ooocore.simulator.cputime.transfer = cttransfer.seconds();
+  stats.ooocore.simulator.cputime.complete = ctcomplete.seconds();
+  stats.ooocore.simulator.cputime.issuestore = ctissuestore.seconds();
+  stats.ooocore.simulator.cputime.issueload = ctissueload.seconds();
+  // The issue figure is reported without the load and store portions, which have their own entries above.
+  stats.ooocore.simulator.cputime.issue = ctissue.seconds() - (ctissueload.seconds() + ctissuestore.seconds());
+  stats.ooocore.simulator.cputime.dispatch = ctdispatch.seconds();
+  stats.ooocore.simulator.cputime.frontend = ctfrontend.seconds();
+  stats.ooocore.simulator.cputime.rename = ctrename.seconds();
+  stats.ooocore.simulator.cputime.decode = ctdecode.seconds();
+  stats.ooocore.simulator.cputime.fetch = ctfetch.seconds();
+}
+
+//
+// Flush the pipeline of every allocated core, then have every
+// core process its pending BB cache invalidates.
+//
+// Typically this is in response to some infrequent event
+// like cross-modifying SMC or cache coherence deadlocks.
+//
+void OutOfOrderMachine::flush_all_pipelines() {
+  // First pass: flush every pipeline. This pass completes for all
+  // cores before any invalidation is processed in the second pass.
+  foreach (coreid, contextcount) {
+    if (cores[coreid]) (*cores[coreid]).flush_pipeline();
+  }
+
+  // Second pass: invalidate_smc() on every core. Slots without a
+  // core object are skipped in both passes.
+  foreach (coreid, contextcount) {
+    if (cores[coreid]) (*cores[coreid]).invalidate_smc();
+  }
+}
+
+/**
+ * Adds an address to the locked-line buffer (LLB). The line is only claimed here;
+ * its data is fetched during the ACQUIRE instruction, so that the original
+ * contents can be restored if the transaction aborts.
+ * @param addr Physical address to track. @return The LLB line selected for it, refcount incremented.
+ */
+LLBLine* LockedLineBuffer::add_location(Waddr addr) {
+  /* Touch the line now, fill it later */
+  Waddr linebase = floor(addr, CacheSubsystem::L1_LINE_SIZE);
+  LLBLine* entry = select(linebase);
+  entry->refcount += 1;
+  num_locations += 1;
+  cerr << __FILE__,__LINE__,": Adding location ",hexstring(addr,64), " locations: ",num_locations," line: ",entry," refcount: ",entry->refcount ,endl,flush;
+  return entry;
+}
+
+/**
+ * Drops every address and its undo data from the LLB; does nothing when the LLB is already empty.
+ */
+void LockedLineBuffer::clear() {
+  if unlikely (!empty()) {
+    cerr <<__FILE__,__LINE__,": Clearing the LLB! Locations: ",num_locations, endl, flush;
+    num_locations = 0; lasterr = 0;
+    reset();
+  }
+}
+
+/**
+ * Saves the current memory contents behind every valid LLB tag into that line's orig_data.
+ */
+void LockedLineBuffer::snapshot() {
+  int refs_seen = 0; /* sum of the refcounts of all valid lines */
+  for (int slot = 0; slot < ASF_MAX_LINES; ++slot) {
+    if unlikely (tags[slot] == tags.INVALID) continue;
+    /* Fetch the cacheline from the given address */
+    cerr << __FILE__,__LINE__,": Fetching LLB line ", slot, " from address ", hexstring(tags[slot],64),endl, flush;
+    cerr << __FILE__,__LINE__,": Copying ", sizeof(data[slot].orig_data), " bytes from ", phys_to_mapped_virt(tags[slot]), " to ", data[slot].orig_data, endl, flush;
+    memcpy(data[slot].orig_data, phys_to_mapped_virt(tags[slot]), sizeof(data[slot].orig_data));
+    refs_seen += data[slot].refcount;
+  }
+  /* Every tracked location must be accounted for by exactly one line reference */
+  assert(refs_seen == num_locations);
+}
+
+/**
+ * Copies the saved orig_data of every valid LLB line back to the memory its tag
+ * maps to, undoing any changes made to those lines since the snapshot.
+ */
+void LockedLineBuffer::undo() {
+  /* The unused refcount tally of the earlier version was dead code and is gone. */
+  for (int i = 0; i < ASF_MAX_LINES; i++) {
+    if likely (tags[i] != tags.INVALID) {
+      /* Write the cacheline back to its position. */
+      cerr << __FILE__,__LINE__,":Restoring LLB line ", i, " at address ", hexstring(tags[i],64),endl, flush;
+      cerr << __FILE__,__LINE__,": Copying ", sizeof(data[i].orig_data), " bytes from ",  data[i].orig_data, " to ",phys_to_mapped_virt(tags[i]), endl, flush;
+      memcpy(phys_to_mapped_virt(tags[i]), data[i].orig_data, sizeof(data[i].orig_data));
+      /* Tags and refcounts are left untouched; only memory contents are restored. */
+    }
+  }
+}
+
+/**
+ * Notifies the LLB that a reference to one of its lines has been dropped.
+ * This can occur when a ROBEntry gets redispatched / annulled and thus must
+ * be removed from the LLB. A line whose last reference goes away is invalidated.
+ */
+void LockedLineBuffer::remove_ref(LLBLine* line) {
+  Waddr linetag = tagof(line);
+  /* A line that carries no valid tag is no longer part of the LLB */
+  if unlikely (linetag == tags.INVALID) {
+    cerr << __FILE__,__LINE__,": Line not in LLB anymore. Ignoring remove request!", endl, flush;
+    return;
+  }
+  assert(line->refcount > 0);
+  cerr << __FILE__,__LINE__,": Removing reference to line ",line," tag: ",tagof(line)," refcount: ", line->refcount, endl, flush;
+  /* Give up one reference, both on the line and in the global tally */
+  --line->refcount;
+  --num_locations;
+
+  /* Remove a line which does not belong to any valid instructions any longer! */
+  if (line->refcount == 0) {
+    cerr << __FILE__,__LINE__,": No more references to line ",line,". Removing it!", endl, flush;
+    invalidate_line(line);
+  }
+}
+
+/**
+ * Aborts the current critical section: the LLB is told to abort, the core leaves
+ * critical-section mode, and the given register is pointed back at the last acquire.
+ * @param errorcode The error code, which the re-executed acquire instruction
+ *                  should return.
+ * @param reg_nextrip The index of the register, which will contain the RIP of the next instruction.
+ */
+void OutOfOrderCore::asf_rollback_last_acq(W64 errorcode, int reg_nextrip) {
+  /* There has to be an acquire to return to */
+  assert(asf_failing_acquire);
+
+  /* Undo is delegated to the LLB; the fields below steer the re-executed acquire */
+  cerr << __FILE__,__LINE__,": Aborting critical section, jumping back to ", asf_failing_acquire, endl, flush;
+  locked_line_buffer.abort();
+  asf_stored_error      = errorcode;
+  asf_reissue_will_fail = true;
+  asf_in_crit_sec       = false;
+  ctx.commitarf[reg_nextrip] = asf_failing_acquire;
+}
+/**
+ * Polls the LLB for a consistency error. If one is reported, the running transaction is
+ * rolled back to its last acquire (optimistic mode) and the pipeline is flushed.
+ */
+void OutOfOrderCore::check_asf_conflicts() {
+  //cerr << __FILE__,__LINE__,": Checking for asynchronous conflicts with the current CS", endl, flush;
+  /* Zero means that no conflict was recorded */
+  const W64 conflict = locked_line_buffer.consistency_error();
+  if likely (conflict == 0) return;
+  cerr << __FILE__,__LINE__,": Error ", hexstring(conflict, 64), " found! Aborting the transaction!", endl, flush;
+  /* TODO: This is for optimistic mode just a plain -16, but for testing make this somewhat more useful!
+           We could actually also use some data from the LLB here. */
+  asf_rollback_last_acq(/*ctx.commitarf[REG_rip]*/conflict, REG_rip);
+  cerr << __FILE__,__LINE__,": Flushing the pipeline!", endl, flush;
+  flush_pipeline();
+}
+
+/**
+ * Per-cycle ASF hook: lets ASF react to the commit result just before the cycle ends and
+ * things such as exceptions are processed.
+ * @param commitrc Return code of the commit operation, used to tweak exception handling
+ *                 when inside ASF's critical sections.
+ * @return commitrc, except that COMMIT_RESULT_INTERRUPT inside a critical section is replaced by COMMIT_RESULT_OK.
+ */
+int OutOfOrderCore::asf_runcycle(int commitrc) {
+  /* Check for conflicts of the ongoing critical section */
+  if unlikely (asf_in_crit_sec) {
+    //TODO: Add proper interrupt deferral treatment!
+    if unlikely (commitrc == COMMIT_RESULT_INTERRUPT) {
+      /* Interrupts during ASF critical sections are just masked (for now)! */
+      cerr << __FILE__,":",__LINE__,"@",sim_cycle," Masking interrupt at rip ", (void*)(Waddr)ctx.commitarf[REG_rip], endl, flush;
+      commitrc = COMMIT_RESULT_OK;
+    } 
+
+    else if unlikely (commitrc == COMMIT_RESULT_EXCEPTION) {
+      /* Exceptions trigger, but as if they were caused by the ACQUIRE! */
+      if (ctx.exception != EXCEPTION_SkipBlock) {
+        cerr << __FILE__,":",__LINE__,"@",sim_cycle," Exception ", exception_name(ctx.exception),
+          " called from rip ", (void*)(Waddr)ctx.commitarf[REG_rip], " faking it at the last acquire!", endl, flush;
+        asf_rollback_last_acq(ctx.exception, REG_rip);
+      }
+    }
+
+    else if unlikely (commitrc == COMMIT_RESULT_BARRIER) {
+      /* Far control flow movements push the rip of the last Acquire on the stack! */
+      int assistid = ctx.commitarf[REG_rip]; // the committed REG_rip value is used as the assist id here
+      if (inrange(assistid, (int)ASSIST_INT, (int)ASSIST_IRET64))
+        cerr << __FILE__,":",__LINE__,"@",sim_cycle,
+          " Assist ", assist_names[assistid]," called from rip ", (void*)(Waddr)ctx.commitarf[REG_selfrip],
+          " faking it to return to old acquire!", endl, flush;
+      //NOTE: This is scary stuff!
+      if unlikely (assistid == ASSIST_INT) {
+        /* int just pushes the RIP of the next instruction onto the stack -> modify this! */
+        asf_rollback_last_acq(assistid, REG_nextrip);
+      } else if unlikely (assistid == ASSIST_SYSCALL) {
+        /* syscall moves the RIP of the next instruction into RCX, PTLsim has done that in
+           ucode already before this assist -> change RCX! */
+        asf_rollback_last_acq(assistid, REG_RCX);
+      } else {
+        /* Can't do much for all the other craziness! NOTE(review): assistid is not range-checked before indexing assist_names here. */
+        cerr << __FILE__,":",__LINE__,"@",sim_cycle,
+          " Unhandled assist ", assist_names[assistid], " encountered within an ASF critical section. This is pretty bad!!!", endl, flush;
+      }
+    }
+
+    if likely (commitrc == COMMIT_RESULT_OK) { // also taken after an interrupt was masked above
+      /* Just checking for normal interference from other cores.. */
+      check_asf_conflicts();
+      return commitrc;
+    }
+  } else {
+    if likely (commitrc == COMMIT_RESULT_OK) return commitrc;
+    if likely (locked_line_buffer.empty())   return commitrc; // nothing to clear
+
+    /* In case of any exception / far control transfer: clear the LLB!
+       Example: A lock prefetchw causes a page fault -> the subsequent acquire will fail! */
+    if unlikely (commitrc == COMMIT_RESULT_EXCEPTION) {
+      cerr << __FILE__,":",__LINE__,"@",sim_cycle," Exception ", exception_name(ctx.exception),
+        " called from rip ", (void*)(Waddr)ctx.commitarf[REG_rip],  endl, flush;
+
+      if (ctx.exception != EXCEPTION_SkipBlock) locked_line_buffer.clear();
+    }
+
+    else if unlikely (commitrc == COMMIT_RESULT_BARRIER) {
+      int assistid = ctx.commitarf[REG_rip]; // the committed REG_rip value is used as the assist id here
+      //NOTE: This relies on assist-id ordering!
+      //TODO: Add other ASF unfriendly assists here!
+      if (inrange(assistid, (int)ASSIST_INT, (int)ASSIST_IRET64)) {
+        cerr << __FILE__,":",__LINE__,"@",sim_cycle," Assist ", assist_names[assistid],
+          " called from rip ", (void*)(Waddr)ctx.commitarf[REG_selfrip],  endl, flush;
+        locked_line_buffer.clear();
+      }
+    }
+    else if unlikely (commitrc == COMMIT_RESULT_INTERRUPT) {
+      cerr << __FILE__,":",__LINE__,"@",sim_cycle," Interrupt at rip ", (void*)(Waddr)ctx.commitarf[REG_rip], endl, flush;
+      locked_line_buffer.clear();
+    }
+  }
+  return commitrc; // every path that did not return early ends up here
+}
+ASFOutOfOrderModel::OutOfOrderMachine asfooomodel("asfooo"); // global instance of this core model, constructed with the string "asfooo"
+// Returns the core stored at index coreid of the global model; cores[coreid] is dereferenced without a bounds or null check.
+OutOfOrderCore& ASFOutOfOrderModel::coreof(int coreid) {
+  return *asfooomodel.cores[coreid];
+}
diff -r 10448c053ad6 asfooocore.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/asfooocore.h	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,1805 @@
+// -*- c++ -*-
+//
+// PTLsim: Cycle Accurate x86-64 Simulator
+// Out-of-Order Core Configuration With Experimental AMD64 ASF Extension
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+// Copyright 2003-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
+//
+#include <random_inject.h>
+
+// With these disabled, simulation is faster (as shipped: ENABLE_CHECKS is off, ENABLE_LOGGING is on)
+//#define ENABLE_CHECKS
+#define ENABLE_LOGGING
+// NOTE(review): the guard below does not mention "asf" although this file is asfooocore.h; confirm no other header uses the same guard.
+#ifndef _OOOCORE_H_
+#define _OOOCORE_H_
+// Timing hooks: with ENABLE_SIM_TIMING they create a CycleTimerScope or call ct.start()/ct.stop(); otherwise they expand to (0).
+//#define ENABLE_SIM_TIMING
+#ifdef ENABLE_SIM_TIMING
+#define time_this_scope(ct) CycleTimerScope ctscope(ct)
+#define start_timer(ct) ct.start()
+#define stop_timer(ct) ct.stop()
+#else
+#define time_this_scope(ct) (0)
+#define start_timer(ct) (0)
+#define stop_timer(ct) (0)
+#endif
+
+namespace ASFOutOfOrderModel {
+  //
+  // Operand formats
+  //
+  static const int MAX_OPERANDS = 4; // number of operand slots: RA, RB, RC and RS below
+  static const int RA = 0;
+  static const int RB = 1;
+  static const int RC = 2;
+  static const int RS = 3;
+
+  //
+  // Uop to functional unit mappings
+  //
+  static const int FU_COUNT = 8;     // number of FU_* mask bits; sizes fu_names[]
+  static const int LOADLAT = 2;      // used as the latency value L in the fuinfo table
+
+  enum { // one mask bit per functional unit; bit i corresponds to fu_names[i]
+    FU_LDU0       = 0x01,
+    FU_STU0       = 0x02,
+    FU_LDU1       = 0x04,
+    FU_STU1       = 0x08,
+    FU_ALU0       = 0x10,
+    FU_FPU0       = 0x20,
+    FU_ALU1       = 0x40,
+    FU_FPU1       = 0x80,
+  };
+  // Matches the two FU_LDU* bits above
+  static const int LOAD_FU_COUNT = 2;
+
+  static const char* fu_names[FU_COUNT] = { // static: this header must not export a definition from every includer
+    "ldu0", // entry i names the unit whose mask bit is (1 << i)
+    "stu0",
+    "ldu1",
+    "stu1",
+    "alu0",
+    "fpu0",
+    "alu1",
+    "fpu1",
+  };
+
+  //
+  // Opcodes and properties (the short macros below serve the fuinfo table and are #undef'd after it)
+  //
+#define ALU0 FU_ALU0
+#define ALU1 FU_ALU1
+#define STU0 FU_STU0
+#define STU1 FU_STU1
+#define LDU0 FU_LDU0
+#define LDU1 FU_LDU1
+#define FPU0 FU_FPU0
+#define FPU1 FU_FPU1
+#define A 1 // ALU latency, assuming fast bypass
+#define L LOADLAT
+
+#define ANYALU ALU0|ALU1
+#define ANYLDU LDU0|LDU1
+#define ANYSTU STU0|STU1
+#define ANYFPU FPU0|FPU1
+#define ANYINT ANYALU|ANYSTU|ANYLDU // expansions are unparenthesized: only combine these masks with |
+
+  struct FunctionalUnitInfo { // one row of the fuinfo table
+    byte opcode;   // Must match definition in ptlhwdef.h and ptlhwdef.cpp!
+    byte latency;  // Latency in cycles, assuming ideal bypass
+    W16  fu;       // Bitmask of FU_* units on which this uop can issue
+  };
+
+  //
+  // WARNING: This table MUST be kept in sync with the table
+  // in ptlhwdef.cpp and the uop enum in ptlhwdef.h!
+  //
+  const FunctionalUnitInfo fuinfo[OP_MAX_OPCODE] = {
+    // opcode, latency, fumask
+    {OP_nop,            A, ANYINT|ANYFPU},
+    {OP_mov,            A, ANYINT|ANYFPU},
+    // Logical
+    {OP_and,            A, ANYINT|ANYFPU},
+    {OP_andnot,         A, ANYINT|ANYFPU},
+    {OP_xor,            A, ANYINT|ANYFPU},
+    {OP_or,             A, ANYINT|ANYFPU},
+    {OP_nand,           A, ANYINT|ANYFPU},
+    {OP_ornot,          A, ANYINT|ANYFPU},
+    {OP_eqv,            A, ANYINT|ANYFPU},
+    {OP_nor,            A, ANYINT|ANYFPU},
+    // Mask, insert or extract bytes
+    {OP_maskb,          A, ANYINT},
+    // Add and subtract
+    {OP_add,            A, ANYINT},
+    {OP_sub,            A, ANYINT},
+    {OP_adda,           A, ANYINT},
+    {OP_suba,           A, ANYINT},
+    {OP_addm,           A, ANYINT},
+    {OP_subm,           A, ANYINT},
+    // Condition code logical ops
+    {OP_andcc,          A, ANYINT},
+    {OP_orcc,           A, ANYINT},
+    {OP_xorcc,          A, ANYINT},
+    {OP_ornotcc,        A, ANYINT},
+    // Condition code movement and merging
+    {OP_movccr,         A, ANYINT},
+    {OP_movrcc,         A, ANYINT},
+    {OP_collcc,         A, ANYINT},
+    // Simple shifting (restricted to small immediate 1..8)
+    {OP_shls,           A, ANYINT},
+    {OP_shrs,           A, ANYINT},
+    {OP_bswap,          A, ANYINT},
+    {OP_sars,           A, ANYINT},
+    // Bit testing
+    {OP_bt,             A, ANYALU},
+    {OP_bts,            A, ANYALU},
+    {OP_btr,            A, ANYALU},
+    {OP_btc,            A, ANYALU},
+    // Set and select
+    {OP_set,            A, ANYINT},
+    {OP_set_sub,        A, ANYINT},
+    {OP_set_and,        A, ANYINT},
+    {OP_sel,            A, ANYINT},
+    // Branches
+    {OP_br,             A, ANYINT},
+    {OP_br_sub,         A, ANYINT},
+    {OP_br_and,         A, ANYINT},
+    {OP_jmp,            A, ANYINT},
+    {OP_bru,            A, ANYINT},
+    {OP_jmpp,           A, ANYALU|ANYLDU},
+    {OP_brp,            A, ANYALU|ANYLDU},
+    // Checks
+    {OP_chk,            A, ANYINT},
+    {OP_chk_sub,        A, ANYINT},
+    {OP_chk_and,        A, ANYINT},
+    // Loads and stores
+    {OP_ld,             L, ANYLDU},
+    {OP_ldx,            L, ANYLDU},
+    {OP_ld_pre,         1, ANYLDU},
+    {OP_st,             1, ANYSTU},
+    {OP_mf,             1, STU0  },
+    // Shifts, rotates and complex masking
+    {OP_shl,            A, ANYALU},
+    {OP_shr,            A, ANYALU},
+    {OP_mask,           A, ANYALU},
+    {OP_sar,            A, ANYALU},
+    {OP_rotl,           A, ANYALU},  
+    {OP_rotr,           A, ANYALU},   
+    {OP_rotcl,          A, ANYALU},
+    {OP_rotcr,          A, ANYALU},  
+    // Multiplication
+    {OP_mull,           4, ANYFPU},
+    {OP_mulh,           4, ANYFPU},
+    {OP_mulhu,          4, ANYFPU},
+    // Bit scans
+    {OP_ctz,            3, ANYFPU},
+    {OP_clz,            3, ANYFPU},
+    {OP_ctpop,          3, ANYFPU},  
+    {OP_permb,          4, ANYFPU},
+    // Floating point
+    // uop.size bits have following meaning:
+    // 00 = single precision, scalar (preserve high 32 bits of ra)
+    // 01 = single precision, packed (two 32-bit floats)
+    // 1x = double precision, scalar or packed (use two uops to process 128-bit xmm)
+    {OP_addf,           6, ANYFPU},
+    {OP_subf,           6, ANYFPU},
+    {OP_mulf,           6, ANYFPU},
+    {OP_maddf,          6, ANYFPU},
+    {OP_msubf,          6, ANYFPU},
+    {OP_divf,           6, ANYFPU},
+    {OP_sqrtf,          6, ANYFPU},
+    {OP_rcpf,           6, ANYFPU},
+    {OP_rsqrtf,         6, ANYFPU},
+    {OP_minf,           4, ANYFPU},
+    {OP_maxf,           4, ANYFPU},
+    {OP_cmpf,           4, ANYFPU},
+    // For fcmpcc, uop.size bits have following meaning:
+    // 00 = single precision ordered compare
+    // 01 = single precision unordered compare
+    // 10 = double precision ordered compare
+    // 11 = double precision unordered compare
+    {OP_cmpccf,         4, ANYFPU},
+    // and/andn/or/xor are done using integer uops
+    {OP_permf,          3, ANYFPU}, // shuffles
+    // For these conversions, uop.size bits select truncation mode:
+    // x0 = normal IEEE-style rounding
+    // x1 = truncate to zero
+    {OP_cvtf_i2s_ins,   6, ANYFPU},
+    {OP_cvtf_i2s_p,     6, ANYFPU},
+    {OP_cvtf_i2d_lo,    6, ANYFPU},
+    {OP_cvtf_i2d_hi,    6, ANYFPU},
+    {OP_cvtf_q2s_ins,   6, ANYFPU},
+    {OP_cvtf_q2d,       6, ANYFPU},
+    {OP_cvtf_s2i,       6, ANYFPU},
+    {OP_cvtf_s2q,       6, ANYFPU},
+    {OP_cvtf_s2i_p,     6, ANYFPU},
+    {OP_cvtf_d2i,       6, ANYFPU},
+    {OP_cvtf_d2q,       6, ANYFPU},
+    {OP_cvtf_d2i_p,     6, ANYFPU},
+    {OP_cvtf_d2s_ins,   6, ANYFPU},
+    {OP_cvtf_d2s_p,     6, ANYFPU},
+    {OP_cvtf_s2d_lo,    6, ANYFPU},
+    {OP_cvtf_s2d_hi,    6, ANYFPU},
+    {OP_acq,            A, ANYINT}, // presumably the ASF acquire uop - TODO confirm against ptlhwdef.h
+    {OP_com,            A, ANYINT|ANYFPU}, // presumably the ASF commit uop - TODO confirm against ptlhwdef.h
+  };
+
+#undef A
+#undef L
+#undef F // no #define F is visible in this header; undefining an undefined name has no effect
+
+#undef ALU0
+#undef ALU1
+#undef STU0
+#undef STU1
+#undef LDU0
+#undef LDU1
+#undef FPU0
+#undef FPU1
+#undef L // L was already undefined above; repeated without effect
+
+#undef ANYALU
+#undef ANYLDU
+#undef ANYSTU
+#undef ANYFPU
+#undef ANYINT
+  
+  //
+  // Global limits
+  //
+  
+  const int MAX_ISSUE_WIDTH = 8;
+  
+  // Largest size of any physical register file or the store queue:
+  const int MAX_PHYS_REG_FILE_SIZE = 128;
+  const int PHYS_REG_NULL = 0; // register index that PhysicalRegister::nonnull() treats as null
+  
+  //
+  // IMPORTANT! If you change this (presumably ROB_SIZE below - confirm) to be
+  // greater than 256, you MUST #define BIG_ROB below to use the correct
+  // associative search logic (16-bit tags vs 8-bit tags).
+  //
+  //#define BIG_ROB
+  
+  const int ROB_SIZE = 128; // issueq_tag_t is a byte unless BIG_ROB is defined
+  
+  // Maximum number of branches in the pipeline at any given time
+  const int MAX_BRANCHES_IN_FLIGHT = 32;
+
+  // Set this to combine the integer and FP phys reg files:
+  // #define UNIFIED_INT_FP_PHYS_REG_FILE
+  
+#ifdef UNIFIED_INT_FP_PHYS_REG_FILE
+  // unified, br, st
+  const int PHYS_REG_FILE_COUNT = 3;
+#else
+  // int, fp, br, st
+  const int PHYS_REG_FILE_COUNT = 4;
+#endif
+  
+  //
+  // Load and Store Queues
+  //
+  const int LDQ_SIZE = 48; // LDQ_SIZE + STQ_SIZE is what the LSQ_SIZE macro evaluates to
+  const int STQ_SIZE = 32;
+
+  //
+  // Fetch
+  //
+  const int FETCH_QUEUE_SIZE = 32;
+  const int FETCH_WIDTH = 4;
+
+  //
+  // Frontend (Rename and Decode)
+  //
+  const int FRONTEND_WIDTH = 4;
+  const int FRONTEND_STAGES = 5;
+
+  //
+  // Dispatch
+  //
+  const int DISPATCH_WIDTH = 4;
+
+  //
+  // Writeback
+  //
+  const int WRITEBACK_WIDTH = 4;
+
+  //
+  // Commit
+  //
+  const int COMMIT_WIDTH = 4;
+
+  //
+  // Clustering, Issue Queues and Bypass Network
+  //
+  const int MAX_FORWARDING_LATENCY = 2;
+  const int MAX_CLUSTERS = 4;
+
+  enum { PHYSREG_NONE, PHYSREG_FREE, PHYSREG_WAITING, PHYSREG_BYPASS, PHYSREG_WRITTEN, PHYSREG_ARCH, PHYSREG_PENDINGFREE, MAX_PHYSREG_STATE }; // MAX_PHYSREG_STATE counts the states and sizes both name tables
+  static const char* physreg_state_names[MAX_PHYSREG_STATE] = {"none", "free", "waiting", "bypass", "written", "arch", "pendingfree"}; // long names, in enum order
+  static const char* short_physreg_state_names[MAX_PHYSREG_STATE] = {"-", "free", "wait", "byps", "wrtn", "arch", "pend"}; // abbreviated names, in enum order
+
+#ifdef INSIDE_OOOCORE
+  struct OutOfOrderCore;
+  OutOfOrderCore& coreof(int coreid); // used by the getcore() helpers of the structures below
+
+  struct ReorderBufferEntry;
+
+  //
+  // Issue queue based scheduler with broadcast
+  //
+#ifdef BIG_ROB
+  typedef W16 issueq_tag_t; // 16-bit tags when BIG_ROB is defined
+#else
+  typedef byte issueq_tag_t; // 8-bit tags otherwise
+#endif
+
+  template <int size, int operandcount = MAX_OPERANDS>
+  struct IssueQueue { // issue queue with `size` slots and `operandcount` tag arrays per slot
+#ifdef BIG_ROB
+    typedef FullyAssociativeTags16bit<size, size> assoc_t;
+    typedef vec8w vec_t;
+#else
+    typedef FullyAssociativeTags8bit<size, size> assoc_t;
+    typedef vec16b vec_t;
+#endif
+
+    typedef issueq_tag_t tag_t;
+
+    static const int SIZE = size;
+
+    assoc_t uopids;             // per slot: id of the uop held there (see uopof / slotof)
+    assoc_t tags[operandcount]; // per operand: one tag array
+
+    // States (V = valid bit, I = issued bit of the slot):
+    //             V I
+    // free        0 0
+    // dispatched  1 0
+    // issued      1 1
+    // complete    0 1
+
+    bitvec<size> valid;
+    bitvec<size> issued;
+    bitvec<size> allready;
+    int count;
+    byte coreid; // index handed to coreof() by getcore()
+    // NOTE(review): remaining() returns bool, i.e. whether size - count is nonzero, not the number of free slots.
+    bool remaining() const { return (size - count); }
+    bool empty() const { return (!count); }
+    bool full() const { return (!remaining()); }
+
+    int uopof(int slot) const {
+      return uopids[slot];
+    }
+
+    int slotof(int uopid) const {
+      return uopids.search(uopid);
+    }
+    // Declared here, defined elsewhere:
+    void reset(int coreid);
+    void clock();
+    bool insert(tag_t uopid, const tag_t* operands, const tag_t* preready);
+    bool broadcast(tag_t uopid);
+    int issue();
+    bool replay(int slot, const tag_t* operands, const tag_t* preready);
+    bool remove(int slot);
+    ostream& print(ostream& os) const;
+    void tally_broadcast_matches(tag_t sourceid, const bitvec<size>& mask, int operand) const;
+
+    //
+    // Replay a uop that has already issued once.
+    // The caller may add or reset dependencies here as needed.
+    //
+    bool replay(int slot) {
+      issued[slot] = 0; // only the issued bit is cleared by this overload
+      return true;
+    }
+
+    //
+    // Remove an entry from the issue queue after it has completed,
+    // or in the process of annulment.
+    //
+    bool release(int slot) {
+      remove(slot); // the result of remove() is ignored; true is always returned
+      return true;
+    }
+    // Same as release(): removes the slot and always returns true.
+    bool annul(int slot) {
+      remove(slot);
+      return true;
+    }
+    // Removes the slot holding uopid; returns false when slotof() reports a negative slot.
+    bool annuluop(int uopid) {
+      int slot = slotof(uopid);
+      if (slot < 0) return false;
+      remove(slot);
+      return true;
+    }
+
+    OutOfOrderCore& getcore() const { return coreof(coreid); }
+  };
+  // Stream output forwards to IssueQueue::print().
+  template <int size, int operandcount>
+  static inline ostream& operator <<(ostream& os, const IssueQueue<size, operandcount>& issueq) {
+    return issueq.print(os);
+  }
+
+  //
+  // Iterate through a linked list of objects where each object directly inherits
+  // only from the selfqueuelink class or otherwise has a selfqueuelink object
+  // as the first member (the macro casts each link pointer to typeof(obj)).
+  //
+  // This iterator supports mutable lists, meaning the current entry (obj) may
+  // be safely removed from the list and/or moved to some other list without
+  // affecting the next object processed: the next link is read before the body runs.
+  //
+  // This does NOT mean you can remove any object from the list other than the
+  // current object obj - to do this, copy the list of pointers to an array and
+  // then process that instead. The macro declares entry and nextentry itself;
+  // obj must already be declared by the caller.
+#define foreach_list_mutable_linktype(L, obj, entry, nextentry, linktype) \
+  linktype* entry; \
+  linktype* nextentry; \
+  for (entry = (L).next, nextentry = entry->next, prefetch(entry->next), obj = (typeof(obj))entry; \
+    entry != &(L); entry = nextentry, nextentry = entry->next, prefetch(nextentry), obj = (typeof(obj))entry)
+
+#define foreach_list_mutable(L, obj, entry, nextentry) foreach_list_mutable_linktype(L, obj, entry, nextentry, selfqueuelink)
+
+  struct StateList;
+
+  struct ListOfStateLists: public array<StateList*, 64> {
+    int count; // number of lists registered through add() so far
+    // Starts out with no registered lists
+    ListOfStateLists(): count(0) { }
+    // add() registers a list and returns its index; reset() resets every registered list
+    int add(StateList* list);
+    void reset();
+  };
+
+  struct StateList: public selfqueuelink { // named list head that also keeps an element count
+    const char* name;
+    int count;
+    int listid;
+    W64 dispatch_source_counter;
+    W64 issue_source_counter;
+    W32 flags;
+    // Fix: default construction now defines every member (name, flags and both counters used to be left uninitialized).
+    StateList(): name(null), count(0), listid(0), dispatch_source_counter(0), issue_source_counter(0), flags(0) { }
+    // (Re)initialize: empty the list, record name and flags, register with lol and zero the counters.
+    void init(const char* name, ListOfStateLists& lol, W32 flags = 0) {
+      reset();
+      this->name = name;
+      this->flags = flags;
+      count = 0;
+      listid = lol.add(this); // listid is the index returned by the registry
+      dispatch_source_counter = 0;
+      issue_source_counter = 0;
+    }
+
+    StateList(const char* name, ListOfStateLists& lol, W32 flags = 0) {
+      init(name, lol, flags);
+    }
+
+    // simulated asymmetric c++ array constructor:
+    StateList& operator ()(const char* name, ListOfStateLists& lol, W32 flags = 0) {
+      init(name, lol, flags);
+      return *this;
+    }
+    // Resets the link of the list head and the element count.
+    void reset() {
+      selfqueuelink::reset();
+      count = 0;
+    }
+    // Removes and returns the head element, or null when the list is empty.
+    selfqueuelink* dequeue() {
+      if (empty())
+        return null;
+      count--;
+      selfqueuelink* obj = removehead();
+      return obj;
+    }
+    // Appends entry at the tail and returns it.
+    selfqueuelink* enqueue(selfqueuelink* entry) {
+      entry->addtail(this);
+      count++;
+      return entry;
+    }
+    // Links entry right after preventry, or right after the list head when preventry is null.
+    selfqueuelink* enqueue_after(selfqueuelink* entry, selfqueuelink* preventry) {
+      if (preventry) entry->addhead(preventry); else entry->addhead(this);
+      count++;
+      return entry;
+    }
+    // Unlinks entry (which must be linked) and returns it.
+    selfqueuelink* remove(selfqueuelink* entry) {
+      assert(entry->linked());
+      entry->unlink();
+      count--;
+      return entry;
+    }
+    // Returns the head element without removing it, or null when the list is empty.
+    selfqueuelink* peek() {
+      return (empty()) ? null : head();
+    }
+
+    void checkvalid();
+  };
+
+  inline int ListOfStateLists::add(StateList* list) { // inline: this definition lives in a header
+    assert(count < lengthof(data)); // the registry has a fixed capacity
+    data[count] = list;
+    return count++; // the index of the new entry is the count before it was added
+  }
+
+  inline void ListOfStateLists::reset() { // inline: this definition lives in a header
+    foreach (i, count) {
+      data[i]->reset(); // resets each registered list; the registrations themselves are kept
+    }
+  }
+
+  template <typename T> 
+  static void print_list_of_state_lists(ostream& os, const ListOfStateLists& lol, const char* title);
+
+  //
+  // Fetch Buffers
+  //
+  struct BranchPredictorUpdateInfo: public PredictorUpdate { // stored per uop as FetchBufferEntry::predinfo
+    int stack_recover_idx;
+    int bptype;
+    W64 ripafter; // presumably the RIP following the branch - TODO confirm against the fetch code
+  };
+
+  struct FetchBufferEntry: public TransOp {
+    RIPVirtPhys rip;
+    W64 uuid;
+    uopimpl_func_t synthop;
+    BranchPredictorUpdateInfo predinfo;
+    BasicBlock* bb;
+    W16 index;
+    // Default construction assigns nothing explicitly
+    FetchBufferEntry() { }
+    // Construction from a bare TransOp assigns only the TransOp base part
+    FetchBufferEntry(const TransOp& transop) {
+      static_cast<TransOp&>(*this) = transop;
+    }
+    // init() records the slot index and returns 0; validate() does nothing
+    int init(int index) { this->index = index; return 0; }
+    void validate() { }
+  };
+
+  //
+  // ReorderBufferEntry
+  //
+
+  struct OutOfOrderCore;
+  struct PhysicalRegister;
+  struct LoadStoreQueueEntry;
+  struct OutOfOrderCoreEvent;
+  struct LLBLine;
+  //
+  // Reorder Buffer (ROB) structure, used for tracking all uops in flight.
+  // This same structure is used to represent both dispatched but not yet issued
+  // uops as well as issued uops.
+  //
+  struct ReorderBufferEntry: public selfqueuelink {
+    FetchBufferEntry uop;
+    struct StateList* current_state_list; // list this entry is currently on; maintained by changestate()
+    PhysicalRegister* physreg;
+    PhysicalRegister* operands[MAX_OPERANDS];
+    LoadStoreQueueEntry* lsq;
+    W16s idx; // returned by index()
+    W16s cycles_left; // execution latency counter, decremented every cycle when executing
+    W16s forward_cycle; // forwarding cycle after completion
+    W16s lfrqslot;
+    W16s iqslot;
+    W16  executable_on_cluster_mask;
+    W8s  cluster;
+    W8   coreid; // index handed to coreof() by getcore()
+    byte fu;
+    byte consumer_count;
+    PTEUpdate pteupdate;
+    Waddr origvirt;
+    byte entry_valid:1, issued:1, load_store_second_phase:1, all_consumers_off_bypass:1, dest_renamed_before_writeback:1, no_branches_between_renamings:1, transient:1, lock_acquired:1;
+
+    int index() const { return idx; }
+    void validate() { entry_valid = true; }
+    // Moves this entry to newqueue: removed from its current list (if any), then enqueued at the tail, or after prevrob / at the head when place_at_head is set.
+    void changestate(StateList& newqueue, bool place_at_head = false, ReorderBufferEntry* prevrob = null) {
+      if (current_state_list)
+        current_state_list->remove(this);
+      current_state_list = &newqueue;
+      if (place_at_head) newqueue.enqueue_after(this, prevrob); else newqueue.enqueue(this);
+    }
+    // Declared here, defined elsewhere:
+    void init(int idx);
+    void reset();
+    bool ready_to_issue() const;
+    bool ready_to_commit() const;
+    StateList& get_ready_to_issue_list() const;
+    bool find_sources();
+    int forward();
+    int select_cluster();
+    int issue();
+    void* addrgen(LoadStoreQueueEntry& state, Waddr& origaddr, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate, Waddr& addr, int& exception, PageFaultErrorCode& pfec, bool& annul);
+    bool handle_common_load_store_exceptions(LoadStoreQueueEntry& state, Waddr& origaddr, Waddr& addr, int& exception, PageFaultErrorCode& pfec);
+    int issuestore(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, bool rcready, PTEUpdate& pteupdate);
+    int issueload(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate);
+    void issueprefetch(IssueState& state, W64 ra, W64 rb, W64 rc, int cachelevel, PTEUpdate& pteupdate);
+    void release();
+    W64 annul(bool keep_misspec_uop, bool return_first_annulled_rip = false);
+    W64 annul_after() { return annul(true); } // annul(keep_misspec_uop = true)
+    W64 annul_after_and_including() { return annul(false); } // annul(keep_misspec_uop = false)
+    int commit();
+    void replay();
+    int pseudocommit();
+    void redispatch(const bitvec<MAX_OPERANDS>& dependent_operands, ReorderBufferEntry* prevrob);
+    void redispatch_dependents(bool inclusive = true);
+    void loadwakeup();
+    bool release_mem_lock(bool forced = false);
+    ostream& print(ostream& os) const;
+    stringbuf& get_operand_info(stringbuf& sb, int operand) const;
+    ostream& print_operand_info(ostream& os, int operand) const;
+
+    OutOfOrderCore& getcore() const { return coreof(coreid); }
+
+    // ASF-related things
+    LLBLine* llbline; // presumably the locked-line buffer line referenced by this uop - TODO confirm
+    int commit_asf_instruction();
+    int issueasf(IssueState& state, W64 rbdata);
+    void abort_asf();
+  };
+  // Stream output forwards to ReorderBufferEntry::print().
+  static inline ostream& operator <<(ostream& os, const ReorderBufferEntry& rob) {
+    return rob.print(os);
+  }
+
+  //
+  // Load/Store Queue
+  //
+#define LSQ_SIZE (LDQ_SIZE + STQ_SIZE)
+
+  struct LoadStoreQueueEntry: public SFR {
+    ReorderBufferEntry* rob;
+    W16 idx; // returned by index(); preserved across reset()
+    byte coreid; // index handed to coreof() by getcore()
+    W8s mbtag; // set to -1 by reset()
+    W8 store:1, entry_valid:1;
+    W32 padding;
+
+    LoadStoreQueueEntry() { }
+
+    int index() const { return idx; }
+    // NOTE(review): setzero(*this) presumably zero-fills the whole entry, including rob and coreid; only idx is restored - confirm.
+    void reset() {
+      int oldidx = idx;
+      setzero(*this);
+      idx = oldidx;
+      mbtag = -1;
+    }
+    // Records the slot index, then resets everything else.
+    void init(int idx) {
+      this->idx = idx;
+      reset();
+    }
+
+    void validate() { entry_valid = 1; }
+  
+    ostream& print(ostream& os) const;
+    // Assigns only the SFR base part; the members declared in this struct are left unchanged.
+    LoadStoreQueueEntry& operator =(const SFR& sfr) {
+      *((SFR*)this) = sfr;
+      return *this;
+    }
+
+    OutOfOrderCore& getcore() const { return coreof(coreid); }
+  };
+  // Stream output forwards to LoadStoreQueueEntry::print().
+  static inline ostream& operator <<(ostream& os, const LoadStoreQueueEntry& lsq) {
+    return lsq.print(os);
+  }
+
+  struct PhysicalRegisterOperandInfo { // plain record of integer fields; the printer is declared below
+    W32 uuid;
+    W16 physreg;
+    W16 rob;
+    byte state;
+    byte rfid;
+    byte archreg;
+    byte pad1; // presumably padding only - TODO confirm
+  };
+  // Declared here, defined elsewhere.
+  ostream& operator <<(ostream& os, const PhysicalRegisterOperandInfo& opinfo);
+
+  //
+  // Physical Register File
+  //
+  struct PhysicalRegister: public selfqueuelink {
+    ReorderBufferEntry* rob;
+    W64 data;
+    W16 flags;
+    W16 idx;
+    W8  coreid;
+    W8  rfid;
+    W8  state;
+    W8  archreg;
+    W8  all_consumers_sourced_from_bypass:1;
+    W16s refcount;
+    // State list lookup: by explicit state id (defined out of line) or for the current state.
+    StateList& get_state_list(int state) const;
+    StateList& get_state_list() const { return get_state_list(this->state); }
+    // Move to the list for newstate; removal from the old list is skipped while state is PHYSREG_NONE.
+    void changestate(int newstate) {
+      if likely (state != PHYSREG_NONE) get_state_list(state).remove(this);
+      state = newstate;
+      get_state_list(state).enqueue(this);
+    }
+    // Record identity (core, register file, index), then reset() so the register ends up in PHYSREG_FREE.
+    void init(int coreid, int rfid, int idx) {
+      this->coreid = coreid;
+      this->rfid = rfid;
+      this->idx = idx;
+      reset();
+    }
+    // Reference counting: the rob / archreg arguments of the overloads below are ignored; all forward to addref() / unref().
+    void addref() { refcount++; }
+    void unref() { refcount--; assert(refcount >= 0); }
+    void addref(const ReorderBufferEntry& rob) { addref(); }
+    void unref(const ReorderBufferEntry& rob) { unref(); }
+    void addspecref(int archreg) { addref(); }
+    void unspecref(int archreg) { unref(); }
+    void addcommitref(int archreg) { addref(); }
+    void uncommitref(int archreg) { unref(); }
+    bool referenced() const { return (refcount > 0); }
+    bool nonnull() const { return (index() != PHYS_REG_NULL); }
+    bool allocated() const { return (state != PHYSREG_FREE); }
+    void commit() { changestate(PHYSREG_ARCH); }
+    void complete() { changestate(PHYSREG_BYPASS); }
+    void writeback() { changestate(PHYSREG_WRITTEN); }
+    // Switch to PHYSREG_FREE, drop the ROB link and refcount, and set the bypass-only flag back to 1.
+    void free() {
+      changestate(PHYSREG_FREE);
+      rob = 0;
+      refcount = 0;
+      all_consumers_sourced_from_bypass = 1;
+    }
+    // Reset the queue link and force state to PHYSREG_NONE first, so the changestate() inside free() skips the remove.
+    void reset() {
+      selfqueuelink::reset();
+      state = PHYSREG_NONE;
+      free();
+    }
+    // Accessors: valid() / ready() are true when FLAG_INV / FLAG_WAIT are clear in flags.
+    int index() const { return idx; }
+    bool valid() const { return ((flags & FLAG_INV) == 0); }
+    bool ready() const { return ((flags & FLAG_WAIT) == 0); }
+    // Defined out of line.
+    void fill_operand_info(PhysicalRegisterOperandInfo& opinfo);
+    // Core looked up through coreid.
+    OutOfOrderCore& getcore() const { return coreof(coreid); }
+  };
+
+  ostream& operator <<(ostream& os, const PhysicalRegister& physreg);
+
+  struct PhysicalRegisterFile: public array<PhysicalRegister, MAX_PHYS_REG_FILE_SIZE> {
+    byte coreid;
+    byte rfid;
+    W16 size;
+    const char* name;
+    StateList states[MAX_PHYSREG_STATE];
+    W64 allocations;
+    W64 frees;
+    // Default constructor assigns nothing; use operator() (or init() + reset()) before first use.
+    PhysicalRegisterFile() { }
+    // Construct and fully set up in one step: init() followed by reset().
+    PhysicalRegisterFile(const char* name, int coreid, int rfid, int size) {
+      init(name, coreid, rfid, size); reset();
+    }
+    // Same init() + reset() sequence for an already constructed file; returns *this.
+    PhysicalRegisterFile& operator ()(const char* name, int coreid, int rfid, int size) {
+      init(name, coreid, rfid, size); reset(); return *this;
+    }
+    // remaining() is true while the PHYSREG_FREE state list is non-empty; the other members are defined out of line.
+    void init(const char* name, int coreid, int rfid, int size);
+    bool remaining() const { return (!states[PHYSREG_FREE].empty()); }
+    PhysicalRegister* alloc(int r = -1);
+    void reset();
+    ostream& print(ostream& os) const;
+    // Core looked up through coreid.
+    OutOfOrderCore& getcore() const { return coreof(coreid); }
+  };
+
+  static inline ostream& operator <<(ostream& os, const PhysicalRegisterFile& physregs) {
+    return physregs.print(os); // stream insertion simply forwards to PhysicalRegisterFile::print()
+  }
+
+  //
+  // Register Rename Table
+  //
+  struct RegisterRenameTable: public array<PhysicalRegister*, TRANSREG_COUNT> { // TRANSREG_COUNT PhysicalRegister* slots
+#ifdef ENABLE_TRANSIENT_VALUE_TRACKING
+    bitvec<TRANSREG_COUNT> renamed_in_this_basic_block; // only compiled in with ENABLE_TRANSIENT_VALUE_TRACKING
+#endif
+    ostream& print(ostream& os) const; // defined out of line
+  };
+
+  static inline ostream& operator <<(ostream& os, const RegisterRenameTable& rrt) {
+    return rrt.print(os); // stream insertion simply forwards to RegisterRenameTable::print()
+  }
+
+  enum {
+    ISSUE_COMPLETED = 1,      // issued correctly
+    ISSUE_NEEDS_REPLAY = 0,   // fast scheduling replay
+    ISSUE_MISSPECULATED = -1, // mis-speculation: redispatch dependent slice
+    ISSUE_NEEDS_REFETCH = -2, // refetch from RIP of bad insn
+  };
+
+  enum {
+    COMMIT_RESULT_NONE = 0,   // no instructions committed: some uops not ready
+    COMMIT_RESULT_OK = 1,     // committed
+    COMMIT_RESULT_EXCEPTION = 2, // exception
+    COMMIT_RESULT_BARRIER = 3,// barrier; branch to microcode (brp uop)
+    COMMIT_RESULT_SMC = 4,    // self modifying code detected
+    COMMIT_RESULT_INTERRUPT = 5, // interrupt pending
+    COMMIT_RESULT_STOP = 6    // stop processor model (shutdown)
+  };
+
+  // Branch predictor outcomes:
+  enum { MISPRED = 0, CORRECT = 1 };
+
+  //
+  // Lookup tables (LUTs):
+  //
+  struct Cluster {
+    const char* name;  // points at a string literal in the clusters[] table, so it must be const char*
+    W16 issue_width;
+    W32 fu_mask;       // OR of FU_* bits, as written in the clusters[] initializers
+  };
+
+  extern const Cluster clusters[MAX_CLUSTERS];
+  extern byte uop_executable_on_cluster[OP_MAX_OPCODE];
+  extern W32 forward_at_cycle_lut[MAX_CLUSTERS][MAX_FORWARDING_LATENCY+1];
+  extern const byte archdest_can_commit[TRANSREG_COUNT];
+  extern const byte archdest_is_visible[TRANSREG_COUNT];
+
+  struct OutOfOrderMachine;
+
+  struct OutOfOrderCoreCacheCallbacks: public CacheSubsystem::PerCoreCacheCallbacks {
+    OutOfOrderCore& core; // core this callback object is bound to
+    OutOfOrderCoreCacheCallbacks(OutOfOrderCore& core_): core(core_) { }
+    virtual void dcache_wakeup(LoadStoreInfo lsi, W64 physaddr); // defined out of line
+    virtual void icache_wakeup(LoadStoreInfo lsi, W64 physaddr); // defined out of line
+  };
+
+  struct MemoryInterlockEntry {
+    W64 uuid;
+    W16 rob;
+    byte vcpuid;
+    void reset() { uuid = 0; rob = 0; vcpuid = 0; }
+    // physaddr is not stored in the entry, so the caller passes it in. NOTE(review): the comma-chained output assumes an ostream comma operator defined elsewhere - confirm.
+    ostream& print(ostream& os, W64 physaddr) const {
+      os << "phys ", (void*)physaddr, ": vcpu ", vcpuid, ", uuid ", uuid, ", rob ", rob;
+      return os;
+    }
+  };
+
+  struct MemoryInterlockBuffer: public LockableAssociativeArray<W64, MemoryInterlockEntry, 16, 4, 8> { }; // W64-keyed table of MemoryInterlockEntry; NOTE(review): keys are presumably physical addresses - confirm
+
+  extern MemoryInterlockBuffer interlocks;
+
+  //
+  // Event Tracing
+  //
+  enum {
+    EVENT_INVALID = 0,
+    EVENT_FETCH_STALLED,
+    EVENT_FETCH_ICACHE_WAIT,
+    EVENT_FETCH_FETCHQ_FULL,
+    EVENT_FETCH_BOGUS_RIP,
+    EVENT_FETCH_ICACHE_MISS,
+    EVENT_FETCH_SPLIT,
+    EVENT_FETCH_ASSIST,
+    EVENT_FETCH_TRANSLATE,
+    EVENT_FETCH_OK,
+    EVENT_RENAME_FETCHQ_EMPTY,
+    EVENT_RENAME_ROB_FULL,
+    EVENT_RENAME_PHYSREGS_FULL,
+    EVENT_RENAME_LDQ_FULL,
+    EVENT_RENAME_STQ_FULL,
+    EVENT_RENAME_MEMQ_FULL,
+    EVENT_RENAME_OK,
+    EVENT_FRONTEND,
+    EVENT_CLUSTER_NO_CLUSTER,
+    EVENT_CLUSTER_OK,
+    EVENT_DISPATCH_NO_CLUSTER,
+    EVENT_DISPATCH_DEADLOCK,
+    EVENT_DISPATCH_OK,
+    EVENT_ISSUE_NO_FU,
+    EVENT_ISSUE_OK,
+    EVENT_REPLAY,
+    EVENT_STORE_EXCEPTION,
+    EVENT_STORE_WAIT,
+    EVENT_STORE_PARALLEL_FORWARDING_MATCH,
+    EVENT_STORE_ALIASED_LOAD,
+    EVENT_STORE_ISSUED,
+    EVENT_STORE_LOCK_RELEASED,
+    EVENT_STORE_LOCK_ANNULLED,
+    EVENT_STORE_LOCK_REPLAY,
+    EVENT_LOAD_EXCEPTION,
+    EVENT_LOAD_WAIT,
+    EVENT_LOAD_HIGH_ANNULLED,
+    EVENT_LOAD_HIT,
+    EVENT_LOAD_MISS,
+    EVENT_LOAD_LOCK_REPLAY,
+    EVENT_LOAD_LOCK_OVERFLOW,
+    EVENT_LOAD_LOCK_ACQUIRED,
+    EVENT_LOAD_LFRQ_FULL,
+    EVENT_LOAD_WAKEUP,
+    EVENT_ALIGNMENT_FIXUP,
+    EVENT_ANNUL_NO_FUTURE_UOPS,
+    EVENT_ANNUL_MISSPECULATION,
+    EVENT_ANNUL_EACH_ROB,
+    EVENT_ANNUL_PSEUDOCOMMIT,
+    EVENT_ANNUL_FETCHQ_RAS,
+    EVENT_ANNUL_FETCHQ,
+    EVENT_ANNUL_FLUSH,
+    EVENT_REDISPATCH_DEPENDENTS,
+    EVENT_REDISPATCH_DEPENDENTS_DONE,
+    EVENT_REDISPATCH_EACH_ROB,
+    EVENT_COMPLETE,
+    EVENT_BROADCAST,
+    EVENT_FORWARD,
+    EVENT_WRITEBACK,
+    EVENT_COMMIT_EXCEPTION_DETECTED,
+    EVENT_COMMIT_EXCEPTION_ACKNOWLEDGED,
+    EVENT_COMMIT_SKIPBLOCK,
+    EVENT_COMMIT_SMC_DETECTED,
+    EVENT_COMMIT_ASSIST,
+    EVENT_COMMIT_OK,
+    EVENT_RECLAIM_PHYSREG,
+  };
+
+  //
+  // Event that gets written to the trace buffer
+  //
+  // In the interest of minimizing space, the cycle counters
+  // and uuids are only 32-bits; in practice wraparound is
+  // not likely to be a problem.
+  //
+  struct OutOfOrderCoreEvent {
+    W32 cycle;
+    W32 uuid;
+    RIPVirtPhysBase rip;
+    TransOpBase uop;
+    W16 rob;
+    W16 physreg;
+    W16 lsq;
+    W16 type;
+    W16s lfrqslot;
+    byte rfid;
+    byte cluster;
+    byte fu;
+
+    OutOfOrderCoreEvent* fill(int type) {
+      uuid = 0;            // base overload attaches no uop
+      cycle = sim_cycle;   // stored into the W32 cycle field
+      this->type = type;
+      return this;
+    }
+
+    OutOfOrderCoreEvent* fill(int type, const FetchBufferEntry& uop) {
+      fill(type);          // type and cycle; uuid is overwritten below
+      this->uop = uop;     // keeps the TransOpBase part of the fetch entry
+      rip = uop.rip;
+      uuid = uop.uuid;
+      return this;
+    }
+
+    OutOfOrderCoreEvent* fill(int type, const RIPVirtPhys& rvp) {
+      fill(type); // sets type and cycle, and zeroes uuid
+      rip = rvp;  // only the RIP is recorded; the uop field is not written here
+      return this;
+    }
+
+    OutOfOrderCoreEvent* fill(int type, const ReorderBufferEntry* rob) {
+      fill(type, rob->uop);                          // type, cycle, uuid, rip and uop
+      cluster = rob->cluster;
+      fu = rob->fu;
+      lfrqslot = rob->lfrqslot;
+      this->rob = rob->index();
+      rfid = rob->physreg->rfid;
+      physreg = rob->physreg->index();
+      lsq = (!rob->lsq) ? 0 : rob->lsq->index();     // 0 when the ROB entry has no LSQ slot
+      return this;
+    }
+
+    OutOfOrderCoreEvent* fill_commit(int type, const ReorderBufferEntry* rob) {
+      fill(type, rob); // common header fields first
+      if unlikely (isstore(rob->uop.opcode)) {
+        commit.state.st = *rob->lsq; // stores record the LSQ entry ...
+      } else {
+        commit.state.reg.rddata = rob->physreg->data; // ... everything else records the result register data and flags
+        commit.state.reg.rdflags = rob->physreg->flags;
+      }
+      // taken, predtaken only for branches
+      commit.pteupdate = rob->pteupdate;
+      // oldphysreg filled in later
+      // oldphysreg_refcount filled in later
+      commit.bb_refcount = rob->uop.bb->refcount;
+      commit.bb = rob->uop.bb;
+      commit.origvirt = rob->origvirt;
+      commit.total_user_insns_committed = total_user_insns_committed; // right-hand side is a name from the enclosing scope, not an event field
+      // target_rip filled in later
+      foreach (i, MAX_OPERANDS) commit.operand_physregs[i] = rob->operands[i]->index(); // every operand pointer is dereferenced
+      return this;
+    }
+
+    OutOfOrderCoreEvent* fill_load_store(int type, const ReorderBufferEntry* rob, LoadStoreQueueEntry* inherit_sfr, Waddr virtaddr) {
+      fill(type, rob);
+      loadstore.virtaddr = virtaddr;
+      loadstore.sfr = *rob->lsq;
+      loadstore.load_store_second_phase = rob->load_store_second_phase;
+      loadstore.inherit_sfr_used = (inherit_sfr != null);
+      if likely (!inherit_sfr) return this;  // the inherit_sfr_* fields are only written when an inherited SFR is passed
+      const ReorderBufferEntry* source_rob = inherit_sfr->rob;
+      loadstore.inherit_sfr = *inherit_sfr;
+      loadstore.inherit_sfr_lsq = source_rob->lsq->index();
+      loadstore.inherit_sfr_uuid = source_rob->uop.uuid;
+      loadstore.inherit_sfr_rob = source_rob->index();
+      loadstore.inherit_sfr_physreg = source_rob->physreg->index();
+      loadstore.inherit_sfr_rip = source_rob->uop.rip;
+      return this;
+    }
+
+    union {
+      struct {
+        W16s missbuf;
+        BasicBlock* bb;
+        W64 predrip;
+        W16 bb_uop_count;
+      } fetch;
+      struct {
+        W16  oldphys;
+        W16  oldzf;
+        W16  oldcf;
+        W16  oldof;
+        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
+      } rename;
+      struct {
+        W16 cycles_left;
+      } frontend;
+      struct {
+        W16 allowed_clusters;
+        W16 iq_avail[MAX_CLUSTERS];
+      } select_cluster;
+      struct {
+        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
+      } dispatch;
+      struct {
+        byte mispredicted:1;
+        IssueState state;
+        W16 cycles_left;
+        W64 operand_data[MAX_OPERANDS];
+        W16 operand_flags[MAX_OPERANDS];
+        W64 predrip;
+        W32 fu_avail;
+      } issue;
+      struct {
+        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
+        byte ready;
+      } replay;
+      struct {
+        W64 virtaddr;
+        W64 data_to_store;
+        SFR sfr;
+        SFR inherit_sfr;
+        W64 inherit_sfr_uuid;
+        W64 inherit_sfr_rip;
+        W16 inherit_sfr_lsq;
+        W16 inherit_sfr_rob;
+        W16 inherit_sfr_physreg;
+        W16 cycles_left;
+        W64 locking_uuid;
+        byte inherit_sfr_used:1, rcready:1, load_store_second_phase:1, predicted_alias:1;
+        byte locking_vcpuid;
+        W16 locking_rob;
+      } loadstore;
+      struct {
+        W16 somidx;
+        W16 eomidx;
+        W16 startidx;
+        W16 endidx;
+        W16 bb_refcount;
+        byte annulras;
+        BasicBlock* bb;
+      } annul;
+      struct {
+        StateList* current_state_list;
+        W16 iqslot;
+        W16 count;
+        byte dependent_operands;
+        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
+      } redispatch;
+      struct {
+        W8  forward_cycle;
+        W8  operand;
+        W8  target_operands_ready;
+        W8  target_all_operands_ready;
+        W16 target_rob;
+        W16 target_physreg;
+        W8  target_rfid;
+        W8  target_cluster;
+        W64 target_uuid;
+        W16 target_lsq;
+        W8  target_st;
+      } forwarding;
+      struct {
+        W16 consumer_count;
+        W16 flags;
+        W64 data;
+        byte transient:1, all_consumers_sourced_from_bypass:1, no_branches_between_renamings:1, dest_renamed_before_writeback:1;
+      } writeback;
+      struct {
+        IssueState state;
+        byte taken:1, predtaken:1;
+        PTEUpdateBase pteupdate;
+        W16s oldphysreg;
+        W16 oldphysreg_refcount;
+        W16 bb_refcount;
+        BasicBlock* bb;
+        W64 origvirt;
+        W64 total_user_insns_committed;
+        W64 target_rip;
+        W16 operand_physregs[MAX_OPERANDS];
+      } commit;
+    };
+
+    ostream& print(ostream& os) const;
+  };
+
+  struct EventLog {
+    OutOfOrderCoreEvent* start;   // add() rewinds tail to this
+    OutOfOrderCoreEvent* end;     // add() wraps once tail reaches this
+    OutOfOrderCoreEvent* tail;    // next slot add() hands out
+    ostream* logfile;
+    // All pointers start out null; init() is defined out of line.
+    EventLog(): start(null), end(null), tail(null), logfile(null) { }
+    // Defined out of line.
+    bool init(size_t bufsize);
+    void reset();
+    // Reserve the next event slot, wrapping to the start of the buffer when it is exhausted.
+    OutOfOrderCoreEvent* add() {
+      if unlikely (tail >= end) {
+        tail = start;
+        flush();
+      }
+      // tail was rewound before flush() ran, so the slot handed out
+      // after a wrap is the first one in the buffer.
+      return tail++;
+    }
+    // Defined out of line.
+    void flush(bool only_to_tail = false);
+    // The wrappers below reserve a slot with add() and populate it with the matching fill*() overload.
+    OutOfOrderCoreEvent* add(int type) {
+      return add()->fill(type);
+    }
+    // Event tagged with a RIP only.
+    OutOfOrderCoreEvent* add(int type, const RIPVirtPhys& rvp) {
+      return add()->fill(type, rvp);
+    }
+    // Event tagged with a fetched uop.
+    OutOfOrderCoreEvent* add(int type, const FetchBufferEntry& uop) {
+      return add()->fill(type, uop);
+    }
+    // Event tagged with a ROB entry.
+    OutOfOrderCoreEvent* add(int type, const ReorderBufferEntry* rob) {
+      return add()->fill(type, rob);
+    }
+    // Commit-stage event.
+    OutOfOrderCoreEvent* add_commit(int type, const ReorderBufferEntry* rob) {
+      return add()->fill_commit(type, rob);
+    }
+    // Load/store event; inherit_sfr and addr are optional.
+    OutOfOrderCoreEvent* add_load_store(int type, const ReorderBufferEntry* rob, LoadStoreQueueEntry* inherit_sfr = null, Waddr addr = 0) {
+      return add()->fill_load_store(type, rob, inherit_sfr, addr);
+    }
+    // Defined out of line.
+    ostream& print(ostream& os, bool only_to_tail = false);
+  };
+
+  #define ASF_MAX_LINES (8)
+  struct LLBLine {
+    bool  read, write;
+    byte  orig_data[CacheSubsystem::L1_LINE_SIZE];  // one L1 line worth of bytes; not touched by reset() or the constructor
+    int   refcount;
+    void  reset() { refcount = 0; read = false; write = false; }
+    LLBLine(): read(false), write(false), refcount(0) { }
+  };
+  struct LockedLineBuffer: public FullyAssociativeArray<Waddr, LLBLine, ASF_MAX_LINES> {
+    typedef FullyAssociativeArray<Waddr, LLBLine, ASF_MAX_LINES> base_t;
+    int num_locations;
+    LockedLineBuffer(): num_locations(0), lasterr(0) { } // empty() and consistency_error() read these, so never leave them indeterminate
+    LLBLine* add_location(Waddr addr);
+    void remove_ref(LLBLine* line);
+    void clear();
+    void snapshot();
+    void undo();
+    // commit() calls clear(), abort() calls undo(); both also reset the latched error code.
+    void commit() { clear(); lasterr = 0; }
+    void abort() { undo(); /*clear();*/ lasterr = 0; }
+    // contains() probes with addr rounded down to a multiple of the L1 line size.
+    bool contains(Waddr addr) {return probe(floor(addr, CacheSubsystem::L1_LINE_SIZE));}
+    bool empty() {return (num_locations == 0);}
+    void external_probe(Waddr addr, bool invalidating);
+    W64  consistency_error() { // latches 0xDEADBEEF once asf_consistency_error() returns true; stays latched until commit() / abort()
+      if (!lasterr)  lasterr = (asf_consistency_error()) ? 0xDEADBEEF : 0;
+      return lasterr;
+    }
+    private: W64 lasterr;
+  };
+  //
+  // Out-of-order core
+  //
+  struct OutOfOrderCore {
+    int coreid;
+    OutOfOrderMachine& machine;
+    Context& ctx;
+    EventLog eventlog;
+    BranchPredictorInterface branchpred;
+    ListOfStateLists rob_states;
+    ListOfStateLists physreg_states;
+    ListOfStateLists lsq_states;
+
+    //
+    // Issue Queues (one per cluster)
+    //
+    IssueQueue<16> issueq_int0;
+    IssueQueue<16> issueq_int1;
+    IssueQueue<16> issueq_ld;
+    IssueQueue<16> issueq_fp;
+
+    // Instantiate any issueq sizes used above:
+#define declare_issueq_templates template struct IssueQueue<16>
+
+#define foreach_issueq(expr) { issueq_int0.expr; issueq_int1.expr; issueq_ld.expr; issueq_fp.expr; }
+  
+    void sched_get_all_issueq_free_slots(int* a) { // a[] must hold at least 4 ints: remaining() of int0, int1, ld, fp in that order
+      a[0] = issueq_int0.remaining();
+      a[1] = issueq_int1.remaining();
+      a[2] = issueq_ld.remaining();
+      a[3] = issueq_fp.remaining();
+    }
+
+#define issueq_operation_on_cluster_with_result(core, cluster, rc, expr) \
+  switch (cluster) { \
+  case 0: rc = core.issueq_int0.expr; break; \
+  case 1: rc = core.issueq_int1.expr; break; \
+  case 2: rc = core.issueq_ld.expr; break; \
+  case 3: rc = core.issueq_fp.expr; break; \
+  }
+
+#define per_cluster_stats_update(prefix, cluster, expr) \
+  switch (cluster) { \
+  case 0: prefix.int0 expr; break; \
+  case 1: prefix.int1 expr; break; \
+  case 2: prefix.ld expr; break; \
+  case 3: prefix.fp expr; break; \
+  }
+
+#define per_physregfile_stats_update(prefix, rfid, expr) \
+  switch (rfid) { \
+  case 0: prefix.integer expr; break; \
+  case 1: prefix.fp expr; break; \
+  case 2: prefix.st expr; break; \
+  case 3: prefix.br expr; break; \
+  }
+
+#define issueq_operation_on_cluster(core, cluster, expr) { int dummyrc; issueq_operation_on_cluster_with_result(core, cluster, dummyrc, expr); }
+
+#define for_each_cluster(iter) foreach (iter, MAX_CLUSTERS)
+#define for_each_operand(iter) foreach (iter, MAX_OPERANDS)
+
+    //
+    // Each ROB's state can be linked into at most one of the
+    // following rob_xxx_list lists at any given time; the ROB's
+    // current_state_list points back to the list it belongs to.
+    //
+    StateList rob_free_list;                             // Free ROB entry
+    StateList rob_frontend_list;                         // Frontend in progress (artificial delay)
+    StateList rob_ready_to_dispatch_list;                // Ready to dispatch
+    StateList rob_dispatched_list[MAX_CLUSTERS];         // Dispatched but waiting for operands
+    StateList rob_ready_to_issue_list[MAX_CLUSTERS];     // Ready to issue (all operands ready)
+    StateList rob_ready_to_store_list[MAX_CLUSTERS];     // Ready to store (all operands except possibly rc are ready)
+    StateList rob_ready_to_load_list[MAX_CLUSTERS];      // Ready to load (all operands ready)
+    StateList rob_issued_list[MAX_CLUSTERS];             // Issued and in progress (or for loads, returned here after address is generated)
+    StateList rob_completed_list[MAX_CLUSTERS];          // Completed and result in transit for local and global forwarding
+    StateList rob_ready_to_writeback_list[MAX_CLUSTERS]; // Completed; result ready to writeback in parallel across all cluster register files
+    StateList rob_cache_miss_list;                       // Loads only: wait for cache miss to be serviced
+    StateList rob_ready_to_commit_queue;                 // Ready to commit
+
+    enum {
+      ROB_STATE_READY = (1 << 0),
+      ROB_STATE_IN_ISSUE_QUEUE = (1 << 1),
+      ROB_STATE_PRE_READY_TO_DISPATCH = (1 << 2)
+    };
+
+#define InitClusteredROBList(name, description, flags) \
+  name[0](description "-int0", rob_states, flags); \
+  name[1](description "-int1", rob_states, flags); \
+  name[2](description "-ld", rob_states, flags); \
+  name[3](description "-fp", rob_states, flags)
+
+    // Constructor: binds the core to its id, context and machine; initializers are listed in member declaration order
+    OutOfOrderCore(int coreid_, Context& ctx_, OutOfOrderMachine& machine_): coreid(coreid_), machine(machine_), ctx(ctx_), cache_callbacks(*this) { }
+
+    //
+    // Initialize structures independent of the core parameters
+    //
+    void init_generic();
+
+    //
+    // Initialize all structures for the first time
+    //
+    void init() {
+      init_generic();
+
+      //
+      // Physical register files: operator() runs init(name, coreid, rfid, size) then reset() on each file
+      //
+      physregfiles[0]("int", coreid, 0, 128);
+      physregfiles[1]("fp", coreid, 1, 128);
+      physregfiles[2]("st", coreid, 2, STQ_SIZE);
+      physregfiles[3]("br", coreid, 3, MAX_BRANCHES_IN_FLIGHT);
+    }
+
+    //
+    // Physical Registers
+    //
+
+    enum { PHYS_REG_FILE_INT, PHYS_REG_FILE_FP, PHYS_REG_FILE_ST, PHYS_REG_FILE_BR };
+
+    enum {  
+      PHYS_REG_FILE_MASK_INT = (1 << 0),
+      PHYS_REG_FILE_MASK_FP  = (1 << 1),
+      PHYS_REG_FILE_MASK_ST  = (1 << 2),
+      PHYS_REG_FILE_MASK_BR  = (1 << 3)
+    };
+
+    // Major core structures
+    PhysicalRegisterFile physregfiles[PHYS_REG_FILE_COUNT];
+    Queue<ReorderBufferEntry, ROB_SIZE> ROB;
+    Queue<LoadStoreQueueEntry, LSQ_SIZE> LSQ;
+    RegisterRenameTable specrrt;
+    RegisterRenameTable commitrrt;
+
+    // Fetch-related structures
+    Queue<FetchBufferEntry, FETCH_QUEUE_SIZE> fetchq;
+    RIPVirtPhys fetchrip;
+    BasicBlock* current_basic_block;
+    TransOpBuffer unaligned_ldst_buf;
+    int current_basic_block_transop_index;
+    bool stall_frontend;
+    bool waiting_for_icache_fill;
+    // How many bytes of x86 code to fetch into decode buffer at once
+    static const int ICACHE_FETCH_GRANULARITY = 16;
+    // Last block in icache we fetched into our buffer
+    W64 current_icache_block;
+    W64 fetch_uuid;
+    int loads_in_flight;
+    int stores_in_flight;
+    bool prev_interrupts_pending;
+    bool handle_interrupt_at_next_eom;
+
+    // Dispatch
+    int round_robin_reg_file_offset;
+    static const int DISPATCH_DEADLOCK_COUNTDOWN_CYCLES = 64;
+    int dispatch_deadlock_countdown;
+
+    // Issue
+    W32 fu_avail;
+    ReorderBufferEntry* robs_on_fu[FU_COUNT];
+    struct LoadStoreAliasPredictor: public FullyAssociativeTags<W64, 8> { };
+    LoadStoreAliasPredictor lsap;
+    int loads_in_this_cycle;
+    W32 load_to_store_parallel_forwarding_buffer[LOAD_FU_COUNT];
+
+    // Commit
+    W64 chk_recovery_rip;
+    W64 last_commit_at_cycle;
+    bool smc_invalidate_pending;
+    RIPVirtPhys smc_invalidate_rvp;
+
+    CacheSubsystem::CacheHierarchy caches;
+    OutOfOrderCoreCacheCallbacks cache_callbacks;
+
+    // Pipeline Stages
+    bool runcycle();
+    bool fetch();
+    void rename();
+    void frontend();
+    int dispatch();
+    int issue(int cluster);
+    int complete(int cluster);
+    int transfer(int cluster);
+    int writeback(int cluster);
+    int commit();
+
+    // Pipeline Flush Handling
+    bool handle_barrier();
+    bool handle_exception();
+    bool handle_interrupt();
+
+    // Pipeline Control and Fetching
+    void reset_fetch_unit(W64 realrip);
+    void flush_pipeline();
+    void invalidate_smc();
+    void external_to_core_state();
+    void core_to_external_state() { }
+    void annul_fetchq();
+    BasicBlock* fetch_or_translate_basic_block(Context& ctx, const RIPVirtPhys& rvp);
+    void redispatch_deadlock_recovery();
+
+    // Debugging
+    void dump_ooo_state(ostream& os);
+    void print_rob(ostream& os);
+    void print_lsq(ostream& os);
+    void check_refcounts();
+    void check_rob();
+    void print_rename_tables(ostream& os);
+    OutOfOrderCore& getcore() const { return coreof(coreid); }
+
+    // ASF
+    bool asf_in_crit_sec;
+    bool asf_reissue_will_fail;
+    W64  asf_stored_error;
+
+    RIPVirtPhys asf_failing_acquire;
+    LockedLineBuffer locked_line_buffer;
+    void check_asf_conflicts();
+    int asf_runcycle(int commitrc);
+    void asf_rollback_last_acq(W64 errorcode, int reg_nextrip);
+  };
+
+  struct OutOfOrderMachine: public PTLsimMachine {
+    OutOfOrderCore* cores[MAX_CONTEXTS]; // up to MAX_CONTEXTS core pointers
+    // All member functions below are defined out of line.
+    OutOfOrderMachine(const char* name);
+    virtual bool init(PTLsimConfig& config);
+    virtual int run(PTLsimConfig& config);
+    virtual void dump_state(ostream& os);
+    virtual void update_stats(PTLsimStats& stats);
+    void flush_all_pipelines();
+  };
+
+  extern CycleTimer cttotal;
+  extern CycleTimer ctfetch;
+  extern CycleTimer ctdecode;
+  extern CycleTimer ctrename;
+  extern CycleTimer ctfrontend;
+  extern CycleTimer ctdispatch;
+  extern CycleTimer ctissue;
+  extern CycleTimer ctissueload;
+  extern CycleTimer ctissuestore;
+  extern CycleTimer ctcomplete;
+  extern CycleTimer cttransfer;
+  extern CycleTimer ctwriteback;
+  extern CycleTimer ctcommit;
+
+#ifdef DECLARE_STRUCTURES
+  //
+  // The following configuration has two integer/store clusters with a single cycle
+  // latency between them, but both clusters can access the load pseudo-cluster with
+  // no extra cycle. The floating point cluster is two cycles from everything else.
+  //
+
+  const Cluster clusters[MAX_CLUSTERS] = { // each row: {name, issue_width, fu_mask}
+    {"int0",  2, (FU_ALU0|FU_STU0)},
+    {"int1",  2, (FU_ALU1|FU_STU1)},
+    {"ld",    2, (FU_LDU0|FU_LDU1)},
+    {"fp",    2, (FU_FPU0|FU_FPU1)},
+  };
+
+  const byte intercluster_latency_map[MAX_CLUSTERS][MAX_CLUSTERS] = { // indexed [from][to]; latency in cycles
+    // I0 I1 LD FP <-to
+    {0, 1, 0, 2}, // from I0
+    {1, 0, 0, 2}, // from I1
+    {0, 0, 0, 2}, // from LD
+    {2, 2, 2, 0}, // from FP
+  };
+
+  const byte intercluster_bandwidth_map[MAX_CLUSTERS][MAX_CLUSTERS] = { // indexed [from][to]
+    // I0 I1 LD FP <-to
+    {2, 2, 1, 1}, // from I0
+    {2, 2, 1, 1}, // from I1
+    {1, 1, 2, 2}, // from LD
+    {1, 1, 1, 2}, // from FP
+  };
+#endif // DECLARE_STRUCTURES
+
+#endif // INSIDE_OOOCORE
+
+  //
+  // This part is used when parsing stats.h to build the
+  // data store template; these must be in sync with the
+  // corresponding definitions elsewhere.
+  //
+  static const char* cluster_names[MAX_CLUSTERS] = {"int0", "int1", "ld", "fp"};
+  static const char* phys_reg_file_names[PHYS_REG_FILE_COUNT] = {"int", "fp", "st", "br"};
+};
+
+//
+// Out of Order Core
+//
+struct OutOfOrderCoreStats { // rootnode:
+  W64 cycles;
+  struct fetch {
+    struct stop { // node: summable
+      W64 stalled;
+      W64 icache_miss;
+      W64 fetchq_full;
+      W64 bogus_rip;
+      W64 microcode_assist;
+      W64 branch_taken;
+      W64 full_width;
+    } stop;
+
+    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
+    W64 width[ASFOutOfOrderModel::FETCH_WIDTH+1]; // histo: 0, ASFOutOfOrderModel::FETCH_WIDTH, 1
+
+    W64 blocks;
+    W64 uops;
+    W64 user_insns;
+  } fetch;
+  struct frontend {
+    struct status { // node: summable
+      W64 complete;
+      W64 fetchq_empty;
+      W64 rob_full;
+      W64 physregs_full;
+      W64 ldq_full;
+      W64 stq_full;
+    } status;
+
+    W64 width[ASFOutOfOrderModel::FRONTEND_WIDTH+1]; // histo: 0, ASFOutOfOrderModel::FRONTEND_WIDTH, 1
+
+    struct renamed {
+      W64 none;
+      W64 reg;
+      W64 flags;
+      W64 reg_and_flags;
+    } renamed;
+
+    struct alloc {
+      W64 reg;
+      W64 ldreg;
+      W64 sfr;
+      W64 br;
+    } alloc;
+
+    // NOTE: This is capped at 255 consumers to keep the size reasonable:
+    W64 consumer_count[256]; // histo: 0, 255, 1
+  } frontend;
+  struct dispatch {
+    W64 width[ASFOutOfOrderModel::DISPATCH_WIDTH+1]; // histo: 0, ASFOutOfOrderModel::DISPATCH_WIDTH, 1
+
+    struct source { // node: summable
+      W64 integer[ASFOutOfOrderModel::MAX_PHYSREG_STATE]; // label: ASFOutOfOrderModel::physreg_state_names
+      W64 fp[ASFOutOfOrderModel::MAX_PHYSREG_STATE]; // label: ASFOutOfOrderModel::physreg_state_names
+      W64 st[ASFOutOfOrderModel::MAX_PHYSREG_STATE]; // label: ASFOutOfOrderModel::physreg_state_names
+      W64 br[ASFOutOfOrderModel::MAX_PHYSREG_STATE]; // label: ASFOutOfOrderModel::physreg_state_names
+    } source;
+
+    W64 cluster[ASFOutOfOrderModel::MAX_CLUSTERS]; // label: ASFOutOfOrderModel::cluster_names
+
+    struct redispatch {
+      W64 trigger_uops;
+      W64 deadlock_flushes;
+      W64 deadlock_uops_flushed;
+      W64 dependent_uops[ASFOutOfOrderModel::ROB_SIZE+1]; // histo: 0, ASFOutOfOrderModel::ROB_SIZE, 1
+    } redispatch;
+
+  } dispatch;
+  struct issue {
+    W64 uops;
+    double uipc;
+    struct result { // node: summable
+      W64 no_fu;
+      W64 replay;
+      W64 misspeculated;
+      W64 refetch;
+      W64 branch_mispredict;
+      W64 exception;
+      W64 complete;
+    } result;
+    struct width {
+      W64 int0[ASFOutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, ASFOutOfOrderModel::MAX_ISSUE_WIDTH, 1
+      W64 int1[ASFOutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, ASFOutOfOrderModel::MAX_ISSUE_WIDTH, 1
+      W64 ld[ASFOutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, ASFOutOfOrderModel::MAX_ISSUE_WIDTH, 1
+      W64 fp[ASFOutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, ASFOutOfOrderModel::MAX_ISSUE_WIDTH, 1
+    } width;
+    struct source { // node: summable
+      W64 integer[ASFOutOfOrderModel::MAX_PHYSREG_STATE]; // label: ASFOutOfOrderModel::physreg_state_names
+      W64 fp[ASFOutOfOrderModel::MAX_PHYSREG_STATE]; // label: ASFOutOfOrderModel::physreg_state_names
+      W64 st[ASFOutOfOrderModel::MAX_PHYSREG_STATE]; // label: ASFOutOfOrderModel::physreg_state_names
+      W64 br[ASFOutOfOrderModel::MAX_PHYSREG_STATE]; // label: ASFOutOfOrderModel::physreg_state_names
+    } source;
+    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
+  } issue;
+  struct writeback {
+    W64 writebacks[ASFOutOfOrderModel::PHYS_REG_FILE_COUNT]; // label: ASFOutOfOrderModel::phys_reg_file_names
+    struct width {
+      W64 int0[ASFOutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, ASFOutOfOrderModel::MAX_ISSUE_WIDTH, 1
+      W64 int1[ASFOutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, ASFOutOfOrderModel::MAX_ISSUE_WIDTH, 1
+      W64 ld[ASFOutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, ASFOutOfOrderModel::MAX_ISSUE_WIDTH, 1
+      W64 fp[ASFOutOfOrderModel::MAX_ISSUE_WIDTH+1]; // histo: 0, ASFOutOfOrderModel::MAX_ISSUE_WIDTH, 1
+    } width;
+  } writeback;
+
+  struct commit {
+    W64 uops;
+    W64 insns;
+    double uipc;
+    double ipc;
+    struct freereg { // node: summable
+      W64 pending;
+      W64 free;
+    } freereg;
+
+    W64 free_regs_recycled;
+
+    struct result { // node: summable
+      W64 none;
+      W64 ok;
+      W64 exception;
+      W64 skipblock;
+      W64 barrier;
+      W64 smc;
+      W64 memlocked;
+      W64 stop;
+    } result;
+
+    struct setflags { // node: summable
+      W64 yes;
+      W64 no;
+    } setflags;
+
+    W64 width[ASFOutOfOrderModel::COMMIT_WIDTH+1]; // histo: 0, ASFOutOfOrderModel::COMMIT_WIDTH, 1
+    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
+  } commit;
+
+  struct branchpred {
+    W64 predictions;
+    W64 updates;
+
+    // These counters are [0] = mispred, [1] = correct
+    W64 cond[2]; // label: branchpred_outcome_names
+    W64 indir[2]; // label: branchpred_outcome_names
+    W64 ret[2]; // label: branchpred_outcome_names
+    W64 summary[2]; // label: branchpred_outcome_names
+    struct ras { // node: summable
+      W64 pushes;
+      W64 overflows;
+      W64 pops;
+      W64 underflows;
+      W64 annuls;
+    } ras;
+  } branchpred;
+
+  struct dcache {
+    struct load {
+      struct issue { // node: summable
+        W64 complete;
+        W64 miss;
+        W64 exception;
+        W64 ordering;
+        W64 unaligned;
+        struct replay { // node: summable
+          W64 sfr_addr_and_data_not_ready;
+          W64 sfr_addr_not_ready;
+          W64 sfr_data_not_ready;
+          W64 missbuf_full;
+          W64 interlocked;
+          W64 interlock_overflow;
+          W64 fence;
+        } replay;
+      } issue;
+        
+      struct forward { // node: summable
+        W64 cache;
+        W64 sfr;
+        W64 sfr_and_cache;
+      } forward;
+        
+      struct dependency { // node: summable
+        W64 independent;
+        W64 predicted_alias_unresolved;
+        W64 stq_address_match;
+        W64 stq_address_not_ready;
+      } dependency;
+        
+      struct type { // node: summable
+        W64 aligned;
+        W64 unaligned;
+        W64 internal;
+      } type;
+        
+      W64 size[4]; // label: sizeshift_names
+
+      W64 datatype[DATATYPE_COUNT]; // label: datatype_names
+    } load;
+
+    struct store {
+      struct issue { // node: summable
+        W64 complete;
+        W64 exception;
+        W64 ordering;
+        W64 unaligned;
+        struct replay { // node: summable
+          W64 sfr_addr_and_data_not_ready;
+          W64 sfr_addr_not_ready;
+          W64 sfr_data_not_ready;
+          W64 sfr_addr_and_data_and_data_to_store_not_ready;
+          W64 sfr_addr_and_data_to_store_not_ready;
+          W64 sfr_data_and_data_to_store_not_ready;
+          W64 interlocked;
+          W64 fence;
+        } replay;
+      } issue;
+
+      struct forward { // node: summable
+        W64 zero;
+        W64 sfr;
+      } forward;
+        
+      struct type { // node: summable
+        W64 aligned;
+        W64 unaligned;
+        W64 internal;
+      } type;
+        
+      W64 size[4]; // label: sizeshift_names
+
+      W64 datatype[DATATYPE_COUNT]; // label: datatype_names
+
+      W64 parallel_aliasing;
+    } store;
+  } dcache;
+
+  struct simulator {
+    double total_time;
+    struct cputime { // node: summable
+      double fetch;
+      double decode;
+      double rename;
+      double frontend;
+      double dispatch;
+      double issue;
+      double issueload;
+      double issuestore;
+      double complete;
+      double transfer;
+      double writeback;
+      double commit;
+    } cputime;
+  } simulator;
+};
+
+#endif // _OOOCORE_H_
diff -r 10448c053ad6 asfoooexec.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/asfoooexec.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,1848 @@
+//
+// PTLsim: Cycle Accurate x86-64 Simulator
+// Out-of-Order Core Simulator With Experimental AMD64 ASF Extension
+// Execution Pipeline Stages: Scheduling, Execution, Broadcast
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+// Copyright 2003-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
+//
+
+#include <globals.h>
+#include <elf.h>
+#include <ptlsim.h>
+#include <branchpred.h>
+#include <datastore.h>
+#include <logic.h>
+#include <dcache.h>
+#include <random_inject.h>
+
+#define INSIDE_OOOCORE
+#include <asfooocore.h>
+#include <stats.h>
+
+#ifndef ENABLE_CHECKS
+#undef assert
+#define assert(x) (x)
+#endif
+
+#ifndef ENABLE_LOGGING
+#undef logable
+#define logable(level) (0)
+#endif
+
+using namespace ASFOutOfOrderModel;
+//
+// Issue Queue
+//
+// Empty the queue and record the owning core id; the uop id and operand tag arrays are reset.
+template <int size, int operandcount>
+void IssueQueue<size, operandcount>::reset(int coreid) {
+  this->coreid = coreid;
+  // Occupancy counter and the per-slot state bitmaps all start out zeroed:
+  count = 0;
+  allready = 0;
+  issued = 0;
+  valid = 0;
+  uopids.reset();
+  foreach (operand, operandcount) tags[operand].reset();
+}
+
+// Recompute allready: valid, unissued slots with no operand tag still marked valid.
+template <int size, int operandcount>
+void IssueQueue<size, operandcount>::clock() {
+  allready = valid;
+  allready &= ~issued;
+  foreach (i, operandcount) allready &= ~tags[i].valid;
+}
+
+//
+// Append uopid in slot number `count`. Operands flagged in preready[] have
+// their tag slot invalidated; the others are tagged with operands[].
+// Returns false (and changes nothing) when the queue is already full.
+//
+template <int size, int operandcount>
+bool IssueQueue<size, operandcount>::insert(tag_t uopid, const tag_t* operands, const tag_t* preready) {
+  if unlikely (count == size) return false;
+
+  int slot = count;
+  count++;
+  assert(!bit(valid, slot));
+
+  uopids.insertslot(slot, uopid);
+  issued[slot] = 0;
+  valid[slot] = 1;
+  foreach (i, operandcount) {
+    if likely (preready[i]) tags[i].invalidateslot(slot); else tags[i].insertslot(slot, operands[i]);
+  }
+  return true;
+}
+
+// Event-log helper: record one EVENT_FORWARD entry for each slot set in mask (no-op when logging is off).
+template <int size, int operandcount>
+void IssueQueue<size, operandcount>::tally_broadcast_matches(IssueQueue<size, operandcount>::tag_t sourceid, const bitvec<size>& mask, int operand) const {
+  if likely (!config.event_log_enabled) return;
+
+  OutOfOrderCore& core = getcore();
+  const ReorderBufferEntry* source = &core.ROB[sourceid];
+
+  // Peel the matching slots off a private copy of the mask, one lsb() at a time:
+  for (bitvec<size> pending = mask; *pending; ) {
+    int slot = pending.lsb();
+    pending[slot] = 0;
+    int robid = uopof(slot);
+    assert(inrange(robid, 0, ROB_SIZE-1));
+    const ReorderBufferEntry* target = &core.ROB[robid];
+    bool is_store = isstore(target->uop.opcode);
+
+    OutOfOrderCoreEvent* event = core.eventlog.add(EVENT_FORWARD, source);
+    event->forwarding.operand = operand;
+    event->forwarding.forward_cycle = source->forward_cycle;
+    event->forwarding.target_uuid = target->uop.uuid;
+    event->forwarding.target_rob = target->index();
+    event->forwarding.target_physreg = target->physreg->index();
+    event->forwarding.target_rfid = target->physreg->rfid;
+    event->forwarding.target_cluster = target->cluster;
+    event->forwarding.target_st = is_store;
+    if (is_store) event->forwarding.target_lsq = target->lsq->index();
+    event->forwarding.target_operands_ready = 0;
+    foreach (i, MAX_OPERANDS) event->forwarding.target_operands_ready |= ((target->operands[i]->ready()) << i);
+    event->forwarding.target_all_operands_ready = target->ready_to_issue();
+  }
+}
+
+// Broadcast tag uopid: invalidate it in every operand tag array, logging the matched slots if enabled.
+template <int size, int operandcount>
+bool IssueQueue<size, operandcount>::broadcast(tag_t uopid) {
+  vec_t needle = assoc_t::prep(uopid);
+  foreach (i, operandcount) {
+    bitvec<size> hits = tags[i].invalidate(needle);
+    if unlikely (config.event_log_enabled) tally_broadcast_matches(uopid, hits, i);
+  }
+  // There is no failure path:
+  return true;
+}
+
+//
+// Pick one slot out of allready (via lsb()), mark it issued and return
+// its index, or -1 if no slot is ready. remove() collapses the queue,
+// so use the returned index before the next call to remove().
+//
+template <int size, int operandcount>
+int IssueQueue<size, operandcount>::issue() {
+  int picked = -1;
+  if (!allready) return picked;
+  picked = allready.lsb();
+  issued[picked] = 1;
+  return picked;
+}
+
+//
+// Put an already-issued uop back into the unissued state.
+// Its operand tags are rewritten from operands[]/preready[], so the
+// caller may add or reset dependencies here as needed. Always returns true.
+//
+template <int size, int operandcount>
+bool IssueQueue<size, operandcount>::replay(int slot, const tag_t* operands, const tag_t* preready) {
+  // Only a valid slot that has issued can be replayed:
+  assert(valid[slot]);
+  assert(issued[slot]);
+  issued[slot] = 0;
+
+  foreach (i, operandcount) {
+    if (!preready[i]) tags[i].insertslot(slot, operands[i]);
+    else tags[i].invalidateslot(slot);
+  }
+
+  return true;
+}
+
+//
+// Delete `slot` from the queue: the uop id array, every operand tag array
+// and the three state bitmaps all drop that entry, and count shrinks by one.
+// NOTE: This is a fairly expensive operation. Always returns true.
+//
+template <int size, int operandcount>
+bool IssueQueue<size, operandcount>::remove(int slot) {
+  foreach (operand, operandcount) tags[operand].collapse(slot);
+  uopids.collapse(slot);
+
+  allready = allready.remove(slot, 1);
+  issued = issued.remove(slot, 1);
+  valid = valid.remove(slot, 1);
+  --count;
+  assert(count >= 0);
+  return true;
+}
+
+// Dump the queue: one line per slot with the uop id, V/I/R state flags and each operand tag.
+template <int size, int operandcount>
+ostream& IssueQueue<size, operandcount>::print(ostream& os) const {
+  os << "IssueQueue: count = ", count, ":", endl;
+  foreach (slot, size) {
+    os << "  uop ";
+    uopids.printid(os, slot);
+    // V = valid, I = issued, R = allready; '-' when the corresponding bit is clear
+    os << ": ", (valid[slot] ? 'V' : '-'), ' ', (issued[slot] ? 'I' : '-'), ' ', (allready[slot] ? 'R' : '-'), ' ';
+    // Operand tags, separated by single spaces:
+    foreach (operand, operandcount) {
+      if (operand > 0) os << ' ';
+      tags[operand].printid(os, slot);
+    }
+    os << endl;
+  }
+  return os;
+}
+
+// Instantiate all methods in the specific IssueQueue sizes we're using:
+declare_issueq_templates;
+
+//
+// Issue a single ROB entry: claim a free functional unit and execute the uop.
+//
+// Returns:
+//  +1 if the uop issued (it either completed or recorded an exception)
+//  ISSUE_NEEDS_REPLAY if no functional unit was available; 0 if a load, store or ASF uop must replay
+//  -1 on misspeculation, refetch or a branch mispredict: stop issuing this cycle
+//
+int ReorderBufferEntry::issue() {
+  // S.D.'s debugging aid (disabled):
+  /*if (uop.opcode == OP_ld_pre) {
+    cerr << __FILE__,":",__LINE__," Cycle: ",sim_cycle," Issueing ", *this, " uop: ",uop," asf: ",(uop.is_asf)?"yes":"no",endl,flush;
+  }*/
+  OutOfOrderCore& core = getcore();
+  OutOfOrderCoreEvent* event = null;
+  // Functional units that can execute this opcode, are part of this cluster and are still available:
+  W32 executable_on_fu = fuinfo[uop.opcode].fu & clusters[cluster].fu_mask & core.fu_avail;
+
+  // Are any FUs available in this cycle?
+  if unlikely (!executable_on_fu) {
+    if unlikely (config.event_log_enabled) {
+      event = core.eventlog.add(EVENT_ISSUE_NO_FU, this);
+      event->issue.fu_avail = core.fu_avail;
+    }
+
+    stats.ooocore.issue.result.no_fu++;
+    //
+    // When this (very rarely) happens, stop issuing uops to this cluster
+    // and try again with the problem uop on the next cycle. In practice
+    // this scenario rarely happens.
+    //
+    issueq_operation_on_cluster(core, cluster, replay(iqslot));
+    return ISSUE_NEEDS_REPLAY;
+  }
+
+  PhysicalRegister& ra = *operands[RA];
+  PhysicalRegister& rb = *operands[RB];
+  PhysicalRegister& rc = *operands[RC];
+
+  //
+  // Check if any other resources are missing that we didn't
+  // know about earlier, and replay like we did above if
+  // needed. This is our last chance to do so.
+  // NOTE(review): no such check is actually performed here.
+
+  stats.summary.uops++;
+  stats.ooocore.issue.uops++;
+
+  fu = lsbindex(executable_on_fu);
+  clearbit(core.fu_avail, fu);
+  core.robs_on_fu[fu] = this;
+  cycles_left = fuinfo[uop.opcode].latency;
+/*S.D.*///cerr<<__FILE__,__LINE__,": ROB ",this," was on list/state ",current_state_list->name," and is issued now!",endl,flush;
+  changestate(core.rob_issued_list[cluster]);
+
+  IssueState state;
+  state.reg.rdflags = 0;
+
+  W64 radata = ra.data;
+  W64 rbdata = (uop.rb == REG_imm) ? uop.rbimm : rb.data;
+  W64 rcdata = (uop.rc == REG_imm) ? uop.rcimm : rc.data;
+
+  bool ld = isload(uop.opcode);
+  bool st = isstore(uop.opcode);
+  bool br = isbranch(uop.opcode);
+  bool pf = isprefetch(uop.opcode);
+  // Operand readiness checks (rc is only checked for non-stores and second-phase stores, rs only for non-stores):
+  assert(operands[RA]->ready());
+  if likely (uop.rb != REG_imm) assert(rb.ready());
+  if likely ((!st || (st && load_store_second_phase)) && (uop.rc != REG_imm)) assert(rc.ready());
+  if likely (!st) assert(operands[RS]->ready());
+  // Per-operand source statistics (skipped for null registers and, for rb/rc, when the immediate field is nonzero):
+  if likely (ra.nonnull()) {
+    ra.get_state_list().issue_source_counter++;
+    ra.all_consumers_sourced_from_bypass &= (ra.state == PHYSREG_BYPASS);
+    per_physregfile_stats_update(stats.ooocore.issue.source, ra.rfid, [ra.state]++);
+  }
+
+  if likely ((!uop.rbimm) & (rb.nonnull())) { 
+    rb.get_state_list().issue_source_counter++;
+    rb.all_consumers_sourced_from_bypass &= (rb.state == PHYSREG_BYPASS);
+    per_physregfile_stats_update(stats.ooocore.issue.source, rb.rfid, [rb.state]++);
+  }
+
+  if unlikely ((!uop.rcimm) & (rc.nonnull())) {
+    rc.get_state_list().issue_source_counter++;
+    rc.all_consumers_sourced_from_bypass &= (rc.state == PHYSREG_BYPASS);
+    per_physregfile_stats_update(stats.ooocore.issue.source, rc.rfid, [rc.state]++);
+  }
+
+  bool propagated_exception = 0;
+  if unlikely ((ra.flags | rb.flags | rc.flags) & FLAG_INV) {
+    //
+    // Invalid data propagated through operands: mark output as
+    // invalid and don't even execute the uop at all.
+    //
+    state.st.invalid = 1;
+    state.reg.rdflags = FLAG_INV;
+    state.reg.rddata = EXCEPTION_Propagate;
+    propagated_exception = 1;
+  } else {
+    stats.ooocore.issue.opclass[opclassof(uop.opcode)]++;
+
+    if unlikely (ld|st) {
+      int completed = (ld) ? issueload(*lsq, origvirt, radata, rbdata, rcdata, pteupdate) : issuestore(*lsq, origvirt, radata, rbdata, rcdata, operands[2]->ready(), pteupdate);
+      if unlikely (completed == ISSUE_MISSPECULATED) {
+        stats.ooocore.issue.result.misspeculated++;
+        return -1;
+      }
+      if unlikely (completed == ISSUE_NEEDS_REFETCH) {
+        stats.ooocore.issue.result.refetch++;
+        return -1;
+      }
+      state.reg.rddata = lsq->data;
+      state.reg.rdflags = (lsq->invalid << log2(FLAG_INV)) | ((!lsq->datavalid) << log2(FLAG_WAIT));
+      if unlikely (completed == ISSUE_NEEDS_REPLAY) {
+        stats.ooocore.issue.result.replay++;
+        return 0;
+      }
+    } else if unlikely (pf) {
+      issueprefetch(state, radata, rbdata, rcdata, uop.cachelevel, pteupdate);
+    } else if unlikely ((uop.opcode == OP_acq) |(uop.opcode == OP_com)){
+      // NOTE: Doing the actual backup copy of the data in the LLB is for now done in commit_asf_instruction() !
+      //       For a more precise simulation this should be tried already here, but the dependencies have to be
+      //       tracked by scanning the LSQ for any pending stores from addresses in the LLB and locked loads, which
+      //       add further addresses to the LLB. That means, that the acquire would have to be modeled as a store
+      //       in the LSQ, in order to allow locked loads that are before (in prog order) the acquire but are
+      //       issued later to find out that there was an acquire that operated on wrong data.
+      //       Somewhat similar to the use of fences in the smt-model!
+      //       Given this complexity, for now creating the snapshot at (in-order) commits seems to be most reasonable!
+      //       S.D.
+      if unlikely (issueasf(state, rbdata) == ISSUE_NEEDS_REPLAY) {
+        stats.ooocore.issue.result.replay++;
+        return 0;
+      }
+    } else {
+      if unlikely (br) {
+        state.brreg.riptaken = uop.riptaken;
+        state.brreg.ripseq = uop.ripseq;
+      }
+      uop.synthop(state, radata, rbdata, rcdata, ra.flags, rb.flags, rc.flags); 
+    }
+  }
+
+  /* Injection of exceptions during the ASF critical section */
+  if unlikely (core.asf_in_crit_sec && asf_exception_critsec()) {
+    state.st.invalid = 1;
+    state.reg.rdflags = FLAG_INV;
+    state.reg.rddata = EXCEPTION_ASF_Testing;
+  }
+  // Publish the result flags and data to the destination physical register:
+  physreg->flags = state.reg.rdflags;
+  physreg->data = state.reg.rddata;
+
+  if unlikely (!physreg->valid()) {
+    //
+    // If the uop caused an exception, force it directly to the commit
+    // state and not through writeback (this keeps dependencies waiting until
+    // they can be properly annulled by the speculation logic.) The commit
+    // stage will detect the exception and take appropriate action.
+    //
+    // If the exceptional uop was speculatively executed beyond a
+    // branch, it will never reach commit anyway since the branch would
+    // have to commit before the exception was ever seen.
+    //
+    cycles_left = 0;
+    changestate(core.rob_ready_to_commit_queue);
+    //
+    // NOTE: The frontend should not necessarily be stalled on exceptions
+    // when extensive speculation is in use, since re-dispatch can be used
+    // without refetching to resolve these situations.
+    //
+    // stall_frontend = true;
+  }
+
+  bool mispredicted = (physreg->data != uop.riptaken); // only acted upon for branches (see the br checks below)
+
+  if unlikely (config.event_log_enabled && (propagated_exception | (!(ld|st)))) {
+    event = core.eventlog.add(EVENT_ISSUE_OK, this);
+    event->issue.state = state;
+    event->issue.cycles_left = cycles_left;
+    event->issue.operand_data[0] = radata;
+    event->issue.operand_data[1] = rbdata;
+    event->issue.operand_data[2] = rcdata;
+    event->issue.operand_flags[0] = ra.flags;
+    event->issue.operand_flags[1] = rb.flags;
+    event->issue.operand_flags[2] = rc.flags;
+    event->issue.mispredicted = br & mispredicted;
+    event->issue.predrip = uop.riptaken;
+  }
+
+  //
+  // Release the issue queue entry, since we are beyond the point of no return:
+  // the uop cannot possibly be replayed at this point, but may still be annulled
+  // or re-dispatched in case of speculation failures.
+  //
+  release();
+  issued = 1;
+
+  if likely (physreg->valid()) {
+    if unlikely (br) {
+      int bptype = uop.predinfo.bptype;
+
+      bool cond = bit(bptype, log2(BRANCH_HINT_COND));
+      bool indir = bit(bptype, log2(BRANCH_HINT_INDIRECT));
+      bool ret = bit(bptype, log2(BRANCH_HINT_RET));
+        
+      if unlikely (mispredicted) {
+        stats.ooocore.branchpred.cond[MISPRED] += cond;
+        stats.ooocore.branchpred.indir[MISPRED] += (indir & !ret);
+        stats.ooocore.branchpred.ret[MISPRED] += ret;
+        stats.ooocore.branchpred.summary[MISPRED]++;
+
+        W64 realrip = physreg->data;
+
+        //
+        // Correct the branch directions and cond code field.
+        // This is required since the branch may again be
+        // re-dispatched if we mis-identified a mispredict
+        // due to very deep speculation.
+        //
+        // Basically the riptaken field must always point
+        // to the correct next instruction in the ROB after
+        // the branch.
+        //
+        if likely (isclass(uop.opcode, OPCLASS_COND_BRANCH)) {
+          assert(realrip == uop.ripseq);
+          uop.cond = invert_cond(uop.cond);
+          
+          //
+          // We need to be careful here: we already looked up the synthop for this
+          // uop according to the old condition, so redo that here so we call the
+          // correct code for the swapped condition.
+          //
+          uop.synthop = get_synthcode_for_cond_branch(uop.opcode, uop.cond, uop.size, 0);
+          swap(uop.riptaken, uop.ripseq);
+        } else if unlikely (isclass(uop.opcode, OPCLASS_INDIR_BRANCH)) {
+          uop.riptaken = realrip;
+          uop.ripseq = realrip;
+        } else if unlikely (isclass(uop.opcode, OPCLASS_UNCOND_BRANCH)) { // unconditional branches need no special handling
+          assert(realrip == uop.riptaken);
+        }
+
+        //
+        // Early misprediction handling. Annul everything after the
+        // branch and restart fetching in the correct direction
+        //
+        core.annul_fetchq();
+        annul_after();
+
+        //
+        // The fetch queue is reset and fetching is redirected to the
+        // correct branch direction.
+        //
+        // Note that we do NOT just reissue the branch - this would be
+        // pointless as we already know the correct direction since
+        // it has already been issued once. Just let it writeback and
+        // commit like it was predicted perfectly in the first place.
+        //
+        core.reset_fetch_unit(realrip);
+        stats.ooocore.issue.result.branch_mispredict++;
+
+        return -1;
+      } else {
+        stats.ooocore.branchpred.cond[CORRECT] += cond;
+        stats.ooocore.branchpred.indir[CORRECT] += (indir & !ret);
+        stats.ooocore.branchpred.ret[CORRECT] += ret;
+        stats.ooocore.branchpred.summary[CORRECT]++;
+        stats.ooocore.issue.result.complete++;
+      }
+    } else {
+      stats.ooocore.issue.result.complete++;
+    }
+  } else {
+    stats.ooocore.issue.result.exception++;
+  }
+
+  return 1;
+}
+
+//
+// Address generation common to both loads and stores.
+//
+// Writes the virtual address to addr (origaddr keeps the pre-alignment value), fills in
+// the address fields of `state` and returns what ctx.check_and_translate() returns, or
+// null when annul is set. rc is accepted but not used.
+//
+void* ReorderBufferEntry::addrgen(LoadStoreQueueEntry& state, Waddr& origaddr, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate, Waddr& addr, int& exception, PageFaultErrorCode& pfec, bool& annul) {
+  Context& ctx = getcore().ctx;
+  bool st = isstore(uop.opcode);
+  int sizeshift = uop.size;
+  int aligntype = uop.cond;
+
+  addr = (st) ? (ra + rb) : ((aligntype == LDST_ALIGN_NORMAL) ? (ra + rb) : ra);
+  //
+  // x86-64 requires virtual addresses to be canonical: if bit 47 is set,
+  // all upper 16 bits must be set. If this is not true, we need to signal
+  // a general protection fault.
+  // NOTE(review): the code below only sign-extends from bit 47 and masks; no fault is raised here.
+  addr = (W64)signext64(addr, 48);
+  addr &= ctx.virt_addr_mask;
+  origaddr = addr;
+  annul = 0;
+
+  switch (aligntype) {
+  case LDST_ALIGN_NORMAL:
+    break;
+  case LDST_ALIGN_LO:
+    addr = floor(addr, 8); break;
+  case LDST_ALIGN_HI:
+    //
+    // Is the high load ever even used? If not, don't check for exceptions;
+    // otherwise we may erroneously flag page boundary conditions as invalid
+    //
+    addr = floor(addr, 8);
+    annul = (floor(origaddr + ((1<<sizeshift)-1), 8) == addr);
+    addr += 8;
+    break;
+  }
+
+  state.physaddr = addr >> 3;
+  state.invalid = 0;
+  //
+  // Notice that datavalid is not set until both the rc operand to
+  // store is ready AND any inherited SFR data is ready to merge.
+  //
+  state.addrvalid = 1;
+  state.datavalid = 0;
+
+  //
+  // Special case: if no part of the actual user load/store falls inside
+  // of the high 64 bits, do not perform the access and do not signal
+  // any exceptions if that page was invalid.
+  //
+  // However, we must be extremely careful if we're inheriting an SFR
+  // from an earlier store: the earlier store may have updated some
+  // bytes in the high 64-bit chunk even though we're not updating
+  // any bytes. In this case we still must do the write since it
+  // could very well be the final commit to that address. In any
+  // case, the SFR mismatch and LSAT must still be checked.
+  //
+  // The store commit code checks if the bytemask is zero and does
+  // not attempt the actual store if so. This will always be correct
+  // for high stores as described in this scenario.
+  //
+  exception = 0;
+
+  // For debugging use only:
+  // if (logable(6)) logfile << intstring(uop.uuid, 20), " adrgen", " rip ", (void*)(Waddr)uop.rip, ": origaddr ", (void*)(Waddr)origaddr, ", virtaddr ", (void*)(Waddr)addr, endl;
+
+  void* mapped = (annul) ? null : ctx.check_and_translate(addr, uop.size, st, uop.internal, exception, pfec, pteupdate);
+  return mapped;
+}
+
+// Shared exception path for loads and stores: marks state invalid and stores the exception
+// code (with pfec in bits 32 and up) as its data. Returns false after the unaligned-access
+// fixup, which annuls and refetches this uop, and true for every other exception.
+bool ReorderBufferEntry::handle_common_load_store_exceptions(LoadStoreQueueEntry& state, Waddr& origaddr, Waddr& addr, int& exception, PageFaultErrorCode& pfec) {
+  OutOfOrderCore& core = getcore();
+  bool is_store = isstore(uop.opcode);
+  bool high_half = (uop.cond == LDST_ALIGN_HI);
+
+  state.invalid = 1;
+  state.data = exception | ((W64)pfec << 32);
+  state.datavalid = 1;
+
+  if unlikely (config.event_log_enabled) core.eventlog.add_load_store((is_store) ? EVENT_STORE_EXCEPTION : EVENT_LOAD_EXCEPTION, this, null, addr);
+
+  if unlikely (exception == EXCEPTION_UnalignedAccess) {
+    //
+    // Unaligned access: set the unaligned bit on the offending uop in its
+    // basic block (uop.bb->transops[uop.bbindex]), annul this uop and everything
+    // after it, and restart fetching at the recovery rip. The frontend will then
+    // split the uop into low and high parts as it is refetched.
+    //
+    if unlikely (config.event_log_enabled) core.eventlog.add_load_store(EVENT_ALIGNMENT_FIXUP, this, null, addr);
+
+    uop.bb->transops[uop.bbindex].unaligned = 1;
+    core.annul_fetchq();
+    W64 recoveryrip = annul_after_and_including();
+    core.reset_fetch_unit(recoveryrip);
+
+    if (is_store) stats.ooocore.dcache.store.issue.unaligned++; else stats.ooocore.dcache.load.issue.unaligned++;
+
+    return false;
+  }
+
+  bool page_fault = (exception == EXCEPTION_PageFaultOnRead) || (exception == EXCEPTION_PageFaultOnWrite);
+  if unlikely (page_fault && high_half) {
+    //
+    // If we have a page fault on an unaligned access, and this is the high
+    // half (ld.hi / st.hi) of that access, the page fault address recorded
+    // in CR2 must be at the very first byte of the second page the access
+    // overlapped onto (otherwise the kernel will repeatedly fault in the
+    // first page, even though that one is already present).
+    //
+    origaddr = addr;
+  }
+
+  if (is_store)
+    stats.ooocore.dcache.store.issue.exception++;
+  else stats.ooocore.dcache.load.issue.exception++;
+  return true;
+}
+
+namespace ASFOutOfOrderModel {
+  // One global interlock buffer for all VCPUs; release_mem_lock() probes and invalidates its entries by lock address:
+  MemoryInterlockBuffer interlocks;
+};
+
+// Drop the memory interlock held by this uop, if any. `forced` only selects which event
+// type is logged. Returns true if a lock was released, false if lock_acquired was clear.
+bool ReorderBufferEntry::release_mem_lock(bool forced) {
+  if likely (!lock_acquired) return false;
+  OutOfOrderCore& core = getcore();
+  W64 lockaddr = lsq->physaddr << 3;
+  MemoryInterlockEntry* held = interlocks.probe(lockaddr);
+  assert(held);
+
+  if unlikely (config.event_log_enabled) {
+    OutOfOrderCoreEvent* ev = core.eventlog.add_load_store((forced) ? EVENT_STORE_LOCK_ANNULLED : EVENT_STORE_LOCK_RELEASED, this, null, lockaddr);
+    ev->loadstore.locking_vcpuid = held->vcpuid;
+    ev->loadstore.locking_uuid = held->uuid;
+    ev->loadstore.locking_rob = held->rob;
+  }
+  // The entry must belong to exactly this VCPU, uop and ROB slot:
+  assert(held->vcpuid == core.ctx.vcpuid);
+  assert(held->uuid == uop.uuid);
+  assert(held->rob == index());
+  interlocks.invalidate(lockaddr);
+  lock_acquired = 0;
+  return true;
+}
+
+//
+// Stores have special dependency rules: they may issue as soon as operands ra and rb are ready,
+// even if rc (the value to store) or rs (the store buffer to inherit from) is not yet ready or
+// even known.
+//
+// After both ra and rb are ready, the store is moved to [ready_to_issue] as a first phase store.
+// When the store issues, it generates its physical address [ra+rb] and establishes an SFR with
+// the address marked valid but the data marked invalid.
+//
+// The sole purpose of doing this is to allow other loads and stores to create an rs dependency
+// on the SFR output of the store.
+//
+// The store is then marked as a second phase store, since the address has been generated.
+// When the store is replayed and rescheduled, it must now have all operands ready this time.
+//
+// Returns ISSUE_COMPLETED when the store data was merged (or an exception was recorded in
+// state), ISSUE_NEEDS_REPLAY when the store must wait for rc or for a prior store, and
+// ISSUE_MISSPECULATED after an aliased load was found or the exception handler returned false.
+//
+int ReorderBufferEntry::issuestore(LoadStoreQueueEntry& state, Waddr& origaddr, W64 ra, W64 rb, W64 rc, bool rcready, PTEUpdate& pteupdate) {
+  time_this_scope(ctissuestore);
+
+  OutOfOrderCore& core = getcore();
+  OutOfOrderCoreEvent* event;
+
+  int sizeshift = uop.size;
+  int aligntype = uop.cond;
+  Waddr addr;
+  int exception = 0;
+  PageFaultErrorCode pfec;
+  bool annul;
+  void* mapped = addrgen(state, origaddr, ra, rb, rc, pteupdate, addr, exception, pfec, annul);
+  if unlikely (exception) {
+    return (handle_common_load_store_exceptions(state, origaddr, addr, exception, pfec)) ? ISSUE_COMPLETED : ISSUE_MISSPECULATED;
+  }
+
+  stats.ooocore.dcache.store.type.aligned += ((!uop.internal) & (aligntype == LDST_ALIGN_NORMAL));
+  stats.ooocore.dcache.store.type.unaligned += ((!uop.internal) & (aligntype != LDST_ALIGN_NORMAL));
+  stats.ooocore.dcache.store.type.internal += uop.internal;
+  stats.ooocore.dcache.store.size[sizeshift]++;
+
+  state.physaddr = (annul) ? 0xffffffffffffffffULL : (mapped_virt_to_phys(mapped) >> 3);
+
+  //
+  // The STQ is then searched for the most recent prior store S to same 64-bit block. If found, U's
+  // rs dependency is set to S by setting the ROB's rs field to point to the prior store's physreg
+  // and hence its ROB. If not found, U's rs dependency remains unset (i.e. to PHYS_REG_NULL).
+  // If some prior stores are ambiguous (addresses not resolved yet), we assume they are a match
+  // to ensure correctness yet avoid additional checks; the store is replayed and tries again
+  // when the ambiguous reference resolves.
+  //
+  LoadStoreQueueEntry* sfra = null;
+
+  foreach_backward_before(core.LSQ, lsq, i) {
+    LoadStoreQueueEntry& stbuf = core.LSQ[i];
+
+    if unlikely (stbuf.store && (!stbuf.addrvalid || (stbuf.addrvalid && (stbuf.physaddr == state.physaddr)))) {
+      assert(stbuf.rob->uop.uuid < uop.uuid);
+      sfra = &stbuf;
+      break;
+    }
+  }
+
+  //
+  // Always update deps in case redispatch is required
+  // because of a future speculation failure: we must
+  // know which loads and stores inherited bogus values
+  //
+  operands[RS]->unref(*this);
+  operands[RS] = (sfra) ? sfra->rob->physreg : &core.physregfiles[0][PHYS_REG_NULL];
+  operands[RS]->addref(*this);
+
+  bool ready = (!sfra || (sfra && sfra->addrvalid && sfra->datavalid)) && rcready;
+
+  //
+  // If any of the following are true:
+  // - Prior store S with same address is found but its data is not ready
+  // - Prior store S with unknown address is found
+  // - Data to store (rc operand) is not yet ready
+  //
+  // Then the store is moved back into [ready_to_dispatch], where this time all operands are checked.
+  // The replay() function will put the newly selected prior store S's ROB as the rs dependency
+  // of the current store before replaying it.
+  //
+  // When the current store wakes up again, it will rescan the STQ to see if any intervening stores
+  // slipped in, and may repeatedly go back to sleep on the new store until the entire chain of stores
+  // to a given location is resolved in the correct order. This does not mean all stores must issue in
+  // program order - it simply means stores to the same address (8-byte chunk) are serialized in
+  // program order, but out of order w.r.t. unrelated stores. This is similar to the constraints on
+  // store buffer merging in Pentium 4 and AMD K8.
+  //
+
+  if unlikely (!ready) {
+    if unlikely (config.event_log_enabled) {
+      event = core.eventlog.add_load_store(EVENT_STORE_WAIT, this, sfra, addr);
+      event->loadstore.rcready = rcready;
+    }
+
+    replay();
+    load_store_second_phase = 1;
+
+    stats.ooocore.dcache.store.issue.replay.sfr_addr_and_data_and_data_to_store_not_ready += ((!rcready) & (sfra && (!sfra->addrvalid) & (!sfra->datavalid)));
+    stats.ooocore.dcache.store.issue.replay.sfr_addr_and_data_to_store_not_ready += ((!rcready) & (sfra && (!sfra->addrvalid)));
+    stats.ooocore.dcache.store.issue.replay.sfr_data_and_data_to_store_not_ready += ((!rcready) & (sfra && sfra->addrvalid && (!sfra->datavalid)));
+    // NOTE(review): the second counter above does not test datavalid, so it also fires whenever the first one does - confirm intended.
+    stats.ooocore.dcache.store.issue.replay.sfr_addr_and_data_not_ready += (rcready & (sfra && (!sfra->addrvalid) & (!sfra->datavalid)));
+    stats.ooocore.dcache.store.issue.replay.sfr_addr_not_ready += (rcready & (sfra && ((!sfra->addrvalid) & (sfra->datavalid))));
+    stats.ooocore.dcache.store.issue.replay.sfr_data_not_ready += (rcready & (sfra && (sfra->addrvalid & (!sfra->datavalid))));
+
+    return ISSUE_NEEDS_REPLAY;
+  }
+
+  //
+  // Load/Store Aliasing Prevention
+  //
+  // We always issue loads as soon as possible even if some entries in the
+  // store queue have unresolved addresses. If a load gets erroneously
+  // issued before an earlier store in program order to the same address,
+  // this is considered load/store aliasing.
+  //
+  // Aliasing is detected when stores issue: the load queue is scanned
+  // for earlier loads in program order which collide with the store's
+  // address. In this case all uops in program order after and including
+  // the store (and by extension, the colliding load) must be annulled.
+  //
+  // To keep this from happening repeatedly, whenever a collision is
+  // detected, the store looks up the rip of the colliding load and adds
+  // it to a small table called the LSAP (load/store alias predictor).
+  //
+  // Loads query the LSAP with the rip of the load; if a matching entry
+  // is found in the LSAP and the store address is unresolved, the load
+  // is not allowed to proceed.
+  //
+  // Check all later loads in LDQ to see if any have already issued
+  // and have already obtained their data but really should have
+  // depended on the data generated by this store. If so, mark the
+  // store as invalid (EXCEPTION_LoadStoreAliasing) so it annuls
+  // itself and the load after it in program order at commit time.
+  //
+  foreach_forward_after (core.LSQ, lsq, i) {
+    LoadStoreQueueEntry& ldbuf = core.LSQ[i];
+    //
+    // (see notes on Load Replay Conditions below)
+    //
+
+    if unlikely ((!ldbuf.store) & ldbuf.addrvalid & ldbuf.rob->issued & (ldbuf.physaddr == state.physaddr)) {
+      //
+      // Check for the extremely rare case where:
+      // - load is in the ready_to_load state at the start of the simulated
+      //   cycle, and is processed by load_issue()
+      // - that load gets its data forwarded from a store (i.e., the store
+      //   being handled here) scheduled for execution in the same cycle
+      // - the load and the store alias each other
+      //
+      // Handle this by checking the list of addresses for loads processed
+      // in the same cycle, and only signal a load speculation failure if
+      // the aliased load truly came at least one cycle before the store.
+      //
+      int parallel_forwarding_match = 0;
+      foreach (i, core.loads_in_this_cycle) {
+        parallel_forwarding_match |= (core.load_to_store_parallel_forwarding_buffer[i] == state.physaddr);
+      }
+
+      if unlikely (parallel_forwarding_match) {
+        if unlikely (config.event_log_enabled) event = core.eventlog.add_load_store(EVENT_STORE_PARALLEL_FORWARDING_MATCH, this, &ldbuf, addr);
+        stats.ooocore.dcache.store.parallel_aliasing++;
+        continue;
+      }
+
+      state.invalid = 1;
+      state.data = EXCEPTION_LoadStoreAliasing;
+      state.datavalid = 1;
+
+      if unlikely (config.event_log_enabled) event = core.eventlog.add_load_store(EVENT_STORE_ALIASED_LOAD, this, &ldbuf, addr);
+
+      // Add the rip to the load to the load/store alias predictor:
+      core.lsap.select(ldbuf.rob->uop.rip);
+      //
+      // The load is dependent on this store. Re-point the load's rs operand
+      // at this store's physreg (accounting the reference to the load's own
+      // ROB entry) so the normal redispatch mechanism will find this.
+      //
+      ldbuf.rob->operands[RS]->unref(*ldbuf.rob);
+      ldbuf.rob->operands[RS] = physreg;
+      ldbuf.rob->operands[RS]->addref(*ldbuf.rob);
+
+      redispatch_dependents();
+
+      stats.ooocore.dcache.store.issue.ordering++;
+
+      return ISSUE_MISSPECULATED;
+    }
+  }
+
+  //
+  // At this point all operands are valid, so merge the data and mark the store as valid.
+  //
+
+  byte bytemask = 0;
+
+  switch (aligntype) {
+  case LDST_ALIGN_NORMAL:
+  case LDST_ALIGN_LO:
+    bytemask = ((1 << (1 << sizeshift))-1) << (lowbits(origaddr, 3));
+    rc <<= 8*lowbits(origaddr, 3);
+    break;
+  case LDST_ALIGN_HI:
+    bytemask = ((1 << (1 << sizeshift))-1) >> (8 - lowbits(origaddr, 3));
+    rc >>= 8*(8 - lowbits(origaddr, 3));
+  }
+
+  state.invalid = 0;
+  state.data = (sfra) ? mux64(expand_8bit_to_64bit_lut[bytemask], sfra->data, rc) : rc;
+  state.bytemask = (sfra) ? (sfra->bytemask | bytemask) : bytemask;
+  state.datavalid = 1;
+
+  stats.ooocore.dcache.store.forward.zero += (sfra == null);
+  stats.ooocore.dcache.store.forward.sfr += (sfra != null);
+  stats.ooocore.dcache.store.datatype[uop.datatype]++;
+
+  if unlikely (config.event_log_enabled) {
+    event = core.eventlog.add_load_store(EVENT_STORE_ISSUED, this, sfra, addr);
+    event->loadstore.data_to_store = rc;
+  }
+
+  load_store_second_phase = 1;
+
+  stats.ooocore.dcache.store.issue.complete++;
+
+  return ISSUE_COMPLETED;
+}
+
+// Read a (1 << SIZESHIFT)-byte value at target and widen it to 64 bits,
+// sign-extending when SIGNEXT is set; 8-byte reads are returned unchanged.
+// Any SIZESHIFT outside 0..3 yields 0 instead of an indeterminate value.
+static inline W64 extract_bytes(void* target, int SIZESHIFT, bool SIGNEXT) {
+  W64 data = 0;
+  switch (SIZESHIFT) {
+  case 0: data = (SIGNEXT) ? (W64s)(*(W8s*)target) : (*(W8*)target); break;
+  case 1: data = (SIGNEXT) ? (W64s)(*(W16s*)target) : (*(W16*)target); break;
+  case 2: data = (SIGNEXT) ? (W64s)(*(W32s*)target) : (*(W32*)target); break;
+  case 3: data = *(W64*)target; break;
+  default: break; // unknown size: keep the zero default
+  }
+  return data;
+}
+
+int ReorderBufferEntry::issueload(LoadStoreQueueEntry& state, Waddr& origaddr, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate) {
+  time_this_scope(ctissueload);
+  // Issue a load uop: generate its address, look for an earlier store to forward from, then complete, replay or start a cache miss.
+  OutOfOrderCore& core = getcore();
+  OutOfOrderCoreEvent* event;
+  // Returns ISSUE_COMPLETED, ISSUE_NEEDS_REPLAY or ISSUE_MISSPECULATED.
+  int sizeshift = uop.size; // the access is (1 << sizeshift) bytes wide (see extract_bytes)
+  int aligntype = uop.cond; // compared against the LDST_ALIGN_* values below
+  bool signext = (uop.opcode == OP_ldx); // only OP_ldx sign-extends the loaded value
+
+  Waddr addr;
+  int exception = 0;
+  PageFaultErrorCode pfec;
+  bool annul;
+  
+  void* mapped = addrgen(state, origaddr, ra, rb, rc, pteupdate, addr, exception, pfec, annul);
+/*S.D. TESTING: */ if unlikely (uop.is_asf && asf_load_exception()) exception = EXCEPTION_ASF_Testing;
+  if unlikely (exception) {
+    return (handle_common_load_store_exceptions(state, origaddr, addr, exception, pfec)) ? ISSUE_COMPLETED : ISSUE_MISSPECULATED;
+  }
+
+  stats.ooocore.dcache.load.type.aligned += ((!uop.internal) & (aligntype == LDST_ALIGN_NORMAL));
+  stats.ooocore.dcache.load.type.unaligned += ((!uop.internal) & (aligntype != LDST_ALIGN_NORMAL));
+  stats.ooocore.dcache.load.type.internal += uop.internal;
+  stats.ooocore.dcache.load.size[sizeshift]++;
+
+  state.physaddr = (annul) ? 0xffffffffffffffffULL : (mapped_virt_to_phys(mapped) >> 3); // kept in 8-byte units; all ones when annulled
+
+  if unlikely (uop.is_asf) {
+    // NOTE: Adding of address to LLB is now done at commit time in commit_asf_instruction() !
+    //       If a more reasonable model is to be designed, adding the address to the LLB should be
+    //       done here!
+    //       S.D.
+    /* LOCKed loads are illegal inside an ASF-critical-section */
+    //TODO: Make this OOO-aware! I.e. own state, own LLB or synchronise!
+    if unlikely (core.asf_in_crit_sec) {
+      //TODO: Handle the exception, by writing the proper exception code into the physreg!
+    }
+    /* Add the address to the LLB */
+    cerr<<__FILE__,__LINE__,": Hi we: ", uop, "@", uop.rip, " add to the LLB!", endl, flush;
+    llbline = core.locked_line_buffer.add_location(state.physaddr << 3); // shifted back to a byte address
+  }
+
+  //
+  // For simulation purposes only, load the data immediately
+  // so it is easier to track. In the hardware this obviously
+  // only arrives later, but it saves us from having to copy
+  // cache lines around...
+  //
+  barrier();
+  W64 data = (annul) ? 0 : *((W64*)(Waddr)floor(signext64((Waddr)mapped, 48), 8)); // a full W64 read at the floor(.., 8) address; 0 when annulled
+
+  LoadStoreQueueEntry* sfra = null; // the store this load depends on, if any is found below
+
+  bool load_is_known_to_alias_with_store = (core.lsap(uop.rip) >= 0);
+
+  foreach_backward_before(core.LSQ, lsq, i) {
+    LoadStoreQueueEntry& stbuf = core.LSQ[i];
+
+    if likely (!stbuf.store) continue;
+    // Stop at the first store visited that matches our physaddr or, for a predicted-alias load, has no valid address yet
+    if unlikely ((load_is_known_to_alias_with_store & (!stbuf.addrvalid)) || ((stbuf.physaddr == state.physaddr) & stbuf.addrvalid)) {
+      stats.ooocore.dcache.load.dependency.predicted_alias_unresolved += (load_is_known_to_alias_with_store);
+      stats.ooocore.dcache.load.dependency.stq_address_match += (!load_is_known_to_alias_with_store);
+      sfra = &stbuf;
+      break;
+    }
+  }
+
+  stats.ooocore.dcache.load.dependency.independent += (sfra == null);
+
+  bool ready = (!sfra || (sfra && sfra->addrvalid && sfra->datavalid)); // a selected store must have both address and data valid
+
+  //
+  // Always update deps in case redispatch is required
+  // because of a future speculation failure: we must
+  // know which loads and stores inherited bogus values
+  //
+  operands[RS]->unref(*this);
+  operands[RS] = (sfra) ? sfra->rob->physreg : &core.physregfiles[0][PHYS_REG_NULL];
+  operands[RS]->addref(*this);
+
+  if unlikely (!ready) {
+    //
+    // Load Replay Conditions:
+    //
+    // - Earlier store is known to alias (based on rip) yet its address is not yet resolved
+    // - Earlier store to the same 8-byte chunk was found but its data has not yet arrived
+    //
+    // In these cases we create an rs dependency on the earlier store and replay the load uop
+    // back to the dispatched state. It will be re-issued once the earlier store resolves.
+    //
+    // Consider the following sequence of events:
+    // - Load B issues
+    // - Store A issues and detects aliasing with load B; both A and B annulled
+    // - Load B attempts to re-issue but aliasing is predicted, so it creates a dependency on store A
+    // - Store A issues but sees that load B has already attempted to issue, so an aliasing replay is taken
+    //
+    // This becomes an infinite loop unless we clear both the addrvalid and datavalid fields of loads
+    // when they replay; clearing both suppresses the aliasing replay the second time around.
+    //
+
+    assert(sfra);
+
+    if unlikely (config.event_log_enabled) {
+      event = core.eventlog.add_load_store(EVENT_LOAD_WAIT, this, sfra, addr);
+      event->loadstore.predicted_alias = (load_is_known_to_alias_with_store && sfra && (!sfra->addrvalid));
+    }
+
+    stats.ooocore.dcache.load.issue.replay.sfr_addr_and_data_not_ready += ((!sfra->addrvalid) & (!sfra->datavalid));
+    stats.ooocore.dcache.load.issue.replay.sfr_addr_not_ready += ((!sfra->addrvalid) & (sfra->datavalid));
+    stats.ooocore.dcache.load.issue.replay.sfr_data_not_ready += ((sfra->addrvalid) & (!sfra->datavalid));
+
+    replay();
+    load_store_second_phase = 1;
+    return ISSUE_NEEDS_REPLAY;
+  }
+
+  state.addrvalid = 1;
+
+  if unlikely (aligntype == LDST_ALIGN_HI) {
+    //
+    // Concatenate the aligned data from a previous ld.lo uop provided in rb
+    // with the currently loaded data D as follows:
+    //
+    // rb | D
+    //
+    // Example:
+    //
+    // floor(a) floor(a)+8
+    // ---rb--  --DD---
+    // 0123456701234567
+    //    XXXXXXXX
+    //    ^ origaddr
+    //
+    if likely (!annul) {
+      if unlikely (sfra) data = mux64(expand_8bit_to_64bit_lut[sfra->bytemask], data, sfra->data);
+      
+      struct {
+        W64 lo;
+        W64 hi;
+      } aligner;
+      
+      aligner.lo = rb;
+      aligner.hi = data;
+      
+      W64 offset = lowbits(origaddr - floor(origaddr, 8), 4);
+
+      data = extract_bytes(((byte*)&aligner) + offset, sizeshift, signext);
+    } else {
+      //
+      // annulled: we need no data from the high load anyway; only use the low data
+      // that was already checked for exceptions and forwarding:
+      //
+      W64 offset = lowbits(origaddr, 3);
+      state.data = extract_bytes(((byte*)&rb) + offset, sizeshift, signext);
+      state.invalid = 0;
+      state.datavalid = 1;
+
+      if unlikely (config.event_log_enabled) core.eventlog.add_load_store(EVENT_LOAD_HIGH_ANNULLED, this, sfra, addr);
+
+      return ISSUE_COMPLETED;
+    }
+  } else {
+    if unlikely (sfra) data = mux64(expand_8bit_to_64bit_lut[sfra->bytemask], data, sfra->data);
+    data = extract_bytes(((byte*)&data) + lowbits(addr, 3), sizeshift, signext);
+  }
+
+  // covered is only used to split the forwarding statistics below
+  bool covered = core.caches.covered_by_sfr(addr, sfra, sizeshift);
+  stats.ooocore.dcache.load.forward.cache += (sfra == null);
+  stats.ooocore.dcache.load.forward.sfr += ((sfra != null) & covered);
+  stats.ooocore.dcache.load.forward.sfr_and_cache += ((sfra != null) & (!covered));
+  stats.ooocore.dcache.load.datatype[uop.datatype]++;
+
+  //
+  // NOTE: Technically the data is valid right now for simulation purposes
+  // only; in reality it may still be arriving from the cache.
+  //
+  state.data = data;
+  state.invalid = 0;
+  state.bytemask = 0xff;
+
+  // Internal loads don't hit the cache hierarchy, but rather complete in two cycles.
+  if unlikely (uop.internal) {
+    cycles_left = LOADLAT;
+
+    if unlikely (config.event_log_enabled) core.eventlog.add_load_store(EVENT_LOAD_HIT, this, sfra, addr);
+
+    load_store_second_phase = 1;
+    state.datavalid = 1;
+    physreg->flags &= ~FLAG_WAIT;
+    physreg->complete();
+    // TODO: Is this necessary? We should be on the issued_list_already!
+    changestate(core.rob_issued_list[cluster]);
+    lfrqslot = -1;
+    forward_cycle = 0;
+
+    return ISSUE_COMPLETED;
+  }
+
+  bool L1hit = (config.perfect_cache) ? 1 : core.caches.probe_cache_and_sfr(addr, sfra, sizeshift); // perfect_cache treats every load as a hit
+
+  if likely (L1hit) {    
+    cycles_left = LOADLAT;
+
+    if unlikely (config.event_log_enabled) core.eventlog.add_load_store(EVENT_LOAD_HIT, this, sfra, addr);
+    // NOTE(review): the buffer receives floor(addr, 8), yet issuestore compares its entries with state.physaddr (8-byte units) - confirm
+    assert(core.loads_in_this_cycle < LOAD_FU_COUNT);
+    core.load_to_store_parallel_forwarding_buffer[core.loads_in_this_cycle++] = floor(addr, 8);
+    
+    load_store_second_phase = 1;
+    state.datavalid = 1;
+
+    stats.ooocore.dcache.load.issue.complete++;
+    per_context_dcache_stats_update(0, load.hit.L1++);
+    return ISSUE_COMPLETED;
+  }
+
+  stats.ooocore.dcache.load.issue.miss++;
+
+  cycles_left = 0;
+  changestate(core.rob_cache_miss_list); // dcache_wakeup() asserts the entry is on this list when the data arrives
+
+  LoadStoreInfo lsi;
+  lsi.rob = index();
+  lsi.sizeshift = sizeshift;
+  lsi.aligntype = aligntype;
+  lsi.sfrused = (sfra != null);
+  lsi.internal = uop.internal;
+  lsi.signext = signext;
+  // NOTE(review): sfra is null when no store was selected above, so *sfra below forms a reference from a null pointer - confirm the callee expects this
+  //
+  // NOTE: this state is not really used anywhere since load misses
+  // will fill directly into the physical register instead.
+  //
+  lfrqslot = core.caches.issueload_slowpath(addr, origvirt, *sfra, lsi);
+
+  if unlikely (config.event_log_enabled) event = core.eventlog.add_load_store(EVENT_LOAD_MISS, this, sfra, addr);
+
+  if unlikely (lfrqslot < 0) {
+    if unlikely (config.event_log_enabled) core.eventlog.add_load_store(EVENT_LOAD_LFRQ_FULL, this, null, addr);
+    stats.ooocore.dcache.load.issue.replay.missbuf_full++;
+
+    release_mem_lock(true);
+    state.addrvalid = 0; // no slow-path slot was obtained: undo the address and retry via replay
+    replay();
+    return ISSUE_NEEDS_REPLAY;
+  }
+  // Same per-cycle bookkeeping as on the L1 hit path (the same unit question applies)
+  assert(core.loads_in_this_cycle < LOAD_FU_COUNT);
+  core.load_to_store_parallel_forwarding_buffer[core.loads_in_this_cycle++] = floor(addr, 8);
+
+  return ISSUE_COMPLETED;
+}
+
+void ReorderBufferEntry::issueprefetch(IssueState& state, W64 ra, W64 rb, W64 rc, int cachelevel, PTEUpdate& pteupdate) {
+//cerr <<__FILE__,__LINE__,": Issueing prefetch:",uop, (uop.is_asf)?" ASF!":"",endl,flush;
+  OutOfOrderCore& core = getcore();
+  // Issue a prefetch: generate the address and, unless it faults or is annulled, pass it to core.caches.initiate_prefetch().
+  state.reg.rddata = 0;
+  state.reg.rdflags = 0;
+
+  int exception = 0;
+  Waddr addr;
+  PTEUpdate dummy_pteu;
+  Waddr origaddr;
+  PageFaultErrorCode pfec;
+  bool annul;
+
+  LoadStoreQueueEntry dummy;
+  setzero(dummy); // local scratch entry handed to addrgen; it is not used after this function
+  void* mapped = addrgen(dummy, origaddr, ra, rb, rc, uop.is_asf ? pteupdate : dummy_pteu,
+                         addr, exception, pfec, annul); // only ASF prefetches pass the caller's pteupdate through
+
+  /*S.D. TESTING: */ if unlikely (uop.is_asf && asf_prefetch_exception()) exception = EXCEPTION_ASF_Testing;
+  // Ignore bogus prefetches:
+  if unlikely (exception) {
+//    cerr <<__FILE__,__LINE__,": Prefetch has exception ",exception_name(exception),endl,flush;
+    if unlikely(uop.is_asf) {
+      /* Prefetches used for specification in ASF do generate pagefaults and clear the LLB when committed! */
+      state.reg.rdflags |= FLAG_INV;
+      state.reg.rddata   = exception | ((W64)pfec << 32); // exception code in the low bits, pfec from bit 32 upwards
+    }
+    return;
+  }
+  // Ignore unaligned prefetches (should never happen)
+  if unlikely (annul) {
+    return;
+  }
+
+  // (Stats are already updated by initiate_prefetch())
+
+  Waddr physaddr = (annul) ? 0 : Waddr(mapped_virt_to_phys(mapped)); // annul is always false here because of the early return above
+//cerr <<__FILE__,__LINE__,": Issueing prefetch:",uop," address ", physaddr, (uop.is_asf)?" ASF!":"",endl,flush;
+  if unlikely (uop.is_asf) {
+    if unlikely (core.asf_in_crit_sec) {
+      //TODO: Create proper exception in the physreg!
+    }
+    /* Add the address to the LLB */
+    cerr<<__FILE__,__LINE__,": Hi we: ", uop, "@", uop.rip, " add to the LLB!", endl, flush;
+    llbline = core.locked_line_buffer.add_location(physaddr); // byte address; issueload passes state.physaddr << 3
+  }
+
+  core.caches.initiate_prefetch(physaddr, origvirt, cachelevel);
+}
+
+//
+// The data cache delivered a load: complete the ROB entry named by lsi.rob (physaddr is not used)
+//
+void OutOfOrderCoreCacheCallbacks::dcache_wakeup(LoadStoreInfo lsi, W64 physaddr) {
+  ReorderBufferEntry* missed_load = &core.ROB[lsi.rob];
+  assert(missed_load->current_state_list == &core.rob_cache_miss_list);
+  missed_load->loadwakeup();
+}
+
+void ReorderBufferEntry::loadwakeup() {
+  OutOfOrderCore& core = getcore();
+  if unlikely (config.event_log_enabled) core.eventlog.add_load_store(EVENT_LOAD_WAKEUP, this);
+
+  // Mark the result register and LSQ entry valid, move to the completed list, then clear the per-issue fields
+  physreg->flags &= ~FLAG_WAIT;
+  physreg->complete();
+  lsq->datavalid = 1;
+  changestate(core.rob_completed_list[cluster]);
+  fu = 0;
+  forward_cycle = 0;
+  lfrqslot = -1;
+  cycles_left = 0;
+}
+
+//
+// Replay the uop by recirculating it back to the dispatched
+// state so it can wait for additional dependencies not known
+// when it was originally dispatched, e.g. waiting on store
+// queue entries or value to store, etc.
+//
+// This involves re-initializing the uop's operands in its
+// already assigned issue queue slot and returning that slot
+// to the dispatched but not issued state.
+//
+// This must be done here instead of simply sending the uop
+// back to the dispatch state since otherwise we could have 
+// a deadlock if there is not enough room in the issue queue.
+//
+void ReorderBufferEntry::replay() {
+  OutOfOrderCore& core = getcore();
+
+  if unlikely (config.event_log_enabled) {
+    OutOfOrderCoreEvent* event = core.eventlog.add(EVENT_REPLAY, this);
+    foreach (i, MAX_OPERANDS) {
+      operands[i]->fill_operand_info(event->replay.opinfo[i]);
+      event->replay.ready |= (operands[i]->ready()) << i;
+    }
+  }
+
+  assert(!lock_acquired);
+
+  int operands_still_needed = 0;
+  // Per-operand tag and ready flag handed back to the issue queue slot
+  issueq_tag_t uopids[MAX_OPERANDS];
+  issueq_tag_t preready[MAX_OPERANDS];
+
+  foreach (operand, MAX_OPERANDS) {
+    PhysicalRegister& source_physreg = *operands[operand];
+
+    if likely (source_physreg.state == PHYSREG_WAITING) {
+      // rob is dereferenced only here: a register that is not waiting may have no producer ROB
+      uopids[operand] = source_physreg.rob->index();
+      preready[operand] = 0;
+      operands_still_needed++;
+    } else {
+      // No need to wait for it
+      uopids[operand] = 0;
+      preready[operand] = 1;
+    }
+  }
+
+  if unlikely (operands_still_needed) {
+    changestate(core.rob_dispatched_list[cluster]);
+  } else {
+    changestate(get_ready_to_issue_list());
+  }
+
+  issueq_operation_on_cluster(core, cluster, replay(iqslot, uopids, preready)); // re-arm the slot this uop already owns
+}
+
+//
+// Give up this ROB's issue queue slot. Call only once the uop can
+// no longer be pulled back for replay or annulment.
+//
+void ReorderBufferEntry::release() {
+  OutOfOrderCore& core = getcore();
+  issueq_operation_on_cluster(core, cluster, release(iqslot));
+  iqslot = -1;
+}
+
+//
+// Process the ready to issue queue and issue as many ROBs as possible
+//
+
+int OutOfOrderCore::issue(int cluster) {
+  time_this_scope(ctissue);
+
+  // Number of issue attempts made in this call; this is also the return value
+  int issuecount = 0;
+
+  int maxwidth = clusters[cluster].issue_width;
+
+  while (issuecount < maxwidth) {
+    int iqslot;
+    issueq_operation_on_cluster_with_result(getcore(), cluster, iqslot, issue());
+
+    // Is anything ready?
+    if unlikely (iqslot < 0) break;
+
+    int robid;
+    issueq_operation_on_cluster_with_result(getcore(), cluster, robid, uopof(iqslot));
+    assert(inrange(robid, 0, ROB_SIZE-1));
+    ReorderBufferEntry& rob = ROB[robid];
+    rob.iqslot = iqslot;
+    int rc = rob.issue();
+    // Count the attempt first, then stop issuing from this cluster once something replays or has a mis-speculation
+    issuecount++;
+    if unlikely (rc <= 0) break;
+  }
+
+  per_cluster_stats_update(stats.ooocore.issue.width, cluster, [min(issuecount, MAX_ISSUE_WIDTH)]++);
+
+  return issuecount;
+}
+
+//
+// Forward the result of ROB 'result' to any other waiting ROBs
+// dispatched to the issue queues. This is done by broadcasting
+// the ROB tag to all issue queues in clusters reachable within
+// N cycles after the uop issued, where N is forward_cycle. This
+// technique is used to model arbitrarily complex multi-cycle
+// forwarding networks.
+//
+int ReorderBufferEntry::forward() {
+  // Broadcast this ROB's index to every cluster whose bit is set in the forwarding
+  // table entry for (cluster, forward_cycle). Always returns 0; no wakeups are counted.
+
+  assert(inrange((int)forward_cycle, 0, (MAX_FORWARDING_LATENCY+1)-1));
+
+  W32 targets = forward_at_cycle_lut[cluster][forward_cycle];
+  foreach (i, MAX_CLUSTERS) {
+    if likely (!bit(targets, i)) continue;
+    if unlikely (config.event_log_enabled) {
+      OutOfOrderCoreEvent* event = getcore().eventlog.add(EVENT_BROADCAST, this);
+      event->forwarding.target_cluster = i;
+      event->forwarding.forward_cycle = forward_cycle;
+    }
+
+    issueq_operation_on_cluster(getcore(), i, broadcast(index()));
+  }
+
+  return 0;
+}
+
+//
+// Exception recovery and redispatch
+//
+// Remove any and all ROBs that entered the pipeline after and
+// including the misspeculated uop. Because we move all affected
+// ROBs to the free state, they are instantly taken out of 
+// consideration for future pipeline stages and will be dropped on 
+// the next cycle.
+//
+// Normally this means that mispredicted branch uops are annulled 
+// even though only the code after the branch itself is invalid.
+// In this special case, the recovery rip is set to the actual
+// target of the branch rather than refetching the branch insn.
+//
+// We must be extremely careful to annul all uops in an
+// x86 macro-op; otherwise half the x86 instruction could
+// be executed twice once refetched. Therefore, if the
+// first uop to annul is not also the first uop in the x86
+// macro-op, we may have to scan backwards in the ROB until
+// we find the first uop of the macro-op. In this way, we
+// ensure that we can annul the entire macro-op. All uops
+// comprising the macro-op are guaranteed to still be in 
+// the ROB since none of the uops commit until the entire
+// macro-op can commit.
+//
+// Note that this does not apply if the final uop in the
+// macro-op is a branch and that branch uop itself is
+// being retained as occurs with mispredicted branches.
+//
+
+W64 ReorderBufferEntry::annul(bool keep_misspec_uop, bool return_first_annulled_rip) {
+  OutOfOrderCore& core = getcore();
+  OutOfOrderCoreEvent* event;
+  // keep_misspec_uop: start annulling after this uop's macro-op instead of at the macro-op's first uop (used for branches, see below).
+  int idx;
+  // return_first_annulled_rip: return core.ROB[startidx].uop.rip unconditionally (see the end of the function).
+  //
+  // Pass 0: determine macro-op boundaries around uop
+  //
+  int somidx = index();
+  while (!core.ROB[somidx].uop.som) somidx = add_index_modulo(somidx, -1, ROB_SIZE);
+  int eomidx = index();
+  while (!core.ROB[eomidx].uop.eom) eomidx = add_index_modulo(eomidx, +1, ROB_SIZE);
+
+  // Find uop to start annulment at
+  int startidx = (keep_misspec_uop) ? add_index_modulo(eomidx, +1, ROB_SIZE) : somidx;
+  if unlikely (startidx == core.ROB.tail) {
+    // The uop causing the mis-speculation was the only uop in the ROB:
+    // no action is necessary (but in practice this is generally not possible)
+    if unlikely (config.event_log_enabled) {
+      OutOfOrderCoreEvent* event = core.eventlog.add(EVENT_ANNUL_NO_FUTURE_UOPS, this);
+      event->annul.somidx = somidx; event->annul.eomidx = eomidx;
+    }
+
+    return uop.rip; // nothing was annulled: this uop's own rip is returned
+  }
+
+  // Find uop to stop annulment at (later in program order)
+  int endidx = add_index_modulo(core.ROB.tail, -1, ROB_SIZE);
+
+  // For branches, branch must always terminate the macro-op
+  if (keep_misspec_uop) assert(eomidx == index());
+
+  if unlikely (config.event_log_enabled) {
+    event = core.eventlog.add(EVENT_ANNUL_MISSPECULATION, this);
+    event->annul.startidx = startidx; event->annul.endidx = endidx;
+    event->annul.somidx = somidx; event->annul.eomidx = eomidx;
+  }
+
+  //
+  // Pass 1: invalidate issue queue slot for the annulled ROB
+  //
+  idx = endidx; // walk from endidx back to startidx, inclusive
+  for (;;) {
+    ReorderBufferEntry& annulrob = core.ROB[idx];
+    issueq_operation_on_cluster(core, annulrob.cluster, annuluop(annulrob.index()));
+    annulrob.iqslot = -1;
+    if unlikely (idx == startidx) break;
+    idx = add_index_modulo(idx, -1, ROB_SIZE);
+  }
+
+  int annulcount = 0; // incremented in pass 3 but not otherwise used in this function
+
+  //
+  // Pass 2: reconstruct the SpecRRT as it existed just before (or after)
+  // the mis-speculated operation. This is done using the fast flush with
+  // pseudo-commit method as follows:
+  //
+  // First overwrite the SpecRRT with the CommitRRT.
+  //
+  // Then, simulate the commit of all non-speculative ROBs up to the branch
+  // by updating the SpecRRT as if it were the CommitRRT. This brings the
+  // speculative RRT to the same state as if all in flight nonspeculative
+  // operations before the branch had actually committed. Resume instruction 
+  // fetch at the correct branch target.
+  //
+  // Other methods (like backwards walk) are difficult to impossible because
+  // of the requirement that flag rename tables be restored even if some
+  // of the required physical registers with attached flags have since been
+  // freed. Therefore we don't do this.
+  //
+  // Technically RRT checkpointing could be used but due to the load/store
+  // replay mechanism in use, this would require a checkpoint at every load
+  // and store as well as branches.
+  //
+  foreach (i, TRANSREG_COUNT) { core.specrrt[i]->unspecref(i); }
+  core.specrrt = core.commitrrt;
+  foreach (i, TRANSREG_COUNT) { core.specrrt[i]->addspecref(i); }
+
+  // if (logable(6)) logfile << "Restored SpecRRT from CommitRRT; walking forward from:", endl, core.specrrt, endl;
+
+  idx = core.ROB.head; // (redundant: the for-init below assigns the same value)
+  for (idx = core.ROB.head; idx != startidx; idx = add_index_modulo(idx, +1, ROB_SIZE)) {
+    ReorderBufferEntry& rob = core.ROB[idx];
+    rob.pseudocommit();
+  }
+
+  // if (logable(6)) logfile << "Recovered SpecRRT:", endl, core.specrrt, endl;
+
+  //
+  // Pass 3: For each speculative ROB, reinitialize and free speculative ROBs
+  //
+
+  ReorderBufferEntry* lastrob = null; // assigned in the loop below but never read
+
+  idx = endidx;
+  for (;;) {
+    ReorderBufferEntry& annulrob = core.ROB[idx];
+
+    lastrob = &annulrob;
+
+    if unlikely (config.event_log_enabled) {
+      event = core.eventlog.add(EVENT_ANNUL_EACH_ROB, &annulrob);
+      event->annul.annulras = 0;
+    }
+
+    //
+    // Free the speculatively allocated physical register
+    // See notes above on Physical Register Recycling Complications
+    //
+    foreach (j, MAX_OPERANDS) { annulrob.operands[j]->unref(annulrob); }
+    annulrob.physreg->free();
+
+    if unlikely (isclass(annulrob.uop.opcode, OPCLASS_LOAD|OPCLASS_STORE)) {
+      annulrob.release_mem_lock(true);
+      core.loads_in_flight -= (annulrob.lsq->store == 0);
+      core.stores_in_flight -= (annulrob.lsq->store == 1);
+      annulrob.lsq->reset();
+      core.LSQ.annul(annulrob.lsq);
+    }
+
+    if unlikely (annulrob.lfrqslot >= 0) {
+      core.caches.annul_lfrq_slot(annulrob.lfrqslot);
+    }
+
+    if unlikely (isbranch(annulrob.uop.opcode) && (annulrob.uop.predinfo.bptype & (BRANCH_HINT_CALL|BRANCH_HINT_RET))) {
+      //
+      // Return Address Stack (RAS) correction:
+      // Example calls and returns in pipeline
+      //
+      // C1
+      //   C2
+      //   R2 
+      //   BR (mispredicted branch)
+      //   C3
+      //     C4
+      //
+      // BR mispredicts, so everything after BR must be annulled.
+      // RAS contains: C1 C3 C4, so we need to annul [C4 C3].
+      //
+      if unlikely (config.event_log_enabled) event->annul.annulras = 1;
+      core.branchpred.annulras(annulrob.uop.predinfo);
+    }
+
+    // Release our lock on the cached basic block containing this uop
+    if unlikely (config.event_log_enabled) {
+      event->annul.bb = annulrob.uop.bb;
+      event->annul.bb_refcount = annulrob.uop.bb->refcount;
+    }
+    annulrob.uop.bb->release();
+
+    if (annulrob.uop.is_asf)
+      cerr << __FILE__,__LINE__,": Annulling ", annulrob.uop,endl,flush;
+
+    // Remove it from the ASF-LLB, if it was on one
+    if unlikely(annulrob.llbline) {
+      core.locked_line_buffer.remove_ref(annulrob.llbline);
+      annulrob.llbline = (LLBLine*)null;
+    }
+
+    annulrob.reset();
+    core.ROB.annul(annulrob);
+
+    annulrob.changestate(core.rob_free_list);
+    annulcount++;
+
+    if (idx == startidx) break;
+    idx = add_index_modulo(idx, -1, ROB_SIZE);
+  }
+  // NOTE(review): ROB[startidx] was reset() and freed in pass 3; the assert and returns below still read its uop - confirm reset() leaves uop intact
+  assert(core.ROB[startidx].uop.som);
+
+  if (return_first_annulled_rip) return core.ROB[startidx].uop.rip;
+
+  return (keep_misspec_uop) ? core.ROB[startidx].uop.riptaken : (Waddr)core.ROB[startidx].uop.rip;
+}
+
+//
+// Return the specified uop back to the ready_to_dispatch state.
+// All structures allocated to the uop are reset to the same state
+// they had immediately after allocation.
+//
+// This function is used to handle various types of mis-speculations
+// in which only the values are invalid, rather than the actual uops
+// as with branch mispredicts and unaligned accesses. It is also
+// useful for various kinds of value speculation.
+//
+// The normal "fast" replay mechanism is still used for scheduler
+// related replays - this is much more expensive.
+//
+// If this function is called for a given uop U, all of U's
+// consumers must also be re-dispatched. The redispatch_dependents()
+// function automatically does this.
+//
+// The <prevrob> argument should be the previous ROB, in program
+// order, before this one. If this is the first ROB being
+// re-dispatched, <prevrob> should be null.
+//
+
+void ReorderBufferEntry::redispatch(const bitvec<MAX_OPERANDS>& dependent_operands, ReorderBufferEntry* prevrob) {
+  OutOfOrderCore& core = getcore();
+  OutOfOrderCoreEvent* event;
+
+  if unlikely (config.event_log_enabled) {
+    event = core.eventlog.add(EVENT_REDISPATCH_EACH_ROB, this);
+    event->redispatch.current_state_list = current_state_list;
+    event->redispatch.dependent_operands = dependent_operands.integer();
+    foreach (i, MAX_OPERANDS) operands[i]->fill_operand_info(event->redispatch.opinfo[i]);
+  }
+
+  stats.ooocore.dispatch.redispatch.trigger_uops++;
+
+  // Un-issue the uop:
+  issued = 0;
+
+  // Remove from issue queue, if it was already in some issue queue
+  if unlikely (cluster >= 0) {
+    bool found = 0;
+    issueq_operation_on_cluster_with_result(getcore(), cluster, found, annuluop(index()));
+    if unlikely (config.event_log_enabled) event->redispatch.iqslot = found;
+    cluster = -1;
+  }
+
+  if unlikely (lfrqslot >= 0) {
+    core.caches.annul_lfrq_slot(lfrqslot);
+    lfrqslot = -1;
+  }
+
+  release_mem_lock(true);
+
+  if unlikely (lsq) {
+    // Clear the LSQ entry's address and data so they are resolved again on the next issue
+    lsq->physaddr = 0;
+    lsq->addrvalid = 0;
+    lsq->datavalid = 0;
+    lsq->mbtag = -1;
+    lsq->data = 0;
+    lsq->invalid = 0;
+
+    if (operands[RS]->nonnull()) {
+      operands[RS]->unref(*this);
+      operands[RS] = &core.physregfiles[0][PHYS_REG_NULL];
+      operands[RS]->addref(*this);
+    }
+  }
+
+  // Return physreg to state just after allocation
+  physreg->data = 0;
+  physreg->flags = FLAG_WAIT;
+  physreg->changestate(PHYSREG_WAITING);
+  if (uop.is_asf)
+    cerr << __FILE__,__LINE__,": Redispatching ", uop,endl,flush;
+  // Remove it from the ASF-LLB, if it was on one
+  if unlikely(llbline) {
+    core.locked_line_buffer.remove_ref(llbline);
+    llbline = (LLBLine*)null;
+  }
+
+  // Force ROB to be re-dispatched in program order
+  cycles_left = 0;
+  forward_cycle = 0;
+  load_store_second_phase = 0;
+  changestate(core.rob_ready_to_dispatch_list, true, prevrob);
+}
+
+//
+// Find all uops dependent on the specified uop, and 
+// redispatch each of them.
+//
+void ReorderBufferEntry::redispatch_dependents(bool inclusive) {
+  OutOfOrderCore& core = getcore();
+
+  bitvec<ROB_SIZE> depmap; // one bit per ROB index: set for entries already known to be in the dependent slice
+  depmap = 0;
+  depmap[index()] = 1;
+
+  OutOfOrderCoreEvent* event;
+  if unlikely (config.event_log_enabled) event = core.eventlog.add(EVENT_REDISPATCH_DEPENDENTS, this);
+
+  //
+  // Go through the ROB and identify the slice of all uops
+  // depending on this one, through the use of physical
+  // registers as operands.
+  //
+  int count = 0; // number of ROB entries redispatched by the loop below
+
+  ReorderBufferEntry* prevrob = null;
+
+  foreach_forward_from(core.ROB, this, robidx) {
+    ReorderBufferEntry& reissuerob = core.ROB[robidx];
+
+    if (!inclusive) { // NOTE(review): this path only marks depmap and never redispatches, so count stays 0 for the assert below - confirm intent
+      depmap[reissuerob.index()] = 1;
+      continue;
+    }
+
+    bitvec<MAX_OPERANDS> dependent_operands;
+    dependent_operands = 0;
+
+    foreach (i, MAX_OPERANDS) {
+      const PhysicalRegister* operand = reissuerob.operands[i];
+      dependent_operands[i] = (operand->rob && depmap[operand->rob->index()]); // operand->rob may be null
+    }
+
+    //
+    // We must also redispatch all stores, since in pathological cases, there may
+    // be store-store ordering cases we don't know about, i.e. if some store
+    // inherits from a previous store, but that previous store actually has the
+    // wrong address because of some other bogus uop providing its address.
+    //
+    // In addition, ld.acq and st.rel create additional complexity: we can never
+    // re-dispatch the ld.acq but not the st.rel and vice versa; both must be
+    // redispatched together.
+    // NOTE(review): isstore() below tests this entry's uop, not reissuerob.uop - confirm which was meant
+    bool dep = (*dependent_operands) | (robidx == index()) | isstore(uop.opcode);
+
+    if unlikely (dep) {
+      count++;
+      depmap[reissuerob.index()] = 1;
+      reissuerob.redispatch(dependent_operands, prevrob);
+      prevrob = &reissuerob; // passed as prevrob to the next redispatch() call
+    }
+  }
+
+  assert(inrange(count, 1, ROB_SIZE));
+  stats.ooocore.dispatch.redispatch.dependent_uops[count-1]++;
+
+  if unlikely (config.event_log_enabled) {
+    event = core.eventlog.add(EVENT_REDISPATCH_DEPENDENTS_DONE, this);
+    event->redispatch.count = count;
+  }
+}
+
+int ReorderBufferEntry::pseudocommit() {
+  //
+  // Apply this uop's renames to the speculative RRT as if it had committed:
+  // its destination register and each user flag it sets now map to physreg.
+  //
+  static const struct { int setflag; int archreg; } flag_renames[] = {
+    { SETFLAG_ZF, REG_zf },
+    { SETFLAG_CF, REG_cf },
+    { SETFLAG_OF, REG_of },
+  };
+
+  OutOfOrderCore& core = getcore();
+  if unlikely (config.event_log_enabled) core.eventlog.add(EVENT_ANNUL_PSEUDOCOMMIT, this);
+
+  const int destreg = uop.rd;
+  if likely (archdest_can_commit[destreg]) {
+    core.specrrt[destreg]->unspecref(destreg);
+    core.specrrt[destreg] = physreg;
+    core.specrrt[destreg]->addspecref(destreg);
+  }
+
+  if likely (!uop.nouserflags) {
+    for (int k = 0; k < (int)(sizeof(flag_renames) / sizeof(flag_renames[0])); k++) {
+      if (!(uop.setflags & flag_renames[k].setflag)) continue;
+      const int flagreg = flag_renames[k].archreg;
+      core.specrrt[flagreg]->unspecref(flagreg);
+      core.specrrt[flagreg] = physreg;
+      core.specrrt[flagreg]->addspecref(flagreg);
+    }
+  }
+
+  return isclass(uop.opcode, OPCLASS_BARRIER) ? COMMIT_RESULT_BARRIER : COMMIT_RESULT_OK;
+}
+
+/**
+ * Issues an ASF operation (i.e. ACQUIRE / COMMIT) on the core.
+ */
+template <typename T> byte x86_genflags(T r); // declaration only; no definition appears in this part of the file
+int ReorderBufferEntry::issueasf(IssueState& state, W64 rbdata) {
+  OutOfOrderCore& core = getcore();
+  LockedLineBuffer& llb = core.locked_line_buffer;
+  W64 asf_err;
+
+  switch(uop.opcode) {
+    case (OP_acq):
+      /* Check, whether this is an re-execution of the ACQUIRE instruction. This is caused
+         by either an exception or an abort within the critical section. There is no need
+         to check anything else, we just return the error-code and reset the state. */
+      if (core.asf_reissue_will_fail) {
+        cerr << __FILE__,__LINE__,": Re-Issueing failing ACQUIRE, error code: ", core.asf_stored_error, endl,flush;
+        llb.clear();
+        state.reg.rddata = core.asf_stored_error;
+        /* Unstall the frontend to allow fetching of the following instructions */
+        /*TODO: Is this safe? Let's see, it should be! ;-)*/core.stall_frontend = false;
+        break;
+      }
+      cerr << __FILE__,__LINE__,"@",sim_cycle,": Issue ASF ACQUIRE.",endl,flush;
+      /* Scan backwards through the ROB, in order to find the latest locked loads and stores, which alias with the
+         content of the LLB, in order to not ignore any changes to addresses (locked loads) or data (stores) in the LLB.
+         This works, as the LSQ is filled in-order, together with the ROB! */
+      foreach_backward_before(core.ROB, this, i) {
+        ReorderBufferEntry&  rob = core.ROB[i];
+        //cerr << "ROB[",i,"]=", rob, endl,flush;
+        if (!isclass(rob.uop.opcode, OPCLASS_MEM)) continue;
+
+        /* Ignore everything non-asf, except stores */
+        if likely(!(rob.uop.is_asf || isstore(rob.uop.opcode))) continue;
+        /* Locked prefetches require special treatment, as they don't have an associated LSQEntry! */
+        if unlikely (isprefetch(rob.uop.opcode)) {
+          if (rob.issued) continue;
+        } else {
+          LoadStoreQueueEntry& lsq = *rob.lsq;
+          if (!lsq.entry_valid) continue;
+          /* Ignore loads which have already a valid address (which is already in the LLB!) */
+          if (!lsq.store && lsq.addrvalid) continue;
+          /* and also stores, which have a known non-aliasing address */
+          if (lsq.store && lsq.addrvalid && !llb.contains(lsq.physaddr << 3)) continue;
+        }
+        /* we have found something dangerous, either:
+           -a locked prefetchw, which has not yet issued
+           -a locked load, which does not yet have an address added to the LLB
+           -a store, which either aliases with one of the LLB entries or does not have a valid address generated
+          -> create dependency on it! */
+        operands[RS]->unref(*this);
+        operands[RS] = rob.physreg;
+        operands[RS]->addref(*this);
+
+        cerr << __FILE__,__LINE__,": Found aliasing uop ", rob.uop, " at ROB[",i,"]=", rob, endl, flush;
+        replay();
+        return ISSUE_NEEDS_REPLAY;
+      }
+      /* Not found anything potentially bad -> check for consistency with intermediate! */
+      cerr << __FILE__,__LINE__,": Nothing found! Checking intermediate..", endl, flush;
+      if unlikely(rbdata != llb.num_locations) {
+        cerr << __FILE__,__LINE__,": Wrong number of locations for ACQUIRE: Spec:",rbdata," vs. actual: ",llb.num_locations, endl,flush;
+        llb.clear();
+        state.reg.rddata = -2;
+        /* Unstall the frontend to allow fetching of the following instructions */
+        /*TODO: Is this safe? Let's see, it should be! ;-)*/core.stall_frontend = false;
+        break;
+      }
+      /* Check for any conflicts during the specification phase, i.e. invalidating probes to cachelines in the LLB and fault early.
+         This is also used, when the critical section aborts asynchronously and the acquire is re-executed!
+         This will be done again at commit time, in order to find any remaining conflicts that occured between issue and commit of
+         the acquire instruction. In case there are errors then, the acquire instruction has to be redispatched! */
+      asf_err = llb.consistency_error();
+      if unlikely(asf_err) { 
+        cerr << __FILE__,__LINE__,": ACQUIRE could not create a valid snapshot! Error ", asf_err, endl,flush;
+        llb.clear();
+        state.reg.rddata = uop.rip;
+        /* Unstall the frontend to allow fetching of the following instructions */
+        /*TODO: Is this safe? Let's see, it should be! ;-)*/core.stall_frontend = false;
+        break;
+      }
+
+      cerr << __FILE__,__LINE__,": Taking snapshot!",endl,flush;
+      llb.snapshot();
+      state.reg.rddata = 0;
+      break;
+    case (OP_com):
+      /* Nothing to be done yet for releasing at issue time, see comment in ROBEntry::issue */
+      break;
+    default:
+      assert(false);
+  }
+
+  state.reg.rdflags = x86_genflags<W64>(state.reg.rddata);
+  return ISSUE_COMPLETED;
+}
diff -r 10448c053ad6 asfooopipe.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/asfooopipe.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,1852 @@
+//
+// PTLsim: Cycle Accurate x86-64 Simulator
+// Out-of-Order Core Simulator With Experimental AMD64 ASF Extension
+// Core Pipeline Stages: Frontend, Writeback, Commit
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+// Copyright 2003-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
+//
+
+#include <globals.h>
+#include <elf.h>
+#include <ptlsim.h>
+#include <branchpred.h>
+#include <datastore.h>
+#include <logic.h>
+#include <dcache.h>
+
+#define INSIDE_OOOCORE
+#include <asfooocore.h>
+#include <stats.h>
+
+#ifndef ENABLE_CHECKS
+#undef assert
+#define assert(x) (x) // checks disabled: the argument is still evaluated (only the check is dropped), so asserts with side effects, such as assert(unaligned_ldst_buf.get(...)) in fetch(), keep working
+#endif
+
+#ifndef ENABLE_LOGGING
+#undef logable
+#define logable(level) (0) // logging disabled: every `if (logable(n))` guard in this file becomes constant-false
+#endif
+
+using namespace ASFOutOfOrderModel;
+
+void OutOfOrderCoreCacheCallbacks::icache_wakeup(LoadStoreInfo lsi, W64 physaddr) {
+  core.waiting_for_icache_fill = 0; // the fill that fetch() was waiting for has arrived
+  if (logable(6)) { logfile << "I-cache wakeup of physaddr ", (void*)(Waddr)physaddr, endl; }
+}
+
+//
+// Returns the bitmask of physical register files that may receive the
+// result of the given uop, selected from the uop's opclass and (in the
+// split int/fp configuration) its datatype and destination register.
+//
+// This must be customized if the physical register files are altered in ooohwdef.h.
+//
+static W32 phys_reg_files_writable_by_uop(const TransOp& uop) {
+  const W32 classbits = opinfo[uop.opcode].opclass;
+
+  // Stores and branches use their own masks in both configurations:
+  if (classbits & OPCLASS_STORE) return OutOfOrderCore::PHYS_REG_FILE_MASK_ST;
+  if (classbits & OPCLASS_BRANCH) return OutOfOrderCore::PHYS_REG_FILE_MASK_BR;
+
+#ifndef UNIFIED_INT_FP_PHYS_REG_FILE
+  // Loads/prefetches are steered by datatype; FP-class uops and destinations in REG_xmml0..REG_xmmh15 or REG_fptos..REG_ctx use the FP mask:
+  if (classbits & (OPCLASS_LOAD | OPCLASS_PREFETCH))
+    return (uop.datatype == DATATYPE_INT) ? OutOfOrderCore::PHYS_REG_FILE_MASK_INT : OutOfOrderCore::PHYS_REG_FILE_MASK_FP;
+  if ((classbits & OPCLASS_FP) | inrange((int)uop.rd, REG_xmml0, REG_xmmh15) | inrange((int)uop.rd, REG_fptos, REG_ctx))
+    return OutOfOrderCore::PHYS_REG_FILE_MASK_FP;
+#endif
+
+  return OutOfOrderCore::PHYS_REG_FILE_MASK_INT;
+}
+
+void OutOfOrderCore::annul_fetchq() { // Undoes RAS updates and drops basic block references for every uop still in fetchq; it does not empty fetchq itself (reset_fetch_unit() calls fetchq.reset()).
+  //
+  // There may be return address stack (RAS) updates from calls and returns
+  // in the fetch queue that never made it to renaming, so they have no ROB
+  // that the core can annul normally. Therefore, we must go backwards in
+  // the fetch queue to annul these updates, in addition to checking the ROB.
+  //
+  foreach_backward (fetchq, i) {
+    FetchBufferEntry& fetchbuf = fetchq[i];
+    if unlikely (isbranch(fetchbuf.opcode) && (fetchbuf.predinfo.bptype & (BRANCH_HINT_CALL|BRANCH_HINT_RET))) { // only branches hinted as call or return are annulled in the RAS
+      if unlikely (config.event_log_enabled) eventlog.add(EVENT_ANNUL_FETCHQ_RAS, fetchbuf);
+      branchpred.annulras(fetchbuf.predinfo);
+    }
+    // Also release the reference to the uop's basic block (taken in fetch() via transop.bb->acquire())
+    if unlikely (config.event_log_enabled) {
+      OutOfOrderCoreEvent* event = eventlog.add(EVENT_ANNUL_FETCHQ, fetchbuf);
+      event->annul.bb = fetchbuf.bb; event->annul.bb_refcount = fetchbuf.bb->refcount; // refcount is logged before the release below
+    }
+    fetchbuf.bb->release();
+  }
+}
+
+//
+// Flush entire pipeline immediately, reset all processor
+// structures to their initial state, and resume from the
+// state saved in ctx.commitarf.
+//
+void OutOfOrderCore::flush_pipeline() {
+  caches.complete();
+  annul_fetchq(); // undo RAS updates and release BB references held by uops still in the fetch queue
+  foreach_forward(ROB, i) {
+    ReorderBufferEntry& rob = ROB[i];
+    // Release our lock on the cached basic block containing each uop
+    if unlikely (config.event_log_enabled) {
+      OutOfOrderCoreEvent* event = eventlog.add(EVENT_ANNUL_FLUSH, &rob);
+      event->annul.bb = rob.uop.bb; event->annul.bb_refcount = rob.uop.bb->refcount;
+    }
+    rob.release_mem_lock(true);
+    rob.uop.bb->release();
+  }
+
+  reset_fetch_unit(ctx.commitarf[REG_rip]); // restart fetching at the committed RIP
+  rob_states.reset();
+  // physreg_states.reset();
+
+  ROB.reset();
+  foreach (i, ROB_SIZE) { // re-tag every ROB entry with this core's id and put it on the free list
+    ROB[i].coreid = coreid;
+    ROB[i].changestate(rob_free_list);
+  }
+  LSQ.reset();
+  foreach (i, LSQ_SIZE) {
+    LSQ[i].coreid = coreid;
+  }
+  loads_in_flight = 0;
+  stores_in_flight = 0;
+
+  foreach (i, PHYS_REG_FILE_COUNT) physregfiles[i].reset();
+
+  foreach_issueq(reset(coreid));
+
+  dispatch_deadlock_countdown = DISPATCH_DEADLOCK_COUNTDOWN_CYCLES;
+  last_commit_at_cycle = sim_cycle;
+
+  external_to_core_state(); // rebuild physical registers and rename tables from ctx.commitarf
+}
+
+// Restart fetching at realrip. Call this in response to a branch mispredict; flush_pipeline() also calls it with the committed RIP.
+void OutOfOrderCore::reset_fetch_unit(W64 realrip) {
+  if (current_basic_block) {
+    // Release our lock on the cached basic block we're currently fetching
+    current_basic_block->release();
+    current_basic_block = null;
+  }
+
+  fetchrip = realrip;
+  fetchrip.update(ctx);
+  stall_frontend = 0; // clears any stall set by fetch() for barrier or ASF acquire/commit uops
+  waiting_for_icache_fill = 0;
+  fetchq.reset(); // entries are dropped without releasing their BB references here; annul_fetchq() does that
+  current_basic_block_transop_index = 0;
+  unaligned_ldst_buf.reset();
+}
+
+//
+// Carry out a deferred self-modifying code invalidation, if one is
+// pending. This must be called on all cores only *after* every
+// pipeline has been flushed, so that no stale BBs are still
+// referenced (a referenced BB cannot be freed).
+//
+void OutOfOrderCore::invalidate_smc() {
+  if likely (!smc_invalidate_pending) return;
+  if (logable(5)) logfile << "SMC invalidate pending on ", smc_invalidate_rvp, endl;
+  bbcache.invalidate_page(smc_invalidate_rvp.mfnlo, INVALIDATE_REASON_SMC);
+  const bool second_mfn_differs = (smc_invalidate_rvp.mfnlo != smc_invalidate_rvp.mfnhi);
+  if unlikely (second_mfn_differs) bbcache.invalidate_page(smc_invalidate_rvp.mfnhi, INVALIDATE_REASON_SMC);
+  smc_invalidate_pending = 0;
+}
+
+//
+// Copy external archregs to physregs and reset all rename tables
+//
+void OutOfOrderCore::external_to_core_state() {
+  foreach (i, PHYS_REG_FILE_COUNT) { // every register file gets its own committed, zero-valued PHYS_REG_NULL register
+    PhysicalRegisterFile& rf = physregfiles[i];
+    PhysicalRegister* zeroreg = rf.alloc(PHYS_REG_NULL);
+    zeroreg->addref();
+    zeroreg->commit();
+    zeroreg->data = 0;
+    zeroreg->flags = 0;
+    zeroreg->archreg = REG_zero;
+  }
+
+  // Always start out on cluster 0:
+  PhysicalRegister* zeroreg = &physregfiles[0][PHYS_REG_NULL];
+
+  //
+  // Allocate and commit each architectural register
+  //
+  foreach (i, ARCHREG_COUNT) {
+    //
+    // IMPORTANT! If using some register file configuration other
+    // than (integer, fp), this needs to be changed!
+    //
+#ifdef UNIFIED_INT_FP_PHYS_REG_FILE
+    int rfid = (i == REG_rip) ? PHYS_REG_FILE_BR : PHYS_REG_FILE_INT;
+#else
+    bool fp = inrange((int)i, REG_xmml0, REG_xmmh15) | (inrange((int)i, REG_fptos, REG_ctx));
+    int rfid = (fp) ? PHYS_REG_FILE_FP : (i == REG_rip) ? PHYS_REG_FILE_BR : PHYS_REG_FILE_INT;
+#endif
+    PhysicalRegisterFile& rf = physregfiles[rfid];
+    PhysicalRegister* physreg = (i == REG_zero) ? zeroreg : rf.alloc(); // REG_zero maps onto the shared zero register instead of a fresh one
+    physreg->data = ctx.commitarf[i];
+    physreg->flags = 0;
+    commitrrt[i] = physreg;
+  }
+
+  commitrrt[REG_flags]->flags = (W16)commitrrt[REG_flags]->data; // the flags register keeps its value in both data and (truncated to 16 bits) flags
+
+  //
+  // Internal translation registers are never used before
+  // they are written for the first time:
+  //
+  for (int i = ARCHREG_COUNT; i < TRANSREG_COUNT; i++) {
+    commitrrt[i] = zeroreg;
+  }
+
+  //
+  // Set renamable flags
+  // 
+  commitrrt[REG_zf] = commitrrt[REG_flags];
+  commitrrt[REG_cf] = commitrrt[REG_flags];
+  commitrrt[REG_of] = commitrrt[REG_flags];
+
+  //
+  // Copy commitrrt to specrrt and update refcounts
+  //
+  foreach (i, TRANSREG_COUNT) {
+    commitrrt[i]->commit();
+    specrrt[i] = commitrrt[i];
+    specrrt[i]->addspecref(i);
+    commitrrt[i]->addcommitref(i); // one spec reference and one commit reference are taken per table slot
+  }
+
+#ifdef ENABLE_TRANSIENT_VALUE_TRACKING
+  specrrt.renamed_in_this_basic_block.reset();
+  commitrrt.renamed_in_this_basic_block.reset();
+#endif
+}
+
+//
+// Deadlock recovery. Despite the name, this currently flushes the whole
+// pipeline; the selective re-dispatch scheme is kept below, commented out.
+//
+void OutOfOrderCore::redispatch_deadlock_recovery() {
+  if (logable(6)) dump_ooo_state(logfile);
+
+  stats.ooocore.dispatch.redispatch.deadlock_flushes++;
+
+  flush_pipeline();
+
+  /*
+  //
+  // This is a more selective scheme than the full pipeline flush.
+  // Presently it does not work correctly with some combinations
+  // of user-modifiable parameters, so it's disabled to ensure
+  // deadlock-free operation in every configuration.
+  //
+
+  ReorderBufferEntry* prevrob = null;
+  bitvec<MAX_OPERANDS> noops = 0;
+
+  foreach_forward(ROB, robidx) {
+  ReorderBufferEntry& rob = ROB[robidx];
+
+  //
+  // Only re-dispatch those uops that have not yet generated a value
+  // or are guaranteed to produce a value soon without tying up resources.
+  // This must occur in program order to avoid deadlock!
+  // 
+  // bool recovery_required = (rob.current_state_list->flags & ROB_STATE_IN_ISSUE_QUEUE) || (rob.current_state_list == &rob_ready_to_dispatch_list);
+  bool recovery_required = 1; // for now, just to be safe
+
+  if (recovery_required) {
+  rob.redispatch(noops, prevrob);
+  prevrob = &rob;
+  stats.ooocore.dispatch.redispatch.deadlock_uops_flushed++;
+  }
+  }
+
+  if (logable(6)) dump_ooo_state();
+  */
+}
+
+
+//
+// Fetch Stage
+//
+// Fetch a stream of x86 instructions from the L1 i-cache along predicted
+// branch paths.
+//
+// Internally, up to N uops per clock corresponding to instructions in
+// the current basic block are fetched per cycle and placed in the fetchq
+// as TransOps. When we run out of uops in one basic block, we proceed
+// to lookup or translate the next basic block.
+//
+
+//
+// Debugging aid for crashes where the cycle at which to start logging is unknown:
+// fetch() records the address of each basic block it looks up in this ring buffer.
+static RIPVirtPhys fetch_bb_address_ringbuf[256];
+static W64 fetch_bb_address_ringbuf_head = 0;
+
+// Dump the head index followed by every slot, starting at the head and wrapping around.
+static void print_fetch_bb_address_ringbuf(ostream& os) {
+  os << "Head: ", fetch_bb_address_ringbuf_head, endl;
+  foreach (i, lengthof(fetch_bb_address_ringbuf)) {
+    int slot = (fetch_bb_address_ringbuf_head + i) % lengthof(fetch_bb_address_ringbuf);
+    const RIPVirtPhys& recorded = fetch_bb_address_ringbuf[slot];
+    os << "  ", intstring(i, 16), ": ", recorded, endl;
+  }
+}
+
+bool OutOfOrderCore::fetch() { // Fetches up to FETCH_WIDTH uops into fetchq for this cycle; every exit path returns true.
+  time_this_scope(ctfetch);
+
+  int fetchcount = 0;
+  int taken_branch_count = 0;
+
+  OutOfOrderCoreEvent* event;
+
+  if unlikely (stall_frontend) { // set below for barrier uops and for ASF acquire/commit uops
+    if unlikely (config.event_log_enabled) eventlog.add(EVENT_FETCH_STALLED);
+    stats.ooocore.fetch.stop.stalled++;
+    return true;
+  }
+
+  if unlikely (waiting_for_icache_fill) { // cleared again by icache_wakeup() or reset_fetch_unit()
+    if unlikely (config.event_log_enabled) eventlog.add(EVENT_FETCH_ICACHE_WAIT);
+    stats.ooocore.fetch.stop.icache_miss++;
+    return true;
+  }
+
+  while ((fetchcount < FETCH_WIDTH) && (taken_branch_count == 0) && !stall_frontend) { // stop at full width, or once a uop fetched this cycle has stalled the frontend
+    if unlikely (!fetchq.remaining()) {
+      if unlikely (config.event_log_enabled) { if (!fetchcount) eventlog.add(EVENT_FETCH_FETCHQ_FULL); }
+      stats.ooocore.fetch.stop.fetchq_full++;
+      break;
+    }
+
+    if unlikely ((fetchrip.rip == config.start_log_at_rip) && (fetchrip.rip != 0xffffffffffffffffULL)) { // reaching the configured RIP switches logging on
+      config.start_log_at_iteration = 0;
+      logenable = 1;
+    }
+
+    if unlikely ((!current_basic_block) || (current_basic_block_transop_index >= current_basic_block->count)) { // no BB yet, or the current one is exhausted
+      fetch_bb_address_ringbuf[fetch_bb_address_ringbuf_head] = fetchrip;
+      fetch_bb_address_ringbuf_head = add_index_modulo(fetch_bb_address_ringbuf_head, +1, lengthof(fetch_bb_address_ringbuf));
+      fetch_or_translate_basic_block(ctx, fetchrip);
+    }
+
+    if unlikely (current_basic_block->invalidblock) {
+      if unlikely (config.event_log_enabled) eventlog.add(EVENT_FETCH_BOGUS_RIP, fetchrip);
+      stats.ooocore.fetch.stop.bogus_rip++;
+      //
+      // Keep fetching - the decoder has injected assist microcode that
+      // branches to the invalid opcode or exec page fault handler.
+      //
+    }
+
+#ifdef PTLSIM_HYPERVISOR
+    Waddr physaddr = (fetchrip.mfnlo << 12) + lowbits(fetchrip, 12);
+#else
+    Waddr physaddr = fetchrip;
+#endif
+
+    W64 req_icache_block = floor(physaddr, ICACHE_FETCH_GRANULARITY);
+    if ((!current_basic_block->invalidblock) && (req_icache_block != current_icache_block)) { // the i-cache is only probed when fetch moves to a different block
+      bool hit = caches.probe_icache(fetchrip, physaddr);
+      hit |= config.perfect_cache;
+      if unlikely (!hit) {
+        int missbuf = caches.initiate_icache_miss(physaddr);
+        if unlikely (config.event_log_enabled) eventlog.add(EVENT_FETCH_ICACHE_MISS, fetchrip)->fetch.missbuf = missbuf;
+
+        if unlikely (missbuf < 0) {
+          // Try to re-allocate a miss buffer on the next cycle
+          break;
+        }
+        waiting_for_icache_fill = 1;
+        stats.ooocore.fetch.stop.icache_miss++;
+        break;
+      }
+
+      stats.ooocore.fetch.blocks++;
+      current_icache_block = req_icache_block;
+      per_context_dcache_stats_update(0, fetch.hit.L1++);
+    }
+
+    FetchBufferEntry& transop = *fetchq.alloc();
+    uopimpl_func_t synthop = null;
+
+    assert(current_basic_block->synthops);
+
+    if likely (!unaligned_ldst_buf.get(transop, synthop)) { // nothing pending in the split buffer: take the next uop from the basic block
+      transop = current_basic_block->transops[current_basic_block_transop_index];
+      synthop = current_basic_block->synthops[current_basic_block_transop_index];
+    }
+
+    transop.rip = fetchrip;
+    transop.uuid = fetch_uuid; // both fields are assigned again further down, where fetch_uuid is actually advanced
+
+    //
+    // Handle loads and stores marked as unaligned in the basic block cache.
+    // These uops are split into two parts (ld.lo, ld.hi or st.lo, st.hi)
+    // and the parts are put into a 2-entry buffer (unaligned_ldst_buf).
+    // Fetching continues from this buffer instead of the basic block
+    // until both uops are forced into the pipeline.
+    //
+    if unlikely (transop.unaligned) {
+      if unlikely (config.event_log_enabled) eventlog.add(EVENT_FETCH_SPLIT, transop);
+      split_unaligned(transop, unaligned_ldst_buf);
+      assert(unaligned_ldst_buf.get(transop, synthop)); // has a side effect; relies on this file's assert fallback still evaluating its argument
+    }
+
+    assert(transop.bbindex == current_basic_block_transop_index);
+
+    // Transform memory fences into NOPs for single-core model
+    if unlikely (transop.opcode == OP_mf) transop.opcode = OP_nop;
+
+    transop.bb = current_basic_block;
+    transop.bb->acquire(); // each fetched uop holds its own reference on the basic block
+    transop.synthop = synthop;
+
+    current_basic_block_transop_index += (unaligned_ldst_buf.empty()); // advance within the BB only when no split part is still waiting in the buffer
+
+    stats.ooocore.fetch.user_insns += transop.som;
+
+    if unlikely (isclass(transop.opcode, OPCLASS_BARRIER)) {
+      // We've hit an assist: stall the frontend until we resume or redirect
+      if unlikely (config.event_log_enabled) eventlog.add(EVENT_FETCH_ASSIST, transop);
+      stats.ooocore.fetch.stop.microcode_assist++;
+      stall_frontend = 1;      
+    }
+
+    if unlikely ((transop.opcode == OP_acq) || (transop.opcode == OP_com)) {
+      /* For a simple first simulation, ASF's acquire and release instructions stall the frontend, until they commit! */
+      cerr << __FILE__,__LINE__,": Stalling frontend because of ",transop, endl, flush;
+      stall_frontend = true;
+    }
+    stats.ooocore.fetch.uops++;
+
+    Waddr predrip = 0;
+    bool redirectrip = false;
+
+    transop.rip = fetchrip;
+    transop.uuid = fetch_uuid++;
+
+    if (isbranch(transop.opcode)) {
+      transop.predinfo.uuid = transop.uuid;
+      transop.predinfo.bptype = 
+        (isclass(transop.opcode, OPCLASS_COND_BRANCH) << log2(BRANCH_HINT_COND)) |
+        (isclass(transop.opcode, OPCLASS_INDIR_BRANCH) << log2(BRANCH_HINT_INDIRECT)) |
+        (bit(transop.extshift, log2(BRANCH_HINT_PUSH_RAS)) << log2(BRANCH_HINT_CALL)) |
+        (bit(transop.extshift, log2(BRANCH_HINT_POP_RAS)) << log2(BRANCH_HINT_RET));
+
+      // SMP/SMT: Fill in with target thread ID (if the predictor supports this):
+      transop.predinfo.ctxid = 0;
+      transop.predinfo.ripafter = fetchrip + transop.bytes;
+      predrip = branchpred.predict(transop.predinfo, transop.predinfo.bptype, transop.predinfo.ripafter, transop.riptaken);
+      redirectrip = 1;
+      stats.ooocore.branchpred.predictions++;
+    }
+
+    // Set up branches so mispredicts can be calculated correctly:
+    if unlikely (isclass(transop.opcode, OPCLASS_COND_BRANCH)) {
+      if unlikely (predrip != transop.riptaken) { // predicted not-taken: flip the condition so that riptaken again names the predicted path
+        assert(predrip == transop.ripseq);
+        transop.cond = invert_cond(transop.cond);
+        //
+        // We need to be careful here: we already looked up the synthop for this
+        // uop according to the old condition, so redo that here so we call the
+        // correct code for the swapped condition.
+        //
+        transop.synthop = get_synthcode_for_cond_branch(transop.opcode, transop.cond, transop.size, 0);
+        swap(transop.riptaken, transop.ripseq);
+      }
+    } else if unlikely (isclass(transop.opcode, OPCLASS_INDIR_BRANCH)) {
+      transop.riptaken = predrip;
+      transop.ripseq = predrip;
+    }
+
+    stats.ooocore.fetch.opclass[opclassof(transop.opcode)]++;
+
+    if unlikely (config.event_log_enabled) {
+      event = eventlog.add(EVENT_FETCH_OK, transop);
+      event->fetch.bb = current_basic_block;
+      event->fetch.predrip = predrip;
+    }
+
+    if likely (transop.eom) { // fetchrip only moves when the uop is flagged eom
+      fetchrip.rip += transop.bytes;
+      fetchrip.update(ctx);
+
+      if unlikely (isbranch(transop.opcode) && (transop.predinfo.bptype & (BRANCH_HINT_CALL|BRANCH_HINT_RET)))
+                    branchpred.updateras(transop.predinfo, transop.predinfo.ripafter);
+
+      if unlikely (redirectrip) {
+        // follow to target, then end fetching for this cycle if predicted taken
+        bool taken = (predrip != fetchrip); // fetchrip already points past the instruction here, so inequality means predicted taken
+        taken_branch_count += taken;
+        fetchrip = predrip;
+        fetchrip.update(ctx);
+        if (taken) {
+          fetchcount++;
+          stats.ooocore.fetch.stop.branch_taken++;
+          break;
+        }
+      }
+    }
+
+    fetchcount++;
+  }
+
+  stats.ooocore.fetch.stop.full_width += (fetchcount == FETCH_WIDTH);
+  stats.ooocore.fetch.width[fetchcount]++; // histogram of uops fetched this cycle
+
+  return true;
+}
+
+BasicBlock* OutOfOrderCore::fetch_or_translate_basic_block(Context& ctx, const RIPVirtPhys& rvp) {  // Makes the BB at rvp the current_basic_block (bbcache lookup, else translate) and returns it.
+  time_this_scope(ctdecode);
+
+  if likely (current_basic_block) {
+    // Release our ref to the old basic block being fetched
+    current_basic_block->release();
+    current_basic_block = null;
+  }
+
+  BasicBlock* bb = bbcache(rvp); // lookup only; a null result means the block has to be translated
+
+  if likely (bb) {
+    current_basic_block = bb;
+  } else {
+    current_basic_block = bbcache.translate(ctx, rvp);
+    assert(current_basic_block);
+    if unlikely (config.event_log_enabled) {
+      OutOfOrderCoreEvent* event = eventlog.add(EVENT_FETCH_TRANSLATE, rvp);
+      event->fetch.bb = current_basic_block; event->fetch.bb_uop_count = current_basic_block->count;
+    }
+  }
+
+  //
+  // Acquire a reference to the new basic block being fetched.
+  // This must be done right away so future allocations do not
+  // reclaim the BB while we still have a reference to it.
+  //
+  current_basic_block->acquire();
+  current_basic_block->use(sim_cycle);  
+
+  if unlikely (!current_basic_block->synthops) synth_uops_for_bb(*current_basic_block); // build the synthops array on first use
+  assert(current_basic_block->synthops);
+  
+  current_basic_block_transop_index = 0; // fetching restarts at the first uop of the block
+  assert(current_basic_block->rip == rvp);
+
+  return current_basic_block;
+}
+
+//
+// Allocate and Rename Stages
+//
+
+void OutOfOrderCore::rename() { // Moves up to FRONTEND_WIDTH uops from fetchq into the ROB: allocates ROB/LSQ/physreg resources, renames operands through specrrt and updates specrrt for the results.
+  time_this_scope(ctrename);
+
+  int prepcount = 0;
+
+  while (prepcount < FRONTEND_WIDTH) {
+    if unlikely (fetchq.empty()) {
+      if unlikely (config.event_log_enabled) { if likely (!prepcount) eventlog.add(EVENT_RENAME_FETCHQ_EMPTY); }
+      stats.ooocore.frontend.status.fetchq_empty++;
+      break;
+    } 
+
+    if unlikely (!ROB.remaining()) {
+      if unlikely (config.event_log_enabled) { if likely (!prepcount) eventlog.add(EVENT_RENAME_ROB_FULL); }
+      stats.ooocore.frontend.status.rob_full++;
+      break;
+    }
+
+    FetchBufferEntry& fetchbuf = *fetchq.peek(); // only peeked: the uop is dequeued after all resource checks have passed
+
+    int phys_reg_file = -1;
+
+    W32 acceptable_phys_reg_files = phys_reg_files_writable_by_uop(fetchbuf);
+
+    foreach (i, PHYS_REG_FILE_COUNT) { // first acceptable register file with a free register, searching from round_robin_reg_file_offset
+      int reg_file_to_check = add_index_modulo(round_robin_reg_file_offset, i, PHYS_REG_FILE_COUNT);
+      if likely (bit(acceptable_phys_reg_files, reg_file_to_check) && physregfiles[reg_file_to_check].remaining()) {
+        phys_reg_file = reg_file_to_check; break;
+      }
+    }
+
+    if (phys_reg_file < 0) {
+      if unlikely (config.event_log_enabled) { if likely (!prepcount) eventlog.add()->fill(EVENT_RENAME_PHYSREGS_FULL); }
+      stats.ooocore.frontend.status.physregs_full++;
+      break;
+    }
+
+    bool ld = isload(fetchbuf.opcode);
+    bool st = isstore(fetchbuf.opcode);
+    bool br = isbranch(fetchbuf.opcode);
+
+    if unlikely (ld && (loads_in_flight >= LDQ_SIZE)) {
+      if unlikely (config.event_log_enabled) { if likely (!prepcount) eventlog.add(EVENT_RENAME_LDQ_FULL); }
+      stats.ooocore.frontend.status.ldq_full++;
+      break;
+    }
+
+    if unlikely (st && (stores_in_flight >= STQ_SIZE)) {
+      if unlikely (config.event_log_enabled) { if likely (!prepcount) eventlog.add(EVENT_RENAME_STQ_FULL); }
+      stats.ooocore.frontend.status.stq_full++;
+      break;
+    }
+
+    if unlikely ((ld|st) && (!LSQ.remaining())) { // NOTE(review): unlike the other stop conditions, no frontend.status counter is incremented here
+      if unlikely (config.event_log_enabled) { if likely (!prepcount) eventlog.add(EVENT_RENAME_MEMQ_FULL); }
+      break;
+    }
+
+    stats.ooocore.frontend.status.complete++;
+
+    FetchBufferEntry& transop = *fetchq.dequeue();
+    ReorderBufferEntry& rob = *ROB.alloc();
+    PhysicalRegister* physreg = null;
+
+    LoadStoreQueueEntry* lsqp = (ld|st) ? LSQ.alloc() : null;
+    // lsqp stays null for uops that are neither loads nor stores, so it is only dereferenced inside the (ld|st) branch below.
+
+    rob.reset();
+    rob.uop = transop;
+    rob.entry_valid = 1;
+    rob.cycles_left = FRONTEND_STAGES;
+    rob.lsq = lsqp;
+    if unlikely (ld|st) {
+      LoadStoreQueueEntry& lsq = *lsqp;
+      lsq.rob = &rob;
+      lsq.store = st;
+      lsq.datavalid = 0;
+      lsq.addrvalid = 0;
+      lsq.invalid = 0;
+    }
+
+    stats.ooocore.frontend.alloc.reg += (!(ld|st|br));
+    stats.ooocore.frontend.alloc.ldreg += ld;
+    stats.ooocore.frontend.alloc.sfr += st;
+    stats.ooocore.frontend.alloc.br += br;
+
+    //
+    // Rename operands:
+    //
+
+    rob.operands[RA] = specrrt[transop.ra];
+    rob.operands[RB] = specrrt[transop.rb];
+    rob.operands[RC] = specrrt[transop.rc];
+    rob.operands[RS] = &physregfiles[0][PHYS_REG_NULL]; // used for loads and stores only
+
+    // See notes above on Physical Register Recycling Complications
+    foreach (i, MAX_OPERANDS) {
+      rob.operands[i]->addref(rob);
+      assert(rob.operands[i]->state != PHYSREG_FREE);
+
+      if likely ((rob.operands[i]->state == PHYSREG_WAITING) |
+                 (rob.operands[i]->state == PHYSREG_BYPASS) |
+                 (rob.operands[i]->state == PHYSREG_WRITTEN)) {
+        rob.operands[i]->rob->consumer_count = min(rob.operands[i]->rob->consumer_count + 1, 255); // counter saturates at 255
+      }
+    }
+
+    //
+    // Select a physical register file based on desired
+    // heuristics. We only consider a given register
+    // file N if bit N in the acceptable_phys_reg_files
+    // bitmap is set (otherwise it is off limits for
+    // the type of functional unit or cluster the uop
+    // must execute on).
+    //
+    // The phys_reg_file variable should be set to the
+    // register file ID selected by the heuristics.
+    //
+
+    //
+    // Default heuristics from above: phys_reg_file is already
+    // set to the first acceptable physical register file ID
+    // which has free registers.
+    //
+    rob.executable_on_cluster_mask = uop_executable_on_cluster[transop.opcode];
+
+    // This is used if there is exactly one physical register file per cluster:
+    // rob.executable_on_cluster_mask = (1 << phys_reg_file);
+
+    // For assignment only:
+    assert(bit(acceptable_phys_reg_files, phys_reg_file));
+
+    //
+    // Allocate the physical register
+    //
+
+    physreg = physregfiles[phys_reg_file].alloc();
+    assert(physreg);
+    physreg->flags = FLAG_WAIT;
+    physreg->data = 0xdeadbeefdeadbeefULL; // recognizable placeholder until the uop produces its result
+    physreg->rob = &rob;
+    physreg->archreg = rob.uop.rd;
+    rob.physreg = physreg;
+
+    //
+    // Logging
+    //
+
+    if unlikely (config.event_log_enabled) {
+      OutOfOrderCoreEvent* event = eventlog.add(EVENT_RENAME_OK, &rob);
+      foreach (i, MAX_OPERANDS) rob.operands[i]->fill_operand_info(event->rename.opinfo[i]);
+      
+      if likely (archdest_can_commit[transop.rd]) {
+        event->rename.oldphys = specrrt[transop.rd]->index();
+        event->rename.oldzf = specrrt[REG_zf]->index();
+        event->rename.oldcf = specrrt[REG_cf]->index();
+        event->rename.oldof = specrrt[REG_of]->index();
+      }
+    }
+
+    bool renamed_reg = 0;
+    bool renamed_flags = 0;
+
+    if likely (archdest_can_commit[transop.rd]) {
+#ifdef ENABLE_TRANSIENT_VALUE_TRACKING
+      PhysicalRegister* oldmapping = specrrt[transop.rd];
+      if ((oldmapping->current_state_list == &physreg_waiting_list) |
+          (oldmapping->current_state_list == &physreg_ready_list)) {
+        oldmapping->rob->dest_renamed_before_writeback = 1;
+      }
+
+      if ((oldmapping->current_state_list == &physreg_waiting_list) |
+          (oldmapping->current_state_list == &physreg_ready_list) | 
+          (oldmapping->current_state_list == &physreg_written_list)) {
+        oldmapping->rob->no_branches_between_renamings = specrrt.renamed_in_this_basic_block[transop.rd];
+      }
+
+      specrrt.renamed_in_this_basic_block[transop.rd] = 1;
+#endif
+
+      specrrt[transop.rd]->unspecref(transop.rd); // the old mapping loses its spec reference before the table is redirected
+      specrrt[transop.rd] = rob.physreg;
+      rob.physreg->addspecref(transop.rd);
+      renamed_reg = archdest_is_visible[transop.rd];
+    }
+
+    if unlikely (!transop.nouserflags) { // same remapping for each of ZF/CF/OF named in transop.setflags
+      if (transop.setflags & SETFLAG_ZF) {
+        specrrt[REG_zf]->unspecref(REG_zf);
+        specrrt[REG_zf] = rob.physreg;
+        rob.physreg->addspecref(REG_zf);
+      }
+      if (transop.setflags & SETFLAG_CF) {
+        specrrt[REG_cf]->unspecref(REG_cf);
+        specrrt[REG_cf] = rob.physreg;
+        rob.physreg->addspecref(REG_cf);
+      }
+      if (transop.setflags & SETFLAG_OF) {
+        specrrt[REG_of]->unspecref(REG_of);
+        specrrt[REG_of] = rob.physreg;
+        rob.physreg->addspecref(REG_of);
+      }
+      renamed_flags = (transop.setflags != 0);
+    }
+
+    foreach (i, MAX_OPERANDS) {
+      assert(rob.operands[i]->allocated());
+    }
+
+#ifdef ENABLE_TRANSIENT_VALUE_TRACKING
+    if unlikely (br) specrrt.renamed_in_this_basic_block.reset();
+#endif
+
+    stats.ooocore.frontend.renamed.none += ((!renamed_reg) && (!renamed_flags));
+    stats.ooocore.frontend.renamed.reg += ((renamed_reg) && (!renamed_flags));
+    stats.ooocore.frontend.renamed.flags += ((!renamed_reg) && (renamed_flags));
+    stats.ooocore.frontend.renamed.reg_and_flags += ((renamed_reg) && (renamed_flags));
+
+    rob.changestate(rob_frontend_list); // frontend() ages the entry from here on
+
+    prepcount++;
+  }
+
+  stats.ooocore.frontend.width[prepcount]++; // histogram of uops renamed this cycle
+}
+
+void OutOfOrderCore::frontend() {
+  // Advance each uop on rob_frontend_list by one cycle; a uop whose countdown
+  // has already run out is moved to rob_ready_to_dispatch_list.
+  time_this_scope(ctfrontend);
+  ReorderBufferEntry* rob;
+
+  foreach_list_mutable(rob_frontend_list, rob, entry, nextentry) {
+    const bool countdown_expired = (rob->cycles_left <= 0);
+    if unlikely (countdown_expired) {
+      rob->cycles_left = -1;
+      rob->changestate(rob_ready_to_dispatch_list);
+    } else if unlikely (config.event_log_enabled) {
+      OutOfOrderCoreEvent* event = eventlog.add(EVENT_FRONTEND, rob);
+      event->frontend.cycles_left = rob->cycles_left;
+    }
+
+    rob->cycles_left--; // unconditional: also applies to an entry that was just moved
+  }
+}
+
+//
+// Dispatch and Cluster Selection
+//
+static byte bit_indices_set_8bits[1<<8][8] = { // row v: indices of the bits set in v, ascending, repeated cyclically over the 8 columns; row 0 is all zeros
+  {0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0},  
+  {1, 1, 1, 1, 1, 1, 1, 1}, {0, 1, 0, 1, 0, 1, 0, 1},
+  {2, 2, 2, 2, 2, 2, 2, 2}, {0, 2, 0, 2, 0, 2, 0, 2},
+  {1, 2, 1, 2, 1, 2, 1, 2}, {0, 1, 2, 0, 1, 2, 0, 1},
+  {3, 3, 3, 3, 3, 3, 3, 3}, {0, 3, 0, 3, 0, 3, 0, 3},
+  {1, 3, 1, 3, 1, 3, 1, 3}, {0, 1, 3, 0, 1, 3, 0, 1},
+  {2, 3, 2, 3, 2, 3, 2, 3}, {0, 2, 3, 0, 2, 3, 0, 2},
+  {1, 2, 3, 1, 2, 3, 1, 2}, {0, 1, 2, 3, 0, 1, 2, 3},
+  {4, 4, 4, 4, 4, 4, 4, 4}, {0, 4, 0, 4, 0, 4, 0, 4},
+  {1, 4, 1, 4, 1, 4, 1, 4}, {0, 1, 4, 0, 1, 4, 0, 1},
+  {2, 4, 2, 4, 2, 4, 2, 4}, {0, 2, 4, 0, 2, 4, 0, 2},
+  {1, 2, 4, 1, 2, 4, 1, 2}, {0, 1, 2, 4, 0, 1, 2, 4},
+  {3, 4, 3, 4, 3, 4, 3, 4}, {0, 3, 4, 0, 3, 4, 0, 3},
+  {1, 3, 4, 1, 3, 4, 1, 3}, {0, 1, 3, 4, 0, 1, 3, 4},
+  {2, 3, 4, 2, 3, 4, 2, 3}, {0, 2, 3, 4, 0, 2, 3, 4},
+  {1, 2, 3, 4, 1, 2, 3, 4}, {0, 1, 2, 3, 4, 0, 1, 2},
+  {5, 5, 5, 5, 5, 5, 5, 5}, {0, 5, 0, 5, 0, 5, 0, 5},
+  {1, 5, 1, 5, 1, 5, 1, 5}, {0, 1, 5, 0, 1, 5, 0, 1},
+  {2, 5, 2, 5, 2, 5, 2, 5}, {0, 2, 5, 0, 2, 5, 0, 2},
+  {1, 2, 5, 1, 2, 5, 1, 2}, {0, 1, 2, 5, 0, 1, 2, 5},
+  {3, 5, 3, 5, 3, 5, 3, 5}, {0, 3, 5, 0, 3, 5, 0, 3},
+  {1, 3, 5, 1, 3, 5, 1, 3}, {0, 1, 3, 5, 0, 1, 3, 5},
+  {2, 3, 5, 2, 3, 5, 2, 3}, {0, 2, 3, 5, 0, 2, 3, 5},
+  {1, 2, 3, 5, 1, 2, 3, 5}, {0, 1, 2, 3, 5, 0, 1, 2},
+  {4, 5, 4, 5, 4, 5, 4, 5}, {0, 4, 5, 0, 4, 5, 0, 4},
+  {1, 4, 5, 1, 4, 5, 1, 4}, {0, 1, 4, 5, 0, 1, 4, 5},
+  {2, 4, 5, 2, 4, 5, 2, 4}, {0, 2, 4, 5, 0, 2, 4, 5},
+  {1, 2, 4, 5, 1, 2, 4, 5}, {0, 1, 2, 4, 5, 0, 1, 2},
+  {3, 4, 5, 3, 4, 5, 3, 4}, {0, 3, 4, 5, 0, 3, 4, 5},
+  {1, 3, 4, 5, 1, 3, 4, 5}, {0, 1, 3, 4, 5, 0, 1, 3},
+  {2, 3, 4, 5, 2, 3, 4, 5}, {0, 2, 3, 4, 5, 0, 2, 3},
+  {1, 2, 3, 4, 5, 1, 2, 3}, {0, 1, 2, 3, 4, 5, 0, 1},
+  {6, 6, 6, 6, 6, 6, 6, 6}, {0, 6, 0, 6, 0, 6, 0, 6},
+  {1, 6, 1, 6, 1, 6, 1, 6}, {0, 1, 6, 0, 1, 6, 0, 1},
+  {2, 6, 2, 6, 2, 6, 2, 6}, {0, 2, 6, 0, 2, 6, 0, 2},
+  {1, 2, 6, 1, 2, 6, 1, 2}, {0, 1, 2, 6, 0, 1, 2, 6},
+  {3, 6, 3, 6, 3, 6, 3, 6}, {0, 3, 6, 0, 3, 6, 0, 3},
+  {1, 3, 6, 1, 3, 6, 1, 3}, {0, 1, 3, 6, 0, 1, 3, 6},
+  {2, 3, 6, 2, 3, 6, 2, 3}, {0, 2, 3, 6, 0, 2, 3, 6},
+  {1, 2, 3, 6, 1, 2, 3, 6}, {0, 1, 2, 3, 6, 0, 1, 2},
+  {4, 6, 4, 6, 4, 6, 4, 6}, {0, 4, 6, 0, 4, 6, 0, 4},
+  {1, 4, 6, 1, 4, 6, 1, 4}, {0, 1, 4, 6, 0, 1, 4, 6},
+  {2, 4, 6, 2, 4, 6, 2, 4}, {0, 2, 4, 6, 0, 2, 4, 6},
+  {1, 2, 4, 6, 1, 2, 4, 6}, {0, 1, 2, 4, 6, 0, 1, 2},
+  {3, 4, 6, 3, 4, 6, 3, 4}, {0, 3, 4, 6, 0, 3, 4, 6},
+  {1, 3, 4, 6, 1, 3, 4, 6}, {0, 1, 3, 4, 6, 0, 1, 3},
+  {2, 3, 4, 6, 2, 3, 4, 6}, {0, 2, 3, 4, 6, 0, 2, 3},
+  {1, 2, 3, 4, 6, 1, 2, 3}, {0, 1, 2, 3, 4, 6, 0, 1},
+  {5, 6, 5, 6, 5, 6, 5, 6}, {0, 5, 6, 0, 5, 6, 0, 5},
+  {1, 5, 6, 1, 5, 6, 1, 5}, {0, 1, 5, 6, 0, 1, 5, 6},
+  {2, 5, 6, 2, 5, 6, 2, 5}, {0, 2, 5, 6, 0, 2, 5, 6},
+  {1, 2, 5, 6, 1, 2, 5, 6}, {0, 1, 2, 5, 6, 0, 1, 2},
+  {3, 5, 6, 3, 5, 6, 3, 5}, {0, 3, 5, 6, 0, 3, 5, 6},
+  {1, 3, 5, 6, 1, 3, 5, 6}, {0, 1, 3, 5, 6, 0, 1, 3},
+  {2, 3, 5, 6, 2, 3, 5, 6}, {0, 2, 3, 5, 6, 0, 2, 3},
+  {1, 2, 3, 5, 6, 1, 2, 3}, {0, 1, 2, 3, 5, 6, 0, 1},
+  {4, 5, 6, 4, 5, 6, 4, 5}, {0, 4, 5, 6, 0, 4, 5, 6},
+  {1, 4, 5, 6, 1, 4, 5, 6}, {0, 1, 4, 5, 6, 0, 1, 4},
+  {2, 4, 5, 6, 2, 4, 5, 6}, {0, 2, 4, 5, 6, 0, 2, 4},
+  {1, 2, 4, 5, 6, 1, 2, 4}, {0, 1, 2, 4, 5, 6, 0, 1},
+  {3, 4, 5, 6, 3, 4, 5, 6}, {0, 3, 4, 5, 6, 0, 3, 4},
+  {1, 3, 4, 5, 6, 1, 3, 4}, {0, 1, 3, 4, 5, 6, 0, 1},
+  {2, 3, 4, 5, 6, 2, 3, 4}, {0, 2, 3, 4, 5, 6, 0, 2},
+  {1, 2, 3, 4, 5, 6, 1, 2}, {0, 1, 2, 3, 4, 5, 6, 0},
+  {7, 7, 7, 7, 7, 7, 7, 7}, {0, 7, 0, 7, 0, 7, 0, 7},
+  {1, 7, 1, 7, 1, 7, 1, 7}, {0, 1, 7, 0, 1, 7, 0, 1},
+  {2, 7, 2, 7, 2, 7, 2, 7}, {0, 2, 7, 0, 2, 7, 0, 2},
+  {1, 2, 7, 1, 2, 7, 1, 2}, {0, 1, 2, 7, 0, 1, 2, 7},
+  {3, 7, 3, 7, 3, 7, 3, 7}, {0, 3, 7, 0, 3, 7, 0, 3},
+  {1, 3, 7, 1, 3, 7, 1, 3}, {0, 1, 3, 7, 0, 1, 3, 7},
+  {2, 3, 7, 2, 3, 7, 2, 3}, {0, 2, 3, 7, 0, 2, 3, 7},
+  {1, 2, 3, 7, 1, 2, 3, 7}, {0, 1, 2, 3, 7, 0, 1, 2},
+  {4, 7, 4, 7, 4, 7, 4, 7}, {0, 4, 7, 0, 4, 7, 0, 4},
+  {1, 4, 7, 1, 4, 7, 1, 4}, {0, 1, 4, 7, 0, 1, 4, 7},
+  {2, 4, 7, 2, 4, 7, 2, 4}, {0, 2, 4, 7, 0, 2, 4, 7},
+  {1, 2, 4, 7, 1, 2, 4, 7}, {0, 1, 2, 4, 7, 0, 1, 2},
+  {3, 4, 7, 3, 4, 7, 3, 4}, {0, 3, 4, 7, 0, 3, 4, 7},
+  {1, 3, 4, 7, 1, 3, 4, 7}, {0, 1, 3, 4, 7, 0, 1, 3},
+  {2, 3, 4, 7, 2, 3, 4, 7}, {0, 2, 3, 4, 7, 0, 2, 3},
+  {1, 2, 3, 4, 7, 1, 2, 3}, {0, 1, 2, 3, 4, 7, 0, 1},
+  {5, 7, 5, 7, 5, 7, 5, 7}, {0, 5, 7, 0, 5, 7, 0, 5},
+  {1, 5, 7, 1, 5, 7, 1, 5}, {0, 1, 5, 7, 0, 1, 5, 7},
+  {2, 5, 7, 2, 5, 7, 2, 5}, {0, 2, 5, 7, 0, 2, 5, 7},
+  {1, 2, 5, 7, 1, 2, 5, 7}, {0, 1, 2, 5, 7, 0, 1, 2},
+  {3, 5, 7, 3, 5, 7, 3, 5}, {0, 3, 5, 7, 0, 3, 5, 7},
+  {1, 3, 5, 7, 1, 3, 5, 7}, {0, 1, 3, 5, 7, 0, 1, 3},
+  {2, 3, 5, 7, 2, 3, 5, 7}, {0, 2, 3, 5, 7, 0, 2, 3},
+  {1, 2, 3, 5, 7, 1, 2, 3}, {0, 1, 2, 3, 5, 7, 0, 1},
+  {4, 5, 7, 4, 5, 7, 4, 5}, {0, 4, 5, 7, 0, 4, 5, 7},
+  {1, 4, 5, 7, 1, 4, 5, 7}, {0, 1, 4, 5, 7, 0, 1, 4},
+  {2, 4, 5, 7, 2, 4, 5, 7}, {0, 2, 4, 5, 7, 0, 2, 4},
+  {1, 2, 4, 5, 7, 1, 2, 4}, {0, 1, 2, 4, 5, 7, 0, 1},
+  {3, 4, 5, 7, 3, 4, 5, 7}, {0, 3, 4, 5, 7, 0, 3, 4},
+  {1, 3, 4, 5, 7, 1, 3, 4}, {0, 1, 3, 4, 5, 7, 0, 1},
+  {2, 3, 4, 5, 7, 2, 3, 4}, {0, 2, 3, 4, 5, 7, 0, 2},
+  {1, 2, 3, 4, 5, 7, 1, 2}, {0, 1, 2, 3, 4, 5, 7, 0},
+  {6, 7, 6, 7, 6, 7, 6, 7}, {0, 6, 7, 0, 6, 7, 0, 6},
+  {1, 6, 7, 1, 6, 7, 1, 6}, {0, 1, 6, 7, 0, 1, 6, 7},
+  {2, 6, 7, 2, 6, 7, 2, 6}, {0, 2, 6, 7, 0, 2, 6, 7},
+  {1, 2, 6, 7, 1, 2, 6, 7}, {0, 1, 2, 6, 7, 0, 1, 2},
+  {3, 6, 7, 3, 6, 7, 3, 6}, {0, 3, 6, 7, 0, 3, 6, 7},
+  {1, 3, 6, 7, 1, 3, 6, 7}, {0, 1, 3, 6, 7, 0, 1, 3},
+  {2, 3, 6, 7, 2, 3, 6, 7}, {0, 2, 3, 6, 7, 0, 2, 3},
+  {1, 2, 3, 6, 7, 1, 2, 3}, {0, 1, 2, 3, 6, 7, 0, 1},
+  {4, 6, 7, 4, 6, 7, 4, 6}, {0, 4, 6, 7, 0, 4, 6, 7},
+  {1, 4, 6, 7, 1, 4, 6, 7}, {0, 1, 4, 6, 7, 0, 1, 4},
+  {2, 4, 6, 7, 2, 4, 6, 7}, {0, 2, 4, 6, 7, 0, 2, 4},
+  {1, 2, 4, 6, 7, 1, 2, 4}, {0, 1, 2, 4, 6, 7, 0, 1},
+  {3, 4, 6, 7, 3, 4, 6, 7}, {0, 3, 4, 6, 7, 0, 3, 4},
+  {1, 3, 4, 6, 7, 1, 3, 4}, {0, 1, 3, 4, 6, 7, 0, 1},
+  {2, 3, 4, 6, 7, 2, 3, 4}, {0, 2, 3, 4, 6, 7, 0, 2},
+  {1, 2, 3, 4, 6, 7, 1, 2}, {0, 1, 2, 3, 4, 6, 7, 0},
+  {5, 6, 7, 5, 6, 7, 5, 6}, {0, 5, 6, 7, 0, 5, 6, 7},
+  {1, 5, 6, 7, 1, 5, 6, 7}, {0, 1, 5, 6, 7, 0, 1, 5},
+  {2, 5, 6, 7, 2, 5, 6, 7}, {0, 2, 5, 6, 7, 0, 2, 5},
+  {1, 2, 5, 6, 7, 1, 2, 5}, {0, 1, 2, 5, 6, 7, 0, 1},
+  {3, 5, 6, 7, 3, 5, 6, 7}, {0, 3, 5, 6, 7, 0, 3, 5},
+  {1, 3, 5, 6, 7, 1, 3, 5}, {0, 1, 3, 5, 6, 7, 0, 1},
+  {2, 3, 5, 6, 7, 2, 3, 5}, {0, 2, 3, 5, 6, 7, 0, 2},
+  {1, 2, 3, 5, 6, 7, 1, 2}, {0, 1, 2, 3, 5, 6, 7, 0},
+  {4, 5, 6, 7, 4, 5, 6, 7}, {0, 4, 5, 6, 7, 0, 4, 5},
+  {1, 4, 5, 6, 7, 1, 4, 5}, {0, 1, 4, 5, 6, 7, 0, 1},
+  {2, 4, 5, 6, 7, 2, 4, 5}, {0, 2, 4, 5, 6, 7, 0, 2},
+  {1, 2, 4, 5, 6, 7, 1, 2}, {0, 1, 2, 4, 5, 6, 7, 0},
+  {3, 4, 5, 6, 7, 3, 4, 5}, {0, 3, 4, 5, 6, 7, 0, 3},
+  {1, 3, 4, 5, 6, 7, 1, 3}, {0, 1, 3, 4, 5, 6, 7, 0},
+  {2, 3, 4, 5, 6, 7, 2, 3}, {0, 2, 3, 4, 5, 6, 7, 0},
+  {1, 2, 3, 4, 5, 6, 7, 1}, {0, 1, 2, 3, 4, 5, 6, 7},
+};
+
+static inline int find_random_set_bit(W32 v, int randsource) {
+  const byte* set_bit_indices = bit_indices_set_8bits[v & 0xff]; // table row for the low 8 bits of v
+  return set_bit_indices[randsource & 0x7];                      // low 3 bits of randsource pick the entry
+}
+//
+// This function locates the source operands for a uop and prepares to add the
+// uop to its cluster's issue queue.
+//
+// If an operand is already ready at dispatch time, the issue queue associative
+// array slot for that operand is marked as unused; otherwise it is marked
+// as valid so the operand's ROB index can be matched when broadcast.
+//
+// returns: 1 iff at least one operand was still waiting (not ready) at dispatch time
+//
+bool ReorderBufferEntry::find_sources() { // result: nonzero iff some operand was found in the PHYSREG_WAITING state
+  int operands_still_needed = 0; // number of operands found in the PHYSREG_WAITING state
+
+  issueq_tag_t uopids[MAX_OPERANDS];   // per operand: ROB index of the producer to wait on (0 if none)
+  issueq_tag_t preready[MAX_OPERANDS]; // per operand: 1 if no wait is required, 0 otherwise
+
+  foreach (operand, MAX_OPERANDS) {
+    PhysicalRegister& source_physreg = *operands[operand];
+    ReorderBufferEntry& source_rob = *source_physreg.rob; // ROB entry the source register points back to
+
+    if likely (source_physreg.state == PHYSREG_WAITING) {
+      uopids[operand] = source_rob.index();
+      preready[operand] = 0;
+      operands_still_needed++;
+    } else {
+      // No need to wait for it
+      uopids[operand] = 0;
+      preready[operand] = 1;
+    }
+
+    if likely (source_physreg.nonnull()) { // statistics only: keyed by the source's register file id and state
+      per_physregfile_stats_update(stats.ooocore.dispatch.source, source_physreg.rfid, [source_physreg.state]++);
+    }
+  }
+
+  //
+  // Stores are special: we can issue a store even if its rc operand (the value
+  // to store) is not yet ready. In this case the store uop just checks for
+  // exceptions, establishes an STQ entry and gets replayed as a second phase
+  // store (this time around with the rc dependency required)
+  //
+  if unlikely (isstore(uop.opcode) && !load_store_second_phase) {
+    preready[RC] = 1; // note: operands_still_needed is not adjusted by this override
+  }
+
+  bool ok;
+  issueq_operation_on_cluster_with_result(getcore(), cluster, ok, insert(index(), uopids, preready)); // ok presumably receives the result of insert() -- macro defined elsewhere
+  assert(ok);
+
+  return operands_still_needed; // int count implicitly converted to the bool return type
+}
+
+int ReorderBufferEntry::select_cluster() { // returns the chosen cluster index, or -1 if no suitable cluster has a free issue queue slot
+  OutOfOrderCoreEvent* event; // left uninitialized; assigned and used only under config.event_log_enabled
+
+  if (MAX_CLUSTERS == 1) { // single-cluster build: cluster 0 if its availability count is positive, else -1
+    int cluster_issue_queue_avail_count[MAX_CLUSTERS];
+    getcore().sched_get_all_issueq_free_slots(cluster_issue_queue_avail_count);
+    return (cluster_issue_queue_avail_count[0] > 0) ? 0 : -1;
+  }
+
+  W32 executable_on_cluster = executable_on_cluster_mask; // working copy, narrowed below
+
+  int cluster_operand_tally[MAX_CLUSTERS]; // per cluster: source operands in WAITING or BYPASS state whose producing ROB entry is on that cluster
+  foreach (i, MAX_CLUSTERS) { cluster_operand_tally[i] = 0; }
+  foreach (i, MAX_OPERANDS) {
+    PhysicalRegister& r = *operands[i];
+    if ((&r) && ((r.state == PHYSREG_WAITING) || (r.state == PHYSREG_BYPASS)) && (r.rob->cluster >= 0)) cluster_operand_tally[r.rob->cluster]++;
+  }
+
+  assert(executable_on_cluster);
+
+  // If a given cluster's issue queue is full, try another cluster:
+  int cluster_issue_queue_avail_count[MAX_CLUSTERS];
+  W32 cluster_issue_queue_avail_mask = 0; // bit i is set iff cluster_issue_queue_avail_count[i] > 0
+
+  getcore().sched_get_all_issueq_free_slots(cluster_issue_queue_avail_count);
+
+  foreach (i, MAX_CLUSTERS) {
+    cluster_issue_queue_avail_mask |= ((cluster_issue_queue_avail_count[i] > 0) << i);
+  }
+
+  executable_on_cluster &= cluster_issue_queue_avail_mask; // keep only allowed clusters that also have room
+
+  if unlikely (config.event_log_enabled) {
+    event = getcore().eventlog.add(EVENT_CLUSTER_OK, this);
+    event->select_cluster.allowed_clusters = executable_on_cluster_mask; // logs the unfiltered member mask, not the narrowed copy
+    foreach (i, MAX_CLUSTERS) event->select_cluster.iq_avail[i] = cluster_issue_queue_avail_count[i];
+  }
+
+  if unlikely (!executable_on_cluster) {
+    if unlikely (config.event_log_enabled) event->type = EVENT_CLUSTER_NO_CLUSTER; // retag the event added above
+    return -1;
+  }
+  
+  int n = 0; // largest operand tally found so far
+  int cluster = find_random_set_bit(executable_on_cluster, sim_cycle); // fallback when no candidate cluster has a positive tally
+  
+  foreach (i, MAX_CLUSTERS) { // strict '>' keeps the lowest-numbered cluster among equal tallies
+    if ((cluster_operand_tally[i] > n) && bit(executable_on_cluster, i)) {
+      n = cluster_operand_tally[i];
+      cluster = i;
+    }
+  }
+
+  stats.ooocore.dispatch.cluster[cluster]++;
+
+  if unlikely (config.event_log_enabled) event->cluster = cluster;
+
+  return cluster;
+}
+
+//
+// Dispatch any uops in the rob_ready_to_dispatch_list by locating
+// their source operands and adding entries to the issue queues.
+//
+
+int OutOfOrderCore::dispatch() {
+  // Dispatches at most DISPATCH_WIDTH uops from rob_ready_to_dispatch_list:
+  // each gets a cluster from select_cluster() and is entered into that
+  // cluster's issue queue by find_sources().
+  // Returns the number of uops dispatched this cycle, or -1 after
+  // redispatch_deadlock_recovery() had to be invoked.
+  time_this_scope(ctdispatch);
+
+  ReorderBufferEntry* rob;
+  int dispatchcount = 0;
+
+  foreach_list_mutable(rob_ready_to_dispatch_list, rob, entry, nextentry) {
+    if unlikely (dispatchcount >= DISPATCH_WIDTH) break;
+
+    rob->cluster = rob->select_cluster();
+
+    if unlikely (rob->cluster < 0) {
+      // select_cluster() found no usable cluster: leave this uop where it is
+      // and try the next one, which avoids deadlock on re-dispatches.
+      if unlikely (config.event_log_enabled) {
+        OutOfOrderCoreEvent* event = eventlog.add(EVENT_DISPATCH_NO_CLUSTER, rob);
+        foreach (i, MAX_OPERANDS) rob->operands[i]->fill_operand_info(event->dispatch.opinfo[i]);
+      }
+      continue;
+    }
+
+    // A nonzero result from find_sources() selects the per-cluster dispatched
+    // list; otherwise the uop goes straight to its ready-to-issue list.
+    const bool operands_still_needed = rob->find_sources();
+    if likely (operands_still_needed) {
+      rob->changestate(rob_dispatched_list[rob->cluster]);
+    } else {
+      rob->changestate(rob->get_ready_to_issue_list());
+    }
+
+    if unlikely (config.event_log_enabled) {
+      OutOfOrderCoreEvent* event = eventlog.add(EVENT_DISPATCH_OK, rob);
+      foreach (i, MAX_OPERANDS) rob->operands[i]->fill_operand_info(event->dispatch.opinfo[i]);
+    }
+
+    dispatchcount++;
+  }
+
+  stats.ooocore.dispatch.width[dispatchcount]++;
+
+  if likely (dispatchcount > 0) {
+    // Progress was made this cycle: re-arm the deadlock countdown.
+    dispatch_deadlock_countdown = DISPATCH_DEADLOCK_COUNTDOWN_CYCLES;
+    return dispatchcount;
+  }
+
+  // Nothing was dispatched; count down only while uops are actually waiting.
+  if likely (rob_ready_to_dispatch_list.empty()) return dispatchcount;
+  if likely (--dispatch_deadlock_countdown) return dispatchcount;
+
+  if (logable(6)) logfile << "Dispatch deadlock at cycle ", sim_cycle, ", commits ", total_user_insns_committed, ": recovering...", endl;
+  redispatch_deadlock_recovery();
+  dispatch_deadlock_countdown = DISPATCH_DEADLOCK_COUNTDOWN_CYCLES;
+  return -1;
+}
+
+//
+// Issue Stage
+// (see oooexec.cpp for issue stages)
+//
+
+//
+// Complete Stage
+//
+// Process any ROB entries that just finished producing a result, forwarding
+// data within the same cluster directly to the waiting instructions.
+//
+// Note that we use the target physical register as a temporary repository
+// for the data. In a modern hardware implementation, this data would exist
+// only "on the wire" such that back to back ALU operations within a cluster
+// can occur using local forwarding.
+//
+
+int OutOfOrderCore::complete(int cluster) {
+  //
+  // Ages every uop on rob_issued_list[cluster] by one cycle. A uop whose
+  // cycles_left drops to zero or below is moved to rob_completed_list[cluster]
+  // and complete() is called on its physreg. Always returns 0: completecount
+  // is tallied but not reported to the caller.
+  //
+  time_this_scope(ctcomplete);
+  ReorderBufferEntry* rob;
+  int completecount = 0;
+
+  foreach_list_mutable(rob_issued_list[cluster], rob, entry, nextentry) {
+    rob->cycles_left--;
+    if likely (rob->cycles_left > 0) continue; // still cycles left for this uop
+
+    if unlikely (config.event_log_enabled) eventlog.add(EVENT_COMPLETE, rob);
+    rob->changestate(rob_completed_list[cluster]);
+    rob->physreg->complete();
+    rob->forward_cycle = 0;
+    rob->fu = 0;
+    completecount++;
+  }
+
+  return 0;
+}
+
+//
+// Transfer Stage
+//
+// Process ROBs in flight between completion and global forwarding/writeback.
+//
+
+int OutOfOrderCore::transfer(int cluster) {
+  // Forwards every uop on rob_completed_list[cluster]; once forward_cycle exceeds
+  // MAX_FORWARDING_LATENCY it is clamped and the uop moves to rob_ready_to_writeback_list.
+  time_this_scope(cttransfer);
+  ReorderBufferEntry* rob;
+
+  foreach_list_mutable(rob_completed_list[cluster], rob, entry, nextentry) {
+    rob->forward();
+    rob->forward_cycle++;
+    if unlikely (rob->forward_cycle > MAX_FORWARDING_LATENCY) {
+      rob->forward_cycle = MAX_FORWARDING_LATENCY;
+      rob->changestate(rob_ready_to_writeback_list[rob->cluster]); // indexed by the uop's own cluster field
+    }
+  }
+
+  return 0; // no count is reported to the caller
+}
+
+//
+// Writeback Stage
+//
+// Writeback at most WRITEBACK_WIDTH ROBs on rob_ready_to_writeback_list.
+//
+
+int OutOfOrderCore::writeback(int cluster) { // returns the number of uops written back in this call (at most WRITEBACK_WIDTH)
+  time_this_scope(ctwriteback);
+
+  int writecount = 0;
+  int wakeupcount = 0; // accumulates forward() results below; not read again in this function
+  ReorderBufferEntry* rob;
+
+  foreach_list_mutable(rob_ready_to_writeback_list[cluster], rob, entry, nextentry) {
+    if unlikely (writecount >= WRITEBACK_WIDTH)
+                  break;
+
+    //
+    // Gather statistics
+    //
+    bool transient = 0; // stays 0 unless ENABLE_TRANSIENT_VALUE_TRACKING is defined
+
+#ifdef ENABLE_TRANSIENT_VALUE_TRACKING
+    if likely (!isclass(rob->uop.opcode, OPCLASS_STORE|OPCLASS_BRANCH)) {
+      transient =
+        (rob->dest_renamed_before_writeback) &&
+        (rob->consumer_count <= 1) &&
+        (rob->physreg->all_consumers_sourced_from_bypass) &&
+        (rob->no_branches_between_renamings);
+
+      writeback_transient += transient;
+      writeback_persistent += (!transient);
+    }
+
+    rob->transient = transient;
+#endif
+
+    if likely (!isclass(rob->uop.opcode, OPCLASS_STORE|OPCLASS_BRANCH)) { // stores and branches get no writeback event
+      if unlikely (config.event_log_enabled) {
+        OutOfOrderCoreEvent* event = eventlog.add(EVENT_WRITEBACK, rob);
+        event->writeback.data = rob->physreg->data;
+        event->writeback.flags = rob->physreg->flags;
+        event->writeback.consumer_count = rob->consumer_count;
+        event->writeback.transient = transient;
+        event->writeback.all_consumers_sourced_from_bypass = rob->physreg->all_consumers_sourced_from_bypass;
+        event->writeback.no_branches_between_renamings = rob->no_branches_between_renamings;
+        event->writeback.dest_renamed_before_writeback = rob->dest_renamed_before_writeback;
+      }
+    }
+
+    //
+    // Catch corner case where dependent uop was scheduled
+    // while producer waited in ready_to_writeback state:
+    //
+    wakeupcount += rob->forward();
+
+    writecount++;
+
+    //
+    // For simulation purposes, final value is already in rob->physreg,
+    // so we don't need to actually write anything back here.
+    //
+    stats.ooocore.writeback.writebacks[rob->physreg->rfid]++;
+    rob->physreg->writeback();
+    rob->cycles_left = -1;
+
+    rob->changestate(rob_ready_to_commit_queue); // one shared queue, not indexed by cluster
+  }
+
+  per_cluster_stats_update(stats.ooocore.writeback.width, cluster, [writecount]++);
+
+  return writecount;
+}
+
+//
+// Commit Stage
+//
+// Commit at most COMMIT_WIDTH ready to commit instructions from ROB queue,
+// and commits any stores by writing to the L1 cache with write through.
+//
+// Returns:
+//    COMMIT_RESULT_OK, unless a different COMMIT_RESULT_* code (from a uop's commit,
+//    interrupt injection or the stop_at_user_insns limit) ended the commit loop
+//
+// Physical Register Recycling Complications
+//
+// Consider the following scenario:
+//
+// - uop U3 is renamed and found to depend on physical register R from an earlier uop U1.
+// - U1 commits to architectural register A and moves R to the arch state
+// - U2, which updates the same architectural register A as U1, also commits. Since the
+//   mapping of A is being logically overwritten by U2, U1's physical register R is freed.
+// - U3 finally issues, but finds that operand physical register R for U1 no longer exists.
+//
+// Additionally, in x86 processors the flags attached to a given physical register may 
+// be referenced by three additional rename table entries (for ZAPS, CF, OF) so simply
+// freeing the old physical register mapping when the RRT is updated doesn't work.
+//
+// For these reasons, we need to prevent U1's register R from being freed if it is still
+// referenced by anything still in the pipeline; the normal reorder buffer mechanism
+// cannot always handle this situation in a very long pipeline.
+//
+// The solution is to give each physical register a reference counter. As each uop operand
+// is renamed, the counter for the corresponding physical register is incremented. As each
+// uop commits, the counter for each of its operands is decremented, but the counter for
+// the target physical register itself is incremented before that register is moved to
+// the arch state during commitment (since the committed state now owns that register).
+//
+// As we update the committed RRT during the commit stage, the old register R mapped
+// to the destination architectural register A of the uop being committed is examined.
+// The register R is only moved to the free state iff its reference counter is zero.
+// Otherwise, it is moved to the pendingfree state. The hardware examines all counters
+// every cycle and moves physical registers to the free state only when their counters
+// become zero and they are in the pendingfree state.
+//
+// An additional complication arises for x86 since we maintain three separate rename 
+// table entries for the ZAPS, CF, OF flags in addition to the register rename table
+// entry. Therefore, each speculative RRT and commit RRT entry adds to the refcount.
+//
+// Hardware Implementation
+//
+// The hardware implementation of this scheme is straightforward and low complexity.
+// The counters can have a very small number of bits since it is very unlikely a given
+// physical register would be referenced by all 100+ uops in the ROB; 3 bits should be
+// enough to handle the typical maximum of < 8 uops sharing a given operand. Counter
+// overflows can simply stall renaming or flush the pipeline since they are so rare.
+//
+// The counter table can be updated in bulk each cycle by adding/subtracting the
+// appropriate sum or just adding zero if the corresponding register wasn't used.
+// Since there are several stages between renaming and commit, the same counter is never
+// both incremented and decremented in the same cycle, so race conditions are not an 
+// issue. 
+//
+// In real processors, the Pentium 4 uses a scheme similar to this one but uses bit
+// vectors instead. For smaller physical register files, this may be a better solution.
+// Each physical register has a bit vector with one bit per ROB entry. If a given
+// physical register P is still used by ROB entry E in the pipeline, P's bit vector
+// bit E is set. Register P cannot be freed until all bits in its vector are zero.
+//
+
+int OutOfOrderCore::commit() {
+  time_this_scope(ctcommit);
+
+  // Reclaim registers parked in the PHYSREG_PENDINGFREE state of every
+  // register file: each one for which referenced() no longer holds is
+  // freed and counted in free_regs_recycled.
+  foreach (rfid, PHYS_REG_FILE_COUNT) {
+    PhysicalRegister* physreg;
+    StateList& pendingfree = physregfiles[rfid].states[PHYSREG_PENDINGFREE];
+
+    foreach_list_mutable(pendingfree, physreg, entry, nextentry) {
+      if likely (physreg->referenced()) continue;
+      if unlikely (config.event_log_enabled) eventlog.add(EVENT_RECLAIM_PHYSREG)->physreg = physreg->index();
+      physreg->free();
+      stats.ooocore.commit.free_regs_recycled++;
+    }
+  }
+
+  //
+  // Commit ROB entries in program order, at most COMMIT_WIDTH per call,
+  // stopping at the first entry that does not yield COMMIT_RESULT_OK.
+  // The caller receives the result code of the last attempt.
+  //
+  int rc = COMMIT_RESULT_OK;
+  int commitcount = 0;
+
+  foreach_forward(ROB, i) {
+    ReorderBufferEntry& rob = ROB[i];
+    if unlikely (commitcount >= COMMIT_WIDTH) break;
+
+    rc = rob.commit();
+    // Random interrupt injection for testing single-core ASF on PTLsim/classic:
+    if unlikely (asf_in_crit_sec && asf_interrupt_critsec()) rc = COMMIT_RESULT_INTERRUPT;
+    if unlikely (rc != COMMIT_RESULT_OK) break;
+
+    commitcount++;
+    last_commit_at_cycle = sim_cycle;
+    if (total_user_insns_committed >= config.stop_at_user_insns) {
+      rc = COMMIT_RESULT_STOP;
+      break;
+    }
+  }
+
+  stats.ooocore.commit.width[commitcount]++;
+
+  return rc;
+}
+
+int ReorderBufferEntry::commit() {
+  OutOfOrderCore& core = getcore();
+  Context& ctx = core.ctx;
+
+  bool all_ready_to_commit = true;
+  bool macro_op_has_exceptions = false;
+
+  //
+  // Create an event log entry
+  //
+  OutOfOrderCoreEvent* event;
+
+  //
+  // Each x86 instruction may be composed of multiple uops; none of the uops
+  // may commit until ALL uops are ready to commit (either correctly or
+  // if one or more uops have exceptions). 
+  //
+  // This is accomplished by checking if the uop at the head of the ROB (next
+  // to commit) has its SOM (start of macro-op) bit set. If so, the ROB is 
+  // scanned forwards from the SOM uop to the EOM (end of macro-op) uop. If
+  // all uops in this range are ready to commit and are exception-free, the
+  // SOM uop allowed to commit. 
+  //
+  // Any exceptions in the macro-op uop range immediately signals an exception
+  // to the user code, and no part of the uop is committed. In any case,
+  // asynchronous interrupts are only taken after committing or excepting the
+  // EOM uop in a macro-op.
+  //
+
+  bool found_eom = 0;
+
+  foreach_forward_from(core.ROB, this, j) {
+    ReorderBufferEntry& subrob = core.ROB[j];
+
+    found_eom |= subrob.uop.eom;
+
+    if unlikely (!subrob.ready_to_commit()) {
+      all_ready_to_commit = false;
+    }
+
+#ifdef PTLSIM_HYPERVISOR
+    if unlikely ((subrob.uop.is_sse|subrob.uop.is_x87) && (ctx.cr0.ts | (subrob.uop.is_x87 & ctx.cr0.em))) {
+      subrob.physreg->data = EXCEPTION_FloatingPointNotAvailable;
+      subrob.physreg->flags = FLAG_INV;
+      if unlikely (subrob.lsq) subrob.lsq->invalid = 1;
+    }
+#endif
+
+    if unlikely (subrob.physreg->flags & FLAG_INV) {
+      //
+      // The exception is definitely going to happen, since the
+      // excepting instruction is at the head of the ROB. However,
+      // we don't know which uop within the instruction actually
+      // had the problem, e.g. if it's a load-alu-store insn, the
+      // load is OK but the store has PageFaultOnWrite. We take
+      // the first exception in uop order.
+      //
+      ctx.exception = LO32(subrob.physreg->data);
+      ctx.error_code = HI32(subrob.physreg->data);
+
+#ifdef PTLSIM_HYPERVISOR
+      // Capture the faulting virtual address for page faults
+      if ((ctx.exception == EXCEPTION_PageFaultOnRead) |
+          (ctx.exception == EXCEPTION_PageFaultOnWrite)) {
+        ctx.cr2 = subrob.origvirt;
+      }
+#endif
+
+      if unlikely (config.event_log_enabled) core.eventlog.add_commit(EVENT_COMMIT_EXCEPTION_DETECTED, &subrob);
+
+      macro_op_has_exceptions = true;
+      all_ready_to_commit = true;
+      found_eom = true;
+      break;
+    }
+    
+    if likely (subrob.uop.eom) break;
+  }
+
+  //
+  // Protect against the extremely rare case where only one x86
+  // instruction is in flight and its EOM uop has not even made
+  // it into the ROB by the time the first uop is ready to commit.
+  //
+
+  all_ready_to_commit &= found_eom;
+
+  if unlikely (!all_ready_to_commit) {
+    stats.ooocore.commit.result.none++;
+    return COMMIT_RESULT_NONE;
+  }
+
+  assert(ready_to_commit());
+
+  PhysicalRegister* oldphysreg = core.commitrrt[uop.rd];
+
+  //
+  // Update architectural state
+  //
+
+  bool ld = isload(uop.opcode);
+  bool st = isstore(uop.opcode);
+  bool br = isbranch(uop.opcode);
+
+  stats.ooocore.commit.opclass[opclassof(uop.opcode)]++;
+
+  if unlikely (macro_op_has_exceptions) {
+    if unlikely (config.event_log_enabled) event = core.eventlog.add_commit(EVENT_COMMIT_EXCEPTION_ACKNOWLEDGED, this);
+
+    // See notes in handle_exception():
+    if likely (isclass(uop.opcode, OPCLASS_CHECK) & (ctx.exception == EXCEPTION_SkipBlock)) {
+      core.chk_recovery_rip = ctx.commitarf[REG_rip] + uop.bytes;
+      if unlikely (config.event_log_enabled) event->type = EVENT_COMMIT_SKIPBLOCK;
+      stats.ooocore.commit.result.skipblock++;
+    } else {
+      stats.ooocore.commit.result.exception++;
+    }
+
+    return COMMIT_RESULT_EXCEPTION;
+  }
+
+  //
+  // Check for self modifying code (SMC) by checking if any previous
+  // instruction has dirtied the page(s) on which the current instruction
+  // resides. The SMC check is done first since it's perfectly legal for a
+  // store to overwrite its own instruction bytes, but this update only
+  // becomes visible after the store has committed.
+  //
+  bool page_crossing = ((lowbits(uop.rip.rip, 12) + (uop.bytes-1)) >> 12);
+  if unlikely (smc_isdirty(uop.rip.mfnlo) | (page_crossing && smc_isdirty(uop.rip.mfnhi))) {
+    if unlikely (config.event_log_enabled) core.eventlog.add_commit(EVENT_COMMIT_SMC_DETECTED, this);
+
+    //
+    // Invalidate the pages only after the pipeline is flushed: we may still
+    // hold refs to the affected basic blocks in the pipeline. Queue the
+    // updates for later.
+    //
+    core.smc_invalidate_pending = 1;
+    core.smc_invalidate_rvp = uop.rip;
+
+    stats.ooocore.commit.result.smc++;
+    return COMMIT_RESULT_SMC;
+  }
+  
+  if (st) assert(lsq->addrvalid && lsq->datavalid);
+
+  W64 result = physreg->data;
+
+  if likely (uop.som) assert(ctx.commitarf[REG_rip] == uop.rip); 
+
+  if (uop.is_asf) {
+    int asf_commit_rc = commit_asf_instruction(); 
+    if (asf_commit_rc != COMMIT_RESULT_OK) return asf_commit_rc;
+  }
+
+  //
+  // The commit of all uops in the x86 macro-op is guaranteed to happen after this point
+  //
+  if unlikely (config.event_log_enabled) event = core.eventlog.add_commit(EVENT_COMMIT_OK, this);
+
+  if unlikely (config.event_log_enabled) {
+    if unlikely ((uop.rip.rip == config.log_backwards_from_trigger_rip) && (uop.som)) {
+      logfile << "Hit trigger rip ", (void*)(Waddr)config.log_backwards_from_trigger_rip, "; printing event ring buffer:", endl, flush;
+      core.eventlog.print(logfile);
+      logfile << "End of triggered event dump", endl, flush;
+    }
+  }
+
+  if likely (archdest_can_commit[uop.rd]) {
+    core.commitrrt[uop.rd]->uncommitref(uop.rd);
+    core.commitrrt[uop.rd] = physreg;
+    core.commitrrt[uop.rd]->addcommitref(uop.rd);
+
+    if likely (uop.rd < ARCHREG_COUNT) ctx.commitarf[uop.rd] = physreg->data;
+
+    physreg->rob = null;
+  }
+
+  if likely (uop.eom) {
+    if unlikely (uop.rd == REG_rip) {
+      assert(isbranch(uop.opcode));
+      ctx.commitarf[REG_rip] = physreg->data;
+    } else {
+      assert(!isbranch(uop.opcode));
+      ctx.commitarf[REG_rip] += uop.bytes;
+    }
+    if unlikely (config.event_log_enabled) event->commit.target_rip = ctx.commitarf[REG_rip];
+  }
+
+  if unlikely (!uop.nouserflags) {
+    W64 flagmask = setflags_to_x86_flags[uop.setflags];
+    ctx.commitarf[REG_flags] = (ctx.commitarf[REG_flags] & ~flagmask) | (physreg->flags & flagmask);
+
+    stats.ooocore.commit.setflags.no += (uop.setflags == 0);
+    stats.ooocore.commit.setflags.yes += (uop.setflags != 0);
+
+    if unlikely (config.event_log_enabled) event->commit.state.reg.rdflags = ctx.commitarf[REG_flags];
+
+    if likely (uop.setflags & SETFLAG_ZF) {
+      core.commitrrt[REG_zf]->uncommitref(REG_zf);
+      core.commitrrt[REG_zf] = physreg;
+      core.commitrrt[REG_zf]->addcommitref(REG_zf);
+    }
+    if likely (uop.setflags & SETFLAG_CF) {
+      core.commitrrt[REG_cf]->uncommitref(REG_cf);
+      core.commitrrt[REG_cf] = physreg;
+      core.commitrrt[REG_cf]->addcommitref(REG_cf);
+    }
+    if likely (uop.setflags & SETFLAG_OF) {
+      core.commitrrt[REG_of]->uncommitref(REG_of);
+      core.commitrrt[REG_of] = physreg;
+      core.commitrrt[REG_of]->addcommitref(REG_of);
+    }
+  }
+
+  if unlikely (st) {
+    Waddr mfn = (lsq->physaddr << 3) >> 12;
+    smc_setdirty(mfn);
+    if (lsq->bytemask) assert(core.caches.commitstore(*lsq, (W64)origvirt, uop.internal) == 0);
+  }
+
+  if unlikely (pteupdate) {
+    ctx.update_pte_acc_dirty(origvirt, pteupdate);
+  }
+
+  //
+  // Free physical registers, load/store queue entries, etc.
+  //
+  if unlikely (ld|st) {
+    core.loads_in_flight -= (lsq->store == 0);
+    core.stores_in_flight -= (lsq->store == 1);
+    lsq->reset();
+    core.LSQ.commit(lsq);
+  }
+
+  assert(archdest_can_commit[uop.rd]);
+  assert(oldphysreg->state == PHYSREG_ARCH);
+
+  if unlikely (config.event_log_enabled) event->commit.oldphysreg = -1;
+  if likely (oldphysreg->nonnull()) {
+    if unlikely (config.event_log_enabled) {
+      event->commit.oldphysreg = oldphysreg->index();
+      event->commit.oldphysreg_refcount = oldphysreg->refcount;
+    }
+
+    if unlikely (oldphysreg->referenced()) {
+      oldphysreg->changestate(PHYSREG_PENDINGFREE); 
+      stats.ooocore.commit.freereg.pending++;
+    } else  {
+      oldphysreg->free();
+      stats.ooocore.commit.freereg.free++;
+    }
+  }
+
+  if likely (!(br|st)) {
+    int k = clipto((int)consumer_count, 0, lengthof(stats.ooocore.frontend.consumer_count)-1);
+    stats.ooocore.frontend.consumer_count[k]++;
+  }
+
+  physreg->changestate(PHYSREG_ARCH);
+
+  //
+  // Unlock operand physregs since we no longer need to worry about speculation recovery
+  // Technically this can be done after the issue queue entry is released, but we do it
+  // here for simplicity.
+  //
+  foreach (i, MAX_OPERANDS) {
+    operands[i]->unref(*this);
+  }
+
+  //
+  // Update branch prediction
+  //
+  if unlikely (isclass(uop.opcode, OPCLASS_BRANCH)) {
+    assert(uop.eom);
+    //
+    // NOTE: Technically the "branch address" refers to the rip of the *next* 
+    // x86 instruction after the branch; we use this consistently since x86
+    // instructions vary in length and we cannot easily calculate the next
+    // instruction in sequence from within the branch predictor logic.
+    //
+    W64 end_of_branch_x86_insn = uop.rip + uop.bytes;
+    bool taken = (ctx.commitarf[REG_rip] != end_of_branch_x86_insn);
+    bool predtaken = (uop.riptaken != end_of_branch_x86_insn);
+
+    if unlikely (config.event_log_enabled) {
+      event->commit.taken = taken;
+      event->commit.predtaken = predtaken;
+    }
+
+    core.branchpred.update(uop.predinfo, end_of_branch_x86_insn, ctx.commitarf[REG_rip]);
+    stats.ooocore.branchpred.updates++;
+  }
+
+  // Release our lock on the cached basic block containing this uop
+  uop.bb->release();
+
+  if likely (uop.eom) {
+    total_user_insns_committed++;
+    stats.ooocore.commit.insns++;
+    stats.summary.insns++;
+  }
+
+  stats.summary.uops++;
+  total_uops_committed++;
+  stats.ooocore.commit.uops++;
+
+  bool uop_is_eom = uop.eom;
+  bool uop_is_barrier = isclass(uop.opcode, OPCLASS_BARRIER);
+
+  changestate(core.rob_free_list);
+  reset();
+  core.ROB.commit(*this);
+
+  if unlikely (uop_is_barrier) {
+    if unlikely (config.event_log_enabled) core.eventlog.add(EVENT_COMMIT_ASSIST, RIPVirtPhys(ctx.commitarf[REG_rip]));
+    stats.ooocore.commit.result.barrier++;
+    return COMMIT_RESULT_BARRIER;
+  }
+
+  if unlikely (uop_is_eom & core.handle_interrupt_at_next_eom) {
+    core.handle_interrupt_at_next_eom = 0;
+    return COMMIT_RESULT_INTERRUPT;
+  }
+
+  stats.ooocore.commit.result.ok++;
+  return COMMIT_RESULT_OK;
+}
+
+namespace ASFOutOfOrderModel {
+  const byte archdest_is_visible[TRANSREG_COUNT] = { // one 0/1 flag per register slot; NOTE(review): consumers are outside this chunk
+    // Integer registers
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1,
+    // SSE registers, low 64 bits
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1,
+    // SSE registers, high 64 bits
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1,
+    // x87 FP / MMX / special
+    1, 1, 1, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    // The following are ONLY used during the translation and renaming process:
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+  };
+
+  const byte archdest_can_commit[TRANSREG_COUNT] = { // indexed by uop.rd; the commit path asserts the entry is nonzero
+    // Integer registers
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1,
+    // SSE registers, low 64 bits
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1,
+    // SSE registers, high 64 bits
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1,
+    // x87 FP / MMX / special
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 0, // index 63 is the only zero entry in this table
+    // The following are ONLY used during the translation and renaming process:
+    1, 1, 1, 1, 1, 1, 1, 1,
+    1, 1, 1, 1, 1, 1, 1, 1,
+  };
+};
+
+/**
+ * This aborts a currently running ASF transaction, by
+ * copying (instantly) all backed up values from the
+ * LLB to the caches and memory (this is the same in PTLsim).
+ * This should be called during the commit phase of the
+ * instruction causing the abort. It checks all loads and
+ * stores which are in flight and forces them to redispatch.
+ * This implements Leendert's suggestion of lazy abort at the
+ * final release.
+ */
+
+void ReorderBufferEntry::abort_asf() {
+  OutOfOrderCore& core = getcore();
+  LockedLineBuffer& llb = core.locked_line_buffer;
+
+  assert(core.asf_in_crit_sec); // aborting only makes sense while inside an ASF critical section
+  cerr << __FILE__,__LINE__,": Aborting ASF crit-sec, µop: ", uop, " LLB: ", llb.num_locations, endl, flush;
+  /* Scan through the LoadStoreQueue and redispatch the dependents of every valid load and store that hits one of the addresses in the LLB. */
+  foreach_forward(core.LSQ, i){
+    //cerr << __FILE__,__LINE__,": Checking LSQ[",i,"]=",core.LSQ[i],endl,flush;
+    LoadStoreQueueEntry& lsq = core.LSQ[i];
+    /* Check for valid entries that alias */
+    //cerr << __FILE__,__LINE__,": Checking ", lsq.store ? "store":"load", " at LSQ[",i,"]: ", lsq, endl, flush;
+    if (lsq.entry_valid && llb.contains(lsq.physaddr << 3)) { // NOTE(review): physaddr appears to be kept in 8-byte units, hence << 3 -- confirm
+      cerr << __FILE__,__LINE__,": Found aliasing ", lsq.store ? "store":"load", " at LSQ[",i,"] ";
+      cerr << " which happens ",(uop.uuid > lsq.rob->uop.uuid) ? "earlier": "later", " (we=",uop.uuid,", them=",lsq.rob->uop.uuid,")",endl, flush;
+      cerr << __FILE__,__LINE__,": Redispatching associated ROBEntry ",lsq.rob->uop,endl,flush;
+      /* Replaying is not enough! *///lsq.rob->replay();
+      lsq.rob->redispatch_dependents();
+    }
+  }
+  /* Restore the contents from the LLB and clear it */
+  llb.abort();
+  core.asf_in_crit_sec = false; // the critical section is over once the LLB has been rolled back
+}
+
+/**
+ * Hook into ASF, processes committing the ASF state changing instructions,
+ * ACQUIRE and COMMIT. It checks, whether there have been no concurrent accesses
+ * before the ACQUIRE and handles COMMIT.
+ */
+int ReorderBufferEntry::commit_asf_instruction() {
+  assert(uop.is_asf); // must only be called for uops flagged as ASF
+
+  OutOfOrderCore&   core = getcore();
+  LockedLineBuffer& llb  = core.locked_line_buffer;
+  Context&          ctx  = core.ctx;
+  cerr << __FILE__,__LINE__,": Commiting ASF instruction ", uop, " LLB: ", llb.num_locations, endl, flush;
+
+  /* Unstall the frontend to allow fetching of the following instructions */
+  core.stall_frontend = false;
+
+  if unlikely(uop.opcode == OP_acq) {
+    assert(!core.asf_in_crit_sec); // an ACQUIRE may not commit while a critical section is already open
+    /* We already have some problems -> do not enter the critical section! */
+    if unlikely(physreg->data) { // a nonzero ACQUIRE result is treated as an error reported at issue time
+      core.asf_reissue_will_fail = false;
+      /* All side-effects have already been cleared! Just do NOT enter the crit.sec.*/
+      return COMMIT_RESULT_OK;
+    }
+    /* Late check for any problems during the execution (between issue and commit) of
+       the ACQUIRE instruction, before we (finally) enter the critical section. */
+    W64 asf_err = llb.consistency_error();
+    if unlikely(asf_err) {
+      cerr << __FILE__,__LINE__,": ACQUIRE could not create a valid snapshot, detected LATE! Error ", asf_err, endl,flush;
+      //llb.clear(); // Not clearing the LLB will allow the re-execution to detect the proper error in the issue stage!
+      /* As this has happened fairly late, other ops might actually already execute inside the CS.
+         -> Redispatch, the conditional jump after the ACQUIRE will move the control flow away
+         from any instruction inside the CS and annul any speculative data! */
+      /* The acquire will fail, because the number of specified locations will be wrong. Alternatively,
+         one might specify some error_code, this is TODO! */
+      /* Should that actually be done at commit time of the acquire? Probably the Acquire instruction should
+         be prevented from committing, by redispatching it, when a conflicting access hits inside the LLB. TODO!
+         For now, let's hope that this works! */
+      redispatch_dependents(true);
+      core.asf_reissue_will_fail = true;
+     /* TODO: Current spec says simply -18 for this, but how about using the RIP here? We also could
+              incorporate some information from the LLB. TBD! */
+      core.asf_stored_error = uop.rip;
+      return  COMMIT_RESULT_NONE; // nothing is committed; the ACQUIRE goes through the pipeline again
+    }
+    /* All is well -> inside crit. section, save our acquire rip */
+    core.asf_in_crit_sec       = true;
+    core.asf_failing_acquire   = uop.rip;
+    core.asf_reissue_will_fail = false;
+  } else if unlikely (uop.opcode == OP_com) {
+    llb.commit(); // COMMIT: finalize the LLB contents and leave the critical section
+    core.asf_in_crit_sec       = false;
+    core.asf_failing_acquire   = 0;
+  }
+  /* ASF loads: NOTE(review) nothing is recorded in the LLB here; only the legality check below is performed */
+  if ((uop.opcode == OP_ld) || (uop.opcode == OP_ld_pre)) {
+    /* LOCKed loads are illegal inside an ASF-critical-section */
+    if unlikely (core.asf_in_crit_sec)
+      ctx.propagate_x86_exception(EXCEPTION_x86_invalid_opcode);
+  }
+  return COMMIT_RESULT_OK;
+}
diff -r 10448c053ad6 dcache-amd-barcelona-asf.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dcache-amd-barcelona-asf.h	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,893 @@
+// -*- c++ -*-
+//
+// PTLsim: Cycle Accurate x86-64 Simulator
+// Data Cache
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+// Copyright 2000-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
+//
+
+/**
+ * Enables cross core cache invalidation and cross core cache forwarding.
+ * Only has effect with a proper SMP model, ie. you have to _disable_
+ * ENABLE_SMT.
+ */
+#define POOR_MANS_MESI
+#include <ptlsim.h>
+//#include <datastore.h>
+#define MAX_HIERARCHIES 8
+struct LoadStoreInfo { // compact per-request descriptor carried along with each load fill request
+  W16 rob;      // printed as "rob" by LoadFillReq::print; presumably the ROB entry index -- confirm
+  W8  threadid; // hardware thread that issued the access
+  W8  sizeshift:2, aligntype:2, sfrused:1, internal:1, signext:1, pad1:1; // bitfields filling exactly one byte
+  W32 pad32;    // padding; the fields above total 64 bits
+  RawDataAccessors(LoadStoreInfo, W64); // NOTE(review): presumably lets the struct be handled as one raw W64 -- confirm macro
+};
+
+#define per_context_dcache_stats_ref(vcpuid) (*(((PerContextDataCacheStats*)&stats.dcache.vcpu0) + (vcpuid)))
+#define per_context_dcache_stats_update(vcpuid, expr) stats.dcache.total.expr, per_context_dcache_stats_ref(vcpuid).expr
+
+namespace CacheSubsystem {
+  // How many load wakeups can be driven into the core each cycle:
+  const int MAX_WAKEUPS_PER_CYCLE = 2;
+
+#ifndef STATS_ONLY
+
+// non-debugging only:
+//#define __RELEASE__
+#ifdef __RELEASE__
+#undef assert
+#define assert(x) (x)
+#endif
+
+  //#define CACHE_ALWAYS_HITS
+  //#define L2_ALWAYS_HITS
+  
+  // 64 KB L1 at 3 cycles
+  const int L1_LINE_SIZE = 64;
+  const int L1_SET_COUNT = 512;
+  const int L1_WAY_COUNT = 2;
+#define ENFORCE_L1_DCACHE_BANK_CONFLICTS
+  const int L1_DCACHE_BANKS    =  8; // 8x16 byte = 128 bytes > linesize! -> not all lines can interfere w/ each other
+  const int L1_DCACHE_BANKSIZE = 16;
+  // BEWARE: If you define this, you must make sure that virtual addresses are
+  // passed correctly to the functions in dcache, namely issueload_slowpath,
+  // issue_prefetch, commitstore
+#define L1_VIRTUALLY_INDEXED
+  // With virtually indexed caches a single memory location could be present
+  // twice in the cache, at different virtual indices. If this option below is
+  // defined, the cache will ensure that such aliases are detected and removed.
+  // This is not strictly necessary for PTLsim, as data is still consistent and
+  // checking for aliases costs a significant amount of time!
+#define L1_ENFORCE_VIRTUAL_ALIASING
+
+  // 64 KB L1I
+  const int L1I_LINE_SIZE = 64;
+  const int L1I_SET_COUNT = 512;
+  const int L1I_WAY_COUNT = 2;
+
+  // 512 KB L2 at 9 cycles (specs) total (+L1 and issue) ~15 cycles (measured!)
+  // TODO: Exclusive, victim-cache!
+  const int L2_LINE_SIZE = 64;
+  const int L2_SET_COUNT = 512;
+  const int L2_WAY_COUNT = 16;
+  const int L2_LATENCY   = 13; // don't include the extra wakeup cycle (waiting->ready state transition) in the LFRQ
+
+  // TODO: Share this between cores on the same die!
+  // TODO: This is also a victim cache :-/ and has some sharing features!
+#define ENABLE_L3_CACHE
+#ifdef ENABLE_L3_CACHE
+  // 2 MB L3 cache (1024 sets, 16 ways) with 128-byte lines, TODO: latency?
+  const int L3_SET_COUNT = 1024;
+  const int L3_WAY_COUNT = 16;
+  const int L3_LINE_SIZE = 128;
+  const int L3_LATENCY   = 35;
+#endif
+  // Load Fill Request Queue (maximum number of missed loads)
+  const int LFRQ_SIZE = 63;
+
+  // Allow up to 16 outstanding lines in the L2 awaiting service:
+  const int MISSBUF_COUNT = 16;
+  const int MAIN_MEM_LATENCY = 160;
+
+  const int CROSS_CACHE_LATENCY = 100;
+  // TLBs
+#define USE_TLB
+  // NOTE: This is L1 TLBs!
+  const int ITLB_SIZE = 32;
+  const int DTLB_SIZE = 48;
+#define USE_L2_TLB
+  // L2 i-TLB 512 4k entries, 4-way set associative
+  const int L2_ITLB_SET_COUNT = 128;
+  const int L2_ITLB_WAY_COUNT = 4;
+  const int L2_ITLB_LATENCY   = 4;
+  // L2 d-TLB 512 4k entries, 4-way set associative (ignore 2MB entries)
+  const int L2_DTLB_SET_COUNT = 128;
+  const int L2_DTLB_WAY_COUNT = 4;
+  const int L2_DTLB_LATENCY   = 4;
+
+//#define ISSUE_LOAD_STORE_DEBUG
+//#define CHECK_LOADS_AND_STORES
+
+// Line Usage Statistics
+
+//#define TRACK_LINE_USAGE
+
+#ifdef TRACK_LINE_USAGE
+#define DCACHE_L1_LINE_LIFETIME_INTERVAL   1
+#define DCACHE_L1_LINE_DEADTIME_INTERVAL   1
+#define DCACHE_L1_LINE_HITCOUNT_INTERVAL   1
+#define DCACHE_L1_LINE_LIFETIME_SLOTS      8192
+#define DCACHE_L1_LINE_DEADTIME_SLOTS      8192
+#define DCACHE_L1_LINE_HITCOUNT_SLOTS      64
+
+#define DCACHE_L1I_LINE_LIFETIME_INTERVAL  16
+#define DCACHE_L1I_LINE_DEADTIME_INTERVAL  16
+#define DCACHE_L1I_LINE_HITCOUNT_INTERVAL  1
+#define DCACHE_L1I_LINE_LIFETIME_SLOTS     8192
+#define DCACHE_L1I_LINE_DEADTIME_SLOTS     8192
+#define DCACHE_L1I_LINE_HITCOUNT_SLOTS     1024
+
+#define DCACHE_L2_LINE_LIFETIME_INTERVAL   4
+#define DCACHE_L2_LINE_DEADTIME_INTERVAL   4
+#define DCACHE_L2_LINE_HITCOUNT_INTERVAL   1
+#define DCACHE_L2_LINE_LIFETIME_SLOTS      65536
+#define DCACHE_L2_LINE_DEADTIME_SLOTS      65536
+#define DCACHE_L2_LINE_HITCOUNT_SLOTS      256
+
+#define DCACHE_L3_LINE_LIFETIME_INTERVAL   64
+#define DCACHE_L3_LINE_DEADTIME_INTERVAL   64
+#define DCACHE_L3_LINE_HITCOUNT_INTERVAL   1
+#define DCACHE_L3_LINE_LIFETIME_SLOTS      16384
+#define DCACHE_L3_LINE_DEADTIME_SLOTS      16384
+#define DCACHE_L3_LINE_HITCOUNT_SLOTS      256
+#endif
+
+  //
+  // Cache Line Types
+  //
+  template <int linesize>
+  struct CacheLine { // cache line without a per-byte valid mask; carries only optional usage statistics
+#ifdef TRACK_LINE_USAGE
+    W32 filltime; // sim_cycle (truncated to W32) recorded by clearstats()
+    W32 lasttime; // sim_cycle (truncated to W32) recorded by clearstats()
+    W32 hitcount; // zeroed by clearstats()
+#else
+    byte dummy;   // placeholder member used when no statistics are tracked
+#endif
+    void reset() { clearstats(); }
+    void invalidate() { reset(); }
+    void fill(W64 tag, const bitvec<linesize>& valid) { } // intentionally empty: there is no valid mask to update
+
+    void clearstats() { // no-op unless TRACK_LINE_USAGE is defined
+#ifdef TRACK_LINE_USAGE
+      filltime = sim_cycle;
+      lasttime = sim_cycle;
+      hitcount = 0;
+#endif
+    }
+
+    ostream& print(ostream& os, W64 tag) const; // defined outside this header section
+  };
+
+  template <int linesize>
+  static inline ostream& operator <<(ostream& os, const CacheLine<linesize>& line) {
+    return line.print(os, 0);
+  }
+
+  template <int linesize>
+  struct CacheLineWithValidMask { // cache line that tracks validity with one bit per byte of the line
+    bitvec<linesize> valid; // set bits mark valid bytes; cleared by reset(), extended by fill()
+#ifdef TRACK_LINE_USAGE
+    W32 filltime; // sim_cycle (truncated to W32) recorded by clearstats()
+    W32 lasttime; // sim_cycle (truncated to W32) recorded by clearstats()
+    W32 hitcount; // zeroed by clearstats()
+#endif
+
+    void clearstats() { // no-op unless TRACK_LINE_USAGE is defined
+#ifdef TRACK_LINE_USAGE
+      filltime = sim_cycle;
+      lasttime = sim_cycle;
+      hitcount = 0;
+#endif
+    }
+
+    void reset() { valid = 0; clearstats(); } // drop all valid bits and restart the statistics
+    void invalidate() { reset(); }
+    void fill(W64 tag, const bitvec<linesize>& valid) { this->valid |= valid; } // ORs in new valid bits; tag is unused
+    ostream& print(ostream& os, W64 tag) const; // defined outside this header section
+  };
+
+  template <int linesize>
+  static inline ostream& operator <<(ostream& os, const CacheLineWithValidMask<linesize>& line) {
+    return line.print(os, 0);
+  }
+
+  typedef CacheLineWithValidMask<L1_LINE_SIZE> L1CacheLine;
+  typedef CacheLine<L1I_LINE_SIZE> L1ICacheLine;
+  typedef CacheLineWithValidMask<L2_LINE_SIZE> L2CacheLine;
+#ifdef ENABLE_L3_CACHE
+  typedef CacheLine<L3_LINE_SIZE> L3CacheLine;
+#endif
+
+  //
+  // L1 data cache
+  //
+#ifdef TRACK_LINE_USAGE
+  static const char* cache_names[4] = {"L1", "I1", "L2", "L3"};
+
+  template <int uniq, typename V, int LIFETIME_INTERVAL, int LIFETIME_SLOTS, int DEADTIME_INTERVAL, int DEADTIME_SLOTS, int HITCOUNT_INTERVAL, int HITCOUNT_SLOTS>
+  struct HistogramAssociativeArrayStatisticsCollector { // statistics hooks that bin line lifetime, dead time and hit count into histograms
+    static W64 line_lifetime_histogram[LIFETIME_SLOTS]; // static members: declared here, defined elsewhere
+    static W64 line_deadtime_histogram[DEADTIME_SLOTS];
+    static W64 line_hitcount_histogram[HITCOUNT_SLOTS];
+
+    static const bool FORCE_DEBUG = 0; // set to 1 to log every event regardless of the log level
+
+    HistogramAssociativeArrayStatisticsCollector() {
+      reset();
+    }
+
+    static void reset() { // zero all three histograms
+      setzero(line_lifetime_histogram);
+      setzero(line_deadtime_histogram);
+      setzero(line_hitcount_histogram);
+    }
+
+    static void evicted(const V& line, W64 tag) {
+      // Line has been evicted: update statistics
+      W64s lifetime = line.lasttime - line.filltime; // NOTE(review): both are W32, so this presumably wraps rather than going negative
+      assert(lifetime >= 0);
+      int lifetimeslot = clipto(lifetime / LIFETIME_INTERVAL, 0, LIFETIME_SLOTS-1);
+      line_lifetime_histogram[lifetimeslot]++;
+
+      W64s deadtime = sim_cycle - line.lasttime; // cycles since the last recorded hit
+      int deadtimeslot = clipto(deadtime / DEADTIME_INTERVAL, 0, DEADTIME_SLOTS-1);
+      line_deadtime_histogram[deadtimeslot]++;
+
+      W64 hitcount = line.hitcount;
+      int hitcountslot = clipto(hitcount / HITCOUNT_INTERVAL, 0, HITCOUNT_SLOTS-1);
+      line_hitcount_histogram[hitcountslot]++;
+
+      if (logable(6) | FORCE_DEBUG) logfile << "[", cache_names[uniq], "] ", sim_cycle, ": evicted(", (void*)tag, "): lifetime ", lifetime, ", deadtime ", deadtime, ", hitcount ", hitcount, " (line addr ", &line, ")", endl;
+    }
+
+    static void filled(V& line, W64 tag) { // start a new lifetime; the fill itself counts as the first hit
+      line.filltime = sim_cycle;
+      line.lasttime = sim_cycle;
+      line.hitcount = 1;
+
+      if (logable(6) | FORCE_DEBUG) logfile << "[", cache_names[uniq], "] ", sim_cycle, ": filled(", (void*)tag, ")", " (line addr ", &line, ")", endl;
+    }
+
+    static void inserted(V& line, W64 newtag, int way) {
+      filled(line, newtag);
+    }
+
+    static void replaced(V& line, W64 oldtag, W64 newtag, int way) { // account for the old line, then start the new one
+      evicted(line, oldtag);
+      filled(line, newtag);
+    }
+
+    static void probed(V& line, W64 tag, int way, bool hit) { // logs every probe; only hits update the line
+      if (logable(6) | FORCE_DEBUG) logfile << "[", cache_names[uniq], "] ", sim_cycle, ": probe(", (void*)tag, "): ", (hit ? "HIT" : "miss"), " way ", way, ": hitcount ", line.hitcount, ", filltime ", line.filltime, ", lasttime ", line.lasttime, " (line addr ", &line, ")", endl;
+      if (hit) {
+        line.hitcount++;
+        line.lasttime = sim_cycle;
+      }
+    }
+
+    static void overflow(W64 tag) { }
+
+    static void locked(V& slot, W64 tag, int way) { }
+    static void unlocked(V& slot, W64 tag, int way) { }
+
+    static void invalidated(V& line, W64 oldtag, int way) { evicted(line, oldtag); } // invalidation is accounted like an eviction
+
+    static void savestats(DataStoreNode& ds) { // export the three histograms under fixed node names
+      ds.add("lifetime", (W64s*)line_lifetime_histogram, LIFETIME_SLOTS, 0, ((LIFETIME_SLOTS-1) * LIFETIME_INTERVAL), LIFETIME_INTERVAL);
+      ds.add("deadtime", (W64s*)line_deadtime_histogram, DEADTIME_SLOTS, 0, ((DEADTIME_SLOTS-1) * DEADTIME_INTERVAL), DEADTIME_INTERVAL);
+      ds.add("hitcount", (W64s*)line_hitcount_histogram, HITCOUNT_SLOTS, 0, ((HITCOUNT_SLOTS-1) * HITCOUNT_INTERVAL), HITCOUNT_INTERVAL);
+    }
+  };
+
+  typedef HistogramAssociativeArrayStatisticsCollector<0, L1CacheLine,
+    DCACHE_L1_LINE_LIFETIME_INTERVAL, DCACHE_L1_LINE_LIFETIME_SLOTS, 
+    DCACHE_L1_LINE_DEADTIME_INTERVAL, DCACHE_L1_LINE_DEADTIME_SLOTS, 
+    DCACHE_L1_LINE_HITCOUNT_INTERVAL, DCACHE_L1_LINE_HITCOUNT_SLOTS> L1StatsCollectorBase;
+
+  typedef HistogramAssociativeArrayStatisticsCollector<1, L1ICacheLine,
+    DCACHE_L1I_LINE_LIFETIME_INTERVAL, DCACHE_L1I_LINE_LIFETIME_SLOTS, 
+    DCACHE_L1I_LINE_DEADTIME_INTERVAL, DCACHE_L1I_LINE_DEADTIME_SLOTS, 
+    DCACHE_L1I_LINE_HITCOUNT_INTERVAL, DCACHE_L1I_LINE_HITCOUNT_SLOTS> L1IStatsCollectorBase;
+
+  typedef HistogramAssociativeArrayStatisticsCollector<2, L2CacheLine,
+    DCACHE_L2_LINE_LIFETIME_INTERVAL, DCACHE_L2_LINE_LIFETIME_SLOTS, 
+    DCACHE_L2_LINE_DEADTIME_INTERVAL, DCACHE_L2_LINE_DEADTIME_SLOTS, 
+    DCACHE_L2_LINE_HITCOUNT_INTERVAL, DCACHE_L2_LINE_HITCOUNT_SLOTS> L2StatsCollectorBase;
+
+#ifdef ENABLE_L3_CACHE
+  typedef HistogramAssociativeArrayStatisticsCollector<3, L3CacheLine,
+    DCACHE_L3_LINE_LIFETIME_INTERVAL, DCACHE_L3_LINE_LIFETIME_SLOTS, 
+    DCACHE_L3_LINE_DEADTIME_INTERVAL, DCACHE_L3_LINE_DEADTIME_SLOTS, 
+    DCACHE_L3_LINE_HITCOUNT_INTERVAL, DCACHE_L3_LINE_HITCOUNT_SLOTS> L3StatsCollectorBase;
+#endif
+
+  struct L1StatsCollector: public L1StatsCollectorBase { };
+  struct L1IStatsCollector: public L1IStatsCollectorBase { };
+  struct L2StatsCollector: public L2StatsCollectorBase { };
+#ifdef ENABLE_L3_CACHE
+  struct L3StatsCollector: public L3StatsCollectorBase { };
+#endif
+
+#else
+  typedef NullAssociativeArrayStatisticsCollector<W64, L1CacheLine> L1StatsCollector;
+  typedef NullAssociativeArrayStatisticsCollector<W64, L1ICacheLine> L1IStatsCollector;
+  typedef NullAssociativeArrayStatisticsCollector<W64, L2CacheLine> L2StatsCollector;
+#ifdef ENABLE_L3_CACHE
+  typedef NullAssociativeArrayStatisticsCollector<W64, L3CacheLine> L3StatsCollector;
+#endif
+#endif
+
+  template <typename V, int setcount, int waycount, int linesize, typename stats = NullAssociativeArrayStatisticsCollector<W64, V> > 
+  struct DataCache: public AssociativeArray<W64, V, setcount, waycount, linesize, stats> {
+    typedef AssociativeArray<W64, V, setcount, waycount, linesize, stats> base_t;
+    void clearstats() { // reset per-line usage statistics (does nothing unless TRACK_LINE_USAGE is defined)
+#ifdef TRACK_LINE_USAGE
+      foreach (set, setcount) { // iterate this cache's own sets, not the L1's set count
+        foreach (way, waycount) {
+          base_t::sets[set][way].clearstats();
+        }
+      }
+#endif
+    }
+    /**
+     * Probing virtually indexed caches: the set is chosen by the virtual
+     * address, the tag is derived from the physical address.
+     * @param physaddr Physical address of data to be probed for.
+     * @param virtaddr Virtual address of probed item.
+     * @return The matching line, or null if the probe misses.
+     */
+    V* split_probe(W64 physaddr, W64 virtaddr) {
+      assert(floor(lowbits(physaddr,PAGE_SHIFT), L1_LINE_SIZE)
+          == floor(lowbits(virtaddr,PAGE_SHIFT), L1_LINE_SIZE));
+
+      V* res =  base_t::sets[base_t::setof(virtaddr)].probe(base_t::tagof(physaddr));
+      return res;
+    }
+
+    /**
+     * Selecting virtually indexed caches, care has to be taken to prevent
+     * aliasing! Simple handling here: On a cache miss, probe the other aliases
+     * and evict them if present.
+     * @param physaddr Physical address of data to be selected.
+     * @param virtaddr Virtual address of selected item.
+     */
+    V* split_select(W64 physaddr, W64 virtaddr) {
+      assert(floor(lowbits(physaddr,PAGE_SHIFT), L1_LINE_SIZE)
+          == floor(lowbits(virtaddr,PAGE_SHIFT), L1_LINE_SIZE));
+
+#ifdef L1_ENFORCE_VIRTUAL_ALIASING
+      V* res = split_probe(physaddr, virtaddr);
+      if likely (res) return res;
+
+      /* SD: Nothing found, remove potential aliases. 
+         Aliasing happens in the bits which are part of the index and the
+         virtual page number. */
+      const int naliases = (setcount*linesize) >> PAGE_SHIFT;
+      // naliases == 0 (index bits fit inside the page offset): the loop below runs zero times.
+      int this_alias = (virtaddr >> PAGE_SHIFT) & (naliases - 1);
+
+      /* SD: Find all _other_ aliases and remove them */
+      int aliasset;
+      foreach (i, naliases) {
+        if (i == this_alias) continue;
+        aliasset = base_t::setof((i << PAGE_SHIFT) | lowbits(virtaddr, PAGE_SHIFT));
+        base_t::sets[aliasset].invalidate(base_t::tagof(physaddr));
+      }
+#endif
+      W64 dummy; // receives select()'s second (output) argument, which is not needed here
+      return base_t::sets[base_t::setof(virtaddr)].select(base_t::tagof(physaddr), dummy);
+    }
+
+    /**
+     * Invalidating virtually indexed caches.
+     * @param physaddr Physical address of data to be invalidated.
+     * @param virtaddr Virtual address of data to be invalidated.
+     */
+    void split_invalidate(W64 physaddr, W64 virtaddr) {
+      assert(floor(lowbits(physaddr,PAGE_SHIFT), L1_LINE_SIZE)
+          == floor(lowbits(virtaddr,PAGE_SHIFT), L1_LINE_SIZE));
+#ifdef L1_ENFORCE_VIRTUAL_ALIASING
+      const int naliases = (setcount*linesize) >> PAGE_SHIFT;
+      if (!naliases) {
+        base_t::sets[base_t::setof(virtaddr)].invalidate(base_t::tagof(physaddr));
+        return;
+      }
+      /* SD: Find all aliases & invalidate them, if they have the same physaddr. */
+      int aliasset;
+      foreach (i, naliases) {
+        aliasset = base_t::setof((i << PAGE_SHIFT) | lowbits(virtaddr, PAGE_SHIFT));
+        base_t::sets[aliasset].invalidate(base_t::tagof(physaddr));
+      }
+#else
+      base_t::sets[base_t::setof(virtaddr)].invalidate(base_t::tagof(physaddr));
+#endif
+    }
+
+  };
+
+  struct L1Cache: public DataCache<L1CacheLine, L1_SET_COUNT, L1_WAY_COUNT, L1_LINE_SIZE, L1StatsCollector> {
+    L1CacheLine* validate(W64 physaddr, W64 virtaddr, const bitvec<L1_LINE_SIZE>& valid) {
+      // Pick the target line; with a virtually indexed L1 the set comes from virtaddr.
+#ifdef L1_VIRTUALLY_INDEXED
+      L1CacheLine* const entry = split_select(physaddr, virtaddr);
+#else
+      L1CacheLine* const entry = select(physaddr);
+#endif
+      // Merge the newly valid bytes into the line's valid mask.
+      entry->fill(tagof(physaddr), valid);
+      return entry;
+    }
+  };
+
+  static inline ostream& operator <<(ostream& os, const L1Cache& cache) {
+    return os;
+  }
+
+  //
+  // L1 instruction cache
+  //
+
+  struct L1ICache: public DataCache<L1ICacheLine, L1I_SET_COUNT, L1I_WAY_COUNT, L1I_LINE_SIZE, L1IStatsCollector> {
+    L1ICacheLine* validate(W64 addr, const bitvec<L1I_LINE_SIZE>& valid) {
+      const W64 linetag = tagof(addr); // both select() and fill() are driven by the tag form of addr
+      L1ICacheLine* const entry = select(linetag);
+      entry->fill(linetag, valid);     // CacheLine::fill is empty, so no mask is kept for the I-cache
+      return entry;
+    }
+  };
+
+  static inline ostream& operator <<(ostream& os, const L1ICache& cache) {
+    return os;
+  }
+
+  //
+  // L2 cache
+  //
+
+  typedef DataCache<L2CacheLine, L2_SET_COUNT, L2_WAY_COUNT, L2_LINE_SIZE, L2StatsCollector> L2CacheBase;
+
+  struct L2Cache: public L2CacheBase {
+    // Mark every byte of the line selected for addr as valid; nothing happens if select() yields no line.
+    void validate(W64 addr) {
+      L2CacheLine* const entry = select(addr);
+      if (entry) entry->valid.setall();
+    }
+
+    void deliver(W64 address);
+  };
+
+  //
+  // L3 cache
+  //
+#ifdef ENABLE_L3_CACHE
+  static inline ostream& operator <<(ostream& os, const L3CacheLine& line) {
+    return line.print(os, 0);
+  }
+
+  struct L3Cache: public DataCache<L3CacheLine, L3_SET_COUNT, L3_WAY_COUNT, L3_LINE_SIZE, L3StatsCollector> {
+    L3CacheLine* validate(W64 addr) {
+      // select() needs an output argument for its second parameter; the value is not used here.
+      W64 unused_oldtag;
+      return select(addr, unused_oldtag);
+    }
+  };
+#endif
+
+  static inline void prep_sframask_and_reqmask(const SFR* sfr, W64 addr, int sizeshift, bitvec<L1_LINE_SIZE>& sframask, bitvec<L1_LINE_SIZE>& reqmask) {
+    sframask = (sfr) ? (bitvec<L1_LINE_SIZE>(sfr->bytemask) << 8*lowbits(sfr->physaddr, log2(L1_LINE_SIZE)-3)) : 0;
+    reqmask = bitvec<L1_LINE_SIZE>(bitmask(1 << sizeshift)) << lowbits(addr, log2(L1_LINE_SIZE));
+  }
+
+  static inline void prep_L2_sframask_and_reqmask(const SFR* sfr, W64 addr, int sizeshift, bitvec<L2_LINE_SIZE>& sframask, bitvec<L2_LINE_SIZE>& reqmask) {
+    sframask = (sfr) ? (bitvec<L2_LINE_SIZE>(sfr->bytemask) << 8*lowbits(sfr->physaddr, log2(L2_LINE_SIZE)-3)) : 0;
+    reqmask = bitvec<L2_LINE_SIZE>(bitmask(1 << sizeshift)) << lowbits(addr, log2(L2_LINE_SIZE));
+  }
+
+  //
+  // TLB class with one-hot semantics. Virtual addresses are 48 bits, so the
+  // virtual page ID is 48 - 12 (2^12 bytes per page) = 36 bits wide; tags
+  // are 40 bits: the 36-bit page ID plus a 4-bit thread ID.
+  //
+  template <int tlbid, int setcount, int waycount>
+  struct TranslationLookasideBuffer {
+    typedef FullyAssociativeTagsNbitOneHot<waycount, 40> Set;
+    Set sets[setcount];
+
+    TranslationLookasideBuffer() { reset(); }
+
+    void reset() { foreach (set, setcount) sets[set].reset(); }
+
+    // Get the 40-bit TLB tag (36 bit virtual page ID plus 4 bit threadid)
+    static W64 tagof(W64 addr, W64 threadid) {
+      return bits(addr, 12, 36) | (threadid << 36);
+    }
+    static int setof(W64 addr) { return lowbits(addr >> 12, log2(setcount)); } // index by page number so insert(), probe() and flush_virt() agree on the set
+
+    bool probe(W64 addr, int threadid = 0) { // true if the page containing addr is present for this thread
+      W64 tag = tagof(addr, threadid);
+      return (sets[setof(addr)].probe(tag) >= 0);
+    }
+
+    bool insert(W64 addr, int threadid = 0) { // returns true unless the tag reported by select() equals the new tag
+      addr = floor(addr, PAGE_SIZE);
+      W64 tag = tagof(addr, threadid);
+      W64 set = setof(addr);
+      W64 oldtag;
+      int way = sets[set].select(tag, oldtag);
+      W64 oldaddr = lowbits(oldtag, 36) << 12; // NOTE(review): computed but never used
+      if (logable(6)) {
+        logfile << "TLB insertion of virt page ", (void*)(Waddr)addr, " (virt addr ", 
+          (void*)(Waddr)(addr), ") into set ", set, " way ", way, ": ",
+          ((oldtag != tag) ? "evicted old entry" : "already present"), endl;
+      }
+      return (oldtag != tag);
+    }
+
+    int flush_all() { // drops everything; returns the TLB capacity, not the number of live entries
+      reset();
+      return setcount * waycount;
+    }
+
+    int flush_thread(W64 threadid) { // invalidate all entries of one thread; returns how many were removed
+      W64 tag = threadid << 36;
+      W64 tagmask = 0xfULL << 36;
+      int n = 0; // must start at zero: it accumulates the per-set match counts
+      foreach (set, setcount) {
+        bitvec<waycount> slotmask = sets[set].masked_match(tag, tagmask);
+        n += slotmask.popcount();
+        sets[set].masked_invalidate(slotmask);
+      }
+      return n;
+    }
+
+    int flush_virt(Waddr virtaddr, W64 threadid) { // invalidate the entry for one page of one thread
+      return sets[setof(virtaddr)].invalidate(tagof(virtaddr, threadid));
+    }
+
+    ostream& print(ostream& os) const {
+      os << "TLB<", setcount, " sets, ", waycount, " ways>:", endl;
+      foreach (set, setcount) {
+        os << "  Set ", set, ":", endl;
+        os << sets[set];
+      }
+      return os;
+    }
+  };
+
+  template <int tlbid, int setcount, int waycount>
+  static inline ostream& operator <<(ostream& os, const TranslationLookasideBuffer<tlbid, setcount, waycount>& tlb) {
+    return tlb.print(os);
+  }
+
+  typedef TranslationLookasideBuffer<0, 1, DTLB_SIZE> DTLB;
+  typedef TranslationLookasideBuffer<1, 1, ITLB_SIZE> ITLB;
+  typedef TranslationLookasideBuffer<0, L2_DTLB_SET_COUNT, L2_DTLB_WAY_COUNT> L2_DTLB;
+  typedef TranslationLookasideBuffer<1, L2_ITLB_SET_COUNT, L2_ITLB_WAY_COUNT> L2_ITLB;
+
+  struct CacheHierarchy;
+
+  //
+  // Load fill request queue (LFRQ) contains any requests for outstanding
+  // loads from both the L2 or L1. 
+  //
+  struct LoadFillReq { // one outstanding load waiting for its line to be filled
+    W64 addr;       // physical address
+    W64 virtaddr;   // virtual address for virtually indexed caches
+    W64 data;       // data already known so far (e.g. from SFR)
+    LoadStoreInfo lsi; // descriptor of the requesting load (rob, threadid, sizeshift, signext, ...)
+    W32  initcycle; // sim_cycle (truncated to W32) at construction time
+    byte mask;      // printed as an 8-bit mask; presumably the bytes already known in 'data' -- confirm
+    byte fillL1:1, fillL2:1; // both set to 1 by the constructor below
+
+    inline LoadFillReq() { } // intentionally leaves all fields uninitialized
+  
+    LoadFillReq(W64 addr, W64 virtaddr, W64 data, byte mask, LoadStoreInfo lsi) {
+      this->addr = addr;
+      this->virtaddr = virtaddr;
+      this->data = data;
+      this->mask = mask;
+      this->lsi = lsi;
+      this->lsi.threadid = lsi.threadid; // NOTE(review): redundant, the line above already copied all of lsi
+      this->fillL1 = 1;
+      this->fillL2 = 1;
+      this->initcycle = sim_cycle;
+    }
+
+    ostream& print(ostream& os) const { // one-line human-readable dump of the request
+      os << " TH ", lsi.threadid, "  ", "0x", hexstring(data, 64), " @ ", (void*)(Waddr)addr, " -> rob ", lsi.rob;
+      os << ": shift ", lsi.sizeshift, ", signext ", lsi.signext, ", mask ", bitstring(mask, 8, true);
+      return os;
+    }
+  };
+
+  static inline ostream& operator <<(ostream& os, const LoadFillReq& req) { // stream inserter for LoadFillReq
+    return req.print(os); // all formatting lives in LoadFillReq::print()
+  }
+
+  template <int size> // size = number of LFRQ slots
+  struct LoadFillReqQueue {
+    CacheHierarchy& hierarchy;
+    bitvec<size> freemap;                    // Slot is free
+    bitvec<size> waiting;                    // Waiting for the line to arrive in the L1
+    bitvec<size> ready;                      // Wait to extract/signext and write into register
+    LoadFillReq reqs[size];
+
+    static const int SIZE = size;
+
+    LoadFillReqQueue(): hierarchy(*((CacheHierarchy*)null)) { reset(); } // NOTE(review): binds hierarchy to a dereferenced null pointer; an instance built this way must never touch hierarchy
+    LoadFillReqQueue(CacheHierarchy& hierarchy_): hierarchy(hierarchy_) { reset(); }
+
+    // Clear entries belonging to one thread
+    void reset(int threadid);
+
+    // Reset all threads
+    void reset() {
+      freemap.setall();
+      ready = 0;
+      waiting = 0;
+    }
+
+    void changestate(int idx, bitvec<size>& oldstate, bitvec<size>& newstate) { // moves slot idx from one state bitmap to another
+      oldstate[idx] = 0;
+      newstate[idx] = 1;
+    }
+
+    void free(int lfrqslot) { // waiting -> free; the ready bitmap is left untouched
+      changestate(lfrqslot, waiting, freemap);
+    }
+
+    bool full() const { // result of bitvec's operator! on freemap; presumably true when no slot is free
+      return (!freemap);
+    }
+
+    void annul(int lfrqslot);
+
+    void restart();
+
+    int add(const LoadFillReq& req); // per dcache.cpp: returns the slot index used, or -1 when the queue is full
+
+    void wakeup(W64 address, const bitvec<LFRQ_SIZE>& lfrqmask); // NOTE(review): mask width is LFRQ_SIZE, not the template parameter size
+
+    void clock();
+
+    LoadFillReq& operator [](int idx) { return reqs[idx]; } // unchecked slot access
+    const LoadFillReq& operator [](int idx) const { return reqs[idx]; }
+
+    ostream& print(ostream& os) const;
+  };
+
+  template <int size> // stream inserter for any LoadFillReqQueue instantiation
+  static inline ostream& operator <<(ostream& os, const LoadFillReqQueue<size>& lfrq) {
+    return lfrq.print(os); // all formatting lives in LoadFillReqQueue::print()
+  }
+
+  enum { STATE_IDLE, STATE_DELIVER_TO_L3, STATE_DELIVER_TO_L2, STATE_DELIVER_TO_L1 }; // miss buffer entry states, numbered 0..3
+  static const char* missbuf_state_names[] = {"idle", "mem->L3", "L3->L2", "L2->L1"}; // one label per state above, in the same order
+
+  template <int SIZE> // SIZE = number of miss buffer entries
+  struct MissBuffer {
+    struct Entry {
+      W64 addr;     // physical line address we are waiting for
+      W64 virtaddr; // virtual line address we are waiting for, for virtually indexed caches
+      W16 state; // holds one of the STATE_* values; reset() sets STATE_IDLE
+      W16 dcache:1, icache:1;    // L1I vs L1D
+      W32 cycles;
+      W16 rob; // to identify which thread.
+      W8 threadid;
+
+      bitvec<LFRQ_SIZE> lfrqmap;  // which LFRQ entries should this load wake up?
+      void reset() { // returns the entry to its idle state with all-ones sentinels in addr, virtaddr, rob and threadid
+        lfrqmap = 0;
+        addr = 0xffffffffffffffffULL;
+        virtaddr = 0xffffffffffffffffULL;
+        state = STATE_IDLE;
+        cycles = 0;
+        icache = 0;
+        dcache = 0;
+        rob = 0xffff;
+        threadid = 0xff;
+      }
+    };
+
+    MissBuffer(): hierarchy(*((CacheHierarchy*)null)) { reset(); } // NOTE(review): binds hierarchy to a dereferenced null pointer; an instance built this way must never touch hierarchy
+    MissBuffer(CacheHierarchy& hierarchy_): hierarchy(hierarchy_) { reset(); }
+
+    CacheHierarchy& hierarchy;
+    Entry missbufs[SIZE];
+    bitvec<SIZE> freemap;
+    
+    void reset();
+    void reset(int threadid);
+    void restart();
+    bool full() const { return (!freemap); } // result of bitvec's operator! on freemap; presumably true when no entry is free
+    int find(W64 addr);
+    int initiate_miss(W64 addr, W64 virtaddr, bool hit_in_L2, bool icache = 0, int rob = 0xffff, int threadid = 0xfe); // default threadid 0xfe differs from the 0xff sentinel used by Entry::reset()
+    int initiate_miss(const LoadFillReq& req, bool hit_in_L2, int rob = 0xffff);
+    void annul_lfrq(int slot);
+    void annul_lfrq(int slot, int threadid);
+    void clock();
+
+    ostream& print(ostream& os) const;
+  };
+
+  template <int size> // stream inserter for any MissBuffer instantiation
+  static inline ostream& operator <<(ostream& os, const MissBuffer<size>& missbuf) {
+    return missbuf.print(os); // all formatting lives in MissBuffer::print()
+  }
+
+  struct PerCoreCacheCallbacks { // callback interface held by CacheHierarchy::callback; declares no virtual destructor
+    virtual void dcache_wakeup(LoadStoreInfo lsi, W64 physaddr); // called from the LFRQ code in dcache.cpp with req.lsi and req.addr
+    virtual void icache_wakeup(LoadStoreInfo lsi, W64 physaddr); // instruction-side counterpart — presumably called when an icache miss completes; confirm in dcache.cpp
+  };
+
+  struct CacheHierarchy { // groups the LFRQ, miss buffer, L1D/L1I/L2 (optionally L3) caches and the TLBs for one coreid
+    LoadFillReqQueue<LFRQ_SIZE> lfrq;
+    MissBuffer<MISSBUF_COUNT> missbuf;
+    L1Cache L1;
+    L1ICache L1I;
+    L2Cache L2;
+#ifdef ENABLE_L3_CACHE
+    L3Cache L3;
+#endif
+    DTLB dtlb;
+    ITLB itlb;
+#ifdef USE_L2_TLB
+    L2_DTLB l2dtlb;
+    L2_ITLB l2itlb;
+#endif
+
+    byte coreid;
+    static CacheHierarchy* hierarchies[]; // table indexed by coreid; each constructor stores `this` into it
+
+    PerCoreCacheCallbacks* callback; // starts out null
+
+
+    CacheHierarchy(int coreid_ = 0): lfrq(*this), missbuf(*this), coreid(coreid_)
+    { callback = null; CacheHierarchy::hierarchies[coreid_] = this;} // NOTE(review): coreid_ is not range-checked before indexing hierarchies[]
+
+    bool probe_cache_and_sfr(W64 physaddr, W64 virtaddr, const SFR* sfra, int sizeshift);
+    bool probe_cache_and_sfr(W64 physaddr, const SFR* sfra, int sizeshift);
+    bool covered_by_sfr(W64 addr, SFR* sfr, int sizeshift);
+    void annul_lfrq_slot(int lfrqslot);
+    int issueload_slowpath(Waddr physaddr, SFR& sfra, LoadStoreInfo lsi);
+    int issueload_slowpath(Waddr physaddr, Waddr virtaddr, SFR& sfra, LoadStoreInfo lsi);
+    bool lfrq_or_missbuf_full() const { return lfrq.full() | missbuf.full(); } // bitwise |, so both full() calls are always evaluated
+    bool probe_other_caches(W64 addr);
+    void invalidate_other_caches(W64 addr, W64 virtaddr);
+
+    W64 commitstore(const SFR& sfr, W64 virtaddr, bool internal = false, int threadid = 0xff, bool perform_actual_write = true);
+    W64 speculative_store(const SFR& sfr, W64 virtaddr, int threadid = 0xff);
+
+    void initiate_prefetch(W64 physaddr, W64 virtaddr, int cachelevel, bool invalidating = false);
+
+    bool probe_icache(Waddr virtaddr, Waddr physaddr);
+    int initiate_icache_miss(W64 addr, int rob = 0xffff, int threadid = 0xff);
+
+    void reset();
+    void clock();
+    void complete();
+    void complete(int threadid);
+    ostream& print(ostream& os);
+  };
+#endif // STATS_ONLY
+};
+
+struct PerContextDataCacheStats { // rootnode:
+  struct load {
+    struct hit { // node: summable
+      W64 L1;
+      W64 L2;
+      W64 L3;
+      W64 mem;
+    } hit;
+        
+    struct dtlb { // node: summable
+      W64 l1hits;
+      W64 l2hits;
+      W64 misses;
+    } dtlb;
+
+    struct tlbwalk { // node: summable
+      W64 L1_dcache_hit;
+      W64 L1_dcache_miss;
+      W64 no_lfrq_mb;
+    } tlbwalk;
+  } load;
+ 
+  struct fetch {
+    struct hit { // node: summable
+      W64 L1;
+      W64 L2;
+      W64 L3;
+      W64 mem;
+    } hit;
+    
+    struct itlb { // node: summable
+      W64 hits;
+      W64 misses;
+    } itlb;
+
+    struct tlbwalk { // node: summable
+      W64 L1_dcache_hit;
+      W64 L1_dcache_miss;
+      W64 no_lfrq_mb;      
+    } tlbwalk;
+  } fetch;
+  
+  struct store {
+    W64 prefetches;
+  } store;
+};
+
+struct DataCacheStats { // rootnode:
+  struct load {
+    struct transfer { // node: summable
+      W64 L2_to_L1_full;
+      W64 L2_to_L1_partial;
+      W64 L2_L1I_full;
+    } transfer;
+  } load;
+
+  struct missbuf {
+    W64 inserts;
+    struct deliver { // node: summable
+      W64 mem_to_L3;
+      W64 L3_to_L2;
+      W64 L2_to_L1D;
+      W64 L2_to_L1I;
+    } deliver;
+  } missbuf;
+
+  struct prefetch { // node: summable
+    W64 in_L1;
+    W64 in_L2;
+    W64 required;
+  } prefetch;
+
+  struct lfrq {
+    W64 inserts;
+    W64 wakeups;
+    W64 annuls;
+    W64 resets;
+    W64 total_latency;
+    double average_latency;
+    W64 width[CacheSubsystem::MAX_WAKEUPS_PER_CYCLE+1]; // histo: 0, CacheSubsystem::MAX_WAKEUPS_PER_CYCLE+1, 1
+  } lfrq;
+
+  PerContextDataCacheStats total;
+  PerContextDataCacheStats vcpu0;
+  PerContextDataCacheStats vcpu1;
+  PerContextDataCacheStats vcpu2;
+  PerContextDataCacheStats vcpu3;
+  PerContextDataCacheStats vcpu4;
+  PerContextDataCacheStats vcpu5;
+  PerContextDataCacheStats vcpu6;
+  PerContextDataCacheStats vcpu7;
+
+};
diff -r 10448c053ad6 dcache-amd-k8.h
--- a/dcache-amd-k8.h	Thu May 31 15:36:20 2007 +0200
+++ b/dcache-amd-k8.h	Wed Nov 05 14:15:51 2008 +0100
@@ -5,9 +5,6 @@
 //
 // Copyright 2000-2006 Matt T. Yourst <yourst@yourst.com>
 //
-
-#ifndef _DCACHE_H_
-#define _DCACHE_H_
 
 #include <ptlsim.h>
 //#include <datastore.h>
@@ -727,5 +724,3 @@
   PerContextDataCacheStats vcpu2;
   PerContextDataCacheStats vcpu3;
 };
-
-#endif // _DCACHE_H_
diff -r 10448c053ad6 dcache-generic.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/dcache-generic.h	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,764 @@
+// -*- c++ -*-
+//
+// PTLsim: Cycle Accurate x86-64 Simulator
+// Data Cache
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+// Copyright 2000-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
+//
+
+/**
+ * Enables cross-core cache invalidation and cross-core cache forwarding.
+ * Only has an effect with a proper SMP model, i.e. you have to _disable_
+ * ENABLE_SMT.
+ */
+#define POOR_MANS_MESI
+#include <ptlsim.h>
+//#include <datastore.h>
+#define MAX_HIERARCHIES 8
+struct LoadStoreInfo { // compact per-access tag passed along with loads/stores (e.g. into issueload_slowpath and the wakeup callbacks)
+  W16 rob; // printed as "rob" by LoadFillReq::print
+  W8  threadid;
+  W8  sizeshift:2, aligntype:2, sfrused:1, internal:1, signext:1, pad1:1; // eight bits of packed fields in total
+  W32 pad32; // NOTE(review): presumably pads the struct to 64 bits for the W64 accessors below — confirm
+  RawDataAccessors(LoadStoreInfo, W64); // project macro; assumed to expose the struct as a raw W64 — confirm
+};
+
+#define per_context_dcache_stats_ref(vcpuid) (*(((PerContextDataCacheStats*)&stats.dcache.vcpu0) + (vcpuid))) // indexes from vcpu0 as if it were an array; relies on the vcpuN members being laid out back to back
+#define per_context_dcache_stats_update(vcpuid, expr) stats.dcache.total.expr, per_context_dcache_stats_ref(vcpuid).expr // applies expr to both the total and the per-vcpu counters (unparenthesized comma expression)
+
+namespace CacheSubsystem {
+  // How many load wakeups can be driven into the core each cycle:
+  const int MAX_WAKEUPS_PER_CYCLE = 2;
+
+#ifndef STATS_ONLY
+
+// non-debugging only:
+//#define __RELEASE__
+#ifdef __RELEASE__
+#undef assert
+#define assert(x) (x)
+#endif
+
+  //#define CACHE_ALWAYS_HITS
+  //#define L2_ALWAYS_HITS
+  
+  // 16 KB L1 at 2 cycles
+  const int L1_LINE_SIZE = 64;
+  const int L1_SET_COUNT = 64;
+  const int L1_WAY_COUNT = 4;
+  // #define ENFORCE_L1_DCACHE_BANK_CONFLICTS
+  const int L1_DCACHE_BANKS = 8; // 8 banks x 8 bytes/bank = 64 bytes/line
+
+  // 32 KB L1I
+  const int L1I_LINE_SIZE = 64;
+  const int L1I_SET_COUNT = 128;
+  const int L1I_WAY_COUNT = 4;
+
+  // 256 KB L2 at 6 cycles
+  const int L2_LINE_SIZE = 64;
+  const int L2_SET_COUNT = 256; // 256 KB
+  const int L2_WAY_COUNT = 16;
+  const int L2_LATENCY   = 6; // don't include the extra wakeup cycle (waiting->ready state transition) in the LFRQ
+
+#define ENABLE_L3_CACHE
+#ifdef ENABLE_L3_CACHE
+  // 2 MB L3 cache (1024 sets, 16 ways) with 128-byte lines, latency 16 cycles
+  const int L3_SET_COUNT = 1024;
+  const int L3_WAY_COUNT = 16;
+  const int L3_LINE_SIZE = 128;
+  const int L3_LATENCY   = 16;
+#endif
+  // Load Fill Request Queue (maximum number of missed loads)
+  const int LFRQ_SIZE = 63;
+
+  // Allow up to 16 outstanding lines in the L2 awaiting service:
+  const int MISSBUF_COUNT = 16;
+  const int MAIN_MEM_LATENCY = 100;
+
+  const int CROSS_CACHE_LATENCY = 50;
+  // TLBs
+#define USE_TLB
+  const int ITLB_SIZE = 32;
+  const int DTLB_SIZE = 32;
+
+//#define ISSUE_LOAD_STORE_DEBUG
+//#define CHECK_LOADS_AND_STORES
+
+// Line Usage Statistics
+
+//#define TRACK_LINE_USAGE
+
+#ifdef TRACK_LINE_USAGE
+#define DCACHE_L1_LINE_LIFETIME_INTERVAL   1
+#define DCACHE_L1_LINE_DEADTIME_INTERVAL   1
+#define DCACHE_L1_LINE_HITCOUNT_INTERVAL   1
+#define DCACHE_L1_LINE_LIFETIME_SLOTS      8192
+#define DCACHE_L1_LINE_DEADTIME_SLOTS      8192
+#define DCACHE_L1_LINE_HITCOUNT_SLOTS      64
+
+#define DCACHE_L1I_LINE_LIFETIME_INTERVAL  16
+#define DCACHE_L1I_LINE_DEADTIME_INTERVAL  16
+#define DCACHE_L1I_LINE_HITCOUNT_INTERVAL  1
+#define DCACHE_L1I_LINE_LIFETIME_SLOTS     8192
+#define DCACHE_L1I_LINE_DEADTIME_SLOTS     8192
+#define DCACHE_L1I_LINE_HITCOUNT_SLOTS     1024
+
+#define DCACHE_L2_LINE_LIFETIME_INTERVAL   4
+#define DCACHE_L2_LINE_DEADTIME_INTERVAL   4
+#define DCACHE_L2_LINE_HITCOUNT_INTERVAL   1
+#define DCACHE_L2_LINE_LIFETIME_SLOTS      65536
+#define DCACHE_L2_LINE_DEADTIME_SLOTS      65536
+#define DCACHE_L2_LINE_HITCOUNT_SLOTS      256
+
+#define DCACHE_L3_LINE_LIFETIME_INTERVAL   64
+#define DCACHE_L3_LINE_DEADTIME_INTERVAL   64
+#define DCACHE_L3_LINE_HITCOUNT_INTERVAL   1
+#define DCACHE_L3_LINE_LIFETIME_SLOTS      16384
+#define DCACHE_L3_LINE_DEADTIME_SLOTS      16384
+#define DCACHE_L3_LINE_HITCOUNT_SLOTS      256
+#endif
+
+  //
+  // Cache Line Types
+  //
+  template <int linesize> // linesize is only used for the type of fill()'s mask argument
+  struct CacheLine {
+#ifdef TRACK_LINE_USAGE
+    W32 filltime;
+    W32 lasttime;
+    W32 hitcount;
+#else
+    byte dummy; // sole data member when usage tracking is compiled out
+#endif
+    void reset() { clearstats(); }
+    void invalidate() { reset(); }
+    void fill(W64 tag, const bitvec<linesize>& valid) { } // no valid mask is stored in this line type, so fill() does nothing
+
+    void clearstats() { // restarts the usage counters; empty unless TRACK_LINE_USAGE is defined
+#ifdef TRACK_LINE_USAGE
+      filltime = sim_cycle;
+      lasttime = sim_cycle;
+      hitcount = 0;
+#endif
+    }
+
+    ostream& print(ostream& os, W64 tag) const; // declaration only
+  };
+
+  template <int linesize> // stream inserter for CacheLine
+  static inline ostream& operator <<(ostream& os, const CacheLine<linesize>& line) {
+    return line.print(os, 0); // always passes 0 as the tag
+  }
+
+  template <int linesize> // cache line that tracks validity with a linesize-bit mask
+  struct CacheLineWithValidMask {
+    bitvec<linesize> valid; // accumulated by fill(), cleared by reset()
+#ifdef TRACK_LINE_USAGE
+    W32 filltime;
+    W32 lasttime;
+    W32 hitcount;
+#endif
+
+    void clearstats() { // restarts the usage counters; empty unless TRACK_LINE_USAGE is defined
+#ifdef TRACK_LINE_USAGE
+      filltime = sim_cycle;
+      lasttime = sim_cycle;
+      hitcount = 0;
+#endif
+    }
+
+    void reset() { valid = 0; clearstats(); }
+    void invalidate() { reset(); }
+    void fill(W64 tag, const bitvec<linesize>& valid) { this->valid |= valid; } // ORs the new bits in; tag is ignored
+    ostream& print(ostream& os, W64 tag) const; // declaration only
+  };
+
+  template <int linesize> // stream inserter for CacheLineWithValidMask
+  static inline ostream& operator <<(ostream& os, const CacheLineWithValidMask<linesize>& line) {
+    return line.print(os, 0); // always passes 0 as the tag
+  }
+
+  typedef CacheLineWithValidMask<L1_LINE_SIZE> L1CacheLine; // L1D lines carry a valid mask
+  typedef CacheLine<L1I_LINE_SIZE> L1ICacheLine; // L1I lines carry no valid mask
+  typedef CacheLineWithValidMask<L2_LINE_SIZE> L2CacheLine; // L2 lines carry a valid mask
+#ifdef ENABLE_L3_CACHE
+  typedef CacheLine<L3_LINE_SIZE> L3CacheLine; // L3 lines carry no valid mask
+#endif
+
+  //
+  // L1 data cache
+  //
+#ifdef TRACK_LINE_USAGE
+  static const char* cache_names[4] = {"L1", "I1", "L2", "L3"};
+
+  template <int uniq, typename V, int LIFETIME_INTERVAL, int LIFETIME_SLOTS, int DEADTIME_INTERVAL, int DEADTIME_SLOTS, int HITCOUNT_INTERVAL, int HITCOUNT_SLOTS> // uniq indexes cache_names[]; V is the line type
+  struct HistogramAssociativeArrayStatisticsCollector {
+    static W64 line_lifetime_histogram[LIFETIME_SLOTS]; // declared here only; no out-of-class definition is visible in this chunk
+    static W64 line_deadtime_histogram[DEADTIME_SLOTS];
+    static W64 line_hitcount_histogram[HITCOUNT_SLOTS];
+
+    static const bool FORCE_DEBUG = 0; // set to 1 to log regardless of logable(6)
+
+    HistogramAssociativeArrayStatisticsCollector() {
+      reset(); // note: constructing any instance zeroes the shared static histograms
+    }
+
+    static void reset() {
+      setzero(line_lifetime_histogram);
+      setzero(line_deadtime_histogram);
+      setzero(line_hitcount_histogram);
+    }
+
+    static void evicted(const V& line, W64 tag) {
+      // Line has been evicted: update statistics
+      W64s lifetime = line.lasttime - line.filltime; // both operands are W32 fields of the line
+      assert(lifetime >= 0);
+      int lifetimeslot = clipto(lifetime / LIFETIME_INTERVAL, 0, LIFETIME_SLOTS-1);
+      line_lifetime_histogram[lifetimeslot]++;
+
+      W64s deadtime = sim_cycle - line.lasttime; // time since the last recorded hit
+      int deadtimeslot = clipto(deadtime / DEADTIME_INTERVAL, 0, DEADTIME_SLOTS-1);
+      line_deadtime_histogram[deadtimeslot]++;
+
+      W64 hitcount = line.hitcount;
+      int hitcountslot = clipto(hitcount / HITCOUNT_INTERVAL, 0, HITCOUNT_SLOTS-1);
+      line_hitcount_histogram[hitcountslot]++;
+
+      if (logable(6) | FORCE_DEBUG) logfile << "[", cache_names[uniq], "] ", sim_cycle, ": evicted(", (void*)tag, "): lifetime ", lifetime, ", deadtime ", deadtime, ", hitcount ", hitcount, " (line addr ", &line, ")", endl;
+    }
+
+    static void filled(V& line, W64 tag) { // stamps the line with the current cycle and starts hitcount at 1
+      line.filltime = sim_cycle;
+      line.lasttime = sim_cycle;
+      line.hitcount = 1;
+
+      if (logable(6) | FORCE_DEBUG) logfile << "[", cache_names[uniq], "] ", sim_cycle, ": filled(", (void*)tag, ")", " (line addr ", &line, ")", endl;
+    }
+
+    static void inserted(V& line, W64 newtag, int way) { // way is unused
+      filled(line, newtag);
+    }
+
+    static void replaced(V& line, W64 oldtag, W64 newtag, int way) { // accounts for the old line before restarting the counters for the new one
+      evicted(line, oldtag);
+      filled(line, newtag);
+    }
+
+    static void probed(V& line, W64 tag, int way, bool hit) { // logs the probe, then updates hitcount/lasttime on a hit
+      if (logable(6) | FORCE_DEBUG) logfile << "[", cache_names[uniq], "] ", sim_cycle, ": probe(", (void*)tag, "): ", (hit ? "HIT" : "miss"), " way ", way, ": hitcount ", line.hitcount, ", filltime ", line.filltime, ", lasttime ", line.lasttime, " (line addr ", &line, ")", endl;
+      if (hit) {
+        line.hitcount++;
+        line.lasttime = sim_cycle;
+      }
+    }
+
+    static void overflow(W64 tag) { } // no statistics are kept for this event
+
+    static void locked(V& slot, W64 tag, int way) { } // no statistics are kept for this event
+    static void unlocked(V& slot, W64 tag, int way) { } // no statistics are kept for this event
+
+    static void invalidated(V& line, W64 oldtag, int way) { evicted(line, oldtag); } // an invalidation is counted like an eviction
+
+    static void savestats(DataStoreNode& ds) { // exports the three histograms under the names "lifetime", "deadtime" and "hitcount"
+      ds.add("lifetime", (W64s*)line_lifetime_histogram, LIFETIME_SLOTS, 0, ((LIFETIME_SLOTS-1) * LIFETIME_INTERVAL), LIFETIME_INTERVAL);
+      ds.add("deadtime", (W64s*)line_deadtime_histogram, DEADTIME_SLOTS, 0, ((DEADTIME_SLOTS-1) * DEADTIME_INTERVAL), DEADTIME_INTERVAL);
+      ds.add("hitcount", (W64s*)line_hitcount_histogram, HITCOUNT_SLOTS, 0, ((HITCOUNT_SLOTS-1) * HITCOUNT_INTERVAL), HITCOUNT_INTERVAL);
+    }
+  };
+
+  typedef HistogramAssociativeArrayStatisticsCollector<0, L1CacheLine,
+    DCACHE_L1_LINE_LIFETIME_INTERVAL, DCACHE_L1_LINE_LIFETIME_SLOTS, 
+    DCACHE_L1_LINE_DEADTIME_INTERVAL, DCACHE_L1_LINE_DEADTIME_SLOTS, 
+    DCACHE_L1_LINE_HITCOUNT_INTERVAL, DCACHE_L1_LINE_HITCOUNT_SLOTS> L1StatsCollectorBase;
+
+  typedef HistogramAssociativeArrayStatisticsCollector<1, L1ICacheLine,
+    DCACHE_L1I_LINE_LIFETIME_INTERVAL, DCACHE_L1I_LINE_LIFETIME_SLOTS, 
+    DCACHE_L1I_LINE_DEADTIME_INTERVAL, DCACHE_L1I_LINE_DEADTIME_SLOTS, 
+    DCACHE_L1I_LINE_HITCOUNT_INTERVAL, DCACHE_L1I_LINE_HITCOUNT_SLOTS> L1IStatsCollectorBase;
+
+  typedef HistogramAssociativeArrayStatisticsCollector<2, L2CacheLine,
+    DCACHE_L2_LINE_LIFETIME_INTERVAL, DCACHE_L2_LINE_LIFETIME_SLOTS, 
+    DCACHE_L2_LINE_DEADTIME_INTERVAL, DCACHE_L2_LINE_DEADTIME_SLOTS, 
+    DCACHE_L2_LINE_HITCOUNT_INTERVAL, DCACHE_L2_LINE_HITCOUNT_SLOTS> L2StatsCollectorBase;
+
+#ifdef ENABLE_L3_CACHE
+  typedef HistogramAssociativeArrayStatisticsCollector<3, L3CacheLine,
+    DCACHE_L3_LINE_LIFETIME_INTERVAL, DCACHE_L3_LINE_LIFETIME_SLOTS, 
+    DCACHE_L3_LINE_DEADTIME_INTERVAL, DCACHE_L3_LINE_DEADTIME_SLOTS, 
+    DCACHE_L3_LINE_HITCOUNT_INTERVAL, DCACHE_L3_LINE_HITCOUNT_SLOTS> L3StatsCollectorBase;
+#endif
+
+  struct L1StatsCollector: public L1StatsCollectorBase { };
+  struct L1IStatsCollector: public L1IStatsCollectorBase { };
+  struct L2StatsCollector: public L2StatsCollectorBase { };
+#ifdef ENABLE_L3_CACHE
+  struct L3StatsCollector: public L3StatsCollectorBase { };
+#endif
+
+#else
+  typedef NullAssociativeArrayStatisticsCollector<W64, L1CacheLine> L1StatsCollector;
+  typedef NullAssociativeArrayStatisticsCollector<W64, L1ICacheLine> L1IStatsCollector;
+  typedef NullAssociativeArrayStatisticsCollector<W64, L2CacheLine> L2StatsCollector;
+#ifdef ENABLE_L3_CACHE
+  typedef NullAssociativeArrayStatisticsCollector<W64, L3CacheLine> L3StatsCollector;
+#endif
+#endif
+
+  template <typename V, int setcount, int waycount, int linesize, typename stats = NullAssociativeArrayStatisticsCollector<W64, V> > // set-associative cache of V lines keyed by W64 tags
+  struct DataCache: public AssociativeArray<W64, V, setcount, waycount, linesize, stats> {
+    typedef AssociativeArray<W64, V, setcount, waycount, linesize, stats> base_t;
+    void clearstats() { // resets the usage counters of every line; a no-op unless TRACK_LINE_USAGE is defined
+#ifdef TRACK_LINE_USAGE
+      foreach (set, setcount) { // iterate over this cache's own set count so every instantiation is fully covered
+        foreach (way, waycount) {
+          base_t::sets[set][way].clearstats();
+        }
+      }
+#endif
+    }
+  };
+
+  struct L1Cache: public DataCache<L1CacheLine, L1_SET_COUNT, L1_WAY_COUNT, L1_LINE_SIZE, L1StatsCollector> {
+    L1CacheLine* validate(W64 addr, const bitvec<L1_LINE_SIZE>& valid) { // ORs `valid` into the line selected for addr and returns that line
+      addr = tagof(addr); // from here on addr holds the tag
+      L1CacheLine* line = select(addr); // NOTE(review): dereferenced below without a null check, unlike L2Cache::validate — confirm select() never returns null
+      line->fill(addr, valid);
+      return line;
+    }
+  };
+
+  static inline ostream& operator <<(ostream& os, const L1Cache& cache) { // stub inserter: writes nothing for an L1Cache
+    return os;
+  }
+
+  //
+  // L1 instruction cache
+  //
+
+  struct L1ICache: public DataCache<L1ICacheLine, L1I_SET_COUNT, L1I_WAY_COUNT, L1I_LINE_SIZE, L1IStatsCollector> {
+    L1ICacheLine* validate(W64 addr, const bitvec<L1I_LINE_SIZE>& valid) { // selects the line for addr and returns it
+      addr = tagof(addr); // from here on addr holds the tag
+      L1ICacheLine* line = select(addr); // NOTE(review): dereferenced below without a null check — confirm select() never returns null
+      line->fill(addr, valid); // CacheLine::fill() is empty, so `valid` has no effect here
+      return line;
+    }
+  };
+
+  static inline ostream& operator <<(ostream& os, const L1ICache& cache) { // stub inserter: writes nothing for an L1ICache
+    return os;
+  }
+
+  //
+  // L2 cache
+  //
+
+  typedef DataCache<L2CacheLine, L2_SET_COUNT, L2_WAY_COUNT, L2_LINE_SIZE, L2StatsCollector> L2CacheBase;
+
+  struct L2Cache: public L2CacheBase {
+    void validate(W64 addr) { // marks the whole line selected for addr as valid
+      L2CacheLine* line = select(addr); // addr is passed as-is; tagof() is not applied here, unlike L1Cache::validate
+      if (!line) return; // nothing to mark when select() yields no line
+      line->valid.setall();
+    }
+
+    void deliver(W64 address); // declaration only
+  };
+
+  //
+  // L3 cache
+  //
+#ifdef ENABLE_L3_CACHE
+  static inline ostream& operator <<(ostream& os, const L3CacheLine& line) { // non-template inserter for the L3 line type
+    return line.print(os, 0); // always passes 0 as the tag
+  }
+
+  struct L3Cache: public DataCache<L3CacheLine, L3_SET_COUNT, L3_WAY_COUNT, L3_LINE_SIZE, L3StatsCollector> {
+    L3CacheLine* validate(W64 addr) { // returns the line select() picks for addr
+      W64 oldaddr; // receives select()'s second argument and is then discarded
+      L3CacheLine* line = select(addr, oldaddr);
+      return line;
+    }
+  };
+#endif
+
+  static inline void prep_sframask_and_reqmask(const SFR* sfr, W64 addr, int sizeshift, bitvec<L1_LINE_SIZE>& sframask, bitvec<L1_LINE_SIZE>& reqmask) { // builds two masks over an L1 line: one from the SFR's bytemask, one for the requested access
+    sframask = (sfr) ? (bitvec<L1_LINE_SIZE>(sfr->bytemask) << 8*lowbits(sfr->physaddr, log2(L1_LINE_SIZE)-3)) : 0; // empty mask when sfr is null; NOTE(review): the shift assumes sfr->physaddr counts 8-byte units — confirm
+    reqmask = bitvec<L1_LINE_SIZE>(bitmask(1 << sizeshift)) << lowbits(addr, log2(L1_LINE_SIZE)); // bitmask(1 << sizeshift) shifted by addr's offset within the line
+  }
+
+  static inline void prep_L2_sframask_and_reqmask(const SFR* sfr, W64 addr, int sizeshift, bitvec<L2_LINE_SIZE>& sframask, bitvec<L2_LINE_SIZE>& reqmask) { // same computation as the L1 variant, sized for an L2 line
+    sframask = (sfr) ? (bitvec<L2_LINE_SIZE>(sfr->bytemask) << 8*lowbits(sfr->physaddr, log2(L2_LINE_SIZE)-3)) : 0; // empty mask when sfr is null; NOTE(review): the shift assumes sfr->physaddr counts 8-byte units — confirm
+    reqmask = bitvec<L2_LINE_SIZE>(bitmask(1 << sizeshift)) << lowbits(addr, log2(L2_LINE_SIZE)); // bitmask(1 << sizeshift) shifted by addr's offset within the line
+  }
+
+  //
+  // TLB class with one-hot semantics. The virtual page number needs 36
+  // bits (48-bit virtual addresses minus 12 bits of page offset); a
+  // 4-bit thread ID is placed above it, giving the 40-bit tags used here.
+  //
+  template <int tlbid, int size> // fully associative TLB with `size` entries; tlbid is not used inside the struct
+  struct TranslationLookasideBuffer: public FullyAssociativeTagsNbitOneHot<size, 40> {
+    typedef FullyAssociativeTagsNbitOneHot<size, 40> base_t;
+    TranslationLookasideBuffer(): base_t() { }
+
+    void reset() {
+      base_t::reset();
+    }
+
+    // Get the 40-bit TLB tag (36 bit virtual page ID plus 4 bit threadid)
+    static W64 tagof(W64 addr, W64 threadid) {
+      return bits(addr, 12, 36) | (threadid << 36); // threadid is not masked here; callers are expected to pass a value that fits in 4 bits
+    }
+
+    bool probe(W64 addr, int threadid = 0) { // true when the (page, threadid) tag is present
+      W64 tag = tagof(addr, threadid);
+      return (base_t::probe(tag) >= 0); // a negative result from the base probe means "not found"
+    }
+
+    bool insert(W64 addr, int threadid = 0) { // returns true when the chosen way previously held a different tag
+      addr = floor(addr, PAGE_SIZE); // work on the page-aligned address
+      W64 tag = tagof(addr, threadid);
+      W64 oldtag;
+      int way = base_t::select(tag, oldtag); // presumably select() reports the way's previous tag through oldtag — confirm
+      W64 oldaddr = lowbits(oldtag, 36) << 12; // address form of the previous tag; currently not used below
+      if (logable(6)) {
+        logfile << "TLB insertion of virt page ", (void*)(Waddr)addr, " (virt addr ", 
+          (void*)(Waddr)(addr), ") into way ", way, ": ",
+          ((oldtag != tag) ? "evicted old entry" : "already present"), endl;
+      }
+      return (oldtag != tag);
+    }
+
+    int flush_all() { // clears the whole TLB and reports `size` entries as flushed
+      reset();
+      return size;
+    }
+
+    int flush_thread(W64 threadid) { // invalidates every slot whose thread-ID bits (36..39) match; returns how many matched
+      W64 tag = threadid << 36;
+      W64 tagmask = 0xfULL << 36;
+      bitvec<size> slotmask = base_t::masked_match(tag, tagmask);
+      int n = slotmask.popcount();
+      base_t::masked_invalidate(slotmask);
+      return n;
+    }
+
+    int flush_virt(Waddr virtaddr, W64 threadid) { // invalidates the single (page, threadid) entry
+      return base_t::invalidate(tagof(virtaddr, threadid)); // qualified like the other base calls: members of a dependent base are not found by unqualified lookup
+    }
+  };
+
+  template <int tlbid, int size> // stream inserter for any TLB instantiation
+  static inline ostream& operator <<(ostream& os, const TranslationLookasideBuffer<tlbid, size>& tlb) {
+    return tlb.print(os); // the TLB struct defines no print() itself, so this presumably resolves to the base class's — confirm
+  }
+
+  typedef TranslationLookasideBuffer<0, DTLB_SIZE> DTLB; // data TLB with DTLB_SIZE entries
+  typedef TranslationLookasideBuffer<1, ITLB_SIZE> ITLB; // instruction TLB with ITLB_SIZE entries
+
+  struct CacheHierarchy;
+
+  //
+  // The load fill request queue (LFRQ) holds the requests for outstanding
+  // loads from either the L2 or the L1.
+  //
+  struct LoadFillReq { // one pending load as recorded in the LFRQ
+    W64 addr;       // physical address
+    W64 data;       // data already known so far (e.g. from SFR)
+    LoadStoreInfo lsi;
+    W32  initcycle; // sim_cycle at construction time, stored in 32 bits
+    byte mask;
+    byte fillL1:1, fillL2:1; // both are set to 1 by the full constructor below
+
+    inline LoadFillReq() { } // leaves every field uninitialized
+  
+    LoadFillReq(W64 addr, W64 data, byte mask, LoadStoreInfo lsi) {
+      this->addr = addr;
+      this->data = data;
+      this->mask = mask;
+      this->lsi = lsi;
+      this->lsi.threadid = lsi.threadid; // redundant: threadid was already copied by the struct assignment above
+      this->fillL1 = 1;
+      this->fillL2 = 1;
+      this->initcycle = sim_cycle;
+    }
+
+    ostream& print(ostream& os) const { // one-line summary: thread, known data, physical address, ROB index, then shift/signext/mask
+      os << " TH ", lsi.threadid, "  ", "0x", hexstring(data, 64), " @ ", (void*)(Waddr)addr, " -> rob ", lsi.rob;
+      os << ": shift ", lsi.sizeshift, ", signext ", lsi.signext, ", mask ", bitstring(mask, 8, true);
+      return os;
+    }
+  };
+
+  static inline ostream& operator <<(ostream& os, const LoadFillReq& req) { // stream inserter for LoadFillReq
+    return req.print(os); // all formatting lives in LoadFillReq::print()
+  }
+
+  template <int size> // size = number of LFRQ slots
+  struct LoadFillReqQueue {
+    CacheHierarchy& hierarchy;
+    bitvec<size> freemap;                    // Slot is free
+    bitvec<size> waiting;                    // Waiting for the line to arrive in the L1
+    bitvec<size> ready;                      // Wait to extract/signext and write into register
+    LoadFillReq reqs[size];
+
+    static const int SIZE = size;
+
+    LoadFillReqQueue(): hierarchy(*((CacheHierarchy*)null)) { reset(); } // NOTE(review): binds hierarchy to a dereferenced null pointer; an instance built this way must never touch hierarchy
+    LoadFillReqQueue(CacheHierarchy& hierarchy_): hierarchy(hierarchy_) { reset(); }
+
+    // Clear entries belonging to one thread
+    void reset(int threadid);
+
+    // Reset all threads
+    void reset() {
+      freemap.setall();
+      ready = 0;
+      waiting = 0;
+    }
+
+    void changestate(int idx, bitvec<size>& oldstate, bitvec<size>& newstate) { // moves slot idx from one state bitmap to another
+      oldstate[idx] = 0;
+      newstate[idx] = 1;
+    }
+
+    void free(int lfrqslot) { // waiting -> free; the ready bitmap is left untouched
+      changestate(lfrqslot, waiting, freemap);
+    }
+
+    bool full() const { // result of bitvec's operator! on freemap; presumably true when no slot is free
+      return (!freemap);
+    }
+
+    void annul(int lfrqslot);
+
+    void restart();
+
+    int add(const LoadFillReq& req); // per dcache.cpp: returns the slot index used, or -1 when the queue is full
+
+    void wakeup(W64 address, const bitvec<LFRQ_SIZE>& lfrqmask); // NOTE(review): mask width is LFRQ_SIZE, not the template parameter size
+
+    void clock();
+
+    LoadFillReq& operator [](int idx) { return reqs[idx]; } // unchecked slot access
+    const LoadFillReq& operator [](int idx) const { return reqs[idx]; }
+
+    ostream& print(ostream& os) const;
+  };
+
+  template <int size> // stream inserter for any LoadFillReqQueue instantiation
+  static inline ostream& operator <<(ostream& os, const LoadFillReqQueue<size>& lfrq) {
+    return lfrq.print(os); // all formatting lives in LoadFillReqQueue::print()
+  }
+
+  enum { STATE_IDLE, STATE_DELIVER_TO_L3, STATE_DELIVER_TO_L2, STATE_DELIVER_TO_L1 }; // miss buffer entry states, numbered 0..3
+  static const char* missbuf_state_names[] = {"idle", "mem->L3", "L3->L2", "L2->L1"}; // one label per state above, in the same order
+
+  template <int SIZE> // SIZE = number of miss buffer entries
+  struct MissBuffer {
+    struct Entry {
+      W64 addr;     // physical line address we are waiting for
+      W16 state; // holds one of the STATE_* values; reset() sets STATE_IDLE
+      W16 dcache:1, icache:1;    // L1I vs L1D
+      W32 cycles;
+      W16 rob; // to identify which thread.
+      W8 threadid;
+
+      bitvec<LFRQ_SIZE> lfrqmap;  // which LFRQ entries should this load wake up?
+      void reset() { // returns the entry to its idle state with all-ones sentinels in addr, rob and threadid
+        lfrqmap = 0;
+        addr = 0xffffffffffffffffULL;
+        state = STATE_IDLE;
+        cycles = 0;
+        icache = 0;
+        dcache = 0;
+        rob = 0xffff;
+        threadid = 0xff;
+      }
+    };
+
+    MissBuffer(): hierarchy(*((CacheHierarchy*)null)) { reset(); } // NOTE(review): binds hierarchy to a dereferenced null pointer; an instance built this way must never touch hierarchy
+    MissBuffer(CacheHierarchy& hierarchy_): hierarchy(hierarchy_) { reset(); }
+
+    CacheHierarchy& hierarchy;
+    Entry missbufs[SIZE];
+    bitvec<SIZE> freemap;
+    
+    void reset();
+    void reset(int threadid);
+    void restart();
+    bool full() const { return (!freemap); } // result of bitvec's operator! on freemap; presumably true when no entry is free
+    int find(W64 addr);
+    int initiate_miss(W64 addr, bool hit_in_L2, bool icache = 0, int rob = 0xffff, int threadid = 0xfe); // default threadid 0xfe differs from the 0xff sentinel used by Entry::reset()
+    int initiate_miss(const LoadFillReq& req, bool hit_in_L2, int rob = 0xffff);
+    void annul_lfrq(int slot);
+    void annul_lfrq(int slot, int threadid);
+    void clock();
+
+    ostream& print(ostream& os) const;
+  };
+
+  template <int size> // stream inserter for any MissBuffer instantiation
+  static inline ostream& operator <<(ostream& os, const MissBuffer<size>& missbuf) {
+    return missbuf.print(os); // all formatting lives in MissBuffer::print()
+  }
+
+  struct PerCoreCacheCallbacks { // callback interface held by CacheHierarchy::callback; declares no virtual destructor
+    virtual void dcache_wakeup(LoadStoreInfo lsi, W64 physaddr); // called from the LFRQ code in dcache.cpp with req.lsi and req.addr
+    virtual void icache_wakeup(LoadStoreInfo lsi, W64 physaddr); // instruction-side counterpart — presumably called when an icache miss completes; confirm in dcache.cpp
+  };
+
+  struct CacheHierarchy { // groups the LFRQ, miss buffer, L1D/L1I/L2 (optionally L3) caches and the TLBs for one coreid
+    LoadFillReqQueue<LFRQ_SIZE> lfrq;
+    MissBuffer<MISSBUF_COUNT> missbuf;
+    L1Cache L1;
+    L1ICache L1I;
+    L2Cache L2;
+#ifdef ENABLE_L3_CACHE
+    L3Cache L3;
+#endif
+    DTLB dtlb;
+    ITLB itlb;
+
+    byte coreid;
+    static CacheHierarchy* hierarchies[]; // table indexed by coreid; each constructor stores `this` into it
+
+    PerCoreCacheCallbacks* callback; // starts out null
+
+    CacheHierarchy(int coreid_ = 0): lfrq(*this), missbuf(*this), coreid(coreid_) { callback = null; CacheHierarchy::hierarchies[coreid_] = this;} // NOTE(review): coreid_ is not range-checked before indexing hierarchies[]
+
+    bool probe_cache_and_sfr(W64 addr, const SFR* sfra, int sizeshift);
+    bool covered_by_sfr(W64 addr, SFR* sfr, int sizeshift);
+    void annul_lfrq_slot(int lfrqslot);
+    int issueload_slowpath(Waddr physaddr, SFR& sfra, LoadStoreInfo lsi);
+    int issueload_slowpath(Waddr physaddr, Waddr virtaddr, SFR& sfra, LoadStoreInfo lsi);
+    bool lfrq_or_missbuf_full() const { return lfrq.full() | missbuf.full(); } // bitwise |, so both full() calls are always evaluated
+    bool probe_other_caches(W64 addr);
+    void invalidate_other_caches(W64 addr, W64 virtaddr);
+
+    W64 commitstore(const SFR& sfr, W64 virtaddr, bool internal = false, int threadid = 0xff, bool perform_actual_write = true);
+    W64 speculative_store(const SFR& sfr, int threadid = 0xff);
+
+    void initiate_prefetch(W64 physaddr, W64 virtaddr, int cachelevel, bool invalidating = false);
+
+    bool probe_icache(Waddr virtaddr, Waddr physaddr);
+    int initiate_icache_miss(W64 addr, int rob = 0xffff, int threadid = 0xff);
+
+    void reset();
+    void clock();
+    void complete();
+    void complete(int threadid);
+    ostream& print(ostream& os);
+  };
+#endif // STATS_ONLY
+};
+
+struct PerContextDataCacheStats { // rootnode:
+  struct load {
+    struct hit { // node: summable
+      W64 L1;
+      W64 L2;
+      W64 L3;
+      W64 mem;
+    } hit;
+        
+    struct dtlb { // node: summable
+      W64 hits;
+      W64 misses;
+    } dtlb;
+
+    struct tlbwalk { // node: summable
+      W64 L1_dcache_hit;
+      W64 L1_dcache_miss;
+      W64 no_lfrq_mb;
+    } tlbwalk;
+  } load;
+ 
+  struct fetch {
+    struct hit { // node: summable
+      W64 L1;
+      W64 L2;
+      W64 L3;
+      W64 mem;
+    } hit;
+    
+    struct itlb { // node: summable
+      W64 hits;
+      W64 misses;
+    } itlb;
+
+    struct tlbwalk { // node: summable
+      W64 L1_dcache_hit;
+      W64 L1_dcache_miss;
+      W64 no_lfrq_mb;      
+    } tlbwalk;
+  } fetch;
+  
+  struct store {
+    W64 prefetches;
+  } store;
+};
+
+struct DataCacheStats { // rootnode:
+  struct load {
+    struct transfer { // node: summable
+      W64 L2_to_L1_full;
+      W64 L2_to_L1_partial;
+      W64 L2_L1I_full;
+    } transfer;
+  } load;
+
+  struct missbuf {
+    W64 inserts;
+    struct deliver { // node: summable
+      W64 mem_to_L3;
+      W64 L3_to_L2;
+      W64 L2_to_L1D;
+      W64 L2_to_L1I;
+    } deliver;
+  } missbuf;
+
+  struct prefetch { // node: summable
+    W64 in_L1;
+    W64 in_L2;
+    W64 required;
+  } prefetch;
+
+  struct lfrq {
+    W64 inserts;
+    W64 wakeups;
+    W64 annuls;
+    W64 resets;
+    W64 total_latency;
+    double average_latency;
+    W64 width[CacheSubsystem::MAX_WAKEUPS_PER_CYCLE+1]; // histo: 0, CacheSubsystem::MAX_WAKEUPS_PER_CYCLE+1, 1
+  } lfrq;
+
+  PerContextDataCacheStats total;
+  PerContextDataCacheStats vcpu0;
+  PerContextDataCacheStats vcpu1;
+  PerContextDataCacheStats vcpu2;
+  PerContextDataCacheStats vcpu3;
+  PerContextDataCacheStats vcpu4;
+  PerContextDataCacheStats vcpu5;
+  PerContextDataCacheStats vcpu6;
+  PerContextDataCacheStats vcpu7;
+
+};
diff -r 10448c053ad6 dcache.cpp
--- a/dcache.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/dcache.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -3,6 +3,8 @@
 // L1 and L2 Data Caches
 //
 // Copyright 2000-2005 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <dcache.h>
@@ -85,7 +87,7 @@
 int LoadFillReqQueue<size>::add(const LoadFillReq& req) {
   if unlikely (full()) return -1;
   int idx = freemap.lsb();
-  changestate(idx, freemap, waiting);         
+  changestate(idx, freemap, waiting);
   reqs[idx] = req;
   stats.dcache.lfrq.inserts++;
   return idx;
@@ -97,7 +99,7 @@
 // line at the L1 level. Once a line is delivered,
 // it is copied into the L1 cache and the corresponding
 // miss buffer can be freed.
-// 
+//
 template <int size>
 void LoadFillReqQueue<size>::wakeup(W64 address, const bitvec<LFRQ_SIZE>& lfrqmask) {
   if (logable(6)) logfile << "LFRQ.wakeup(", (void*)(Waddr)address, ", ", lfrqmask, ")", endl;
@@ -131,7 +133,7 @@
 
     int idx = ready.lsb();
     LoadFillReq& req = reqs[idx];
-    
+
     if (logable(6)) logfile << "[vcpu ", req.lsi.threadid, "] at cycle ", sim_cycle, ": wakeup LFRQ slot ", idx, ": ", req, endl;
 
     W64 delta = LO32(sim_cycle) - LO32(req.initcycle);
@@ -141,7 +143,7 @@
     } else {
       stats.dcache.lfrq.total_latency += delta;
     }
-        
+
     stats.dcache.lfrq.wakeups++;
     wakeupcount++;
     if likely (hierarchy.callback) hierarchy.callback->dcache_wakeup(req.lsi, req.addr);
@@ -170,7 +172,7 @@
 // Miss Buffer
 //
 
-template <int SIZE>    
+template <int SIZE>
 void MissBuffer<SIZE>::reset() {
   foreach (i, SIZE) {
     missbufs[i].reset();
@@ -179,7 +181,7 @@
 }
 
 
-template <int SIZE>    
+template <int SIZE>
 void MissBuffer<SIZE>::reset(int threadid) {
   foreach (i, SIZE) {
     if likely (missbufs[i].threadid == threadid) {
@@ -190,7 +192,7 @@
   }
 }
 
-template <int SIZE>    
+template <int SIZE>
 void MissBuffer<SIZE>::restart() {
   if likely (!(freemap.allset())) {
     foreach (i, SIZE) {
@@ -199,7 +201,7 @@
   }
 }
 
-template <int SIZE>    
+template <int SIZE>
 int MissBuffer<SIZE>::find(W64 addr) {
   W64 match = 0;
   foreach (i, SIZE) {
@@ -213,7 +215,7 @@
 // caches and needs service from below.
 //
 template <int SIZE>
-int MissBuffer<SIZE>::initiate_miss(W64 addr, bool hit_in_L2, bool icache, int rob, int threadid) {
+int MissBuffer<SIZE>::initiate_miss(W64 addr, W64 virtaddr, bool hit_in_L2, bool icache, int rob, int threadid) {
   bool DEBUG = logable(6);
 
   addr = floor(addr, L1_LINE_SIZE);
@@ -221,7 +223,7 @@
   int idx = find(addr);
 
   if unlikely (idx >= 0 && threadid == missbufs[idx].threadid) {
-    // Handle case where dcache miss is already in progress but some 
+    // Handle case where dcache miss is already in progress but some
     // code needed in icache is also stored in that line:
     Entry& mb = missbufs[idx];
     mb.icache |= icache;
@@ -243,13 +245,14 @@
   stats.dcache.missbuf.inserts++;
   Entry& mb = missbufs[idx];
   mb.addr = addr;
+  mb.virtaddr = virtaddr;
   mb.lfrqmap = 0;
   mb.icache = icache;
   mb.dcache = (!icache);
   mb.rob = rob;
   mb.threadid = threadid;
 
- 
+
   if (DEBUG) logfile << "[vcpu ", mb.threadid, "] mb", idx, ": allocated for address ", (void*)(Waddr)addr, " (iter ", iterations, ")", endl;
   //  assert(threadid != 0xff);
 
@@ -273,11 +276,18 @@
 
   if (DEBUG) logfile << "[vcpu ", mb.threadid, "] mb", idx, ": enter state deliver to L3 on ", (void*)(Waddr)addr, " (iter ", iterations, ")", endl;
   mb.state = STATE_DELIVER_TO_L3;
-  mb.cycles = MAIN_MEM_LATENCY;
 #else
   // L3 cache disabled
   if (DEBUG) logfile << "[vcpu ", mb.threadid, "] mb", idx, ": enter state deliver to L2 on ", (void*)(Waddr)addr, " (iter ", iterations, ")", endl;
   mb.state = STATE_DELIVER_TO_L2;
+#endif
+
+#ifdef POOR_MANS_MESI
+  if (hierarchy.probe_other_caches(addr))
+    mb.cycles = CROSS_CACHE_LATENCY;
+  else
+    mb.cycles = MAIN_MEM_LATENCY;
+#else
   mb.cycles = MAIN_MEM_LATENCY;
 #endif
 
@@ -294,8 +304,8 @@
 
   if unlikely (lfrqslot < 0)
                 return -1;
-  
-  int mbidx = initiate_miss(req.addr, hit_in_L2, 0, rob, req.lsi.threadid);
+
+  int mbidx = initiate_miss(req.addr, req.virtaddr, hit_in_L2, 0, rob, req.lsi.threadid);
   if unlikely (mbidx < 0) {
     hierarchy.lfrq.free(lfrqslot);
     return -1;
@@ -355,7 +365,7 @@
           if (DEBUG) logfile << "[vcpu ", mb.threadid, "] mb", i, ": delivered ", (void*)(Waddr)mb.addr, " to L1 dcache (map ", mb.lfrqmap, ")", endl;
           // If the L2 line size is bigger than the L1 line size, this will validate multiple lines in the L1 when an L2 line arrives:
           // foreach (i, L2_LINE_SIZE / L1_LINE_SIZE) L1.validate(mb.addr + i*L1_LINE_SIZE, bitvec<L1_LINE_SIZE>().setall());
-          hierarchy.L1.validate(mb.addr, bitvec<L1_LINE_SIZE>().setall());
+          hierarchy.L1.validate(mb.addr, mb.virtaddr, bitvec<L1_LINE_SIZE>().setall());
           stats.dcache.missbuf.deliver.L2_to_L1D++;
           hierarchy.lfrq.wakeup(mb.addr, mb.lfrqmap);
         }
@@ -391,12 +401,12 @@
 
 template <int SIZE>
 ostream& MissBuffer<SIZE>::print(ostream& os) const {
- 
+
   os << "MissBuffer<", SIZE, ">:", endl;
   foreach (i, SIZE) {
     if likely (freemap[i]) continue;
     const Entry& mb = missbufs[i];
-    os << "slot ", intstring(i, 2), ": vcpu ", mb.threadid, ", addr ", (void*)(Waddr)mb.addr, " state ", 
+    os << "slot ", intstring(i, 2), ": vcpu ", mb.threadid, ", addr ", (void*)(Waddr)mb.addr, " state ",
       padstring(missbuf_state_names[mb.state], -8), " ", (mb.dcache ? "dcache" : "      "),
       " ", (mb.icache ? "icache" : "      "), " on ", mb.cycles, " cycles -> lfrq ", mb.lfrqmap, endl;
   }
@@ -427,7 +437,17 @@
   return os;
 }
 
+/**
+ * For virtually indexed caches, set the bits in the index differing
+ * between physical frame and virtual page number to zero, as we can't map
+ * some physical address back to any virtual one necessarily.
+ * HACKALERT: In a real CPU, we quite likely would get the data directly from
+ * L2, but this is fairly difficult to model in PTLsim.
+ */
 int CacheHierarchy::issueload_slowpath(Waddr physaddr, SFR& sfra, LoadStoreInfo lsi) {
+  return issueload_slowpath(physaddr, physaddr & (~PAGE_MASK), sfra, lsi);
+}
+int CacheHierarchy::issueload_slowpath(Waddr physaddr, Waddr virtaddr, SFR& sfra, LoadStoreInfo lsi) {
   static const bool DEBUG = 0;
 
   starttimer(load_slowpath_timer);
@@ -437,7 +457,7 @@
   //
   // Loads and stores that also miss the L2 Stores that
   // miss both the L1 and L2 do not require this since
-  // there could not possibly be a previous load or 
+  // there could not possibly be a previous load or
   // store within the current trace that accessed that
   // line (otherwise it would already have been allocated
   // and locked in the L2). In this case, allocate a
@@ -458,7 +478,7 @@
   }
 
   int L2hit = 0;
-    
+
   L2CacheLine* L2line = L2.probe(physaddr);
 
   if likely (L2line) {
@@ -497,12 +517,15 @@
   // Slap a lock on the L2 line it so it can't get evicted.
   // Once it's locked up, we can move it into the L1 later.
   //
+  // SD: I doubt that this is true. L1 and L2 lines are allocated when the data
+  // arrives. See mb.clock() and the call to validate!
+  //
   // If we did have a hit, but either the L1 or L2 lines
   // were still missing bytes, initiate prefetches to fill
   // them in.
   //
 
-  LoadFillReq req(physaddr, lsi.sfrused ? sfra.data : 0, lsi.sfrused ? sfra.bytemask : 0, lsi);
+  LoadFillReq req(physaddr, virtaddr, lsi.sfrused ? sfra.data : 0, lsi.sfrused ? sfra.bytemask : 0, lsi);
 
   int lfrqslot = missbuf.initiate_miss(req, L2hit, lsi.rob);
 
@@ -523,17 +546,30 @@
   return ((sframask & reqmask) == reqmask);
 }
 
-bool CacheHierarchy::probe_cache_and_sfr(W64 addr, const SFR* sfr, int sizeshift) {
+/**
+ * For virtually indexed caches, set the bits in the index differing
+ * between physical frame and virtual page number to zero, as we can't map
+ * some physical address back to any virtual one necessarily.
+ * HACKALERT: In a real CPU, we quite likely would get the data directly from
+ * L2, but this is fairly difficult to model in PTLsim.
+ */
+bool CacheHierarchy::probe_cache_and_sfr(W64 physaddr, const SFR* sfr, int sizeshift) {
+  return probe_cache_and_sfr(physaddr, physaddr & (~PAGE_MASK), sfr, sizeshift);
+}
+bool CacheHierarchy::probe_cache_and_sfr(W64 physaddr, W64 virtaddr, const SFR* sfr, int sizeshift) {
   bitvec<L1_LINE_SIZE> sframask, reqmask;
-  prep_sframask_and_reqmask(sfr, addr, sizeshift, sframask, reqmask);
+  prep_sframask_and_reqmask(sfr, physaddr, sizeshift, sframask, reqmask);
 
   //
   // Short circuit if the SFR covers the entire load: no need for cache probe
   //
   if unlikely ((sframask & reqmask) == reqmask) return true;
 
-  L1CacheLine* L1line = L1.probe(addr);
-
+#ifndef L1_VIRTUALLY_INDEXED
+  L1CacheLine* L1line = L1.probe(physaddr);
+#else
+  L1CacheLine* L1line = L1.split_probe(physaddr, virtaddr);
+#endif
   if unlikely (!L1line) return false;
 
   //
@@ -552,34 +588,51 @@
 void CacheHierarchy::annul_lfrq_slot(int lfrqslot) {
   lfrq.annul(lfrqslot);
 }
-  
+
 //
 // NOTE: lsi should specify destination of REG_null for prefetches!
 //
 static const int PREFETCH_STOPS_AT_L2 = 0;
-  
-void CacheHierarchy::initiate_prefetch(W64 addr, int cachelevel) {
+
+void CacheHierarchy::initiate_prefetch(W64 physaddr, W64 virtaddr, int cachelevel, bool invalidating) {
   static const bool DEBUG = 0;
 
-  addr = floor(addr, L1_LINE_SIZE);
-    
-  L1CacheLine* L1line = L1.probe(addr);
-    
+  physaddr = floor(physaddr, L1_LINE_SIZE);
+  virtaddr = floor(virtaddr, L1_LINE_SIZE);
+#ifndef L1_VIRTUALLY_INDEXED
+  L1CacheLine* L1line = L1.probe(physaddr);
+#else
+  L1CacheLine* L1line = L1.split_probe(physaddr, virtaddr);
+#endif
+
   if unlikely (L1line) {
     stats.dcache.prefetch.in_L1++;
+#ifdef POOR_MANS_MESI
+    if (invalidating) invalidate_other_caches(physaddr, virtaddr);
+#endif
     return;
   }
-    
-  L2CacheLine* L2line = L2.probe(addr);
-    
+
+  L2CacheLine* L2line = L2.probe(physaddr);
+
   if unlikely (L2line) {
     stats.dcache.prefetch.in_L2++;
-    if (PREFETCH_STOPS_AT_L2) return; // only move up to L2 level, and it's already there
+    if (PREFETCH_STOPS_AT_L2) {
+#ifdef POOR_MANS_MESI
+      if (invalidating) invalidate_other_caches(physaddr, virtaddr);
+#endif
+      return; // only move up to L2 level, and it's already there
+    }
   }
-    
-  if (DEBUG) logfile << "Prefetch requested for ", (void*)(Waddr)addr, " to cache level ", cachelevel, endl;
-    
-  missbuf.initiate_miss(addr, L2line);
+
+  if (DEBUG) logfile << "Prefetch requested for ", (void*)(Waddr)physaddr, " to cache level ", cachelevel, endl;
+
+  // NB: This might actually get the line from another cache, i.e. with fewer cycles than the full memory latency.
+  missbuf.initiate_miss(physaddr, virtaddr, L2line);
+  // NB(cont'd): hence we will just invalidate after initiating the miss!
+#ifdef POOR_MANS_MESI
+  if (invalidating) invalidate_other_caches(physaddr, (W64)virtaddr);
+#endif
   stats.dcache.prefetch.required++;
 }
 
@@ -590,18 +643,20 @@
 bool CacheHierarchy::probe_icache(Waddr virtaddr, Waddr physaddr) {
   L1ICacheLine* L1line = L1I.probe(physaddr);
   bool hit = (L1line != null);
-    
+
   return hit;
 }
 
 int CacheHierarchy::initiate_icache_miss(W64 addr, int rob, int threadid) {
   addr = floor(addr, L1I_LINE_SIZE);
   bool line_in_L2 = (L2.probe(addr) != null);
-  int mb = missbuf.initiate_miss(addr, L2.probe(addr), true, rob, threadid);
-    
+  /* SD: Pass 0 as the virtual address, since the L1I is not virtually indexed.
+     If it were, obtaining the virtual address here would be a TODO! */
+  int mb = missbuf.initiate_miss(addr, 0, L2.probe(addr), true, rob, threadid);
+
   if (logable(6))
     logfile << "[vcpu ", threadid, "] Initiate icache miss on ", (void*)(Waddr)addr, " to missbuf ", mb, " (", (line_in_L2 ? "in L2" : "not in L2"), ")", endl;
-    
+
   return mb;
 }
 
@@ -610,7 +665,7 @@
 // any cache lines. The store must have already been checked
 // to have no exceptions.
 //
-W64 CacheHierarchy::commitstore(const SFR& sfr, int threadid, bool perform_actual_write) {
+W64 CacheHierarchy::commitstore(const SFR& sfr, W64 virtaddr, bool internal, int threadid, bool perform_actual_write) {
   if unlikely (sfr.invalid | (sfr.bytemask == 0)) return 0;
 
   static const bool DEBUG = 0;
@@ -619,18 +674,31 @@
 
   W64 addr = sfr.physaddr << 3;
 
+  // internal stores do not hit the caches
+  if unlikely (internal && perform_actual_write) {
+    storemask(addr, sfr.data, sfr.bytemask);
+    return 0;
+  }
+
   L2CacheLine* L2line = L2.select(addr);
 
   if likely (perform_actual_write) storemask(addr, sfr.data, sfr.bytemask);
+#ifdef POOR_MANS_MESI
+  invalidate_other_caches(addr, virtaddr);
+#endif
 
+#ifndef L1_VIRTUALLY_INDEXED
   L1CacheLine* L1line = L1.select(addr);
+#else
+  L1CacheLine* L1line = L1.split_select(addr, virtaddr);
+#endif
 
   L1line->valid |= ((W64)sfr.bytemask << lowbits(addr, 6));
   L2line->valid |= ((W64)sfr.bytemask << lowbits(addr, 6));
 
   if unlikely (!L1line->valid.allset()) {
     per_context_dcache_stats_update(threadid, store.prefetches++);
-    missbuf.initiate_miss(addr, L2line->valid.allset(), false, 0xffff, threadid);
+    missbuf.initiate_miss(addr, virtaddr, L2line->valid.allset(), false, 0xffff, threadid);
   }
 
   stoptimer(store_flush_timer);
@@ -643,8 +711,8 @@
 // so they can be immediately forwarded to loads, but do not actually
 // write to the cache itself.
 //
-W64 CacheHierarchy::speculative_store(const SFR& sfr, int threadid) {
-  return commitstore(sfr, threadid, false);
+W64 CacheHierarchy::speculative_store(const SFR& sfr, W64 virtaddr, int threadid) {
+  return commitstore(sfr, virtaddr, false, threadid, false);
 }
 
 void CacheHierarchy::clock() {
@@ -661,6 +729,54 @@
 
   lfrq.clock();
   missbuf.clock();
+}
+
+//
+// Probes the L2 (and, if enabled, L3) caches of all other cache hierarchies in
+// the system and reports whether any of them holds the line containing addr.
+// L1 caches of the other hierarchies are not probed. Stops at the first hit.
+//
+bool CacheHierarchy::probe_other_caches(W64 addr) {
+  // coreid names this hierarchy's own id, exactly as in invalidate_other_caches();
+  // do not copy it into a same-named local, which would only read itself.
+  bool crosshit = false;
+
+  foreach (other_id, MAX_HIERARCHIES) {
+    //TODO: Add statistics!
+    if (other_id == coreid) continue;
+    CacheHierarchy* other_hier = hierarchies[other_id];
+    if (!other_hier) continue;
+    crosshit |= (other_hier->L2.probe(addr) != null);
+#ifdef ENABLE_L3_CACHE
+    crosshit |= (other_hier->L3.probe(addr) != null);
+#endif
+    if (crosshit) break;
+  }
+  return crosshit;
+}
+
+// Invalidates the line at addr in the L1, L1I, L2 and (if enabled) L3 caches of
+// every registered hierarchy except this one. virtaddr is only consulted when
+// the L1 is virtually indexed (split_invalidate); empty slots are skipped.
+void CacheHierarchy::invalidate_other_caches(W64 addr, W64 virtaddr) {
+  CacheHierarchy *other_hier;
+
+  foreach (other_id, MAX_HIERARCHIES) {
+    //TODO: Add statistics!
+    if (other_id == coreid) continue;
+    other_hier = hierarchies[other_id];
+    if (!other_hier) continue;
+#ifndef L1_VIRTUALLY_INDEXED
+    other_hier->L1.invalidate(addr);
+#else
+    other_hier->L1.split_invalidate(addr, virtaddr);
+#endif
+    other_hier->L1I.invalidate(addr);
+    other_hier->L2.invalidate(addr);
+#ifdef ENABLE_L3_CACHE
+    other_hier->L3.invalidate(addr);
+#endif
+  }
 }
 
 void CacheHierarchy::complete() {
@@ -684,14 +800,18 @@
   L1I.reset();
   itlb.reset();
   dtlb.reset();
+#ifdef USE_L2_TLB
+  l2itlb.reset();
+  l2dtlb.reset();
+#endif
 }
 
 ostream& CacheHierarchy::print(ostream& os) {
   os << "Data Cache Subsystem:", endl;
   os << lfrq;
   os << missbuf;
-  // logfile << L1; 
-  // logfile << L2; 
+  // logfile << L1;
+  // logfile << L2;
   return os;
 }
 
@@ -722,3 +842,4 @@
 }
 */
 
+CacheHierarchy* CacheHierarchy::hierarchies[MAX_HIERARCHIES] = {null}; // all slots start null; indexed by core id in probe_other_caches/invalidate_other_caches
diff -r 10448c053ad6 dcache.h
--- a/dcache.h	Thu May 31 15:36:20 2007 +0200
+++ b/dcache.h	Wed Nov 05 14:15:51 2008 +0100
@@ -3,732 +3,43 @@
 // PTLsim: Cycle Accurate x86-64 Simulator
 // Data Cache
 //
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
 // Copyright 2000-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #ifndef _DCACHE_H_
 #define _DCACHE_H_
 
 #include <ptlsim.h>
-//#include <datastore.h>
 
-struct LoadStoreInfo {
-  W16 rob;
-  W8  threadid;
-  W8  sizeshift:2, aligntype:2, sfrused:1, internal:1, signext:1, pad1:1;
-  W32 pad32;
-  RawDataAccessors(LoadStoreInfo, W64);
-};
-
-#define per_context_dcache_stats_ref(vcpuid) (*(((PerContextDataCacheStats*)&stats.dcache.vcpu0) + (vcpuid)))
-#define per_context_dcache_stats_update(vcpuid, expr) stats.dcache.total.expr, per_context_dcache_stats_ref(vcpuid).expr
-
-namespace CacheSubsystem {
-  // How many load wakeups can be driven into the core each cycle:
-  const int MAX_WAKEUPS_PER_CYCLE = 2;
-
-#ifndef STATS_ONLY
-
-// non-debugging only:
-//#define __RELEASE__
-#ifdef __RELEASE__
-#undef assert
-#define assert(x) (x)
+// Pull in the cache model for the core flavour selected in ptlsim.h.
+// Exactly one of CORE_GENERIC, CORE_AMD_K8 or CORE_AMD_BARCELONA_ASF is
+// expected; if several are defined, the first one listed below wins.
+#if defined(CORE_GENERIC)
+#include <dcache-generic.h>
+#elif defined(CORE_AMD_K8)
+#include <dcache-amd-k8.h>
+#elif defined(CORE_AMD_BARCELONA_ASF)
+#include <dcache-amd-barcelona-asf.h>
+#else
+#error Please specify a core flavour by defining CORE_XXX in ptlsim.h!
+#endif
 #endif
 
-  //#define CACHE_ALWAYS_HITS
-  //#define L2_ALWAYS_HITS
-  
-  // 16 KB L1 at 2 cycles
-  const int L1_LINE_SIZE = 64;
-  const int L1_SET_COUNT = 64;
-  const int L1_WAY_COUNT = 4;
-  // #define ENFORCE_L1_DCACHE_BANK_CONFLICTS
-  const int L1_DCACHE_BANKS = 8; // 8 banks x 8 bytes/bank = 64 bytes/line
-
-  // 32 KB L1I
-  const int L1I_LINE_SIZE = 64;
-  const int L1I_SET_COUNT = 128;
-  const int L1I_WAY_COUNT = 4;
-
-  // 256 KB L2 at 6 cycles
-  const int L2_LINE_SIZE = 64;
-  const int L2_SET_COUNT = 256; // 256 KB
-  const int L2_WAY_COUNT = 16;
-  const int L2_LATENCY   = 6; // don't include the extra wakeup cycle (waiting->ready state transition) in the LFRQ
-
-#define ENABLE_L3_CACHE
-#ifdef ENABLE_L3_CACHE
-  // 2 MB L3 cache (4096 sets, 16 ways) with 64-byte lines, latency 16 cycles
-  const int L3_SET_COUNT = 1024;
-  const int L3_WAY_COUNT = 16;
-  const int L3_LINE_SIZE = 128;
-  const int L3_LATENCY   = 16;
-#endif
-  // Load Fill Request Queue (maximum number of missed loads)
-  const int LFRQ_SIZE = 63;
-
-  // Allow up to 16 outstanding lines in the L2 awaiting service:
-  const int MISSBUF_COUNT = 16;
-  const int MAIN_MEM_LATENCY = 100;
-
-  // TLBs
-#define USE_TLB
-  const int ITLB_SIZE = 32;
-  const int DTLB_SIZE = 32;
-
-//#define ISSUE_LOAD_STORE_DEBUG
-//#define CHECK_LOADS_AND_STORES
-
-// Line Usage Statistics
-
-//#define TRACK_LINE_USAGE
-
-#ifdef TRACK_LINE_USAGE
-#define DCACHE_L1_LINE_LIFETIME_INTERVAL   1
-#define DCACHE_L1_LINE_DEADTIME_INTERVAL   1
-#define DCACHE_L1_LINE_HITCOUNT_INTERVAL   1
-#define DCACHE_L1_LINE_LIFETIME_SLOTS      8192
-#define DCACHE_L1_LINE_DEADTIME_SLOTS      8192
-#define DCACHE_L1_LINE_HITCOUNT_SLOTS      64
-
-#define DCACHE_L1I_LINE_LIFETIME_INTERVAL  16
-#define DCACHE_L1I_LINE_DEADTIME_INTERVAL  16
-#define DCACHE_L1I_LINE_HITCOUNT_INTERVAL  1
-#define DCACHE_L1I_LINE_LIFETIME_SLOTS     8192
-#define DCACHE_L1I_LINE_DEADTIME_SLOTS     8192
-#define DCACHE_L1I_LINE_HITCOUNT_SLOTS     1024
-
-#define DCACHE_L2_LINE_LIFETIME_INTERVAL   4
-#define DCACHE_L2_LINE_DEADTIME_INTERVAL   4
-#define DCACHE_L2_LINE_HITCOUNT_INTERVAL   1
-#define DCACHE_L2_LINE_LIFETIME_SLOTS      65536
-#define DCACHE_L2_LINE_DEADTIME_SLOTS      65536
-#define DCACHE_L2_LINE_HITCOUNT_SLOTS      256
-
-#define DCACHE_L3_LINE_LIFETIME_INTERVAL   64
-#define DCACHE_L3_LINE_DEADTIME_INTERVAL   64
-#define DCACHE_L3_LINE_HITCOUNT_INTERVAL   1
-#define DCACHE_L3_LINE_LIFETIME_SLOTS      16384
-#define DCACHE_L3_LINE_DEADTIME_SLOTS      16384
-#define DCACHE_L3_LINE_HITCOUNT_SLOTS      256
-#endif
-
-  //
-  // Cache Line Types
-  //
-  template <int linesize>
-  struct CacheLine {
-#ifdef TRACK_LINE_USAGE
-    W32 filltime;
-    W32 lasttime;
-    W32 hitcount;
-#else
-    byte dummy;
-#endif
-    void reset() { clearstats(); }
-    void invalidate() { reset(); }
-    void fill(W64 tag, const bitvec<linesize>& valid) { }
-
-    void clearstats() {
-#ifdef TRACK_LINE_USAGE
-      filltime = sim_cycle;
-      lasttime = sim_cycle;
-      hitcount = 0;
-#endif
-    }
-
-    ostream& print(ostream& os, W64 tag) const;
-  };
-
-  template <int linesize>
-  static inline ostream& operator <<(ostream& os, const CacheLine<linesize>& line) {
-    return line.print(os, 0);
-  }
-
-  template <int linesize>
-  struct CacheLineWithValidMask {
-    bitvec<linesize> valid;
-#ifdef TRACK_LINE_USAGE
-    W32 filltime;
-    W32 lasttime;
-    W32 hitcount;
-#endif
-
-    void clearstats() {
-#ifdef TRACK_LINE_USAGE
-      filltime = sim_cycle;
-      lasttime = sim_cycle;
-      hitcount = 0;
-#endif
-    }
-
-    void reset() { valid = 0; clearstats(); }
-    void invalidate() { reset(); }
-    void fill(W64 tag, const bitvec<linesize>& valid) { this->valid |= valid; }
-    ostream& print(ostream& os, W64 tag) const;
-  };
-
-  template <int linesize>
-  static inline ostream& operator <<(ostream& os, const CacheLineWithValidMask<linesize>& line) {
-    return line.print(os, 0);
-  }
-
-  typedef CacheLineWithValidMask<L1_LINE_SIZE> L1CacheLine;
-  typedef CacheLine<L1I_LINE_SIZE> L1ICacheLine;
-  typedef CacheLineWithValidMask<L2_LINE_SIZE> L2CacheLine;
-#ifdef ENABLE_L3_CACHE
-  typedef CacheLine<L3_LINE_SIZE> L3CacheLine;
-#endif
-
-  //
-  // L1 data cache
-  //
-#ifdef TRACK_LINE_USAGE
-  static const char* cache_names[4] = {"L1", "I1", "L2", "L3"};
-
-  template <int uniq, typename V, int LIFETIME_INTERVAL, int LIFETIME_SLOTS, int DEADTIME_INTERVAL, int DEADTIME_SLOTS, int HITCOUNT_INTERVAL, int HITCOUNT_SLOTS>
-  struct HistogramAssociativeArrayStatisticsCollector {
-    static W64 line_lifetime_histogram[LIFETIME_SLOTS];
-    static W64 line_deadtime_histogram[DEADTIME_SLOTS];
-    static W64 line_hitcount_histogram[HITCOUNT_SLOTS];
-
-    static const bool FORCE_DEBUG = 0;
-
-    HistogramAssociativeArrayStatisticsCollector() {
-      reset();
-    }
-
-    static void reset() {
-      setzero(line_lifetime_histogram);
-      setzero(line_deadtime_histogram);
-      setzero(line_hitcount_histogram);
-    }
-
-    static void evicted(const V& line, W64 tag) {
-      // Line has been evicted: update statistics
-      W64s lifetime = line.lasttime - line.filltime;
-      assert(lifetime >= 0);
-      int lifetimeslot = clipto(lifetime / LIFETIME_INTERVAL, 0, LIFETIME_SLOTS-1);
-      line_lifetime_histogram[lifetimeslot]++;
-
-      W64s deadtime = sim_cycle - line.lasttime;
-      int deadtimeslot = clipto(deadtime / DEADTIME_INTERVAL, 0, DEADTIME_SLOTS-1);
-      line_deadtime_histogram[deadtimeslot]++;
-
-      W64 hitcount = line.hitcount;
-      int hitcountslot = clipto(hitcount / HITCOUNT_INTERVAL, 0, HITCOUNT_SLOTS-1);
-      line_hitcount_histogram[hitcountslot]++;
-
-      if (logable(6) | FORCE_DEBUG) logfile << "[", cache_names[uniq], "] ", sim_cycle, ": evicted(", (void*)tag, "): lifetime ", lifetime, ", deadtime ", deadtime, ", hitcount ", hitcount, " (line addr ", &line, ")", endl;
-    }
-
-    static void filled(V& line, W64 tag) {
-      line.filltime = sim_cycle;
-      line.lasttime = sim_cycle;
-      line.hitcount = 1;
-
-      if (logable(6) | FORCE_DEBUG) logfile << "[", cache_names[uniq], "] ", sim_cycle, ": filled(", (void*)tag, ")", " (line addr ", &line, ")", endl;
-    }
-
-    static void inserted(V& line, W64 newtag, int way) {
-      filled(line, newtag);
-    }
-
-    static void replaced(V& line, W64 oldtag, W64 newtag, int way) {
-      evicted(line, oldtag);
-      filled(line, newtag);
-    }
-
-    static void probed(V& line, W64 tag, int way, bool hit) { 
-      if (logable(6) | FORCE_DEBUG) logfile << "[", cache_names[uniq], "] ", sim_cycle, ": probe(", (void*)tag, "): ", (hit ? "HIT" : "miss"), " way ", way, ": hitcount ", line.hitcount, ", filltime ", line.filltime, ", lasttime ", line.lasttime, " (line addr ", &line, ")", endl;
-      if (hit) {
-        line.hitcount++;
-        line.lasttime = sim_cycle;
-      }
-    }
-
-    static void overflow(W64 tag) { }
-
-    static void locked(V& slot, W64 tag, int way) { }
-    static void unlocked(V& slot, W64 tag, int way) { }
-
-    static void invalidated(V& line, W64 oldtag, int way) { evicted(line, oldtag); }
-
-    static void savestats(DataStoreNode& ds) {
-      ds.add("lifetime", (W64s*)line_lifetime_histogram, LIFETIME_SLOTS, 0, ((LIFETIME_SLOTS-1) * LIFETIME_INTERVAL), LIFETIME_INTERVAL);
-      ds.add("deadtime", (W64s*)line_deadtime_histogram, DEADTIME_SLOTS, 0, ((DEADTIME_SLOTS-1) * DEADTIME_INTERVAL), DEADTIME_INTERVAL);
-      ds.add("hitcount", (W64s*)line_hitcount_histogram, HITCOUNT_SLOTS, 0, ((HITCOUNT_SLOTS-1) * HITCOUNT_INTERVAL), HITCOUNT_INTERVAL);
-    }
-  };
-
-  typedef HistogramAssociativeArrayStatisticsCollector<0, L1CacheLine,
-    DCACHE_L1_LINE_LIFETIME_INTERVAL, DCACHE_L1_LINE_LIFETIME_SLOTS, 
-    DCACHE_L1_LINE_DEADTIME_INTERVAL, DCACHE_L1_LINE_DEADTIME_SLOTS, 
-    DCACHE_L1_LINE_HITCOUNT_INTERVAL, DCACHE_L1_LINE_HITCOUNT_SLOTS> L1StatsCollectorBase;
-
-  typedef HistogramAssociativeArrayStatisticsCollector<1, L1ICacheLine,
-    DCACHE_L1I_LINE_LIFETIME_INTERVAL, DCACHE_L1I_LINE_LIFETIME_SLOTS, 
-    DCACHE_L1I_LINE_DEADTIME_INTERVAL, DCACHE_L1I_LINE_DEADTIME_SLOTS, 
-    DCACHE_L1I_LINE_HITCOUNT_INTERVAL, DCACHE_L1I_LINE_HITCOUNT_SLOTS> L1IStatsCollectorBase;
-
-  typedef HistogramAssociativeArrayStatisticsCollector<2, L2CacheLine,
-    DCACHE_L2_LINE_LIFETIME_INTERVAL, DCACHE_L2_LINE_LIFETIME_SLOTS, 
-    DCACHE_L2_LINE_DEADTIME_INTERVAL, DCACHE_L2_LINE_DEADTIME_SLOTS, 
-    DCACHE_L2_LINE_HITCOUNT_INTERVAL, DCACHE_L2_LINE_HITCOUNT_SLOTS> L2StatsCollectorBase;
-
-#ifdef ENABLE_L3_CACHE
-  typedef HistogramAssociativeArrayStatisticsCollector<3, L3CacheLine,
-    DCACHE_L3_LINE_LIFETIME_INTERVAL, DCACHE_L3_LINE_LIFETIME_SLOTS, 
-    DCACHE_L3_LINE_DEADTIME_INTERVAL, DCACHE_L3_LINE_DEADTIME_SLOTS, 
-    DCACHE_L3_LINE_HITCOUNT_INTERVAL, DCACHE_L3_LINE_HITCOUNT_SLOTS> L3StatsCollectorBase;
-#endif
-
-  struct L1StatsCollector: public L1StatsCollectorBase { };
-  struct L1IStatsCollector: public L1IStatsCollectorBase { };
-  struct L2StatsCollector: public L2StatsCollectorBase { };
-#ifdef ENABLE_L3_CACHE
-  struct L3StatsCollector: public L3StatsCollectorBase { };
-#endif
-
-#else
-  typedef NullAssociativeArrayStatisticsCollector<W64, L1CacheLine> L1StatsCollector;
-  typedef NullAssociativeArrayStatisticsCollector<W64, L1ICacheLine> L1IStatsCollector;
-  typedef NullAssociativeArrayStatisticsCollector<W64, L2CacheLine> L2StatsCollector;
-#ifdef ENABLE_L3_CACHE
-  typedef NullAssociativeArrayStatisticsCollector<W64, L3CacheLine> L3StatsCollector;
-#endif
-#endif
-
-  template <typename V, int setcount, int waycount, int linesize, typename stats = NullAssociativeArrayStatisticsCollector<W64, V> > 
-  struct DataCache: public AssociativeArray<W64, V, setcount, waycount, linesize, stats> {
-    typedef AssociativeArray<W64, V, setcount, waycount, linesize, stats> base_t;
-    void clearstats() {
-#ifdef TRACK_LINE_USAGE
-      foreach (set, L1_SET_COUNT) {
-        foreach (way, waycount) {
-          base_t::sets[set][way].clearstats();
-        }
-      }
-#endif
-    }
-  };
-
-  struct L1Cache: public DataCache<L1CacheLine, L1_SET_COUNT, L1_WAY_COUNT, L1_LINE_SIZE, L1StatsCollector> {
-    L1CacheLine* validate(W64 addr, const bitvec<L1_LINE_SIZE>& valid) {
-      addr = tagof(addr);
-      L1CacheLine* line = select(addr);
-      line->fill(addr, valid);
-      return line;
-    }
-  };
-
-  static inline ostream& operator <<(ostream& os, const L1Cache& cache) {
-    return os;
-  }
-
-  //
-  // L1 instruction cache
-  //
-
-  struct L1ICache: public DataCache<L1ICacheLine, L1I_SET_COUNT, L1I_WAY_COUNT, L1I_LINE_SIZE, L1IStatsCollector> {
-    L1ICacheLine* validate(W64 addr, const bitvec<L1I_LINE_SIZE>& valid) {
-      addr = tagof(addr);
-      L1ICacheLine* line = select(addr);
-      line->fill(addr, valid);
-      return line;
-    }
-  };
-
-  static inline ostream& operator <<(ostream& os, const L1ICache& cache) {
-    return os;
-  }
-
-  //
-  // L2 cache
-  //
-
-  typedef DataCache<L2CacheLine, L2_SET_COUNT, L2_WAY_COUNT, L2_LINE_SIZE, L2StatsCollector> L2CacheBase;
-
-  struct L2Cache: public L2CacheBase {
-    void validate(W64 addr) {
-      L2CacheLine* line = select(addr);
-      if (!line) return;
-      line->valid.setall();
-    }
-
-    void deliver(W64 address);
-  };
-
-  //
-  // L3 cache
-  //
-#ifdef ENABLE_L3_CACHE
-  static inline ostream& operator <<(ostream& os, const L3CacheLine& line) {
-    return line.print(os, 0);
-  }
-
-  struct L3Cache: public DataCache<L3CacheLine, L3_SET_COUNT, L3_WAY_COUNT, L3_LINE_SIZE, L3StatsCollector> {
-    L3CacheLine* validate(W64 addr) {
-      W64 oldaddr;
-      L3CacheLine* line = select(addr, oldaddr);
-      return line;
-    }
-  };
-#endif
-
-  static inline void prep_sframask_and_reqmask(const SFR* sfr, W64 addr, int sizeshift, bitvec<L1_LINE_SIZE>& sframask, bitvec<L1_LINE_SIZE>& reqmask) {
-    sframask = (sfr) ? (bitvec<L1_LINE_SIZE>(sfr->bytemask) << 8*lowbits(sfr->physaddr, log2(L1_LINE_SIZE)-3)) : 0;
-    reqmask = bitvec<L1_LINE_SIZE>(bitmask(1 << sizeshift)) << lowbits(addr, log2(L1_LINE_SIZE));
-  }
-
-  static inline void prep_L2_sframask_and_reqmask(const SFR* sfr, W64 addr, int sizeshift, bitvec<L2_LINE_SIZE>& sframask, bitvec<L2_LINE_SIZE>& reqmask) {
-    sframask = (sfr) ? (bitvec<L2_LINE_SIZE>(sfr->bytemask) << 8*lowbits(sfr->physaddr, log2(L2_LINE_SIZE)-3)) : 0;
-    reqmask = bitvec<L2_LINE_SIZE>(bitmask(1 << sizeshift)) << lowbits(addr, log2(L2_LINE_SIZE));
-  }
-
-  //
-  // TLB class with one-hot semantics. 36 bit tags are required since
-  // virtual addresses are 48 bits, so 48 - 12 (2^12 bytes per page)
-  // is 36 bits.
-  //
-  template <int tlbid, int size>
-  struct TranslationLookasideBuffer: public FullyAssociativeTagsNbitOneHot<size, 40> {
-    typedef FullyAssociativeTagsNbitOneHot<size, 40> base_t;
-    TranslationLookasideBuffer(): base_t() { }
-
-    void reset() {
-      base_t::reset();
-    }
-
-    // Get the 40-bit TLB tag (36 bit virtual page ID plus 4 bit threadid)
-    static W64 tagof(W64 addr, W64 threadid) {
-      return bits(addr, 12, 36) | (threadid << 36);
-    }
-
-    bool probe(W64 addr, int threadid = 0) {
-      W64 tag = tagof(addr, threadid);
-      return (base_t::probe(tag) >= 0);
-    }
-
-    bool insert(W64 addr, int threadid = 0) {
-      addr = floor(addr, PAGE_SIZE);
-      W64 tag = tagof(addr, threadid);
-      W64 oldtag;
-      int way = base_t::select(tag, oldtag);
-      W64 oldaddr = lowbits(oldtag, 36) << 12;
-      if (logable(6)) {
-        logfile << "TLB insertion of virt page ", (void*)(Waddr)addr, " (virt addr ", 
-          (void*)(Waddr)(addr), ") into way ", way, ": ",
-          ((oldtag != tag) ? "evicted old entry" : "already present"), endl;
-      }
-      return (oldtag != tag);
-    }
-
-    int flush_all() {
-      reset();
-      return size;
-    }
-
-    int flush_thread(W64 threadid) {
-      W64 tag = threadid << 36;
-      W64 tagmask = 0xfULL << 36;
-      bitvec<size> slotmask = base_t::masked_match(tag, tagmask);
-      int n = slotmask.popcount();
-      base_t::masked_invalidate(slotmask);
-      return n;
-    }
-
-    int flush_virt(Waddr virtaddr, W64 threadid) {
-      return invalidate(tagof(virtaddr, threadid));
-    }
-  };
-
-  template <int tlbid, int size>
-  static inline ostream& operator <<(ostream& os, const TranslationLookasideBuffer<tlbid, size>& tlb) {
-    return tlb.print(os);
-  }
-
-  typedef TranslationLookasideBuffer<0, DTLB_SIZE> DTLB;
-  typedef TranslationLookasideBuffer<1, ITLB_SIZE> ITLB;
-
-  struct CacheHierarchy;
-
-  //
-  // Load fill request queue (LFRQ) contains any requests for outstanding
-  // loads from both the L2 or L1. 
-  //
-  struct LoadFillReq {
-    W64 addr;       // physical address
-    W64 data;       // data already known so far (e.g. from SFR)
-    LoadStoreInfo lsi;
-    W32  initcycle;
-    byte mask;
-    byte fillL1:1, fillL2:1;
-
-    inline LoadFillReq() { }
-  
-    LoadFillReq(W64 addr, W64 data, byte mask, LoadStoreInfo lsi) {
-      this->addr = addr;
-      this->data = data;
-      this->mask = mask;
-      this->lsi = lsi;
-      this->lsi.threadid = lsi.threadid; 
-      this->fillL1 = 1;
-      this->fillL2 = 1;
-      this->initcycle = sim_cycle;
-    }
-
-    ostream& print(ostream& os) const {
-      os << " TH ", lsi.threadid, "  ", "0x", hexstring(data, 64), " @ ", (void*)(Waddr)addr, " -> rob ", lsi.rob;
-      os << ": shift ", lsi.sizeshift, ", signext ", lsi.signext, ", mask ", bitstring(mask, 8, true);
-      return os;
-    }
-  };
-
-  static inline ostream& operator <<(ostream& os, const LoadFillReq& req) {
-    return req.print(os);
-  }
-
-  template <int size>
-  struct LoadFillReqQueue {
-    CacheHierarchy& hierarchy;
-    bitvec<size> freemap;                    // Slot is free
-    bitvec<size> waiting;                    // Waiting for the line to arrive in the L1
-    bitvec<size> ready;                      // Wait to extract/signext and write into register
-    LoadFillReq reqs[size];
-
-    static const int SIZE = size;
-
-    LoadFillReqQueue(): hierarchy(*((CacheHierarchy*)null)) { reset(); }
-    LoadFillReqQueue(CacheHierarchy& hierarchy_): hierarchy(hierarchy_) { reset(); }
-
-    // Clear entries belonging to one thread
-    void reset(int threadid);
-
-    // Reset all threads
-    void reset() {
-      freemap.setall();
-      ready = 0;
-      waiting = 0;
-    }
-
-    void changestate(int idx, bitvec<size>& oldstate, bitvec<size>& newstate) {
-      oldstate[idx] = 0;
-      newstate[idx] = 1;
-    }
-
-    void free(int lfrqslot) {
-      changestate(lfrqslot, waiting, freemap);
-    }
-
-    bool full() const {
-      return (!freemap);
-    }
-
-    void annul(int lfrqslot);
-
-    void restart();
-
-    int add(const LoadFillReq& req);
-
-    void wakeup(W64 address, const bitvec<LFRQ_SIZE>& lfrqmask);
-
-    void clock();
-
-    LoadFillReq& operator [](int idx) { return reqs[idx]; }
-    const LoadFillReq& operator [](int idx) const { return reqs[idx]; }
-
-    ostream& print(ostream& os) const;
-  };
-
-  template <int size>
-  static inline ostream& operator <<(ostream& os, const LoadFillReqQueue<size>& lfrq) {
-    return lfrq.print(os);
-  }
-
-  enum { STATE_IDLE, STATE_DELIVER_TO_L3, STATE_DELIVER_TO_L2, STATE_DELIVER_TO_L1 };
-  static const char* missbuf_state_names[] = {"idle", "mem->L3", "L3->L2", "L2->L1"};
-
-  template <int SIZE>
-  struct MissBuffer {
-    struct Entry {
-      W64 addr;     // physical line address we are waiting for
-      W16 state;
-      W16 dcache:1, icache:1;    // L1I vs L1D
-      W32 cycles;
-      W16 rob; // to identify which thread.
-      W8 threadid;
-
-      bitvec<LFRQ_SIZE> lfrqmap;  // which LFRQ entries should this load wake up?
-      void reset() {
-        lfrqmap = 0;
-        addr = 0xffffffffffffffffULL;
-        state = STATE_IDLE;
-        cycles = 0;
-        icache = 0;
-        dcache = 0;
-        rob = 0xffff;
-        threadid = 0xff;
-      }
-    };
-
-    MissBuffer(): hierarchy(*((CacheHierarchy*)null)) { reset(); }
-    MissBuffer(CacheHierarchy& hierarchy_): hierarchy(hierarchy_) { reset(); }
-
-    CacheHierarchy& hierarchy;
-    Entry missbufs[SIZE];
-    bitvec<SIZE> freemap;
-    
-    void reset();
-    void reset(int threadid);
-    void restart();
-    bool full() const { return (!freemap); }
-    int find(W64 addr);
-    int initiate_miss(W64 addr, bool hit_in_L2, bool icache = 0, int rob = 0xffff, int threadid = 0xfe);
-    int initiate_miss(const LoadFillReq& req, bool hit_in_L2, int rob = 0xffff);
-    void annul_lfrq(int slot);
-    void annul_lfrq(int slot, int threadid);
-    void clock();
-
-    ostream& print(ostream& os) const;
-  };
-
-  template <int size>
-  static inline ostream& operator <<(ostream& os, const MissBuffer<size>& missbuf) {
-    return missbuf.print(os);
-  }
-
-  struct PerCoreCacheCallbacks {
-    virtual void dcache_wakeup(LoadStoreInfo lsi, W64 physaddr);
-    virtual void icache_wakeup(LoadStoreInfo lsi, W64 physaddr);
-  };
-
-  struct CacheHierarchy {
-    LoadFillReqQueue<LFRQ_SIZE> lfrq;
-    MissBuffer<MISSBUF_COUNT> missbuf;
-    L1Cache L1;
-    L1ICache L1I;
-    L2Cache L2;
-#ifdef ENABLE_L3_CACHE
-    L3Cache L3;
-#endif
-    DTLB dtlb;
-    ITLB itlb;
-
-    PerCoreCacheCallbacks* callback;
-
-    CacheHierarchy(): lfrq(*this), missbuf(*this) { callback = null; }
-
-    bool probe_cache_and_sfr(W64 addr, const SFR* sfra, int sizeshift);
-    bool covered_by_sfr(W64 addr, SFR* sfr, int sizeshift);
-    void annul_lfrq_slot(int lfrqslot);
-    int issueload_slowpath(Waddr physaddr, SFR& sfra, LoadStoreInfo lsi);
-    bool lfrq_or_missbuf_full() const { return lfrq.full() | missbuf.full(); }
-
-    W64 commitstore(const SFR& sfr, int threadid = 0xff, bool perform_actual_write = true);
-    W64 speculative_store(const SFR& sfr, int threadid = 0xff);
-
-    void initiate_prefetch(W64 addr, int cachelevel);
-
-    bool probe_icache(Waddr virtaddr, Waddr physaddr);
-    int initiate_icache_miss(W64 addr, int rob = 0xffff, int threadid = 0xff);
-
-    void reset();
-    void clock();
-    void complete();
-    void complete(int threadid);
-    ostream& print(ostream& os);
-  };
-#endif // STATS_ONLY
-};
-
-struct PerContextDataCacheStats { // rootnode:
-  struct load {
-    struct hit { // node: summable
-      W64 L1;
-      W64 L2;
-      W64 L3;
-      W64 mem;
-    } hit;
-        
-    struct dtlb { // node: summable
-      W64 hits;
-      W64 misses;
-    } dtlb;
-
-    struct tlbwalk { // node: summable
-      W64 L1_dcache_hit;
-      W64 L1_dcache_miss;
-      W64 no_lfrq_mb;
-    } tlbwalk;
-  } load;
- 
-  struct fetch {
-    struct hit { // node: summable
-      W64 L1;
-      W64 L2;
-      W64 L3;
-      W64 mem;
-    } hit;
-    
-    struct itlb { // node: summable
-      W64 hits;
-      W64 misses;
-    } itlb;
-
-    struct tlbwalk { // node: summable
-      W64 L1_dcache_hit;
-      W64 L1_dcache_miss;
-      W64 no_lfrq_mb;      
-    } tlbwalk;
-  } fetch;
-  
-  struct store {
-    W64 prefetches;
-  } store;
-};
-
-struct DataCacheStats { // rootnode:
-  struct load {
-    struct transfer { // node: summable
-      W64 L2_to_L1_full;
-      W64 L2_to_L1_partial;
-      W64 L2_L1I_full;
-    } transfer;
-  } load;
-
-  struct missbuf {
-    W64 inserts;
-    struct deliver { // node: summable
-      W64 mem_to_L3;
-      W64 L3_to_L2;
-      W64 L2_to_L1D;
-      W64 L2_to_L1I;
-    } deliver;
-  } missbuf;
-
-  struct prefetch { // node: summable
-    W64 in_L1;
-    W64 in_L2;
-    W64 required;
-  } prefetch;
-
-  struct lfrq {
-    W64 inserts;
-    W64 wakeups;
-    W64 annuls;
-    W64 resets;
-    W64 total_latency;
-    double average_latency;
-    W64 width[CacheSubsystem::MAX_WAKEUPS_PER_CYCLE+1]; // histo: 0, CacheSubsystem::MAX_WAKEUPS_PER_CYCLE+1, 1
-  } lfrq;
-
-  PerContextDataCacheStats total;
-  PerContextDataCacheStats vcpu0;
-  PerContextDataCacheStats vcpu1;
-  PerContextDataCacheStats vcpu2;
-  PerContextDataCacheStats vcpu3;
-};
-
 #endif // _DCACHE_H_
diff -r 10448c053ad6 decode-asf.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/decode-asf.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,265 @@
+//
+// PTLsim: Cycle Accurate x86-64 Simulator
+// Decoder for ASF-Extensions to AMD64 instruction set.
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
+//
+
+#include <decode.h>
+#include <superstl.h>
+#include <ptlhwdef.h>
+
+// ASF - Testing!
+W64 asf_testcounter;
+
+/**
+ * Scans backwards through the TransOp-buffer to find the most recently emitted µop with the
+ * given opcode and flags it as having ASF side effects. This allows us to use PTLsim's standard
+ * infrastructure for generation of loads even for ASF's LOCKed loads and prefetches.
+ * @param opcode The opcode which is to be scanned for and should be flagged.
+ */
+void TraceDecoder::scan_transb_and_flag_asf(byte opcode) {
+  // Walk from the newest µop in transbuf back towards index 0.
+  int i = transbufcount - 1;
+  while ((i >= 0) && (transbuf[i].opcode != opcode)) i--;
+  // Not finding the µop is a decoder bug: decode_asf() emits the load right before calling us.
+  assert(i >= 0);
+  // Should assert() be compiled out, bail out instead of writing to transbuf[-1].
+  if (i < 0) return;
+
+  /* Flag the found µop as being ASF, let the core handle the associated special functionality */
+  transbuf[i].is_asf = 1;
+}
+
+static const byte sse_float_datatype_to_ptl_datatype[4] = {DATATYPE_FLOAT, DATATYPE_VEC_FLOAT, DATATYPE_DOUBLE, DATATYPE_VEC_DOUBLE};
+/**
+ * Decodes a single ASF instruction, or a LOCK-prefixed load/prefetch that gets flagged as ASF.
+ * @return true, when further decoding is possible, false, if not (every explicit return below is true)
+ */
+bool TraceDecoder::decode_asf() {
+  DecodedOperand ra;
+  DecodedOperand rd;
+  /* This is an ugly hack to check whether we run in an ASF enabled model ("asfooo" or "asfsmt"). */
+  bool isasfcore = (PTLsimMachine::getcurrent() == PTLsimMachine::getmachine("asfooo"))
+                   || (PTLsimMachine::getcurrent() == PTLsimMachine::getmachine("asfsmt"));
+
+  switch (op) {
+    case 0x1C7: {
+      /* ACQUIRE & COMMIT: the ModR/M reg field selects which one is decoded */
+      switch (modrm.reg) {
+        /* ACQUIRE + reg,imm(8/16/32?) -- only the imm8 form is decoded right now */
+        case 0x2: {
+          if (!isasfcore) MakeInvalid();
+          DECODE(eform, rd, v_mode);
+          /* imm8*/DECODE(iform64, ra, b_mode);
+          /* for imm16: DECODE(iform64, ra, w_mode);*/
+          /* for imm32: DECODE(iform64, ra, d_mode);*/
+          /* NOTE: we might run into problems with REX64 <-> OpSize-Prefix, when doing dynamic 16/32 bit switching */
+          EndOfDecode();
+          /* Decode into a new µop */
+          assert(rd.type == OPTYPE_REG);
+          assert(ra.type == OPTYPE_IMM);
+
+          int destreg   = arch_pseudo_reg_to_arch_reg[rd.reg.reg];
+          int sizeshift = reginfo[rd.reg.reg].sizeshift;
+          // SD: Add rsp as a dependency, so that acquire can save that value! (NOTE(review): sizeshift is unused; a literal 3 is passed below)
+          TransOp acq(OP_acq, destreg, REG_rsp, REG_imm, REG_zero, 3, ra.imm.imm, 0);
+          acq.is_asf = true; this << acq;
+          break;
+        }
+        /* COMMIT: emitted as an OP_com µop followed by an ASF-flagged LFENCE-type OP_mf µop */
+        case 0x3: {
+          if (!isasfcore) MakeInvalid();
+          EndOfDecode();
+          /* Decode into a new µop */
+          TransOp rel(OP_com, REG_temp0, REG_zero, REG_zero, REG_zero, 3);
+          rel.is_asf = true; this << rel;
+          /* this is a special ASF-LFENCE, it will only sort loads which
+             try to access data still protected by an ongoing ASF-CS. */
+          TransOp mf(OP_mf, REG_temp0, REG_zero, REG_zero, REG_zero, 0);
+          mf.extshift = MF_TYPE_LFENCE; mf.is_asf = true; this << mf;
+          break;
+        }
+        /* Extension: ACQUIRE, register form (the decoded register is both destination and source) */
+        case 0x4: {
+          if (!isasfcore) MakeInvalid();
+          DECODE(eform, rd, v_mode);
+          ra = rd;
+          EndOfDecode();
+          /* Decode into a new µop */
+          assert(rd.type == OPTYPE_REG);
+          assert(ra.type == OPTYPE_REG);
+
+          int destreg   = arch_pseudo_reg_to_arch_reg[rd.reg.reg];
+          int sizeshift = reginfo[rd.reg.reg].sizeshift;
+          // SD: Add rsp as a dependency, so that acquire can save that value! (NOTE(review): sizeshift is unused; a literal 3 is passed below)
+          TransOp acq(OP_acq, destreg, REG_rsp, destreg, REG_zero, 3);
+          acq.is_asf = true; this << acq;
+
+          break;
+        }
+
+        default: {
+          cerr << __FILE__,"@",__LINE__,": Unknown ModR/M (", modrm.reg, ") for opcode 0x1c7\n";
+          cerr.flush();
+          MakeInvalid();
+          break;
+        }
+        return true; // NOTE(review): unreachable -- it follows the default case, which ends in break
+      }
+      break;
+    }
+
+    case 0x8a:
+    case 0x8b: {
+      /* LOCKed MOV-loads: the LOCK prefix marks a plain mov load as an ASF load */
+      if (!isasfcore) MakeInvalid();
+      /* Remove the LOCK prefix, as it was just used to flag the ASFness of this load */
+      assert(prefixes & PFX_LOCK);
+      prefixes &= ~PFX_LOCK;
+
+      int bytemode = bit(op, 0) ? v_mode : b_mode;
+      switch (bit(op, 1)) {
+        case 0: DECODE(eform, rd, bytemode); DECODE(gform, ra, bytemode); break;
+        case 1: DECODE(gform, rd, bytemode); DECODE(eform, ra, bytemode); break;
+      } 
+      /* ASF permits only locked loads! A register source is therefore rejected. */
+      if (ra.type != OPTYPE_MEM) MakeInvalid();
+      EndOfDecode();
+
+      move_reg_or_mem(rd, ra);
+      /* search for and flag the ld-µop as ASF */
+      scan_transb_and_flag_asf(OP_ld);
+      break;
+    }
+    case 0xa0:
+    case 0xa1: {
+      // LOCK mov rAX,Ov (load from an absolute address given as immediate)
+      if (!isasfcore) MakeInvalid();
+      assert(prefixes & PFX_LOCK);
+      prefixes &= ~PFX_LOCK;
+      rd.gform_ext(*this, (op & 1) ? v_mode : b_mode, REG_rax);
+      DECODE(iform64, ra, (use64 ? q_mode : addrsize_prefix ? w_mode : d_mode));
+      EndOfDecode();
+
+      ra.mem.offset = ra.imm.imm;
+      ra.mem.offset = (use64) ? ra.mem.offset : lowbits(ra.mem.offset, (addrsize_prefix) ? 16 : 32);
+      ra.mem.basereg = APR_zero;
+      ra.mem.indexreg = APR_zero;
+      ra.mem.scale = APR_zero;
+      ra.mem.size = reginfo[rd.reg.reg].sizeshift;
+      ra.type = OPTYPE_MEM;
+      operand_load(REG_rax, ra);
+
+      /* search for and flag the ld-µop as ASF */
+      scan_transb_and_flag_asf(OP_ld);
+      break;
+    }
+
+    // LOCKed vector loads (NOTE: mostly copied and pasted from decode-sse.cpp)
+    case 0x56e: { // LOCK movd xmm,rm32/rm64
+      if (!isasfcore) MakeInvalid();
+      assert(prefixes & PFX_LOCK);
+      prefixes &= ~PFX_LOCK;
+      DECODE(gform, rd, x_mode);
+      DECODE(eform, ra, v_mode);
+      EndOfDecode();
+
+      int rdreg = arch_pseudo_reg_to_arch_reg[rd.reg.reg];
+      int datatype = sse_float_datatype_to_ptl_datatype[(op >> 8) - 2];
+      if (ra.type != OPTYPE_MEM) MakeInvalid();
+
+      // Load into the low half of the destination register pair
+      operand_load(rdreg+0, ra, OP_ld, datatype);
+      /* search for and flag the ld-µop as ASF */
+      scan_transb_and_flag_asf(OP_ld);
+      this << TransOp(OP_mov, rdreg+1, REG_zero, REG_zero, REG_zero, 3); // zero high 64 bits
+      break;
+    }
+
+    case 0x56f: // LOCK movdqa load
+    case 0x26f: { // LOCK movdqu load
+      if (!isasfcore) MakeInvalid();
+      assert(prefixes & PFX_LOCK);
+      prefixes &= ~PFX_LOCK;
+
+      DECODE(gform, rd, x_mode);
+      DECODE(eform, ra, x_mode);
+      EndOfDecode();
+
+      int rdreg = arch_pseudo_reg_to_arch_reg[rd.reg.reg];
+      int datatype = sse_float_datatype_to_ptl_datatype[(op >> 8) - 2];
+
+      if (ra.type != OPTYPE_MEM) MakeInvalid();
+
+      // Load (two loads: one at the operand's offset, one 8 bytes further)
+      // This is still idempotent since if the second one was unaligned, the first one must be too
+      operand_load(rdreg+0, ra, OP_ld, datatype);
+      /* Only the first load is flagged as ASF. TODO: What happens, when data lies on two cachelines? */
+      scan_transb_and_flag_asf(OP_ld);
+      ra.mem.offset += 8;
+      operand_load(rdreg+1, ra, OP_ld, datatype);
+      break;
+    }
+
+    case 0x27e: { // LOCK movq xmm,xmmlo|mem64 with zero extension
+      if (!isasfcore) MakeInvalid();
+      assert(prefixes & PFX_LOCK);
+      prefixes &= ~PFX_LOCK;
+
+      DECODE(gform, rd, x_mode);
+      DECODE(eform, ra, x_mode);
+      EndOfDecode();
+
+      int rdreg = arch_pseudo_reg_to_arch_reg[rd.reg.reg];
+      int datatype = sse_float_datatype_to_ptl_datatype[(op >> 8) - 2];
+
+      if (ra.type != OPTYPE_MEM) MakeInvalid();
+
+      // Load into the low half of the destination register pair
+      operand_load(rdreg+0, ra, OP_ld, datatype);
+      /* search for and flag the ld-µop as ASF */
+      scan_transb_and_flag_asf(OP_ld);
+      this << TransOp(OP_mov, rdreg+1, REG_zero, REG_zero, REG_zero, 3); // zero high 64 bits
+      break;
+    }
+
+    case 0x10d: {
+      // LOCK prefetchw [eform] (NOTE: this is an AMD-only insn from K6 onwards)
+      if (!isasfcore) MakeInvalid();
+      DECODE(eform, ra, b_mode);
+      EndOfDecode();
+
+      int level = 2;
+      assert(prefixes & PFX_LOCK);
+      prefixes &= ~PFX_LOCK;
+      /* to make the ACQUIRE dependent on the cache-miss, this has to be a load (OP_ld, with invalidating = true)! */
+      operand_load(REG_temp0, ra, OP_ld, DATATYPE_INT, level, true);
+      /* search for and flag the ld-µop as ASF */
+      scan_transb_and_flag_asf(OP_ld);
+      break;
+    }
+
+    default:
+      if (logable(3)) logfile << __FILE__,"@",__LINE__,": Unknown opcode ", hexstring(op, 32), " in ASF decoder\n";
+      MakeInvalid();
+      break;
+  }
+  return true;
+}
diff -r 10448c053ad6 decode-complex.cpp
--- a/decode-complex.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/decode-complex.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -9,6 +9,8 @@
 
 template <typename T> void assist_div(Context& ctx) {
   Waddr rax = ctx.commitarf[REG_rax]; Waddr rdx = ctx.commitarf[REG_rdx];
+  /*S.D.*/
+  //cerr << __FILE__,__LINE__,": Division of ",rdx,":",rax," by ",((T)ctx.commitarf[REG_ar1]),endl, flush; 
   asm("div %[divisor];" : "+a" (rax), "+d" (rdx) : [divisor] "q" ((T)ctx.commitarf[REG_ar1]));
   ctx.commitarf[REG_rax] = rax; ctx.commitarf[REG_rdx] = rdx;
   ctx.commitarf[REG_rip] = ctx.commitarf[REG_nextrip];
@@ -1672,12 +1674,15 @@
 
   case 0x10d: {
     // prefetchw [eform] (NOTE: this is an AMD-only insn from K6 onwards)
+    /* Let decode_asf handle the locked versions of prefetchw */
+    if (prefixes & PFX_LOCK) return false;
+
     DECODE(eform, ra, b_mode);
     EndOfDecode();
 
     int level = 2;
-    prefixes &= ~PFX_LOCK;
-    operand_load(REG_temp0, ra, OP_ld_pre, level);
+    assert(!(prefixes & PFX_LOCK));
+    operand_load(REG_temp0, ra, OP_ld_pre, DATATYPE_INT, level, true);
     break;
   }
 
@@ -2217,8 +2222,71 @@
     break;
   }
 
+  case 0x1c7: { // cmpxchg8b/cmpxchg16b
+    DECODE(eform, rd, (rex.mode64) ? q_mode : d_mode);
+    ra = rd;
+    if (modrm.reg != 1) 
+        /* Let decode_core invoke decode_asf, which will also mark as inv. */
+        return false;  
+
+    if (rd.type != OPTYPE_MEM) MakeInvalid();
+
+    int sizeincr = (rex.mode64) ? 8 : 4;
+    int sizeshift = (rex.mode64) ? 3 : 2;
+    EndOfDecode();
+
+    // cmpxchg8b/cmpxchg16b: always decoded as if LOCK-prefixed
+    prefixes |= PFX_LOCK;
+    if (memory_fence_if_locked(0)) break;
+   
+    /*
+
+    Microcode:
+
+    ld   t0 = [mem]
+    ld   t1 = [mem+8]
+    sub  t2 = t0,rax
+    sub  t3 = t1,rdx
+    andcc t7,flags = t2,t3
+    sel.eq t2 = t0,rbx,(t7)
+    sel.eq t3 = t1,rcx,(t7)
+    sel.eq rax = t0,rax,(t7)
+    sel.eq rdx = t1,rdx,(t7)
+    st   [mem],t2
+    st   [mem+8],t3
+  
+    */
+
+    operand_load(REG_temp0, ra, OP_ld);
+    ra.mem.offset += sizeincr;
+    operand_load(REG_temp1, ra, OP_ld);
+
+    TransOp sublo(OP_sub, REG_temp2, REG_temp0, REG_rax, REG_zero, sizeshift,
+      0, 0, FLAGS_DEFAULT_ALU); sublo.nouserflags = 1; this << sublo;
+    TransOp subhi(OP_sub, REG_temp3, REG_temp1, REG_rdx, REG_zero, sizeshift,
+      0, 0, FLAGS_DEFAULT_ALU); subhi.nouserflags = 1; this << subhi;
+    this << TransOp(OP_andcc, REG_temp7, REG_temp2, REG_temp3, REG_zero,
+      sizeshift, 0, 0, FLAGS_DEFAULT_ALU);
+    { TransOp sel(OP_sel, REG_temp2, REG_temp0, REG_rbx, REG_temp7, sizeshift);
+      sel.cond = COND_e; this << sel; }
+    { TransOp sel(OP_sel, REG_temp3, REG_temp1, REG_rcx, REG_temp7, sizeshift);
+      sel.cond = COND_e; this << sel; }
+    { TransOp sel(OP_sel, REG_rax, REG_temp0, REG_rax, REG_temp7, sizeshift);
+      sel.cond = COND_e; this << sel; }
+    { TransOp sel(OP_sel, REG_rdx, REG_temp1, REG_rdx, REG_temp7, sizeshift);
+      sel.cond = COND_e; this << sel; }
+    result_store(REG_temp2, REG_temp4, rd);
+    rd.mem.offset += sizeincr;
+    result_store(REG_temp3, REG_temp5, rd);
+
+    if (memory_fence_if_locked(1)) break;
+
+    break;
+  }
+
   default: {
-    MakeInvalid();
+    //S.D. Give the ASF-Decoder a chance to run first! MakeInvalid();
+    return false;
     break;
   }
   }
diff -r 10448c053ad6 decode-core.cpp
--- a/decode-core.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/decode-core.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -3,6 +3,8 @@
 // Decoder for x86 and x86-64 to PTL transops
 //
 // Copyright 1999-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -887,7 +889,7 @@
   return basereg;
 }
 
-void TraceDecoder::address_generate_and_load_or_store(int destreg, int srcreg, const DecodedOperand& memref, int opcode, int datatype, int cachelevel, bool force_seg_bias) {
+void TraceDecoder::address_generate_and_load_or_store(int destreg, int srcreg, const DecodedOperand& memref, int opcode, int datatype, int cachelevel, bool force_seg_bias, bool invalidating) {
   //
   // In the address generation form used by internally generated
   // uops, we need the full virtual address, including the segment base
@@ -943,13 +945,16 @@
     int tempreg = (memop) ? REG_temp8 : destreg;
 
     abs_code_addr_immediate(REG_temp8, 3, Waddr(rip) + offset);
-    this << TransOp(OP_add, tempreg, REG_temp8, basereg, REG_zero, 3);
+    // SD: Skip unnecessary add for rip-relative addressing!
+    if ((tempreg != REG_temp8) || (basereg != REG_zero))
+      this << TransOp(OP_add, tempreg, REG_temp8, basereg, REG_zero, 3);
 
     if (memop) {
       TransOp ldst(opcode, destreg, tempreg, REG_imm, srcreg, memref.mem.size, 0);
       ldst.datatype = datatype;
       ldst.cachelevel = cachelevel;
       ldst.locked = locked;
+      ldst.invalidating = invalidating;
       this << ldst;
     }
   } else if (indexreg == REG_zero) {
@@ -965,6 +970,7 @@
     ldst.datatype = datatype;
     ldst.cachelevel = cachelevel;
     ldst.locked = locked;
+    ldst.invalidating = invalidating;
     this << ldst;
   } else if (offset == 0) {
     // [ra + rb*scale] or [rb*scale]
@@ -986,6 +992,7 @@
       ldst.datatype = datatype;
       ldst.cachelevel = cachelevel;
       ldst.locked = locked;
+      ldst.invalidating = invalidating;
       this << ldst;
     }
   } else {
@@ -1005,12 +1012,13 @@
     ldst.datatype = datatype;
     ldst.cachelevel = cachelevel;
     ldst.locked = locked;
+    ldst.invalidating = invalidating;
     this << ldst;
   }
 }
 
-void TraceDecoder::operand_load(int destreg, const DecodedOperand& memref, int opcode, int datatype, int cachelevel) {
-  address_generate_and_load_or_store(destreg, REG_zero, memref, opcode, datatype, cachelevel);
+void TraceDecoder::operand_load(int destreg, const DecodedOperand& memref, int opcode, int datatype, int cachelevel, bool invalidating) {
+  address_generate_and_load_or_store(destreg, REG_zero, memref, opcode, datatype, cachelevel, false, invalidating);
 }
 
 void TraceDecoder::result_store(int srcreg, int tempreg, const DecodedOperand& memref, int datatype) {
@@ -1886,6 +1894,10 @@
   }
   } // switch
 
+  /* check for possible ASF-Instructions */
+  bool isasf = ((rc == 0) & (!invalid));
+  if (isasf) rc = decode_asf();
+
   if (!rc) return rc;
 
   user_insn_count++;
diff -r 10448c053ad6 decode-fast.cpp
--- a/decode-fast.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/decode-fast.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -3,6 +3,8 @@
 // Decoder for simple x86 instructions
 //
 // Copyright 1999-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <decode.h>
@@ -216,6 +218,9 @@
 
   case 0x88 ... 0x8b: {
     // moves
+    /* Handle LOCKed loads with ASF! */
+    if (prefixes & PFX_LOCK) return false;
+
     int bytemode = bit(op, 0) ? v_mode : b_mode;
     switch (bit(op, 1)) {
     case 0: DECODE(eform, rd, bytemode); DECODE(gform, ra, bytemode); break;
@@ -830,7 +835,7 @@
     static const byte x86_prefetch_to_pt2x_cachelevel[8] = {2, 1, 2, 3};
     int level = x86_prefetch_to_pt2x_cachelevel[modrm.reg];
     prefixes &= ~PFX_LOCK;
-    operand_load(REG_temp0, ra, OP_ld_pre, level);
+    operand_load(REG_temp0, ra, OP_ld_pre, DATATYPE_INT, level);
     break;
   }
 
diff -r 10448c053ad6 decode-sse.cpp
--- a/decode-sse.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/decode-sse.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -3,6 +3,8 @@
 // Decoder for SSE/SSE2/SSE3/MMX and misc instructions
 //
 // Copyright 1999-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <decode.h>
@@ -14,7 +16,6 @@
   DecodedOperand ra;
 
   is_sse = 1;
-  prefixes &= ~PFX_LOCK;
 
   switch (op) {
     //
@@ -481,12 +482,15 @@
     break;
   }
 
+  case 0x56f: // movdqa load
+  case 0x26f: { // movdqu load
+    /* Let decode_asf handle LOCKed movdqa and movdqu */
+    if (prefixes & PFX_LOCK) return false;
   case 0x328: // movaps load 
   case 0x528: // movapd load
   case 0x310: // movups load
   case 0x510: // movupd load
-  case 0x56f: // movdqa load
-  case 0x26f: { // movdqu load
+
     DECODE(gform, rd, x_mode);
     DECODE(eform, ra, x_mode);
     EndOfDecode();
@@ -953,6 +957,9 @@
     */
 
   case 0x56e: { // movd xmm,rm32/rm64
+    /* Let decode_asf handle the locked versions of movd xmm,rm32/rm64 */
+    if (prefixes & PFX_LOCK) return false;
+
     DECODE(gform, rd, x_mode);
     DECODE(eform, ra, v_mode);
     EndOfDecode();
@@ -988,6 +995,9 @@
   }
 
   case 0x27e: { // movq xmm,xmmlo|mem64 with zero extension
+    /* Let decode_asf handle the locked versions of movq xmm,xmmlo|mem64 */
+    if (prefixes & PFX_LOCK) return false;
+
     DECODE(gform, rd, x_mode);
     DECODE(eform, ra, x_mode);
     EndOfDecode();
diff -r 10448c053ad6 decode.h
--- a/decode.h	Thu May 31 15:36:20 2007 +0200
+++ b/decode.h	Wed Nov 05 14:15:51 2008 +0100
@@ -4,6 +4,8 @@
 // Decoder for x86 and x86-64 to PTL uops
 //
 // Copyright 1999-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #ifndef _DECODE_H_
@@ -200,8 +202,8 @@
   void immediate(int rdreg, int sizeshift, W64s imm, bool issigned = true);
   void abs_code_addr_immediate(int rdreg, int sizeshift, W64 imm);
   int bias_by_segreg(int basereg);
-  void address_generate_and_load_or_store(int destreg, int srcreg, const DecodedOperand& memref, int opcode, int datatype = DATATYPE_INT, int cachelevel = 0, bool force_seg_bias = false);
-  void operand_load(int destreg, const DecodedOperand& memref, int loadop = OP_ld, int datatype = 0, int cachelevel = 0);
+  void address_generate_and_load_or_store(int destreg, int srcreg, const DecodedOperand& memref, int opcode, int datatype = DATATYPE_INT, int cachelevel = 0, bool force_seg_bias = false, bool invalidating = false);
+  void operand_load(int destreg, const DecodedOperand& memref, int loadop = OP_ld, int datatype = 0, int cachelevel = 0, bool invalidating = false);
   void result_store(int srcreg, int tempreg, const DecodedOperand& memref, int datatype = 0);
   void alu_reg_or_mem(int opcode, const DecodedOperand& rd, const DecodedOperand& ra, W32 setflags, int rcreg, 
                       bool flagsonly = false, bool isnegop = false, bool ra_rb_imm_form = false, W64s ra_rb_imm_form_rbimm = 0);
@@ -226,6 +228,9 @@
   bool decode_complex();
   bool decode_sse();
   bool decode_x87();
+  bool decode_asf();
+
+  void scan_transb_and_flag_asf(byte opcode);
 
   typedef int rep_and_size_to_assist_t[3][4];
 
diff -r 10448c053ad6 globals.h
--- a/globals.h	Thu May 31 15:36:20 2007 +0200
+++ b/globals.h	Wed Nov 05 14:15:51 2008 +0100
@@ -495,6 +495,16 @@
 #include <mathlib.h>
 #include <klibc.h>
 
+#ifdef PAGE_SHIFT
+#undef PAGE_SHIFT
+#endif
+#ifdef PAGE_SIZE
+#undef PAGE_SIZE
+#endif
+// We're on x86 or x86-64, so pages are always 4096 bytes:
+#define PAGE_SHIFT (12)
+#define PAGE_SIZE (1 << (PAGE_SHIFT))
+
 // e.g., head (a, b, c) => a
 // e.g., if list = (a, b, c), head list => a
 //#define head(h, ...) (h)
diff -r 10448c053ad6 kernel.cpp
--- a/kernel.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/kernel.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -3,6 +3,8 @@
 // Linux Kernel Interface
 //
 // Copyright 2000-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -797,10 +799,13 @@
   }
 } __attribute__((packed));
 
+#ifdef __x86_64__
+extern "C" void inside_sim_escape_code_template_64bit();
+extern "C" void inside_sim_escape_code_template_64bit_end();
+#else
 extern "C" void inside_sim_escape_code_template_32bit();
 extern "C" void inside_sim_escape_code_template_32bit_end();
-extern "C" void inside_sim_escape_code_template_64bit();
-extern "C" void inside_sim_escape_code_template_64bit_end();
+#endif
 
 struct InsideSimEscapeCode { 
   byte bytes[64];
@@ -808,13 +813,16 @@
   void prep() {
     byte* src;
     int length;
-    if (ctx.use64) {
-      src = (byte*)&inside_sim_escape_code_template_64bit;
-      length = ((byte*)&inside_sim_escape_code_template_64bit_end) - src;
-    } else {
-      src = (byte*)&inside_sim_escape_code_template_32bit;
-      length = ((byte*)&inside_sim_escape_code_template_32bit_end) - src;
-    }
+    // Make sure PTLsim build type matches target process type:
+#ifdef __x86_64__
+    assert(ctx.use64);
+    src = (byte*)&inside_sim_escape_code_template_64bit;
+    length = ((byte*)&inside_sim_escape_code_template_64bit_end) - src;
+#else
+    assert(!ctx.use64);
+    src = (byte*)&inside_sim_escape_code_template_32bit;
+    length = ((byte*)&inside_sim_escape_code_template_32bit_end) - src;
+#endif
     assert(length <= lengthof(bytes));
     memcpy(&bytes, src, length);
   }
@@ -1098,6 +1106,22 @@
 #endif
     break;
   }
+
+// This should be done when using a hypervisor...
+#if (0)
+  case __NR_futex: {
+    // S.D.: Mask the sys_futex / futex_wait to immediately return zero. This turns any futex based
+    // lock into a spinlock, which should not make any difference in terms of correctness. It seems
+    // like the split core model has some problems regarding futex wakeup. This is (hopefully) a quick
+    // hack.
+    if ((arg2 & 127) == 0 /* FUTEX_WAIT*/) {
+      logfile << "handle_syscall: sys_futex/FUTEX_WAIT on futex ", (void*)arg1, " masked!",endl;
+      ctx.commitarf[REG_rax] = 0;
+    } else 
+      ctx.commitarf[REG_rax] = do_syscall_64bit(syscallid, arg1, arg2, arg3, arg4, arg5, arg6);
+    break;
+  }
+#endif
   default:
     ctx.commitarf[REG_rax] = do_syscall_64bit(syscallid, arg1, arg2, arg3, arg4, arg5, arg6);
     break;
@@ -1480,8 +1504,11 @@
   return LO32(data);
 }
 
+#ifdef __x86_64__
 extern "C" void ptlsim_loader_thunk_64bit(LoaderInfo* info);
+#else
 extern "C" void ptlsim_loader_thunk_32bit(LoaderInfo* info);
+#endif
 
 int is_elf_64bit(const char* filename) {
   idstream is;
@@ -1564,7 +1591,19 @@
 
   regs.rsp -= sizeof(LoaderInfo);
 
-  void* thunk_source = (void*)(x86_64_mode ? &ptlsim_loader_thunk_64bit : &ptlsim_loader_thunk_32bit);
+#ifdef __x86_64__
+  if (!x86_64_mode) {
+    cerr << "ptlsim: Error: This is a 64-bit build of PTLsim. It cannot run 32-bit processes.", endl;
+    assert(false);
+  }
+  void* thunk_source = (void*)&ptlsim_loader_thunk_64bit;
+#else
+  if (x86_64_mode) {
+    cerr << "ptlsim: Error: This is a 32-bit build of PTLsim. It cannot run 64-bit processes.", endl;
+    assert(false);
+  }
+  void* thunk_source = (void*)&ptlsim_loader_thunk_32bit;
+#endif
   int thunk_size = LOADER_THUNK_SIZE;
 
   if (DEBUG) cerr << "Saving old code (", thunk_size, " bytes) at thunk rip ", (void*)regs.rip, " in pid ", pid, endl;
@@ -1845,6 +1884,7 @@
     break;
   }
   default:
+    /*S.D.*/ cerr << "Received signal ", si->si_signo," ignoring it!", endl, flush;
     if (logfile) logfile << "Warning: unknown signal ", si->si_signo, "; ignoring", endl, flush; break;
   }
 }
@@ -1860,6 +1900,13 @@
   sa.sa_sigaction = external_signal_callback;
   sa.sa_flags = SA_SIGINFO;
   assert(sys_rt_sigaction(SIGXCPU, &sa, NULL, sizeof(W64)) == 0);
+
+  /*S.D.: try to fetch all signals within PTLsim! (so far only SIGUSR1 is added here)*/
+  memset(&sa, 0, sizeof sa);
+  sa.sa_sigaction = external_signal_callback;
+  sa.sa_flags = SA_SIGINFO;
+  assert(sys_rt_sigaction(SIGUSR1, &sa, NULL, sizeof(W64)) == 0);
+
 }
 
 bool check_for_async_sim_break() {
diff -r 10448c053ad6 klibc.cpp
--- a/klibc.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/klibc.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -2053,3 +2053,4 @@
     ctor_list[i]();
   }
 }
+
diff -r 10448c053ad6 klibc.h
--- a/klibc.h	Thu May 31 15:36:20 2007 +0200
+++ b/klibc.h	Wed Nov 05 14:15:51 2008 +0100
@@ -54,5 +54,6 @@
 //
 
 void call_global_constuctors();
-
+#undef RAND_MAX
+#define RAND_MAX (32767)
 #endif // _BASELIBC_H
diff -r 10448c053ad6 ooocore.h
--- a/ooocore.h	Thu May 31 15:36:20 2007 +0200
+++ b/ooocore.h	Wed Nov 05 14:15:51 2008 +0100
@@ -4,6 +4,8 @@
 // Out-of-Order Core Simulator Configuration
 //
 // Copyright 2003-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 // With these disabled, simulation is faster
@@ -624,6 +626,7 @@
     bool handle_common_load_store_exceptions(LoadStoreQueueEntry& state, Waddr& origaddr, Waddr& addr, int& exception, PageFaultErrorCode& pfec);
     int issuestore(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, bool rcready, PTEUpdate& pteupdate);
     int issueload(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate);
+    void issueprefetch(IssueState& state, W64 ra, W64 rb, W64 rc, int cachelevel);
     void release();
     W64 annul(bool keep_misspec_uop, bool return_first_annulled_rip = false);
     W64 annul_after() { return annul(true); }
diff -r 10448c053ad6 oooexec.cpp
--- a/oooexec.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/oooexec.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -4,6 +4,8 @@
 // Execution Pipeline Stages: Scheduling, Execution, Broadcast
 //
 // Copyright 2003-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -256,6 +258,7 @@
   bool ld = isload(uop.opcode);
   bool st = isstore(uop.opcode);
   bool br = isbranch(uop.opcode);
+  bool pf = isprefetch(uop.opcode);
 
   assert(operands[RA]->ready());
   if likely (uop.rb != REG_imm) assert(rb.ready());
@@ -309,6 +312,8 @@
         stats.ooocore.issue.result.replay++;
         return 0;
       }
+    } else if unlikely (pf) {
+      issueprefetch(state, radata, rbdata, rcdata, uop.cachelevel);
     } else {
       if unlikely (br) {
         state.brreg.riptaken = uop.riptaken;
@@ -1015,6 +1020,24 @@
   state.data = data;
   state.invalid = 0;
   state.bytemask = 0xff;
+
+  // Internal loads don't hit the cache hierarchy, but rather complete after LOADLAT cycles.
+  if unlikely (uop.internal) {
+    cycles_left = LOADLAT;
+
+    if unlikely (config.event_log_enabled) core.eventlog.add_load_store(EVENT_LOAD_HIT, this, sfra, addr);
+
+    load_store_second_phase = 1;
+    state.datavalid = 1;
+    physreg->flags &= ~FLAG_WAIT;
+    physreg->complete();
+    changestate(core.rob_issued_list[cluster]);
+    lfrqslot = -1;
+    forward_cycle = 0;
+
+    return ISSUE_COMPLETED;
+  }
+
   bool L1hit = (config.perfect_cache) ? 1 : core.caches.probe_cache_and_sfr(addr, sfra, sizeshift);
 
   if likely (L1hit) {    
@@ -1050,7 +1073,7 @@
   // NOTE: this state is not really used anywhere since load misses
   // will fill directly into the physical register instead.
   //
-  lfrqslot = core.caches.issueload_slowpath(addr, *sfra, lsi);
+  lfrqslot = core.caches.issueload_slowpath(addr, (W64)origvirt, *sfra, lsi);
 
   if unlikely (config.event_log_enabled) event = core.eventlog.add_load_store(EVENT_LOAD_MISS, this, sfra, addr);
 
@@ -1068,6 +1091,37 @@
   core.load_to_store_parallel_forwarding_buffer[core.loads_in_this_cycle++] = floor(addr, 8);
 
   return ISSUE_COMPLETED;
+}
+
+void ReorderBufferEntry::issueprefetch(IssueState& state, W64 ra, W64 rb, W64 rc, int cachelevel) {
+  // Issue a prefetch uop: generate its address and pass it to the cache hierarchy.
+  OutOfOrderCore& core = getcore();
+  state.reg.rddata = 0;
+  state.reg.rdflags = 0;
+
+  int exception = 0;
+  Waddr addr;
+  Waddr origaddr;
+  PTEUpdate pteupdate;
+  PageFaultErrorCode pfec;
+  bool annul = false;
+
+  // addrgen() takes an LSQ entry; pass a zeroed scratch one.
+  LoadStoreQueueEntry dummy;
+  setzero(dummy);
+  void* mapped = addrgen(dummy, origaddr, ra, rb, rc, pteupdate,
+                         addr, exception, pfec, annul);
+
+  // Ignore bogus prefetches:
+  if unlikely (exception) return;
+
+  // Ignore unaligned prefetches (should never happen)
+  if unlikely (annul) return;
+
+  // (Stats are already updated by initiate_prefetch())
+  // NOTE(review): origaddr from addrgen() is never read; origvirt is passed below - confirm.
+  Waddr physaddr = Waddr(mapped_virt_to_phys(mapped)); // annul is known to be false here
+  core.caches.initiate_prefetch(physaddr, origvirt, cachelevel);
 }
 
 //
diff -r 10448c053ad6 ooopipe.cpp
--- a/ooopipe.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/ooopipe.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -1573,7 +1573,7 @@
   if unlikely (st) {
     Waddr mfn = (lsq->physaddr << 3) >> 12;
     smc_setdirty(mfn);
-    if (lsq->bytemask) assert(core.caches.commitstore(*lsq) == 0);
+    if (lsq->bytemask) assert(core.caches.commitstore(*lsq, (W64)origvirt, uop.internal) == 0);
   }
 
   if unlikely (pteupdate) {
diff -r 10448c053ad6 perfctrs.cpp
--- a/perfctrs.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/perfctrs.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -3,6 +3,8 @@
 // Performance counters
 //
 // Copyright 1999-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -152,11 +154,21 @@
   AMD_UOPS_RETIRED                            = 0xc1,
   AMD_L1D_ACCESSES                            = 0x40,
   AMD_L1D_MISSES                              = 0x41, // only counts first miss on an outstanding line
+  AMD_L1I_ACCESSES                            = 0x80, // instruction cache fetches
+  AMD_L1I_MISSES                              = 0x81, // instruction cache misses (was 0x80, same as accesses)
   AMD_BR_INST_RETIRED                         = 0xc2,
   AMD_BR_INST_MISPRED                         = 0xc3,
   AMD_L1_DTLB_MISS_AND_L2_DTLB_HIT            = 0x45, // relative to L1D_ACCESSES
   AMD_L1_DTLB_AND_L2_DTLB_MISS                = 0x46, // relative to L1D_ACCESSES
-  AMD_L2_CACHE_MISS                           = 0x7e  // all misses through L2 (including page walk traffic)
+  AMD_L2_CACHE_MISS                           = 0x7e,  // all misses through L2 (including page walk traffic)
+  AMD_FETCH_STALL_TOTAL                       = 0x87, // may overlap with the stalls below
+  AMD_RET_STACk_HITS                          = 0x88, // speculative, _DO NOT_ compare with retired branch ops!
+  AMD_DECODER_EMPTY                           = 0xD0,
+  AMD_DISPATCH_STALL_TOTAL                    = 0xD1,
+  AMD_DISPATCH_STALL_BR_MISPRED               = 0xD2,
+  AMD_DISPATCH_STALL_ROB_FULL                 = 0xD5,
+  AMD_DISPATCH_STALL_ISSUEQ_FULL              = 0xD6, // reservation station full (was 0xD5, same as ROB full)
+  AMD_DISPATCH_STALL_LS_FULL                  = 0xD8
 };
 
 CounterPairConfig amd_k8_counter_config[GENERIC_PERFCTR_COUNT] = {
@@ -169,8 +181,8 @@
     {AMD_L1D_ACCESSES, 0} // total accesses (including speculative)
   }},
   {{ // GENERIC_PERFCTR_L1I_MISS_RATE
-    {0, 0}, //++MTY TODO
-    {0, 0},
+    {AMD_L1I_MISSES, 0}, // total L1I misses (including speculative)
+    {AMD_L1I_ACCESSES, 0} // total L1I accesses (including speculative)
   }},
   {{ // GENERIC_PERFCTR_DTLB_MISS_RATE
     {AMD_L1_DTLB_AND_L2_DTLB_MISS, 0}, // or AMD_L1_DTLB_MISS_AND_L2_DTLB_HIT
diff -r 10448c053ad6 ptlhwdef.cpp
--- a/ptlhwdef.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/ptlhwdef.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -3,6 +3,8 @@
 // Hardware Definitions
 //
 // Copyright 1999-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <ptlsim.h>
@@ -150,6 +152,8 @@
   {"cvtf.d2s.p",     OPCLASS_FP_CONVERTFP,  opAB }, // pair of doubles in <ra> (high), <rb> (low) to pair of singles in <rd> (for cvtpd2ps)
   {"cvtf.s2d.lo",    OPCLASS_FP_CONVERTFP,  opB  }, // low single in <rb> to double in <rd> (for cvtps2pd, part 1, cvtss2sd)
   {"cvtf.s2d.hi",    OPCLASS_FP_CONVERTFP,  opB  }, // high single in <rb> to double in <rd> (for cvtps2pd, part 2)
+  {"asf.acq",        OPCLASS_LOGIC,         opB  },
+  {"asf.com",        OPCLASS_LOGIC,         opNOSIZE },
 };
 
 const char* exception_names[EXCEPTION_COUNT] = {
@@ -169,6 +173,7 @@
   "LFRQFull",
   "Float",
   "FloatNotAvail",
+  "ASFTesting",
 };
 
 const char* x86_exception_names[256] = {
@@ -388,6 +393,7 @@
 
   if ((ld|st) && (op.cachelevel > 0)) sbname << ".L", (char)('1' + op.cachelevel);
   if ((ld|st) && (op.locked)) sbname << ((ld) ? ".acq" : ".rel");
+  if (op.invalidating) sbname << ".inv";
   if (op.internal) sbname << ".p";
   if (op.eom) sbname << ".", (op.any_flags_in_insn ? "+" : "-");
 
@@ -539,7 +545,7 @@
   return (char*)temp;
 }
 
-stringbuf& nameof(stringbuf& sbname, const TransOp& uop) {
+stringbuf& nameof(stringbuf& sbname, const TransOpBase& uop) {
   static const char* size_names[4] = {"b", "w", "d", ""};
   static const char* fptype_names[4] = {"ss", "ps", "sd", "pd"};
   static const char* mask_exttype[4] = {"", "zxt", "sxt", "???"};
diff -r 10448c053ad6 ptlhwdef.h
--- a/ptlhwdef.h	Thu May 31 15:36:20 2007 +0200
+++ b/ptlhwdef.h	Wed Nov 05 14:15:51 2008 +0100
@@ -4,6 +4,8 @@
 // Hardware Definitions
 //
 // Copyright 1999-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #ifndef _PTLHWDEF_H
@@ -182,6 +184,7 @@
   EXCEPTION_LFRQFull,
   EXCEPTION_FloatingPoint,
   EXCEPTION_FloatingPointNotAvailable,
+  EXCEPTION_ASF_Testing, /*S.D.*/
   EXCEPTION_COUNT
 };
 
@@ -190,8 +193,10 @@
 static const int MAX_BB_UOPS = 63;
 static const int MAX_BB_PER_PAGE = 4096;
 
-static const int MAX_TRANSOPS_PER_USER_INSN = 16;
-
+/* BLAME S.D.!
+ *static const int MAX_TRANSOPS_PER_USER_INSN = 16;
+ */
+static const int MAX_TRANSOPS_PER_USER_INSN = 32;
 extern const char* exception_names[EXCEPTION_COUNT];
 
 static inline const char* exception_name(W64 exception) {
@@ -1108,6 +1113,9 @@
   OP_cvtf_d2s_p,
   OP_cvtf_s2d_lo,
   OP_cvtf_s2d_hi,
+  // ASF
+  OP_acq,
+  OP_com,
   OP_MAX_OPCODE,
 };
 
@@ -1147,6 +1155,7 @@
 inline int opclassof(int opcode) { return lsbindex(opinfo[opcode].opclass); }
 
 inline bool isload(int opcode) { return isclass(opcode, OPCLASS_LOAD); }
+inline bool isprefetch(int opcode) { return isclass(opcode, OPCLASS_PREFETCH); }
 inline bool isstore(int opcode) { return isclass(opcode, OPCLASS_STORE); }
 inline bool iscondbranch(int opcode) { return isclass(opcode, OPCLASS_COND_BRANCH|OPCLASS_INDIR_BRANCH); }
 inline bool isbranch(int opcode) { return isclass(opcode, OPCLASS_BRANCH); }
@@ -1266,7 +1275,7 @@
   // Index in basic block
   byte bbindex;
   // Misc info (terminal writer of targets in this insn, etc)
-  byte final_insn_in_bb:1, final_arch_in_insn:1, final_flags_in_insn:1, any_flags_in_insn:1, pad:4;
+  byte final_insn_in_bb:1, final_arch_in_insn:1, final_flags_in_insn:1, any_flags_in_insn:1, is_asf:1, invalidating: 1, pad:2;
   // Immediates
   W64s rbimm;
   W64s rcimm;
@@ -1450,7 +1459,7 @@
 //
 // Printing and information
 //
-stringbuf& nameof(stringbuf& sbname, const TransOp& uop);
+stringbuf& nameof(stringbuf& sbname, const TransOpBase& uop);
 
 char* regname(int r);
 
diff -r 10448c053ad6 ptlmon.cpp
--- a/ptlmon.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/ptlmon.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -1836,7 +1836,7 @@
   assert(sizeof(Context) == PAGE_SIZE);
 
   // 32 MB default:
-  W64 ptlsim_reserved_mb = 32;
+  W64 ptlsim_reserved_mb = 128; // default is now 128 MB (the comment above still says 32 MB)
   const char* domain_name = null;
 
   foreach (i, argc) {
diff -r 10448c053ad6 ptlsim.h
--- a/ptlsim.h	Thu May 31 15:36:20 2007 +0200
+++ b/ptlsim.h	Wed Nov 05 14:15:51 2008 +0100
@@ -19,6 +19,10 @@
 #include <ptlhwdef.h>
 #include <config.h>
 #include <datastore.h>
+
+//#define CORE_GENERIC
+//#define CORE_AMD_K8
+#define CORE_AMD_BARCELONA_ASF
 
 extern W64 sim_cycle;
 extern W64 total_uops_committed;
diff -r 10448c053ad6 ptlxen.cpp
--- a/ptlxen.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/ptlxen.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -3,6 +3,8 @@
 // Toplevel control and kernel interface to Xen inside the user domain
 //
 // Copyright 1999-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -1489,7 +1491,7 @@
   }
 
   default:
-    if (debug) logfile << "Cannot handle hypercall ", hypercallid, "!", endl, flush;
+    /*if (debug)*/ logfile << "Cannot handle hypercall ", hypercallid, "!", endl, flush;
     assert(false);
   }
 
@@ -1876,6 +1878,13 @@
   // This only works when the guest OS is Linux and the program is 64 bit.
   //
   if (ctx.use64) {
+    W64 arg1 = ctx.commitarf[REG_rdi];
+    W64 arg2 = ctx.commitarf[REG_rsi];
+    W64 arg3 = ctx.commitarf[REG_rdx];
+    W64 arg4 = ctx.commitarf[REG_r10];
+    W64 arg5 = ctx.commitarf[REG_r8];
+    W64 arg6 = ctx.commitarf[REG_r9];
+
     switch (ctx.commitarf[REG_rax]) {
     case __NR_execve: {
       char filename[256];
@@ -1883,6 +1892,48 @@
       assert(inrange(n, 0, int(sizeof(filename)-1)));
       filename[n] = 0;
       logfile << "syscall: execve('", filename, "', ...)", endl;
+      break;
+    }
+    case __NR_futex: {
+      // Log futex syscalls: operation name, futex word, value, timeout and (at level 5) the stack.
+      if likely (!logable(1)) break;
+      W64 stack[64];
+      int n = ctx.copy_from_user(stack, ctx.commitarf[REG_rsp], sizeof(stack));
+      static const char* const futex_names[] = {
+        "FUTEX_WAIT",
+        "FUTEX_WAKE",
+        "FUTEX_FD",
+        "FUTEX_REQUEUE",
+        "FUTEX_CMP_REQUEUE",
+        "FUTEX_WAKE_OP",
+        "FUTEX_LOCK_PI",
+        "FUTEX_UNLOCK_PI",
+        "FUTEX_TRYLOCK_PI" 
+      };
+      // Zero-initialise so that a failed user copy logs zeros instead of uninitialised stack data.
+      W32 futex = 0, val, op;
+      W64 time[2] = {0, 0};
+      ctx.copy_from_user(&futex, arg1, sizeof(futex));
+      op  = (W32)arg2;
+      val = (W32)arg3;
+      ctx.copy_from_user(&time, arg4, sizeof(time));
+      // The op number comes from the guest: never index past the end of the name table.
+      const char* opname = ((op & 127) < sizeof(futex_names) / sizeof(futex_names[0])) ? futex_names[op & 127] : "FUTEX_???";
+      W64 physaddr = mapped_virt_to_phys(pte_to_mapped_virt(arg1, ctx.virt_to_pte(arg1)));
+      logfile << "syscall: ", opname, (op & 128) ? " (private)":"",
+                 " futex @ ",(void*)arg1,"(",(void*)physaddr ,")= ", (void*)futex, " val: ", (void*)val,
+                 " time: ", time[0], "s", time[1], "ns"
+                 " stack:", endl;
+      if (logable(5))
+        for (int i = 0; i < n / (int)sizeof(stack[0]); i++) // signed bound: a negative n dumps nothing
+          logfile << "  ",hexstring (stack[i],64),endl;
+      break;
+    }
+    case __NR_write: {
+      if likely (!logable(5)) break; // only log write payloads at log level 5 and above
+      char message[512] = {0}; // zero-filled, so the logged text is always NUL-terminated
+      ctx.copy_from_user(message, arg2, (arg3 < sizeof(message)-1) ? arg3 : (W64)(sizeof(message)-1)); // at most the write's own length (arg3)
+      logfile << "sys_write to fd ", arg1, " message: ", message, endl;
       break;
     }
     }
diff -r 10448c053ad6 random_inject.cpp
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/random_inject.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,51 @@
+//
+// PTLsim: Cycle Accurate x86-64 Simulator
+// Random testing support
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
+//
+
+/**
+ * Simple statistic tools for random event injection.
+ * @author stephan.diestelhorst@{amd.com, inf.tu-dresden.de, gmail.com}
+ */
+#include <random_inject.h>
+#include <globals.h>
+long random_seed;
+
+/**
+ * Very primitive random function from K&R C book (S.D.: presumably public domain).
+ * The multiply is done in unsigned arithmetic so that wrap-around is well defined.
+ * @return Random number in [0, 32767].
+ */
+extern "C" int rand() 
+{
+  random_seed = (long)((unsigned long)random_seed * 1103515245UL + 12345UL);
+  return (unsigned int)(random_seed / 65536) % 32768; 
+}
+
+/**
+ * Seeds the random number generator used by the random testing suite.
+ * With a fixed seed, rand() produces the same sequence on every run.
+ * @param fixed If true, the seed is set to 0; otherwise it is taken from rdtsc().
+ */
+void init_random(bool fixed) {
+  // Conditional expression instead of if/else; both arms yield a long as before.
+  random_seed = fixed ? 0 : (long)rdtsc();
+}
diff -r 10448c053ad6 random_inject.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/random_inject.h	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,52 @@
+//
+// PTLsim: Cycle Accurate x86-64 Simulator
+// Random testing support
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
+//
+/**
+ * Simple statistic tools for random event injection.
+ * @author stephan.diestelhorst@{amd.com, inf.tu-dresden.de, gmail.com}
+ */
+#include <globals.h>
+#ifndef _RANDOM_INJECT_H
+#define _RANDOM_INJECT_H
+extern long random_seed;
+extern "C" int rand();
+void init_random(bool fixed);
+
+/**
+ * Defines a static predicate function which is randomly true in
+ * roughly share out of every 1000 calls (never, if share is 0).
+ * @param name Name of the predicate.
+ * @param share How often should the predicate be true? In parts-per-thousand; may be any expression.
+ * @return Truth-value of the random predicate.
+ */
+#define RANDOM_TRUE(name, share)\
+  static bool name(){\
+    return ((share) && (rand() < RAND_MAX*(long)(share)/1000L));\
+  }
+
+/* Injection of exceptions for testing */
+RANDOM_TRUE(asf_consistency_error,  0);
+RANDOM_TRUE(asf_load_exception,     0);
+RANDOM_TRUE(asf_prefetch_exception, 0);
+RANDOM_TRUE(asf_interrupt_critsec,  0);
+RANDOM_TRUE(asf_exception_critsec,  0);
+#endif
diff -r 10448c053ad6 smtcore-amd-barcelona-asf.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/smtcore-amd-barcelona-asf.h	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,2121 @@
+// -*- c++ -*-
+//
+// PTLsim: Cycle Accurate x86-64 Simulator
+// SMT Core Simulator Configuration for
+// AMD Fam. 0x10 (Barcelona) with Experimental AMD64 ASF Extension
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+// Copyright 2003-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2006 Hui Zeng <hzeng@cs.binghamton.edu>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
+//
+
+#include <random_inject.h>
+
+//
+// Enable SMT operation:
+//
+// Note that this limits some configurations of resources and
+// issue queues that would normally be possible in single
+// threaded mode.
+//
+
+//#define ENABLE_SMT
+
+static const int MAX_THREADS_BIT = 4; // up to 16 threads
+static const int MAX_ROB_IDX_BIT = 12; // up to 4096 ROB entries
+
+#ifdef ENABLE_SMT
+static const int MAX_THREADS_PER_CORE = 4;
+#else
+static const int MAX_THREADS_PER_CORE = 1;
+#endif
+
+//#define ENABLE_SIM_TIMING
+#ifdef ENABLE_SIM_TIMING
+#define time_this_scope(ct) CycleTimerScope ctscope(ct)
+#define start_timer(ct) ct.start()
+#define stop_timer(ct) ct.stop()
+#else
+#define time_this_scope(ct) (0)
+#define start_timer(ct) (0)
+#define stop_timer(ct) (0)
+#endif
+
+#define per_context_smtcore_stats_ref(vcpuid) (*(((PerContextSMTStats*)&stats.smtcore.vcpu0) + (vcpuid)))
+#define per_context_smtcore_stats_update(vcpuid, expr) stats.smtcore.total.expr, per_context_smtcore_stats_ref(vcpuid).expr
+
+namespace SMTModel {
+  //
+  // Operand formats
+  //
+  static const int MAX_OPERANDS = 4;
+  static const int RA = 0;
+  static const int RB = 1;
+  static const int RC = 2;
+  static const int RS = 3; // (for stores only)
+
+  // SD: In order to reduce the effect of clever dispatching decisions, I have
+  // implemented a separate LSU, which might more accurately model the actually
+  // separate LSU in K8, which is only driven by _three_ AGUs.
+//#define SEPARATE_LSU
+
+  //
+  // Uop to functional unit mappings
+  //
+  static const int FU_COUNT = 7 + 3;
+  static const int LOADLAT  = 3;
+
+  enum {
+    FU_ALU1       = (1 << 0),
+    FU_ALUC       = (1 << 1),
+    FU_ALU2       = (1 << 2),
+    FU_ALU3       = (1 << 3),
+    FU_FADD       = (1 << 4),
+    FU_FMUL       = (1 << 5),
+    FU_FCVT       = (1 << 6),
+    FU_LSU01      = (1 << 7), // SD: not used, see below
+    FU_LSU02      = (1 << 8),
+    FU_LSU03      = (1 << 9),
+  };
+
+  static const int LOAD_FU_COUNT = 2;
+
+  const char* fu_names[FU_COUNT] = {
+    "alu1",
+    "aluc",
+    "alu2",
+    "alu3",
+    "fadd",
+    "fmul",
+    "fcvt",
+    "lsu1",
+    "lsu2",
+    "lsu3",
+  };
+
+  //
+  // Opcodes and properties
+  //
+#define ALU1 FU_ALU1
+#define ALU2 FU_ALU2
+#define ALU3 FU_ALU3
+#define ALUC FU_ALUC
+#define LSU  FU_LSU01|FU_LSU02|FU_LSU03
+#define FADD FU_FADD
+#define FMUL FU_FMUL
+#define FCVT FU_FCVT
+#define A 1 // ALU latency, assuming fast bypass
+#define L LOADLAT
+
+  struct FunctionalUnitInfo {
+    byte opcode;   // Must match definition in ptlhwdef.h and ptlhwdef.cpp! 
+    byte latency;  // Latency in cycles, assuming ideal bypass
+    W32  fu;       // Map of functional units on which this uop can issue
+  };
+
+  //
+  // WARNING: This table MUST be kept in sync with the table
+  // in ptlhwdef.cpp and the uop enum in ptlhwdef.h!
+  //
+  const FunctionalUnitInfo fuinfo[OP_MAX_OPCODE] = {
+    // name, latency, fumask
+    {OP_nop,            1, ALU1|ALU2|ALU3|ALUC|LSU|FADD|FMUL|FCVT},
+    {OP_mov,            1, ALU1|ALU2|ALU3|FADD|FMUL},
+    // Logical
+    {OP_and,            1, ALU1|ALU2|ALU3|FADD|FMUL},
+    {OP_andnot,         1, ALU1|ALU2|ALU3|FADD|FMUL},
+    {OP_xor,            1, ALU1|ALU2|ALU3|FADD|FMUL},
+    {OP_or,             1, ALU1|ALU2|ALU3|FADD|FMUL},
+    {OP_nand,           1, ALU1|ALU2|ALU3|FADD|FMUL},
+    {OP_ornot,          1, ALU1|ALU2|ALU3|FADD|FMUL},
+    {OP_eqv,            1, ALU1|ALU2|ALU3|FADD|FMUL},
+    {OP_nor,            1, ALU1|ALU2|ALU3|FADD|FMUL},
+    // Mask, insert or extract bytes
+    {OP_maskb,          A, ALU1|ALU2|ALU3},
+    // Add and subtract
+    {OP_add,            1, ALU1|ALU2|ALU3},
+    {OP_sub,            1, ALU1|ALU2|ALU3},
+    {OP_adda,           1, ALU1|ALU2|ALU3},
+    {OP_suba,           1, ALU1|ALU2|ALU3},
+    {OP_addm,           1, ALU1|ALU2|ALU3},
+    {OP_subm,           1, ALU1|ALU2|ALU3},
+    // Condition code logical ops
+    {OP_andcc,          1, ALUC},
+    {OP_orcc,           1, ALUC},
+    {OP_xorcc,          1, ALUC},
+    {OP_ornotcc,        1, ALUC},
+    // Condition code movement and merging
+    {OP_movccr,         1, ALUC},
+    {OP_movrcc,         1, ALUC},
+    {OP_collcc,         1, ALUC},
+    // Simple shifting (restricted to small immediate 1..8)
+    {OP_shls,           1, ALU1|ALU2|ALU3},
+    {OP_shrs,           1, ALU1|ALU2|ALU3},
+    {OP_bswap,          1, ALU1|ALU2|ALU3},
+    {OP_sars,           1, ALU1|ALU2|ALU3},
+    // Bit testing
+    {OP_bt,             1, ALU1|ALU2|ALU3},
+    {OP_bts,            1, ALU1|ALU2|ALU3},
+    {OP_btr,            1, ALU1|ALU2|ALU3},
+    {OP_btc,            1, ALU1|ALU2|ALU3},
+    // Set and select
+    {OP_set,            1, ALU1|ALU2|ALU3},
+    {OP_set_sub,        1, ALU1|ALU2|ALU3},
+    {OP_set_and,        1, ALU1|ALU2|ALU3},
+    {OP_sel,            1, ALU1|ALU2|ALU3},
+    // Branches
+    {OP_br,             1, ALU1|ALU2|ALU3},
+    {OP_br_sub,         1, ALU1|ALU2|ALU3},
+    {OP_br_and,         1, ALU1|ALU2|ALU3},
+    {OP_jmp,            1, ALU1|ALU2|ALU3},
+    {OP_bru,            1, ALU1|ALU2|ALU3},
+    {OP_jmpp,           1, ALUC},
+    {OP_brp,            1, ALUC},
+    // Checks
+    {OP_chk,            1, ALU1|ALU2|ALU3},
+    {OP_chk_sub,        1, ALU1|ALU2|ALU3},
+    {OP_chk_and,        1, ALU1|ALU2|ALU3},
+    // Loads and stores
+    {OP_ld,             L, LSU},
+    {OP_ldx,            L, LSU},
+    {OP_ld_pre,         1, LSU     },
+    {OP_st,             1, LSU},
+    {OP_mf,             1, LSU     },
+    // Shifts, rotates and complex masking
+    {OP_shl,            1, ALU1|ALU2|ALU3},
+    {OP_shr,            1, ALU1|ALU2|ALU3},
+    {OP_mask,           1, ALU1|ALU2|ALU3},
+    {OP_sar,            1, ALU1|ALU2|ALU3},
+    {OP_rotl,           1, ALU1|ALU2|ALU3},  
+    {OP_rotr,           1, ALU1|ALU2|ALU3},   
+    {OP_rotcl,          1, ALU1|ALU2|ALU3},
+    {OP_rotcr,          1, ALU1|ALU2|ALU3},  
+    // Multiplication
+    {OP_mull,           4, ALUC},
+    {OP_mulh,           4, ALUC},
+    {OP_mulhu,          4, ALUC},
+    // Bit scans
+    {OP_ctz,            4, ALUC},
+    {OP_clz,            4, ALUC},
+    {OP_ctpop,          4, ALUC},  
+    {OP_permb,          2, ALUC|FCVT},
+    // Floating point
+    // uop.size bits have following meaning:
+    // 00 = single precision, scalar (preserve high 32 bits of ra)
+    // 01 = single precision, packed (two 32-bit floats)
+    // 1x = double precision, scalar or packed (use two uops to process 128-bit xmm)
+    {OP_addf,           4, FADD},
+    {OP_subf,           4, FADD},
+    {OP_mulf,           4, FMUL},
+    {OP_maddf,          5, FMUL},
+    {OP_msubf,          5, FMUL},
+    {OP_divf,          16, FMUL},
+    {OP_sqrtf,         19, FMUL},
+    {OP_rcpf,           4, FMUL},
+    {OP_rsqrtf,         4, FMUL},
+    {OP_minf,           3, FADD},
+    {OP_maxf,           3, FADD},
+    {OP_cmpf,           3, FADD},
+    // For fcmpcc, uop.size bits have following meaning:
+    // 00 = single precision ordered compare
+    // 01 = single precision unordered compare
+    // 10 = double precision ordered compare
+    // 11 = double precision unordered compare
+    {OP_cmpccf,         4, FADD},
+    // and/andn/or/xor are done using integer uops
+    {OP_permf,          2, FADD|FCVT}, // shuffles
+    // For these conversions, uop.size bits select truncation mode:
+    // x0 = normal IEEE-style rounding
+    // x1 = truncate to zero
+    {OP_cvtf_i2s_ins,   9, FCVT},
+    {OP_cvtf_i2s_p,     9, FCVT},
+    {OP_cvtf_i2d_lo,    9, FCVT},
+    {OP_cvtf_i2d_hi,    9, FCVT},
+    {OP_cvtf_q2s_ins,   9, FCVT},
+    {OP_cvtf_q2d,       9, FCVT},
+    {OP_cvtf_s2i,       6, FCVT},
+    {OP_cvtf_s2q,       6, FCVT},
+    {OP_cvtf_s2i_p,     6, FCVT},
+    {OP_cvtf_d2i,       6, FCVT},
+    {OP_cvtf_d2q,       6, FCVT},
+    {OP_cvtf_d2i_p,     6, FCVT},
+    {OP_cvtf_d2s_ins,   4, FCVT},
+    {OP_cvtf_d2s_p,     4, FCVT},
+    {OP_cvtf_s2d_lo,    4, FCVT},
+    {OP_cvtf_s2d_hi,    4, FCVT},
+    {OP_acq,            A, ALU1|ALU2|ALU3},
+    {OP_com,            A, ALU1|ALU2|ALU3},
+  };
+
+#undef A
+#undef L
+#undef F
+  // NOTE(review): this header defines ALU1, ALU2, ALU3, ALUC, LSU, FADD, FMUL, FCVT, A and L for the table above.
+#undef ALU0
+#undef ALU1
+#undef STU0
+#undef STU1
+#undef LDU0
+#undef LDU1
+#undef FPU0
+#undef FPU1
+#undef L
+  // Only A, L and ALU1 of those are undefined here; the others stay defined after this point - confirm intended.
+#undef ANYALU
+#undef ANYLDU
+#undef ANYSTU
+#undef ANYFPU
+#undef ANYINT
+  
+  //
+  // Global limits
+  //
+  
+  const int MAX_ISSUE_WIDTH = 6;
+
+  // Largest size of any physical register file or the store queue:
+  const int MAX_PHYS_REG_FILE_SIZE = 128;
+  const int PHYS_REG_FILE_SIZE = 128;
+  const int PHYS_REG_NULL = 0;
+
+  //
+  // IMPORTANT! If you change this to be greater than 256, you MUST
+  // #define BIG_ROB below to use the correct associative search logic
+  // (16-bit tags vs 8-bit tags).
+  //
+  // SMT always has BIG_ROB enabled: high 4 bits are used for thread id
+  //
+#define BIG_ROB
+
+  const int ROB_SIZE = 72;
+  
+  // Maximum number of branches in the pipeline at any given time
+  const int MAX_BRANCHES_IN_FLIGHT = 24;
+
+  // Set this to combine the integer and FP phys reg files:
+  // #define UNIFIED_INT_FP_PHYS_REG_FILE
+  
+#ifdef UNIFIED_INT_FP_PHYS_REG_FILE
+  // unified, br, st
+  const int PHYS_REG_FILE_COUNT = 3;
+#else
+  // int, fp, br, st
+  const int PHYS_REG_FILE_COUNT = 4;
+#endif
+  
+  //
+  // Load and Store Queues
+  //
+  const int LDQ_SIZE = 44;
+  const int STQ_SIZE = 44;
+
+  //
+  // Fetch
+  //
+  const int FETCH_QUEUE_SIZE = 36;
+  const int FETCH_WIDTH = 3; //This is actually 3 AMD64 ops...
+
+  //
+  // Frontend (Rename and Decode)
+  //
+  const int FRONTEND_WIDTH = 3;
+  const int FRONTEND_STAGES = 7;
+
+  //
+  // Dispatch
+  //
+  const int DISPATCH_WIDTH = 3;
+  // Use SD's experimental penalty-based dispatcher. Looks better, but not
+  // thoroughly tested! Blame him (as usual) if something fails!
+#define PENALTY_DISPATCHER
+  //
+  // Writeback
+  //
+  const int WRITEBACK_WIDTH = 3;
+
+  //
+  // Commit
+  //
+  const int COMMIT_WIDTH = 3;
+
+  //
+  // Clustering, Issue Queues and Bypass Network
+  //
+  const int MAX_FORWARDING_LATENCY = 2;
+
+#define MULTI_IQ
+
+#ifdef ENABLE_SMT
+#error AMD Family 0x10 microarchitecture does not support SMT
+#endif
+
+#ifndef SEPARATE_LSU
+  const int MAX_CLUSTERS = 4; // matches the int0, int1, int2, fp lists built by InitClusteredROBList
+#else
+  const int MAX_CLUSTERS = 5; // as above plus the extra "-ld" list
+#endif
+
+  enum { PHYSREG_NONE, PHYSREG_FREE, PHYSREG_WAITING, PHYSREG_BYPASS, PHYSREG_WRITTEN, PHYSREG_ARCH, PHYSREG_PENDINGFREE, MAX_PHYSREG_STATE }; // MAX_PHYSREG_STATE is the number of states
+  static const char* physreg_state_names[MAX_PHYSREG_STATE] = {"none", "free", "waiting", "bypass", "written", "arch", "pendingfree"}; // indexed by PHYSREG_* value
+  static const char* short_physreg_state_names[MAX_PHYSREG_STATE] = {"-", "free", "wait", "byps", "wrtn", "arch", "pend"}; // same order, abbreviated
+
+#ifdef INSIDE_SMTCORE
+
+  struct SMTCore;
+  SMTCore& coreof(int coreid);
+
+  struct ReorderBufferEntry;
+
+  //
+  // Issue queue based scheduler with broadcast
+  //
+#ifdef BIG_ROB
+  typedef W16 issueq_tag_t; // wide tags (the "16-bit tags" case of the BIG_ROB note)
+#else
+  typedef byte issueq_tag_t; // narrow tags (the "8-bit tags" case of the BIG_ROB note)
+#endif
+
+  template <int size, int operandcount = MAX_OPERANDS>
+  struct IssueQueue {
+#ifdef BIG_ROB
+    typedef FullyAssociativeTags16bit<size, size> assoc_t;
+    typedef vec8w vec_t;
+#else
+    typedef FullyAssociativeTags8bit<size, size> assoc_t;
+    typedef vec16b vec_t;
+#endif
+
+    typedef issueq_tag_t tag_t;
+
+    static const int SIZE = size;
+    // Associative tag arrays; uopids[slot] is the uop tag held in a slot (see uopof/slotof).
+    assoc_t uopids;
+    assoc_t tags[operandcount];
+
+    // States:
+    //             V I
+    // free        0 0
+    // dispatched  1 0
+    // issued      1 1
+    // complete    0 1
+
+    bitvec<size> valid;    // presumably the V column of the table above
+    bitvec<size> issued;   // presumably the I column; cleared per slot by replay(int)
+    bitvec<size> allready;
+    int count;             // occupied slots: remaining() is size - count
+    byte coreid;           // handed to coreof() by getcore()
+    int shared_entries;    // shared slots currently available (see helpers below)
+    int reserved_entries;
+    // Shared-pool accounting: the pool holds size - reserved_entries slots.
+    void set_reserved_entries(int num) { reserved_entries = num; }
+    bool reset_shared_entries() { 
+      shared_entries = size - reserved_entries; 
+      return true;
+    }
+    bool alloc_reserved_entry() { // NOTE(review): despite the name this consumes a *shared* entry
+      assert(shared_entries > 0);
+      shared_entries--;
+      return true;
+    }
+    bool free_shared_entry() {
+      assert(shared_entries < size - reserved_entries);
+      shared_entries++;
+      return true;
+    }    
+    bool shared_empty() {
+      return (shared_entries == 0);
+    }
+    // remaining() returns the number of free slots; a bool return type would collapse it to 0/1.
+    int remaining() const { return (size - count); }
+    bool empty() const { return (!count); }
+    bool full() const { return (!remaining()); }
+
+    int uopof(int slot) const {
+      return uopids[slot];
+    }
+
+    int slotof(int uopid) const {
+      return uopids.search(uopid);
+    }
+
+    void reset(int coreid);
+    void reset(int coreid, int threadid);
+    void clock();
+    bool insert(tag_t uopid, const tag_t* operands, const tag_t* preready);
+    bool broadcast(tag_t uopid);
+    int issue();
+    bool replay(int slot, const tag_t* operands, const tag_t* preready);
+    bool switch_to_end(int slot, const tag_t* operands, const tag_t* preready);
+    bool remove(int slot);
+
+    ostream& print(ostream& os) const;
+    void tally_broadcast_matches(tag_t sourceid, const bitvec<size>& mask, int operand) const;
+
+    //
+    // Replay a uop that has already issued once.
+    // The caller may add or reset dependencies here as needed.
+    //
+    bool replay(int slot) {
+      issued[slot] = 0;
+      return true;
+    }
+
+    //
+    // Remove an entry from the issue queue after it has completed,
+    // or in the process of annulment.
+    //
+    bool release(int slot) {
+      remove(slot); // result of remove() is ignored; always reports success
+      return true;
+    }
+
+    bool annul(int slot) {
+      remove(slot); // result of remove() is ignored; always reports success
+      return true;
+    }
+    // Remove the entry holding uopid; false when slotof() finds no such slot.
+    bool annuluop(int uopid) {
+      int slot = slotof(uopid);
+      if (slot < 0) return false;
+      remove(slot);
+      return true;
+    }
+
+    SMTCore& getcore() const { return coreof(coreid); }
+  };
+
+  template <int size, int operandcount>
+  static inline ostream& operator <<(ostream& os, const IssueQueue<size, operandcount>& issueq) {
+    return issueq.print(os); // stream insertion forwards to IssueQueue::print()
+  }
+
+  //
+  // Iterate through a linked list of objects where each object directly inherits
+  // only from the selfqueuelink class or otherwise has a selfqueuelink object
+  // as the first member.
+  // (obj is produced by casting the link pointer directly, hence that layout rule.)
+  // This iterator supports mutable lists, meaning the current entry (obj) may
+  // be safely removed from the list and/or moved to some other list without
+  // affecting the next object processed.
+  // (nextentry is read from entry->next before the loop body runs.)
+  // This does NOT mean you can remove any object from the list other than the
+  // current object obj - to do this, copy the list of pointers to an array and
+  // then process that instead.
+  // The macro declares entry and nextentry itself; obj must already be declared by the caller.
+#define foreach_list_mutable_linktype(L, obj, entry, nextentry, linktype) \
+  linktype* entry; \
+  linktype* nextentry; \
+  for (entry = (L).next, nextentry = entry->next, prefetch(entry->next), obj = (typeof(obj))entry; \
+    entry != &(L); entry = nextentry, nextentry = entry->next, prefetch(nextentry), obj = (typeof(obj))entry)
+
+#define foreach_list_mutable(L, obj, entry, nextentry) foreach_list_mutable_linktype(L, obj, entry, nextentry, selfqueuelink)
+
+  struct StateList;
+
+  struct ListOfStateLists: public array<StateList*, 64> {
+    int count; // zero on construction
+
+    ListOfStateLists(): count(0) { }
+
+    int add(StateList* list);
+    void reset();
+  };
+
+  struct StateList: public selfqueuelink {
+    char* name;
+    int count; // number of linked entries, adjusted by the inline methods below
+    int listid;
+    W64 dispatch_source_counter;
+    W64 issue_source_counter;
+    W32 flags;
+
+    StateList(): count(0), listid(0) { }
+
+    void init(const char* name, ListOfStateLists& lol, W32 flags = 0);
+
+    StateList(const char* name, ListOfStateLists& lol, W32 flags = 0) {  
+      init(name, lol, flags);
+    }
+
+    // Call operator standing in for a per-element array constructor:
+    StateList& operator ()(const char* name, ListOfStateLists& lol, W32 flags = 0) {
+      init(name, lol, flags);
+      return *this;
+    }
+
+    void reset();
+    // Detach and return the head entry, or null when the list is empty.
+    selfqueuelink* dequeue() {
+      if (!empty()) {
+        --count;
+        assert(count >= 0);
+        return removehead();
+      }
+      return null;
+    }
+    // Link entry through addtail(this) and count it.
+    selfqueuelink* enqueue(selfqueuelink* entry) {
+      entry->addtail(this);
+      ++count;
+      return entry;
+    }
+    // Link entry through addhead(), relative to preventry when given, else to this list.
+    selfqueuelink* enqueue_after(selfqueuelink* entry, selfqueuelink* preventry) {
+      if (!preventry) entry->addhead(this); else entry->addhead(preventry);
+      ++count;
+      return entry;
+    }
+    // Unlink entry (which must currently be linked) and uncount it.
+    selfqueuelink* remove(selfqueuelink* entry) {
+      assert(entry->linked());
+      entry->unlink();
+      --count;
+      assert(count >= 0);
+      return entry;
+    }
+    // Head entry without removing it, or null when the list is empty.
+    selfqueuelink* peek() {
+      if (empty()) return null; else return head();
+    }
+
+    void checkvalid();
+  };
+
+  template <typename T> 
+  static void print_list_of_state_lists(ostream& os, const ListOfStateLists& lol, const char* title);
+
+  //
+  // Fetch Buffers
+  //
+  struct BranchPredictorUpdateInfo: public PredictorUpdate { // carried per uop as FetchBufferEntry::predinfo
+    int stack_recover_idx; // presumably a return-stack index used for recovery — TODO confirm
+    int bptype;
+    W64 ripafter; // presumably the RIP following the branch — TODO confirm
+  };
+
+  struct FetchBufferEntry: public TransOp {
+    RIPVirtPhys rip;
+    W64 uuid;
+    uopimpl_func_t synthop;
+    BranchPredictorUpdateInfo predinfo;
+    W16 index;
+    W8 threadid;
+    byte ld_st_truly_unaligned;
+    // init() only records the slot index; validate() is a no-op.
+    int init(int index) { this->index = index; return 0; }
+    void validate() { }
+
+    FetchBufferEntry() { }
+    // Copies only the TransOp base part; no other field is assigned here.
+    FetchBufferEntry(const TransOp& transop) {
+      static_cast<TransOp&>(*this) = transop;
+    }
+  };
+
+  //
+  // ReorderBufferEntry
+  struct ThreadContext;
+  struct SMTCore;
+  struct PhysicalRegister;
+  struct LoadStoreQueueEntry;
+  struct SMTCoreEvent;
+  struct LLBLine;
+  //
+  // Reorder Buffer (ROB) structure, used for tracking all uops in flight.
+  // This same structure is used to represent both dispatched but not yet issued 
+  // uops as well as issued uops.
+  //
+  struct ReorderBufferEntry: public selfqueuelink {
+    FetchBufferEntry uop;
+    struct StateList* current_state_list; // list this entry is linked on; maintained by changestate()
+    PhysicalRegister* physreg;
+    PhysicalRegister* operands[MAX_OPERANDS];
+    LoadStoreQueueEntry* lsq; // may be null (SMTCoreEvent::fill checks it before use)
+    W16s idx; // returned by index()
+    W16s cycles_left; // execution latency counter, decremented every cycle when executing
+    W16s forward_cycle; // forwarding cycle after completion
+    W16s lfrqslot;
+    W16s iqslot;
+    W16  executable_on_cluster_mask;
+    W8s  cluster;
+    W8   coreid; // handed to coreof() by getcore()
+
+    W8   threadid;
+    byte fu;
+    byte consumer_count;
+    PTEUpdate pteupdate;
+    Waddr origvirt; // original virtual address, with low bits
+    Waddr virtpage; // virtual page number actually accessed by the load or store
+    byte entry_valid:1, load_store_second_phase:1, all_consumers_off_bypass:1, dest_renamed_before_writeback:1, no_branches_between_renamings:1, transient:1, lock_acquired:1, issued:1;
+    byte tlb_walk_level;
+
+    int index() const { return idx; }
+    void validate() { entry_valid = true; }
+    // Leave the current state list (if any) and join newqueue: enqueue() by default, enqueue_after(prevrob) when place_at_head is set.
+    void changestate(StateList& newqueue, bool place_at_head = false, ReorderBufferEntry* prevrob = null) {
+      if (current_state_list)
+        current_state_list->remove(this);
+      current_state_list = &newqueue;
+      if (place_at_head) newqueue.enqueue_after(this, prevrob); else newqueue.enqueue(this);
+    }
+    // The remaining members are only declared here; their definitions live elsewhere.
+    void init(int idx);
+    void reset();
+    bool ready_to_issue() const;
+    bool ready_to_commit() const;
+    StateList& get_ready_to_issue_list() const;
+    bool find_sources();
+    int forward();
+    int select_cluster();
+    int select_cluster_penalty();
+    int issue();
+    void* addrgen(LoadStoreQueueEntry& state, Waddr& origaddr, Waddr& virtpage, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate, Waddr& addr, int& exception, PageFaultErrorCode& pfec, bool& annul);
+    bool handle_common_load_store_exceptions(LoadStoreQueueEntry& state, Waddr& origaddr, Waddr& addr, int& exception, PageFaultErrorCode& pfec);
+    int issuestore(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, bool rcready, PTEUpdate& pteupdate);
+    int issueload(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate);
+    int probecache(Waddr addr, LoadStoreQueueEntry* sfra);
+    void tlbwalk();
+    int issuefence(LoadStoreQueueEntry& state);
+    void issueprefetch(IssueState& state, W64 ra, W64 rb, W64 rc, int cachelevel, PTEUpdate& pteupdate);
+    void release();
+    W64 annul(bool keep_misspec_uop, bool return_first_annulled_rip = false);
+    W64 annul_after() { return annul(true); } // annul with keep_misspec_uop = true
+    W64 annul_after_and_including() { return annul(false); } // annul with keep_misspec_uop = false
+    int commit();
+    void replay();
+    void replay_locked();
+    int pseudocommit();
+    void redispatch(const bitvec<MAX_OPERANDS>& dependent_operands, ReorderBufferEntry* prevrob);
+    void redispatch_dependents(bool inclusive = true);
+    void loadwakeup();
+    void fencewakeup();
+    LoadStoreQueueEntry* find_nearest_memory_fence();
+    bool release_mem_lock(bool forced = false);
+    ostream& print(ostream& os) const;
+    stringbuf& get_operand_info(stringbuf& sb, int operand) const;
+    ostream& print_operand_info(ostream& os, int operand) const;
+
+    SMTCore& getcore() const { return coreof(coreid); }
+
+    ThreadContext& getthread() const;
+    issueq_tag_t get_tag();
+
+    // ASF-related things
+    LLBLine* llbline;
+    int commit_asf_instruction();
+    int issueasf(IssueState& state, W64 rbdata);
+    void abort_asf();
+  };
+  // Split an issue queue tag: threadid = bits from MAX_ROB_IDX_BIT upward, idx = the low MAX_ROB_IDX_BIT bits.
+  static inline void decode_tag(issueq_tag_t tag, int& threadid, int& idx) {
+    threadid = tag >> MAX_ROB_IDX_BIT;
+    const int mask = (1 << MAX_ROB_IDX_BIT) - 1; // same value as ((1 << (IDX+THREADS)) - 1) >> THREADS
+    idx = tag & mask;
+  }
+
+  static inline ostream& operator <<(ostream& os, const ReorderBufferEntry& rob) {
+    return rob.print(os); // stream insertion forwards to ReorderBufferEntry::print()
+  }
+
+  //
+  // Load/Store Queue
+  //
+#define LSQ_SIZE 44 // Fam. 0x10 uses a unified LSQ
+  // (LSQ_SIZE is the capacity of ThreadContext::LSQ.)
+  // Define this to allow speculative issue of loads before unresolved stores
+//#define SMT_ENABLE_LOAD_HOISTING // (Fam. 0x10 does not support this)
+
+  struct LoadStoreQueueEntry: public SFR {
+    ReorderBufferEntry* rob;
+    W16 idx;
+    byte coreid;
+    W8s mbtag;
+    W8 store:1, lfence:1, sfence:1, entry_valid:1;
+    W32 padding;
+
+    LoadStoreQueueEntry() { }
+
+    int index() const { return idx; }
+    // Zero the whole entry (SFR base included), then restore idx and set mbtag to -1.
+    void reset() {
+      const int saved_idx = idx;
+      setzero(*this); // NOTE(review): this also clears coreid and rob — confirm callers re-set coreid
+      mbtag = -1;
+      idx = saved_idx;
+    }
+
+    void init(int idx) {
+      this->idx = idx;
+      reset();
+    }
+
+    void validate() { entry_valid = 1; }
+  
+    ostream& print(ostream& os) const;
+    // Assigning an SFR overwrites only the SFR base part of the entry.
+    LoadStoreQueueEntry& operator =(const SFR& sfr) {
+      static_cast<SFR&>(*this) = sfr;
+      return *this;
+    }
+
+    SMTCore& getcore() const { return coreof(coreid); }
+  };
+
+  static inline ostream& operator <<(ostream& os, const LoadStoreQueueEntry& lsq) {
+    return lsq.print(os); // stream insertion forwards to LoadStoreQueueEntry::print()
+  }
+
+  struct PhysicalRegisterOperandInfo { // filled by PhysicalRegister::fill_operand_info(); stored in SMTCoreEvent records
+    W32 uuid;
+    W16 physreg; // presumably the register's index — TODO confirm
+    W16 rob;     // presumably the owning ROB entry's index — TODO confirm
+    byte state;
+    byte rfid;
+    byte archreg;
+    byte pad1;
+  };
+
+  ostream& operator <<(ostream& os, const PhysicalRegisterOperandInfo& opinfo);
+
+  //
+  // Physical Register File
+  //
+ 
+  struct PhysicalRegister: public selfqueuelink {
+    ReorderBufferEntry* rob;
+    W64 data;
+    W16 flags; // tested against FLAG_INV / FLAG_WAIT by valid() / ready()
+    W16 idx;
+    W8  coreid;
+    W8  rfid;
+    W8  state; // one of the PHYSREG_* values; selects the state list the register is linked on
+    W8  archreg;
+    W8  all_consumers_sourced_from_bypass:1;
+    W16s refcount;
+    W8 threadid; // set to 0xff by free()
+
+    StateList& get_state_list(int state) const;
+    StateList& get_state_list() const { return get_state_list(this->state); }
+    // Move to newstate: leave the old state's list (skipped for PHYSREG_NONE), then join the new one.
+    void changestate(int newstate) {
+      if likely (state != PHYSREG_NONE) get_state_list(state).remove(this);
+      state = newstate;
+      get_state_list(state).enqueue(this);
+    }
+    // Record identity, then reset() — which ends with the register in PHYSREG_FREE.
+    void init(int coreid, int rfid, int idx) {
+      this->coreid = coreid;
+      this->rfid = rfid;
+      this->idx = idx;
+      reset();
+    }
+
+  private:
+    void addref() { refcount++; }
+    void unref() {
+      refcount--;
+      assert((idx == 0) || (refcount >= 0)); // register 0 is exempt from the underflow check
+    }
+
+  public:
+    // All six variants below ignore their arguments and only adjust refcount.
+    void addref(const ReorderBufferEntry& rob, W8 threadid) { addref(); }
+    void unref(const ReorderBufferEntry& rob, W8 threadid) { unref(); }
+    void addspecref(int archreg, W8 threadid) { addref(); }
+    void unspecref(int archreg, W8 threadid) { unref(); }
+    void addcommitref(int archreg, W8 threadid) { addref(); }
+    void uncommitref(int archreg, W8 threadid) { unref();  }
+
+    bool referenced() const { return (refcount > 0); }
+    bool nonnull() const { return (index() != PHYS_REG_NULL); }
+    bool allocated() const { return (state != PHYSREG_FREE); }
+    void commit() { changestate(PHYSREG_ARCH); }
+    void complete() { changestate(PHYSREG_BYPASS); }
+    void writeback() { changestate(PHYSREG_WRITTEN); }
+    // Return to PHYSREG_FREE and clear the per-allocation fields.
+    void free() {      
+      changestate(PHYSREG_FREE);
+      rob = 0;
+      refcount = 0;
+      threadid = 0xff;
+      all_consumers_sourced_from_bypass = 1;
+    }
+
+  private:
+    void reset() {
+      selfqueuelink::reset();
+      state = PHYSREG_NONE; // so the changestate() inside free() skips the list removal
+      free();
+    }
+
+  public:
+    void reset(W8 threadid, bool check_id = true) {
+      if (check_id && this->threadid != threadid) return; // registers of other threads are left alone
+      // Without the id check the link and state are re-initialised first, as in the private reset().
+      if (!check_id) {
+        selfqueuelink::reset();
+        state = PHYSREG_NONE;
+      }
+      free();
+    }
+
+    int index() const { return idx; }
+    bool valid() const { return ((flags & FLAG_INV) == 0); }
+    bool ready() const { return ((flags & FLAG_WAIT) == 0); }
+
+    void fill_operand_info(PhysicalRegisterOperandInfo& opinfo);
+
+    SMTCore& getcore() const { return coreof(coreid); }
+  };
+
+  ostream& operator <<(ostream& os, const PhysicalRegister& physreg);
+
+  struct PhysicalRegisterFile: public array<PhysicalRegister, MAX_PHYS_REG_FILE_SIZE> {
+    byte coreid;
+    byte rfid;
+    W16 size;
+    const char* name;
+    StateList states[MAX_PHYSREG_STATE]; // one list per PHYSREG_* state
+    W64 allocations;
+    W64 frees;
+    // The default constructor assigns nothing; use the named constructor or operator() to set up.
+    PhysicalRegisterFile() { }
+
+    PhysicalRegisterFile(const char* name, int coreid, int rfid, int size) {
+      init(name, coreid, rfid, size); reset();
+    }
+    // Call operator performing the same init() + reset() as the constructor above.
+    PhysicalRegisterFile& operator ()(const char* name, int coreid, int rfid, int size) {
+      init(name, coreid, rfid, size); reset(); return *this;
+    }
+
+    void init(const char* name, int coreid, int rfid, int size);
+    bool remaining() const { return (!states[PHYSREG_FREE].empty()); } // true while the free list is non-empty
+   
+    PhysicalRegister* alloc(W8 threadid, int r = -1);
+    void reset(W8 threadid);
+    ostream& print(ostream& os) const;
+
+    SMTCore& getcore() const { return coreof(coreid); }
+
+  private:
+    void reset();
+  };
+
+  static inline ostream& operator <<(ostream& os, const PhysicalRegisterFile& physregs) {
+    return physregs.print(os); // stream insertion forwards to PhysicalRegisterFile::print()
+  }
+
+  //
+  // Register Rename Table
+  //
+  struct RegisterRenameTable: public array<PhysicalRegister*, TRANSREG_COUNT> { // TRANSREG_COUNT PhysicalRegister pointers
+#ifdef ENABLE_TRANSIENT_VALUE_TRACKING
+    bitvec<TRANSREG_COUNT> renamed_in_this_basic_block; // only present with transient value tracking
+#endif
+    ostream& print(ostream& os) const;
+  };
+
+  static inline ostream& operator <<(ostream& os, const RegisterRenameTable& rrt) {
+    return rrt.print(os); // stream insertion forwards to RegisterRenameTable::print()
+  }
+  // Issue outcome codes (presumably the int returned by the issue*() methods — TODO confirm):
+  enum {
+    ISSUE_COMPLETED = 1,      // issued correctly
+    ISSUE_NEEDS_REPLAY = 0,   // fast scheduling replay
+    ISSUE_MISSPECULATED = -1, // mis-speculation: redispatch dependent slice
+    ISSUE_NEEDS_REFETCH = -2, // refetch from RIP of bad insn
+  };
+  // Commit outcome codes (presumably the int returned by ReorderBufferEntry::commit() — TODO confirm):
+  enum {
+    COMMIT_RESULT_NONE = 0,   // no instructions committed: some uops not ready
+    COMMIT_RESULT_OK = 1,     // committed
+    COMMIT_RESULT_EXCEPTION = 2, // exception
+    COMMIT_RESULT_BARRIER = 3,// barrier; branch to microcode (brp uop)
+    COMMIT_RESULT_SMC = 4,    // self modifying code detected
+    COMMIT_RESULT_INTERRUPT = 5, // interrupt pending
+    COMMIT_RESULT_STOP = 6    // stop processor model (shutdown)
+  };
+
+  // Branch predictor outcomes (MISPRED is the zero value, CORRECT is one):
+  enum { MISPRED = 0, CORRECT = 1 };
+
+  //
+  // Lookup tables (LUTs):
+  // Cluster is the element type of the clusters[] table declared below.
+  struct Cluster {
+    char* name;
+    W16 issue_width;
+    W32 fu_mask;
+  };
+
+  extern const Cluster clusters[MAX_CLUSTERS];
+  extern byte uop_executable_on_cluster[OP_MAX_OPCODE];
+  extern W32 forward_at_cycle_lut[MAX_CLUSTERS][MAX_FORWARDING_LATENCY+1];
+  extern const byte archdest_can_commit[TRANSREG_COUNT];
+  extern const byte archdest_is_visible[TRANSREG_COUNT];
+
+  struct SMTMachine;
+
+  struct SMTCoreCacheCallbacks: public CacheSubsystem::PerCoreCacheCallbacks {
+    SMTCore& core; // reference bound once in the constructor
+    SMTCoreCacheCallbacks(SMTCore& core_): core(core_) { }
+    virtual void dcache_wakeup(LoadStoreInfo lsi, W64 physaddr); // defined elsewhere
+    virtual void icache_wakeup(LoadStoreInfo lsi, W64 physaddr); // defined elsewhere
+  };
+
+  struct MemoryInterlockEntry {
+    W64 uuid;
+    W16 rob;
+    byte vcpuid;
+    W8 threadid;
+
+    void reset() { uuid = 0; rob = 0; vcpuid = 0; threadid = 0;}
+    // NOTE(review): the comma chain below relies on an ostream comma operator declared elsewhere — confirm.
+    ostream& print(ostream& os, W64 physaddr) const {
+      os << "phys ", (void*)physaddr, ": vcpu ", vcpuid, ", threadid ", threadid, ", uuid ", uuid, ", rob ", rob;
+      return os;
+    }
+  };
+
+  struct MemoryInterlockBuffer: public LockableAssociativeArray<W64, MemoryInterlockEntry, 16, 4, 8> { }; // W64-keyed array of MemoryInterlockEntry; adds no members
+
+  extern MemoryInterlockBuffer interlocks;
+
+  //
+  // Event Tracing
+  // Event type codes (kept in SMTCoreEvent::type by fill()); EVENT_INVALID is 0, the rest count up from it.
+  enum {
+    EVENT_INVALID = 0,
+    EVENT_FETCH_STALLED,
+    EVENT_FETCH_ICACHE_WAIT,
+    EVENT_FETCH_FETCHQ_FULL,
+    EVENT_FETCH_IQ_QUOTA_FULL,
+    EVENT_FETCH_BOGUS_RIP,
+    EVENT_FETCH_ICACHE_MISS,
+    EVENT_FETCH_SPLIT,
+    EVENT_FETCH_ASSIST,
+    EVENT_FETCH_TRANSLATE,
+    EVENT_FETCH_OK,
+    EVENT_RENAME_FETCHQ_EMPTY,
+    EVENT_RENAME_ROB_FULL,
+    EVENT_RENAME_PHYSREGS_FULL,
+    EVENT_RENAME_LDQ_FULL,
+    EVENT_RENAME_STQ_FULL,
+    EVENT_RENAME_MEMQ_FULL,
+    EVENT_RENAME_OK,
+    EVENT_FRONTEND,
+    EVENT_CLUSTER_NO_CLUSTER,
+    EVENT_CLUSTER_OK,
+    EVENT_DISPATCH_NO_CLUSTER,
+    EVENT_DISPATCH_DEADLOCK,
+    EVENT_DISPATCH_OK,
+    EVENT_ISSUE_NO_FU,
+    EVENT_ISSUE_OK,
+    EVENT_REPLAY,
+    EVENT_STORE_EXCEPTION,
+    EVENT_STORE_WAIT,
+    EVENT_STORE_PARALLEL_FORWARDING_MATCH,
+    EVENT_STORE_ALIASED_LOAD,
+    EVENT_STORE_ISSUED,
+    EVENT_STORE_LOCK_RELEASED,
+    EVENT_STORE_LOCK_ANNULLED,
+    EVENT_STORE_LOCK_REPLAY,
+    EVENT_LOAD_EXCEPTION,
+    EVENT_LOAD_WAIT,
+    EVENT_LOAD_HIGH_ANNULLED,
+    EVENT_LOAD_HIT,
+    EVENT_LOAD_MISS,
+    EVENT_LOAD_BANK_CONFLICT,
+    EVENT_LOAD_TLB_MISS,
+    EVENT_LOAD_LOCK_REPLAY,
+    EVENT_LOAD_LOCK_OVERFLOW,
+    EVENT_LOAD_LOCK_ACQUIRED,
+    EVENT_LOAD_LFRQ_FULL,
+    EVENT_LOAD_WAKEUP,
+    EVENT_TLBWALK_HIT,
+    EVENT_TLBWALK_MISS,
+    EVENT_TLBWALK_WAKEUP,
+    EVENT_TLBWALK_NO_LFRQ_MB,
+    EVENT_TLBWALK_COMPLETE,
+    EVENT_FENCE_ISSUED,
+    EVENT_ALIGNMENT_FIXUP,
+    EVENT_ANNUL_NO_FUTURE_UOPS,
+    EVENT_ANNUL_MISSPECULATION,
+    EVENT_ANNUL_EACH_ROB,
+    EVENT_ANNUL_PSEUDOCOMMIT,
+    EVENT_ANNUL_FETCHQ_RAS,
+    EVENT_ANNUL_FETCHQ,
+    EVENT_ANNUL_FLUSH,
+    EVENT_REDISPATCH_DEPENDENTS,
+    EVENT_REDISPATCH_DEPENDENTS_DONE,
+    EVENT_REDISPATCH_EACH_ROB,
+    EVENT_COMPLETE,
+    EVENT_BROADCAST,
+    EVENT_FORWARD,
+    EVENT_WRITEBACK,
+    EVENT_COMMIT_FENCE_COMPLETED,
+    EVENT_COMMIT_EXCEPTION_DETECTED,
+    EVENT_COMMIT_EXCEPTION_ACKNOWLEDGED,
+    EVENT_COMMIT_SKIPBLOCK,
+    EVENT_COMMIT_SMC_DETECTED,
+    EVENT_COMMIT_MEM_LOCKED,
+    EVENT_COMMIT_ASSIST,
+    EVENT_COMMIT_OK,
+    EVENT_RECLAIM_PHYSREG,
+    EVENT_RELEASE_MEM_LOCK,
+  };
+
+  //
+  // Event that gets written to the trace buffer
+  //
+  // In the interest of minimizing space, the cycle counters
+  // and uuids are only 32-bits; in practice wraparound is
+  // not likely to be a problem.
+  //
+  struct SMTCoreEvent {
+    W32 cycle;
+    W32 uuid;
+    RIPVirtPhysBase rip;
+    TransOpBase uop;
+    W16 rob;
+    W16 physreg;
+    W16 lsq;
+    W16 type;
+    W16s lfrqslot;
+    byte rfid;
+    byte cluster;
+    byte fu;
+    W8 threadid;
+    W32 issueq_count;
+    // Each fill*() stamps part of the record and returns this; the base overload sets type, cycle, uuid = 0, threadid = 0xff.
+    SMTCoreEvent* fill(int type) {
+      this->type = type;
+      cycle = sim_cycle;
+      uuid = 0;
+      threadid = 0xff;
+      return this;
+    }
+    // Adds the uop's uuid, rip and threadid plus a copy of the uop itself.
+    SMTCoreEvent* fill(int type, const FetchBufferEntry& uop) {
+      fill(type);
+      uuid = uop.uuid;
+      rip = uop.rip;
+      threadid = uop.threadid;
+      this->uop = uop;
+      return this;
+    }
+    // Adds only the RIP.
+    SMTCoreEvent* fill(int type, const RIPVirtPhys& rvp) {
+      fill(type);
+      rip = rvp;
+      return this;
+    }
+    // Adds ROB-level fields. rob->physreg is dereferenced unconditionally; a null rob->lsq is recorded as 0.
+    SMTCoreEvent* fill(int type, const ReorderBufferEntry* rob) {
+      fill(type, rob->uop);
+      this->rob = rob->index();
+      physreg = rob->physreg->index();
+      lsq = (rob->lsq) ? rob->lsq->index() : 0;
+      rfid = rob->physreg->rfid;
+      cluster = rob->cluster;
+      fu = rob->fu;
+      lfrqslot = rob->lfrqslot;
+      return this;
+    }
+    // Fills the commit member of the union; rob->lsq (for stores) and every rob->operands[i] are dereferenced unchecked.
+    SMTCoreEvent* fill_commit(int type, const ReorderBufferEntry* rob) {
+      fill(type, rob);
+      if unlikely (isstore(rob->uop.opcode)) {
+        commit.state.st = *rob->lsq;
+      } else {
+        commit.state.reg.rddata = rob->physreg->data;
+        commit.state.reg.rdflags = rob->physreg->flags;
+      }
+      // taken, predtaken only for branches
+      commit.ld_st_truly_unaligned = rob->uop.ld_st_truly_unaligned;
+      commit.pteupdate = rob->pteupdate;
+      // oldphysreg filled in later
+      // oldphysreg_refcount filled in later
+      commit.origvirt = rob->origvirt;
+      commit.total_user_insns_committed = total_user_insns_committed;
+      // target_rip filled in later
+      foreach (i, MAX_OPERANDS) commit.operand_physregs[i] = rob->operands[i]->index();
+      return this;
+    }
+    // Fills the loadstore member of the union; the inherit_sfr_* fields are written only when inherit_sfr is non-null.
+    SMTCoreEvent* fill_load_store(int type, const ReorderBufferEntry* rob, LoadStoreQueueEntry* inherit_sfr, Waddr virtaddr) {
+      fill(type, rob);
+      if likely (rob->lsq) loadstore.sfr = *rob->lsq;
+      loadstore.virtaddr = virtaddr;
+      loadstore.load_store_second_phase = rob->load_store_second_phase;
+      loadstore.inherit_sfr_used = (inherit_sfr != null);
+      if unlikely (inherit_sfr) {
+        loadstore.inherit_sfr = *inherit_sfr;
+        loadstore.inherit_sfr_lsq = inherit_sfr->rob->lsq->index();
+        loadstore.inherit_sfr_uuid = inherit_sfr->rob->uop.uuid;
+        loadstore.inherit_sfr_rob = inherit_sfr->rob->index();
+        loadstore.inherit_sfr_physreg = inherit_sfr->rob->physreg->index();
+        loadstore.inherit_sfr_rip = inherit_sfr->rob->uop.rip;
+      }
+      loadstore.tlb_walk_level = rob->tlb_walk_level;
+      return this;
+    }
+    // Payload overlay: the members share storage, so only the one matching the event is meaningful.
+    union {
+      struct {
+        W16s missbuf;
+        W64 predrip;
+        W16 bb_uop_count;
+      } fetch;
+      struct {
+        W16  oldphys;
+        W16  oldzf;
+        W16  oldcf;
+        W16  oldof;
+        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
+      } rename;
+      struct {
+        W16 cycles_left;
+      } frontend;
+      struct {
+        W16 allowed_clusters;
+        W16 iq_avail[MAX_CLUSTERS];
+      } select_cluster;
+      struct {
+        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
+      } dispatch;
+      struct {
+        byte mispredicted:1;
+        IssueState state;
+        W16 cycles_left;
+        W64 operand_data[MAX_OPERANDS];
+        W16 operand_flags[MAX_OPERANDS];
+        W64 predrip;
+        W32 fu_avail;
+      } issue;
+      struct {
+        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
+        byte ready;
+      } replay;
+      struct {
+        W64 virtaddr; 
+        W64 data_to_store;
+        SFR sfr;
+        SFR inherit_sfr;
+        W64 inherit_sfr_uuid;        
+        W64 inherit_sfr_rip;
+        W16 inherit_sfr_lsq;
+        W16 inherit_sfr_rob;
+        W16 inherit_sfr_physreg;
+        W16 cycles_left;
+        W64 locking_uuid;
+        byte inherit_sfr_used:1, rcready:1, load_store_second_phase:1, predicted_alias:1;
+        byte locking_vcpuid;
+        W16 locking_rob;
+        W8 threadid;
+        W8 tlb_walk_level;
+      } loadstore;
+      struct {
+        W16 somidx;
+        W16 eomidx;
+        W16 startidx;
+        W16 endidx;
+        byte annulras;
+      } annul;
+      struct {
+        StateList* current_state_list;
+        W16 iqslot;
+        W16 count;
+        byte dependent_operands;
+        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
+      } redispatch;
+      struct {
+        W8  forward_cycle;
+        W8  operand;
+        W8  target_operands_ready;
+        W8  target_all_operands_ready;
+        W16 target_rob;
+        W16 target_physreg;
+        W8  target_rfid;
+        W8  target_cluster;
+        W64 target_uuid;
+        W16 target_lsq;
+        W8  target_st;
+      } forwarding;
+      struct {
+        W16 consumer_count;
+        W16 flags;
+        W64 data;
+        byte transient:1, all_consumers_sourced_from_bypass:1, no_branches_between_renamings:1, dest_renamed_before_writeback:1;
+      } writeback;
+      struct {
+        IssueState state;
+        byte taken:1, predtaken:1, ld_st_truly_unaligned:1,krn:1;
+        PTEUpdateBase pteupdate;
+        W16s oldphysreg;
+        W16 oldphysreg_refcount;
+        W64 origvirt;
+        W64 total_user_insns_committed;
+        W64 target_rip;
+        W16 operand_physregs[MAX_OPERANDS];
+      } commit;
+    };
+
+    ostream& print(ostream& os) const;
+  };
+
+  struct EventLog {
+    SMTCoreEvent* start;
+    SMTCoreEvent* end;
+    SMTCoreEvent* tail;
+    int           coreid;
+    ostream* logfile;
+
+    EventLog(int coreid_): start(null), end(null), tail(null), coreid(coreid_), logfile(null) { }
+
+    bool init(size_t bufsize);
+    void reset();
+
+    // Hand out the next record slot; once tail has reached end, rewind to start and flush() first.
+    SMTCoreEvent* add() {
+      if unlikely (tail >= end) {
+        tail = start;
+        flush();
+      }
+      // post-increment: return the current slot, then advance tail
+      return tail++;
+    }
+
+    void flush(bool only_to_tail = false);
+
+    SMTCoreEvent* add(int type) {
+      return add()->fill(type);
+    }
+
+    SMTCoreEvent* add(int type, const RIPVirtPhys& rvp) {
+      return add()->fill(type, rvp);
+    }
+
+    SMTCoreEvent* add(int type, const FetchBufferEntry& uop) {
+      return add()->fill(type, uop);
+    }
+
+    SMTCoreEvent* add(int type, const ReorderBufferEntry* rob) {
+      return add()->fill(type, rob);
+    }
+
+    SMTCoreEvent* add_commit(int type, const ReorderBufferEntry* rob) {
+      return add()->fill_commit(type, rob);
+    }
+
+    SMTCoreEvent* add_load_store(int type, const ReorderBufferEntry* rob, LoadStoreQueueEntry* inherit_sfr = null, Waddr addr = 0) {
+      return add()->fill_load_store(type, rob, inherit_sfr, addr);
+    }
+
+    ostream& print(ostream& os, bool only_to_tail = false);
+  };
+
+  #define ASF_MAX_LINES (8)
+  #define ASF_MAX_SPEC_LINES (12) // size argument of LockedLineBuffer's FullyAssociativeArray base
+  #define LLB_LINE_SIZE CacheSubsystem::L1_LINE_SIZE // byte length of LLBLine::orig_data
+  struct LLBLine {
+    bool  written;                  // false after construction and reset()
+    byte  orig_data[LLB_LINE_SIZE]; // not touched by the constructor or reset()
+    int   refcount;                 // zero after construction and reset()
+    void  reset() { refcount = 0; written = false; }
+    LLBLine() { reset(); }
+  };
+  enum { // presumably probe result codes for the LockedLineBuffer probe functions — TODO confirm
+    PROBE_ACK,
+    PROBE_NACK,
+    PROBE_WAIT
+  };
+
+  struct LockedLineBuffer: public FullyAssociativeArray<Waddr, LLBLine, ASF_MAX_SPEC_LINES> {
+    typedef FullyAssociativeArray<Waddr, LLBLine, ASF_MAX_SPEC_LINES> base_t;
+    ThreadContext& thread;
+    int num_locations;
+    // Start with zero locations and no latched error so empty()/consistency_error() never read garbage.
+    LockedLineBuffer(ThreadContext& _thread): base_t(), thread(_thread), num_locations(0), lasterr(0) {}
+    LLBLine* add_location(Waddr addr);
+    void remove_ref(LLBLine* line);
+    void clear();
+    void snapshot();
+    void undo();
+    // commit() discards the buffer via clear(); abort() rolls back via undo(). Both clear the latched error.
+    void commit() { clear(); lasterr = 0; }
+    void abort() { undo(); /*clear();*/ lasterr = 0; }
+    // contains() probes at the LLB_LINE_SIZE-aligned address of addr.
+    bool contains(Waddr addr) {return probe(floor(addr, LLB_LINE_SIZE));}
+    bool empty() {return (num_locations == 0);}
+    void* external_probe(Waddr addr, bool invalidating);
+    void* probe_other_LLBs(Waddr addr, bool invalidating);
+    void mark_clean(Waddr addr);
+    void mark_clean_others(Waddr addr);
+    void mark_written(Waddr addr);
+    // Latches 0xDEADBEEF once asf_consistency_error() reports an error; stays set until commit()/abort().
+    W64  consistency_error() {
+      /* S.D. Error injection framework! */ if (!lasterr) lasterr = (asf_consistency_error()) ? 0xDEADBEEF : 0;
+      return lasterr;
+    }
+    private: W64 lasterr;
+  };
+
+  struct LoadStoreAliasPredictor: public FullyAssociativeTags<W64, 8> { }; // FullyAssociativeTags<W64, 8> under a dedicated name; adds no members
+
+  enum { // single-bit flags; presumably the W32 flags given to StateList::init() — TODO confirm
+    ROB_STATE_READY = (1 << 0),
+    ROB_STATE_IN_ISSUE_QUEUE = (1 << 1),
+    ROB_STATE_PRE_READY_TO_DISPATCH = (1 << 2)
+  };
+  // Invokes name[i](description "-<cluster>", rob_states, flags) once per cluster; rob_states must be in scope at the use site.
+#ifndef SEPARATE_LSU
+#define InitClusteredROBList(name, description, flags) \
+  name[0](description "-int0", rob_states, flags); \
+  name[1](description "-int1", rob_states, flags); \
+  name[2](description "-int2", rob_states, flags); \
+  name[3](description "-fp", rob_states, flags);
+#else
+#define InitClusteredROBList(name, description, flags) \
+  name[0](description "-int0", rob_states, flags); \
+  name[1](description "-int1", rob_states, flags); \
+  name[2](description "-int2", rob_states, flags); \
+  name[3](description "-fp", rob_states, flags);   \
+  name[4](description "-ld", rob_states, flags);
+#endif
+
+  static const int ISSUE_QUEUE_SIZE = 16; // presumably the size argument of each IssueQueue instance — TODO confirm
+
+  // How many bytes of x86 code to fetch into decode buffer at once
+  static const int ICACHE_FETCH_GRANULARITY = 32;
+  // Deadlock timeout: if nothing dispatches for this many cycles, flush the pipeline
+  static const int DISPATCH_DEADLOCK_COUNTDOWN_CYCLES = 256;
+  // Size of unaligned predictor Bloom filter
+  static const int UNALIGNED_PREDICTOR_SIZE = 4096;
+
+  struct ThreadContext {
+    SMTCore& core;
+    SMTCore& getcore() const { return core; }
+
+    int threadid;
+    Context& ctx;
+    BranchPredictorInterface branchpred;
+
+    Queue<FetchBufferEntry, FETCH_QUEUE_SIZE> fetchq;
+
+    ListOfStateLists rob_states;
+    ListOfStateLists lsq_states;
+    //
+    // Each ROB entry can be linked into at most one of the
+    // following rob_xxx_list lists at any given time; the entry's
+    // current_state_list points back to the list it belongs to.
+    //
+    StateList rob_free_list;                             // Free ROB entry
+    StateList rob_frontend_list;                         // Frontend in progress (artificial delay)
+    StateList rob_ready_to_dispatch_list;                // Ready to dispatch
+    StateList rob_dispatched_list[MAX_CLUSTERS];         // Dispatched but waiting for operands
+    StateList rob_ready_to_issue_list[MAX_CLUSTERS];     // Ready to issue (all operands ready)
+    StateList rob_ready_to_store_list[MAX_CLUSTERS];     // Ready to store (all operands except possibly rc are ready)
+    StateList rob_ready_to_load_list[MAX_CLUSTERS];      // Ready to load (all operands ready)
+    StateList rob_issued_list[MAX_CLUSTERS];             // Issued and in progress (or for loads, returned here after address is generated)
+    StateList rob_completed_list[MAX_CLUSTERS];          // Completed and result in transit for local and global forwarding
+    StateList rob_ready_to_writeback_list[MAX_CLUSTERS]; // Completed; result ready to writeback in parallel across all cluster register files
+    StateList rob_cache_miss_list;                       // Loads only: wait for cache miss to be serviced
+    StateList rob_tlb_miss_list;                         // TLB miss waiting to be serviced on one or more levels
+    StateList rob_memory_fence_list;                     // mf uops only: wait for memory fence to reach head of LSQ before completing
+    StateList rob_ready_to_commit_queue;                 // Ready to commit
+
+    Queue<ReorderBufferEntry, ROB_SIZE> ROB;
+
+    Queue<LoadStoreQueueEntry, LSQ_SIZE> LSQ;
+    RegisterRenameTable specrrt;
+    RegisterRenameTable commitrrt;
+
+    // Fetch-related structures
+    RIPVirtPhys fetchrip;
+    BasicBlock* current_basic_block;
+    int current_basic_block_transop_index;
+    bool stall_frontend;
+    bool stall_on_eom;
+    bool waiting_for_icache_fill;
+
+    // Last block in icache we fetched into our buffer
+    W64 current_icache_block;
+    W64 fetch_uuid;
+    int loads_in_flight;
+    int stores_in_flight;
+    bool prev_interrupts_pending;
+    bool handle_interrupt_at_next_eom;
+    bool stop_at_next_eom;
+
+    W64 last_commit_at_cycle;
+    bool smc_invalidate_pending;
+    RIPVirtPhys smc_invalidate_rvp;
+    W64 chk_recovery_rip;
+
+    TransOpBuffer unaligned_ldst_buf;
+    LoadStoreAliasPredictor lsap;
+    int loads_in_this_cycle;
+    W64 load_to_store_parallel_forwarding_buffer[LOAD_FU_COUNT];
+
+    W64 consecutive_commits_inside_spinlock;
+
+    // statistics:
+    W64 total_uops_committed;
+    W64 total_insns_committed;
+    int dispatch_deadlock_countdown;    
+    int issueq_count;
+
+    //
+    // List of memory locks that will be removed from
+    // the lock controller when the macro-op commits.
+    //
+    // At most 4 chunks are allowed, to ensure
+    // cmpxchg16b works even with unaligned data.
+    //
+    byte queued_mem_lock_release_count;
+    W64 queued_mem_lock_release_list[4];
+
+    ThreadContext(SMTCore& core_, int threadid_, Context& ctx_): core(core_), threadid(threadid_), ctx(ctx_), locked_line_buffer(*this) {
+      reset();
+    }
+
+    int commit();
+    int writeback(int cluster);
+    int transfer(int cluster);
+    int complete(int cluster);
+    int dispatch();
+    void frontend();
+    void rename();
+    bool fetch();
+    void tlbwalk();
+
+    bool handle_barrier();
+    bool handle_exception();
+    bool handle_interrupt();
+    void reset_fetch_unit(W64 realrip);
+    void flush_pipeline();
+    void invalidate_smc();
+    void external_to_core_state();
+    void core_to_external_state() { }
+    void annul_fetchq();
+    BasicBlock* fetch_or_translate_basic_block(const RIPVirtPhys& rvp);
+    void redispatch_deadlock_recovery();
+    void flush_mem_lock_release_list(byte start = 0);
+    int get_priority() const;
+
+    void dump_smt_state(ostream& os);
+    void print_smt_state(ostream& os);
+    void print_rob(ostream& os);
+    void print_lsq(ostream& os);
+    void print_rename_tables(ostream& os);
+
+    void reset();
+    void init();
+
+    // ASF
+    bool        asf_in_crit_sec;
+    bool        asf_reissue_will_fail;
+    W64         asf_stored_error;
+    RIPVirtPhys asf_failing_acquire;
+    W64         asf_saved_rsp;
+    LockedLineBuffer locked_line_buffer;
+    void check_asf_conflicts();
+    int asf_runcycle(int commitrc);
+    void asf_rollback_last_acq(W64 errorcode, int reg_nextrip);
+  };
+
+  //
+  // checkpointed core
+  //
+  struct SMTCore {
+    SMTMachine& machine;
+    int coreid;
+    SMTCore& getcore() const { return coreof(coreid); }
+
+    int threadcount;
+    ThreadContext* threads[MAX_THREADS_PER_CORE];
+
+    ListOfStateLists rob_states;
+    ListOfStateLists lsq_states;
+
+    EventLog eventlog;
+    ListOfStateLists physreg_states;
+    // Bandwidth counters:
+    int commitcount;
+    int writecount;
+    int dispatchcount;
+
+    byte round_robin_tid;
+
+    //
+    // Issue Queues (one per cluster)
+    //
+    int reserved_iq_entries;
+#define declare_issueq_templates \
+    template struct IssueQueue<8>; \
+    template struct IssueQueue<36>;\
+    template struct IssueQueue<12>;
+
+    IssueQueue<8> issueq_int1;
+    IssueQueue<8> issueq_int2;
+    IssueQueue<8> issueq_int3;
+    IssueQueue<36> issueq_fp;
+#ifdef SEPARATE_LSU
+    IssueQueue<12> issueq_ld;
+#endif
+
+#ifndef SEPARATE_LSU
+#define foreach_issueq(expr) { SMTCore& core = getcore(); core.issueq_int1.expr; core.issueq_int2.expr; core.issueq_int3.expr; core.issueq_fp.expr;}
+#else
+#define foreach_issueq(expr) { SMTCore& core = getcore(); core.issueq_int1.expr; core.issueq_int2.expr; core.issueq_int3.expr; core.issueq_fp.expr; core.issueq_ld.expr;}
+#endif
+
+    void sched_get_all_issueq_free_slots(int* a) {
+      a[0] = issueq_int1.remaining();
+      a[1] = issueq_int2.remaining();
+      a[2] = issueq_int3.remaining();
+      a[3] = issueq_fp.remaining();
+#ifdef SEPARATE_LSU
+      a[4] = issueq_ld.remaining();
+#endif
+    }
+
+#ifndef SEPARATE_LSU
+#define issueq_operation_on_cluster_with_result(core, cluster, rc, expr) \
+  switch (cluster) { \
+  case 0: rc = core.issueq_int1.expr; break; \
+  case 1: rc = core.issueq_int2.expr; break; \
+  case 2: rc = core.issueq_int3.expr; break; \
+  case 3: rc = core.issueq_fp.expr; break; \
+  }
+
+#define per_cluster_stats_update(prefix, cluster, expr) \
+  switch (cluster) { \
+  case 0: prefix.int1 expr; break; \
+  case 1: prefix.int2 expr; break; \
+  case 2: prefix.int3 expr; break; \
+  case 3: prefix.fp expr; break; \
+  }
+#else //#ifndef SEPARATE_LSU
+#define issueq_operation_on_cluster_with_result(core, cluster, rc, expr) \
+    switch (cluster) { \
+    case 0: rc = core.issueq_int1.expr; break; \
+    case 1: rc = core.issueq_int2.expr; break; \
+    case 2: rc = core.issueq_int3.expr; break; \
+    case 3: rc = core.issueq_fp.expr; break; \
+    case 4: rc = core.issueq_ld.expr; break; \
+  }
+
+#define per_cluster_stats_update(prefix, cluster, expr) \
+    switch (cluster) { \
+    case 0: prefix.int1 expr; break; \
+    case 1: prefix.int2 expr; break; \
+    case 2: prefix.int3 expr; break; \
+    case 3: prefix.fp expr; break; \
+    case 4: prefix.ld expr; break; \
+  }
+#endif //#ifndef SEPARATE_LSU
+
+#define per_physregfile_stats_update(prefix, rfid, expr) \
+  switch (rfid) { \
+  case 0: prefix.integer expr; break; \
+  case 1: prefix.fp expr; break; \
+  case 2: prefix.st expr; break; \
+  case 3: prefix.br expr; break; \
+  }
+
+#define issueq_operation_on_cluster(core, cluster, expr) { int dummyrc; issueq_operation_on_cluster_with_result(core, cluster, dummyrc, expr); }
+
+#define for_each_cluster(iter) foreach (iter, MAX_CLUSTERS)
+#define for_each_operand(iter) foreach (iter, MAX_OPERANDS)
+
+    SMTCore(int coreid_, SMTMachine& machine_): machine(machine_), coreid(coreid_), eventlog(coreid_), caches(coreid_), cache_callbacks(*this) { // initializers listed in member declaration order
+      threadcount = 0;   // no threads are registered at construction time
+      setzero(threads);  // clear the per-thread pointer table via the project's setzero helper
+    }
+    
+    ~SMTCore(){};
+
+    // 
+    // Initialize structures independent of the core parameters
+    //
+    void init_generic();
+    void reset();
+
+    //
+    // Initialize all structures for the first time
+    //
+    void init() {
+      init_generic();
+      //
+      // Physical register files
+      //
+      physregfiles[0]("int", coreid, 0, PHYS_REG_FILE_SIZE);
+      physregfiles[1]("fp", coreid, 1, PHYS_REG_FILE_SIZE);
+      physregfiles[2]("st", coreid, 2, STQ_SIZE * MAX_THREADS_PER_CORE);
+      physregfiles[3]("br", coreid, 3, MAX_BRANCHES_IN_FLIGHT * MAX_THREADS_PER_CORE);
+    }
+
+    //
+    // Physical Registers
+    //
+
+    enum { PHYS_REG_FILE_INT, PHYS_REG_FILE_FP, PHYS_REG_FILE_ST, PHYS_REG_FILE_BR };
+
+    enum {  
+      PHYS_REG_FILE_MASK_INT = (1 << 0),
+      PHYS_REG_FILE_MASK_FP  = (1 << 1),
+      PHYS_REG_FILE_MASK_ST  = (1 << 2),
+      PHYS_REG_FILE_MASK_BR  = (1 << 3)
+    };
+
+    // Major core structures
+    PhysicalRegisterFile physregfiles[PHYS_REG_FILE_COUNT];
+    int round_robin_reg_file_offset;
+    W32 fu_avail;
+    ReorderBufferEntry* robs_on_fu[FU_COUNT];
+    CacheSubsystem::CacheHierarchy caches;
+    SMTCoreCacheCallbacks cache_callbacks;
+
+    // Unaligned load/store predictor
+    bitvec<UNALIGNED_PREDICTOR_SIZE> unaligned_predictor;
+    static int hash_unaligned_predictor_slot(const RIPVirtPhysBase& rvp);
+    bool get_unaligned_hint(const RIPVirtPhysBase& rvp) const;
+    void set_unaligned_hint(const RIPVirtPhysBase& rvp, bool value);
+
+    // Pipeline Stages
+    bool runcycle();
+    void flush_pipeline_all();
+    bool fetch();
+    void rename();
+    void frontend();
+    int dispatch();
+    int issue(int cluster);
+    int complete(int cluster);
+    int transfer(int cluster);
+    int writeback(int cluster);
+    int commit();
+
+    // Callbacks
+    void flush_tlb(Context& ctx, int threadid, bool selective = false, Waddr virtaddr = 0);
+
+    // Debugging
+    void dump_smt_state(ostream& os);
+    void print_smt_state(ostream& os);
+    void check_refcounts();
+    void check_rob();
+
+  };
+
+  #define MAX_SMT_CORES 32
+
+  struct SMTMachine: public PTLsimMachine {
+    SMTCore* cores[MAX_SMT_CORES];
+    int corecount;
+    bitvec<MAX_CONTEXTS> stopped;
+    SMTMachine(const char* name);
+    virtual bool init(PTLsimConfig& config);
+    virtual int run(PTLsimConfig& config);
+    virtual void dump_state(ostream& os);
+    virtual void update_stats(PTLsimStats& stats);
+    virtual void flush_tlb(Context& ctx);
+    virtual void flush_tlb_virt(Context& ctx, Waddr virtaddr);
+    void flush_all_pipelines();
+  };
+
+  extern CycleTimer cttotal;
+  extern CycleTimer ctfetch;
+  extern CycleTimer ctdecode;
+  extern CycleTimer ctrename;
+  extern CycleTimer ctfrontend;
+  extern CycleTimer ctdispatch;
+  extern CycleTimer ctissue;
+  extern CycleTimer ctissueload;
+  extern CycleTimer ctissuestore;
+  extern CycleTimer ctcomplete;
+  extern CycleTimer cttransfer;
+  extern CycleTimer ctwriteback;
+  extern CycleTimer ctcommit;
+
+#ifdef DECLARE_STRUCTURES
+  //
+  // The following configuration has three integer clusters (int1..int3) with zero
+  // inter-cluster latency between them (and to the "ld" cluster when SEPARATE_LSU
+  // is defined). The floating point cluster is two cycles from everything else.
+  //
+
+#ifndef SEPARATE_LSU
+  extern const Cluster clusters[MAX_CLUSTERS] = {
+    {"int1",  2, (FU_ALU1|FU_ALUC/*|FU_LSU01*/)},  //TODO: Adding LSU01 requires additional checking of max loads / cycle!
+    {"int2",  2, (FU_ALU2|FU_LSU02)},              //      as K8/GH only allows two simultaneous accesses to L1D.
+    {"int3",  2, (FU_ALU3|FU_LSU03)},              //      also see the SEPARATE_LSU version below!
+    {"fp",    3, (FU_FADD|FU_FMUL|FU_FCVT)},
+  };
+  extern const byte intercluster_latency_map[MAX_CLUSTERS][MAX_CLUSTERS] = {
+    // I1 I2 I3 FP  <-to
+    {  0, 0, 0, 2}, // from I1
+    {  0, 0, 0, 2}, // from I2
+    {  0, 0, 0, 2}, // from I3
+    {  2, 2, 2, 0}, // from FP
+  };
+
+  extern const byte intercluster_bandwidth_map[MAX_CLUSTERS][MAX_CLUSTERS] = {
+    // I1 I2 I3 FP <-to
+    {  2, 2, 2, 1}, // from I1
+    {  2, 2, 2, 1}, // from I2
+    {  2, 2, 2, 2}, // from I3 -- NOTE(review): FP entry is 2 here but 1 for I1/I2; confirm intended
+    {  1, 1, 1, 2}, // from FP
+  };
+#else //#ifndef SEPARATE_LSU
+  extern const Cluster clusters[MAX_CLUSTERS] = {
+    {"int1",  2, (FU_ALU1|FU_ALUC)},
+    {"int2",  1, (FU_ALU2)},
+    {"int3",  1, (FU_ALU3)},
+    {"fp",    3, (FU_FADD|FU_FMUL|FU_FCVT)},
+    {"ld",    2, (FU_LSU01|FU_LSU02)},
+  };
+  extern const byte intercluster_latency_map[MAX_CLUSTERS][MAX_CLUSTERS] = {
+    // I1 I2 I3 FP LD <-to
+    {  0, 0, 0, 2, 0}, // from I1
+    {  0, 0, 0, 2, 0}, // from I2
+    {  0, 0, 0, 2, 0}, // from I3
+    {  2, 2, 2, 0, 2}, // from FP
+    {  0, 0, 0, 2, 0}, // from LD
+  };
+
+  extern const byte intercluster_bandwidth_map[MAX_CLUSTERS][MAX_CLUSTERS] = {
+    // I1 I2 I3 FP LD<-to
+    {  2, 2, 2, 1, 2}, // from I1
+    {  2, 2, 2, 1, 2}, // from I2
+    {  2, 2, 2, 2, 2}, // from I3 -- NOTE(review): FP entry is 2 here but 1 for I1/I2/LD; confirm intended
+    {  1, 1, 1, 2, 1}, // from FP
+    {  2, 2, 2, 1, 2}, // from LD
+  };
+#endif //#ifndef SEPARATE_LSU
+#else //#ifdef DECLARE_STRUCTURES
+  extern const Cluster clusters[MAX_CLUSTERS];
+  extern const byte intercluster_latency_map[MAX_CLUSTERS][MAX_CLUSTERS];
+  extern const byte intercluster_bandwidth_map[MAX_CLUSTERS][MAX_CLUSTERS];
+#endif // #ifdef DECLARE_STRUCTURES
+
+#endif // INSIDE_SMTCORE
+
+  //
+  // This part is used when parsing stats.h to build the
+  // data store template; these must be in sync with the
+  // corresponding definitions elsewhere.
+  // NOTE(review): cluster_names has no "ld" entry for SEPARATE_LSU builds -- confirm intended.
+  static const char* cluster_names[MAX_CLUSTERS] = {"int1", "int2", "int3", "fp"};
+
+  static const char* phys_reg_file_names[PHYS_REG_FILE_COUNT] = {"int", "fp", "st", "br"};
+};
+
+struct PerContextSMTStats { // rootnode:
+  struct fetch {
+    struct stop { // node: summable
+      W64 stalled;
+      W64 icache_miss;
+      W64 fetchq_full;
+      W64 issueq_quota_full;
+      W64 bogus_rip;
+      W64 microcode_assist;
+      W64 branch_taken;
+      W64 full_width;
+    } stop;
+    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
+    W64 width[SMTModel::FETCH_WIDTH+1]; // histo: 0, SMTModel::FETCH_WIDTH, 1
+    W64 blocks;
+    W64 uops;
+    W64 user_insns;
+  } fetch;
+
+  struct frontend {
+    struct status { // node: summable
+      W64 complete;
+      W64 fetchq_empty;
+      W64 rob_full;
+      W64 physregs_full;
+      W64 ldq_full;
+      W64 stq_full;
+    } status;
+    W64 width[SMTModel::FRONTEND_WIDTH+1]; // histo: 0, SMTModel::FRONTEND_WIDTH, 1
+    struct renamed {
+      W64 none;
+      W64 reg;
+      W64 flags;
+      W64 reg_and_flags;
+    } renamed;
+    struct alloc {
+      W64 reg;
+      W64 ldreg;
+      W64 sfr;
+      W64 br;
+    } alloc;
+    // NOTE: This is capped at 255 consumers to keep the size reasonable:
+    W64 consumer_count[256]; // histo: 0, 255, 1
+  } frontend;
+
+  struct dispatch {
+    W64 cluster[SMTModel::MAX_CLUSTERS]; // label: SMTModel::cluster_names
+    struct redispatch {
+      W64 trigger_uops;
+      W64 deadlock_flushes;
+      W64 deadlock_uops_flushed;
+      W64 dependent_uops[SMTModel::ROB_SIZE+1]; // histo: 0, SMTModel::ROB_SIZE, 1
+    } redispatch;
+  } dispatch;
+
+  struct issue {
+    W64 uops;
+    double uipc;
+    struct result { // node: summable
+      W64 no_fu;
+      W64 replay;
+      W64 misspeculated;
+      W64 refetch;
+      W64 branch_mispredict;
+      W64 exception;
+      W64 complete;
+    } result;
+    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
+  } issue;
+
+  struct writeback {
+    W64 writebacks[SMTModel::PHYS_REG_FILE_COUNT]; // label: SMTModel::phys_reg_file_names
+  } writeback;
+
+  struct commit {
+    W64 uops;
+    W64 insns;
+    double uipc;
+    double ipc;
+
+    struct result { // node: summable
+      W64 none;
+      W64 ok;
+      W64 exception;
+      W64 skipblock;
+      W64 barrier;
+      W64 smc;
+      W64 memlocked;
+      W64 stop;
+    } result;
+
+    struct setflags { // node: summable
+      W64 yes;
+      W64 no;
+    } setflags;
+
+    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
+  } commit;
+
+  struct branchpred {
+    W64 predictions;
+    W64 updates;
+
+    // These counters are [0] = mispred, [1] = correct
+    W64 cond[2]; // label: branchpred_outcome_names
+    W64 indir[2]; // label: branchpred_outcome_names
+    W64 ret[2]; // label: branchpred_outcome_names
+    W64 summary[2]; // label: branchpred_outcome_names
+    struct ras { // node: summable
+      W64 pushes;
+      W64 overflows;
+      W64 pops;
+      W64 underflows;
+      W64 annuls;
+    } ras;
+  } branchpred;
+
+  struct dcache {
+    struct load {
+      struct issue { // node: summable
+        W64 complete;
+        W64 miss;
+        W64 exception;
+        W64 ordering;
+        W64 unaligned;
+        struct replay { // node: summable
+          W64 sfr_addr_and_data_not_ready;
+          W64 sfr_addr_not_ready;
+          W64 sfr_data_not_ready;
+          W64 missbuf_full;
+          W64 interlocked;
+          W64 interlock_overflow;
+          W64 fence;
+          W64 bank_conflict;
+        } replay;
+      } issue;
+
+      struct forward { // node: summable
+        W64 cache;
+        W64 sfr;
+        W64 sfr_and_cache;
+      } forward;
+        
+      struct dependency { // node: summable
+        W64 independent;
+        W64 predicted_alias_unresolved;
+        W64 stq_address_match;
+        W64 stq_address_not_ready;
+        W64 fence;
+      } dependency;
+        
+      struct type { // node: summable
+        W64 aligned;
+        W64 unaligned;
+        W64 internal;
+      } type;
+        
+      W64 size[4]; // label: sizeshift_names
+
+      W64 datatype[DATATYPE_COUNT]; // label: datatype_names
+    } load;
+
+    struct store {
+      struct issue { // node: summable
+        W64 complete;
+        W64 exception;
+        W64 ordering;
+        W64 unaligned;
+        struct replay { // node: summable
+          W64 sfr_addr_and_data_not_ready;
+          W64 sfr_addr_not_ready;
+          W64 sfr_data_not_ready;
+          W64 sfr_addr_and_data_and_data_to_store_not_ready;
+          W64 sfr_addr_and_data_to_store_not_ready;
+          W64 sfr_data_and_data_to_store_not_ready;
+          W64 interlocked;
+          W64 fence;
+          W64 parallel_aliasing;
+          W64 bank_conflict;
+        } replay;
+      } issue;
+
+      struct forward { // node: summable
+        W64 zero;
+        W64 sfr;
+      } forward;
+        
+      struct type { // node: summable
+        W64 aligned;
+        W64 unaligned;
+        W64 internal;
+      } type;
+        
+      W64 size[4]; // label: sizeshift_names
+
+      W64 datatype[DATATYPE_COUNT]; // label: datatype_names
+    } store;
+
+    struct fence { // node: summable
+      W64 lfence;
+      W64 sfence;
+      W64 mfence;
+    } fence;
+  } dcache;
+};
+
+//
+// SMT Core
+//
+struct SMTCoreStats { // rootnode:
+  W64 cycles;
+
+  struct dispatch {
+    struct source { // node: summable
+      W64 integer[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+      W64 fp[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+      W64 st[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+      W64 br[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+    } source;
+    W64 width[SMTModel::DISPATCH_WIDTH+1]; // histo: 0, SMTModel::DISPATCH_WIDTH, 1
+  } dispatch;
+
+  struct issue {
+    struct source { // node: summable
+      W64 integer[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+      W64 fp[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+      W64 st[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+      W64 br[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+    } source;
+    struct width {
+      W64 int1[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+      W64 int2[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+      W64 int3[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+      W64 fp[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+#ifdef SEPARATE_LSU
+      W64 ld[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+#endif
+    } width;
+  } issue;
+
+  struct writeback {
+    struct width {
+      W64 int1[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+      W64 int2[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+      W64 int3[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+      W64 fp[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+#ifdef SEPARATE_LSU
+      W64 ld[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+#endif
+    } width;
+  } writeback;
+
+  struct commit {
+    struct freereg { // node: summable
+      W64 pending;
+      W64 free;
+    } freereg;
+
+    W64 free_regs_recycled;
+
+    W64 width[SMTModel::COMMIT_WIDTH+1]; // histo: 0, SMTModel::COMMIT_WIDTH, 1
+  } commit;
+
+  PerContextSMTStats total;
+  PerContextSMTStats vcpu0;
+  PerContextSMTStats vcpu1;
+  PerContextSMTStats vcpu2;
+  PerContextSMTStats vcpu3;
+  PerContextSMTStats vcpu4;
+  PerContextSMTStats vcpu5;
+  PerContextSMTStats vcpu6;
+  PerContextSMTStats vcpu7;
+
+  struct simulator {
+    double total_time;
+    struct cputime { // node: summable
+      double fetch;
+      double decode;
+      double rename;
+      double frontend;
+      double dispatch;
+      double issue;
+      double issueload;
+      double issuestore;
+      double complete;
+      double transfer;
+      double writeback;
+      double commit;
+    } cputime;
+  } simulator;
+};
diff -r 10448c053ad6 smtcore-amd-k8.h
--- a/smtcore-amd-k8.h	Thu May 31 15:36:20 2007 +0200
+++ b/smtcore-amd-k8.h	Wed Nov 05 14:15:51 2008 +0100
@@ -7,13 +7,6 @@
 // Copyright 2003-2006 Matt T. Yourst <yourst@yourst.com>
 // Copyright 2006 Hui Zeng <hzeng@cs.binghamton.edu>
 //
-
-#ifndef _SMTCORE_H_
-#define _SMTCORE_H_
-
-// With these disabled, simulation is faster
-// #define ENABLE_CHECKS
-// #define ENABLE_LOGGING
 
 //
 // Enable SMT operation:
@@ -1590,7 +1583,7 @@
     void check_rob();
   };
 
-#define MAX_SMT_CORES 1
+#define MAX_SMT_CORES 32
 
   struct SMTMachine: public PTLsimMachine {
     SMTCore* cores[MAX_SMT_CORES];
@@ -1944,5 +1937,3 @@
     } cputime;
   } simulator;
 };
-
-#endif // _SMTCORE_H_
diff -r 10448c053ad6 smtcore-generic.h
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/smtcore-generic.h	Wed Nov 05 14:15:51 2008 +0100
@@ -0,0 +1,2086 @@
+// -*- c++ -*-
+//
+// PTLsim: Cycle Accurate x86-64 Simulator
+// SMT Core Simulator Configuration
+//
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
+// Copyright 2003-2006 Matt T. Yourst <yourst@yourst.com>
+// Copyright 2006 Hui Zeng <hzeng@cs.binghamton.edu>
+// Copyright 2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
+//
+
+//
+// Enable SMT operation:
+//
+// Note that this limits some configurations of resources and
+// issue queues that would normally be possible in single
+// threaded mode.
+//
+
+//#define ENABLE_SMT
+
+static const int MAX_THREADS_BIT = 4; // up to 16 threads
+static const int MAX_ROB_IDX_BIT = 12; // up to 4096 ROB entries
+
+#ifdef ENABLE_SMT
+static const int MAX_THREADS_PER_CORE = 4;
+#else
+static const int MAX_THREADS_PER_CORE = 1;
+#endif
+
+//#define ENABLE_SIM_TIMING
+#ifdef ENABLE_SIM_TIMING
+#define time_this_scope(ct) CycleTimerScope ctscope(ct)
+#define start_timer(ct) ct.start()
+#define stop_timer(ct) ct.stop()
+#else
+#define time_this_scope(ct) (0)
+#define start_timer(ct) (0)
+#define stop_timer(ct) (0)
+#endif
+
+#define per_context_smtcore_stats_ref(vcpuid) (*(((PerContextSMTStats*)&stats.smtcore.vcpu0) + (vcpuid))) // indexes from vcpu0; NOTE(review): assumes the vcpuN members are laid out contiguously -- confirm
+#define per_context_smtcore_stats_update(vcpuid, expr) (stats.smtcore.total.expr, per_context_smtcore_stats_ref(vcpuid).expr) // applies expr to the total and to the per-vcpu counters; parenthesized so it stays a single expression
+
+namespace SMTModel {
+  //
+  // Operand formats
+  //
+  static const int MAX_OPERANDS = 4;
+  static const int RA = 0;
+  static const int RB = 1;
+  static const int RC = 2;
+  static const int RS = 3; // (for stores only)
+
+  //
+  // Uop to functional unit mappings
+  //
+  static const int FU_COUNT = 8;
+  static const int LOADLAT = 2;
+
+  enum {
+    FU_LDU0       = (1 << 0),
+    FU_STU0       = (1 << 1),
+    FU_LDU1       = (1 << 2),
+    FU_STU1       = (1 << 3),
+    FU_ALU0       = (1 << 4),
+    FU_FPU0       = (1 << 5),
+    FU_ALU1       = (1 << 6),
+    FU_FPU1       = (1 << 7),
+  };
+
+  static const int LOAD_FU_COUNT = 2;
+
+  static const char* fu_names[FU_COUNT] = { // entry i names the unit whose FU_* mask is (1 << i); static gives each including file its own copy
+    "ldu0",
+    "stu0",
+    "ldu1",
+    "stu1",
+    "alu0",
+    "fpu0",
+    "alu1",
+    "fpu1",
+  };
+
+  //
+  // Opcodes and properties
+  //
+#define ALU0 FU_ALU0
+#define ALU1 FU_ALU1
+#define STU0 FU_STU0
+#define STU1 FU_STU1
+#define LDU0 FU_LDU0
+#define LDU1 FU_LDU1
+#define FPU0 FU_FPU0
+#define FPU1 FU_FPU1
+#define A 1 // ALU latency, assuming fast bypass
+#define L LOADLAT // load latency shorthand for the table below
+
+#define ANYALU (ALU0|ALU1) // masks are parenthesized so they combine safely with any operator
+#define ANYLDU (LDU0|LDU1)
+#define ANYSTU (STU0|STU1)
+#define ANYFPU (FPU0|FPU1)
+#define ANYINT (ANYALU|ANYSTU|ANYLDU)
+
+  struct FunctionalUnitInfo {
+    byte opcode;   // Must match definition in ptlhwdef.h and ptlhwdef.cpp! 
+    byte latency;  // Latency in cycles, assuming ideal bypass
+    W16  fu;       // Map of functional units on which this uop can issue
+  };
+
+  //
+  // WARNING: This table MUST be kept in sync with the table
+  // in ptlhwdef.cpp and the uop enum in ptlhwdef.h!
+  //
+  const FunctionalUnitInfo fuinfo[OP_MAX_OPCODE] = {
+    // name, latency, fumask
+    {OP_nop,            A, ANYINT|ANYFPU},
+    {OP_mov,            A, ANYINT|ANYFPU},
+    // Logical
+    {OP_and,            A, ANYINT|ANYFPU},
+    {OP_andnot,         A, ANYINT|ANYFPU},
+    {OP_xor,            A, ANYINT|ANYFPU},
+    {OP_or,             A, ANYINT|ANYFPU},
+    {OP_nand,           A, ANYINT|ANYFPU},
+    {OP_ornot,          A, ANYINT|ANYFPU},
+    {OP_eqv,            A, ANYINT|ANYFPU},
+    {OP_nor,            A, ANYINT|ANYFPU},
+    // Mask, insert or extract bytes
+    {OP_maskb,          A, ANYINT},
+    // Add and subtract
+    {OP_add,            A, ANYINT},
+    {OP_sub,            A, ANYINT},
+    {OP_adda,           A, ANYINT},
+    {OP_suba,           A, ANYINT},
+    {OP_addm,           A, ANYINT},
+    {OP_subm,           A, ANYINT},
+    // Condition code logical ops
+    {OP_andcc,          A, ANYINT},
+    {OP_orcc,           A, ANYINT},
+    {OP_xorcc,          A, ANYINT},
+    {OP_ornotcc,        A, ANYINT},
+    // Condition code movement and merging
+    {OP_movccr,         A, ANYINT},
+    {OP_movrcc,         A, ANYINT},
+    {OP_collcc,         A, ANYINT},
+    // Simple shifting (restricted to small immediate 1..8)
+    {OP_shls,           A, ANYINT},
+    {OP_shrs,           A, ANYINT},
+    {OP_bswap,          A, ANYINT},
+    {OP_sars,           A, ANYINT},
+    // Bit testing
+    {OP_bt,             A, ANYALU},
+    {OP_bts,            A, ANYALU},
+    {OP_btr,            A, ANYALU},
+    {OP_btc,            A, ANYALU},
+    // Set and select
+    {OP_set,            A, ANYINT},
+    {OP_set_sub,        A, ANYINT},
+    {OP_set_and,        A, ANYINT},
+    {OP_sel,            A, ANYINT},
+    // Branches
+    {OP_br,             A, ANYINT},
+    {OP_br_sub,         A, ANYINT},
+    {OP_br_and,         A, ANYINT},
+    {OP_jmp,            A, ANYINT},
+    {OP_bru,            A, ANYINT},
+    {OP_jmpp,           A, ANYALU|ANYLDU},
+    {OP_brp,            A, ANYALU|ANYLDU},
+    // Checks
+    {OP_chk,            A, ANYINT},
+    {OP_chk_sub,        A, ANYINT},
+    {OP_chk_and,        A, ANYINT},
+    // Loads and stores
+    {OP_ld,             L, ANYLDU},
+    {OP_ldx,            L, ANYLDU},
+    {OP_ld_pre,         1, ANYLDU},
+    {OP_st,             1, ANYSTU},
+    {OP_mf,             1, STU0  },
+    // Shifts, rotates and complex masking
+    {OP_shl,            A, ANYALU},
+    {OP_shr,            A, ANYALU},
+    {OP_mask,           A, ANYALU},
+    {OP_sar,            A, ANYALU},
+    {OP_rotl,           A, ANYALU},  
+    {OP_rotr,           A, ANYALU},   
+    {OP_rotcl,          A, ANYALU},
+    {OP_rotcr,          A, ANYALU},  
+    // Multiplication
+    {OP_mull,           4, ANYFPU},
+    {OP_mulh,           4, ANYFPU},
+    {OP_mulhu,          4, ANYFPU},
+    // Bit scans
+    {OP_ctz,            3, ANYFPU},
+    {OP_clz,            3, ANYFPU},
+    {OP_ctpop,          3, ANYFPU},  
+    {OP_permb,          4, ANYFPU},
+    // Floating point
+    // uop.size bits have following meaning:
+    // 00 = single precision, scalar (preserve high 32 bits of ra)
+    // 01 = single precision, packed (two 32-bit floats)
+    // 1x = double precision, scalar or packed (use two uops to process 128-bit xmm)
+    {OP_addf,           6, ANYFPU},
+    {OP_subf,           6, ANYFPU},
+    {OP_mulf,           6, ANYFPU},
+    {OP_maddf,          6, ANYFPU},
+    {OP_msubf,          6, ANYFPU},
+    {OP_divf,           6, ANYFPU},
+    {OP_sqrtf,          6, ANYFPU},
+    {OP_rcpf,           6, ANYFPU},
+    {OP_rsqrtf,         6, ANYFPU},
+    {OP_minf,           4, ANYFPU},
+    {OP_maxf,           4, ANYFPU},
+    {OP_cmpf,           4, ANYFPU},
+    // For fcmpcc, uop.size bits have following meaning:
+    // 00 = single precision ordered compare
+    // 01 = single precision unordered compare
+    // 10 = double precision ordered compare
+    // 11 = double precision unordered compare
+    {OP_cmpccf,         4, ANYFPU},
+    // and/andn/or/xor are done using integer uops
+    {OP_permf,          3, ANYFPU}, // shuffles
+    // For these conversions, uop.size bits select truncation mode:
+    // x0 = normal IEEE-style rounding
+    // x1 = truncate to zero
+    {OP_cvtf_i2s_ins,   6, ANYFPU},
+    {OP_cvtf_i2s_p,     6, ANYFPU},
+    {OP_cvtf_i2d_lo,    6, ANYFPU},
+    {OP_cvtf_i2d_hi,    6, ANYFPU},
+    {OP_cvtf_q2s_ins,   6, ANYFPU},
+    {OP_cvtf_q2d,       6, ANYFPU},
+    {OP_cvtf_s2i,       6, ANYFPU},
+    {OP_cvtf_s2q,       6, ANYFPU},
+    {OP_cvtf_s2i_p,     6, ANYFPU},
+    {OP_cvtf_d2i,       6, ANYFPU},
+    {OP_cvtf_d2q,       6, ANYFPU},
+    {OP_cvtf_d2i_p,     6, ANYFPU},
+    {OP_cvtf_d2s_ins,   6, ANYFPU},
+    {OP_cvtf_d2s_p,     6, ANYFPU},
+    {OP_cvtf_s2d_lo,    6, ANYFPU},
+    {OP_cvtf_s2d_hi,    6, ANYFPU},
+    {OP_acq,            A, ANYINT},
+    {OP_com,            A, ANYINT|ANYFPU},
+  };
+
+#undef A
+#undef L
+#undef F
+
+#undef ALU0
+#undef ALU1
+#undef STU0
+#undef STU1
+#undef LDU0
+#undef LDU1
+#undef FPU0
+#undef FPU1
+#undef L
+
+#undef ANYALU
+#undef ANYLDU
+#undef ANYSTU
+#undef ANYFPU
+#undef ANYINT
+  
+  //
+  // Global limits
+  //
+  
+  const int MAX_ISSUE_WIDTH = 4;
+  
+  const int PHYS_REG_FILE_SIZE = 128*MAX_THREADS_PER_CORE; // scales with the number of threads per core
+  const int PHYS_REG_NULL = 0; // index that PhysicalRegister::nonnull() compares against
+  // Largest size of any physical register file or the store queue:
+  /* S.D.: getting this maximum of constants during compile-time doesn't work :(
+    static const int _tmp = max(STQ_SIZE * MAX_THREADS_PER_CORE, MAX_BRANCHES_IN_FLIGHT * MAX_THREADS_PER_CORE);
+    const int MAX_PHYS_REG_FILE_SIZE = max(PHYS_REG_FILE_SIZE, _tmp);
+   */
+  const int MAX_PHYS_REG_FILE_SIZE = 128*MAX_THREADS_PER_CORE; // hand-maintained maximum (see the disabled code above); it is also the array extent of PhysicalRegisterFile
+  //
+  // IMPORTANT! If you change this (presumably ROB_SIZE, defined just below) to be
+  // greater than 256, you MUST #define BIG_ROB below to use the correct
+  // associative search logic (16-bit tags vs 8-bit tags).
+  //
+  // SMT always has BIG_ROB enabled: high 4 bits are used for thread id
+  //
+#define BIG_ROB
+
+  const int ROB_SIZE = 128;
+  
+  // Maximum number of branches in the pipeline at any given time
+  const int MAX_BRANCHES_IN_FLIGHT = 16;
+
+  // Set this to combine the integer and FP phys reg files:
+  // #define UNIFIED_INT_FP_PHYS_REG_FILE
+  
+#ifdef UNIFIED_INT_FP_PHYS_REG_FILE
+  // unified, br, st
+  const int PHYS_REG_FILE_COUNT = 3;
+#else
+  // int, fp, br, st
+  const int PHYS_REG_FILE_COUNT = 4;
+#endif
+  
+  //
+  // Load and Store Queues
+  //
+  const int LDQ_SIZE = 48;
+  const int STQ_SIZE = 32; // LSQ_SIZE (defined further down in this header) is LDQ_SIZE + STQ_SIZE
+
+  //
+  // Fetch
+  //
+  const int FETCH_QUEUE_SIZE = 32; // capacity of ThreadContext::fetchq
+  const int FETCH_WIDTH = 4;
+
+  //
+  // Frontend (Rename and Decode)
+  //
+  const int FRONTEND_WIDTH = 4;
+  const int FRONTEND_STAGES = 5;
+
+  //
+  // Dispatch
+  //
+  const int DISPATCH_WIDTH = 4;
+
+  //
+  // Writeback
+  //
+  const int WRITEBACK_WIDTH = 4;
+
+  //
+  // Commit
+  //
+  const int COMMIT_WIDTH = 4;
+
+  //
+  // Clustering, Issue Queues and Bypass Network
+  //
+  const int MAX_FORWARDING_LATENCY = 2; // forward_at_cycle_lut has MAX_FORWARDING_LATENCY+1 columns per cluster
+
+#define MULTI_IQ
+
+#ifdef ENABLE_SMT
+  //
+  // Multiple issue queues are currently only supported in
+  // the non-SMT configuration, due to ambiguities in the
+  // ICOUNT SMT heuristic when multiple queues are active.
+  //
+#undef MULTI_IQ
+#endif
+
+#ifdef MULTI_IQ
+  const int MAX_CLUSTERS = 4; // matches the four lists set up by InitClusteredROBList (int0, int1, ld, fp)
+#else
+  const int MAX_CLUSTERS = 1; // single cluster (the "-all" list in InitClusteredROBList)
+#endif
+
+  enum { PHYSREG_NONE, PHYSREG_FREE, PHYSREG_WAITING, PHYSREG_BYPASS, PHYSREG_WRITTEN, PHYSREG_ARCH, PHYSREG_PENDINGFREE, MAX_PHYSREG_STATE }; // MAX_PHYSREG_STATE is the number of states and sizes the tables below
+  static const char* physreg_state_names[MAX_PHYSREG_STATE] = {"none", "free", "waiting", "bypass", "written", "arch", "pendingfree"}; // indexed by the enum above; keep both in the same order
+  static const char* short_physreg_state_names[MAX_PHYSREG_STATE] = {"-", "free", "wait", "byps", "wrtn", "arch", "pend"}; // abbreviated names, same indexing
+
+#ifdef INSIDE_SMTCORE
+
+  struct SMTCore;
+  SMTCore& coreof(int coreid); // lookup used by the getcore() helpers throughout this header
+
+  struct ReorderBufferEntry;
+
+  //
+  // Issue queue based scheduler with broadcast
+  //
+#ifdef BIG_ROB
+  typedef W16 issueq_tag_t; // 16-bit tags (BIG_ROB is #defined unconditionally earlier in this header)
+#else
+  typedef byte issueq_tag_t; // 8-bit tags
+#endif
+
+  template <int size, int operandcount = MAX_OPERANDS>
+  struct IssueQueue { // issue queue with "size" slots and "operandcount" operand tags per slot; most operations are defined out of line
+#ifdef BIG_ROB
+    typedef FullyAssociativeTags16bit<size, size> assoc_t;
+    typedef vec8w vec_t;
+#else
+    typedef FullyAssociativeTags8bit<size, size> assoc_t;
+    typedef vec16b vec_t;
+#endif
+
+    typedef issueq_tag_t tag_t;
+
+    static const int SIZE = size;
+
+    assoc_t uopids; // uop id held in each slot; read by uopof() and searched by slotof()
+    assoc_t tags[operandcount]; // one associative tag array per operand position
+
+    // States:
+    //             V I
+    // free        0 0
+    // dispatched  1 0
+    // issued      1 1
+    // complete    0 1
+
+    bitvec<size> valid;
+    bitvec<size> issued; // the per-slot bit is cleared again by replay(int)
+    bitvec<size> allready;
+    int count;
+    byte coreid; // handed to coreof() by getcore()
+    int shared_entries;
+    int reserved_entries;
+    // Shared/reserved entry accounting: the asserts below keep shared_entries within [0, size - reserved_entries].
+    void set_reserved_entries(int num) { reserved_entries = num; }
+    bool reset_shared_entries() { 
+      shared_entries = size - reserved_entries; 
+      return true;
+    }
+    bool alloc_reserved_entry() { // NOTE(review): despite the name, this decrements shared_entries - confirm intent with callers
+      assert(shared_entries > 0);
+      shared_entries--;
+      return true;
+    }
+    bool free_shared_entry() {
+      assert(shared_entries < size - reserved_entries);
+      shared_entries++;
+      return true;
+    }    
+    bool shared_empty() {
+      return (shared_entries == 0);
+    }
+
+    bool remaining() const { return (size - count); } // NOTE(review): the bool return type means this reports only whether any slot is free, not how many
+    bool empty() const { return (!count); }
+    bool full() const { return (!remaining()); }
+
+    int uopof(int slot) const {
+      return uopids[slot];
+    }
+
+    int slotof(int uopid) const {
+      return uopids.search(uopid); // annuluop() treats a negative result as "not found"
+    }
+
+    void reset(int coreid);
+    void reset(int coreid, int threadid);
+    void clock();
+    bool insert(tag_t uopid, const tag_t* operands, const tag_t* preready);
+    bool broadcast(tag_t uopid);
+    int issue();
+    bool replay(int slot, const tag_t* operands, const tag_t* preready);
+    bool switch_to_end(int slot, const tag_t* operands, const tag_t* preready);
+    bool remove(int slot);
+
+    ostream& print(ostream& os) const;
+    void tally_broadcast_matches(tag_t sourceid, const bitvec<size>& mask, int operand) const;
+
+    //
+    // Replay a uop that has already issued once.
+    // The caller may add or reset dependencies here as needed.
+    //
+    bool replay(int slot) {
+      issued[slot] = 0; // only the issued bit is cleared; always reports success
+      return true;
+    }
+
+    //
+    // Remove an entry from the issue queue after it has completed,
+    // or in the process of annulment.
+    //
+    bool release(int slot) {
+      remove(slot); // the result of remove() is ignored
+      return true;
+    }
+
+    bool annul(int slot) {
+      remove(slot); // same as release(): result ignored, always returns true
+      return true;
+    }
+
+    bool annuluop(int uopid) {
+      int slot = slotof(uopid);
+      if (slot < 0) return false; // uop is not in this queue
+      remove(slot);
+      return true;
+    }
+
+    SMTCore& getcore() const { return coreof(coreid); }
+  };
+
+  template <int size, int operandcount>
+  static inline ostream& operator <<(ostream& os, const IssueQueue<size, operandcount>& issueq) {
+    return issueq.print(os); // stream insertion simply forwards to IssueQueue::print()
+  }
+
+  //
+  // Iterate through a linked list of objects where each object directly inherits
+  // only from the selfqueuelink class or otherwise has a selfqueuelink object
+  // as the first member.
+  //
+  // This iterator supports mutable lists, meaning the current entry (obj) may
+  // be safely removed from the list and/or moved to some other list without
+  // affecting the next object processed.
+  // (nextentry is read from entry->next before the loop body runs.)
+  // This does NOT mean you can remove any object from the list other than the
+  // current object obj - to do this, copy the list of pointers to an array and
+  // then process that instead.
+  // The macro declares "entry" and "nextentry" in the enclosing scope and sets obj by casting entry.
+#define foreach_list_mutable_linktype(L, obj, entry, nextentry, linktype) \
+  linktype* entry; \
+  linktype* nextentry; \
+  for (entry = (L).next, nextentry = entry->next, prefetch(entry->next), obj = (typeof(obj))entry; \
+    entry != &(L); entry = nextentry, nextentry = entry->next, prefetch(nextentry), obj = (typeof(obj))entry)
+
+#define foreach_list_mutable(L, obj, entry, nextentry) foreach_list_mutable_linktype(L, obj, entry, nextentry, selfqueuelink) // same loop with the link type fixed to selfqueuelink
+
+  struct StateList;
+
+  struct ListOfStateLists: public array<StateList*, 64> {
+    int count; // starts at zero; maintained by the out-of-line add() / reset()
+    // A freshly constructed registry holds no lists.
+    ListOfStateLists(): count(0) { }
+    // add() and reset() are implemented out of line.
+    int add(StateList* list);
+    void reset();
+  };
+
+  struct StateList: public selfqueuelink {
+    char* name;
+    int count;
+    int listid;
+    W64 dispatch_source_counter;
+    W64 issue_source_counter;
+    W32 flags;
+
+    StateList(): count(0), listid(0) { } // only count and listid are set here
+
+    void init(const char* name, ListOfStateLists& lol, W32 flags = 0);
+
+    StateList(const char* name, ListOfStateLists& lol, W32 flags = 0) { init(name, lol, flags); } // construct and init() in one step
+
+    // Call-operator form of init(), usable on array elements after construction:
+    StateList& operator ()(const char* name, ListOfStateLists& lol, W32 flags = 0) {
+      init(name, lol, flags);
+      return *this;
+    }
+
+    void reset();
+
+    // Take the head entry off the list; returns null when the list is empty.
+    selfqueuelink* dequeue() {
+      if (empty()) return null;
+      count--;
+      assert(count >=0);
+      return removehead();
+    }
+
+    // Link entry through addtail() on this list and count it.
+    selfqueuelink* enqueue(selfqueuelink* entry) {
+      entry->addtail(this);
+      ++count;
+      return entry;
+    }
+
+    // Link entry through addhead() relative to preventry, or to this list itself when preventry is null.
+    selfqueuelink* enqueue_after(selfqueuelink* entry, selfqueuelink* preventry) {
+      selfqueuelink* anchor = preventry ? preventry : this;
+      entry->addhead(anchor);
+      ++count;
+      return entry;
+    }
+
+    // Unlink an entry that must currently be linked, and uncount it.
+    selfqueuelink* remove(selfqueuelink* entry) {
+      assert(entry->linked());
+      entry->unlink();
+      count--;
+      assert(count >=0);
+      return entry;
+    }
+
+    // Look at the head entry without removing it; null when the list is empty.
+    selfqueuelink* peek() { if (empty()) return null; return head(); }
+
+    void checkvalid();
+  };
+
+  template <typename T> 
+  static void print_list_of_state_lists(ostream& os, const ListOfStateLists& lol, const char* title); // declaration only; no definition appears in this part of the header
+
+  //
+  // Fetch Buffers
+  //
+  struct BranchPredictorUpdateInfo: public PredictorUpdate { // stored per uop as FetchBufferEntry::predinfo
+    int stack_recover_idx;
+    int bptype;
+    W64 ripafter;
+  };
+
+  struct FetchBufferEntry: public TransOp {
+    RIPVirtPhys rip;
+    W64 uuid;
+    uopimpl_func_t synthop;
+    BranchPredictorUpdateInfo predinfo;
+    W16 index;
+    W8 threadid;
+    byte ld_st_truly_unaligned;
+    // init() stores the slot number and always returns 0.
+    int init(int index) { this->index = index; return 0; }
+    void validate() { } // no-op
+
+    // The default constructor performs no explicit member initialization.
+    FetchBufferEntry() { }
+
+    // Copies only the TransOp base part of the entry; the members declared above are not set here.
+    FetchBufferEntry(const TransOp& transop) { static_cast<TransOp&>(*this) = transop; }
+  };
+
+  //
+  // Forward declarations (referred to by pointer or reference in the structs that follow)
+  struct ThreadContext;
+  struct SMTCore;
+  struct PhysicalRegister;
+  struct LoadStoreQueueEntry;
+  struct SMTCoreEvent;
+  struct LLBLine;
+  //
+  // Reorder Buffer (ROB) structure, used for tracking all uops in flight.
+  // This same structure is used to represent both dispatched but not yet issued 
+  // uops as well as issued uops.
+  //
+  struct ReorderBufferEntry: public selfqueuelink {
+    FetchBufferEntry uop;
+    struct StateList* current_state_list; // list this entry is currently linked into; maintained by changestate()
+    PhysicalRegister* physreg;
+    PhysicalRegister* operands[MAX_OPERANDS];
+    LoadStoreQueueEntry* lsq; // may be null (SMTCoreEvent::fill() checks it before use)
+    W16s idx; // returned by index()
+    W16s cycles_left; // execution latency counter, decremented every cycle when executing
+    W16s forward_cycle; // forwarding cycle after completion
+    W16s lfrqslot;
+    W16s iqslot;
+    W16  executable_on_cluster_mask;
+    W8s  cluster;
+    W8   coreid; // handed to coreof() by getcore()
+
+    W8   threadid;
+    byte fu;
+    byte consumer_count;
+    PTEUpdate pteupdate;
+    Waddr origvirt; // original virtual address, with low bits
+    Waddr virtpage; // virtual page number actually accessed by the load or store
+    byte entry_valid:1, load_store_second_phase:1, all_consumers_off_bypass:1, dest_renamed_before_writeback:1, no_branches_between_renamings:1, transient:1, lock_acquired:1, issued:1;
+    byte tlb_walk_level;
+
+    int index() const { return idx; }
+    void validate() { entry_valid = true; }
+    // Move this entry from its current list (if any) to newqueue: appended via enqueue(), or linked via enqueue_after(this, prevrob) when place_at_head is set.
+    void changestate(StateList& newqueue, bool place_at_head = false, ReorderBufferEntry* prevrob = null) {
+      if (current_state_list)
+        current_state_list->remove(this);
+      current_state_list = &newqueue;
+      if (place_at_head) newqueue.enqueue_after(this, prevrob); else newqueue.enqueue(this);
+    }
+
+    void init(int idx);
+    void reset();
+    bool ready_to_issue() const;
+    bool ready_to_commit() const;
+    StateList& get_ready_to_issue_list() const;
+    bool find_sources();
+    int forward();
+    int select_cluster();
+    int issue();
+    void* addrgen(LoadStoreQueueEntry& state, Waddr& origaddr, Waddr& virtpage, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate, Waddr& addr, int& exception, PageFaultErrorCode& pfec, bool& annul);
+    bool handle_common_load_store_exceptions(LoadStoreQueueEntry& state, Waddr& origaddr, Waddr& addr, int& exception, PageFaultErrorCode& pfec);
+    int issuestore(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, bool rcready, PTEUpdate& pteupdate);
+    int issueload(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate);
+    int probecache(Waddr addr, LoadStoreQueueEntry* sfra);
+    void tlbwalk();
+    int issuefence(LoadStoreQueueEntry& state);
+    void issueprefetch(IssueState& state, W64 ra, W64 rb, W64 rc, int cachelevel, PTEUpdate& pteupdate);
+    void release();
+    W64 annul(bool keep_misspec_uop, bool return_first_annulled_rip = false);
+    W64 annul_after() { return annul(true); } // keep_misspec_uop = true
+    W64 annul_after_and_including() { return annul(false); } // keep_misspec_uop = false
+    int commit();
+    void replay();
+    void replay_locked();
+    int pseudocommit();
+    void redispatch(const bitvec<MAX_OPERANDS>& dependent_operands, ReorderBufferEntry* prevrob);
+    void redispatch_dependents(bool inclusive = true);
+    void loadwakeup();
+    void fencewakeup();
+    LoadStoreQueueEntry* find_nearest_memory_fence();
+    bool release_mem_lock(bool forced = false);
+    ostream& print(ostream& os) const;
+    stringbuf& get_operand_info(stringbuf& sb, int operand) const;
+    ostream& print_operand_info(ostream& os, int operand) const;
+
+    SMTCore& getcore() const { return coreof(coreid); }
+
+    ThreadContext& getthread() const;
+    issueq_tag_t get_tag(); // presumably the inverse of decode_tag() defined after this struct - confirm in the implementation
+
+    // ASF-related things
+    LLBLine* llbline;
+    int commit_asf_instruction();
+    int issueasf(IssueState& state, W64 rbdata);
+    void abort_asf();
+  };
+
+  // Split an issue queue tag: thread id = bits from MAX_ROB_IDX_BIT upwards, ROB index = the low MAX_ROB_IDX_BIT bits.
+  static inline void decode_tag(issueq_tag_t tag, int& threadid, int& idx) { // static inline like the other header-defined helpers, so each including file gets an internal copy
+    threadid = tag >> MAX_ROB_IDX_BIT;
+    idx = tag & ((1 << MAX_ROB_IDX_BIT) - 1); // equals the former ((1 << (MAX_ROB_IDX_BIT + MAX_THREADS_BIT)) - 1) >> MAX_THREADS_BIT
+  }
+
+  static inline ostream& operator <<(ostream& os, const ReorderBufferEntry& rob) {
+    return rob.print(os); // stream insertion simply forwards to ReorderBufferEntry::print()
+  }
+
+  //
+  // Load/Store Queue
+  // LSQ_SIZE is the combined number of load queue and store queue entries.
+#define LSQ_SIZE (LDQ_SIZE + STQ_SIZE)
+
+  // Define this to allow speculative issue of loads before unresolved stores
+#define SMT_ENABLE_LOAD_HOISTING
+
+  struct LoadStoreQueueEntry: public SFR {
+    ReorderBufferEntry* rob;
+    W16 idx;
+    byte coreid;
+    W8s mbtag;
+    W8 store:1, lfence:1, sfence:1, entry_valid:1;
+    W32 padding;
+
+    LoadStoreQueueEntry() { }
+
+    int index() const { return idx; }
+
+    // Zero the whole entry with setzero() while keeping idx; mbtag is then set back to -1.
+    void reset() {
+      const W16 saved_idx = idx;
+      setzero(*this);
+      idx = saved_idx;
+      mbtag = -1;
+    }
+    // Assign the slot number, then reset() (which preserves it).
+    void init(int idx) { this->idx = idx; reset(); }
+
+    void validate() { entry_valid = 1; }
+
+    ostream& print(ostream& os) const;
+
+    // Copy only the SFR base part from sfr; the queue-specific members above keep their values.
+    LoadStoreQueueEntry& operator =(const SFR& sfr) {
+      static_cast<SFR&>(*this) = sfr;
+      return *this;
+    }
+
+    // NOTE(review): reset() also zeroes coreid through setzero(); presumably the owner assigns it again - confirm.
+    SMTCore& getcore() const { return coreof(coreid); }
+  };
+
+  static inline ostream& operator <<(ostream& os, const LoadStoreQueueEntry& lsq) {
+    return lsq.print(os); // stream insertion simply forwards to LoadStoreQueueEntry::print()
+  }
+
+  struct PhysicalRegisterOperandInfo { // plain record; filled by PhysicalRegister::fill_operand_info() and embedded in several SMTCoreEvent union members
+    W32 uuid;
+    W16 physreg;
+    W16 rob;
+    byte state;
+    byte rfid;
+    byte archreg;
+    byte pad1;
+  };
+
+  ostream& operator <<(ostream& os, const PhysicalRegisterOperandInfo& opinfo); // declared here, defined out of line
+
+  //
+  // Physical Register File
+  //
+
+  struct PhysicalRegister: public selfqueuelink {
+    ReorderBufferEntry* rob;
+    W64 data;
+    W16 flags;
+    W16 idx;
+    W8  coreid;
+    W8  rfid;
+    W8  state;
+    W8  archreg;
+    W8  all_consumers_sourced_from_bypass:1;
+    W16s refcount;
+    W8 threadid;
+
+    StateList& get_state_list(int state) const;
+    StateList& get_state_list() const { return get_state_list(state); }
+    // Leave the list of the current state (unless that state is PHYSREG_NONE), then join the list of newstate.
+    void changestate(int newstate) {
+      if likely (state != PHYSREG_NONE) get_state_list().remove(this);
+      state = newstate;
+      get_state_list().enqueue(this);
+    }
+    // Record the identifying fields, then run the private reset().
+    void init(int coreid, int rfid, int idx) {
+      this->idx = idx;
+      this->rfid = rfid;
+      this->coreid = coreid;
+      reset();
+    }
+
+  private:
+    void addref() { ++refcount; }
+    void unref() {
+      --refcount;
+      assert((idx == 0) || (refcount >= 0));
+    }
+
+  public:
+    // Public reference hooks: the arguments are accepted but unused; every variant adjusts the single refcount.
+    void addref(const ReorderBufferEntry& rob, W8 threadid) { addref(); }
+    void unref(const ReorderBufferEntry& rob, W8 threadid) { unref(); }
+    void addspecref(int archreg, W8 threadid) { addref(); }
+    void unspecref(int archreg, W8 threadid) { unref(); }
+    void addcommitref(int archreg, W8 threadid) { addref(); }
+    void uncommitref(int archreg, W8 threadid) { unref(); }
+
+    bool referenced() const { return refcount > 0; }
+    bool nonnull() const { return idx != PHYS_REG_NULL; }
+    bool allocated() const { return state != PHYSREG_FREE; }
+    void commit() { changestate(PHYSREG_ARCH); }
+    void complete() { changestate(PHYSREG_BYPASS); }
+    void writeback() { changestate(PHYSREG_WRITTEN); }
+    // Switch to PHYSREG_FREE first, then clear the ownership fields (threadid 0xff, no rob, zero references).
+    void free() {
+      changestate(PHYSREG_FREE);
+      rob = 0;
+      refcount = 0;
+      threadid = 0xff;
+      all_consumers_sourced_from_bypass = 1;
+    }
+
+  private:
+    void reset() {
+      selfqueuelink::reset();
+      state = PHYSREG_NONE;
+      free();
+    }
+
+  public:
+    void reset(W8 threadid, bool check_id = true) {
+      if (check_id) {
+        if (this->threadid != threadid) return;
+      } else {
+        selfqueuelink::reset();
+        state = PHYSREG_NONE;
+      }
+      free();
+    }
+
+    int index() const { return idx; }
+    bool valid() const { return !(flags & FLAG_INV); }
+    bool ready() const { return !(flags & FLAG_WAIT); }
+
+    void fill_operand_info(PhysicalRegisterOperandInfo& opinfo);
+
+    SMTCore& getcore() const { return coreof(coreid); }
+  };
+
+  ostream& operator <<(ostream& os, const PhysicalRegister& physreg);
+
+  struct PhysicalRegisterFile: public array<PhysicalRegister, MAX_PHYS_REG_FILE_SIZE> { // storage extent is always MAX_PHYS_REG_FILE_SIZE; "size" below is presumably the number in use (passed to init()) - confirm
+    byte coreid;
+    byte rfid;
+    W16 size;
+    const char* name;
+    StateList states[MAX_PHYSREG_STATE]; // one list per PHYSREG_* state
+    W64 allocations;
+    W64 frees;
+
+    PhysicalRegisterFile() { }
+
+    PhysicalRegisterFile(const char* name, int coreid, int rfid, int size) {
+      init(name, coreid, rfid, size); reset();
+    }
+    // Call-operator form of the constructor above: init() followed by the private reset().
+    PhysicalRegisterFile& operator ()(const char* name, int coreid, int rfid, int size) {
+      init(name, coreid, rfid, size); reset(); return *this;
+    }
+
+    void init(const char* name, int coreid, int rfid, int size);
+    bool remaining() const { return (!states[PHYSREG_FREE].empty()); } // true while the PHYSREG_FREE list still has an entry
+   
+    PhysicalRegister* alloc(W8 threadid, int r = -1);
+    void reset(W8 threadid);
+    ostream& print(ostream& os) const;
+
+    SMTCore& getcore() const { return coreof(coreid); }
+
+  private:
+    void reset(); // argument-free reset is private; outside callers use reset(W8 threadid)
+  };
+
+  static inline ostream& operator <<(ostream& os, const PhysicalRegisterFile& physregs) {
+    return physregs.print(os); // stream insertion simply forwards to PhysicalRegisterFile::print()
+  }
+
+  //
+  // Register Rename Table
+  //
+  struct RegisterRenameTable: public array<PhysicalRegister*, TRANSREG_COUNT> { // one PhysicalRegister pointer per TRANSREG_COUNT slot
+#ifdef ENABLE_TRANSIENT_VALUE_TRACKING
+    bitvec<TRANSREG_COUNT> renamed_in_this_basic_block; // only present when transient value tracking is compiled in
+#endif
+    ostream& print(ostream& os) const;
+  };
+
+  static inline ostream& operator <<(ostream& os, const RegisterRenameTable& rrt) {
+    return rrt.print(os); // stream insertion simply forwards to RegisterRenameTable::print()
+  }
+
+  enum { // issue outcome codes; only ISSUE_COMPLETED is positive
+    ISSUE_COMPLETED = 1,      // issued correctly
+    ISSUE_NEEDS_REPLAY = 0,   // fast scheduling replay
+    ISSUE_MISSPECULATED = -1, // mis-speculation: redispatch dependent slice
+    ISSUE_NEEDS_REFETCH = -2, // refetch from RIP of bad insn
+  };
+
+  enum { // commit outcome codes
+    COMMIT_RESULT_NONE = 0,   // no instructions committed: some uops not ready
+    COMMIT_RESULT_OK = 1,     // committed
+    COMMIT_RESULT_EXCEPTION = 2, // exception
+    COMMIT_RESULT_BARRIER = 3,// barrier; branch to microcode (brp uop)
+    COMMIT_RESULT_SMC = 4,    // self modifying code detected
+    COMMIT_RESULT_INTERRUPT = 5, // interrupt pending
+    COMMIT_RESULT_STOP = 6    // stop processor model (shutdown)
+  };
+
+  // Branch predictor outcomes:
+  enum { MISPRED = 0, CORRECT = 1 };
+
+  //
+  // Lookup tables (LUTs):
+  //
+  struct Cluster { // one row of the clusters[] table declared below
+    char* name;
+    W16 issue_width;
+    W32 fu_mask;
+  };
+
+  extern const Cluster clusters[MAX_CLUSTERS];
+  extern byte uop_executable_on_cluster[OP_MAX_OPCODE]; // one entry per opcode
+  extern W32 forward_at_cycle_lut[MAX_CLUSTERS][MAX_FORWARDING_LATENCY+1]; // indexed [cluster][cycle], cycle in 0..MAX_FORWARDING_LATENCY
+  extern const byte archdest_can_commit[TRANSREG_COUNT];
+  extern const byte archdest_is_visible[TRANSREG_COUNT];
+
+  struct SMTMachine;
+
+  struct SMTCoreCacheCallbacks: public CacheSubsystem::PerCoreCacheCallbacks { // callback object bound to one SMTCore
+    SMTCore& core; // reference member: the core must outlive this object
+    SMTCoreCacheCallbacks(SMTCore& core_): core(core_) { }
+    virtual void dcache_wakeup(LoadStoreInfo lsi, W64 physaddr); // defined out of line
+    virtual void icache_wakeup(LoadStoreInfo lsi, W64 physaddr); // defined out of line
+  };
+
+  struct MemoryInterlockEntry { // value type stored in MemoryInterlockBuffer (keyed by W64)
+    W64 uuid;
+    W16 rob;
+    byte vcpuid;
+    W8 threadid;
+
+    void reset() { uuid = 0; rob = 0; vcpuid = 0; threadid = 0;}
+    // NOTE(review): the ", value" chaining below presumably relies on an overloaded comma operator for ostream defined elsewhere; with the built-in comma only "phys " would be written.
+    ostream& print(ostream& os, W64 physaddr) const {
+      os << "phys ", (void*)physaddr, ": vcpu ", vcpuid, ", threadid ", threadid, ", uuid ", uuid, ", rob ", rob;
+      return os;
+    }
+  };
+
+  struct MemoryInterlockBuffer: public LockableAssociativeArray<W64, MemoryInterlockEntry, 16, 4, 8> { }; // adds nothing to the base; it only names this particular instantiation
+
+  extern MemoryInterlockBuffer interlocks; // single shared instance, defined out of line
+
+  //
+  // Event Tracing
+  //
+  enum { // event type codes; stored in SMTCoreEvent::type by SMTCoreEvent::fill()
+    EVENT_INVALID = 0, // the only explicitly numbered value; the rest count up from here
+    EVENT_FETCH_STALLED,
+    EVENT_FETCH_ICACHE_WAIT,
+    EVENT_FETCH_FETCHQ_FULL,
+    EVENT_FETCH_IQ_QUOTA_FULL,
+    EVENT_FETCH_BOGUS_RIP,
+    EVENT_FETCH_ICACHE_MISS,
+    EVENT_FETCH_SPLIT,
+    EVENT_FETCH_ASSIST,
+    EVENT_FETCH_TRANSLATE,
+    EVENT_FETCH_OK,
+    EVENT_RENAME_FETCHQ_EMPTY,
+    EVENT_RENAME_ROB_FULL,
+    EVENT_RENAME_PHYSREGS_FULL,
+    EVENT_RENAME_LDQ_FULL,
+    EVENT_RENAME_STQ_FULL,
+    EVENT_RENAME_MEMQ_FULL,
+    EVENT_RENAME_OK,
+    EVENT_FRONTEND,
+    EVENT_CLUSTER_NO_CLUSTER,
+    EVENT_CLUSTER_OK,
+    EVENT_DISPATCH_NO_CLUSTER,
+    EVENT_DISPATCH_DEADLOCK,
+    EVENT_DISPATCH_OK,
+    EVENT_ISSUE_NO_FU,
+    EVENT_ISSUE_OK,
+    EVENT_REPLAY,
+    EVENT_STORE_EXCEPTION,
+    EVENT_STORE_WAIT,
+    EVENT_STORE_PARALLEL_FORWARDING_MATCH,
+    EVENT_STORE_ALIASED_LOAD,
+    EVENT_STORE_ISSUED,
+    EVENT_STORE_LOCK_RELEASED,
+    EVENT_STORE_LOCK_ANNULLED,
+    EVENT_STORE_LOCK_REPLAY,
+    EVENT_LOAD_EXCEPTION,
+    EVENT_LOAD_WAIT,
+    EVENT_LOAD_HIGH_ANNULLED,
+    EVENT_LOAD_HIT,
+    EVENT_LOAD_MISS,
+    EVENT_LOAD_BANK_CONFLICT,
+    EVENT_LOAD_TLB_MISS,
+    EVENT_LOAD_LOCK_REPLAY,
+    EVENT_LOAD_LOCK_OVERFLOW,
+    EVENT_LOAD_LOCK_ACQUIRED,
+    EVENT_LOAD_LFRQ_FULL,
+    EVENT_LOAD_WAKEUP,
+    EVENT_TLBWALK_HIT,
+    EVENT_TLBWALK_MISS,
+    EVENT_TLBWALK_WAKEUP,
+    EVENT_TLBWALK_NO_LFRQ_MB,
+    EVENT_TLBWALK_COMPLETE,
+    EVENT_FENCE_ISSUED,
+    EVENT_ALIGNMENT_FIXUP,
+    EVENT_ANNUL_NO_FUTURE_UOPS,
+    EVENT_ANNUL_MISSPECULATION,
+    EVENT_ANNUL_EACH_ROB,
+    EVENT_ANNUL_PSEUDOCOMMIT,
+    EVENT_ANNUL_FETCHQ_RAS,
+    EVENT_ANNUL_FETCHQ,
+    EVENT_ANNUL_FLUSH,
+    EVENT_REDISPATCH_DEPENDENTS,
+    EVENT_REDISPATCH_DEPENDENTS_DONE,
+    EVENT_REDISPATCH_EACH_ROB,
+    EVENT_COMPLETE,
+    EVENT_BROADCAST,
+    EVENT_FORWARD,
+    EVENT_WRITEBACK,
+    EVENT_COMMIT_FENCE_COMPLETED,
+    EVENT_COMMIT_EXCEPTION_DETECTED,
+    EVENT_COMMIT_EXCEPTION_ACKNOWLEDGED,
+    EVENT_COMMIT_SKIPBLOCK,
+    EVENT_COMMIT_SMC_DETECTED,
+    EVENT_COMMIT_MEM_LOCKED,
+    EVENT_COMMIT_ASSIST,
+    EVENT_COMMIT_OK,
+    EVENT_RECLAIM_PHYSREG,
+    EVENT_RELEASE_MEM_LOCK,
+  };
+
+  //
+  // Event that gets written to the trace buffer
+  //
+  // In the interest of minimizing space, the cycle counters
+  // and uuids are only 32-bits; in practice wraparound is
+  // not likely to be a problem.
+  //
+  struct SMTCoreEvent { // common header fields first, then an anonymous union with one member per event category
+    W32 cycle;
+    W32 uuid;
+    RIPVirtPhysBase rip;
+    TransOpBase uop;
+    W16 rob;
+    W16 physreg;
+    W16 lsq;
+    W16 type; // one of the EVENT_* codes
+    W16s lfrqslot;
+    byte rfid;
+    byte cluster;
+    byte fu;
+    W8 threadid;
+    W32 issueq_count;
+    // Base fill: stamps the type and the current sim_cycle, clears uuid and sets threadid to 0xff; no other field is touched.
+    SMTCoreEvent* fill(int type) {
+      this->type = type;
+      cycle = sim_cycle;
+      uuid = 0;
+      threadid = 0xff;
+      return this;
+    }
+    // Base fill plus the identifying fields of a fetched uop.
+    SMTCoreEvent* fill(int type, const FetchBufferEntry& uop) {
+      fill(type);
+      uuid = uop.uuid; // stored into the 32-bit uuid field (see the header comment on wraparound)
+      rip = uop.rip;
+      threadid = uop.threadid;
+      this->uop = uop;
+      return this;
+    }
+    // Base fill plus a RIP only.
+    SMTCoreEvent* fill(int type, const RIPVirtPhys& rvp) {
+      fill(type);
+      rip = rvp;
+      return this;
+    }
+    // uop-level fill plus the ROB entry's bookkeeping fields.
+    SMTCoreEvent* fill(int type, const ReorderBufferEntry* rob) {
+      fill(type, rob->uop);
+      this->rob = rob->index();
+      physreg = rob->physreg->index(); // rob->physreg is dereferenced unconditionally, unlike rob->lsq on the next line
+      lsq = (rob->lsq) ? rob->lsq->index() : 0; // 0 when the entry has no LSQ slot
+      rfid = rob->physreg->rfid;
+      cluster = rob->cluster;
+      fu = rob->fu;
+      lfrqslot = rob->lfrqslot;
+      return this;
+    }
+    // ROB-level fill plus the "commit" union member; fields marked "filled in later" are left for the caller.
+    SMTCoreEvent* fill_commit(int type, const ReorderBufferEntry* rob) {
+      fill(type, rob);
+      if unlikely (isstore(rob->uop.opcode)) {
+        commit.state.st = *rob->lsq; // stores record the LSQ entry
+      } else {
+        commit.state.reg.rddata = rob->physreg->data; // everything else records the result register's data and flags
+        commit.state.reg.rdflags = rob->physreg->flags;
+      }
+      // taken, predtaken only for branches
+      commit.ld_st_truly_unaligned = rob->uop.ld_st_truly_unaligned;
+      commit.pteupdate = rob->pteupdate;
+      // oldphysreg filled in later
+      // oldphysreg_refcount filled in later
+      commit.origvirt = rob->origvirt;
+      commit.total_user_insns_committed = total_user_insns_committed;
+      // target_rip filled in later
+      foreach (i, MAX_OPERANDS) commit.operand_physregs[i] = rob->operands[i]->index(); // every operand pointer is dereferenced
+      return this;
+    }
+    // ROB-level fill plus the "loadstore" union member; the inherit_sfr_* fields are written only when inherit_sfr is non-null.
+    SMTCoreEvent* fill_load_store(int type, const ReorderBufferEntry* rob, LoadStoreQueueEntry* inherit_sfr, Waddr virtaddr) {
+      fill(type, rob);
+      if likely (rob->lsq) loadstore.sfr = *rob->lsq;
+      loadstore.virtaddr = virtaddr;
+      loadstore.load_store_second_phase = rob->load_store_second_phase;
+      loadstore.inherit_sfr_used = (inherit_sfr != null);
+      if unlikely (inherit_sfr) {
+        loadstore.inherit_sfr = *inherit_sfr;
+        loadstore.inherit_sfr_lsq = inherit_sfr->rob->lsq->index();
+        loadstore.inherit_sfr_uuid = inherit_sfr->rob->uop.uuid;
+        loadstore.inherit_sfr_rob = inherit_sfr->rob->index();
+        loadstore.inherit_sfr_physreg = inherit_sfr->rob->physreg->index();
+        loadstore.inherit_sfr_rip = inherit_sfr->rob->uop.rip;
+      }
+      loadstore.tlb_walk_level = rob->tlb_walk_level;
+      return this;
+    }
+    // Per-category payload; the members share storage, so only the one matching the event is meaningful.
+    union {
+      struct {
+        W16s missbuf;
+        W64 predrip;
+        W16 bb_uop_count;
+      } fetch;
+      struct {
+        W16  oldphys;
+        W16  oldzf;
+        W16  oldcf;
+        W16  oldof;
+        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
+      } rename;
+      struct {
+        W16 cycles_left;
+      } frontend;
+      struct {
+        W16 allowed_clusters;
+        W16 iq_avail[MAX_CLUSTERS];
+      } select_cluster;
+      struct {
+        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
+      } dispatch;
+      struct {
+        byte mispredicted:1;
+        IssueState state;
+        W16 cycles_left;
+        W64 operand_data[MAX_OPERANDS];
+        W16 operand_flags[MAX_OPERANDS];
+        W64 predrip;
+        W32 fu_avail;
+      } issue;
+      struct {
+        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
+        byte ready;
+      } replay;
+      struct { // written by fill_load_store()
+        W64 virtaddr; 
+        W64 data_to_store;
+        SFR sfr;
+        SFR inherit_sfr;
+        W64 inherit_sfr_uuid;        
+        W64 inherit_sfr_rip;
+        W16 inherit_sfr_lsq;
+        W16 inherit_sfr_rob;
+        W16 inherit_sfr_physreg;
+        W16 cycles_left;
+        W64 locking_uuid;
+        byte inherit_sfr_used:1, rcready:1, load_store_second_phase:1, predicted_alias:1;
+        byte locking_vcpuid;
+        W16 locking_rob;
+        W8 threadid; // separate from the outer SMTCoreEvent::threadid
+        W8 tlb_walk_level;
+      } loadstore;
+      struct {
+        W16 somidx;
+        W16 eomidx;
+        W16 startidx;
+        W16 endidx;
+        byte annulras;
+      } annul;
+      struct {
+        StateList* current_state_list;
+        W16 iqslot;
+        W16 count;
+        byte dependent_operands;
+        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
+      } redispatch;
+      struct {
+        W8  forward_cycle;
+        W8  operand;
+        W8  target_operands_ready;
+        W8  target_all_operands_ready;
+        W16 target_rob;
+        W16 target_physreg;
+        W8  target_rfid;
+        W8  target_cluster;
+        W64 target_uuid;
+        W16 target_lsq;
+        W8  target_st;
+      } forwarding;
+      struct {
+        W16 consumer_count;
+        W16 flags;
+        W64 data;
+        byte transient:1, all_consumers_sourced_from_bypass:1, no_branches_between_renamings:1, dest_renamed_before_writeback:1;
+      } writeback;
+      struct { // written by fill_commit()
+        IssueState state;
+        byte taken:1, predtaken:1, ld_st_truly_unaligned:1,krn:1;
+        PTEUpdateBase pteupdate;
+        W16s oldphysreg;
+        W16 oldphysreg_refcount;
+        W64 origvirt;
+        W64 total_user_insns_committed;
+        W64 target_rip;
+        W16 operand_physregs[MAX_OPERANDS];
+      } commit;
+    };
+
+    ostream& print(ostream& os) const;
+  };
+
+  struct EventLog {
+    SMTCoreEvent* start;
+    SMTCoreEvent* end;
+    SMTCoreEvent* tail;
+    int           coreid;
+    ostream* logfile;
+    // Buffer pointers and logfile start out null; only the core id is recorded.
+    EventLog(int coreid_): start(null), end(null), tail(null), coreid(coreid_), logfile(null) { }
+
+    bool init(size_t bufsize);
+    void reset();
+
+    // Hand out the next event slot. Once tail has reached end, tail is rewound
+    // to start and flush() is called before the slot is returned.
+    SMTCoreEvent* add() {
+      if unlikely (tail >= end) {
+        tail = start;
+        flush();
+      }
+      return tail++;
+    }
+
+    void flush(bool only_to_tail = false);
+    // The wrappers below take a slot from add() and fill it via the matching SMTCoreEvent::fill*() call.
+    SMTCoreEvent* add(int type) {
+      return add()->fill(type);
+    }
+
+    SMTCoreEvent* add(int type, const RIPVirtPhys& rvp) {
+      return add()->fill(type, rvp);
+    }
+
+    SMTCoreEvent* add(int type, const FetchBufferEntry& uop) {
+      return add()->fill(type, uop);
+    }
+
+    SMTCoreEvent* add(int type, const ReorderBufferEntry* rob) {
+      return add()->fill(type, rob);
+    }
+
+    SMTCoreEvent* add_commit(int type, const ReorderBufferEntry* rob) {
+      return add()->fill_commit(type, rob);
+    }
+    // inherit_sfr and addr are optional and default to null / 0.
+    SMTCoreEvent* add_load_store(int type, const ReorderBufferEntry* rob, LoadStoreQueueEntry* inherit_sfr = null, Waddr addr = 0) {
+      return add()->fill_load_store(type, rob, inherit_sfr, addr);
+    }
+
+    ostream& print(ostream& os, bool only_to_tail = false);
+  };
+
+  #define ASF_MAX_LINES (8)
+  #define LLB_LINE_SIZE CacheSubsystem::L1_LINE_SIZE
+  struct LLBLine { // value type of LockedLineBuffer; orig_data spans one L1 cache line
+    bool  written;
+    byte  orig_data[LLB_LINE_SIZE];
+    int   refcount;
+    void  reset() { refcount = 0; written = false; } // orig_data is left untouched
+    LLBLine() { reset(); } // starts unwritten and unreferenced
+  };
+  enum { // probe result codes (PROBE_ACK is 0)
+    PROBE_ACK,
+    PROBE_NACK,
+    PROBE_WAIT
+  };
+
+  struct LockedLineBuffer: public FullyAssociativeArray<Waddr, LLBLine, ASF_MAX_LINES> {
+    typedef FullyAssociativeArray<Waddr, LLBLine, ASF_MAX_LINES> base_t;
+    ThreadContext& thread; // owning thread; reference member, so the thread must outlive the buffer
+    int num_locations; // empty() reports true while this is zero
+    // num_locations and lasterr are zero-initialized so empty() and consistency_error() never read indeterminate values.
+    LockedLineBuffer(ThreadContext& _thread): base_t(), thread(_thread), num_locations(0), lasterr(0) {}
+    LLBLine* add_location(Waddr addr);
+    void remove_ref(LLBLine* line);
+    void clear();
+    void snapshot();
+    void undo();
+
+    void commit() {clear(); lasterr = 0;}; // clear the buffer and forget any recorded error
+    void abort() { undo(); /*clear();*/ lasterr = 0; }; // undo() instead of clear(); also forgets any recorded error
+
+    bool contains(Waddr addr) {return probe(floor(addr, LLB_LINE_SIZE));} // addr is rounded down to its LLB line before probing
+    bool empty() {return (num_locations == 0);}
+    void* external_probe(Waddr addr, bool invalidating);
+    void* probe_other_LLBs(Waddr addr, bool invalidating);
+    void mark_clean(Waddr addr);
+    void mark_clean_others(Waddr addr);
+    void mark_written(Waddr addr);
+    // Sticky error query: once non-zero, lasterr is returned unchanged until commit() or abort() clears it.
+    W64  consistency_error() {
+/* S.D. Error injection framework!*/if (!lasterr)  lasterr = (asf_consistency_error()) ? 0xDEADBEEF : 0;
+      return lasterr;
+    }
+    private: W64 lasterr;
+  };
+
+  struct LoadStoreAliasPredictor: public FullyAssociativeTags<W64, 8> { }; // adds nothing to the base; it only names this particular instantiation
+
+  enum { // single-bit flag values (combinable with |)
+    ROB_STATE_READY = (1 << 0),
+    ROB_STATE_IN_ISSUE_QUEUE = (1 << 1),
+    ROB_STATE_PRE_READY_TO_DISPATCH = (1 << 2)
+  };
+
+#ifdef MULTI_IQ // four clusters: int0, int1, ld, fp
+#define InitClusteredROBList(name, description, flags) \
+  name[0](description "-int0", rob_states, flags); \
+  name[1](description "-int1", rob_states, flags); \
+  name[2](description "-ld", rob_states, flags); \
+  name[3](description "-fp", rob_states, flags)
+#else // single cluster; NOTE(review): unlike the MULTI_IQ form, this expansion already ends in ';'
+#define InitClusteredROBList(name, description, flags) \
+  name[0](description "-all", rob_states, flags);
+#endif // both forms use rob_states from the scope where the macro is expanded; description must be a string literal because it is concatenated with the suffix
+
+  static const int ISSUE_QUEUE_SIZE = 16; // presumably the "size" argument used for the IssueQueue instances - confirm at the point of use
+
+  // How many bytes of x86 code to fetch into decode buffer at once
+  static const int ICACHE_FETCH_GRANULARITY = 16;
+  // Deadlock timeout: if nothing dispatches for this many cycles, flush the pipeline
+  static const int DISPATCH_DEADLOCK_COUNTDOWN_CYCLES = 256;
+  // Size of unaligned predictor Bloom filter
+  static const int UNALIGNED_PREDICTOR_SIZE = 4096;
+
+  struct ThreadContext {
+    SMTCore& core;
+    SMTCore& getcore() const { return core; }
+
+    int threadid;
+    Context& ctx;
+    BranchPredictorInterface branchpred;
+
+    Queue<FetchBufferEntry, FETCH_QUEUE_SIZE> fetchq;
+
+    ListOfStateLists rob_states;
+    ListOfStateLists lsq_states;
+    //
+    // Each ROB's state can be linked into at most one of the
+    // following rob_xxx_list lists at any given time; the ROB's
+    // current_state_list points back to the list it belongs to.
+    //
+    StateList rob_free_list;                             // Free ROB entry
+    StateList rob_frontend_list;                         // Frontend in progress (artificial delay)
+    StateList rob_ready_to_dispatch_list;                // Ready to dispatch
+    StateList rob_dispatched_list[MAX_CLUSTERS];         // Dispatched but waiting for operands
+    StateList rob_ready_to_issue_list[MAX_CLUSTERS];     // Ready to issue (all operands ready)
+    StateList rob_ready_to_store_list[MAX_CLUSTERS];     // Ready to store (all operands except possibly rc are ready)
+    StateList rob_ready_to_load_list[MAX_CLUSTERS];      // Ready to load (all operands ready)
+    StateList rob_issued_list[MAX_CLUSTERS];             // Issued and in progress (or for loads, returned here after address is generated)
+    StateList rob_completed_list[MAX_CLUSTERS];          // Completed and result in transit for local and global forwarding
+    StateList rob_ready_to_writeback_list[MAX_CLUSTERS]; // Completed; result ready to writeback in parallel across all cluster register files
+    StateList rob_cache_miss_list;                       // Loads only: wait for cache miss to be serviced
+    StateList rob_tlb_miss_list;                         // TLB miss waiting to be serviced on one or more levels
+    StateList rob_memory_fence_list;                     // mf uops only: wait for memory fence to reach head of LSQ before completing
+    StateList rob_ready_to_commit_queue;                 // Ready to commit
+
+    Queue<ReorderBufferEntry, ROB_SIZE> ROB;
+
+    Queue<LoadStoreQueueEntry, LSQ_SIZE> LSQ;
+    RegisterRenameTable specrrt;
+    RegisterRenameTable commitrrt;
+
+    // Fetch-related structures
+    RIPVirtPhys fetchrip;
+    BasicBlock* current_basic_block;
+    int current_basic_block_transop_index;
+    bool stall_frontend;
+    bool stall_on_eom;
+    bool waiting_for_icache_fill;
+
+    // Last block in icache we fetched into our buffer
+    W64 current_icache_block;
+    W64 fetch_uuid;
+    int loads_in_flight;
+    int stores_in_flight;
+    bool prev_interrupts_pending;
+    bool handle_interrupt_at_next_eom;
+    bool stop_at_next_eom;
+
+    W64 last_commit_at_cycle;
+    bool smc_invalidate_pending;
+    RIPVirtPhys smc_invalidate_rvp;
+    W64 chk_recovery_rip;
+
+    TransOpBuffer unaligned_ldst_buf;
+    LoadStoreAliasPredictor lsap;
+    int loads_in_this_cycle;
+    W64 load_to_store_parallel_forwarding_buffer[LOAD_FU_COUNT];
+
+    W64 consecutive_commits_inside_spinlock;
+
+    // statistics:
+    W64 total_uops_committed;
+    W64 total_insns_committed;
+    int dispatch_deadlock_countdown;    
+    int issueq_count;
+
+    //
+    // List of memory locks that will be removed from
+    // the lock controller when the macro-op commits.
+    //
+    // At most 4 chunks are allowed, to ensure
+    // cmpxchg16b works even with unaligned data.
+    //
+    byte queued_mem_lock_release_count;
+    W64 queued_mem_lock_release_list[4];
+
+    ThreadContext(SMTCore& core_, int threadid_, Context& ctx_): core(core_), threadid(threadid_), ctx(ctx_), locked_line_buffer(*this) {
+      reset();
+    }
+
+    int commit();
+    int writeback(int cluster);
+    int transfer(int cluster);
+    int complete(int cluster);
+    int dispatch();
+    void frontend();
+    void rename();
+    bool fetch();
+    void tlbwalk();
+
+    bool handle_barrier();
+    bool handle_exception();
+    bool handle_interrupt();
+    void reset_fetch_unit(W64 realrip);
+    void flush_pipeline();
+    void invalidate_smc();
+    void external_to_core_state();
+    void core_to_external_state() { }
+    void annul_fetchq();
+    BasicBlock* fetch_or_translate_basic_block(const RIPVirtPhys& rvp);
+    void redispatch_deadlock_recovery();
+    void flush_mem_lock_release_list(byte start = 0);
+    int get_priority() const;
+
+    void dump_smt_state(ostream& os);
+    void print_smt_state(ostream& os);
+    void print_rob(ostream& os);
+    void print_lsq(ostream& os);
+    void print_rename_tables(ostream& os);
+
+    void reset();
+    void init();
+
+    // ASF
+    bool        asf_in_crit_sec;
+    bool        asf_reissue_will_fail;
+    W64         asf_stored_error;
+    RIPVirtPhys asf_failing_acquire;
+    W64         asf_saved_rsp;
+    LockedLineBuffer locked_line_buffer;
+    void check_asf_conflicts();
+    int asf_runcycle(int commitrc);
+    void asf_rollback_last_acq(W64 errorcode, int reg_nextrip);
+  };
+
+  //
+  // checkpointed core
+  //
+  struct SMTCore {
+    SMTMachine& machine;
+    int coreid;
+    SMTCore& getcore() const { return coreof(coreid); }
+
+    int threadcount;
+    ThreadContext* threads[MAX_THREADS_PER_CORE];
+
+    ListOfStateLists rob_states;
+    ListOfStateLists lsq_states;
+
+    EventLog eventlog;
+    ListOfStateLists physreg_states;
+    // Bandwidth counters:
+    int commitcount;
+    int writecount;
+    int dispatchcount;
+
+    byte round_robin_tid;
+    byte round_robin_cid;
+
+    //
+    // Issue Queues (one per cluster)
+    //
+    int reserved_iq_entries;
+#define declare_issueq_templates template struct IssueQueue<ISSUE_QUEUE_SIZE>
+#ifdef MULTI_IQ
+    IssueQueue<ISSUE_QUEUE_SIZE> issueq_int0;
+    IssueQueue<ISSUE_QUEUE_SIZE> issueq_int1;
+    IssueQueue<ISSUE_QUEUE_SIZE> issueq_ld;
+    IssueQueue<ISSUE_QUEUE_SIZE> issueq_fp;
+
+    // Instantiate any issueq sizes used above:
+
+
+#define foreach_issueq(expr) { SMTCore& core = getcore(); core.issueq_int0.expr; core.issueq_int1.expr; core.issueq_ld.expr; core.issueq_fp.expr; }
+  
+    void sched_get_all_issueq_free_slots(int* a) {
+      a[0] = issueq_int0.remaining();
+      a[1] = issueq_int1.remaining();
+      a[2] = issueq_ld.remaining();
+      a[3] = issueq_fp.remaining();
+    }
+
+#define issueq_operation_on_cluster_with_result(core, cluster, rc, expr) \
+  switch (cluster) { \
+  case 0: rc = core.issueq_int0.expr; break; \
+  case 1: rc = core.issueq_int1.expr; break; \
+  case 2: rc = core.issueq_ld.expr; break; \
+  case 3: rc = core.issueq_fp.expr; break; \
+  }
+
+#define per_cluster_stats_update(prefix, cluster, expr) \
+  switch (cluster) { \
+  case 0: prefix.int0 expr; break; \
+  case 1: prefix.int1 expr; break; \
+  case 2: prefix.ld expr; break; \
+  case 3: prefix.fp expr; break; \
+  }
+
+#else
+    IssueQueue<ISSUE_QUEUE_SIZE> issueq_all;
+#define foreach_issueq(expr) { getcore().issueq_all.expr; }
+    void sched_get_all_issueq_free_slots(int* a) {
+      a[0] = issueq_all.remaining();
+    }
+#define issueq_operation_on_cluster_with_result(core, cluster, rc, expr) rc = core.issueq_all.expr;
+#define per_cluster_stats_update(prefix, cluster, expr) prefix.all expr;
+
+#endif
+
+#define per_physregfile_stats_update(prefix, rfid, expr) \
+  switch (rfid) { \
+  case 0: prefix.integer expr; break; \
+  case 1: prefix.fp expr; break; \
+  case 2: prefix.st expr; break; \
+  case 3: prefix.br expr; break; \
+  }
+
+#define issueq_operation_on_cluster(core, cluster, expr) { int dummyrc; issueq_operation_on_cluster_with_result(core, cluster, dummyrc, expr); }
+
+#define for_each_cluster(iter) foreach (iter, MAX_CLUSTERS)
+#define for_each_operand(iter) foreach (iter, MAX_OPERANDS)
+
+    SMTCore(int coreid_, SMTMachine& machine_): machine(machine_), coreid(coreid_), eventlog(coreid_), caches(coreid_), cache_callbacks(*this) { // initializers listed in member declaration order (the order they actually run in)
+      threadcount = 0;      // no threads registered at construction
+      setzero(threads);     // thread slots are passed through setzero() before any are assigned
+    }
+    
+    ~SMTCore(){};
+
+    // 
+    // Initialize structures independent of the core parameters
+    //
+    void init_generic();
+    void reset();
+
+    //
+    // Initialize all structures for the first time
+    //
+    void init() {
+      init_generic();
+      //
+      // Physical register files
+      //
+      physregfiles[0]("int", coreid, 0, PHYS_REG_FILE_SIZE);
+      physregfiles[1]("fp", coreid, 1, PHYS_REG_FILE_SIZE);
+      physregfiles[2]("st", coreid, 2, STQ_SIZE * MAX_THREADS_PER_CORE);
+      physregfiles[3]("br", coreid, 3, MAX_BRANCHES_IN_FLIGHT * MAX_THREADS_PER_CORE);
+    }
+
+    //
+    // Physical Registers
+    //
+
+    enum { PHYS_REG_FILE_INT, PHYS_REG_FILE_FP, PHYS_REG_FILE_ST, PHYS_REG_FILE_BR };
+
+    enum {  
+      PHYS_REG_FILE_MASK_INT = (1 << 0),
+      PHYS_REG_FILE_MASK_FP  = (1 << 1),
+      PHYS_REG_FILE_MASK_ST  = (1 << 2),
+      PHYS_REG_FILE_MASK_BR  = (1 << 3)
+    };
+
+    // Major core structures
+    PhysicalRegisterFile physregfiles[PHYS_REG_FILE_COUNT];
+    int round_robin_reg_file_offset;
+    W32 fu_avail;
+    ReorderBufferEntry* robs_on_fu[FU_COUNT];
+    CacheSubsystem::CacheHierarchy caches;
+    SMTCoreCacheCallbacks cache_callbacks;
+
+    // Unaligned load/store predictor
+    bitvec<UNALIGNED_PREDICTOR_SIZE> unaligned_predictor;
+    static int hash_unaligned_predictor_slot(const RIPVirtPhysBase& rvp);
+    bool get_unaligned_hint(const RIPVirtPhysBase& rvp) const;
+    void set_unaligned_hint(const RIPVirtPhysBase& rvp, bool value);
+
+    // Pipeline Stages
+    bool runcycle();
+    void flush_pipeline_all();
+    bool fetch();
+    void rename();
+    void frontend();
+    int dispatch();
+    int issue(int cluster);
+    int complete(int cluster);
+    int transfer(int cluster);
+    int writeback(int cluster);
+    int commit();
+
+    // Callbacks
+    void flush_tlb(Context& ctx, int threadid, bool selective = false, Waddr virtaddr = 0);
+
+    // Debugging
+    void dump_smt_state(ostream& os);
+    void print_smt_state(ostream& os);
+    void check_refcounts();
+    void check_rob();
+
+  };
+
+#ifdef ENABLE_SMT
+  #define MAX_SMT_CORES 1
+#else
+  #define MAX_SMT_CORES 32
+#endif
+
+  struct SMTMachine: public PTLsimMachine {
+    SMTCore* cores[MAX_SMT_CORES];
+    int corecount;
+    bitvec<MAX_CONTEXTS> stopped;
+    SMTMachine(const char* name);
+    virtual bool init(PTLsimConfig& config);
+    virtual int run(PTLsimConfig& config);
+    virtual void dump_state(ostream& os);
+    virtual void update_stats(PTLsimStats& stats);
+    virtual void flush_tlb(Context& ctx);
+    virtual void flush_tlb_virt(Context& ctx, Waddr virtaddr);
+    void flush_all_pipelines();
+  };
+
+  extern CycleTimer cttotal;
+  extern CycleTimer ctfetch;
+  extern CycleTimer ctdecode;
+  extern CycleTimer ctrename;
+  extern CycleTimer ctfrontend;
+  extern CycleTimer ctdispatch;
+  extern CycleTimer ctissue;
+  extern CycleTimer ctissueload;
+  extern CycleTimer ctissuestore;
+  extern CycleTimer ctcomplete;
+  extern CycleTimer cttransfer;
+  extern CycleTimer ctwriteback;
+  extern CycleTimer ctcommit;
+
+#ifdef DECLARE_STRUCTURES
+  //
+  // The following configuration has two integer/store clusters with a single cycle
+  // latency between them, but both clusters can access the load pseudo-cluster with
+  // no extra cycle. The floating point cluster is two cycles from everything else.
+  //
+#ifdef MULTI_IQ
+  const Cluster clusters[MAX_CLUSTERS] = {
+    {"int0",  2, (FU_ALU0|FU_STU0)},
+    {"int1",  2, (FU_ALU1|FU_STU1)},
+    {"ld",    2, (FU_LDU0|FU_LDU1)},
+    {"fp",    2, (FU_FPU0|FU_FPU1)},
+  };
+
+  const byte intercluster_latency_map[MAX_CLUSTERS][MAX_CLUSTERS] = {
+    // I0 I1 LD FP <-to
+    {0, 1, 0, 2}, // from I0
+    {1, 0, 0, 2}, // from I1
+    {0, 0, 0, 2}, // from LD
+    {2, 2, 2, 0}, // from FP
+  };
+
+  const byte intercluster_bandwidth_map[MAX_CLUSTERS][MAX_CLUSTERS] = {
+    // I0 I1 LD FP <-to
+    {2, 2, 1, 1}, // from I0
+    {2, 2, 1, 1}, // from I1
+    {1, 1, 2, 2}, // from LD
+    {1, 1, 1, 2}, // from FP
+  };
+
+#else // single issueq
+  const Cluster clusters[MAX_CLUSTERS] = {
+    {"all",  4, (FU_ALU0|FU_ALU1|FU_STU0|FU_STU1|FU_LDU0|FU_LDU1|FU_FPU0|FU_FPU1)},
+   };
+  const byte intercluster_latency_map[MAX_CLUSTERS][MAX_CLUSTERS] = {{0}};
+  const byte intercluster_bandwidth_map[MAX_CLUSTERS][MAX_CLUSTERS] = {{64}};
+#endif // multi_issueq
+
+#endif // DECLARE_STRUCTURES
+
+#endif // INSIDE_SMTCORE
+
+  //
+  // This part is used when parsing stats.h to build the
+  // data store template; these must be in sync with the
+  // corresponding definitions elsewhere.
+  //
+#ifdef MULTI_IQ
+  static const char* cluster_names[MAX_CLUSTERS] = {"int0", "int1", "ld", "fp"};
+#else
+  static const char* cluster_names[MAX_CLUSTERS] = {"all"};
+#endif
+
+  static const char* phys_reg_file_names[PHYS_REG_FILE_COUNT] = {"int", "fp", "st", "br"};
+};
+
+struct PerContextSMTStats { // rootnode:
+  struct fetch {
+    struct stop { // node: summable
+      W64 stalled;
+      W64 icache_miss;
+      W64 fetchq_full;
+      W64 issueq_quota_full;
+      W64 bogus_rip;
+      W64 microcode_assist;
+      W64 branch_taken;
+      W64 full_width;
+    } stop;
+    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
+    W64 width[SMTModel::FETCH_WIDTH+1]; // histo: 0, SMTModel::FETCH_WIDTH, 1
+    W64 blocks;
+    W64 uops;
+    W64 user_insns;
+  } fetch;
+
+  struct frontend {
+    struct status { // node: summable
+      W64 complete;
+      W64 fetchq_empty;
+      W64 rob_full;
+      W64 physregs_full;
+      W64 ldq_full;
+      W64 stq_full;
+    } status;
+    W64 width[SMTModel::FRONTEND_WIDTH+1]; // histo: 0, SMTModel::FRONTEND_WIDTH, 1
+    struct renamed {
+      W64 none;
+      W64 reg;
+      W64 flags;
+      W64 reg_and_flags;
+    } renamed;
+    struct alloc {
+      W64 reg;
+      W64 ldreg;
+      W64 sfr;
+      W64 br;
+    } alloc;
+    // NOTE: This is capped at 255 consumers to keep the size reasonable:
+    W64 consumer_count[256]; // histo: 0, 255, 1
+  } frontend;
+
+  struct dispatch {
+    W64 cluster[SMTModel::MAX_CLUSTERS]; // label: SMTModel::cluster_names
+    struct redispatch {
+      W64 trigger_uops;
+      W64 deadlock_flushes;
+      W64 deadlock_uops_flushed;
+      W64 dependent_uops[SMTModel::ROB_SIZE+1]; // histo: 0, SMTModel::ROB_SIZE, 1
+    } redispatch;
+  } dispatch;
+
+  struct issue {
+    W64 uops;
+    double uipc;
+    struct result { // node: summable
+      W64 no_fu;
+      W64 replay;
+      W64 misspeculated;
+      W64 refetch;
+      W64 branch_mispredict;
+      W64 exception;
+      W64 complete;
+    } result;
+    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
+  } issue;
+
+  struct writeback {
+    W64 writebacks[SMTModel::PHYS_REG_FILE_COUNT]; // label: SMTModel::phys_reg_file_names
+  } writeback;
+
+  struct commit {
+    W64 uops;
+    W64 insns;
+    double uipc;
+    double ipc;
+
+    struct result { // node: summable
+      W64 none;
+      W64 ok;
+      W64 exception;
+      W64 skipblock;
+      W64 barrier;
+      W64 smc;
+      W64 memlocked;
+      W64 stop;
+    } result;
+
+    struct setflags { // node: summable
+      W64 yes;
+      W64 no;
+    } setflags;
+
+    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
+  } commit;
+
+  struct branchpred {
+    W64 predictions;
+    W64 updates;
+
+    // These counters are [0] = mispred, [1] = correct
+    W64 cond[2]; // label: branchpred_outcome_names
+    W64 indir[2]; // label: branchpred_outcome_names
+    W64 ret[2]; // label: branchpred_outcome_names
+    W64 summary[2]; // label: branchpred_outcome_names
+    struct ras { // node: summable
+      W64 pushes;
+      W64 overflows;
+      W64 pops;
+      W64 underflows;
+      W64 annuls;
+    } ras;
+  } branchpred;
+
+  struct dcache {
+    struct load {
+      struct issue { // node: summable
+        W64 complete;
+        W64 miss;
+        W64 exception;
+        W64 ordering;
+        W64 unaligned;
+        struct replay { // node: summable
+          W64 sfr_addr_and_data_not_ready;
+          W64 sfr_addr_not_ready;
+          W64 sfr_data_not_ready;
+          W64 missbuf_full;
+          W64 interlocked;
+          W64 interlock_overflow;
+          W64 fence;
+          W64 bank_conflict;
+        } replay;
+      } issue;
+
+      struct forward { // node: summable
+        W64 cache;
+        W64 sfr;
+        W64 sfr_and_cache;
+      } forward;
+        
+      struct dependency { // node: summable
+        W64 independent;
+        W64 predicted_alias_unresolved;
+        W64 stq_address_match;
+        W64 stq_address_not_ready;
+        W64 fence;
+      } dependency;
+        
+      struct type { // node: summable
+        W64 aligned;
+        W64 unaligned;
+        W64 internal;
+      } type;
+        
+      W64 size[4]; // label: sizeshift_names
+
+      W64 datatype[DATATYPE_COUNT]; // label: datatype_names
+    } load;
+
+    struct store {
+      struct issue { // node: summable
+        W64 complete;
+        W64 exception;
+        W64 ordering;
+        W64 unaligned;
+        struct replay { // node: summable
+          W64 sfr_addr_and_data_not_ready;
+          W64 sfr_addr_not_ready;
+          W64 sfr_data_not_ready;
+          W64 sfr_addr_and_data_and_data_to_store_not_ready;
+          W64 sfr_addr_and_data_to_store_not_ready;
+          W64 sfr_data_and_data_to_store_not_ready;
+          W64 interlocked;
+          W64 fence;
+          W64 parallel_aliasing;
+          W64 bank_conflict;
+        } replay;
+      } issue;
+
+      struct forward { // node: summable
+        W64 zero;
+        W64 sfr;
+      } forward;
+        
+      struct type { // node: summable
+        W64 aligned;
+        W64 unaligned;
+        W64 internal;
+      } type;
+        
+      W64 size[4]; // label: sizeshift_names
+
+      W64 datatype[DATATYPE_COUNT]; // label: datatype_names
+    } store;
+
+    struct fence { // node: summable
+      W64 lfence;
+      W64 sfence;
+      W64 mfence;
+    } fence;
+  } dcache;
+};
+
+//
+// SMT Core
+//
+struct SMTCoreStats { // rootnode:
+  W64 cycles;
+
+  struct dispatch {
+    struct source { // node: summable
+      W64 integer[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+      W64 fp[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+      W64 st[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+      W64 br[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+    } source;
+    W64 width[SMTModel::DISPATCH_WIDTH+1]; // histo: 0, SMTModel::DISPATCH_WIDTH, 1
+  } dispatch;
+
+  struct issue {
+    struct source { // node: summable
+      W64 integer[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+      W64 fp[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+      W64 st[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+      W64 br[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
+    } source;
+    struct width {
+#ifdef MULTI_IQ
+      W64 int0[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+      W64 int1[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+      W64 ld[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+      W64 fp[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+#else
+      W64 all[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+#endif
+    } width;
+  } issue;
+
+  struct writeback {
+    struct width {
+#ifdef MULTI_IQ
+      W64 int0[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+      W64 int1[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+      W64 ld[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+      W64 fp[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+#else
+      W64 all[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
+#endif
+    } width;
+  } writeback;
+
+  struct commit {
+    struct freereg { // node: summable
+      W64 pending;
+      W64 free;
+    } freereg;
+
+    W64 free_regs_recycled;
+
+    W64 width[SMTModel::COMMIT_WIDTH+1]; // histo: 0, SMTModel::COMMIT_WIDTH, 1
+  } commit;
+
+  PerContextSMTStats total;
+  PerContextSMTStats vcpu0;
+  PerContextSMTStats vcpu1;
+  PerContextSMTStats vcpu2;
+  PerContextSMTStats vcpu3;
+  PerContextSMTStats vcpu4;
+  PerContextSMTStats vcpu5;
+  PerContextSMTStats vcpu6;
+  PerContextSMTStats vcpu7;
+
+  struct simulator {
+    double total_time;
+    struct cputime { // node: summable
+      double fetch;
+      double decode;
+      double rename;
+      double frontend;
+      double dispatch;
+      double issue;
+      double issueload;
+      double issuestore;
+      double complete;
+      double transfer;
+      double writeback;
+      double commit;
+    } cputime;
+  } simulator;
+};
diff -r 10448c053ad6 smtcore.cpp
--- a/smtcore.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/smtcore.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -5,6 +5,8 @@
 //
 // Copyright 2003-2005 Matt T. Yourst <yourst@yourst.com>
 // Copyright 2006 Hui Zeng <hzeng@cs.binghamton.edu>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -96,7 +98,8 @@
   setzero(fetchrip);
   current_basic_block = null;
   current_basic_block_transop_index = -1;
-  stall_frontend = false;
+  stall_frontend  = false;
+  stall_on_eom    = false;
   waiting_for_icache_fill = false;
   fetch_uuid = 0;
   current_icache_block = 0;
@@ -152,7 +155,7 @@
   caches.callback = &cache_callbacks;
   setzero(robs_on_fu);
   foreach_issueq(reset(coreid));
-  
+
   reserved_iq_entries = (int)sqrt(ISSUE_QUEUE_SIZE / MAX_THREADS_PER_CORE);
   assert(reserved_iq_entries && reserved_iq_entries < ISSUE_QUEUE_SIZE);
 
@@ -337,6 +340,7 @@
   //
   // Compute reserved issue queue entries to avoid starvation:
   //
+#ifndef MULTI_IQ
 #ifdef ENABLE_CHECKS
   int total_issueq_count = 0;
   int total_issueq_reserved_free = 0;
@@ -356,7 +360,8 @@
 
   assert (total_issueq_count == issueq_all.count);
   assert((ISSUE_QUEUE_SIZE - issueq_all.count) == (issueq_all.shared_entries + total_issueq_reserved_free));
-#endif
+#endif /* ENABLE_CHECKS */
+#endif /* MULTI_IQ */
 
   foreach (i, threadcount) threads[i]->loads_in_this_cycle = 0;
 
@@ -378,6 +383,7 @@
     commitrc[tid] = thread->commit();
     for_each_cluster(j) thread->writeback(j);
     for_each_cluster(j) thread->transfer(j);
+    commitrc[tid] = thread->asf_runcycle(commitrc[tid]);
   }
 
   //
@@ -390,8 +396,14 @@
   }
 
   //
+  // Always clock the issue queues: they're independent of all threads
+  //
+  // SD: Moved between forward and issue in the same cycle, so that a 0-cycle
+  // forwarding delay would actually be equivalent to a direct bypass!
+  foreach_issueq(clock());
+
+  //
   // Issue whatever is ready
-  //
   for_each_cluster(i) { issue(i); }
 
   //
@@ -462,7 +474,9 @@
   //
   // Always clock the issue queues: they're independent of all threads
   //
-  foreach_issueq(clock());
+  // SD: Moved between forward and issue in the same cycle, so that a 0-cycle
+  // forwarding delay would actually be equivalent to a direct bypass!
+  //foreach_issueq(clock());
 
   //
   // Advance the round robin priority index
@@ -555,6 +569,7 @@
         (sim_cycle - thread->last_commit_at_cycle), " cycles; the pipeline could be deadlocked", endl;
       logfile << sb, flush;
       cerr << sb, flush;
+      logfile << thread->ROB, endl, flush;
       exiting = 1;
     }
   }
@@ -596,6 +611,7 @@
   no_branches_between_renamings = 0;
 #endif
   issued = 0;
+  llbline = (LLBLine*)null;
 }
 
 bool ReorderBufferEntry::ready_to_issue() const {
@@ -697,6 +713,9 @@
   os << padstring(rainfo, -30);
   os << padstring(rbinfo, -30);
   os << padstring(rcinfo, -30);
+
+  if (llbline)
+    os << " llb: ", llbline;
 
   return os;
 }
@@ -986,6 +1005,14 @@
     return true;
   }
 
+  /* S.D.: Ignore ASF testing exceptions: flush the pipeline and report handled so the instruction is re-executed. */
+  if (ctx.exception == EXCEPTION_ASF_Testing) {
+    if (logable(5)) logfile << "[vcpu ", ctx.vcpuid,"]" __FILE__,":",__LINE__,"@",sim_cycle,
+      ": Ignoring ASF testing exception, re-executing instruction @ ", (void*)(Waddr)ctx.commitarf[REG_rip], endl, flush;
+    flush_pipeline();
+    return true;
+  }
+
 #ifdef PTLSIM_HYPERVISOR
   //
   // Map PTL internal hardware exceptions to their x86 equivalents,
@@ -1146,7 +1173,7 @@
 
     if unlikely (p->cycle != cycle) {
       cycle = p->cycle;
-      os << "Cycle ", cycle, ":", endl;
+      os << "[core ", coreid, "] Cycle ", cycle, ":", endl;
     }
 
     p->print(os);
@@ -1162,6 +1189,9 @@
   bool br = isbranch(uop.opcode);
   W32 exception = LO32(commit.state.reg.rddata);
   W32 error_code = HI32(commit.state.reg.rddata);
+
+  stringbuf uopname;
+  nameof(uopname, uop);
 
   os << intstring(uuid, 20), " t", threadid, " ";
   switch (type) {
@@ -1214,7 +1244,7 @@
   case EVENT_RENAME_MEMQ_FULL:
     os <<  "rename memory queue full"; break;
   case EVENT_RENAME_OK: {
-    os <<  "rename rob ", intstring(rob, -3), " r", intstring(physreg, -3), "@", phys_reg_file_names[rfid];
+    os <<  "rename rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " r", intstring(physreg, -3), "@", phys_reg_file_names[rfid];
     if (ld|st) os << " lsq", lsq;
     os << " = ";
     foreach (i, MAX_OPERANDS) os << rename.opinfo[i], ((i < MAX_OPERANDS-1) ? " " : "");
@@ -1228,11 +1258,11 @@
     break;
   }
   case EVENT_FRONTEND:
-    os <<  "front  rob ", intstring(rob, -3), " frontend stage ", (FRONTEND_STAGES - frontend.cycles_left), " of ", FRONTEND_STAGES;
+    os <<  "front  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " frontend stage ", (FRONTEND_STAGES - frontend.cycles_left), " of ", FRONTEND_STAGES;
     break;
   case EVENT_CLUSTER_NO_CLUSTER:
   case EVENT_CLUSTER_OK: {
-    os << ((type == EVENT_CLUSTER_OK) ? "clustr" : "noclus"), " rob ", intstring(rob, -3), " allowed FUs = ", 
+    os << ((type == EVENT_CLUSTER_OK) ? "clustr" : "noclus"), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " allowed FUs = ", 
       bitstring(fuinfo[uop.opcode].fu, FU_COUNT, true), " -> clusters ",
       bitstring(select_cluster.allowed_clusters, MAX_CLUSTERS, true), " avail";
     foreach (i, MAX_CLUSTERS) os << " ", select_cluster.iq_avail[i];
@@ -1242,13 +1272,13 @@
   }
   case EVENT_DISPATCH_NO_CLUSTER:
   case EVENT_DISPATCH_OK: {
-    os << ((type == EVENT_DISPATCH_OK) ? "disptc" : "nodisp"),  " rob ", intstring(rob, -3), " operands ";
+    os << ((type == EVENT_DISPATCH_OK) ? "disptc" : "nodisp"),  " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " operands ";
     foreach (i, MAX_OPERANDS) os << dispatch.opinfo[i], ((i < MAX_OPERANDS-1) ? " " : "");
     if (type == EVENT_DISPATCH_OK) os << " -> cluster ", clusters[cluster].name; else os << " -> none";
     break;
   }
   case EVENT_ISSUE_NO_FU: {
-    os << "issue  rob ", intstring(rob, -3);
+    os << "issue  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")";
     os << "no FUs available in cluster ", clusters[cluster].name, ": ",
       "fu_avail = ", bitstring(issue.fu_avail, FU_COUNT, true), ", ",
       "op_fu = ", bitstring(fuinfo[uop.opcode].fu, FU_COUNT, true), ", "
@@ -1257,7 +1287,7 @@
   }
   case EVENT_ISSUE_OK: {
     stringbuf sb;
-    sb << "issue  rob ", intstring(rob, -3);
+    sb << "issue  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")";
     sb << " on ", padstring(fu_names[fu], -4), " in ", padstring(cluster_names[cluster], -4), ": r", intstring(physreg, -3), "@", phys_reg_file_names[rfid];
     sb << " "; print_value_and_flags(sb, issue.state.reg.rddata, issue.state.reg.rdflags); sb << " =";
     sb << " "; print_value_and_flags(sb, issue.operand_data[RA], issue.operand_flags[RA]); sb << ", ";
@@ -1269,7 +1299,7 @@
     break;
   }
   case EVENT_REPLAY: {
-    os << "replay rob ", intstring(rob, -3), " r", intstring(physreg, -3), "@", phys_reg_file_names[rfid],
+    os << "replay rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " r", intstring(physreg, -3), "@", phys_reg_file_names[rfid],
       " on cluster ", clusters[cluster].name, ": waiting on";
     foreach (i, MAX_OPERANDS) {
       if (!bit(replay.ready, i)) os << " ", replay.opinfo[i];
@@ -1277,7 +1307,7 @@
     break;
   }
   case EVENT_STORE_WAIT: {
-    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), " stq ", lsq,
+    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
     os << "wait on ";
@@ -1290,7 +1320,7 @@
     break;
   }
   case EVENT_STORE_PARALLEL_FORWARDING_MATCH: {
-    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), " stq ", lsq,
+    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
     os << "ignored parallel forwarding match with ldq ", loadstore.inherit_sfr_lsq,
@@ -1299,7 +1329,7 @@
     break;
   }
   case EVENT_STORE_ALIASED_LOAD: {
-    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), " stq ", lsq,
+    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
     os << "aliased with ldbuf ", loadstore.inherit_sfr_lsq, " (uuid ", loadstore.inherit_sfr_uuid,
@@ -1308,7 +1338,7 @@
     break;
   }
   case EVENT_STORE_ISSUED: {
-    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), " stq ", lsq,
+    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
     if (loadstore.inherit_sfr_used) {
@@ -1320,21 +1350,21 @@
     break;
   }
   case EVENT_STORE_LOCK_RELEASED: {
-    os << "lk-rel", " rob ", intstring(rob, -3), " stq ", lsq,
+    os << "lk-rel", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
       "lock released (original ld.acq uuid ", loadstore.locking_uuid, " rob ", loadstore.locking_rob, " on vcpu ", loadstore.locking_vcpuid, ")";
     break;
   }
   case EVENT_STORE_LOCK_ANNULLED: {
-    os << "lk-anl", " rob ", intstring(rob, -3), " stq ", lsq,
+    os << "lk-anl", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
       "lock annulled (original ld.acq uuid ", loadstore.locking_uuid, " rob ", loadstore.locking_rob, " on vcpu ", loadstore.locking_vcpuid, ")";
     break;
   }
   case EVENT_STORE_LOCK_REPLAY: {
-    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), " stq ", lsq,
+    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
       "replay because vcpuid ", loadstore.locking_vcpuid, " uop uuid ", loadstore.locking_uuid, " has lock";
@@ -1342,7 +1372,7 @@
   }
 
   case EVENT_LOAD_WAIT: {
-    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), " ldq ", lsq,
+    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
     os << "wait on sfr ", loadstore.inherit_sfr,
@@ -1357,7 +1387,7 @@
       os << (loadstore.load_store_second_phase ? "load2 " : "load  ");
     else os << (loadstore.load_store_second_phase ? "ldmis2" : "ldmiss");
 
-    os << " rob ", intstring(rob, -3), " ldq ", lsq,
+    os << " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
     if (loadstore.inherit_sfr_used) {
@@ -1371,7 +1401,7 @@
     break;
   }
   case EVENT_LOAD_BANK_CONFLICT: {
-    os << "ldbank", " rob ", intstring(rob, -3), " ldq ", lsq,
+    os << "ldbank", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
       "L1 bank conflict over bank ", lowbits(loadstore.sfr.physaddr, log2(CacheSubsystem::L1_DCACHE_BANKS));
@@ -1379,7 +1409,7 @@
   }
   case EVENT_LOAD_TLB_MISS: {
     os << (loadstore.load_store_second_phase ? "ldtlb2" : "ldtlb ");  
-    os << " rob ", intstring(rob, -3), " ldq ", lsq,
+    os << " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
     if (loadstore.inherit_sfr_used) {
@@ -1391,70 +1421,70 @@
     break;
   }
   case EVENT_LOAD_LOCK_REPLAY: {
-    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), " ldq ", lsq,
+    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
       "replay because vcpuid ", loadstore.locking_vcpuid, " uop uuid ", loadstore.locking_uuid, " has lock";
     break;
   }
   case EVENT_LOAD_LOCK_OVERFLOW: {
-    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), " ldq ", lsq,
+    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
       "replay because locking required but no free interlock buffers", endl;
     break;
   }
   case EVENT_LOAD_LOCK_ACQUIRED: {
-    os << "lk-acq", " rob ", intstring(rob, -3), " ldq ", lsq,
+    os << "lk-acq", " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ",
       "lock acquired";
     break;
   }
   case EVENT_LOAD_LFRQ_FULL:
-    os << "load   rob ", intstring(rob, -3), " ldq ", lsq, " r", intstring(physreg, -3), ": LFRQ or miss buffer full; replaying"; break;
+    os << "load   rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), ": LFRQ or miss buffer full; replaying"; break;
   case EVENT_LOAD_HIGH_ANNULLED: {
-    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), " ldq ", lsq,
+    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, " (phys ", (void*)(Waddr)(loadstore.sfr.physaddr << 3), "): ";
     os << "load was annulled (high unaligned load)";
     break;
   }
   case EVENT_LOAD_WAKEUP:
-    os << "ldwake rob ", intstring(rob, -3), " ldq ", lsq, " r", intstring(physreg, -3), " wakeup load via lfrq slot ", lfrqslot; break;
+    os << "ldwake rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " wakeup load via lfrq slot ", lfrqslot; break;
   case EVENT_TLBWALK_HIT: {
-    os << "wlkhit rob ", intstring(rob, -3), " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
+    os << "wlkhit rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
       loadstore.tlb_walk_level, "): hit for PTE at phys ", (void*)loadstore.virtaddr; break;
     break;
   }
   case EVENT_TLBWALK_MISS: {
-    os << "wlkmis rob ", intstring(rob, -3), " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
+    os << "wlkmis rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
       loadstore.tlb_walk_level, "): miss for PTE at phys ", (void*)loadstore.virtaddr, ": lfrq ", lfrqslot; break;
     break;
   }
   case EVENT_TLBWALK_WAKEUP: {
-    os << "wlkwak rob ", intstring(rob, -3), " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
+    os << "wlkwak rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
       loadstore.tlb_walk_level, "): wakeup from cache miss for phys ", (void*)loadstore.virtaddr, ": lfrq ", lfrqslot; break;
     break;
   }
   case EVENT_TLBWALK_NO_LFRQ_MB: {
-    os << "wlknml rob ", intstring(rob, -3), " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
+    os << "wlknml rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
       loadstore.tlb_walk_level, "): no LFRQ or MB for PTE at phys ", (void*)loadstore.virtaddr, ": lfrq ", lfrqslot; break;
     break;
   }
   case EVENT_TLBWALK_COMPLETE: {
-    os << "wlkhit rob ", intstring(rob, -3), " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
+    os << "wlkhit rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " ldq ", lsq, " r", intstring(physreg, -3), " page table walk (level ",
       loadstore.tlb_walk_level, "): complete!"; break;
     break;
   }
   case EVENT_LOAD_EXCEPTION: {
-    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), " stq ", lsq,
+    os << (loadstore.load_store_second_phase ? "load2 " : "load  "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, ": exception ", exception_name(exception), ", pfec ", PageFaultErrorCode(error_code);
     break;
   }
   case EVENT_STORE_EXCEPTION: {
-    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), " stq ", lsq,
+    os << "store", (loadstore.load_store_second_phase ? "2" : " "), " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " stq ", lsq,
       " r", intstring(physreg, -3), " on ", padstring(fu_names[fu], -4), " @ ",
       (void*)(Waddr)loadstore.virtaddr, ": exception ", exception_name(exception), ", pfec ", PageFaultErrorCode(error_code);
     break;
@@ -1462,16 +1492,16 @@
   case EVENT_ALIGNMENT_FIXUP:
     os << "algnfx", " rip ", rip, ": set unaligned bit for uop ", uop.bbindex, " (unaligned predictor slot ", SMTCore::hash_unaligned_predictor_slot(rip), ") and refetch"; break;
   case EVENT_FENCE_ISSUED:
-    os << "mfence rob ", intstring(rob, -3), " lsq ", lsq, " r", intstring(physreg, -3), ": memory fence (", uop, ")"; break;
+    os << "mfence rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " lsq ", lsq, " r", intstring(physreg, -3), ": memory fence (", uop, ")"; break;
   case EVENT_ANNUL_NO_FUTURE_UOPS:
-    os << "misspc rob ", intstring(rob, -3), ": SOM rob ", annul.somidx, ", EOM rob ", annul.eomidx, ": no future uops to annul"; break;
+    os << "misspc rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": SOM rob ", annul.somidx, ", EOM rob ", annul.eomidx, ": no future uops to annul"; break;
   case EVENT_ANNUL_MISSPECULATION: {
-    os << "misspc rob ", intstring(rob, -3), ": SOM rob ", annul.somidx, 
+    os << "misspc rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": SOM rob ", annul.somidx, 
       ", EOM rob ", annul.eomidx, ": annul from rob ", annul.startidx, " to rob ", annul.endidx;
     break;
   }
   case EVENT_ANNUL_EACH_ROB: {
-    os << "annul  rob ", intstring(rob, -3), ": annul rip ", rip;
+    os << "annul  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": annul rip ", rip;
     os << (uop.som ? " SOM" : "    "); os << (uop.eom ? " EOM" : "    ");
     os << ": free";
     os << " r", physreg;
@@ -1481,7 +1511,7 @@
     break;
   }
   case EVENT_ANNUL_PSEUDOCOMMIT: {
-    os << "pseucm rob ", intstring(rob, -3), ": r", physreg, " rebuild rrt:";
+    os << "pseucm rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", ": r", physreg, " rebuild rrt:";
     os << " arch ", arch_reg_names[uop.rd];
     if likely (!uop.nouserflags) {
       if (uop.setflags & SETFLAG_ZF) os << " zf";
@@ -1494,13 +1524,13 @@
   case EVENT_ANNUL_FETCHQ_RAS:
     os << "anlras rip ", rip, ": annul RAS update still in fetchq"; break;
   case EVENT_ANNUL_FLUSH:
-    os << "flush  rob ", intstring(rob, -3), " rip ", rip; break;
+    os << "flush  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " rip ", rip; break;
   case EVENT_REDISPATCH_DEPENDENTS:
-    os << "redisp rob ", intstring(rob, -3), " find all dependents"; break;
+    os << "redisp rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " find all dependents"; break;
   case EVENT_REDISPATCH_DEPENDENTS_DONE:
-    os << "redisp rob ", intstring(rob, -3), " redispatched ", (redispatch.count - 1), " dependent uops"; break;
+    os << "redisp rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " redispatched ", (redispatch.count - 1), " dependent uops"; break;
   case EVENT_REDISPATCH_EACH_ROB: {
-    os << "redisp rob ", intstring(rob, -3), " from state ", redispatch.current_state_list->name, ": dep on ";
+    os << "redisp rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " from state ", redispatch.current_state_list->name, ": dep on ";
     if (!redispatch.dependent_operands) {
       os << " [self]";
     } else {
@@ -1520,9 +1550,9 @@
     break;
   }
   case EVENT_COMPLETE:
-    os << "complt rob ", intstring(rob, -3), " on ", padstring(fu_names[fu], -4), ": r", intstring(physreg, -3); break;
+    os << "complt rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " on ", padstring(fu_names[fu], -4), ": r", intstring(physreg, -3); break;
   case EVENT_FORWARD: {
-    os << "forwd", forwarding.forward_cycle, " rob ", intstring(rob, -3), 
+    os << "forwd", forwarding.forward_cycle, " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", 
       " (", clusters[cluster].name, ") r", intstring(physreg, -3), 
       " => ", "uuid ", forwarding.target_uuid, " rob ", forwarding.target_rob,
       " (", clusters[forwarding.target_cluster].name, ") r", forwarding.target_physreg,
@@ -1535,13 +1565,13 @@
     break;
   }
   case EVENT_BROADCAST: {
-    os << "brcst", forwarding.forward_cycle, " rob ", intstring(rob, -3), 
+    os << "brcst", forwarding.forward_cycle, " rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", 
       " from cluster ", clusters[cluster].name, " to cluster ", clusters[forwarding.target_cluster].name,
       " on forwarding cycle ", forwarding.forward_cycle;
     break;
   }
   case EVENT_WRITEBACK: {
-    os << "write  rob ", intstring(rob, -3), " (cluster ", clusters[cluster].name, ") r", intstring(physreg, -3), "@", phys_reg_file_names[rfid], " = 0x", hexstring(writeback.data, 64), " ", flagstring(writeback.flags);
+    os << "write  rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " (cluster ", clusters[cluster].name, ") r", intstring(physreg, -3), "@", phys_reg_file_names[rfid], " = 0x", hexstring(writeback.data, 64), " ", flagstring(writeback.flags);
     if (writeback.transient) os << " (transient)";
     os << " (", writeback.consumer_count, " consumers";
     if (writeback.all_consumers_sourced_from_bypass) os << ", all from bypass";
@@ -1551,19 +1581,19 @@
     break;
   }
   case EVENT_COMMIT_FENCE_COMPLETED:
-    os << "mfcmit rob ", intstring(rob, -3), " fence committed: wake up waiting memory uops"; break;
+    os << "mfcmit rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " fence committed: wake up waiting memory uops"; break;
   case EVENT_COMMIT_EXCEPTION_DETECTED:
-    os << "detect rob ", intstring(rob, -3), " exception ", exception_name(exception), " (", exception, "), error code ", hexstring(error_code, 16), ", origvirt ", (void*)(Waddr)commit.origvirt; break;
+    os << "detect rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " exception ", exception_name(exception), " (", exception, "), error code ", hexstring(error_code, 16), ", origvirt ", (void*)(Waddr)commit.origvirt; break;
   case EVENT_COMMIT_EXCEPTION_ACKNOWLEDGED:
-    os << "except rob ", intstring(rob, -3), " exception ", exception_name(exception), " [EOM #", commit.total_user_insns_committed, "]"; break;
+    os << "except rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " exception ", exception_name(exception), " [EOM #", commit.total_user_insns_committed, "]"; break;
   case EVENT_COMMIT_SKIPBLOCK:
-    os << "skipbk rob ", intstring(rob, -3), " skip block: advance rip by ", uop.bytes, " to ", (void*)(Waddr)(rip.rip + uop.bytes), " [EOM #", commit.total_user_insns_committed, "]"; break;
+    os << "skipbk rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " skip block: advance rip by ", uop.bytes, " to ", (void*)(Waddr)(rip.rip + uop.bytes), " [EOM #", commit.total_user_insns_committed, "]"; break;
   case EVENT_COMMIT_SMC_DETECTED:
-    os << "smcdet rob ", intstring(rob, -3), " self-modifying code at rip ", rip, " detected (mfn was dirty); invalidate and retry [EOM #", commit.total_user_insns_committed, "]"; break;
+    os << "smcdet rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " self-modifying code at rip ", rip, " detected (mfn was dirty); invalidate and retry [EOM #", commit.total_user_insns_committed, "]"; break;
   case EVENT_COMMIT_MEM_LOCKED:
-    os << "waitlk rob ", intstring(rob, -3), " wait for lock on physaddr ", (void*)(commit.state.st.physaddr << 3), " to be released"; break;
+    os << "waitlk rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " wait for lock on physaddr ", (void*)(commit.state.st.physaddr << 3), " to be released"; break;
   case EVENT_COMMIT_OK: {
-    os << "commit rob ", intstring(rob, -3);
+    os << "commit rob ", intstring(rob, -3), "(",padstring(uopname,-5),")";
     if likely (archdest_can_commit[uop.rd])
                 os << " [rrt ", arch_reg_names[uop.rd], " = r", physreg, " 0x", hexstring(commit.state.reg.rddata, 64), "]";
 
@@ -1573,7 +1603,7 @@
         " -> ", flagstring(commit.state.reg.rdflags), "]";
     }
 
-    if (uop.eom) os << " [rip = ", (void*)(Waddr)commit.target_rip, "]";
+    if (uop.eom) os << " [rip = ", (void*)(Waddr)commit.target_rip, commit.krn ? " krn" : "", "]";
 
     if unlikely (st && (commit.state.st.bytemask != 0))
                   os << " [mem ", (void*)(Waddr)(commit.state.st.physaddr << 3), " = ", bytemaskstring((const byte*)&commit.state.st.data, commit.state.st.bytemask, 8), " mask ", bitstring(commit.state.st.bytemask, 8, true), "]";
@@ -1615,8 +1645,9 @@
     break;
   }
   case EVENT_COMMIT_ASSIST: {
-    os << "assist rob ", intstring(rob, -3), " calling assist ", (void*)rip.rip, " (#",
-      assist_index((assist_func_t)rip.rip), ": ", assist_name((assist_func_t)rip.rip), ")";
+    assist_func_t assist_func = assistid_to_func[rip.rip];
+    os << "assist rob ", intstring(rob, -3), "(",padstring(uopname,-5),")", " calling assist ", (void*)assist_func, " (#",
+      assist_index(assist_func), ": ", assist_name(assist_func), ")";
     break;
   }
   case EVENT_RECLAIM_PHYSREG:
@@ -1646,16 +1677,28 @@
 //
 
 bool SMTMachine::init(PTLsimConfig& config) {
+#ifdef ENABLE_SMT
   // Note: we only create a single core for all contexts for now.
   cores[0] = new SMTCore(0, *this);
+  corecount = 1;  // SMT build: every context becomes a thread of core 0
+#else
+  corecount = 0;  // non-SMT build: the loop below creates one core per context
+#endif
 
   foreach (i, contextcount) {
+#ifdef ENABLE_SMT
     SMTCore& core = *cores[0];
+#else
+    cores[corecount] = new SMTCore(corecount, *this);
+    SMTCore& core    = *cores[corecount];
+    corecount++;
+#endif
+
+    ThreadContext* thread = new ThreadContext(core, core.threadcount, contextof(i));  // thread id = next free slot on this core
+    core.threads[core.threadcount] = thread;
     core.threadcount++;
-    ThreadContext* thread = new ThreadContext(core, i, contextof(i));
-    core.threads[i] = thread;
     thread->init();
-
+    logfile << "New ThreadContext: Core ", core.coreid, " (", core.threadcount," threads) Thread ", thread->threadid, " *thread=",thread, " *core=",&core, ", ", corecount, " total cores", endl, flush;
     //
     // Note: in a multi-processor model, config may
     // specify various ways of slicing contextcount up
@@ -1665,8 +1708,10 @@
     //
   }
 
-  cores[0]->init();
+  foreach (i, corecount) cores[i]->init();
   init_luts();
+
+  init_random(true);
   return true;
 }
 
@@ -1682,14 +1727,16 @@
   // All VCPUs are running:
   stopped = 0;
 
-  cores[0]->reset();
-  cores[0]->flush_pipeline_all();
+  foreach (i, corecount) {
+    cores[i]->reset();
+    cores[i]->flush_pipeline_all();
 
-  logfile << "IssueQueue states:", endl;
+    logfile << "IssueQueue states:", endl;
 
-  if unlikely (config.event_log_enabled && (!cores[0]->eventlog.start)) {
-    cores[0]->eventlog.init(config.event_log_ring_buffer_size);
-    cores[0]->eventlog.logfile = &logfile;
+    if unlikely (config.event_log_enabled && (!cores[i]->eventlog.start)) {
+      cores[i]->eventlog.init(config.event_log_ring_buffer_size);
+      cores[i]->eventlog.logfile = &logfile;
+    }
   }
 
   bool exiting = false;
@@ -1704,30 +1751,34 @@
     update_progress();
     inject_events();
 
-    SMTCore& core =* cores[0]; // only one core for now
-    foreach (i, core.threadcount) {
-      ThreadContext* thread = core.threads[i];
+    foreach (j, corecount) {
+      SMTCore& core =* cores[j];
+      foreach (i, core.threadcount) {
+        ThreadContext* thread = core.threads[i];
 #ifdef PTLSIM_HYPERVISOR
-      if unlikely (!thread->ctx.running) {
-        if unlikely (stopping) {
-          // Thread is already waiting for an event: stop it now
-          logfile << "[vcpu ", thread->ctx.vcpuid, "] Already stopped at cycle ", sim_cycle, endl;
-          stopped[thread->ctx.vcpuid] = 1;
-        } else {
-          if (thread->ctx.check_events()) thread->handle_interrupt();
+        if unlikely (!thread->ctx.running) {
+          if unlikely (stopping) {
+            // Thread is already waiting for an event: stop it now
+            logfile << "[vcpu ", thread->ctx.vcpuid, "] Already stopped at cycle ", sim_cycle, endl;
+            stopped[thread->ctx.vcpuid] = 1;
+          } else {
+            if (thread->ctx.check_events()) thread->handle_interrupt();
+          }
+          continue; /* NB, SD: Back to foreach (i, core.threadcount), that doesn't make much sense in the original impl, either! */
         }
-        continue;
+#endif
       }
-#endif
+
+      exiting |= core.runcycle();
     }
-
-    exiting |= core.runcycle();
 
     if unlikely (check_for_async_sim_break() && (!stopping)) {
       logfile << "Waiting for all VCPUs to reach stopping point, starting at cycle ", sim_cycle, endl;
       // force_logging_enabled();
-      SMTCore& core =* cores[0];
-      foreach (i, core.threadcount) core.threads[i]->stop_at_next_eom = 1;
+      foreach (j, corecount) {
+        SMTCore& core = *cores[j];
+        foreach (i, core.threadcount) core.threads[i]->stop_at_next_eom = 1;
+      }
       if (config.abort_at_end) {
         config.abort_at_end = 0;
         logfile << "Abort immediately: do not wait for next x86 boundary nor flush pipelines", endl;
@@ -1752,19 +1803,20 @@
 
   logfile << "Exiting SMT mode at ", total_user_insns_committed, " commits, ", total_uops_committed, " uops and ", iterations, " iterations (cycles)", endl;
 
-  SMTCore& core =* cores[0]; /// only one core for now.
+  foreach (j, corecount) {
+    SMTCore& core =* cores[j]; 
 
-  foreach (i, core.threadcount) {
-    ThreadContext* thread = core.threads[i];
+    foreach (i, core.threadcount) {
+      ThreadContext* thread = core.threads[i];
 
-    thread->core_to_external_state();
+      thread->core_to_external_state();
 
-    if (logable(6) | ((sim_cycle - thread->last_commit_at_cycle) > 1024) | config.dump_state_now) {
-      logfile << "Core State at end for thread ", thread->threadid, ": ", endl;
-      logfile << thread->ctx;
+      if (logable(6) | ((sim_cycle - thread->last_commit_at_cycle) > 1024) | config.dump_state_now) {
+        logfile << "Core State at end for thread ", thread->threadid, ": ", endl;
+        logfile << thread->ctx;
+      }
     }
   }
-
   config.dump_state_now = 0;
 
   dump_state(logfile);
@@ -1785,13 +1837,20 @@
     //logfile << "ITLB before: ", endl, caches.itlb, endl;
   }
   int dn; int in;
-
   if unlikely (selective) {
     dn = caches.dtlb.flush_virt(virtaddr, threadid);
     in = caches.itlb.flush_virt(virtaddr, threadid);
+#ifdef USE_L2_TLB
+    dn += caches.l2dtlb.flush_virt(virtaddr, threadid);
+    in += caches.l2itlb.flush_virt(virtaddr, threadid);
+#endif
   } else {
     dn = caches.dtlb.flush_thread(threadid);
     in = caches.itlb.flush_thread(threadid);
+#ifdef USE_L2_TLB
+    dn += caches.l2dtlb.flush_thread(threadid);
+    in += caches.l2itlb.flush_thread(threadid);
+#endif
   }
   if (logable(5)) {
     logfile << "Flushed ", dn, " DTLB slots and ", in, " ITLB slots", endl;
@@ -1801,16 +1860,26 @@
 }
 
 void SMTMachine::flush_tlb(Context& ctx) {
-  // This assumes all VCPUs are mapped as threads in a single SMT core
+  // Static VCPU mapping: with ENABLE_SMT each VCPU is a thread of core 0, otherwise each VCPU is thread 0 of its own core
+#ifdef ENABLE_SMT
   int coreid = 0;
   int threadid = ctx.vcpuid;
+#else
+  int coreid = ctx.vcpuid;
+  int threadid = 0;
+#endif
   cores[coreid]->flush_tlb(ctx, threadid);
 }
 
 void SMTMachine::flush_tlb_virt(Context& ctx, Waddr virtaddr) {
-  // This assumes all VCPUs are mapped as threads in a single SMT core
+  // Static VCPU mapping: with ENABLE_SMT each VCPU is a thread of core 0, otherwise each VCPU is thread 0 of its own core
+#ifdef ENABLE_SMT
   int coreid = 0;
   int threadid = ctx.vcpuid;
+#else
+  int coreid = ctx.vcpuid;
+  int threadid = 0;
+#endif
   cores[coreid]->flush_tlb(ctx, threadid, true, virtaddr);
 }
 
@@ -1916,7 +1985,355 @@
   }  
 }
 
-SMTMachine smtmodel("smt");
+/**
+ * Adds an address to the locked-line buffer (LLB). During the ACQUIRE
+ * instruction, the cachelines for the addresses inside the LLB are fetched,
+ * so that the original data can be restored if the transaction aborts.
+ *
+ * @param addr The physical address of the data to be stored in the LLB.
+ * @return The LLB line selected for addr; NULL, if there is no more space in the LLB.
+ */
+LLBLine* LockedLineBuffer::add_location(Waddr addr) {
+  Waddr cache_line_phys_addr = floor(addr, LLB_LINE_SIZE);
+
+  if unlikely (num_locations >= ASF_MAX_SPEC_LINES) return NULL;  // limit counts locations (references), not distinct lines
+
+  /* Touch the line now, fill it later */
+  LLBLine* line = select(cache_line_phys_addr);
+  line->refcount++;  // one reference per added location; remove_ref() drops it again
+  num_locations++;
+  if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Adding location ",hexstring(addr,64),
+                             " locations: ",num_locations," line: ",line," refcount: ",line->refcount ,endl;
+  return line;
+}
+
+/**
+ * Removes all addresses and their undo data from the LLB and clears the recorded error (lasterr).
+ */
+void LockedLineBuffer::clear() {
+  if likely(empty()) return;  // nothing tracked: skip logging and the reset below
+  if (logable(5))  logfile <<"[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Clearing the LLB! Locations: ",num_locations, endl;
+  num_locations = 0;
+  lasterr = 0;
+  reset();  // NOTE(review): defined outside this chunk; presumably invalidates all lines - confirm
+}
+
+/**
+ * Creates a snapshot of the cachelines associated with all the addresses in the LLB.
+ */
+void LockedLineBuffer::snapshot() {
+  int c = 0;  // sum of the refcounts of all valid lines; cross-checked against num_locations below
+  /* The assertion below is disabled because it can actually fail: the acquire makes the snapshot
+      at issue time, where possibly some lock-loads are still mis-speculated and not resolved! */
+  //assert(num_locations <= ASF_MAX_LINES);
+  for (int i = 0; i < ASF_MAX_SPEC_LINES; i++) {
+    if likely (tags[i] != tags.INVALID) {
+      /* Fetch the cacheline from the given address */
+      if (logable(5)) {
+        logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,
+                   ": Fetching LLB line ", i, " from address ", hexstring(tags[i],64),
+                   " Data :", endl, bytestring((W8*)phys_to_mapped_virt(tags[i]), LLB_LINE_SIZE), endl;
+        logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Copying ", sizeof(data[i].orig_data),
+                   " bytes from ", phys_to_mapped_virt(tags[i]), " to ", data[i].orig_data, endl;
+      }
+      memcpy(data[i].orig_data, phys_to_mapped_virt(tags[i]), sizeof(data[i].orig_data));  // save current memory contents as undo data
+      c += data[i].refcount;
+    }
+  }
+  assert(c == num_locations);  // every counted location must be backed by a reference on a valid line
+}
+
+/**
+ * Write the original data of all written LLB lines back to memory, undoing
+ * any changes made to them. Lines that were never written are left untouched.
+ */
+void LockedLineBuffer::undo() {
+  int c = 0;  // sum of the refcounts of all valid lines; cross-checked against num_locations below
+  /* The assertion below is disabled because it can actually fail: the acquire makes the snapshot
+     at issue time, where possibly some lock-loads are still mis-speculated and not resolved! */
+  //assert(num_locations <= ASF_MAX_LINES);
+  for (int i = 0; i < ASF_MAX_SPEC_LINES; i++) {
+    if likely (tags[i] != tags.INVALID) {
+      /* Write the cacheline back to its position. */
+      if (logable(5))
+        logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,
+                   ":Restoring LLB line ", i, " at address ", hexstring(tags[i],64),
+                   " Data :", endl, bytestring(data[i].orig_data, LLB_LINE_SIZE), endl;
+
+      if (data[i].written) {
+        if (logable(5))
+          logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Copying ", sizeof(data[i].orig_data),
+                     " bytes from ",  data[i].orig_data, " to ",phys_to_mapped_virt(tags[i]), endl;
+        memcpy(phys_to_mapped_virt(tags[i]), data[i].orig_data, sizeof(data[i].orig_data));  // restore the saved original contents
+        data[i].written = false;  // line matches memory again
+      } else {
+        if (logable(5)) 
+          logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Ignoring ", sizeof(data[i].orig_data),
+                     " unmodified bytes from ",  data[i].orig_data, " @ ",phys_to_mapped_virt(tags[i]), endl;
+      }
+
+      c += data[i].refcount;
+    }
+  }
+  assert(c == num_locations);  // every counted location must be backed by a reference on a valid line
+}
+
+/**
+ * Notifies the LLB, that a reference to one of its lines has been dropped.
+ * This can occur when a ROBEntry gets redispatched / annulled and thus must
+ * get removed from the LLB.
+ *
+ * @param line Pointer to the line inside the LLB which is to be removed.
+ */
+void LockedLineBuffer::remove_ref(LLBLine* line) {
+  Waddr tag = tagof(line);
+
+  if unlikely (tag == tags.INVALID) {  // line already invalidated: nothing left to release
+    if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,
+                               ": Line not in LLB anymore. Ignoring remove request!", endl;
+    return;
+  }
+  assert(line->refcount > 0);
+  if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Removing reference to line ",line,
+                             " tag: ",tagof(line)," refcount: ", line->refcount, endl;
+
+  line->refcount--;  // mirrors the increments done in add_location()
+  num_locations--;
+
+  /* Remove a line which does not belong to any valid instructions any longer! */
+  if (!line->refcount) {
+    if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": No more references to line ",line,
+                               ". Removing it!", endl, flush;
+    invalidate_line(line);
+  }
+}
+
+/**
+ * Incoming probe from another core. Checks our own LLB for any conflicting cachelines.
+ * Either of the two accesses has to be aborted / stalled. Will provide a pointer to the
+ * backed up data for short-circuit forwarding of the original content!
+ * @param addr Address of the cacheline to probe.
+ * @param invalidating True, if the incoming probe is an invalidating one, eg from a write access.
+ * @return Pointer to original data, if speculative updates have occurred to that line! Null, if
+ *         line not written / not present in LLB (a conflict may still have been recorded in lasterr).
+ */
+void* LockedLineBuffer::external_probe(Waddr addr, bool invalidating) {
+  /* For now just implement the policy of aborting ourselves. */
+  Waddr cache_line_phys_addr = floor(addr, LLB_LINE_SIZE);
+  LLBLine* l = probe(cache_line_phys_addr);
+
+  if likely (!l) return null;									// Line not in LLB
+  if likely (!l->written && !invalidating) return null; 		// multiple readers ok!
+
+  if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": ",invalidating?"Inv-":"", "probe hit on ", addr, endl;
+
+  /* Either invalidating probe or modified data read before commit:
+     we (callee) will abort our ASF-CS! */
+  lasterr = cache_line_phys_addr; //TODO: Proper errorcode! For now the conflicting line address doubles as the error value.
+
+  /* In case we had the line modified, forward the unmodified data! */
+  if unlikely(l->written) return &l->orig_data;
+  return null;  // invalidating probe on a line we only read: conflict recorded, nothing to forward
+}
+
+/**
+ * Sends a probe for the given address to the LLB of every other hardware
+ * thread in the machine and reports the first saved original copy found.
+ * @param addr Address of the cacheline to probe.
+ * @param invalidating True, if the probe is an invalidating one,
+ *                     eg from a write access.
+ * @return Pointer to the first non-null original copy returned by a probed
+ *         LLB. Null, if no probed LLB returned one.
+ */
+void* LockedLineBuffer::probe_other_LLBs(Waddr addr, bool invalidating) {
+  SMTMachine& sys = thread.core.machine;
+  void* first_copy = null;
+
+  foreach(core_idx, sys.corecount) {
+    SMTCore& peer_core = *sys.cores[core_idx];
+    foreach(thread_idx, peer_core.threadcount) {
+      ThreadContext* peer = peer_core.threads[thread_idx];
+      if (peer == &thread) continue;  // never probe the LLB we belong to
+      // No early exit: external_probe() also records the conflict inside each probed LLB.
+      void* copy = peer->locked_line_buffer.external_probe(addr, invalidating);
+      if (!first_copy) first_copy = copy;
+    }
+  }
+  return first_copy;
+}
+
+/**
+ * Clears the written flag of the LLB line holding the given address, if any.
+ * undo() only copies back lines whose written flag is set.
+ * @param addr Address inside the cacheline to mark.
+ */
+void LockedLineBuffer::mark_clean(Waddr addr) {
+  Waddr line_base = floor(addr, LLB_LINE_SIZE);
+  LLBLine* const entry = probe(line_base);
+  if (entry != NULL) entry->written = false;  // addresses not held in this LLB are ignored
+}
+
+/**
+ * Calls mark_clean() for the given address on the LLB of every hardware
+ * thread in the machine except the one owning this LLB.
+ * @param addr Address inside the cacheline to mark.
+ */
+void LockedLineBuffer::mark_clean_others(Waddr addr) {
+  SMTMachine& sys = thread.core.machine;
+
+  foreach(core_idx, sys.corecount) {
+    SMTCore& peer_core = *sys.cores[core_idx];
+    foreach(thread_idx, peer_core.threadcount) {
+      ThreadContext* peer = peer_core.threads[thread_idx];
+      // Our own LLB is skipped; all remaining threads of all cores are updated.
+      if (peer != &thread) peer->locked_line_buffer.mark_clean(addr);
+    }
+  }
+}
+
+/**
+ * Marks the given cache-line as dirty (written = true) in this LLB.
+ * Write-back will occur on eviction.
+ * @param addr Address of the cacheline to mark; floor()ed to LLB_LINE_SIZE before the lookup.
+ */
+void LockedLineBuffer::mark_written(Waddr addr) {
+  Waddr cache_line_phys_addr = floor(addr, LLB_LINE_SIZE);
+  LLBLine* l = probe(cache_line_phys_addr);
+  if (l) l->written = true; // nothing to do when probe() finds no line
+}
+
+/**
+ * Rolls back all speculative modifications by the critical section and
+ * moves control flow back to the last acquire instruction.
+ * @param errorcode The error code, which the re-executed acquire instruction
+ *                  should return (stored in asf_stored_error).
+ * @param reg_nextrip The index of the register, which will contain the RIP of the next instruction; it receives asf_failing_acquire.
+ */
+void ThreadContext::asf_rollback_last_acq(W64 errorcode, int reg_nextrip) {
+  assert(asf_failing_acquire);
+  LockedLineBuffer& llb = locked_line_buffer;
+
+  if (logable(5)) logfile << "[vcpu ", ctx.vcpuid,"]" __FILE__,":",__LINE__,": Aborting critical section, jumping back to ", asf_failing_acquire, endl;
+  llb.abort();
+  asf_in_crit_sec       = false;
+  asf_reissue_will_fail = true;
+  asf_stored_error      = errorcode;
+
+  ctx.commitarf[reg_nextrip] = asf_failing_acquire;
+  if ((ctx.commitarf[REG_rsp] != asf_saved_rsp) && logable(5)) // RSP is restored unconditionally below; a mismatch is only logged
+    logfile << "[vcpu ", ctx.vcpuid,"]" __FILE__,":",__LINE__,": Restoring RSP to ", (void*)asf_saved_rsp,
+               " was ",(void*)ctx.commitarf[REG_rsp],endl;
+  ctx.commitarf[REG_rsp]     = asf_saved_rsp;
+}
+
+/**
+ * When the core is running in an ASF transaction check for conflicting accesses from other cores
+ * and abort the currently running transaction (in optimistic mode) if detected!
+ */
+void ThreadContext::check_asf_conflicts() {
+  LockedLineBuffer& llb     = locked_line_buffer;
+  W64               asf_err = llb.consistency_error(); // kept 64 bit wide, as it is logged below; NOTE(review): confirm the return type
+
+  if likely (!asf_err) return;
+  if (logable(5)) logfile << "[vcpu ", ctx.vcpuid,"]" __FILE__,":",__LINE__,": Error ", hexstring(asf_err, 64),
+                             " found! Aborting the transaction!", endl;
+  /* TODO: This is for optimistic mode just a plain -16, but for testing make this somewhat more useful!
+           We could actually also use some data from the LLB here. For now the current RIP is passed as error code. */
+  asf_rollback_last_acq(ctx.commitarf[REG_rip], REG_rip);
+  if (logable(5)) logfile << "[vcpu ", ctx.vcpuid,"]" __FILE__,":",__LINE__,": Flushing the pipeline!", endl;
+  flush_pipeline();
+}
+
+/**
+ * Allow ASF to see any effects of the stages of the CPU just before the cycle ends and
+ * things such as exceptions are processed.
+ * @param commitrc Return code of the commit operation, used to tweak exception handling
+ *                 when inside ASF's critical sections.
+ * @return The commit return code; every path below currently hands commitrc back unchanged.
+ */
+int ThreadContext::asf_runcycle(int commitrc) {
+  /* Check for conflicts of the ongoing critical section */
+  if unlikely (asf_in_crit_sec) {
+    //TODO: Add proper interrupt deferral treatment!
+    if unlikely (commitrc == COMMIT_RESULT_INTERRUPT) {
+      logfile << "[vcpu ", ctx.vcpuid,"]" __FILE__,":",__LINE__,"@",sim_cycle,
+                 " Interrupt at rip ", (void*)(Waddr)ctx.commitarf[REG_rip],
+                 " faking at acquire. ", endl, flush;
+      asf_rollback_last_acq(ctx.exception, REG_rip);
+    }
+
+    else if unlikely (commitrc == COMMIT_RESULT_EXCEPTION) {
+      /* Exceptions trigger, but as if they were caused by the ACQUIRE! */
+      if (ctx.exception != EXCEPTION_SkipBlock) {
+        logfile << "[vcpu ", ctx.vcpuid,"]" __FILE__,":",__LINE__,"@",sim_cycle," Exception ", exception_name(ctx.exception),
+          " called from rip ", (void*)(Waddr)ctx.commitarf[REG_rip], " faking it at the last acquire!", endl, flush;
+        asf_rollback_last_acq(ctx.exception, REG_rip);
+      }
+    }
+
+    else if unlikely (commitrc == COMMIT_RESULT_BARRIER) {
+      int assistid = ctx.commitarf[REG_rip];
+      /* Far control flow movements push the rip of the last Acquire on the stack! */
+      if (inrange(assistid, (int)ASSIST_INT, (int)ASSIST_IRET64))
+        logfile << "[vcpu ", ctx.vcpuid,"]" __FILE__,":",__LINE__,"@",sim_cycle,
+          " Assist ", assist_names[assistid]," called from rip ", (void*)(Waddr)ctx.commitarf[REG_selfrip],
+          " faking it to return to old acquire!", endl, flush;
+
+      //NOTE: This is scary stuff!
+      if unlikely (assistid == ASSIST_INT) {
+        /* int just pushes the RIP of the next instruction onto the stack -> modify this! */
+        asf_rollback_last_acq(assistid, REG_nextrip);
+      } else if unlikely (assistid == ASSIST_SYSCALL) {
+        /* syscall moves the RIP of the next instruction into RCX, PTLsim has done that in
+           ucode already before this assist -> change RCX! */
+        asf_rollback_last_acq(assistid, REG_RCX);
+      } else {
+        /* Can't do much for all the other craziness! No rollback happens here, it is only logged. */
+        logfile << "[vcpu ", ctx.vcpuid,"]" __FILE__,":",__LINE__,"@",sim_cycle,
+          " Unhandled assist ", assist_names[assistid], " encountered within an ASF critical section. This is pretty bad!!!", endl, flush;
+      }
+    }
+
+    if likely ((commitrc == COMMIT_RESULT_OK) || (commitrc == COMMIT_RESULT_NONE)) {
+      /* Just checking for normal interference from other cores.. */
+      check_asf_conflicts();
+      return commitrc;
+    }
+  } else {
+    /* we're not inside a critical section! */
+    if likely (commitrc == COMMIT_RESULT_OK) return commitrc;
+    if likely (locked_line_buffer.empty())   return commitrc;
+
+    /* In case of any exception / far control transfer: clear the LLB!
+       Example: A lock prefetchw causes a page fault -> the subsequent acquire will fail! */
+    if unlikely (commitrc == COMMIT_RESULT_EXCEPTION) {
+      logfile << "[vcpu ", ctx.vcpuid,"]" __FILE__,":",__LINE__,"@",sim_cycle," Exception ", exception_name(ctx.exception),
+        " called from rip ", (void*)(Waddr)ctx.commitarf[REG_rip],  endl, flush;
+
+      if (ctx.exception != EXCEPTION_SkipBlock) locked_line_buffer.clear();
+    }
+
+    else if unlikely (commitrc == COMMIT_RESULT_BARRIER) {
+      int assistid = ctx.commitarf[REG_rip];
+      //NOTE: This relies on assist-id ordering!
+      //TODO: Add other ASF unfriendly assists here!
+      if (inrange(assistid, (int)ASSIST_INT, (int)ASSIST_IRET64)) {
+        logfile << "[vcpu ", ctx.vcpuid,"]" __FILE__,":",__LINE__,"@",sim_cycle," Assist ", assist_names[assistid],
+          " called from rip ", (void*)(Waddr)ctx.commitarf[REG_selfrip],  endl, flush;
+        locked_line_buffer.clear();
+      }
+    }
+
+    else if unlikely (commitrc == COMMIT_RESULT_INTERRUPT) {
+      logfile << "[vcpu ", ctx.vcpuid,"]" __FILE__,":",__LINE__,"@",sim_cycle," Interrupt at rip ", (void*)(Waddr)ctx.commitarf[REG_rip], endl, flush;
+      locked_line_buffer.clear();
+    }
+  }
+  return commitrc;
+}
+
+/* The machine instance used by SMTModel::coreof(); we have ASF now, hence the name "asfsmt". */
+SMTMachine smtmodel("asfsmt");
 
 SMTCore& SMTModel::coreof(int coreid) {
   return *smtmodel.cores[coreid];
diff -r 10448c053ad6 smtcore.h
--- a/smtcore.h	Thu May 31 15:36:20 2007 +0200
+++ b/smtcore.h	Wed Nov 05 14:15:51 2008 +0100
@@ -3,1994 +3,49 @@
 // PTLsim: Cycle Accurate x86-64 Simulator
 // SMT Core Simulator Configuration
 //
+// This program is free software; you can redistribute it and/or
+// modify it under the terms of the GNU General Public License
+// as published by the Free Software Foundation; either version 2
+// of the License, or (at your option) any later version.
+// 
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public License for more details.
+// 
+// You should have received a copy of the GNU General Public License
+// along with this program; if not, write to the Free Software
+// Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+// 02110-1301, USA.
+//
 // Copyright 2003-2006 Matt T. Yourst <yourst@yourst.com>
 // Copyright 2006 Hui Zeng <hzeng@cs.binghamton.edu>
+// Copyright 2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #ifndef _SMTCORE_H_
 #define _SMTCORE_H_
 
+#include <ptlsim.h>
+#include <random_inject.h>
+
 // With these disabled, simulation is faster
 // #define ENABLE_CHECKS
 // #define ENABLE_LOGGING
 
-//
-// Enable SMT operation:
-//
-// Note that this limits some configurations of resources and
-// issue queues that would normally be possible in single
-// threaded mode.
-//
-
-//#define ENABLE_SMT
-
-static const int MAX_THREADS_BIT = 4; // up to 16 threads
-static const int MAX_ROB_IDX_BIT = 12; // up to 4096 ROB entries
-
-#ifdef ENABLE_SMT
-static const int MAX_THREADS_PER_CORE = 4;
+#ifdef CORE_GENERIC
+#include <smtcore-generic.h>
 #else
-static const int MAX_THREADS_PER_CORE = 1;
+#ifdef CORE_AMD_K8
+#include <smtcore-amd-k8.h>
+#else
+#ifdef CORE_AMD_BARCELONA_ASF
+#include <smtcore-amd-barcelona-asf.h>
+#else
+#error Please specify a core flavour by defining CORE_XXX in ptlsim.h!
+#endif
+#endif
 #endif
 
-//#define ENABLE_SIM_TIMING
-#ifdef ENABLE_SIM_TIMING
-#define time_this_scope(ct) CycleTimerScope ctscope(ct)
-#define start_timer(ct) ct.start()
-#define stop_timer(ct) ct.stop()
-#else
-#define time_this_scope(ct) (0)
-#define start_timer(ct) (0)
-#define stop_timer(ct) (0)
-#endif
-
-#define per_context_smtcore_stats_ref(vcpuid) (*(((PerContextSMTStats*)&stats.smtcore.vcpu0) + (vcpuid)))
-#define per_context_smtcore_stats_update(vcpuid, expr) stats.smtcore.total.expr, per_context_smtcore_stats_ref(vcpuid).expr
-
-namespace SMTModel {
-  //
-  // Operand formats
-  //
-  static const int MAX_OPERANDS = 4;
-  static const int RA = 0;
-  static const int RB = 1;
-  static const int RC = 2;
-  static const int RS = 3; // (for stores only)
-
-  //
-  // Uop to functional unit mappings
-  //
-  static const int FU_COUNT = 8;
-  static const int LOADLAT = 2;
-
-  enum {
-    FU_LDU0       = (1 << 0),
-    FU_STU0       = (1 << 1),
-    FU_LDU1       = (1 << 2),
-    FU_STU1       = (1 << 3),
-    FU_ALU0       = (1 << 4),
-    FU_FPU0       = (1 << 5),
-    FU_ALU1       = (1 << 6),
-    FU_FPU1       = (1 << 7),
-  };
-
-  static const int LOAD_FU_COUNT = 2;
-
-  const char* fu_names[FU_COUNT] = {
-    "ldu0",
-    "stu0",
-    "ldu1",
-    "stu1",
-    "alu0",
-    "fpu0",
-    "alu1",
-    "fpu1",
-  };
-
-  //
-  // Opcodes and properties
-  //
-#define ALU0 FU_ALU0
-#define ALU1 FU_ALU1
-#define STU0 FU_STU0
-#define STU1 FU_STU1
-#define LDU0 FU_LDU0
-#define LDU1 FU_LDU1
-#define FPU0 FU_FPU0
-#define FPU1 FU_FPU1
-#define A 1 // ALU latency, assuming fast bypass
-#define L LOADLAT
-
-#define ANYALU ALU0|ALU1
-#define ANYLDU LDU0|LDU1
-#define ANYSTU STU0|STU1
-#define ANYFPU FPU0|FPU1
-#define ANYINT ANYALU|ANYSTU|ANYLDU
-
-  struct FunctionalUnitInfo {
-    byte opcode;   // Must match definition in ptlhwdef.h and ptlhwdef.cpp! 
-    byte latency;  // Latency in cycles, assuming ideal bypass
-    W16  fu;       // Map of functional units on which this uop can issue
-  };
-
-  //
-  // WARNING: This table MUST be kept in sync with the table
-  // in ptlhwdef.cpp and the uop enum in ptlhwdef.h!
-  //
-  const FunctionalUnitInfo fuinfo[OP_MAX_OPCODE] = {
-    // name, latency, fumask
-    {OP_nop,            A, ANYINT|ANYFPU},
-    {OP_mov,            A, ANYINT|ANYFPU},
-    // Logical
-    {OP_and,            A, ANYINT|ANYFPU},
-    {OP_andnot,         A, ANYINT|ANYFPU},
-    {OP_xor,            A, ANYINT|ANYFPU},
-    {OP_or,             A, ANYINT|ANYFPU},
-    {OP_nand,           A, ANYINT|ANYFPU},
-    {OP_ornot,          A, ANYINT|ANYFPU},
-    {OP_eqv,            A, ANYINT|ANYFPU},
-    {OP_nor,            A, ANYINT|ANYFPU},
-    // Mask, insert or extract bytes
-    {OP_maskb,          A, ANYINT},
-    // Add and subtract
-    {OP_add,            A, ANYINT},
-    {OP_sub,            A, ANYINT},
-    {OP_adda,           A, ANYINT},
-    {OP_suba,           A, ANYINT},
-    {OP_addm,           A, ANYINT},
-    {OP_subm,           A, ANYINT},
-    // Condition code logical ops
-    {OP_andcc,          A, ANYINT},
-    {OP_orcc,           A, ANYINT},
-    {OP_xorcc,          A, ANYINT},
-    {OP_ornotcc,        A, ANYINT},
-    // Condition code movement and merging
-    {OP_movccr,         A, ANYINT},
-    {OP_movrcc,         A, ANYINT},
-    {OP_collcc,         A, ANYINT},
-    // Simple shifting (restricted to small immediate 1..8)
-    {OP_shls,           A, ANYINT},
-    {OP_shrs,           A, ANYINT},
-    {OP_bswap,          A, ANYINT},
-    {OP_sars,           A, ANYINT},
-    // Bit testing
-    {OP_bt,             A, ANYALU},
-    {OP_bts,            A, ANYALU},
-    {OP_btr,            A, ANYALU},
-    {OP_btc,            A, ANYALU},
-    // Set and select
-    {OP_set,            A, ANYINT},
-    {OP_set_sub,        A, ANYINT},
-    {OP_set_and,        A, ANYINT},
-    {OP_sel,            A, ANYINT},
-    // Branches
-    {OP_br,             A, ANYINT},
-    {OP_br_sub,         A, ANYINT},
-    {OP_br_and,         A, ANYINT},
-    {OP_jmp,            A, ANYINT},
-    {OP_bru,            A, ANYINT},
-    {OP_jmpp,           A, ANYALU|ANYLDU},
-    {OP_brp,            A, ANYALU|ANYLDU},
-    // Checks
-    {OP_chk,            A, ANYINT},
-    {OP_chk_sub,        A, ANYINT},
-    {OP_chk_and,        A, ANYINT},
-    // Loads and stores
-    {OP_ld,             L, ANYLDU},
-    {OP_ldx,            L, ANYLDU},
-    {OP_ld_pre,         1, ANYLDU},
-    {OP_st,             1, ANYSTU},
-    {OP_mf,             1, STU0  },
-    // Shifts, rotates and complex masking
-    {OP_shl,            A, ANYALU},
-    {OP_shr,            A, ANYALU},
-    {OP_mask,           A, ANYALU},
-    {OP_sar,            A, ANYALU},
-    {OP_rotl,           A, ANYALU},  
-    {OP_rotr,           A, ANYALU},   
-    {OP_rotcl,          A, ANYALU},
-    {OP_rotcr,          A, ANYALU},  
-    // Multiplication
-    {OP_mull,           4, ANYFPU},
-    {OP_mulh,           4, ANYFPU},
-    {OP_mulhu,          4, ANYFPU},
-    // Bit scans
-    {OP_ctz,            3, ANYFPU},
-    {OP_clz,            3, ANYFPU},
-    {OP_ctpop,          3, ANYFPU},  
-    {OP_permb,          4, ANYFPU},
-    // Floating point
-    // uop.size bits have following meaning:
-    // 00 = single precision, scalar (preserve high 32 bits of ra)
-    // 01 = single precision, packed (two 32-bit floats)
-    // 1x = double precision, scalar or packed (use two uops to process 128-bit xmm)
-    {OP_addf,           6, ANYFPU},
-    {OP_subf,           6, ANYFPU},
-    {OP_mulf,           6, ANYFPU},
-    {OP_maddf,          6, ANYFPU},
-    {OP_msubf,          6, ANYFPU},
-    {OP_divf,           6, ANYFPU},
-    {OP_sqrtf,          6, ANYFPU},
-    {OP_rcpf,           6, ANYFPU},
-    {OP_rsqrtf,         6, ANYFPU},
-    {OP_minf,           4, ANYFPU},
-    {OP_maxf,           4, ANYFPU},
-    {OP_cmpf,           4, ANYFPU},
-    // For fcmpcc, uop.size bits have following meaning:
-    // 00 = single precision ordered compare
-    // 01 = single precision unordered compare
-    // 10 = double precision ordered compare
-    // 11 = double precision unordered compare
-    {OP_cmpccf,         4, ANYFPU},
-    // and/andn/or/xor are done using integer uops
-    {OP_permf,          3, ANYFPU}, // shuffles
-    // For these conversions, uop.size bits select truncation mode:
-    // x0 = normal IEEE-style rounding
-    // x1 = truncate to zero
-    {OP_cvtf_i2s_ins,   6, ANYFPU},
-    {OP_cvtf_i2s_p,     6, ANYFPU},
-    {OP_cvtf_i2d_lo,    6, ANYFPU},
-    {OP_cvtf_i2d_hi,    6, ANYFPU},
-    {OP_cvtf_q2s_ins,   6, ANYFPU},
-    {OP_cvtf_q2d,       6, ANYFPU},
-    {OP_cvtf_s2i,       6, ANYFPU},
-    {OP_cvtf_s2q,       6, ANYFPU},
-    {OP_cvtf_s2i_p,     6, ANYFPU},
-    {OP_cvtf_d2i,       6, ANYFPU},
-    {OP_cvtf_d2q,       6, ANYFPU},
-    {OP_cvtf_d2i_p,     6, ANYFPU},
-    {OP_cvtf_d2s_ins,   6, ANYFPU},
-    {OP_cvtf_d2s_p,     6, ANYFPU},
-    {OP_cvtf_s2d_lo,    6, ANYFPU},
-    {OP_cvtf_s2d_hi,    6, ANYFPU},
-  };
-
-#undef A
-#undef L
-#undef F
-
-#undef ALU0
-#undef ALU1
-#undef STU0
-#undef STU1
-#undef LDU0
-#undef LDU1
-#undef FPU0
-#undef FPU1
-#undef L
-
-#undef ANYALU
-#undef ANYLDU
-#undef ANYSTU
-#undef ANYFPU
-#undef ANYINT
-  
-  //
-  // Global limits
-  //
-  
-  const int MAX_ISSUE_WIDTH = 4;
-  
-  // Largest size of any physical register file or the store queue:
-  const int MAX_PHYS_REG_FILE_SIZE = 256;
-  const int PHYS_REG_FILE_SIZE = 128;
-  const int PHYS_REG_NULL = 0;
-  
-  //
-  // IMPORTANT! If you change this to be greater than 256, you MUST
-  // #define BIG_ROB below to use the correct associative search logic
-  // (16-bit tags vs 8-bit tags).
-  //
-  // SMT always has BIG_ROB enabled: high 4 bits are used for thread id
-  //
-#define BIG_ROB
-
-  const int ROB_SIZE = 128;
-  
-  // Maximum number of branches in the pipeline at any given time
-  const int MAX_BRANCHES_IN_FLIGHT = 16;
-
-  // Set this to combine the integer and FP phys reg files:
-  // #define UNIFIED_INT_FP_PHYS_REG_FILE
-  
-#ifdef UNIFIED_INT_FP_PHYS_REG_FILE
-  // unified, br, st
-  const int PHYS_REG_FILE_COUNT = 3;
-#else
-  // int, fp, br, st
-  const int PHYS_REG_FILE_COUNT = 4;
-#endif
-  
-  //
-  // Load and Store Queues
-  //
-  const int LDQ_SIZE = 48;
-  const int STQ_SIZE = 32;
-
-  //
-  // Fetch
-  //
-  const int FETCH_QUEUE_SIZE = 32;
-  const int FETCH_WIDTH = 4;
-
-  //
-  // Frontend (Rename and Decode)
-  //
-  const int FRONTEND_WIDTH = 4;
-  const int FRONTEND_STAGES = 5;
-
-  //
-  // Dispatch
-  //
-  const int DISPATCH_WIDTH = 4;
-
-  //
-  // Writeback
-  //
-  const int WRITEBACK_WIDTH = 4;
-
-  //
-  // Commit
-  //
-  const int COMMIT_WIDTH = 4;
-
-  //
-  // Clustering, Issue Queues and Bypass Network
-  //
-  const int MAX_FORWARDING_LATENCY = 2;
-
-#define MULTI_IQ
-
-#ifdef ENABLE_SMT
-  //
-  // Multiple issue queues are currently only supported in
-  // the non-SMT configuration, due to ambiguities in the
-  // ICOUNT SMT heuristic when multiple queues are active.
-  //
-#undef MULTI_IQ
-#endif
-
-#ifdef MULTI_IQ
-  const int MAX_CLUSTERS = 4;
-#else
-  const int MAX_CLUSTERS = 1;
-#endif
-
-  enum { PHYSREG_NONE, PHYSREG_FREE, PHYSREG_WAITING, PHYSREG_BYPASS, PHYSREG_WRITTEN, PHYSREG_ARCH, PHYSREG_PENDINGFREE, MAX_PHYSREG_STATE };
-  static const char* physreg_state_names[MAX_PHYSREG_STATE] = {"none", "free", "waiting", "bypass", "written", "arch", "pendingfree"};
-  static const char* short_physreg_state_names[MAX_PHYSREG_STATE] = {"-", "free", "wait", "byps", "wrtn", "arch", "pend"};
-
-#ifdef INSIDE_SMTCORE
-
-  struct SMTCore;
-  SMTCore& coreof(int coreid);
-
-  struct ReorderBufferEntry;
-
-  //
-  // Issue queue based scheduler with broadcast
-  //
-#ifdef BIG_ROB
-  typedef W16 issueq_tag_t;
-#else
-  typedef byte issueq_tag_t;
-#endif
-
-  template <int size, int operandcount = MAX_OPERANDS>
-  struct IssueQueue {
-#ifdef BIG_ROB
-    typedef FullyAssociativeTags16bit<size, size> assoc_t;
-    typedef vec8w vec_t;
-#else
-    typedef FullyAssociativeTags8bit<size, size> assoc_t;
-    typedef vec16b vec_t;
-#endif
-
-    typedef issueq_tag_t tag_t;
-
-    static const int SIZE = size;
-
-    assoc_t uopids;
-    assoc_t tags[operandcount];
-
-    // States:
-    //             V I
-    // free        0 0
-    // dispatched  1 0
-    // issued      1 1
-    // complete    0 1
-
-    bitvec<size> valid;
-    bitvec<size> issued;
-    bitvec<size> allready;
-    int count;
-    byte coreid;
-    int shared_entries;
-    int reserved_entries;
-
-    void set_reserved_entries(int num) { reserved_entries = num; }
-    bool reset_shared_entries() { 
-      shared_entries = size - reserved_entries; 
-      return true;
-    }
-    bool alloc_reserved_entry() {
-      assert(shared_entries > 0);
-      shared_entries--;
-      return true;
-    }
-    bool free_shared_entry() {
-      assert(shared_entries < size - reserved_entries);
-      shared_entries++;
-      return true;
-    }    
-    bool shared_empty() {
-      return (shared_entries == 0);
-    }
-
-    bool remaining() const { return (size - count); }
-    bool empty() const { return (!count); }
-    bool full() const { return (!remaining()); }
-
-    int uopof(int slot) const {
-      return uopids[slot];
-    }
-
-    int slotof(int uopid) const {
-      return uopids.search(uopid);
-    }
-
-    void reset(int coreid);
-    void reset(int coreid, int threadid);
-    void clock();
-    bool insert(tag_t uopid, const tag_t* operands, const tag_t* preready);
-    bool broadcast(tag_t uopid);
-    int issue();
-    bool replay(int slot, const tag_t* operands, const tag_t* preready);
-    bool switch_to_end(int slot, const tag_t* operands, const tag_t* preready);
-    bool remove(int slot);
-
-    ostream& print(ostream& os) const;
-    void tally_broadcast_matches(tag_t sourceid, const bitvec<size>& mask, int operand) const;
-
-    //
-    // Replay a uop that has already issued once.
-    // The caller may add or reset dependencies here as needed.
-    //
-    bool replay(int slot) {
-      issued[slot] = 0;
-      return true;
-    }
-
-    //
-    // Remove an entry from the issue queue after it has completed,
-    // or in the process of annulment.
-    //
-    bool release(int slot) {
-      remove(slot);
-      return true;
-    }
-
-    bool annul(int slot) {
-      remove(slot);
-      return true;
-    }
-
-    bool annuluop(int uopid) {
-      int slot = slotof(uopid);
-      if (slot < 0) return false;
-      remove(slot);
-      return true;
-    }
-
-    SMTCore& getcore() const { return coreof(coreid); }
-  };
-
-  template <int size, int operandcount>
-  static inline ostream& operator <<(ostream& os, const IssueQueue<size, operandcount>& issueq) {
-    return issueq.print(os);
-  }
-
-  //
-  // Iterate through a linked list of objects where each object directly inherits
-  // only from the selfqueuelink class or otherwise has a selfqueuelink object
-  // as the first member.
-  //
-  // This iterator supports mutable lists, meaning the current entry (obj) may
-  // be safely removed from the list and/or moved to some other list without
-  // affecting the next object processed.
-  //
-  // This does NOT mean you can remove any object from the list other than the
-  // current object obj - to do this, copy the list of pointers to an array and
-  // then process that instead.
-  //
-#define foreach_list_mutable_linktype(L, obj, entry, nextentry, linktype) \
-  linktype* entry; \
-  linktype* nextentry; \
-  for (entry = (L).next, nextentry = entry->next, prefetch(entry->next), obj = (typeof(obj))entry; \
-    entry != &(L); entry = nextentry, nextentry = entry->next, prefetch(nextentry), obj = (typeof(obj))entry)
-
-#define foreach_list_mutable(L, obj, entry, nextentry) foreach_list_mutable_linktype(L, obj, entry, nextentry, selfqueuelink)
-
-  struct StateList;
-
-  struct ListOfStateLists: public array<StateList*, 64> {
-    int count;
-
-    ListOfStateLists() { count = 0; }
-
-    int add(StateList* list);
-    void reset();
-  };
-
-  struct StateList: public selfqueuelink {
-    char* name;
-    int count;
-    int listid;
-    W64 dispatch_source_counter;
-    W64 issue_source_counter;
-    W32 flags;
-
-    StateList() { count = 0; listid = 0; }
-
-    void init(const char* name, ListOfStateLists& lol, W32 flags = 0);
-
-    StateList(const char* name, ListOfStateLists& lol, W32 flags = 0) {  
-      init(name, lol, flags);
-    }
-
-    // simulated asymmetric c++ array constructor:
-    StateList& operator ()(const char* name, ListOfStateLists& lol, W32 flags = 0) {
-      init(name, lol, flags);
-      return *this;
-    }
-
-    void reset();
-
-    selfqueuelink* dequeue() {
-      if (empty())
-        return null;
-      count--;
-      assert(count >=0);
-      selfqueuelink* obj = removehead(); 
-      return obj;
-    }
-
-    selfqueuelink* enqueue(selfqueuelink* entry) {
-      entry->addtail(this);
-      count++;
-      return entry;
-    }
-
-    selfqueuelink* enqueue_after(selfqueuelink* entry, selfqueuelink* preventry) {
-      if (preventry) entry->addhead(preventry); else entry->addhead(this);
-      count++;
-      return entry;
-    }
-
-    selfqueuelink* remove(selfqueuelink* entry) {
-      assert(entry->linked());
-      entry->unlink();
-      count--;
-      assert(count >=0);
-      return entry;
-    }
-
-    selfqueuelink* peek() {
-      return (empty()) ? null : head();
-    }
-
-    void checkvalid();
-  };
-
-  template <typename T> 
-  static void print_list_of_state_lists(ostream& os, const ListOfStateLists& lol, const char* title);
-
-  //
-  // Fetch Buffers
-  //
-  struct BranchPredictorUpdateInfo: public PredictorUpdate {
-    int stack_recover_idx;
-    int bptype;
-    W64 ripafter;
-  };
-
-  struct FetchBufferEntry: public TransOp {
-    RIPVirtPhys rip;
-    W64 uuid;
-    uopimpl_func_t synthop;
-    BranchPredictorUpdateInfo predinfo;
-    W16 index;
-    W8 threadid;
-    byte ld_st_truly_unaligned;
-
-    int init(int index) { this->index = index; return 0; }
-    void validate() { }
-
-    FetchBufferEntry() { }
-    
-    FetchBufferEntry(const TransOp& transop) {
-      *((TransOp*)this) = transop;
-    }
-  };
-
-  //
-  // ReorderBufferEntry
-  struct ThreadContext;
-  struct SMTCore;
-  struct PhysicalRegister;
-  struct LoadStoreQueueEntry;
-  struct SMTCoreEvent;
-  //
-  // Reorder Buffer (ROB) structure, used for tracking all uops in flight.
-  // This same structure is used to represent both dispatched but not yet issued 
-  // uops as well as issued uops.
-  //
-  struct ReorderBufferEntry: public selfqueuelink {
-    FetchBufferEntry uop;
-    struct StateList* current_state_list;
-    PhysicalRegister* physreg;
-    PhysicalRegister* operands[MAX_OPERANDS];
-    LoadStoreQueueEntry* lsq;
-    W16s idx;
-    W16s cycles_left; // execution latency counter, decremented every cycle when executing
-    W16s forward_cycle; // forwarding cycle after completion
-    W16s lfrqslot;
-    W16s iqslot;
-    W16  executable_on_cluster_mask;
-    W8s  cluster;
-    W8   coreid;
-
-    W8   threadid;
-    byte fu;
-    byte consumer_count;
-    PTEUpdate pteupdate;
-    Waddr origvirt; // original virtual address, with low bits
-    Waddr virtpage; // virtual page number actually accessed by the load or store
-    byte entry_valid:1, load_store_second_phase:1, all_consumers_off_bypass:1, dest_renamed_before_writeback:1, no_branches_between_renamings:1, transient:1, lock_acquired:1, issued:1;
-    byte tlb_walk_level;
-
-    int index() const { return idx; }
-    void validate() { entry_valid = true; }
-
-    void changestate(StateList& newqueue, bool place_at_head = false, ReorderBufferEntry* prevrob = null) {
-      if (current_state_list)
-        current_state_list->remove(this);
-      current_state_list = &newqueue;
-      if (place_at_head) newqueue.enqueue_after(this, prevrob); else newqueue.enqueue(this);
-    }
-
-    void init(int idx);
-    void reset();
-    bool ready_to_issue() const;
-    bool ready_to_commit() const;
-    StateList& get_ready_to_issue_list() const;
-    bool find_sources();
-    int forward();
-    int select_cluster();
-    int issue();
-    void* addrgen(LoadStoreQueueEntry& state, Waddr& origaddr, Waddr& virtpage, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate, Waddr& addr, int& exception, PageFaultErrorCode& pfec, bool& annul);
-    bool handle_common_load_store_exceptions(LoadStoreQueueEntry& state, Waddr& origaddr, Waddr& addr, int& exception, PageFaultErrorCode& pfec);
-    int issuestore(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, bool rcready, PTEUpdate& pteupdate);
-    int issueload(LoadStoreQueueEntry& state, Waddr& origvirt, W64 ra, W64 rb, W64 rc, PTEUpdate& pteupdate);
-    int probecache(Waddr addr, LoadStoreQueueEntry* sfra);
-    void tlbwalk();
-    int issuefence(LoadStoreQueueEntry& state);
-    void release();
-    W64 annul(bool keep_misspec_uop, bool return_first_annulled_rip = false);
-    W64 annul_after() { return annul(true); }
-    W64 annul_after_and_including() { return annul(false); }
-    int commit();
-    void replay();
-    void replay_locked();
-    int pseudocommit();
-    void redispatch(const bitvec<MAX_OPERANDS>& dependent_operands, ReorderBufferEntry* prevrob);
-    void redispatch_dependents(bool inclusive = true);
-    void loadwakeup();
-    void fencewakeup();
-    LoadStoreQueueEntry* find_nearest_memory_fence();
-    bool release_mem_lock(bool forced = false);
-    ostream& print(ostream& os) const;
-    stringbuf& get_operand_info(stringbuf& sb, int operand) const;
-    ostream& print_operand_info(ostream& os, int operand) const;
-
-    SMTCore& getcore() const { return coreof(coreid); }
-
-    ThreadContext& getthread() const;
-    issueq_tag_t get_tag();
-  };
-
-  void decode_tag(issueq_tag_t tag, int& threadid, int& idx) {
-    threadid = tag >> MAX_ROB_IDX_BIT;
-    int mask = ((1 << (MAX_ROB_IDX_BIT + MAX_THREADS_BIT)) - 1) >> MAX_THREADS_BIT;
-    idx = tag & mask;
-  }
-
-  static inline ostream& operator <<(ostream& os, const ReorderBufferEntry& rob) {
-    return rob.print(os);
-  }
-
-  //
-  // Load/Store Queue
-  //
-#define LSQ_SIZE (LDQ_SIZE + STQ_SIZE)
-
-  // Define this to allow speculative issue of loads before unresolved stores
-#define SMT_ENABLE_LOAD_HOISTING
-
-  struct LoadStoreQueueEntry: public SFR {
-    ReorderBufferEntry* rob;
-    W16 idx;
-    byte coreid;
-    W8s mbtag;
-    W8 store:1, lfence:1, sfence:1, entry_valid:1;
-    W32 padding;
-
-    LoadStoreQueueEntry() { }
-
-    int index() const { return idx; }
-
-    void reset() {
-      int oldidx = idx;
-      setzero(*this);
-      idx = oldidx;
-      mbtag = -1;
-    }
-
-    void init(int idx) {
-      this->idx = idx;
-      reset();
-    }
-
-    void validate() { entry_valid = 1; }
-  
-    ostream& print(ostream& os) const;
-
-    LoadStoreQueueEntry& operator =(const SFR& sfr) {
-      *((SFR*)this) = sfr;
-      return *this;
-    }
-
-    SMTCore& getcore() const { return coreof(coreid); }
-  };
-
-  static inline ostream& operator <<(ostream& os, const LoadStoreQueueEntry& lsq) {
-    return lsq.print(os);
-  }
-
-  struct PhysicalRegisterOperandInfo {
-    W32 uuid;
-    W16 physreg;
-    W16 rob;
-    byte state;
-    byte rfid;
-    byte archreg;
-    byte pad1;
-  };
-
-  ostream& operator <<(ostream& os, const PhysicalRegisterOperandInfo& opinfo);
-
-  //
-  // Physical Register File
-  //
- 
-  struct PhysicalRegister: public selfqueuelink {
-    ReorderBufferEntry* rob;
-    W64 data;
-    W16 flags;
-    W16 idx;
-    W8  coreid;
-    W8  rfid;
-    W8  state;
-    W8  archreg;
-    W8  all_consumers_sourced_from_bypass:1;
-    W16s refcount;
-    W8 threadid;
-
-    StateList& get_state_list(int state) const;
-    StateList& get_state_list() const { return get_state_list(this->state); }
-
-    void changestate(int newstate) {
-      if likely (state != PHYSREG_NONE) get_state_list(state).remove(this);
-      state = newstate;
-      get_state_list(state).enqueue(this);
-    }
-
-    void init(int coreid, int rfid, int idx) {
-      this->coreid = coreid;
-      this->rfid = rfid;
-      this->idx = idx;
-      reset();
-    }
-
-  private:
-    void addref() { refcount++; }
-    void unref() {
-      refcount--;
-      assert((idx == 0) || (refcount >= 0));
-    }
-
-  public:
-
-    void addref(const ReorderBufferEntry& rob, W8 threadid) { addref(); }
-    void unref(const ReorderBufferEntry& rob, W8 threadid) { unref(); }
-    void addspecref(int archreg, W8 threadid) { addref(); }
-    void unspecref(int archreg, W8 threadid) { unref(); }
-    void addcommitref(int archreg, W8 threadid) { addref(); }
-    void uncommitref(int archreg, W8 threadid) { unref();  }
-
-    bool referenced() const { return (refcount > 0); }
-    bool nonnull() const { return (index() != PHYS_REG_NULL); }
-    bool allocated() const { return (state != PHYSREG_FREE); }
-    void commit() { changestate(PHYSREG_ARCH); }
-    void complete() { changestate(PHYSREG_BYPASS); }
-    void writeback() { changestate(PHYSREG_WRITTEN); }
-
-    void free() {      
-      changestate(PHYSREG_FREE);
-      rob = 0;
-      refcount = 0;
-      threadid = 0xff;
-      all_consumers_sourced_from_bypass = 1;
-    }
-
-  private:
-    void reset() {
-      selfqueuelink::reset();
-      state = PHYSREG_NONE;
-      free();
-    }
-
-  public:
-    void reset(W8 threadid, bool check_id = true) {
-      if (check_id && this->threadid != threadid) return;
-
-      if (!check_id) {
-        selfqueuelink::reset();
-        state = PHYSREG_NONE;
-      }
-      free();
-    }
-
-    int index() const { return idx; }
-    bool valid() const { return ((flags & FLAG_INV) == 0); }
-    bool ready() const { return ((flags & FLAG_WAIT) == 0); }
-
-    void fill_operand_info(PhysicalRegisterOperandInfo& opinfo);
-
-    SMTCore& getcore() const { return coreof(coreid); }
-  };
-
-  ostream& operator <<(ostream& os, const PhysicalRegister& physreg);
-
-  struct PhysicalRegisterFile: public array<PhysicalRegister, MAX_PHYS_REG_FILE_SIZE> {
-    byte coreid;
-    byte rfid;
-    W16 size;
-    const char* name;
-    StateList states[MAX_PHYSREG_STATE];
-    W64 allocations;
-    W64 frees;
-
-    PhysicalRegisterFile() { }
-
-    PhysicalRegisterFile(const char* name, int coreid, int rfid, int size) {
-      init(name, coreid, rfid, size); reset();
-    }
-
-    PhysicalRegisterFile& operator ()(const char* name, int coreid, int rfid, int size) {
-      init(name, coreid, rfid, size); reset(); return *this;
-    }
-
-    void init(const char* name, int coreid, int rfid, int size);
-    bool remaining() const { return (!states[PHYSREG_FREE].empty()); }
-   
-    PhysicalRegister* alloc(W8 threadid, int r = -1);
-    void reset(W8 threadid);
-    ostream& print(ostream& os) const;
-
-    SMTCore& getcore() const { return coreof(coreid); }
-
-  private:
-    void reset();
-  };
-
-  static inline ostream& operator <<(ostream& os, const PhysicalRegisterFile& physregs) {
-    return physregs.print(os);
-  }
-
-  //
-  // Register Rename Table
-  //
-  struct RegisterRenameTable: public array<PhysicalRegister*, TRANSREG_COUNT> {
-#ifdef ENABLE_TRANSIENT_VALUE_TRACKING
-    bitvec<TRANSREG_COUNT> renamed_in_this_basic_block;
-#endif
-    ostream& print(ostream& os) const;
-  };
-
-  static inline ostream& operator <<(ostream& os, const RegisterRenameTable& rrt) {
-    return rrt.print(os);
-  }
-
-  enum {
-    ISSUE_COMPLETED = 1,      // issued correctly
-    ISSUE_NEEDS_REPLAY = 0,   // fast scheduling replay
-    ISSUE_MISSPECULATED = -1, // mis-speculation: redispatch dependent slice
-    ISSUE_NEEDS_REFETCH = -2, // refetch from RIP of bad insn
-  };
-
-  enum {
-    COMMIT_RESULT_NONE = 0,   // no instructions committed: some uops not ready
-    COMMIT_RESULT_OK = 1,     // committed
-    COMMIT_RESULT_EXCEPTION = 2, // exception
-    COMMIT_RESULT_BARRIER = 3,// barrier; branch to microcode (brp uop)
-    COMMIT_RESULT_SMC = 4,    // self modifying code detected
-    COMMIT_RESULT_INTERRUPT = 5, // interrupt pending
-    COMMIT_RESULT_STOP = 6    // stop processor model (shutdown)
-  };
-
-  // Branch predictor outcomes:
-  enum { MISPRED = 0, CORRECT = 1 };
-
-  //
-  // Lookup tables (LUTs):
-  //
-  struct Cluster {
-    char* name;
-    W16 issue_width;
-    W32 fu_mask;
-  };
-
-  extern const Cluster clusters[MAX_CLUSTERS];
-  extern byte uop_executable_on_cluster[OP_MAX_OPCODE];
-  extern W32 forward_at_cycle_lut[MAX_CLUSTERS][MAX_FORWARDING_LATENCY+1];
-  extern const byte archdest_can_commit[TRANSREG_COUNT];
-  extern const byte archdest_is_visible[TRANSREG_COUNT];
-
-  struct SMTMachine;
-
-  struct SMTCoreCacheCallbacks: public CacheSubsystem::PerCoreCacheCallbacks {
-    SMTCore& core;
-    SMTCoreCacheCallbacks(SMTCore& core_): core(core_) { }
-    virtual void dcache_wakeup(LoadStoreInfo lsi, W64 physaddr);
-    virtual void icache_wakeup(LoadStoreInfo lsi, W64 physaddr);
-  };
-
-  struct MemoryInterlockEntry {
-    W64 uuid;
-    W16 rob;
-    byte vcpuid;
-    W8 threadid;
-
-    void reset() { uuid = 0; rob = 0; vcpuid = 0; threadid = 0;}
- 
-    ostream& print(ostream& os, W64 physaddr) const {
-      os << "phys ", (void*)physaddr, ": vcpu ", vcpuid, ", threadid ", threadid, ", uuid ", uuid, ", rob ", rob;
-      return os;
-    }
-  };
- 
-  struct MemoryInterlockBuffer: public LockableAssociativeArray<W64, MemoryInterlockEntry, 16, 4, 8> { };
- 
-  extern MemoryInterlockBuffer interlocks;
- 
-  //
-  // Event Tracing
-  //
-  enum {
-    EVENT_INVALID = 0,
-    EVENT_FETCH_STALLED,
-    EVENT_FETCH_ICACHE_WAIT,
-    EVENT_FETCH_FETCHQ_FULL,
-    EVENT_FETCH_IQ_QUOTA_FULL,
-    EVENT_FETCH_BOGUS_RIP,
-    EVENT_FETCH_ICACHE_MISS,
-    EVENT_FETCH_SPLIT,
-    EVENT_FETCH_ASSIST,
-    EVENT_FETCH_TRANSLATE,
-    EVENT_FETCH_OK,
-    EVENT_RENAME_FETCHQ_EMPTY,
-    EVENT_RENAME_ROB_FULL,
-    EVENT_RENAME_PHYSREGS_FULL,
-    EVENT_RENAME_LDQ_FULL,
-    EVENT_RENAME_STQ_FULL,
-    EVENT_RENAME_MEMQ_FULL,
-    EVENT_RENAME_OK,
-    EVENT_FRONTEND,
-    EVENT_CLUSTER_NO_CLUSTER,
-    EVENT_CLUSTER_OK,
-    EVENT_DISPATCH_NO_CLUSTER,
-    EVENT_DISPATCH_DEADLOCK,
-    EVENT_DISPATCH_OK,
-    EVENT_ISSUE_NO_FU,
-    EVENT_ISSUE_OK,
-    EVENT_REPLAY,
-    EVENT_STORE_EXCEPTION,
-    EVENT_STORE_WAIT,
-    EVENT_STORE_PARALLEL_FORWARDING_MATCH,
-    EVENT_STORE_ALIASED_LOAD,
-    EVENT_STORE_ISSUED,
-    EVENT_STORE_LOCK_RELEASED,
-    EVENT_STORE_LOCK_ANNULLED,
-    EVENT_STORE_LOCK_REPLAY,
-    EVENT_LOAD_EXCEPTION,
-    EVENT_LOAD_WAIT,
-    EVENT_LOAD_HIGH_ANNULLED,
-    EVENT_LOAD_HIT,
-    EVENT_LOAD_MISS,
-    EVENT_LOAD_BANK_CONFLICT,
-    EVENT_LOAD_TLB_MISS,
-    EVENT_LOAD_LOCK_REPLAY,
-    EVENT_LOAD_LOCK_OVERFLOW,
-    EVENT_LOAD_LOCK_ACQUIRED,
-    EVENT_LOAD_LFRQ_FULL,
-    EVENT_LOAD_WAKEUP,
-    EVENT_TLBWALK_HIT,
-    EVENT_TLBWALK_MISS,
-    EVENT_TLBWALK_WAKEUP,
-    EVENT_TLBWALK_NO_LFRQ_MB,
-    EVENT_TLBWALK_COMPLETE,
-    EVENT_FENCE_ISSUED,
-    EVENT_ALIGNMENT_FIXUP,
-    EVENT_ANNUL_NO_FUTURE_UOPS,
-    EVENT_ANNUL_MISSPECULATION,
-    EVENT_ANNUL_EACH_ROB,
-    EVENT_ANNUL_PSEUDOCOMMIT,
-    EVENT_ANNUL_FETCHQ_RAS,
-    EVENT_ANNUL_FETCHQ,
-    EVENT_ANNUL_FLUSH,
-    EVENT_REDISPATCH_DEPENDENTS,
-    EVENT_REDISPATCH_DEPENDENTS_DONE,
-    EVENT_REDISPATCH_EACH_ROB,
-    EVENT_COMPLETE,
-    EVENT_BROADCAST,
-    EVENT_FORWARD,
-    EVENT_WRITEBACK,
-    EVENT_COMMIT_FENCE_COMPLETED,
-    EVENT_COMMIT_EXCEPTION_DETECTED,
-    EVENT_COMMIT_EXCEPTION_ACKNOWLEDGED,
-    EVENT_COMMIT_SKIPBLOCK,
-    EVENT_COMMIT_SMC_DETECTED,
-    EVENT_COMMIT_MEM_LOCKED,
-    EVENT_COMMIT_ASSIST,
-    EVENT_COMMIT_OK,
-    EVENT_RECLAIM_PHYSREG,
-    EVENT_RELEASE_MEM_LOCK,
-  };
-
-  //
-  // Event that gets written to the trace buffer
-  //
-  // In the interest of minimizing space, the cycle counters
-  // and uuids are only 32-bits; in practice wraparound is
-  // not likely to be a problem.
-  //
-  struct SMTCoreEvent {
-    W32 cycle;
-    W32 uuid;
-    RIPVirtPhysBase rip;
-    TransOpBase uop;
-    W16 rob;
-    W16 physreg;
-    W16 lsq;
-    W16 type;
-    W16s lfrqslot;
-    byte rfid;
-    byte cluster;
-    byte fu;
-    W8 threadid;
-    W32 issueq_count;
-
-    SMTCoreEvent* fill(int type) {
-      this->type = type;
-      cycle = sim_cycle;
-      uuid = 0;
-      threadid = 0xff;
-      return this;
-    }
-
-    SMTCoreEvent* fill(int type, const FetchBufferEntry& uop) {
-      fill(type);
-      uuid = uop.uuid;
-      rip = uop.rip;
-      threadid = uop.threadid;
-      this->uop = uop;
-      return this;
-    }
-
-    SMTCoreEvent* fill(int type, const RIPVirtPhys& rvp) {
-      fill(type);
-      rip = rvp;
-      return this;
-    }
-
-    SMTCoreEvent* fill(int type, const ReorderBufferEntry* rob) {
-      fill(type, rob->uop);
-      this->rob = rob->index();
-      physreg = rob->physreg->index();
-      lsq = (rob->lsq) ? rob->lsq->index() : 0;
-      rfid = rob->physreg->rfid;
-      cluster = rob->cluster;
-      fu = rob->fu;
-      lfrqslot = rob->lfrqslot;
-      return this;
-    }
-
-    SMTCoreEvent* fill_commit(int type, const ReorderBufferEntry* rob) {
-      fill(type, rob);
-      if unlikely (isstore(rob->uop.opcode)) {
-        commit.state.st = *rob->lsq;
-      } else {
-        commit.state.reg.rddata = rob->physreg->data;
-        commit.state.reg.rdflags = rob->physreg->flags;
-      }
-      // taken, predtaken only for branches
-      commit.ld_st_truly_unaligned = rob->uop.ld_st_truly_unaligned;
-      commit.pteupdate = rob->pteupdate;
-      // oldphysreg filled in later
-      // oldphysreg_refcount filled in later
-      commit.origvirt = rob->origvirt;
-      commit.total_user_insns_committed = total_user_insns_committed;
-      // target_rip filled in later
-      foreach (i, MAX_OPERANDS) commit.operand_physregs[i] = rob->operands[i]->index();
-      return this;
-    }
-
-    SMTCoreEvent* fill_load_store(int type, const ReorderBufferEntry* rob, LoadStoreQueueEntry* inherit_sfr, Waddr virtaddr) {
-      fill(type, rob);
-      loadstore.sfr = *rob->lsq;
-      loadstore.virtaddr = virtaddr;
-      loadstore.load_store_second_phase = rob->load_store_second_phase;
-      loadstore.inherit_sfr_used = (inherit_sfr != null);
-      if unlikely (inherit_sfr) {
-        loadstore.inherit_sfr = *inherit_sfr;
-        loadstore.inherit_sfr_lsq = inherit_sfr->rob->lsq->index();
-        loadstore.inherit_sfr_uuid = inherit_sfr->rob->uop.uuid;
-        loadstore.inherit_sfr_rob = inherit_sfr->rob->index();
-        loadstore.inherit_sfr_physreg = inherit_sfr->rob->physreg->index();
-        loadstore.inherit_sfr_rip = inherit_sfr->rob->uop.rip;
-      }
-      loadstore.tlb_walk_level = rob->tlb_walk_level;
-      return this;
-    }
-
-    union {
-      struct {
-        W16s missbuf;
-        W64 predrip;
-        W16 bb_uop_count;
-      } fetch;
-      struct {
-        W16  oldphys;
-        W16  oldzf;
-        W16  oldcf;
-        W16  oldof;
-        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
-      } rename;
-      struct {
-        W16 cycles_left;
-      } frontend;
-      struct {
-        W16 allowed_clusters;
-        W16 iq_avail[MAX_CLUSTERS];
-      } select_cluster;
-      struct {
-        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
-      } dispatch;
-      struct {
-        byte mispredicted:1;
-        IssueState state;
-        W16 cycles_left;
-        W64 operand_data[MAX_OPERANDS];
-        W16 operand_flags[MAX_OPERANDS];
-        W64 predrip;
-        W32 fu_avail;
-      } issue;
-      struct {
-        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
-        byte ready;
-      } replay;
-      struct {
-        W64 virtaddr; 
-        W64 data_to_store;
-        SFR sfr;
-        SFR inherit_sfr;
-        W64 inherit_sfr_uuid;        
-        W64 inherit_sfr_rip;
-        W16 inherit_sfr_lsq;
-        W16 inherit_sfr_rob;
-        W16 inherit_sfr_physreg;
-        W16 cycles_left;
-        W64 locking_uuid;
-        byte inherit_sfr_used:1, rcready:1, load_store_second_phase:1, predicted_alias:1;
-        byte locking_vcpuid;
-        W16 locking_rob;
-        W8 threadid;
-        W8 tlb_walk_level;
-      } loadstore;
-      struct {
-        W16 somidx;
-        W16 eomidx;
-        W16 startidx;
-        W16 endidx;
-        byte annulras;
-      } annul;
-      struct {
-        StateList* current_state_list;
-        W16 iqslot;
-        W16 count;
-        byte dependent_operands;
-        PhysicalRegisterOperandInfo opinfo[MAX_OPERANDS];
-      } redispatch;
-      struct {
-        W8  forward_cycle;
-        W8  operand;
-        W8  target_operands_ready;
-        W8  target_all_operands_ready;
-        W16 target_rob;
-        W16 target_physreg;
-        W8  target_rfid;
-        W8  target_cluster;
-        W64 target_uuid;
-        W16 target_lsq;
-        W8  target_st;
-      } forwarding;
-      struct {
-        W16 consumer_count;
-        W16 flags;
-        W64 data;
-        byte transient:1, all_consumers_sourced_from_bypass:1, no_branches_between_renamings:1, dest_renamed_before_writeback:1;
-      } writeback;
-      struct {
-        IssueState state;
-        byte taken:1, predtaken:1, ld_st_truly_unaligned:1;
-        PTEUpdateBase pteupdate;
-        W16s oldphysreg;
-        W16 oldphysreg_refcount;
-        W64 origvirt;
-        W64 total_user_insns_committed;
-        W64 target_rip;
-        W16 operand_physregs[MAX_OPERANDS];
-      } commit;
-    };
-
-    ostream& print(ostream& os) const;
-  };
-
-  struct EventLog {
-    SMTCoreEvent* start;
-    SMTCoreEvent* end;
-    SMTCoreEvent* tail;
-    ostream* logfile;
-
-    EventLog() { start = null; end = null; tail = null; logfile = null; }
-
-    bool init(size_t bufsize);
-    void reset();
-
-    SMTCoreEvent* add() {
-      if unlikely (tail >= end) {
-        tail = start;
-        flush();
-      }
-      SMTCoreEvent* event = tail;
-      tail++;
-      return event;
-    }
-
-    void flush(bool only_to_tail = false);
-
-    SMTCoreEvent* add(int type) {
-      return add()->fill(type);
-    }
-
-    SMTCoreEvent* add(int type, const RIPVirtPhys& rvp) {
-      return add()->fill(type, rvp);
-    }
-
-    SMTCoreEvent* add(int type, const FetchBufferEntry& uop) {
-      return add()->fill(type, uop);
-    }
-
-    SMTCoreEvent* add(int type, const ReorderBufferEntry* rob) {
-      return add()->fill(type, rob);
-    }
-
-    SMTCoreEvent* add_commit(int type, const ReorderBufferEntry* rob) {
-      return add()->fill_commit(type, rob);
-    }
-
-    SMTCoreEvent* add_load_store(int type, const ReorderBufferEntry* rob, LoadStoreQueueEntry* inherit_sfr = null, Waddr addr = 0) {
-      return add()->fill_load_store(type, rob, inherit_sfr, addr);
-    }
-
-    ostream& print(ostream& os, bool only_to_tail = false);
-  };
-
-  struct LoadStoreAliasPredictor: public FullyAssociativeTags<W64, 8> { };
-
-  enum {
-    ROB_STATE_READY = (1 << 0),
-    ROB_STATE_IN_ISSUE_QUEUE = (1 << 1),
-    ROB_STATE_PRE_READY_TO_DISPATCH = (1 << 2)
-  };
-
-#ifdef MULTI_IQ
-#define InitClusteredROBList(name, description, flags) \
-  name[0](description "-int0", rob_states, flags); \
-  name[1](description "-int1", rob_states, flags); \
-  name[2](description "-ld", rob_states, flags); \
-  name[3](description "-fp", rob_states, flags)
-#else
-#define InitClusteredROBList(name, description, flags) \
-  name[0](description "-all", rob_states, flags);
-#endif
-
-  static const int ISSUE_QUEUE_SIZE = 16;
-
-  // How many bytes of x86 code to fetch into decode buffer at once
-  static const int ICACHE_FETCH_GRANULARITY = 16;
-  // Deadlock timeout: if nothing dispatches for this many cycles, flush the pipeline
-  static const int DISPATCH_DEADLOCK_COUNTDOWN_CYCLES = 256;
-  // Size of unaligned predictor Bloom filter
-  static const int UNALIGNED_PREDICTOR_SIZE = 4096;
-
-  struct ThreadContext {
-    SMTCore& core;
-    SMTCore& getcore() const { return core; }
-
-    int threadid;
-    Context& ctx;
-    BranchPredictorInterface branchpred;
-
-    Queue<FetchBufferEntry, FETCH_QUEUE_SIZE> fetchq;
-
-    ListOfStateLists rob_states;
-    ListOfStateLists lsq_states;
-    //
-    // Each ROB's state can be linked into at most one of the
-    // following rob_xxx_list lists at any given time; the ROB's
-    // current_state_list points back to the list it belongs to.
-    //
-    StateList rob_free_list;                             // Free ROB entyry
-    StateList rob_frontend_list;                         // Frontend in progress (artificial delay)
-    StateList rob_ready_to_dispatch_list;                // Ready to dispatch
-    StateList rob_dispatched_list[MAX_CLUSTERS];         // Dispatched but waiting for operands
-    StateList rob_ready_to_issue_list[MAX_CLUSTERS];     // Ready to issue (all operands ready)
-    StateList rob_ready_to_store_list[MAX_CLUSTERS];     // Ready to store (all operands except possibly rc are ready)
-    StateList rob_ready_to_load_list[MAX_CLUSTERS];      // Ready to load (all operands ready)
-    StateList rob_issued_list[MAX_CLUSTERS];             // Issued and in progress (or for loads, returned here after address is generated)
-    StateList rob_completed_list[MAX_CLUSTERS];          // Completed and result in transit for local and global forwarding
-    StateList rob_ready_to_writeback_list[MAX_CLUSTERS]; // Completed; result ready to writeback in parallel across all cluster register files
-    StateList rob_cache_miss_list;                       // Loads only: wait for cache miss to be serviced
-    StateList rob_tlb_miss_list;                         // TLB miss waiting to be serviced on one or more levels
-    StateList rob_memory_fence_list;                     // mf uops only: wait for memory fence to reach head of LSQ before completing
-    StateList rob_ready_to_commit_queue;                 // Ready to commit
-
-    Queue<ReorderBufferEntry, ROB_SIZE> ROB;
-
-    Queue<LoadStoreQueueEntry, LSQ_SIZE> LSQ;
-    RegisterRenameTable specrrt;
-    RegisterRenameTable commitrrt;
-
-    // Fetch-related structures
-    RIPVirtPhys fetchrip;
-    BasicBlock* current_basic_block;
-    int current_basic_block_transop_index;
-    bool stall_frontend;
-    bool waiting_for_icache_fill;
-
-    // Last block in icache we fetched into our buffer
-    W64 current_icache_block;
-    W64 fetch_uuid;
-    int loads_in_flight;
-    int stores_in_flight;
-    bool prev_interrupts_pending;
-    bool handle_interrupt_at_next_eom;
-    bool stop_at_next_eom;
-
-    W64 last_commit_at_cycle;
-    bool smc_invalidate_pending;
-    RIPVirtPhys smc_invalidate_rvp;
-    W64 chk_recovery_rip;
-
-    TransOpBuffer unaligned_ldst_buf;
-    LoadStoreAliasPredictor lsap;
-    int loads_in_this_cycle;
-    W64 load_to_store_parallel_forwarding_buffer[LOAD_FU_COUNT];
-
-    W64 consecutive_commits_inside_spinlock;
-
-    // statistics:
-    W64 total_uops_committed;
-    W64 total_insns_committed;
-    int dispatch_deadlock_countdown;    
-    int issueq_count;
-
-    //
-    // List of memory locks that will be removed from
-    // the lock controller when the macro-op commits.
-    //
-    // At most 4 chunks are allowed, to ensure
-    // cmpxchg16b works even with unaligned data.
-    //
-    byte queued_mem_lock_release_count;
-    W64 queued_mem_lock_release_list[4];
-
-    ThreadContext(SMTCore& core_, int threadid_, Context& ctx_): core(core_), threadid(threadid_), ctx(ctx_) {
-      reset();
-    }
-
-    int commit();
-    int writeback(int cluster);
-    int transfer(int cluster);
-    int complete(int cluster);
-    int dispatch();
-    void frontend();
-    void rename();
-    bool fetch();
-    void tlbwalk();
-
-    bool handle_barrier();
-    bool handle_exception();
-    bool handle_interrupt();
-    void reset_fetch_unit(W64 realrip);
-    void flush_pipeline();
-    void invalidate_smc();
-    void external_to_core_state();
-    void core_to_external_state() { }
-    void annul_fetchq();
-    BasicBlock* fetch_or_translate_basic_block(const RIPVirtPhys& rvp);
-    void redispatch_deadlock_recovery();
-    void flush_mem_lock_release_list();
-    int get_priority() const;
-
-    void dump_smt_state(ostream& os);
-    void print_smt_state(ostream& os);
-    void print_rob(ostream& os);
-    void print_lsq(ostream& os);
-    void print_rename_tables(ostream& os);
-
-    void reset();
-    void init();
-  };
-
-  //
-  // checkpointed core
-  //
-  struct SMTCore {
-    SMTMachine& machine;
-    int coreid;
-    SMTCore& getcore() const { return coreof(coreid); }
-
-    int threadcount;
-    ThreadContext* threads[MAX_THREADS_PER_CORE];
-
-    ListOfStateLists rob_states;
-    ListOfStateLists lsq_states;
-
-    EventLog eventlog;
-    ListOfStateLists physreg_states;
-    // Bandwidth counters:
-    int commitcount;
-    int writecount;
-    int dispatchcount;
-
-    byte round_robin_tid;
-
-    //
-    // Issue Queues (one per cluster)
-    //
-    int reserved_iq_entries;
-#define declare_issueq_templates template struct IssueQueue<ISSUE_QUEUE_SIZE>
-#ifdef MULTI_IQ
-    IssueQueue<ISSUE_QUEUE_SIZE> issueq_int0;
-    IssueQueue<ISSUE_QUEUE_SIZE> issueq_int1;
-    IssueQueue<ISSUE_QUEUE_SIZE> issueq_ld;
-    IssueQueue<ISSUE_QUEUE_SIZE> issueq_fp;
-
-    // Instantiate any issueq sizes used above:
-
-
-#define foreach_issueq(expr) { SMTCore& core = getcore(); core.issueq_int0.expr; core.issueq_int1.expr; core.issueq_ld.expr; core.issueq_fp.expr; }
-  
-    void sched_get_all_issueq_free_slots(int* a) {
-      a[0] = issueq_int0.remaining();
-      a[1] = issueq_int1.remaining();
-      a[2] = issueq_ld.remaining();
-      a[3] = issueq_fp.remaining();
-    }
-
-#define issueq_operation_on_cluster_with_result(core, cluster, rc, expr) \
-  switch (cluster) { \
-  case 0: rc = core.issueq_int0.expr; break; \
-  case 1: rc = core.issueq_int1.expr; break; \
-  case 2: rc = core.issueq_ld.expr; break; \
-  case 3: rc = core.issueq_fp.expr; break; \
-  }
-
-#define per_cluster_stats_update(prefix, cluster, expr) \
-  switch (cluster) { \
-  case 0: prefix.int0 expr; break; \
-  case 1: prefix.int1 expr; break; \
-  case 2: prefix.ld expr; break; \
-  case 3: prefix.fp expr; break; \
-  }
-
-#else
-    IssueQueue<ISSUE_QUEUE_SIZE> issueq_all;
-#define foreach_issueq(expr) { getcore().issueq_all.expr; }
-    void sched_get_all_issueq_free_slots(int* a) {
-      a[0] = issueq_all.remaining();
-    }
-#define issueq_operation_on_cluster_with_result(core, cluster, rc, expr) rc = core.issueq_all.expr;
-#define per_cluster_stats_update(prefix, cluster, expr) prefix.all expr;
-
-#endif
-
-#define per_physregfile_stats_update(prefix, rfid, expr) \
-  switch (rfid) { \
-  case 0: prefix.integer expr; break; \
-  case 1: prefix.fp expr; break; \
-  case 2: prefix.st expr; break; \
-  case 3: prefix.br expr; break; \
-  }
-
-#define issueq_operation_on_cluster(core, cluster, expr) { int dummyrc; issueq_operation_on_cluster_with_result(core, cluster, dummyrc, expr); }
-
-#define for_each_cluster(iter) foreach (iter, MAX_CLUSTERS)
-#define for_each_operand(iter) foreach (iter, MAX_OPERANDS)
-
-    SMTCore(int coreid_, SMTMachine& machine_): coreid(coreid_), machine(machine_), cache_callbacks(*this) {
-      threadcount = 0;
-      setzero(threads);
-    }
-    
-    ~SMTCore(){};
-
-    // 
-    // Initialize structures independent of the core parameters
-    //
-    void init_generic();
-    void reset();
-
-    //
-    // Initialize all structures for the first time
-    //
-    void init() {
-      init_generic();
-      //
-      // Physical register files
-      //
-      physregfiles[0]("int", coreid, 0, PHYS_REG_FILE_SIZE);
-      physregfiles[1]("fp", coreid, 1, PHYS_REG_FILE_SIZE);
-      physregfiles[2]("st", coreid, 2, STQ_SIZE * MAX_THREADS_PER_CORE);
-      physregfiles[3]("br", coreid, 3, MAX_BRANCHES_IN_FLIGHT * MAX_THREADS_PER_CORE);
-    }
-
-    //
-    // Physical Registers
-    //
-
-    enum { PHYS_REG_FILE_INT, PHYS_REG_FILE_FP, PHYS_REG_FILE_ST, PHYS_REG_FILE_BR };
-
-    enum {  
-      PHYS_REG_FILE_MASK_INT = (1 << 0),
-      PHYS_REG_FILE_MASK_FP  = (1 << 1),
-      PHYS_REG_FILE_MASK_ST  = (1 << 2),
-      PHYS_REG_FILE_MASK_BR  = (1 << 3)
-    };
-
-    // Major core structures
-    PhysicalRegisterFile physregfiles[PHYS_REG_FILE_COUNT];
-    int round_robin_reg_file_offset;
-    W32 fu_avail;
-    ReorderBufferEntry* robs_on_fu[FU_COUNT];
-    CacheSubsystem::CacheHierarchy caches;
-    SMTCoreCacheCallbacks cache_callbacks;
-
-    // Unaligned load/store predictor
-    bitvec<UNALIGNED_PREDICTOR_SIZE> unaligned_predictor;
-    static int hash_unaligned_predictor_slot(const RIPVirtPhysBase& rvp);
-    bool get_unaligned_hint(const RIPVirtPhysBase& rvp) const;
-    void set_unaligned_hint(const RIPVirtPhysBase& rvp, bool value);
-
-    // Pipeline Stages
-    bool runcycle();
-    void flush_pipeline_all();
-    bool fetch();
-    void rename();
-    void frontend();
-    int dispatch();
-    int issue(int cluster);
-    int complete(int cluster);
-    int transfer(int cluster);
-    int writeback(int cluster);
-    int commit();
-
-    // Callbacks
-    void flush_tlb(Context& ctx, int threadid, bool selective = false, Waddr virtaddr = 0);
-
-    // Debugging
-    void dump_smt_state(ostream& os);
-    void print_smt_state(ostream& os);
-    void check_refcounts();
-    void check_rob();
-  };
-
-#define MAX_SMT_CORES 1
-
-  struct SMTMachine: public PTLsimMachine {
-    SMTCore* cores[MAX_SMT_CORES];
-    bitvec<MAX_CONTEXTS> stopped;
-    SMTMachine(const char* name);
-    virtual bool init(PTLsimConfig& config);
-    virtual int run(PTLsimConfig& config);
-    virtual void dump_state(ostream& os);
-    virtual void update_stats(PTLsimStats& stats);
-    virtual void flush_tlb(Context& ctx);
-    virtual void flush_tlb_virt(Context& ctx, Waddr virtaddr);
-    void flush_all_pipelines();
-  };
-
-  extern CycleTimer cttotal;
-  extern CycleTimer ctfetch;
-  extern CycleTimer ctdecode;
-  extern CycleTimer ctrename;
-  extern CycleTimer ctfrontend;
-  extern CycleTimer ctdispatch;
-  extern CycleTimer ctissue;
-  extern CycleTimer ctissueload;
-  extern CycleTimer ctissuestore;
-  extern CycleTimer ctcomplete;
-  extern CycleTimer cttransfer;
-  extern CycleTimer ctwriteback;
-  extern CycleTimer ctcommit;
-
-#ifdef DECLARE_STRUCTURES
-  //
-  // The following configuration has two integer/store clusters with a single cycle
-  // latency between them, but both clusters can access the load pseudo-cluster with
-  // no extra cycle. The floating point cluster is two cycles from everything else.
-  //
-#ifdef MULTI_IQ
-  const Cluster clusters[MAX_CLUSTERS] = {
-    {"int0",  2, (FU_ALU0|FU_STU0)},
-    {"int1",  2, (FU_ALU1|FU_STU1)},
-    {"ld",    2, (FU_LDU0|FU_LDU1)},
-    {"fp",    2, (FU_FPU0|FU_FPU1)},
-  };
-
-  const byte intercluster_latency_map[MAX_CLUSTERS][MAX_CLUSTERS] = {
-    // I0 I1 LD FP <-to
-    {0, 1, 0, 2}, // from I0
-    {1, 0, 0, 2}, // from I1
-    {0, 0, 0, 2}, // from LD
-    {2, 2, 2, 0}, // from FP
-  };
-
-  const byte intercluster_bandwidth_map[MAX_CLUSTERS][MAX_CLUSTERS] = {
-    // I0 I1 LD FP <-to
-    {2, 2, 1, 1}, // from I0
-    {2, 2, 1, 1}, // from I1
-    {1, 1, 2, 2}, // from LD
-    {1, 1, 1, 2}, // from FP
-  };
-
-#else // single issueq
-  const Cluster clusters[MAX_CLUSTERS] = {
-    {"all",  4, (FU_ALU0|FU_ALU1|FU_STU0|FU_STU1|FU_LDU0|FU_LDU1|FU_FPU0|FU_FPU1)},
-   };
-  const byte intercluster_latency_map[MAX_CLUSTERS][MAX_CLUSTERS] = {{0}};
-  const byte intercluster_bandwidth_map[MAX_CLUSTERS][MAX_CLUSTERS] = {{64}};
-#endif // multi_issueq
-
-#endif // DECLARE_STRUCTURES
-
-#endif // INSIDE_SMTCORE
-
-  //
-  // This part is used when parsing stats.h to build the
-  // data store template; these must be in sync with the
-  // corresponding definitions elsewhere.
-  //
-#ifdef MULTI_IQ
-  static const char* cluster_names[MAX_CLUSTERS] = {"int0", "int1", "ld", "fp"};
-#else
-  static const char* cluster_names[MAX_CLUSTERS] = {"all"};
-#endif
-
-  static const char* phys_reg_file_names[PHYS_REG_FILE_COUNT] = {"int", "fp", "st", "br"};
-};
-
-struct PerContextSMTStats { // rootnode:
-  struct fetch {
-    struct stop { // node: summable
-      W64 stalled;
-      W64 icache_miss;
-      W64 fetchq_full;
-      W64 issueq_quota_full;
-      W64 bogus_rip;
-      W64 microcode_assist;
-      W64 branch_taken;
-      W64 full_width;
-    } stop;
-    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
-    W64 width[SMTModel::FETCH_WIDTH+1]; // histo: 0, SMTModel::FETCH_WIDTH, 1
-    W64 blocks;
-    W64 uops;
-    W64 user_insns;
-  } fetch;
-
-  struct frontend {
-    struct status { // node: summable
-      W64 complete;
-      W64 fetchq_empty;
-      W64 rob_full;
-      W64 physregs_full;
-      W64 ldq_full;
-      W64 stq_full;
-    } status;
-    W64 width[SMTModel::FRONTEND_WIDTH+1]; // histo: 0, SMTModel::FRONTEND_WIDTH, 1
-    struct renamed {
-      W64 none;
-      W64 reg;
-      W64 flags;
-      W64 reg_and_flags;
-    } renamed;
-    struct alloc {
-      W64 reg;
-      W64 ldreg;
-      W64 sfr;
-      W64 br;
-    } alloc;
-    // NOTE: This is capped at 255 consumers to keep the size reasonable:
-    W64 consumer_count[256]; // histo: 0, 255, 1
-  } frontend;
-
-  struct dispatch {
-    W64 cluster[SMTModel::MAX_CLUSTERS]; // label: SMTModel::cluster_names
-    struct redispatch {
-      W64 trigger_uops;
-      W64 deadlock_flushes;
-      W64 deadlock_uops_flushed;
-      W64 dependent_uops[SMTModel::ROB_SIZE+1]; // histo: 0, SMTModel::ROB_SIZE, 1
-    } redispatch;
-  } dispatch;
-
-  struct issue {
-    W64 uops;
-    double uipc;
-    struct result { // node: summable
-      W64 no_fu;
-      W64 replay;
-      W64 misspeculated;
-      W64 refetch;
-      W64 branch_mispredict;
-      W64 exception;
-      W64 complete;
-    } result;
-    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
-  } issue;
-
-  struct writeback {
-    W64 writebacks[SMTModel::PHYS_REG_FILE_COUNT]; // label: SMTModel::phys_reg_file_names
-  } writeback;
-
-  struct commit {
-    W64 uops;
-    W64 insns;
-    double uipc;
-    double ipc;
-
-    struct result { // node: summable
-      W64 none;
-      W64 ok;
-      W64 exception;
-      W64 skipblock;
-      W64 barrier;
-      W64 smc;
-      W64 memlocked;
-      W64 stop;
-    } result;
-
-    struct setflags { // node: summable
-      W64 yes;
-      W64 no;
-    } setflags;
-
-    W64 opclass[OPCLASS_COUNT]; // label: opclass_names
-  } commit;
-
-  struct branchpred {
-    W64 predictions;
-    W64 updates;
-
-    // These counters are [0] = mispred, [1] = correct
-    W64 cond[2]; // label: branchpred_outcome_names
-    W64 indir[2]; // label: branchpred_outcome_names
-    W64 ret[2]; // label: branchpred_outcome_names
-    W64 summary[2]; // label: branchpred_outcome_names
-    struct ras { // node: summable
-      W64 pushes;
-      W64 overflows;
-      W64 pops;
-      W64 underflows;
-      W64 annuls;
-    } ras;
-  } branchpred;
-
-  struct dcache {
-    struct load {
-      struct issue { // node: summable
-        W64 complete;
-        W64 miss;
-        W64 exception;
-        W64 ordering;
-        W64 unaligned;
-        struct replay { // node: summable
-          W64 sfr_addr_and_data_not_ready;
-          W64 sfr_addr_not_ready;
-          W64 sfr_data_not_ready;
-          W64 missbuf_full;
-          W64 interlocked;
-          W64 interlock_overflow;
-          W64 fence;
-          W64 bank_conflict;
-        } replay;
-      } issue;
-
-      struct forward { // node: summable
-        W64 cache;
-        W64 sfr;
-        W64 sfr_and_cache;
-      } forward;
-        
-      struct dependency { // node: summable
-        W64 independent;
-        W64 predicted_alias_unresolved;
-        W64 stq_address_match;
-        W64 stq_address_not_ready;
-        W64 fence;
-      } dependency;
-        
-      struct type { // node: summable
-        W64 aligned;
-        W64 unaligned;
-        W64 internal;
-      } type;
-        
-      W64 size[4]; // label: sizeshift_names
-
-      W64 datatype[DATATYPE_COUNT]; // label: datatype_names
-    } load;
-
-    struct store {
-      struct issue { // node: summable
-        W64 complete;
-        W64 exception;
-        W64 ordering;
-        W64 unaligned;
-        struct replay { // node: summable
-          W64 sfr_addr_and_data_not_ready;
-          W64 sfr_addr_not_ready;
-          W64 sfr_data_not_ready;
-          W64 sfr_addr_and_data_and_data_to_store_not_ready;
-          W64 sfr_addr_and_data_to_store_not_ready;
-          W64 sfr_data_and_data_to_store_not_ready;
-          W64 interlocked;
-          W64 fence;
-          W64 parallel_aliasing;
-          W64 bank_conflict;
-        } replay;
-      } issue;
-
-      struct forward { // node: summable
-        W64 zero;
-        W64 sfr;
-      } forward;
-        
-      struct type { // node: summable
-        W64 aligned;
-        W64 unaligned;
-        W64 internal;
-      } type;
-        
-      W64 size[4]; // label: sizeshift_names
-
-      W64 datatype[DATATYPE_COUNT]; // label: datatype_names
-    } store;
-
-    struct fence { // node: summable
-      W64 lfence;
-      W64 sfence;
-      W64 mfence;
-    } fence;
-  } dcache;
-};
-
-//
-// SMT Core
-//
-struct SMTCoreStats { // rootnode:
-  W64 cycles;
-
-  struct dispatch {
-    struct source { // node: summable
-      W64 integer[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
-      W64 fp[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
-      W64 st[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
-      W64 br[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
-    } source;
-    W64 width[SMTModel::DISPATCH_WIDTH+1]; // histo: 0, SMTModel::DISPATCH_WIDTH, 1
-  } dispatch;
-
-  struct issue {
-    struct source { // node: summable
-      W64 integer[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
-      W64 fp[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
-      W64 st[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
-      W64 br[SMTModel::MAX_PHYSREG_STATE]; // label: SMTModel::physreg_state_names
-    } source;
-    struct width {
-#ifdef MULTI_IQ
-      W64 int0[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
-      W64 int1[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
-      W64 ld[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
-      W64 fp[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
-#else
-      W64 all[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
-#endif
-    } width;
-  } issue;
-
-  struct writeback {
-    struct width {
-#ifdef MULTI_IQ
-      W64 int0[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
-      W64 int1[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
-      W64 ld[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
-      W64 fp[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
-#else
-      W64 all[SMTModel::MAX_ISSUE_WIDTH+1]; // histo: 0, SMTModel::MAX_ISSUE_WIDTH, 1
-#endif
-    } width;
-  } writeback;
-
-  struct commit {
-    struct freereg { // node: summable
-      W64 pending;
-      W64 free;
-    } freereg;
-
-    W64 free_regs_recycled;
-
-    W64 width[SMTModel::COMMIT_WIDTH+1]; // histo: 0, SMTModel::COMMIT_WIDTH, 1
-  } commit;
-
-  PerContextSMTStats total;
-  PerContextSMTStats vcpu0;
-  PerContextSMTStats vcpu1;
-  PerContextSMTStats vcpu2;
-  PerContextSMTStats vcpu3;
-
-  struct simulator {
-    double total_time;
-    struct cputime { // node: summable
-      double fetch;
-      double decode;
-      double rename;
-      double frontend;
-      double dispatch;
-      double issue;
-      double issueload;
-      double issuestore;
-      double complete;
-      double transfer;
-      double writeback;
-      double commit;
-    } cputime;
-  } simulator;
-};
-
 #endif // _SMTCORE_H_
diff -r 10448c053ad6 smtexec.cpp
--- a/smtexec.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/smtexec.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -5,6 +5,8 @@
 //
 // Copyright 2003-2006 Matt T. Yourst <yourst@yourst.com>
 // Copyright 2006 Hui Zeng <hzeng@cs.binghamton.edu>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -14,6 +16,7 @@
 #include <datastore.h>
 #include <logic.h>
 #include <dcache.h>
+#include <random_inject.h>
 
 #define INSIDE_SMTCORE
 #include <smtcore.h>
@@ -318,6 +321,7 @@
   bool ld = isload(uop.opcode);
   bool st = isstore(uop.opcode);
   bool br = isbranch(uop.opcode);
+  bool pf = isprefetch(uop.opcode);
 
   assert(operands[RA]->ready());
   assert(rb.ready());
@@ -379,6 +383,13 @@
         per_context_smtcore_stats_update(threadid, issue.result.replay++);
         return 0;
       }
+    } else if unlikely (pf) {
+      issueprefetch(state, radata, rbdata, rcdata, uop.cachelevel, pteupdate);
+    } else if unlikely ((uop.opcode == OP_acq) |(uop.opcode == OP_com)) {
+      if unlikely (issueasf(state, rbdata) == ISSUE_NEEDS_REPLAY) {
+        per_context_smtcore_stats_update(threadid, issue.result.replay++);
+        return 0;
+      }
     } else {
       if unlikely (br) {
         state.brreg.riptaken = uop.riptaken;
@@ -386,6 +397,13 @@
       }
       uop.synthop(state, radata, rbdata, rcdata, ra.flags, rb.flags, rc.flags); 
     }
+  }
+
+  /* Injection of exceptions during the ASF critical section */
+  if unlikely (thread.asf_in_crit_sec && asf_exception_critsec()) {
+    state.st.invalid = 1;
+    state.reg.rdflags = FLAG_INV;
+    state.reg.rddata = EXCEPTION_ASF_Testing;
   }
 
   physreg->flags = state.reg.rdflags;
@@ -1062,12 +1080,42 @@
   int aligntype = uop.cond;
   bool signext = (uop.opcode == OP_ldx);
 
+  /* SD: There are quite a few addresses in this function:
+     state.physaddr - physical address aligned to 8 bytes, shifted right by 3
+     physaddr       - physical address, not shifted
+     addr           - is the effective virtual address, adjusted for unaligned
+                      loads of halfs
+     origaddr       - effective virtual address, not adjusted for unaligned
+                      accesses -> value in ROBentry
+     mapped         - physical address, mapped to PTLsim's address space in
+                      order to access the data from within the simulator
+     virtpage       - same as addr, but stored inside the ROBentry */
   Waddr addr;
   int exception = 0;
   PageFaultErrorCode pfec;
   bool annul;
 
   void* mapped = addrgen(state, origaddr, virtpage, ra, rb, rc, pteupdate, addr, exception, pfec, annul);
+
+  if unlikely (uop.is_asf)
+    if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,"@",sim_cycle,": Issueing ", *this, "@", uop.rip, endl;
+
+  if unlikely (uop.is_asf && asf_load_exception()) exception = EXCEPTION_ASF_Testing;
+
+  /* LOCKed loads are illegal inside an ASF-critical-section, but they might
+     belong to a new ASF CS -> check for an earlier ASF-rel */
+  if unlikely (uop.is_asf && thread.asf_in_crit_sec) {
+    foreach_backward_before(thread.ROB, this, i) {
+      ReorderBufferEntry&  rob = thread.ROB[i];
+      /* first thing found is an ACQUIRE.. Ouch! */
+      if unlikely (rob.uop.opcode == OP_acq) {
+        logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__," Locked LD inside ASF Cs :-/ ", rob, "! THIS IS BAD!",endl;
+        return ISSUE_COMPLETED;
+      } else if (rob.uop.opcode == OP_com) {
+        break;
+      }
+    }
+  }
 
   if unlikely (exception) {
     return (handle_common_load_store_exceptions(state, origaddr, addr, exception, pfec)) ? ISSUE_COMPLETED : ISSUE_MISSPECULATED;
@@ -1082,6 +1130,21 @@
   state.physaddr = physaddr >> 3;
 
   //
+  // If the data was speculatively modified within an ASF-CS running on another core,
+  // abort those CS and get the data for simulating directly from that remote LLB. In
+  // real systems, the original data would arrive with the probe response from the other
+  // core (we don't have the line, as it is modified in the other core's cache),
+  // perhaps forwarded by that core's LLB in order to avoid waiting for the full roll-back
+  // of the critical section.
+  //
+  W64* orig_line = (W64*)thread.locked_line_buffer.probe_other_LLBs(physaddr, uop.invalidating);
+  // see comments in smtpipe,1913, SMTCore::commit, call of commitstore...
+  if unlikely (orig_line && uop.invalidating)  {
+    memcpy(phys_to_mapped_virt(floor(physaddr, LLB_LINE_SIZE)), orig_line, LLB_LINE_SIZE);
+    thread.locked_line_buffer.mark_clean_others(physaddr);
+  }
+
+  //
   // For simulation purposes only, load the data immediately
   // so it is easier to track. In the hardware this obviously
   // only arrives later, but it saves us from having to copy
@@ -1089,6 +1152,7 @@
   //
   barrier();
   W64 data = (annul) ? 0 : *((W64*)(Waddr)floor(signext64((Waddr)mapped, 48), 8));
+  if unlikely (orig_line) data = (annul) ? 0 : orig_line[mask(physaddr, LLB_LINE_SIZE) >> 3];
 
   LoadStoreQueueEntry* sfra = null;
 
@@ -1128,6 +1192,8 @@
     } else {
       // Address is unknown: is it a memory fence that hasn't committed?
       if unlikely (stbuf.lfence) {
+        // SD: Special asf-lfences separate locked-lds from the next ASF CS
+        if (stbuf.rob->uop.is_asf && !this->uop.is_asf) continue;
         per_context_smtcore_stats_update(threadid, dcache.load.dependency.fence++);
         sfra = &stbuf;
         break;
@@ -1202,6 +1268,28 @@
     return ISSUE_NEEDS_REPLAY;
   }
 
+  if unlikely (uop.is_asf) {
+    /* Add the address to the LLB */
+    if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Hi we: ", *this, "@", uop.rip, " add to the LLB!", endl;
+    llbline = thread.locked_line_buffer.add_location(state.physaddr << 3);
+
+    if unlikely (!llbline) {
+      /* Full LLB, this is either a programming error or a result of mis-speculation. */
+      if unlikely (this == &thread.ROB[thread.ROB.head]) {
+        /* This uop is at the head of the ROB, so no (mis-speculated) locked-loads
+           can be before it. Hence this load is _really_ exceeding the LLB's
+           capacity and is a programmer's error! -> Let it proceed! */
+        if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": We deliberatly exceed the LLB's capacity!", endl;
+      } else {
+        /* Retry until some uops blocking the LLB are annulled. */
+        replay();
+        load_store_second_phase = 1;
+        if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Head-ROB: ", thread.ROB[thread.ROB.head], endl;
+        return ISSUE_NEEDS_REPLAY;
+      }
+    }
+  }
+
 #ifdef ENFORCE_L1_DCACHE_BANK_CONFLICTS
   foreach (i, thread.loads_in_this_cycle) {
     W64 prevaddr = thread.load_to_store_parallel_forwarding_buffer[i];
@@ -1212,7 +1300,17 @@
     // allowed since the chunk has been loaded anyway, so we might
     // as well use it.
     //
-    if unlikely ((prevaddr != state.physaddr) && (lowbits(prevaddr, log2(CacheSubsystem::L1_DCACHE_BANKS)) == lowbits(state.physaddr, log2(CacheSubsystem::L1_DCACHE_BANKS)))) {
+    // S.D.: More flexible bank layout: banks may span cachelines now.
+    // S.D.: Barcelona has 128 bits cache width (fitting with the banksize ;))
+    //       so we will not conflict on the same 16 byte chunk! Simply assume
+    //       that such a chunksize is equal to the BANK_SIZE, which is actually
+    //       sane!
+    int bank_idx_start = log2(CacheSubsystem::L1_DCACHE_BANKSIZE) - 3;
+    int bank_idx_bits  = log2(CacheSubsystem::L1_DCACHE_BANKS);
+
+    if unlikely (((prevaddr >> bank_idx_start) != (state.physaddr >> bank_idx_start)) && (
+                  bits(prevaddr,       bank_idx_start, bank_idx_bits) == 
+                  bits(state.physaddr, bank_idx_start, bank_idx_bits))) { 
       if unlikely (config.event_log_enabled) core.eventlog.add_load_store(EVENT_LOAD_BANK_CONFLICT, this, null, addr);
       per_context_smtcore_stats_update(threadid, dcache.load.issue.replay.bank_conflict++);
 
@@ -1271,8 +1369,10 @@
       }
  
       // Double-locking within a thread is NOT allowed!
-      assert(lock->vcpuid != thread.ctx.vcpuid);
-      assert(lock->threadid != threadid);
+      // S.D. This assertion is checked already in if's condition 
+      //assert(lock->vcpuid != thread.ctx.vcpuid);
+      // S.D. This assertion breaks with threadids being local to cores
+      //assert(lock->threadid != threadid);
 
       per_context_smtcore_stats_update(threadid, dcache.load.issue.replay.interlocked++);
       replay_locked();
@@ -1280,11 +1380,11 @@
     }
 
     // Issuing more than one ld.acq on the same block is not allowed:
-    if (lock) {
+    if (lock) { //&& (lock->vcpuid == thread.ctx.vcpuid)
       logfile << "ERROR: thread ", thread.ctx.vcpuid, " uuid ", uop.uuid, " over physaddr ", (void*)physaddr, ": lock was already acquired by vcpuid ", lock->vcpuid, " uuid ", lock->uuid, " rob ", lock->rob, endl;
       assert(false);
     }
- 
+    // S.D. Location NOT locked
     if unlikely (uop.locked) {
       //
       // Attempt to acquire an exclusive lock on the block via ld.acq,
@@ -1319,7 +1419,7 @@
         replay();
         return ISSUE_NEEDS_REPLAY;
       }
- 
+
       lock->vcpuid = thread.ctx.vcpuid;
       lock->uuid = uop.uuid;
       lock->rob = index();
@@ -1401,28 +1501,59 @@
   assert(thread.loads_in_this_cycle < LOAD_FU_COUNT);
   thread.load_to_store_parallel_forwarding_buffer[thread.loads_in_this_cycle++] = state.physaddr;
 
+  // Internal loads don't hit the cache hierarchy, but rather complete in two cycles.
+  if unlikely (uop.internal) {
+    cycles_left = LOADLAT;
+
+    if unlikely (config.event_log_enabled) core.eventlog.add_load_store(EVENT_LOAD_HIT, this, sfra, addr);
+
+    load_store_second_phase = 1;
+    state.datavalid = 1;
+    // SD: Make the destination available at the complete time, when the load
+    //     actually produces the data, not now, at the beginning of issue...
+    // physreg->flags &= ~FLAG_WAIT;
+    // physreg->complete();
+    changestate(thread.rob_issued_list[cluster]);
+    lfrqslot = -1;
+    forward_cycle = 0;
+
+    return ISSUE_COMPLETED;
+  }
 #ifdef USE_TLB
   if unlikely (!core.caches.dtlb.probe(addr, threadid)) {
     //
     // TLB miss: 
     //
     if unlikely (config.event_log_enabled) event = core.eventlog.add_load_store(EVENT_LOAD_TLB_MISS, this, sfra, addr);
-    cycles_left = 0;
-    tlb_walk_level = thread.ctx.page_table_level_count();
+
+#ifdef USE_L2_TLB
+    // S.D.: A quick hack just sets the walklevel to zero, but makes the rob
+    // stay in the tlbmiss-state for the L2-DTLB latency.
+    if likely (core.caches.l2dtlb.probe(addr, threadid)) {
+      cycles_left = CacheSubsystem::L2_DTLB_LATENCY;
+      tlb_walk_level = 0;
+      per_context_dcache_stats_update(threadid, load.dtlb.l2hits++);
+    } else 
+#endif /* USE_L2_TLB */
+    {
+      cycles_left = 0;
+      tlb_walk_level = thread.ctx.page_table_level_count();
+      per_context_dcache_stats_update(threadid, load.dtlb.misses++);
+    }
     changestate(thread.rob_tlb_miss_list);
-    per_context_dcache_stats_update(threadid, load.dtlb.misses++);
-    
     return ISSUE_COMPLETED;
   }
 
-  per_context_dcache_stats_update(threadid, load.dtlb.hits++);
-#endif
+  per_context_dcache_stats_update(threadid, load.dtlb.l1hits++);
+#endif /* USE_TLB */
 
-  return probecache(physaddr, sfra);
+  return probecache(addr, sfra);
 }
 
 //
 // Probe the cache and initiate a miss if required
+// Parameters: addr - effective virtual address, adjusted for unaligned loads!
+//             sfra - LSQ-entry of aliasing load
 //
 int ReorderBufferEntry::probecache(Waddr addr, LoadStoreQueueEntry* sfra) {
   SMTCore& core = getcore();
@@ -1435,26 +1566,37 @@
   LoadStoreQueueEntry& state = *lsq;
   W64 physaddr = state.physaddr << 3;
 
-  bool L1hit = (config.perfect_cache) ? 1 : core.caches.probe_cache_and_sfr(physaddr, sfra, sizeshift);
+  bool L1hit = (config.perfect_cache) ? 1 : core.caches.probe_cache_and_sfr(physaddr, addr, sfra, sizeshift);
 
-  if likely (L1hit) {    
+  if likely (L1hit) {
     cycles_left = LOADLAT;
 
     if unlikely (config.event_log_enabled) core.eventlog.add_load_store(EVENT_LOAD_HIT, this, sfra, addr);
     
     load_store_second_phase = 1;
     state.datavalid = 1;
-    physreg->flags &= ~FLAG_WAIT;
-    physreg->complete();
+    // SD: Make the destination available at the complete time, when the load
+    //     actually produces the data, not now, at the beginning of issue...
+    // physreg->flags &= ~FLAG_WAIT;
+    // physreg->complete();
     changestate(thread.rob_issued_list[cluster]);
     lfrqslot = -1;
     forward_cycle = 0;
+
+    // If we have an ASF invalidating probe, this will be decoded as a load, but will also invalidate the line in all other caches!
+    if unlikely (uop.invalidating) core.caches.invalidate_other_caches(physaddr, addr);
 
     per_context_smtcore_stats_update(threadid, dcache.load.issue.complete++);
     per_context_dcache_stats_update(threadid, load.hit.L1++);
     return ISSUE_COMPLETED;
   }
-
+  /* in case we miss, no additional probe has to be issued (eg waiting for a PROBE_ACK, in case we received a PROBE_WAIT above, as
+     the rollback can't take longer than the cache miss: the core could either forward the original value from its LLB, when serving
+     the cache miss (that'd be MOESI) or write back the value from the LLB / ignore the changes made in L1 when using MESI!)
+     Hence, we can be sure that the data we're reading was valid at some point in time, i.e. we can find a linearisation point for
+     this read. If this read is actually part of an ASF spec. phase, we don't have to worry either, as the address is already in our
+     LLB, and in case the other guy restarts his transaction after his rollback, our transaction will be aborted anyways! */
+  
   per_context_smtcore_stats_update(threadid, dcache.load.issue.miss++);
 
   cycles_left = 0;
@@ -1471,8 +1613,11 @@
 
   SFR dummysfr;
   setzero(dummysfr);
-  lfrqslot = core.caches.issueload_slowpath(physaddr, dummysfr, lsi);
+  lfrqslot = core.caches.issueload_slowpath(physaddr, addr, dummysfr, lsi);
   assert(lfrqslot >= 0);
+
+  // If we have an ASF invalidating probe, this will be decoded as a load, but will also invalidate the line in all other caches!
+  if unlikely (uop.invalidating) core.caches.invalidate_other_caches(physaddr,addr);
 
   if unlikely (config.event_log_enabled) event = core.eventlog.add_load_store(EVENT_LOAD_MISS, this, sfra, addr);
 
@@ -1489,6 +1634,18 @@
   SMTCoreEvent* event;
   W64 virtaddr = virtpage;
 
+#ifdef USE_L2_TLB
+  //
+  // Hits in the L2 TLB are tlb-walks with an initial level of 0 and cycles_left
+  // set to the latency of the L2. Hence we check if there are any cycles left and
+  // just decrement them.
+  //
+  if likely(cycles_left) {
+    cycles_left--;
+    return;
+  }
+#endif
+
   if unlikely (!tlb_walk_level) {
     // End of walk sequence: try to probe cache
     if unlikely (core.caches.lfrq_or_missbuf_full()) {
@@ -1504,7 +1661,25 @@
 
     if unlikely (config.event_log_enabled) event = core.eventlog.add_load_store(EVENT_TLBWALK_COMPLETE, this, null, virtaddr);
     core.caches.dtlb.insert(virtaddr, threadid);
-    probecache(virtaddr, null);
+#ifdef USE_L2_TLB
+    core.caches.l2dtlb.insert(virtaddr, threadid);
+#endif
+    if unlikely (isprefetch(uop.opcode)) {
+      // SD: Make the destination available at the complete time, not so much
+      //     an issue for prefetches, but for sake of consistency...
+      // physreg->flags &= ~FLAG_WAIT;
+      // physreg->complete();
+      changestate(thread.rob_issued_list[cluster]);
+      forward_cycle = 0;
+      int exception;
+      PageFaultErrorCode pfec;
+      PTEUpdate pteupdate;
+      Context& ctx = getthread().ctx;
+      Waddr physaddr = (Waddr)ctx.check_and_translate(virtaddr, 1, 0, 0, exception, pfec, pteupdate);
+      core.caches.initiate_prefetch(physaddr, virtaddr, uop.cachelevel);
+    } else {
+      probecache(virtaddr, null);
+    }
     return;
   }
 
@@ -1522,8 +1697,8 @@
     return;
   }
 
-  cycles_left = 0;
-  changestate(thread.rob_cache_miss_list);
+  //cycles_left = 0;
+  //changestate(thread.rob_cache_miss_list);
 
   LoadStoreInfo lsi = 0;
   lsi.threadid = thread.threadid;
@@ -1533,11 +1708,15 @@
   setzero(dummysfr);
   lfrqslot = core.caches.issueload_slowpath(pteaddr, dummysfr, lsi);
   // No LFRQ or MB slots? Try again on next cycle
+  // TODO: For prefetches, we might want to drop the tlb miss!
   if (lfrqslot < 0) {
     if unlikely (config.event_log_enabled) event = core.eventlog.add_load_store(EVENT_TLBWALK_NO_LFRQ_MB, this, null, pteaddr);
     per_context_dcache_stats_update(threadid, load.tlbwalk.no_lfrq_mb++);
     return;
   }
+  // S.D.: Moved here in order to allow re-tlbwalk-ing, when there is no lfrqslot available!
+  cycles_left = 0;
+  changestate(thread.rob_cache_miss_list);
 
   if unlikely (config.event_log_enabled) event = core.eventlog.add_load_store(EVENT_TLBWALK_MISS, this, null, pteaddr);
   per_context_dcache_stats_update(threadid, load.tlbwalk.L1_dcache_miss++);
@@ -1627,6 +1806,90 @@
   changestate(thread.rob_memory_fence_list);
 
   return ISSUE_COMPLETED;
+}
+
+//
+// Issues a prefetch on the given memory address into the specified cache level.
+//
+void ReorderBufferEntry::issueprefetch(IssueState& state, W64 ra, W64 rb, W64 rc, int cachelevel, PTEUpdate& pteupdate) {
+  ThreadContext& thread = getthread();
+  SMTCore&       core = getcore();
+  SMTCoreEvent*  event;
+
+  state.reg.rddata = 0;
+  state.reg.rdflags = 0;
+
+  int exception = 0;
+  Waddr addr;
+  Waddr origaddr;
+  PTEUpdate dummy_pteu;
+  PageFaultErrorCode pfec;
+  bool annul;
+
+  LoadStoreQueueEntry dummy;
+  setzero(dummy);
+  void* mapped = addrgen(dummy, origaddr, virtpage, ra, rb, rc, uop.is_asf ? pteupdate : dummy_pteu,
+                         addr, exception, pfec, annul);
+
+  /*S.D. TESTING: */ if unlikely (uop.is_asf && asf_prefetch_exception()) exception = EXCEPTION_ASF_Testing;
+  /* LOCKed prefetches are illegal within an ASF critical section! */
+  if unlikely (uop.is_asf && thread.asf_in_crit_sec)
+    exception = EXCEPTION_CheckFailed;
+
+  // Ignore bogus prefetches:
+  if unlikely (exception) {
+    if unlikely (uop.is_asf) {
+      if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,"@",sim_cycle,
+                                 ": Prefetch has exception ",exception_name(exception),endl;
+      /* Prefetches used for specification in ASF do generate pagefaults and clear the LLB when committed! */
+      state.reg.rdflags |= FLAG_INV;
+      state.reg.rddata   = exception | ((W64)pfec << 32);
+    }
+    return;
+  }
+
+  // Ignore unaligned prefetches (should never happen)
+  if unlikely (annul) return;
+
+  // (Stats are already updated by initiate_prefetch())
+  Waddr physaddr = (annul) ? 0 : Waddr(mapped_virt_to_phys(mapped));
+
+  if unlikely (uop.is_asf) {
+    /* Add the address to the LLB */
+    if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Hi we: ", *this,
+                               "@", uop.rip, " add to the LLB!", endl;
+    llbline = thread.locked_line_buffer.add_location(physaddr);
+    //if (uop.invalidating) thread.locked_line_buffer.mark_written(physaddr);
+  }
+  thread.locked_line_buffer.probe_other_LLBs(physaddr, uop.invalidating);
+
+#ifdef USE_TLB
+  if unlikely (!core.caches.dtlb.probe(addr, threadid)) {
+    //
+    // TLB miss: Handle the miss and do the prefetch, too!
+    //
+    if unlikely (config.event_log_enabled) event = core.eventlog.add_load_store(EVENT_LOAD_TLB_MISS, this, null, addr);
+#ifdef USE_L2_TLB
+    // S.D.: A quick hack just sets the walklevel to zero, but makes the rob
+    // stay in the tlbmiss for the L2-DTLB latency.
+    if likely (core.caches.l2dtlb.probe(addr, threadid)) {
+      cycles_left = CacheSubsystem::L2_DTLB_LATENCY;
+      tlb_walk_level = 0;
+      per_context_dcache_stats_update(threadid, load.dtlb.l2hits++);
+    } else 
+#endif /* USE_L2_TLB */
+    {
+      cycles_left = 0;
+      tlb_walk_level = thread.ctx.page_table_level_count();
+      per_context_dcache_stats_update(threadid, load.dtlb.misses++);
+    }
+    changestate(thread.rob_tlb_miss_list);
+    return;
+  }
+
+  per_context_dcache_stats_update(threadid, load.dtlb.l1hits++);
+#endif /* USE_TLB */
+  core.caches.initiate_prefetch(physaddr, origaddr, cachelevel, uop.invalidating);
 }
 
 //
@@ -1745,6 +2008,13 @@
       uopids[operand] = 0;
       preready[operand] = 1;
     }
+  }
+  // SD: Remove any locked cache-line
+  if unlikely(llbline) {
+    if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Removing reference to line ", llbline, endl;
+    thread.locked_line_buffer.remove_ref(llbline);
+    if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Line has ", llbline->refcount, " references.", endl;
+    llbline = (LLBLine*)null;
   }
 
   if unlikely (operands_still_needed) {
@@ -1930,7 +2200,7 @@
   RegisterRenameTable& commitrrt = thread.commitrrt;
   int& loads_in_flight = thread.loads_in_flight;
   int& stores_in_flight = thread.stores_in_flight;
-
+  byte queued_locks_before = thread.queued_mem_lock_release_count;
   SMTCoreEvent* event;
 
   int idx;
@@ -2055,8 +2325,15 @@
     annulrob.physreg->free();
 
     if unlikely (isclass(annulrob.uop.opcode, OPCLASS_LOAD|OPCLASS_STORE)) {
-      annulrob.release_mem_lock(true);
-      thread.flush_mem_lock_release_list();
+      /* SD: This was a nasty bug: We have to be careful to not flush away any
+         locks that are about to be removed by a committing locked RMW
+         instruction, which takes more than one cycle to commit, but has
+         already declared the locks it wants to give up!
+         There are a few things we can do here: Only flush, if the annulrob
+         actually held locks and then only flush those locks (actually only
+         a single one) added here!*/
+      if(annulrob.release_mem_lock(true))
+        thread.flush_mem_lock_release_list(queued_locks_before);
       loads_in_flight -= (annulrob.lsq->store == 0);
       stores_in_flight -= (annulrob.lsq->store == 1);
       annulrob.lsq->reset();
@@ -2084,6 +2361,17 @@
       //
       if unlikely (config.event_log_enabled) event->annul.annulras = 1;
       branchpred.annulras(annulrob.uop.predinfo);
+    }
+
+    if (annulrob.uop.is_asf)
+      if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Annulling ", annulrob,endl;
+
+    // Remove it from the ASF-LLB, if it was on one
+    if unlikely(annulrob.llbline) {
+      if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Removing reference to line ", annulrob.llbline, endl;
+      thread.locked_line_buffer.remove_ref(annulrob.llbline);
+      if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Line has ", annulrob.llbline->refcount, " references.", endl;
+      annulrob.llbline = (LLBLine*)null;
     }
 
     annulrob.reset();
@@ -2185,6 +2473,14 @@
   physreg->data = 0;
   physreg->flags = FLAG_WAIT;
   physreg->changestate(PHYSREG_WAITING);
+
+  if (uop.is_asf)
+    if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Redispatching ", uop,endl;
+  // Remove it from the ASF-LLB, if it was on one
+  if unlikely(llbline) {
+    thread.locked_line_buffer.remove_ref(llbline);
+    llbline = (LLBLine*)null;
+  }
 
   // Force ROB to be re-dispatched in program order
   cycles_left = 0;
@@ -2301,3 +2597,108 @@
 
   return COMMIT_RESULT_OK;
 }
+
+/**
+ * Issues an ASF operation (i.e. ACQUIRE / COMMIT) on the core.
+ */
+template <typename T> byte x86_genflags(T r);
+int ReorderBufferEntry::issueasf(IssueState& state, W64 rbdata) {
+  SMTCore& core         = getcore();
+  ThreadContext& thread = getthread();
+  LockedLineBuffer& llb = thread.locked_line_buffer;
+  W64 asf_err;
+
+  switch(uop.opcode) {
+    case (OP_acq):
+      /* Check whether this is a re-execution of the ACQUIRE instruction. This is caused
+         by either an exception or an abort within the critical section. There is no need
+         to check anything else, we just return the error-code and reset the state. */
+      if (thread.asf_reissue_will_fail) {
+        if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Re-Issueing failing ACQUIRE, error code: ",
+                                   thread.asf_stored_error, endl;
+        state.reg.rddata = thread.asf_stored_error;
+        break;
+      }
+      if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,"@",sim_cycle,": Issue ASF ACQUIRE.",endl;
+      /* Scan backwards through the ROB, in order to find the latest locked loads and stores, which alias with the
+         content of the LLB, in order to not ignore any changes to addresses (locked loads) or data (stores) in the LLB.
+         This works, as the LSQ is filled in-order, together with the ROB! */
+      /* NOTE: As locked prefetchw's will be decoded as normal (invalidating) loads, they will be in the LSQ.
+               Hence it should be sufficient to traverse that only! */
+      foreach_backward_before(thread.ROB, this, i) {
+        ReorderBufferEntry&  rob = thread.ROB[i];
+        if (!isclass(rob.uop.opcode, OPCLASS_MEM)) continue;
+        /* Ignore everything non-asf, except stores */
+        if likely(!(rob.uop.is_asf || isstore(rob.uop.opcode))) continue;
+        /* Locked prefetches require special treatment, as they don't have an associated LSQEntry! */
+        if unlikely (isprefetch(rob.uop.opcode)) {
+          if (rob.issued) continue;
+        } else {
+          LoadStoreQueueEntry& lsq = *rob.lsq;
+          if (!lsq.entry_valid) continue;
+          /* Make the acquire dependent also on the data of the load! */
+          if (!lsq.store && lsq.addrvalid && lsq.datavalid) continue;
+          /* and also stores, which have a known non-aliasing address */
+          if (lsq.store && lsq.addrvalid && !llb.contains(lsq.physaddr << 3)) continue;
+        }
+        /* we have found something dangerous, either:
+           -a locked prefetchw, which has not yet issued
+           [-a locked load, which does not yet have an address added to the LLB]
+           -a locked load, which does not yet have an address added to the LLB or its data not in the core!
+           -a store, which either aliases with one of the LLB entries or does not have a valid address generated
+          -> create dependency on it! */
+        operands[RS]->unref(*this, thread.threadid);
+        operands[RS] = rob.physreg;
+        operands[RS]->addref(*this, thread.threadid);
+
+        if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Found aliasing uop ",
+                                   rob.uop, " at ROB[",i,"]=", rob, endl;
+        replay();
+        return ISSUE_NEEDS_REPLAY;
+      }
+
+      /* Check if this is actually a validate! */
+      if ((W64s)rbdata < 0 ) {
+        rbdata = -(W64s)rbdata;
+        if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": ACQUIRE -> VALIDATE (",rbdata,") locs.", endl;
+        /* Check correct #locs and consistency errors */
+        if (rbdata != llb.num_locations)
+          state.reg.rddata = -2;
+        else
+          state.reg.rddata = llb.consistency_error();
+        break;
+      }
+      /* Not found anything potentially bad -> check for consistency with intermediate! */
+      if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Nothing found! Checking intermediate..", endl;
+      if unlikely(rbdata != llb.num_locations) {
+        if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Wrong number of locations for ACQUIRE: Spec:",
+                                   rbdata," vs. actual: ",llb.num_locations, endl;
+        state.reg.rddata = -2;
+        break;
+      }
+      /* Check for any conflicts during the specification phase, i.e. invalidating probes to cachelines in the LLB and fault early.
+         This is also used, when the critical section aborts asynchronously and the acquire is re-executed!
+         This will be done again at commit time, in order to find any remaining conflicts that occurred between issue and commit of
+         the acquire instruction. In case there are errors then, the acquire instruction has to be redispatched! */
+      asf_err = llb.consistency_error();
+      if unlikely(asf_err) { 
+        if (logable(5))  logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,
+                                    ": ACQUIRE could not create a valid snapshot! Error ", asf_err, endl;
+        state.reg.rddata = uop.rip;
+        break;
+      }
+
+      if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Taking snapshot!",endl;
+      llb.snapshot();
+      state.reg.rddata = 0;
+      break;
+    case (OP_com):
+      /* Nothing to be done yet for releasing at issue time, see comment in ROBEntry::issue */
+      break;
+    default:
+      assert(false);
+  }
+
+  state.reg.rdflags = x86_genflags<W64>(state.reg.rddata);
+  return ISSUE_COMPLETED;
+}
diff -r 10448c053ad6 smtpipe.cpp
--- a/smtpipe.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/smtpipe.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -5,6 +5,8 @@
 //
 // Copyright 2003-2006 Matt T. Yourst <yourst@yourst.com>
 // Copyright 2006 Hui Zeng <hzeng@cs.binghamton.edu>
+// Copyright 2007-2008 Advanced Micro Devices, Inc.
+// Contributed by Stephan Diestelhorst <stephan.diestelhorst@amd.com>
 //
 
 #include <globals.h>
@@ -102,6 +104,9 @@
   foreach_forward(ROB, i) {
     ReorderBufferEntry& rob = ROB[i];
     rob.release_mem_lock(true);
+    // SD: Note, that we might actually flush halfway through a locked RMW
+    // instruction. But this is not as bad as in the annul case, as the
+    // store (the W-part) will be wiped, too!
     flush_mem_lock_release_list();
     rob.physreg->reset(threadid); // free all register allocated by rob:
   }
@@ -127,7 +132,7 @@
       obj->reset(threadid);
     }     
   }
-  
+
   reset_fetch_unit(ctx.commitarf[REG_rip]);
   rob_states.reset();
 
@@ -162,7 +167,9 @@
 
   fetchrip = realrip;
   fetchrip.update(ctx);
+
   stall_frontend = 0;
+  stall_on_eom   = 0;
   waiting_for_icache_fill = 0;
   fetchq.reset();
   current_basic_block_transop_index = 0;
@@ -355,7 +362,7 @@
 
   int fetchcount = 0;
   int taken_branch_count = 0;
-
+  
   SMTCoreEvent* event;
 
   if unlikely (stall_frontend) {
@@ -378,7 +385,7 @@
     return true;
   }
 
-  while ((fetchcount < FETCH_WIDTH) && (taken_branch_count == 0)) {
+  while ((fetchcount < FETCH_WIDTH) && (taken_branch_count == 0) && !stall_frontend) {
     if unlikely (!fetchq.remaining()) {
       if unlikely (config.event_log_enabled) { 
         if (!fetchcount) {
@@ -514,6 +521,9 @@
       per_context_smtcore_stats_update(threadid, fetch.stop.microcode_assist++);
       stall_frontend = 1;      
     }
+
+    // SD: Try w/o the pipeline stalls!
+    if unlikely (stall_on_eom && transop.eom) {stall_frontend = true; stall_on_eom = false;}
 
     per_context_smtcore_stats_update(threadid, fetch.uops++);
 
@@ -1104,11 +1114,103 @@
   
   ThreadContext& thread = getthread();
   thread.issueq_count++;
+#ifndef MULTI_IQ
   assert(thread.issueq_count >= 0 && thread.issueq_count <= ISSUE_QUEUE_SIZE);
-
+#endif
   assert(ok);
 
   return operands_still_needed;
+}
+
+// SD: This is an attempt at an improved dispatcher. Returns the index of the executable cluster with the lowest penalty, or -1 if no usable issue queue has a free slot.
+int ReorderBufferEntry::select_cluster_penalty() {
+  SMTCoreEvent* event;
+  SMTCore& core = getcore();
+  ThreadContext& thread = getthread();
+  // With a single cluster there is nothing to choose: only check whether its issue queue has a free slot.
+  if (MAX_CLUSTERS == 1) {
+    int cluster_issue_queue_avail_count[MAX_CLUSTERS];
+    getcore().sched_get_all_issueq_free_slots(cluster_issue_queue_avail_count);
+    return (cluster_issue_queue_avail_count[0] > 0) ? 0 : -1;
+  }
+
+  W32 executable_on_cluster = executable_on_cluster_mask;
+  // NOTE(review): F is used as a shift count in the operand term but as a plain multiplier in the FU term below -- confirm the intended scaling.
+  static const int F = 8;   // fixed point arithmetic: scale applied to the penalty terms
+  int cluster_operand_penalty[MAX_CLUSTERS];
+  foreach (i, MAX_CLUSTERS) { cluster_operand_penalty[i] = 0; }
+
+  // SD: Instead of computing bonuses to run on the same cluster as one of the
+  //     operands, we will compute penalties which correspond to distances
+  //     between us and the operand (only for operands that are still waiting or on the bypass).
+  foreach (i, MAX_OPERANDS) {
+    PhysicalRegister& r = *operands[i];
+    if ((&r) && ((r.state == PHYSREG_WAITING) || (r.state == PHYSREG_BYPASS)) && (r.rob->cluster >= 0))
+      foreach (c, MAX_CLUSTERS)
+        cluster_operand_penalty[c] += intercluster_latency_map[r.rob->cluster][c] << F;
+  }
+  // and then also take the FUs into account. Try to spread the uops evenly
+  // amongst them! Every uop on a cluster's dispatched list that shares an FU with us adds to that cluster's penalty.
+  ReorderBufferEntry* rob;
+  foreach (c, MAX_CLUSTERS) {
+    foreach_list_mutable(thread.rob_dispatched_list[c], rob, entry, nextentry) {
+      // SD: The idea is to minimise the collision probability of uops when
+      //     assuming random selection if multiple FUs are available in a
+      //     cluster. FU_can: FUs usable by the other uop; FU_can_we: FUs usable by this uop.
+      W32 FU_can     = fuinfo[rob->uop.opcode].fu & clusters[c].fu_mask;
+      W32 FU_can_we  = fuinfo[uop.opcode].fu      & clusters[c].fu_mask;
+      W32 FU_overlap = FU_can & FU_can_we;
+      if unlikely(FU_overlap)
+        cluster_operand_penalty[c] += (F * popcount(FU_overlap)) /
+                                      (popcount(FU_can) * popcount(FU_can_we));
+    }
+  }
+
+  assert(executable_on_cluster);
+
+  // If a given cluster's issue queue is full, try another cluster: build a mask of clusters with at least one free slot.
+  int cluster_issue_queue_avail_count[MAX_CLUSTERS];
+  W32 cluster_issue_queue_avail_mask = 0;
+
+  getcore().sched_get_all_issueq_free_slots(cluster_issue_queue_avail_count);
+
+  foreach (i, MAX_CLUSTERS) {
+    cluster_issue_queue_avail_mask |= ((cluster_issue_queue_avail_count[i] > 0) << i);
+  }
+
+  executable_on_cluster &= cluster_issue_queue_avail_mask;
+  // The event records the unfiltered executable_on_cluster_mask together with the per-cluster free slot counts.
+  if unlikely (config.event_log_enabled) {
+    event = getcore().eventlog.add(EVENT_CLUSTER_OK, this);
+    event->select_cluster.allowed_clusters = executable_on_cluster_mask;
+    foreach (i, MAX_CLUSTERS) event->select_cluster.iq_avail[i] = cluster_issue_queue_avail_count[i];
+  }
+  // None of the clusters we can execute on has a free issue queue slot: report failure to the caller.
+  if unlikely (!executable_on_cluster) {
+    if unlikely (config.event_log_enabled) event->type = EVENT_CLUSTER_NO_CLUSTER;
+    return -1;
+  }
+  // Start from the candidate chosen by find_random_set_bit(), then switch to any executable cluster with a strictly lower penalty.
+  int n = 0;
+  // SD: Using the sim_cycle as a source of randomness is utterly stupid,
+  //     because it can put two similar uops on the same cluster, whereas
+  //     spreading them might be much more useful! Hence mix in core.dispatchcount as well.
+  int ticker = sim_cycle*DISPATCH_WIDTH + core.dispatchcount;
+  int cluster = find_random_set_bit(executable_on_cluster, ticker);
+  n = cluster_operand_penalty[cluster];
+  foreach (i, MAX_CLUSTERS) {
+    if ((cluster_operand_penalty[i] < n) && bit(executable_on_cluster, i)) {
+      n = cluster_operand_penalty[i];
+      cluster = i;
+    }
+  }
+
+  per_context_smtcore_stats_update(threadid, dispatch.cluster[cluster]++);
+
+  if unlikely (config.event_log_enabled) event->cluster = cluster;
+
+  return cluster;
+
 }
 
 int ReorderBufferEntry::select_cluster() {
@@ -1153,10 +1255,10 @@
     if unlikely (config.event_log_enabled) event->type = EVENT_CLUSTER_NO_CLUSTER;
     return -1;
   }
-  
+
   int n = 0;
   int cluster = find_random_set_bit(executable_on_cluster, sim_cycle);
-  
+
   foreach (i, MAX_CLUSTERS) {
     if ((cluster_operand_tally[i] > n) && bit(executable_on_cluster, i)) {
       n = cluster_operand_tally[i];
@@ -1186,8 +1288,11 @@
 
     // All operands start out as valid, then get put on wait queues if they are not actually ready.
 
+#ifndef PENALTY_DISPATCHER
     rob->cluster = rob->select_cluster();
-
+#else
+    rob->cluster = rob->select_cluster_penalty();
+#endif
     //
     // An available cluster could not be found. This only happens 
     // when all applicable cluster issue queues are full. Since
@@ -1295,6 +1400,9 @@
     if unlikely (rob->cycles_left <= 0) {
       if unlikely (config.event_log_enabled) core.eventlog.add(EVENT_COMPLETE, rob);
       rob->changestate(rob_completed_list[cluster]);
+      // SD: Make the register ready here, instead of at issue time. This should be more correct!
+      //     Together with the 0 cycle fwd = bypass fix elsewhere, this works as expected!
+      rob->physreg->flags &= ~FLAG_WAIT;
       rob->physreg->complete();
       rob->forward_cycle = 0;
       rob->fu = 0;
@@ -1499,6 +1607,10 @@
 
     if unlikely (core.commitcount >= COMMIT_WIDTH) break;
     rc = rob.commit();
+
+    /* Injection of random interrupts for testing of single-core ASF on PTLsim/classic happens here */
+    if unlikely(asf_in_crit_sec && asf_interrupt_critsec()) rc = COMMIT_RESULT_INTERRUPT;
+
     if likely (rc == COMMIT_RESULT_OK) {
       core.commitcount++;
       last_commit_at_cycle = sim_cycle;
@@ -1513,8 +1625,8 @@
   return rc;
 }
 
-void ThreadContext::flush_mem_lock_release_list() {
-  foreach (i, queued_mem_lock_release_count) {
+void ThreadContext::flush_mem_lock_release_list(byte from /*=0*/) {
+  for (size_t i = from; i < queued_mem_lock_release_count; i++) {
     W64 lockaddr = queued_mem_lock_release_list[i];
 
     MemoryInterlockEntry* lock = interlocks.probe(lockaddr);
@@ -1534,11 +1646,9 @@
       event->threadid = ctx.vcpuid;
       event->loadstore.sfr.physaddr = lockaddr >> 3;
     }
-
     interlocks.invalidate(lockaddr);
   }
-
-  queued_mem_lock_release_count = 0;
+  queued_mem_lock_release_count = from;
 }
 
 #ifdef PTLSIM_HYPERVISOR
@@ -1594,6 +1704,14 @@
   // its P (internal) bit must be set.
   //
 
+  //
+  // SD: The above explanation is, at least IMHO, somewhat unsatisfactory on
+  // why flushing the locks here actually makes sense. It does once you
+  // realise that, in order to have something to flush, this must be the
+  // fence directly after the locked RMW instruction, as the lock is only
+  // added to the flush list at the commit of the load (the R part), which
+  // will definitely happen after the commit of the preceding fence.
+  //
   if unlikely ((uop.opcode == OP_mf) && ready_to_commit() && (!load_store_second_phase)) {
     fencewakeup();
     thread.flush_mem_lock_release_list();
@@ -1662,7 +1780,7 @@
       found_eom = true;
       break;
     }
-    
+
     if likely (subrob.uop.eom) break;
   }
 
@@ -1824,6 +1942,11 @@
 
   if likely (uop.som) assert(ctx.commitarf[REG_rip] == uop.rip); 
 
+  if (uop.is_asf) {
+    int asf_commit_rc = commit_asf_instruction(); 
+    if (asf_commit_rc != COMMIT_RESULT_OK) return asf_commit_rc;
+  }
+
   //
   // The commit of all uops in the x86 macro-op is guaranteed to happen after this point
   //
@@ -1855,7 +1978,7 @@
       assert(!isbranch(uop.opcode));
       ctx.commitarf[REG_rip] += uop.bytes;
     }
-    if unlikely (config.event_log_enabled) event->commit.target_rip = ctx.commitarf[REG_rip];
+    if unlikely (config.event_log_enabled) {event->commit.target_rip = ctx.commitarf[REG_rip]; event->commit.krn = ctx.kernel_mode;}
   }
 
   if likely ((!ld) & (!st) & (!uop.nouserflags)) {
@@ -1888,7 +2011,25 @@
     Waddr mfn = (lsq->physaddr << 3) >> 12;
     smc_setdirty(mfn);
 
-    if (lsq->bytemask) assert(core.caches.commitstore(*lsq, thread.threadid) == 0);
+    // mark the line dirty, in case we store speculatively to it
+    if unlikely (thread.asf_in_crit_sec) thread.locked_line_buffer.mark_written(lsq->physaddr << 3);
+
+    // Probe the other LLBs, where the original contents of speculatively (inside an ASF-CS) modified memory are located.
+    // In case the line is modified somewhere else, we will receive a pointer to the backed up data.
+    // In real hardware, the other LLB would reply with the original data directly from the LLB, in response to
+    // our invalidating probe, or would delay the response until it has finished its rollback. In the simulator,
+    // however, data is written w/o waiting for the cache miss to be served. Hence, do the forwarding instantly, too.
+    void* orig_line = thread.locked_line_buffer.probe_other_LLBs(lsq->physaddr << 3, true);
+    if unlikely (orig_line)  {
+      // options: a) we could wait until all other CS have been rolled back, but that could stall this store forever
+      //             if no contention control is employed
+      // hence:   b) merge the data from the other LLB and store the entire updated cacheline. We then have to ensure that the line is
+      //             _NOT_ overwritten by the back-rolling CS. We also have to write back the entire $-line, in case there have been any
+      //             other modifications.
+      memcpy(phys_to_mapped_virt(floor(lsq->physaddr << 3, LLB_LINE_SIZE)), orig_line, LLB_LINE_SIZE);
+      thread.locked_line_buffer.mark_clean_others(lsq->physaddr << 3);
+    }
+    if (lsq->bytemask) assert(core.caches.commitstore(*lsq, virtpage, uop.internal, thread.threadid) == 0);
   }
 
   if unlikely (pteupdate) {
@@ -2042,3 +2183,146 @@
     1, 1, 1, 1, 1, 1, 1, 1,
   };
 };
+
+
+/**
+ * This aborts a currently running ASF transaction, by
+ * copying (instantly) all backed up values from the
+ * LLB to the caches and memory (this is the same in PTLsim).
+ * This should be called during the commit phase of the
+ * instruction causing the abort. It scans the in-flight loads and
+ * stores and calls redispatch_dependents() on the ROB entry of each
+ * valid one whose address is contained in the LLB. This implements
+ * Leendert's suggestion of lazy abort at the final release.
+ */
+
+void ReorderBufferEntry::abort_asf() {
+  SMTCore&        core  = getcore();
+  ThreadContext& thread = getthread();
+  LockedLineBuffer& llb = thread.locked_line_buffer;
+  // Precondition: the thread must currently be inside an ASF critical section.
+  assert(thread.asf_in_crit_sec);
+  if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Aborting ASF crit-sec, µop: ", uop,
+                             " LLB: ", llb.num_locations, endl;
+  /* Scan through the LoadStoreQueue and redispatch every load and store that hits one of the addresses in the LLB. */
+  foreach_forward(thread.LSQ, i){
+    LoadStoreQueueEntry& lsq = thread.LSQ[i];
+    /* Check for valid entries that alias (physaddr is shifted left by 3 before the LLB lookup) */
+    if (lsq.entry_valid && llb.contains(lsq.physaddr << 3)) {
+      if (logable(5)) {
+        logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Found aliasing ", lsq.store ? "store":"load", " at LSQ[",i,"] ";
+        logfile << " which happens ",(uop.uuid > lsq.rob->uop.uuid) ? "earlier": "later", " (we=",uop.uuid,", them=",lsq.rob->uop.uuid,")",endl;
+        logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Redispatching associated ROBEntry ",lsq.rob->uop,endl;
+      }
+      /* Replaying is not enough! *///lsq.rob->replay();
+      lsq.rob->redispatch_dependents();
+    }
+  }
+  /* Restore the contents from the LLB and clear it, then leave the critical section */
+  llb.abort();
+  thread.asf_in_crit_sec = false;
+}
+
+/**
+ * Hook into ASF: processes the commit of the ASF state changing instructions
+ * ACQUIRE and COMMIT. It re-checks the LLB for capacity and consistency errors and
+ * returns COMMIT_RESULT_NONE if one is found, COMMIT_RESULT_OK in all other cases.
+ */
+int ReorderBufferEntry::commit_asf_instruction() {
+  assert(uop.is_asf);
+
+  SMTCore&       core   = getcore();
+  ThreadContext& thread = getthread();
+  LockedLineBuffer& llb = thread.locked_line_buffer;
+  Context&          ctx = thread.ctx;
+  // Value of the RB operand: the immediate if RB is REG_imm, the register data otherwise.
+  PhysicalRegister& rb = *operands[RB];
+  W64s rbdata          = (uop.rb == REG_imm) ? uop.rbimm : rb.data;
+
+  if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": Commiting ASF instruction ", *this,
+                             " LLB: ", llb.num_locations, endl;
+
+  /* The branches below unstall the frontend (stall_frontend / stall_on_eom) to allow fetching of the following instructions */
+
+  if unlikely(uop.opcode == OP_acq) {
+    assert(!thread.asf_in_crit_sec);
+    /* Check for VALIDATEs, which are just ACQUIREs with negative number of locs. They commit without any state change. */
+    if unlikely (rbdata < 0) return COMMIT_RESULT_OK;
+
+      /* We already have some problems (non-zero result register) -> do not enter the critical section! */
+    if unlikely(physreg->data) {
+      /* Moved here, in order to be really sure that this ACQUIRE actually is
+         the one that makes it.
+         NOTE: Any later locked-loads can't fill the LLB already, as they
+         would have to be after the release and the release is after the
+         acquire! */
+      llb.clear();
+      thread.stall_frontend = false; thread.stall_on_eom = false;
+      thread.asf_reissue_will_fail = false;
+      /* All side-effects have already been cleared! Just do NOT enter the crit.sec.*/
+      return COMMIT_RESULT_OK;
+    }
+    /* Additional check for ASF's maximum capacity; on overflow the stored error code is -3. */
+    if unlikely (llb.num_locations > ASF_MAX_LINES) {
+      if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,
+                                 ": ACQUIRE detected too large ASF-CS: ", llb.num_locations, " vs. ", ASF_MAX_LINES, endl;
+      redispatch_dependents(true);
+      thread.asf_reissue_will_fail = true;
+      thread.asf_stored_error      = -3;
+      return  COMMIT_RESULT_NONE;
+    }
+    /* Late check for any problems during the execution (between issue and commit) of
+       the ACQUIRE instruction, before we (finally) enter the critical section. */
+    W64 llb_err = llb.consistency_error();
+    if unlikely(llb_err) {
+      if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,
+                                 ": ACQUIRE could not create a valid snapshot, detected LATE! Error ", llb_err, endl;
+      /* As this has happened fairly late, other ops might actually already execute inside the CS.
+         -> Redispatch, the conditional jump after the ACQUIRE will move the control flow away
+         from any instruction inside the CS and annul any speculative data! */
+      /* The acquire will fail, because the number of specified locations will be wrong. Alternatively,
+         one might specify some error_code, this is TODO! */
+      /* Should that actually be done at commit time of the acquire? Probably the Acquire instruction should
+         be prevented from committing, by redispatching it, when a conflicting access hits inside the LLB. TODO!
+         For now, this is ugly, but works! */
+      redispatch_dependents(true);
+      thread.asf_reissue_will_fail = true;
+     /* TODO: Current spec says simply -18 for this, but how about using the RIP here? We also could
+              incorporate some information from the LLB. TBD! For now the LLB error value itself is stored. */
+      thread.asf_stored_error = llb_err;//uop.rip;
+      return  COMMIT_RESULT_NONE;
+    }
+
+    /* All is well -> inside crit. section, save our acquire rip and the committed stack pointer */
+    thread.asf_in_crit_sec       = true;
+    thread.asf_failing_acquire   = uop.rip;
+    thread.asf_saved_rsp         = ctx.commitarf[REG_rsp];
+    thread.asf_reissue_will_fail = false;
+    thread.stall_frontend        = false;
+    thread.stall_on_eom          = false;
+
+  } else if unlikely (uop.opcode == OP_com) {
+    thread.stall_frontend        = false;
+
+    /* COMMIT could also be used to just clear the LLB, without an ACQUIRE; the check below only runs inside a crit. section */
+    if likely (thread.asf_in_crit_sec) {
+      /* Do a final check on the sanity of this transaction! On error, return without committing the LLB. */
+      W64 llb_err = llb.consistency_error();
+      if unlikely(llb_err) {
+        if (logable(5)) logfile << "[vcpu ", thread.ctx.vcpuid,"]"__FILE__,__LINE__,": COMMIT found a LATE error: ", llb_err, endl;
+        return COMMIT_RESULT_NONE;
+      }
+    }
+    llb.commit();
+    thread.asf_in_crit_sec       = false;
+    thread.asf_failing_acquire   = 0;
+    thread.asf_saved_rsp         = 0;
+  }
+  // ASF-flagged ld / ld_pre / st uops: raise the x86 invalid-opcode exception if committed inside a critical section.
+  if ((uop.opcode == OP_ld) || (uop.opcode == OP_ld_pre) || (uop.opcode == OP_st)) {
+    /* LOCKed loads are illegal inside an ASF-critical-section (this branch also covers OP_st) */
+    if unlikely (thread.asf_in_crit_sec)
+      ctx.propagate_x86_exception(EXCEPTION_x86_invalid_opcode);
+  }
+  return COMMIT_RESULT_OK;
+}
diff -r 10448c053ad6 stats.h
--- a/stats.h	Thu May 31 15:36:20 2007 +0200
+++ b/stats.h	Wed Nov 05 14:15:51 2008 +0100
@@ -41,7 +41,7 @@
 // need to replicate the vcpu0,vcpu1,... structures in several
 // places below.
 //
-static const int MAX_SIMULATED_VCPUS = 4;
+static const int MAX_SIMULATED_VCPUS = 8;
 
 struct EventsInMode { // rootnode: summable
   W64 user64;
diff -r 10448c053ad6 uopimpl.cpp
--- a/uopimpl.cpp	Thu May 31 15:36:20 2007 +0200
+++ b/uopimpl.cpp	Wed Nov 05 14:15:51 2008 +0100
@@ -1178,6 +1178,9 @@
   case OP_ld_pre:
   case OP_st:
   case OP_mf:
+  /* ASF instructions too ;) */
+  case OP_acq:
+  case OP_com:
     func = uop_impl_nop; break;
   case OP_rotl: 
     func = implmap_rotl[size][setflags]; break;

