From 6e7718395efb2bf299224e5188b32da47efe0883 Mon Sep 17 00:00:00 2001
From: mig
Date: Sat, 12 Jan 2013 10:14:06 +0000
Subject: even better ... or so I hope: also inlining INST_PUSH1 in the
 peephole, checking for ISC after LOAD1 and PUSH1

---
 generic/tclExecute.c | 93 ++++++++++++++++++++++------------------------------
 1 file changed, 40 insertions(+), 53 deletions(-)

diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index 1ed8949..4d758f6 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -2250,23 +2250,6 @@ TEBCresume(
     }

   cleanup0:
-#ifdef TCL_COMPILE_DEBUG
-    /*
-     * Skip the stack depth check if an expansion is in progress.
-     */
-
-    CHECK_STACK();
-    if (traceInstructions) {
-        fprintf(stdout, "%2d: %2d ", iPtr->numLevels, (int) CURR_DEPTH);
-        TclPrintInstruction(codePtr, pc);
-        fflush(stdout);
-    }
-#endif /* TCL_COMPILE_DEBUG */
-
-#ifdef TCL_COMPILE_STATS
-    iPtr->stats.instructionCount[*pc]++;
-#endif
-
     /*
      * Check for asynchronous handlers [Bug 746722]; we do the check every
      * ASYNC_CHECK_COUNT_MASK instruction, of the form (2**n-1).
@@ -2298,16 +2281,51 @@ TEBCresume(
         CACHE_STACK_INFO();
     }

+    /*
+     * These two instructions account for 26% of all instructions (according
+     * to measurements on tclbench by Ben Vitale
+     * [http://www.cs.toronto.edu/syslab/pubs/tcl2005-vitale-zaleski.pdf]
+     * Resolving them before the switch reduces the cost of branch
+     * mispredictions, seems to improve runtime by 5% to 15%, and (amazingly!)
+     * reduces total obj size.
+     */
+
+  peepholeStart:
+#ifdef TCL_COMPILE_STATS
+    iPtr->stats.instructionCount[*pc]++;
+#endif
+
+#ifdef TCL_COMPILE_DEBUG
+    /*
+     * Skip the stack depth check if an expansion is in progress.
+     */
+
+    CHECK_STACK();
+    if (traceInstructions) {
+        fprintf(stdout, "%2d: %2d ", iPtr->numLevels, (int) CURR_DEPTH);
+        TclPrintInstruction(codePtr, pc);
+        fflush(stdout);
+    }
+#endif /* TCL_COMPILE_DEBUG */
+
     TCL_DTRACE_INST_NEXT();
+
+    if (*pc == INST_LOAD_SCALAR1) {
+        goto instLoadScalar1;
+    }

-    while (*pc == INST_START_CMD) {
+    if (*pc == INST_PUSH1) {
+        PUSH_OBJECT(codePtr->objArrayPtr[TclGetUInt1AtPtr(pc+1)]);
+        TRACE_WITH_OBJ(("%u => ", TclGetInt1AtPtr(pc+1)), OBJ_AT_TOS);
+        pc += 2;
+        goto peepholeStart;
+    }
+
+    if (*pc == INST_START_CMD) {
         /*
          * Peephole: do not run INST_START_CMD, just skip it
          */

-#ifdef TCL_COMPILE_STATS
-        iPtr->stats.instructionCount[*pc]++;
-#endif
         iPtr->cmdCount += TclGetUInt4AtPtr(pc+5);
         if (checkInterp) {
             checkInterp = 0;
@@ -2317,23 +2335,9 @@ TEBCresume(
             }
         }
         pc += 9;
+        goto peepholeStart;
     }

-    /*
-     * These two instructions account for 26% of all instructions (according
-     * to measurements on tclbench by Ben Vitale
-     * [http://www.cs.toronto.edu/syslab/pubs/tcl2005-vitale-zaleski.pdf]
-     * Resolving them before the switch reduces the cost of branch
-     * mispredictions, seems to improve runtime by 5% to 15%, and (amazingly!)
-     * reduces total obj size.
-     */
-
-    if (*pc == INST_LOAD_SCALAR1) {
-        goto instLoadScalar1;
-    } else if (*pc == INST_PUSH1) {
-        goto instPush1Peephole;
-    }
-
     switch (*pc) {
     case INST_SYNTAX:
     case INST_RETURN_IMM: {
@@ -2484,23 +2488,6 @@ TEBCresume(
         (void) POP_OBJECT();
         goto abnormalReturn;

-    case INST_PUSH1:
-    instPush1Peephole:
-        PUSH_OBJECT(codePtr->objArrayPtr[TclGetUInt1AtPtr(pc+1)]);
-        TRACE_WITH_OBJ(("%u => ", TclGetInt1AtPtr(pc+1)), OBJ_AT_TOS);
-        pc += 2;
-#if !TCL_COMPILE_DEBUG
-        /*
-         * Runtime peephole optimisation: check if we are pushing again.
-         */
-
-        if (*pc == INST_PUSH1) {
-            TCL_DTRACE_INST_NEXT();
-            goto instPush1Peephole;
-        }
-#endif
-        NEXT_INST_F(0, 0, 0);
-
     case INST_PUSH4:
         objResultPtr = codePtr->objArrayPtr[TclGetUInt4AtPtr(pc+1)];
         TRACE_WITH_OBJ(("%u => ", TclGetUInt4AtPtr(pc+1)), objResultPtr);
--
cgit v0.12
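
Note (not part of the patch): the comment the patch adds describes the dispatch idea in prose: resolve the most frequent opcodes (INST_PUSH1, INST_LOAD_SCALAR1, and now INST_START_CMD) with a few cheap, well-predicted if-checks at peepholeStart before falling into the big switch, and have each inlined handler jump back to peepholeStart rather than re-entering the switch. The following is a minimal, self-contained sketch of that pattern, not the real TEBCresume() machinery; the opcode names, the run() function, and the toy stack machine are hypothetical stand-ins chosen only to illustrate the structure.

/*
 * Sketch of the "peephole before the switch" dispatch pattern.  All names
 * (op_t, OP_PUSH, OP_LOAD, run(), ...) are hypothetical, not Tcl's.
 */
#include <stdio.h>

typedef enum { OP_PUSH, OP_LOAD, OP_ADD, OP_HALT } op_t;

static int
run(const unsigned char *pc, const int *consts, int *vars)
{
    int stack[64], *tos = stack;        /* tos points one past the top */

  peepholeStart:
    /*
     * Handle the hottest instructions with cheap, predictable branches
     * before the big switch; each handler jumps straight back here
     * instead of going through the switch dispatcher again.
     */
    if (*pc == OP_PUSH) {
        *tos++ = consts[pc[1]];
        pc += 2;
        goto peepholeStart;
    }
    if (*pc == OP_LOAD) {
        *tos++ = vars[pc[1]];
        pc += 2;
        goto peepholeStart;
    }

    switch (*pc) {                      /* everything else lands here */
    case OP_ADD:
        tos[-2] += tos[-1];
        --tos;
        ++pc;
        goto peepholeStart;
    case OP_HALT:
        return tos[-1];
    }
    return -1;                          /* unknown opcode */
}

int
main(void)
{
    /* push consts[0], load vars[0], add, halt  =>  3 + 4 = 7 */
    const unsigned char code[] = { OP_PUSH, 0, OP_LOAD, 0, OP_ADD, OP_HALT };
    const int consts[] = { 3 };
    int vars[] = { 4 };

    printf("%d\n", run(code, consts, vars));
    return 0;
}

Compiled with any C compiler this prints 7. The structural point is the one the patch's comment makes: the frequent-opcode tests are cheap forward branches that the CPU predicts well, which is where the reported 5% to 15% runtime improvement is claimed to come from.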