From 6e7718395efb2bf299224e5188b32da47efe0883 Mon Sep 17 00:00:00 2001
From: mig
Date: Sat, 12 Jan 2013 10:14:06 +0000
Subject: even better ... or so I hope: also inlining INST_PUSH1 in the
 peephole, checking for ISC after LOAD1 and PUSH1

---
 generic/tclExecute.c | 93 ++++++++++++++++++++++------------------------------
 1 file changed, 40 insertions(+), 53 deletions(-)

diff --git a/generic/tclExecute.c b/generic/tclExecute.c
index 1ed8949..4d758f6 100644
--- a/generic/tclExecute.c
+++ b/generic/tclExecute.c
@@ -2250,23 +2250,6 @@ TEBCresume(
     }

   cleanup0:
-#ifdef TCL_COMPILE_DEBUG
-    /*
-     * Skip the stack depth check if an expansion is in progress.
-     */
-
-    CHECK_STACK();
-    if (traceInstructions) {
-        fprintf(stdout, "%2d: %2d ", iPtr->numLevels, (int) CURR_DEPTH);
-        TclPrintInstruction(codePtr, pc);
-        fflush(stdout);
-    }
-#endif /* TCL_COMPILE_DEBUG */
-
-#ifdef TCL_COMPILE_STATS
-    iPtr->stats.instructionCount[*pc]++;
-#endif
-
     /*
      * Check for asynchronous handlers [Bug 746722]; we do the check every
      * ASYNC_CHECK_COUNT_MASK instruction, of the form (2**n-1).
@@ -2298,16 +2281,51 @@ TEBCresume(
         CACHE_STACK_INFO();
     }

+    /*
+     * These two instructions account for 26% of all instructions (according
+     * to measurements on tclbench by Ben Vitale
+     * [http://www.cs.toronto.edu/syslab/pubs/tcl2005-vitale-zaleski.pdf]
+     * Resolving them before the switch reduces the cost of branch
+     * mispredictions, seems to improve runtime by 5% to 15%, and (amazingly!)
+     * reduces total obj size.
+     */
+
+  peepholeStart:
+#ifdef TCL_COMPILE_STATS
+    iPtr->stats.instructionCount[*pc]++;
+#endif
+
+#ifdef TCL_COMPILE_DEBUG
+    /*
+     * Skip the stack depth check if an expansion is in progress.
+     */
+
+    CHECK_STACK();
+    if (traceInstructions) {
+        fprintf(stdout, "%2d: %2d ", iPtr->numLevels, (int) CURR_DEPTH);
+        TclPrintInstruction(codePtr, pc);
+        fflush(stdout);
+    }
+#endif /* TCL_COMPILE_DEBUG */
+
     TCL_DTRACE_INST_NEXT();
+
+    if (*pc == INST_LOAD_SCALAR1) {
+        goto instLoadScalar1;
+    }

-    while (*pc == INST_START_CMD) {
+    if (*pc == INST_PUSH1) {
+        PUSH_OBJECT(codePtr->objArrayPtr[TclGetUInt1AtPtr(pc+1)]);
+        TRACE_WITH_OBJ(("%u => ", TclGetInt1AtPtr(pc+1)), OBJ_AT_TOS);
+        pc += 2;
+        goto peepholeStart;
+    }
+
+    if (*pc == INST_START_CMD) {
         /*
          * Peephole: do not run INST_START_CMD, just skip it
          */

-#ifdef TCL_COMPILE_STATS
-        iPtr->stats.instructionCount[*pc]++;
-#endif
         iPtr->cmdCount += TclGetUInt4AtPtr(pc+5);
         if (checkInterp) {
             checkInterp = 0;
@@ -2317,23 +2335,9 @@ TEBCresume(
             }
         }
         pc += 9;
+        goto peepholeStart;
     }

-    /*
-     * These two instructions account for 26% of all instructions (according
-     * to measurements on tclbench by Ben Vitale
-     * [http://www.cs.toronto.edu/syslab/pubs/tcl2005-vitale-zaleski.pdf]
-     * Resolving them before the switch reduces the cost of branch
-     * mispredictions, seems to improve runtime by 5% to 15%, and (amazingly!)
-     * reduces total obj size.
-     */
-
-    if (*pc == INST_LOAD_SCALAR1) {
-        goto instLoadScalar1;
-    } else if (*pc == INST_PUSH1) {
-        goto instPush1Peephole;
-    }
-
     switch (*pc) {
     case INST_SYNTAX:
     case INST_RETURN_IMM: {
@@ -2484,23 +2488,6 @@ TEBCresume(
         (void) POP_OBJECT();
         goto abnormalReturn;

-    case INST_PUSH1:
-    instPush1Peephole:
-        PUSH_OBJECT(codePtr->objArrayPtr[TclGetUInt1AtPtr(pc+1)]);
-        TRACE_WITH_OBJ(("%u => ", TclGetInt1AtPtr(pc+1)), OBJ_AT_TOS);
-        pc += 2;
-#if !TCL_COMPILE_DEBUG
-        /*
-         * Runtime peephole optimisation: check if we are pushing again.
-         */
-
-        if (*pc == INST_PUSH1) {
-            TCL_DTRACE_INST_NEXT();
-            goto instPush1Peephole;
-        }
-#endif
-        NEXT_INST_F(0, 0, 0);
-
     case INST_PUSH4:
         objResultPtr = codePtr->objArrayPtr[TclGetUInt4AtPtr(pc+1)];
         TRACE_WITH_OBJ(("%u => ", TclGetUInt4AtPtr(pc+1)), objResultPtr);
--
cgit v0.12
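
Note (not part of the patch): the comment the patch adds describes the dispatch idea in prose: resolve the most frequent opcodes (INST_PUSH1, INST_LOAD_SCALAR1, and now INST_START_CMD) with a few cheap, well-predicted if-checks at peepholeStart before falling into the big switch, and have each inlined handler jump back to peepholeStart rather than re-entering the switch. The following is a minimal, self-contained sketch of that pattern, not the real TEBCresume() machinery; the opcode names, the run() function, and the toy stack machine are hypothetical stand-ins chosen only to illustrate the structure.

/*
 * Sketch of the "peephole before the switch" dispatch pattern.  All names
 * (op_t, OP_PUSH, OP_LOAD, run(), ...) are hypothetical, not Tcl's.
 */
#include <stdio.h>

typedef enum { OP_PUSH, OP_LOAD, OP_ADD, OP_HALT } op_t;

static int
run(const unsigned char *pc, const int *consts, int *vars)
{
    int stack[64], *tos = stack;        /* tos points one past the top */

  peepholeStart:
    /*
     * Handle the hottest instructions with cheap, predictable branches
     * before the big switch; each handler jumps straight back here
     * instead of going through the switch dispatcher again.
     */
    if (*pc == OP_PUSH) {
        *tos++ = consts[pc[1]];
        pc += 2;
        goto peepholeStart;
    }
    if (*pc == OP_LOAD) {
        *tos++ = vars[pc[1]];
        pc += 2;
        goto peepholeStart;
    }

    switch (*pc) {                      /* everything else lands here */
    case OP_ADD:
        tos[-2] += tos[-1];
        --tos;
        ++pc;
        goto peepholeStart;
    case OP_HALT:
        return tos[-1];
    }
    return -1;                          /* unknown opcode */
}

int
main(void)
{
    /* push consts[0], load vars[0], add, halt  =>  3 + 4 = 7 */
    const unsigned char code[] = { OP_PUSH, 0, OP_LOAD, 0, OP_ADD, OP_HALT };
    const int consts[] = { 3 };
    int vars[] = { 4 };

    printf("%d\n", run(code, consts, vars));
    return 0;
}

Compiled with any C compiler this prints 7. The structural point is the one the patch's comment makes: the frequent-opcode tests are cheap forward branches that the CPU predicts well, which is where the reported 5% to 15% runtime improvement is claimed to come from.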