summary | refs | log | tree | commit | diff | stats
path: root/Tools/scripts
diff options
context:
space:
mode:
author: Michael Droettboom <mdboom@gmail.com>, 2022-11-04 10:15:54 (GMT)
committer: GitHub <noreply@github.com>, 2022-11-04 10:15:54 (GMT)
commit: 2844aa6a8eb1d486b5c432f0ed33a2082998f41e (patch)
tree: b49e2a0b9c3bd28d311ace81165c298c969d4faa /Tools/scripts
parent: 044bcc1771fe7e2f8eba21793a72ba15e75e6715 (diff)
download: cpython-2844aa6a8eb1d486b5c432f0ed33a2082998f41e.zip
cpython-2844aa6a8eb1d486b5c432f0ed33a2082998f41e.tar.gz
cpython-2844aa6a8eb1d486b5c432f0ed33a2082998f41e.tar.bz2
Support comparing two sets of pystats (GH-98816)
This adds support for comparing pystats collected from two different builds.

- The `--json-output` option can be used to load a set of raw stats and write them out as a JSON file.
- Two of these JSON files can be provided on a later run, in which case comparative results between the two are output.
Diffstat (limited to 'Tools/scripts')
-rw-r--r-- Tools/scripts/summarize_stats.py 486
1 files changed, 364 insertions, 122 deletions
diff --git a/Tools/scripts/summarize_stats.py b/Tools/scripts/summarize_stats.py
index 2e8261a..8d91bda 100644
--- a/Tools/scripts/summarize_stats.py
+++ b/Tools/scripts/summarize_stats.py
@@ -2,7 +2,9 @@
default stats folders.
"""
+import argparse
import collections
+import json
import os.path
import opcode
from datetime import date
@@ -32,6 +34,93 @@ opmap = dict(sorted(opmap.items()))
TOTAL = "specialization.deferred", "specialization.hit", "specialization.miss", "execution_count"
+def join_rows(a_rows, b_rows):
+ """
+ Joins two tables together, side-by-side, where the first column in each is a
+ common key.
+ """
+ if len(a_rows) == 0 and len(b_rows) == 0:
+ return []
+
+ if len(a_rows):
+ a_ncols = list(set(len(x) for x in a_rows))
+ if len(a_ncols) != 1:
+ raise ValueError("Table a is ragged")
+
+ if len(b_rows):
+ b_ncols = list(set(len(x) for x in b_rows))
+ if len(b_ncols) != 1:
+ raise ValueError("Table b is ragged")
+
+ if len(a_rows) and len(b_rows) and a_ncols[0] != b_ncols[0]:
+ raise ValueError("Tables have different widths")
+
+ if len(a_rows):
+ ncols = a_ncols[0]
+ else:
+ ncols = b_ncols[0]
+
+ default = [""] * (ncols - 1)
+ a_data = {x[0]: x[1:] for x in a_rows}
+ b_data = {x[0]: x[1:] for x in b_rows}
+
+ if len(a_data) != len(a_rows) or len(b_data) != len(b_rows):
+ raise ValueError("Duplicate keys")
+
+ # To preserve ordering, use A's keys as is and then add any in B that aren't
+ # in A
+ keys = list(a_data.keys()) + [k for k in b_data.keys() if k not in a_data]
+ return [(k, *a_data.get(k, default), *b_data.get(k, default)) for k in keys]
+
+def calculate_specialization_stats(family_stats, total):
+ rows = []
+ for key in sorted(family_stats):
+ if key.startswith("specialization.failure_kinds"):
+ continue
+ if key in ("specialization.hit", "specialization.miss"):
+ label = key[len("specialization."):]
+ elif key == "execution_count":
+ label = "unquickened"
+ elif key in ("specialization.success", "specialization.failure", "specializable"):
+ continue
+ elif key.startswith("pair"):
+ continue
+ else:
+ label = key
+ rows.append((f"{label:>12}", f"{family_stats[key]:>12}", f"{100*family_stats[key]/total:0.1f}%"))
+ return rows
+
+def calculate_specialization_success_failure(family_stats):
+ total_attempts = 0
+ for key in ("specialization.success", "specialization.failure"):
+ total_attempts += family_stats.get(key, 0)
+ rows = []
+ if total_attempts:
+ for key in ("specialization.success", "specialization.failure"):
+ label = key[len("specialization."):]
+ label = label[0].upper() + label[1:]
+ val = family_stats.get(key, 0)
+ rows.append((label, val, f"{100*val/total_attempts:0.1f}%"))
+ return rows
+
+def calculate_specialization_failure_kinds(name, family_stats, defines):
+ total_failures = family_stats.get("specialization.failure", 0)
+ failure_kinds = [ 0 ] * 30
+ for key in family_stats:
+ if not key.startswith("specialization.failure_kind"):
+ continue
+ _, index = key[:-1].split("[")
+ index = int(index)
+ failure_kinds[index] = family_stats[key]
+ failures = [(value, index) for (index, value) in enumerate(failure_kinds)]
+ failures.sort(reverse=True)
+ rows = []
+ for value, index in failures:
+ if not value:
+ continue
+ rows.append((kind_to_text(index, defines, name), value, f"{100*value/total_failures:0.1f}%"))
+ return rows
+
def print_specialization_stats(name, family_stats, defines):
if "specializable" not in family_stats:
return
@@ -39,65 +128,65 @@ def print_specialization_stats(name, family_stats, defines):
if total == 0:
return
with Section(name, 3, f"specialization stats for {name} family"):
- rows = []
- for key in sorted(family_stats):
- if key.startswith("specialization.failure_kinds"):
- continue
- if key in ("specialization.hit", "specialization.miss"):
- label = key[len("specialization."):]
- elif key == "execution_count":
- label = "unquickened"
- elif key in ("specialization.success", "specialization.failure", "specializable"):
- continue
- elif key.startswith("pair"):
- continue
- else:
- label = key
- rows.append((f"{label:>12}", f"{family_stats[key]:>12}", f"{100*family_stats[key]/total:0.1f}%"))
+ rows = calculate_specialization_stats(family_stats, total)
emit_table(("Kind", "Count", "Ratio"), rows)
- print_title("Specialization attempts", 4)
- total_attempts = 0
- for key in ("specialization.success", "specialization.failure"):
- total_attempts += family_stats.get(key, 0)
- rows = []
- if total_attempts:
- for key in ("specialization.success", "specialization.failure"):
- label = key[len("specialization."):]
- label = label[0].upper() + label[1:]
- val = family_stats.get(key, 0)
- rows.append((label, val, f"{100*val/total_attempts:0.1f}%"))
+ rows = calculate_specialization_success_failure(family_stats)
+ if rows:
+ print_title("Specialization attempts", 4)
emit_table(("", "Count:", "Ratio:"), rows)
- total_failures = family_stats.get("specialization.failure", 0)
- failure_kinds = [ 0 ] * 30
- for key in family_stats:
- if not key.startswith("specialization.failure_kind"):
- continue
- _, index = key[:-1].split("[")
- index = int(index)
- failure_kinds[index] = family_stats[key]
- failures = [(value, index) for (index, value) in enumerate(failure_kinds)]
- failures.sort(reverse=True)
- rows = []
- for value, index in failures:
- if not value:
- continue
- rows.append((kind_to_text(index, defines, name), value, f"{100*value/total_failures:0.1f}%"))
- emit_table(("Failure kind", "Count:", "Ratio:"), rows)
-
-def gather_stats():
- stats = collections.Counter()
- for filename in os.listdir(DEFAULT_DIR):
- with open(os.path.join(DEFAULT_DIR, filename)) as fd:
- for line in fd:
- try:
- key, value = line.split(":")
- except ValueError:
- print (f"Unparsable line: '{line.strip()}' in {filename}", file=sys.stderr)
- continue
- key = key.strip()
- value = int(value)
- stats[key] += value
- return stats
+ rows = calculate_specialization_failure_kinds(name, family_stats, defines)
+ emit_table(("Failure kind", "Count:", "Ratio:"), rows)
+
+def print_comparative_specialization_stats(name, base_family_stats, head_family_stats, defines):
+ if "specializable" not in base_family_stats:
+ return
+
+ base_total = sum(base_family_stats.get(kind, 0) for kind in TOTAL)
+ head_total = sum(head_family_stats.get(kind, 0) for kind in TOTAL)
+ if base_total + head_total == 0:
+ return
+ with Section(name, 3, f"specialization stats for {name} family"):
+ base_rows = calculate_specialization_stats(base_family_stats, base_total)
+ head_rows = calculate_specialization_stats(head_family_stats, head_total)
+ emit_table(
+ ("Kind", "Base Count", "Base Ratio", "Head Count", "Head Ratio"),
+ join_rows(base_rows, head_rows)
+ )
+ base_rows = calculate_specialization_success_failure(base_family_stats)
+ head_rows = calculate_specialization_success_failure(head_family_stats)
+ rows = join_rows(base_rows, head_rows)
+ if rows:
+ print_title("Specialization attempts", 4)
+ emit_table(("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), rows)
+ base_rows = calculate_specialization_failure_kinds(name, base_family_stats, defines)
+ head_rows = calculate_specialization_failure_kinds(name, head_family_stats, defines)
+ emit_table(
+ ("Failure kind", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"),
+ join_rows(base_rows, head_rows)
+ )
+
+def gather_stats(input):
+ # Note the output of this function must be JSON-serializable
+
+ if os.path.isfile(input):
+ with open(input, "r") as fd:
+ return json.load(fd)
+ elif os.path.isdir(input):
+ stats = collections.Counter()
+ for filename in os.listdir(input):
+ with open(os.path.join(input, filename)) as fd:
+ for line in fd:
+ try:
+ key, value = line.split(":")
+ except ValueError:
+ print(f"Unparsable line: '{line.strip()}' in {filename}", file=sys.stderr)
+ continue
+ key = key.strip()
+ value = int(value)
+ stats[key] += value
+ return stats
+ else:
+ raise ValueError(f"{input:r} is not a file or directory path")
def extract_opcode_stats(stats):
opcode_stats = [ {} for _ in range(256) ]
@@ -213,50 +302,98 @@ def emit_table(header, rows):
print("|", " | ".join(to_str(i) for i in row), "|")
print()
+def calculate_execution_counts(opcode_stats, total):
+ counts = []
+ for i, opcode_stat in enumerate(opcode_stats):
+ if "execution_count" in opcode_stat:
+ count = opcode_stat['execution_count']
+ miss = 0
+ if "specializable" not in opcode_stat:
+ miss = opcode_stat.get("specialization.miss")
+ counts.append((count, opname[i], miss))
+ counts.sort(reverse=True)
+ cumulative = 0
+ rows = []
+ for (count, name, miss) in counts:
+ cumulative += count
+ if miss:
+ miss = f"{100*miss/count:0.1f}%"
+ else:
+ miss = ""
+ rows.append((name, count, f"{100*count/total:0.1f}%",
+ f"{100*cumulative/total:0.1f}%", miss))
+ return rows
+
def emit_execution_counts(opcode_stats, total):
with Section("Execution counts", summary="execution counts for all instructions"):
- counts = []
- for i, opcode_stat in enumerate(opcode_stats):
- if "execution_count" in opcode_stat:
- count = opcode_stat['execution_count']
- miss = 0
- if "specializable" not in opcode_stat:
- miss = opcode_stat.get("specialization.miss")
- counts.append((count, opname[i], miss))
- counts.sort(reverse=True)
- cumulative = 0
- rows = []
- for (count, name, miss) in counts:
- cumulative += count
- if miss:
- miss = f"{100*miss/count:0.1f}%"
- else:
- miss = ""
- rows.append((name, count, f"{100*count/total:0.1f}%",
- f"{100*cumulative/total:0.1f}%", miss))
+ rows = calculate_execution_counts(opcode_stats, total)
emit_table(
("Name", "Count:", "Self:", "Cumulative:", "Miss ratio:"),
rows
)
+def emit_comparative_execution_counts(
+ base_opcode_stats, base_total, head_opcode_stats, head_total
+):
+ with Section("Execution counts", summary="execution counts for all instructions"):
+ base_rows = calculate_execution_counts(base_opcode_stats, base_total)
+ head_rows = calculate_execution_counts(head_opcode_stats, head_total)
+ base_data = dict((x[0], x[1:]) for x in base_rows)
+ head_data = dict((x[0], x[1:]) for x in head_rows)
+ opcodes = set(base_data.keys()) | set(head_data.keys())
-def emit_specialization_stats(opcode_stats):
+ rows = []
+ default = [0, "0.0%", "0.0%", 0]
+ for opcode in opcodes:
+ base_entry = base_data.get(opcode, default)
+ head_entry = head_data.get(opcode, default)
+ if base_entry[0] == 0:
+ change = 1
+ else:
+ change = (head_entry[0] - base_entry[0]) / base_entry[0]
+ rows.append(
+ (opcode, base_entry[0], head_entry[0],
+ f"{100*change:0.1f}%"))
+
+ rows.sort(key=lambda x: -abs(float(x[-1][:-1])))
+
+ emit_table(
+ ("Name", "Base Count:", "Head Count:", "Change:"),
+ rows
+ )
+
+def get_defines():
spec_path = os.path.join(os.path.dirname(__file__), "../../Python/specialize.c")
with open(spec_path) as spec_src:
defines = parse_kinds(spec_src)
+ return defines
+
+def emit_specialization_stats(opcode_stats):
+ defines = get_defines()
with Section("Specialization stats", summary="specialization stats by family"):
for i, opcode_stat in enumerate(opcode_stats):
name = opname[i]
print_specialization_stats(name, opcode_stat, defines)
-def emit_specialization_overview(opcode_stats, total):
+def emit_comparative_specialization_stats(base_opcode_stats, head_opcode_stats):
+ defines = get_defines()
+ with Section("Specialization stats", summary="specialization stats by family"):
+ for i, (base_opcode_stat, head_opcode_stat) in enumerate(zip(base_opcode_stats, head_opcode_stats)):
+ name = opname[i]
+ print_comparative_specialization_stats(name, base_opcode_stat, head_opcode_stat, defines)
+
+def calculate_specialization_effectiveness(opcode_stats, total):
basic, not_specialized, specialized = categorized_counts(opcode_stats)
+ return [
+ ("Basic", basic, f"{basic*100/total:0.1f}%"),
+ ("Not specialized", not_specialized, f"{not_specialized*100/total:0.1f}%"),
+ ("Specialized", specialized, f"{specialized*100/total:0.1f}%"),
+ ]
+
+def emit_specialization_overview(opcode_stats, total):
with Section("Specialization effectiveness"):
- emit_table(("Instructions", "Count:", "Ratio:"), (
- ("Basic", basic, f"{basic*100/total:0.1f}%"),
- ("Not specialized", not_specialized, f"{not_specialized*100/total:0.1f}%"),
- ("Specialized", specialized, f"{specialized*100/total:0.1f}%"),
- ))
+ rows = calculate_specialization_effectiveness(opcode_stats, total)
+ emit_table(("Instructions", "Count:", "Ratio:"), rows)
for title, field in (("Deferred", "specialization.deferred"), ("Misses", "specialization.miss")):
total = 0
counts = []
@@ -270,53 +407,91 @@ def emit_specialization_overview(opcode_stats, total):
rows = [ (name, count, f"{100*count/total:0.1f}%") for (count, name) in counts[:10] ]
emit_table(("Name", "Count:", "Ratio:"), rows)
-def emit_call_stats(stats):
+def emit_comparative_specialization_overview(base_opcode_stats, base_total, head_opcode_stats, head_total):
+ with Section("Specialization effectiveness"):
+ base_rows = calculate_specialization_effectiveness(base_opcode_stats, base_total)
+ head_rows = calculate_specialization_effectiveness(head_opcode_stats, head_total)
+ emit_table(
+ ("Instructions", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"),
+ join_rows(base_rows, head_rows)
+ )
+
+def get_stats_defines():
stats_path = os.path.join(os.path.dirname(__file__), "../../Include/pystats.h")
with open(stats_path) as stats_src:
defines = parse_kinds(stats_src, prefix="EVAL_CALL")
+ return defines
+
+def calculate_call_stats(stats):
+ defines = get_stats_defines()
+ total = 0
+ for key, value in stats.items():
+ if "Calls to" in key:
+ total += value
+ rows = []
+ for key, value in stats.items():
+ if "Calls to" in key:
+ rows.append((key, value, f"{100*value/total:0.1f}%"))
+ elif key.startswith("Calls "):
+ name, index = key[:-1].split("[")
+ index = int(index)
+ label = name + " (" + pretty(defines[index][0]) + ")"
+ rows.append((label, value, f"{100*value/total:0.1f}%"))
+ for key, value in stats.items():
+ if key.startswith("Frame"):
+ rows.append((key, value, f"{100*value/total:0.1f}%"))
+ return rows
+
+def emit_call_stats(stats):
with Section("Call stats", summary="Inlined calls and frame stats"):
- total = 0
- for key, value in stats.items():
- if "Calls to" in key:
- total += value
- rows = []
- for key, value in stats.items():
- if "Calls to" in key:
- rows.append((key, value, f"{100*value/total:0.1f}%"))
- elif key.startswith("Calls "):
- name, index = key[:-1].split("[")
- index = int(index)
- label = name + " (" + pretty(defines[index][0]) + ")"
- rows.append((label, value, f"{100*value/total:0.1f}%"))
- for key, value in stats.items():
- if key.startswith("Frame"):
- rows.append((key, value, f"{100*value/total:0.1f}%"))
+ rows = calculate_call_stats(stats)
emit_table(("", "Count:", "Ratio:"), rows)
+def emit_comparative_call_stats(base_stats, head_stats):
+ with Section("Call stats", summary="Inlined calls and frame stats"):
+ base_rows = calculate_call_stats(base_stats)
+ head_rows = calculate_call_stats(head_stats)
+ rows = join_rows(base_rows, head_rows)
+ rows.sort(key=lambda x: -float(x[-1][:-1]))
+ emit_table(
+ ("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"),
+ rows
+ )
+
+def calculate_object_stats(stats):
+ total_materializations = stats.get("Object new values")
+ total_allocations = stats.get("Object allocations") + stats.get("Object allocations from freelist")
+ total_increfs = stats.get("Object interpreter increfs") + stats.get("Object increfs")
+ total_decrefs = stats.get("Object interpreter decrefs") + stats.get("Object decrefs")
+ rows = []
+ for key, value in stats.items():
+ if key.startswith("Object"):
+ if "materialize" in key:
+ ratio = f"{100*value/total_materializations:0.1f}%"
+ elif "allocations" in key:
+ ratio = f"{100*value/total_allocations:0.1f}%"
+ elif "increfs" in key:
+ ratio = f"{100*value/total_increfs:0.1f}%"
+ elif "decrefs" in key:
+ ratio = f"{100*value/total_decrefs:0.1f}%"
+ else:
+ ratio = ""
+ label = key[6:].strip()
+ label = label[0].upper() + label[1:]
+ rows.append((label, value, ratio))
+ return rows
+
def emit_object_stats(stats):
with Section("Object stats", summary="allocations, frees and dict materializatons"):
- total_materializations = stats.get("Object new values")
- total_allocations = stats.get("Object allocations") + stats.get("Object allocations from freelist")
- total_increfs = stats.get("Object interpreter increfs") + stats.get("Object increfs")
- total_decrefs = stats.get("Object interpreter decrefs") + stats.get("Object decrefs")
- rows = []
- for key, value in stats.items():
- if key.startswith("Object"):
- if "materialize" in key:
- ratio = f"{100*value/total_materializations:0.1f}%"
- elif "allocations" in key:
- ratio = f"{100*value/total_allocations:0.1f}%"
- elif "increfs" in key:
- ratio = f"{100*value/total_increfs:0.1f}%"
- elif "decrefs" in key:
- ratio = f"{100*value/total_decrefs:0.1f}%"
- else:
- ratio = ""
- label = key[6:].strip()
- label = label[0].upper() + label[1:]
- rows.append((label, value, ratio))
+ rows = calculate_object_stats(stats)
emit_table(("", "Count:", "Ratio:"), rows)
+def emit_comparative_object_stats(base_stats, head_stats):
+ with Section("Object stats", summary="allocations, frees and dict materializatons"):
+ base_rows = calculate_object_stats(base_stats)
+ head_rows = calculate_object_stats(head_stats)
+ emit_table(("", "Base Count:", "Base Ratio:", "Head Count:", "Head Ratio:"), join_rows(base_rows, head_rows))
+
def get_total(opcode_stats):
total = 0
for opcode_stat in opcode_stats:
@@ -377,8 +552,7 @@ def emit_pair_counts(opcode_stats, total):
succ_rows
)
-def main():
- stats = gather_stats()
+def output_single_stats(stats):
opcode_stats = extract_opcode_stats(stats)
total = get_total(opcode_stats)
emit_execution_counts(opcode_stats, total)
@@ -387,8 +561,76 @@ def main():
emit_specialization_overview(opcode_stats, total)
emit_call_stats(stats)
emit_object_stats(stats)
+
+def output_comparative_stats(base_stats, head_stats):
+ base_opcode_stats = extract_opcode_stats(base_stats)
+ base_total = get_total(base_opcode_stats)
+
+ head_opcode_stats = extract_opcode_stats(head_stats)
+ head_total = get_total(head_opcode_stats)
+
+ emit_comparative_execution_counts(
+ base_opcode_stats, base_total, head_opcode_stats, head_total
+ )
+ emit_comparative_specialization_stats(
+ base_opcode_stats, head_opcode_stats
+ )
+ emit_comparative_specialization_overview(
+ base_opcode_stats, base_total, head_opcode_stats, head_total
+ )
+ emit_comparative_call_stats(base_stats, head_stats)
+ emit_comparative_object_stats(base_stats, head_stats)
+
+def output_stats(inputs, json_output=None):
+ if len(inputs) == 1:
+ stats = gather_stats(inputs[0])
+ if json_output is not None:
+ json.dump(stats, json_output)
+ output_single_stats(stats)
+ elif len(inputs) == 2:
+ if json_output is not None:
+ raise ValueError(
+ "Can not output to JSON when there are multiple inputs"
+ )
+
+ base_stats = gather_stats(inputs[0])
+ head_stats = gather_stats(inputs[1])
+ output_comparative_stats(base_stats, head_stats)
+
print("---")
print("Stats gathered on:", date.today())
+def main():
+ parser = argparse.ArgumentParser(description="Summarize pystats results")
+
+ parser.add_argument(
+ "inputs",
+ nargs="*",
+ type=str,
+ default=[DEFAULT_DIR],
+ help=f"""
+ Input source(s).
+ For each entry, if a .json file, the output provided by --json-output from a previous run;
+ if a directory, a directory containing raw pystats .txt files.
+ If one source is provided, its stats are printed.
+ If two sources are provided, comparative stats are printed.
+ Default is {DEFAULT_DIR}.
+ """
+ )
+
+ parser.add_argument(
+ "--json-output",
+ nargs="?",
+ type=argparse.FileType("w"),
+ help="Output complete raw results to the given JSON file."
+ )
+
+ args = parser.parse_args()
+
+ if len(args.inputs) > 2:
+ raise ValueError("0-2 arguments may be provided.")
+
+ output_stats(args.inputs, json_output=args.json_output)
+
if __name__ == "__main__":
main()