[lttng-ci.git] / scripts / babeltrace-benchmark / benchmark.py

#!/usr/bin/python3
# Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import json
import os
import tempfile
from statistics import mean
import argparse
import sys
from operator import add

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.ticker import PercentFormatter

import git
import numpy
import lava_submit

from minio import Minio
from minio.error import NoSuchKey
from minio.error import ResponseError


# Benchmark flavours; each one has its own result prefix on the object store.
BENCHMARK_TYPES = [
    "dummy",
    "text",
]

# Bucket holding every object this script reads or deletes.
DEFAULT_BUCKET = "lava"

# Commits for which no job is ever launched.
invalid_commits = {
    "ec9a9794af488a9accce7708a8b0d8188b498789",  # Does not build
    "8c99128c640cbce71fb8a6caa15e4c672252b662",  # Block on configure
    "f3847c753f1b4f12353c38d97b0577d9993d19fb",  # Does not build
    "e0111295f17ddfcc33ec771a8deac505473a06ad",  # Does not build
    "d0d4e0ed487ea23aaf0d023513c0a4d86901b79b",  # Does not build
    "c24f7ab4dd9edeb5e50b0070fd9d9e8691057dde",  # Does not build
    "ce67f5614a4db3b2de4d887eca52135b439b4937",  # Does not build
    "80aff5efc66679fd934cef433c0e698694748385",  # Does not build
    "f4f11e84942d36fcc8a597d226928bce2ccac4b3",  # Does not build
}

def json_type(string):
    """
    Argparse "type" callable for JSON arguments.

    Decode string as JSON and return the result when its top-level value is
    a dictionary; raise argparse.ArgumentTypeError otherwise.
    """
    decoded = json.loads(string)
    if isinstance(decoded, dict):
        return decoded
    raise argparse.ArgumentTypeError("%r is not a dict" % string)

def graph_get_color(branch):
    """
    Get the color matching the branch.

    The three known branches keep their fixed colors. Any other branch name
    (for instance one supplied through --overwrite-branches-cutoff) is mapped
    to one of matplotlib's "C0".."C9" property-cycle colors instead of
    raising KeyError. The choice is derived from the bytes of the name so a
    given branch always gets the same color.
    """
    color = {"stable-1.5": "red", "stable-2.0": "green", "master": "blue"}
    if branch in color:
        return color[branch]
    # The built-in hash() of a str is randomized per process; summing the
    # encoded bytes keeps the color stable from one run to the next.
    return "C{}".format(sum(branch.encode("utf-8")) % 10)


def graph_get_title(branch, benchmark_type):
    """
    Build the graph title: "<branch> - <benchmark type description>".

    Raise KeyError when benchmark_type is neither "dummy" nor "text".
    """
    descriptions = {"dummy": "Dummy output", "text": "Text output"}
    description = descriptions[benchmark_type]
    return "{} - {}".format(branch, description)


def get_client():
    """
    Build the minio client used to reach the object store.

    NOTE(review): the endpoint and the credentials are hard-coded below.
    """
    endpoint = "obj.internal.efficios.com"
    credentials = {"access_key": "jenkins", "secret_key": "echo123456"}
    return Minio(endpoint, **credentials)


def get_file(client, prefix, file_name, workdir_name):
    """
    Download "<prefix>/<file_name>" from DEFAULT_BUCKET into workdir_name.

    Return the local path of the downloaded file, or None when the client
    reports that the object does not exist (NoSuchKey).
    """
    local_path = os.path.join(workdir_name, file_name)
    remote_name = "{}/{}".format(prefix, file_name)
    try:
        client.fget_object(DEFAULT_BUCKET, remote_name, local_path)
    except NoSuchKey:
        local_path = None
    return local_path


def delete_file(client, prefix, file_name):
    """
    Remove "<prefix>/<file_name>" from DEFAULT_BUCKET on the remote.

    A ResponseError is printed and otherwise ignored; a missing object
    (NoSuchKey) is silently ignored.
    """
    remote_name = "{}/{}".format(prefix, file_name)
    try:
        client.remove_object(DEFAULT_BUCKET, remote_name)
    except ResponseError as error:
        # Best effort: report the failure and carry on.
        print(error)
    except NoSuchKey:
        # Nothing to delete.
        return


def get_git_log(bt_version, cutoff, repo_path):
    """
    Return an ordered (older to newer) list of commit hashes for bt_version.

    The list covers the range "<cutoff>..origin/<bt_version>" of the
    repository located at repo_path. An empty list is returned when that
    range contains no commit.

    NOTE: this runs "git fetch" in the repository before reading the log.
    """
    repo = git.Repo(repo_path)
    repo.git.fetch()
    log_output = repo.git.log(
        "{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
    )
    # str.split("\n") turns an empty log into [""], which hands callers a
    # bogus empty commit id; splitlines() yields [] in that case.
    return log_output.splitlines()


def parse_result(result_path):
    """
    Load the JSON result file at result_path and return a list holding the
    element-wise sum of its "User time (seconds)" and
    "System time (seconds)" series.
    """
    with open(result_path) as result_file:
        content = json.load(result_file)
    user_times = content["User time (seconds)"]
    system_times = content["System time (seconds)"]
    return [user + system for user, system in zip(user_times, system_times)]


def get_benchmark_results(client, commit, workdir):
    """
    Fetch the benchmark results of a commit across all benchmark types.

    Return a (results, valid) tuple:
      - results maps each entry of BENCHMARK_TYPES to its parsed dataset, or
        is None as soon as one result file cannot be fetched;
      - valid is False when at least one fetched dataset holds only zeros
        (or is empty), True otherwise.
    """
    results = {}
    benchmark_valid = True
    for b_type in BENCHMARK_TYPES:
        prefix = "/results/benchmarks/babeltrace/{}".format(b_type)
        result_file = get_file(client, prefix, commit, workdir)
        if not result_file:
            # Benchmark is either corrupted or not complete.
            return None, benchmark_valid
        results[b_type] = parse_result(result_file)
        if all(i == 0.0 for i in results[b_type]):
            benchmark_valid = False
            # prefix already ends with the benchmark type, so this prints the
            # path of the object that was actually fetched.
            print("Invalid benchmark for {}/{}".format(prefix, commit))
    return results, benchmark_valid


def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using the raw values.

    x_data holds one x position per commit and y_data the matching list of
    samples. Each sample is drawn at its commit's x position: samples kept
    by sanitize_dataset use the branch color, the ones it rejects are drawn
    as black "+" markers. The latest value of every other branch found in
    latest_values is added as a horizontal reference line. labels are used
    as x tick labels.
    """
    point_x_data = []
    point_y_data = []
    outlier_x_data = []
    outlier_y_data = []
    for x, samples in zip(x_data, y_data):
        valid_points, outliers = sanitize_dataset(samples)
        point_x_data.extend([x] * len(valid_points))
        point_y_data.extend(valid_points)
        outlier_x_data.extend([x] * len(outliers))
        outlier_y_data.extend(outliers)

    plt.plot(
        point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
    )
    plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")

    # The y axis always starts at 0; only the upper limit is data dependent.
    ymax = 1
    if y_data:
        ymax = 1.2 * max(item for sublist in y_data for item in sublist)

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        plt.axhline(
            y=l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )
        # Make sure the reference line stays inside the visible range.
        if l_result >= ymax:
            ymax = 1.2 * l_result

    ax = plt.gca()
    plt.ylim(ymin=0, ymax=ymax)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("User + system time (s)")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()


def plot_delta_between_point(
    branch, benchmark_type, x_data, y_data, labels, latest_values
):
    """
    Plot, for each commit, the difference with the previous commit's value.

    The first commit has no predecessor and is plotted at 0. latest_values
    is part of the signature but is not used by this function.
    """
    y_limit = 100

    # Transform y_data into a list of deltas, each value being compared to
    # the one right before it.
    deltas = [
        0.0 if pos == 0 else value - y_data[pos - 1]
        for pos, value in enumerate(y_data)
    ]

    plt.plot(x_data, deltas, "o", label=branch, color=graph_get_color(branch))

    # Use the largest absolute delta so the y axis is centered on zero.
    if deltas:
        y_limit = abs(max(deltas, key=abs)) * 1.3
    plt.ylim(ymin=y_limit * -1, ymax=y_limit)

    plt.xticks(x_data, labels, rotation=90, family="monospace")
    full_title = graph_get_title(branch, benchmark_type) + " Delta to previous commit"
    plt.title(full_title, fontweight="bold")
    plt.ylabel("Seconds")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Put tick on the right side
    axes = plt.gca()
    axes.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
    return


def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using a ratio using first point as reference (0%).

    Every value of y_data is expressed as a percentage of difference with
    y_data[0]. The latest value of every other branch found in latest_values
    is drawn as a horizontal line using the same reference. The y axis is
    symmetric around 0 and spans at least -100%..100%; it is widened when a
    plotted point would otherwise fall outside of that window.

    NOTE(review): a reference value of exactly 0 raises ZeroDivisionError.
    """
    reference = 0.01
    y_abs_max = 100

    if y_data:
        reference = y_data[0]

    # Transform y_data to a list of ratio for which the reference is the first
    # element.
    local_y_data = [((y / reference) - 1.0) * 100 for y in y_data]

    plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        ratio_l_result = ((l_result / reference) - 1.0) * 100.0
        print(
            "branch {} branch {} value {} l_result {} reference {}".format(
                branch, l_branch, ratio_l_result, l_result, reference
            )
        )
        plt.axhline(
            y=ratio_l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )

    # Draw the reference line.
    plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")

    # Get max absolute value to align the y axis with zero in the middle.
    if local_y_data:
        local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
        # Only widen the default window, never shrink it. The previous test
        # compared y_abs_max with its own initial value and was never true.
        if local_abs_max > y_abs_max:
            y_abs_max = local_abs_max

    plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)

    ax = plt.gca()
    percent_formatter = PercentFormatter()
    ax.yaxis.set_major_formatter(percent_formatter)
    ax.yaxis.set_minor_formatter(percent_formatter)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("Ratio")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()


def generate_graph(branches, report_name, git_path):
    """
    Generate the PDF report named report_name.

    branches maps a branch name to its cutoff commit and git_path is the
    repository used to list the commits of each branch. Commits without a
    complete and valid set of results are left out. For every benchmark type
    and every branch, three pages are written: raw values, ratio to the
    first commit, and delta between consecutive commits.
    """
    client = get_client()
    branch_results = dict()

    # Fetch the results for each branch.
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)
        results = []
        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                b_results, valid = get_benchmark_results(client, commit, workdir)
                if not b_results or not valid:
                    continue
                results.append((commit, b_results))
        branch_results[branch] = results

    # The longest series drives the page width. It depends neither on the
    # benchmark type nor on the branch, so compute it only once.
    max_len = max((len(results) for results in branch_results.values()), default=0)
    width = 0.16 * max_len if max_len > 10 else 11.69
    figsize = (width, 8.27)

    # The context manager closes the PDF document even when plotting fails.
    with PdfPages(report_name) as pdf_pages:
        for b_type in BENCHMARK_TYPES:
            # Gather the comparison value used to draw the comparison line
            # between branches.
            latest_values = {}
            for branch, results in branch_results.items():
                if results:
                    latest_values[branch] = mean(
                        sanitize_dataset(results[-1][1][b_type])[0]
                    )
                else:
                    latest_values[branch] = None

            for branch, results in branch_results.items():
                x_data = list(range(len(results)))
                labels = [c[0][:8] for c in results]
                raw_y_data = [c[1][b_type] for c in results]
                # Use the mean of each sanitized dataset for the ratio and
                # delta graphs; variance is not represented there.
                mean_y_data = [
                    mean(sanitize_dataset(c[1][b_type])[0]) for c in results
                ]

                pages = (
                    (plot_raw_value, raw_y_data),
                    (plot_ratio, mean_y_data),
                    (plot_delta_between_point, mean_y_data),
                )
                for plot_function, y_data in pages:
                    fig = plt.figure(figsize=figsize, dpi=100)
                    plot_function(
                        branch, b_type, x_data, y_data, labels, latest_values
                    )
                    pdf_pages.savefig(fig)
                    # Release the figure once saved so figures do not pile
                    # up in memory for the whole run.
                    plt.close(fig)


def launch_jobs(branches, git_path, wait_for_completion, debug, force):
    """
    Launch jobs for all missing results.

    For every branch of branches (branch name -> cutoff commit), each commit
    that is not listed in invalid_commits gets a job submitted through
    lava_submit when no result set is found for it, or unconditionally when
    force is set. wait_for_completion and debug are passed as-is to
    lava_submit.submit.
    """
    client = get_client()
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)

        with tempfile.TemporaryDirectory() as workdir:
            candidates = (c for c in commits if c not in invalid_commits)
            for commit in candidates:
                existing, _valid = get_benchmark_results(client, commit, workdir)
                if force or not existing:
                    lava_submit.submit(
                        commit, wait_for_completion=wait_for_completion, debug=debug
                    )


def main(argv=None):
    """
    Parse arguments and execute as needed.

    argv is the list of command line arguments to parse; when None (the
    default) argparse reads them from sys.argv.

    Return 0 on success and 1 when the repository path does not exist.
    """
    bt_branches = {
        "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
        "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
        "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
    }

    parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
    parser.add_argument(
        "--generate-jobs", action="store_true", help="Generate and send jobs"
    )
    parser.add_argument(
        "--force-jobs", action="store_true", help="Force the queueing of jobs to lava"
    )
    parser.add_argument(
        "--do-not-wait-on-completion",
        action="store_true",
        default=False,
        help="Do not wait for the completion of each job sent. Waiting (the "
        "default) is useful for the ci. Otherwise we could end up spamming "
        "the lava instance.",
    )
    parser.add_argument(
        "--generate-report",
        action="store_true",
        help="Generate graphs and save them to pdf",
    )
    parser.add_argument(
        "--report-name", default="report.pdf", help="The name of the pdf report."
    )
    parser.add_argument(
        "--debug", action="store_true", default=False, help="Do not send jobs to lava."
    )
    parser.add_argument(
        "--repo-path", help="The location of the git repo to use.", required=True
    )
    parser.add_argument(
        "--overwrite-branches-cutoff",
        # The value is decoded with json.loads, hence the double quotes.
        help='A JSON dictionary of the form {"branch_name": '
        '"commit_hash_cutoff", ...}. Allow custom graphing and jobs '
        "generation.",
        required=False,
        type=json_type,
    )

    args = parser.parse_args(argv)

    if args.overwrite_branches_cutoff:
        bt_branches = args.overwrite_branches_cutoff

    if not os.path.exists(args.repo_path):
        print("Repository location does not exist.")
        return 1

    if args.generate_jobs:
        print("Launching jobs for:")

        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))

        launch_jobs(
            bt_branches,
            args.repo_path,
            # The command line flag is negative, launch_jobs expects the
            # positive form.
            not args.do_not_wait_on_completion,
            args.debug,
            args.force_jobs,
        )

    if args.generate_report:
        print("Generating pdf report ({}) for:".format(args.report_name))
        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))
        generate_graph(bt_branches, args.report_name, args.repo_path)

    return 0


def sanitize_dataset(dataset):
    """
    Split dataset into (kept values, outliers) using the 1.5 * IQR rule [1].

    A value is kept when it lies within [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR],
    bounds included; every other value is reported as an outlier. Both
    returned lists preserve the order of dataset. This is useful to get a
    representative mean without outliers in it.
    [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
    """
    first_quartile, third_quartile = numpy.percentile(sorted(dataset), [25, 75])
    margin = 1.5 * (third_quartile - first_quartile)
    low = first_quartile - margin
    high = third_quartile + margin

    kept = []
    rejected = []
    for value in dataset:
        bucket = kept if low <= value <= high else rejected
        bucket.append(value)
    return kept, rejected


if __name__ == "__main__":
    # Use main()'s return value as the process exit status.
    raise SystemExit(main())
Commit	Line	Data
5c65bbc2 JR	1	#!/usr/bin/python3
	2	# Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
	3	#
	4	# This program is free software: you can redistribute it and/or modify
	5	# it under the terms of the GNU General Public License as published by
	6	# the Free Software Foundation, either version 3 of the License, or
	7	# (at your option) any later version.
	8	#
	9	# This program is distributed in the hope that it will be useful,
	10	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	11	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	12	# GNU General Public License for more details.
	13	#
	14	# You should have received a copy of the GNU General Public License
	15	# along with this program. If not, see <http://www.gnu.org/licenses/>.
	16
	17	import json
	18	import os
	19	import tempfile
	20	from statistics import mean
	21	import argparse
	22	import sys
	23	from operator import add
	24
	25	import matplotlib.pyplot as plt
	26	from matplotlib.backends.backend_pdf import PdfPages
	27	from matplotlib.ticker import PercentFormatter
	28
	29	import git
	30	import numpy
	31	import lava_submit
	32
	33	from minio import Minio
	34	from minio.error import NoSuchKey
	35	from minio.error import ResponseError
	36
	37
	38	BENCHMARK_TYPES = ["dummy", "text"]
	39	DEFAULT_BUCKET = "lava"
	40
e085717c JR	41	invalid_commits = {
	42	"ec9a9794af488a9accce7708a8b0d8188b498789", # Does not build
	43	"8c99128c640cbce71fb8a6caa15e4c672252b662", # Block on configure
	44	"f3847c753f1b4f12353c38d97b0577d9993d19fb", # Does not build
	45	"e0111295f17ddfcc33ec771a8deac505473a06ad", # Does not build
42fc7d5c JR	46	"d0d4e0ed487ea23aaf0d023513c0a4d86901b79b", # Does not build
	47	"c24f7ab4dd9edeb5e50b0070fd9d9e8691057dde", # Does not build
	48	"ce67f5614a4db3b2de4d887eca52135b439b4937", # Does not build
	49	"80aff5efc66679fd934cef433c0e698694748385", # Does not build
	50	"f4f11e84942d36fcc8a597d226928bce2ccac4b3", # Does not build
e085717c	51	}
5c65bbc2	52
cf595cda JR	53	def json_type(string):
	54	"""
	55	Argpase type for json args.
	56	We expect a base dictionary.
	57	"""
	58	passed_json = json.loads(string)
	59	if not isinstance(passed_json, dict):
	60	msg = "%r is not a dict" % string
	61	raise argparse.ArgumentTypeError(msg)
	62	return passed_json
	63
5c65bbc2 JR	64	def graph_get_color(branch):
	65	"""
	66	Get the color matching the branch.
	67	"""
	68	color = {"stable-1.5": "red", "stable-2.0": "green", "master": "blue"}
	69	return color[branch]
	70
	71
	72	def graph_get_title(branch, benchmark_type):
	73	"""
	74	Get title for graph based on benchmark type.
	75	"""
	76	string = {"dummy": "Dummy output", "text": "Text output"}
	77	return "{} - {}".format(branch, string[benchmark_type])
	78
	79
	80	def get_client():
	81	"""
	82	Return minio client configured.
	83	"""
	84	return Minio(
	85	"obj.internal.efficios.com", access_key="jenkins", secret_key="echo123456"
	86	)
	87
	88
	89	def get_file(client, prefix, file_name, workdir_name):
	90	"""
	91	Return the path of the downloaded file.
	92	Return None on error
	93	"""
	94	destination = os.path.join(workdir_name, file_name)
	95	object_name = "{}/{}".format(prefix, file_name)
	96	try:
	97	client.fget_object(DEFAULT_BUCKET, object_name, destination)
	98	except NoSuchKey:
	99	return None
	100
	101	return destination
	102
	103
	104	def delete_file(client, prefix, file_name):
	105	"""
	106	Delete the file on remote.
	107	"""
	108	object_name = "{}/{}".format(prefix, file_name)
	109	try:
	110	client.remove_object(DEFAULT_BUCKET, object_name)
	111	except ResponseError as err:
	112	print(err)
	113	except NoSuchKey:
	114	pass
	115
	116
	117	def get_git_log(bt_version, cutoff, repo_path):
	118	"""
	119	Return an ordered (older to newer) list of commits for the bt_version and
	120	cutoff. WARNING: This changes the git repo HEAD.
	121	"""
	122	repo = git.Repo(repo_path)
	123	repo.git.fetch()
	124	return repo.git.log(
	125	"{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
	126	).split("\n")
	127
128
129	def parse_result(result_path):
130	"""
131	Parse the result file. Return a dataset of User time + System time.
132	"""
133	with open(result_path) as result:
134	parsed_result = json.load(result)
135	return list(
136	map(
137	add,
138	parsed_result["User time (seconds)"],
139	parsed_result["System time (seconds)"],
140	)
141	)
142
143
144	def get_benchmark_results(client, commit, workdir):
145	"""
146	Fetch the benchmark result from a certain commit across all benchmark type.
147	"""
148	results = {}
149	benchmark_valid = True
150	for b_type in BENCHMARK_TYPES:
cdace203	151	prefix = "/results/benchmarks/babeltrace/{}".format(b_type)
5c65bbc2 JR	152	result_file = get_file(client, prefix, commit, workdir)
	153	if not result_file:
	154	"""
	155	Benchmark is either corrupted or not complete.
	156	"""
	157	return None, benchmark_valid
	158	results[b_type] = parse_result(result_file)
	159	if all(i == 0.0 for i in results[b_type]):
	160	benchmark_valid = False
	161	print("Invalid benchmark for {}/{}/{}".format(prefix, b_type, commit))
	162	# The dataset is valid return immediately.
	163	return results, benchmark_valid
	164
	165
	166	def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
	167	"""
	168	Plot the graph using the raw value.
	169	"""
	170	point_x_data = []
	171	outlier_x_data = []
	172	point_y_data = []
	173	outlier_y_data = []
	174	for pos in range(len(x_data)):
	175	x = x_data[pos]
	176	valid_points, outliers = sanitize_dataset(y_data[pos])
	177	for y in valid_points:
	178	point_x_data.append(x)
	179	point_y_data.append(y)
	180	for y in outliers:
	181	outlier_x_data.append(x)
	182	outlier_y_data.append(y)
	183
	184	plt.plot(
	185	point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
	186	)
	187	plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")
	188
5c65bbc2 JR	189	ymax = 1
	190	if y_data:
	191	ymin = 0.8 * min([item for sublist in y_data for item in sublist])
	192	ymax = 1.2 * max([item for sublist in y_data for item in sublist])
	193	# Put latest of other branches for reference as horizontal line.
	194	for l_branch, l_result in latest_values.items():
	195	if not l_result or l_branch == branch:
	196	continue
	197	plt.axhline(
	198	y=l_result,
	199	label="Latest {}".format(l_branch),
	200	color=graph_get_color(l_branch),
	201	)
5c65bbc2 JR	202	if l_result >= ymax:
5c65bbc2 JR	203	ymax = 1.2 * l_result
056f7519	204	ax = plt.gca()
925d7893	205	plt.ylim(ymin=0, ymax=ymax)
5c65bbc2 JR	206	plt.xticks(x_data, labels, rotation=90, family="monospace")
	207	plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
	208	plt.ylabel("User + system time (s)")
	209	plt.xlabel("Latest commits")
	210	plt.legend()
526aab11	211	plt.grid(True)
5c65bbc2	212
056f7519 JR	213	# Put tick on the right side
	214	ax.tick_params(labeltop=False, labelright=True)
	215
5c65bbc2 JR	216	plt.tight_layout()
	217	return
	218
09de7b53 JR	219
	220	def plot_delta_between_point(
	221	branch, benchmark_type, x_data, y_data, labels, latest_values
	222	):
20defd5e JR	223	"""
	224	Plot the graph of delta between each sequential commit.
	225	"""
	226	local_abs_max = 100
	227
	228	# Transform y_data to a list of for which the reference is the first
	229	# element.
	230	local_y_data = []
	231	for pos, y in enumerate(y_data):
	232	if pos == 0:
	233	local_y_data.append(0.0)
	234	continue
	235	local_y_data.append(y - y_data[pos - 1])
	236
	237	plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))
	238
	239	# Get max absolute value to align the y axis with zero in the middle.
	240	if local_y_data:
	241	local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
	242
	243	plt.ylim(ymin=local_abs_max * -1, ymax=local_abs_max)
	244
	245	ax = plt.gca()
	246	plt.xticks(x_data, labels, rotation=90, family="monospace")
09de7b53 JR	247	plt.title(
	248	graph_get_title(branch, benchmark_type) + " Delta to previous commit",
	249	fontweight="bold",
	250	)
20defd5e JR	251	plt.ylabel("Seconds")
	252	plt.xlabel("Latest commits")
	253	plt.legend()
526aab11	254	plt.grid(True)
20defd5e JR	255
	256	# Put tick on the right side
	257	ax.tick_params(labeltop=False, labelright=True)
	258
	259	plt.tight_layout()
	260	return
5c65bbc2	261
09de7b53	262
5c65bbc2 JR	263	def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
	264	"""
	265	Plot the graph using a ratio using first point as reference (0%).
	266	"""
	267	reference = 0.01
	268	y_abs_max = 100
	269
	270	if y_data:
	271	reference = y_data[0]
	272
	273	# Transform y_data to a list of ratio for which the reference is the first
	274	# element.
	275	local_y_data = list(map(lambda y: ((y / reference) - 1.0) * 100, y_data))
	276
	277	plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))
	278
	279	# Put latest of other branches for reference as horizontal line.
	280	for l_branch, l_result in latest_values.items():
	281	if not l_result or l_branch == branch:
	282	continue
	283	ratio_l_result = ((l_result / reference) - 1.0) * 100.0
	284	print(
	285	"branch {} branch {} value {} l_result {} reference {}".format(
	286	branch, l_branch, ratio_l_result, l_result, reference
	287	)
	288	)
	289	plt.axhline(
	290	y=ratio_l_result,
	291	label="Latest {}".format(l_branch),
	292	color=graph_get_color(l_branch),
	293	)
	294
	295	# Draw the reference line.
	296	plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")
	297
	298	# Get max absolute value to align the y axis with zero in the middle.
	299	if local_y_data:
	300	local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
	301	if y_abs_max > 100:
	302	y_abs_max = local_abs_max
	303
	304	plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)
	305
	306	ax = plt.gca()
	307	percent_formatter = PercentFormatter()
	308	ax.yaxis.set_major_formatter(percent_formatter)
	309	ax.yaxis.set_minor_formatter(percent_formatter)
	310	plt.xticks(x_data, labels, rotation=90, family="monospace")
	311	plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
	312	plt.ylabel("Ratio")
	313	plt.xlabel("Latest commits")
	314	plt.legend()
526aab11	315	plt.grid(True)
5c65bbc2	316
056f7519 JR	317	# Put tick on the right side
	318	ax.tick_params(labeltop=False, labelright=True)
	319
5c65bbc2 JR	320	plt.tight_layout()
	321	return
	322
09de7b53	323
5c65bbc2 JR	324	def generate_graph(branches, report_name, git_path):
	325
	326	# The PDF document
	327	pdf_pages = PdfPages(report_name)
	328
	329	client = get_client()
	330	branch_results = dict()
	331
	332	# Fetch the results for each branch.
	333	for branch, cutoff in branches.items():
	334	commits = get_git_log(branch, cutoff, git_path)
	335	results = []
	336	with tempfile.TemporaryDirectory() as workdir:
	337	for commit in commits:
	338	b_results, valid = get_benchmark_results(client, commit, workdir)
	339	if not b_results or not valid:
	340	continue
	341	results.append((commit, b_results))
	342	branch_results[branch] = results
	343
	344	for b_type in BENCHMARK_TYPES:
	345	latest_values = {}
	346	max_len = 0
	347
	348	# Find the maximum size for a series inside our series dataset.
	349	# This is used later to compute the size of the actual plot (pdf).
	350	# While there gather the comparison value used to draw comparison line
	351	# between branches.
	352	for branch, results in branch_results.items():
	353	max_len = max([max_len, len(results)])
	354	if results:
	355	latest_values[branch] = mean(
	356	sanitize_dataset(results[-1][1][b_type])[0]
	357	)
	358	else:
	359	latest_values[branch] = None
	360
	361	for branch, results in branch_results.items():
	362	# Create a figure instance
	363	if max_len and max_len > 10:
	364	width = 0.16 * max_len
	365	else:
	366	width = 11.69
	367
	368	x_data = list(range(len(results)))
	369	y_data = [c[1][b_type] for c in results]
	370	labels = [c[0][:8] for c in results]
	371
	372	fig = plt.figure(figsize=(width, 8.27), dpi=100)
	373	plot_raw_value(branch, b_type, x_data, y_data, labels, latest_values)
	374	pdf_pages.savefig(fig)
	375
5c65bbc2 JR	376	# Use the mean of each sanitize dataset here, we do not care for
	377	# variance for ratio. At least not yet.
	378	y_data = [mean(sanitize_dataset(c[1][b_type])[0]) for c in results]
20defd5e	379	fig = plt.figure(figsize=(width, 8.27), dpi=100)
5c65bbc2 JR	380	plot_ratio(branch, b_type, x_data, y_data, labels, latest_values)
	381	pdf_pages.savefig(fig)
	382
20defd5e	383	fig = plt.figure(figsize=(width, 8.27), dpi=100)
09de7b53 JR	384	plot_delta_between_point(
	385	branch, b_type, x_data, y_data, labels, latest_values
	386	)
20defd5e JR	387	pdf_pages.savefig(fig)
20defd5e JR	388
5c65bbc2 JR	389	pdf_pages.close()
	390
	391
d373c66e	392	def launch_jobs(branches, git_path, wait_for_completion, debug, force):
5c65bbc2 JR	393	"""
	394	Lauch jobs for all missing results.
	395	"""
	396	client = get_client()
	397	for branch, cutoff in branches.items():
	398	commits = get_git_log(branch, cutoff, git_path)
	399
	400	with tempfile.TemporaryDirectory() as workdir:
	401	for commit in commits:
e085717c JR	402	if commit in invalid_commits:
e085717c JR	403	continue
5c65bbc2	404	b_results = get_benchmark_results(client, commit, workdir)[0]
d373c66e	405	if b_results and not force:
5c65bbc2 JR	406	continue
	407	lava_submit.submit(
	408	commit, wait_for_completion=wait_for_completion, debug=debug
	409	)
	410
	411
	412	def main():
	413	"""
	414	Parse arguments and execute as needed.
	415	"""
	416	bt_branches = {
	417	"master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
	418	"stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
	419	"stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
	420	}
	421
	422	parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
	423	parser.add_argument(
	424	"--generate-jobs", action="store_true", help="Generate and send jobs"
	425	)
d373c66e JR	426	parser.add_argument(
	427	"--force-jobs", action="store_true", help="Force the queueing of jobs to lava"
	428	)
5c65bbc2 JR	429	parser.add_argument(
	430	"--do-not-wait-on-completion",
	431	action="store_true",
	432	default=False,
	433	help="Wait for the completion of each jobs sent. This is useful"
	434	"for the ci. Otherwise we could end up spaming the lava instance.",
	435	)
	436	parser.add_argument(
	437	"--generate-report",
	438	action="store_true",
	439	help="Generate graphs and save them to pdf",
	440	)
	441	parser.add_argument(
	442	"--report-name", default="report.pdf", help="The name of the pdf report."
	443	)
	444	parser.add_argument(
	445	"--debug", action="store_true", default=False, help="Do not send jobs to lava."
	446	)
	447	parser.add_argument(
	448	"--repo-path", help="The location of the git repo to use.", required=True
	449	)
cf595cda JR	450	parser.add_argument(
	451	"--overwrite-branches-cutoff",
	452	help="A dictionary of the form {"
	453	"'branch_name': 'commit_hash_cutoff',...}. Allow custom graphing and"
	454	"jobs generation.",
	455	required=False, type=json_type
	456	)
5c65bbc2 JR	457
	458	args = parser.parse_args()
	459
cf595cda JR	460	if args.overwrite_branches_cutoff:
	461	bt_branches = args.overwrite_branches_cutoff
	462
5c65bbc2 JR	463	if not os.path.exists(args.repo_path):
	464	print("Repository location does not exists.")
	465	return 1
	466
	467	if args.generate_jobs:
	468	print("Launching jobs for:")
d373c66e	469
5c65bbc2 JR	470	for branch, cutoff in bt_branches.items():
5c65bbc2 JR	471	print("\t Branch {} with cutoff {}".format(branch, cutoff))
d373c66e	472
5c65bbc2	473	launch_jobs(
d373c66e JR	474	bt_branches,
	475	args.repo_path,
	476	not args.do_not_wait_on_completion,
	477	args.debug,
	478	args.force_jobs,
5c65bbc2 JR	479	)
	480
	481	if args.generate_report:
	482	print("Generating pdf report ({}) for:".format(args.report_name))
	483	for branch, cutoff in bt_branches.items():
	484	print("\t Branch {} with cutoff {}".format(branch, cutoff))
	485	generate_graph(bt_branches, args.report_name, args.repo_path)
	486
	487	return 0
	488
	489
	490	def sanitize_dataset(dataset):
	491	"""
	492	Use IRQ 1.5 [1] to remove outlier from the dataset. This is useful to get a
	493	representative mean without outlier in it.
	494	[1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
	495	"""
	496	sorted_data = sorted(dataset)
	497	q1, q3 = numpy.percentile(sorted_data, [25, 75])
	498	iqr = q3 - q1
	499	lower_bound = q1 - (1.5 * iqr)
	500	upper_bound = q3 + (1.5 * iqr)
	501	new_dataset = []
	502	outliers = []
	503	for i in dataset:
	504	if lower_bound <= i <= upper_bound:
	505	new_dataset.append(i)
	506	else:
	507	outliers.append(i)
	508	return new_dataset, outliers
	509
	510
	511	if __name__ == "__main__":
	512	sys.exit(main())