scripts/babeltrace-benchmark/benchmark.py

   1 #!/usr/bin/python3
   2 # Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
   3 #
   4 # This program is free software: you can redistribute it and/or modify
   5 # it under the terms of the GNU General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17 import json
  18 import os
  19 import tempfile
  20 from statistics import mean
  21 import argparse
  22 import sys
  23 from operator import add
  24
  25 import matplotlib.pyplot as plt
  26 from matplotlib.backends.backend_pdf import PdfPages
  27 from matplotlib.ticker import PercentFormatter
  28
  29 import git
  30 import numpy
  31 import lava_submit
  32
  33 from minio import Minio
  34 from minio.error import NoSuchKey
  35 from minio.error import ResponseError
  36
  37
# Benchmark flavours handled by this script. Each entry is used as a path
# component of the remote result prefix and as a key of the per-commit result
# dictionary and of the graph title lookup table.
BENCHMARK_TYPES = ["dummy", "text"]
# Bucket passed to every object-store download/removal call.
DEFAULT_BUCKET = "lava"
  40
  41
  42 def json_type(string):
  43     """
  44     Argpase type for json args.
  45     We expect a base dictionary.
  46     """
  47     passed_json = json.loads(string)
  48     if not isinstance(passed_json, dict):
  49         msg = "%r is not a dict" % string
  50         raise argparse.ArgumentTypeError(msg)
  51     return passed_json
  52
  53 def graph_get_color(branch):
  54     """
  55     Get the color matching the branch.
  56     """
  57     color = {"stable-1.5": "red", "stable-2.0": "green", "master": "blue"}
  58     return color[branch]
  59
  60
  61 def graph_get_title(branch, benchmark_type):
  62     """
  63     Get title for graph based on benchmark type.
  64     """
  65     string = {"dummy": "Dummy output", "text": "Text output"}
  66     return "{} - {}".format(branch, string[benchmark_type])
  67
  68
  69 def get_client():
  70     """
  71     Return minio client configured.
  72     """
  73     return Minio(
  74         "obj.internal.efficios.com", access_key="jenkins", secret_key="echo123456"
  75     )
  76
  77
  78 def get_file(client, prefix, file_name, workdir_name):
  79     """
  80     Return the path of the downloaded file.
  81     Return None on error
  82     """
  83     destination = os.path.join(workdir_name, file_name)
  84     object_name = "{}/{}".format(prefix, file_name)
  85     try:
  86         client.fget_object(DEFAULT_BUCKET, object_name, destination)
  87     except NoSuchKey:
  88         return None
  89
  90     return destination
  91
  92
  93 def delete_file(client, prefix, file_name):
  94     """
  95     Delete the file on remote.
  96     """
  97     object_name = "{}/{}".format(prefix, file_name)
  98     try:
  99         client.remove_object(DEFAULT_BUCKET, object_name)
 100     except ResponseError as err:
 101         print(err)
 102     except NoSuchKey:
 103         pass
 104
 105
 106 def get_git_log(bt_version, cutoff, repo_path):
 107     """
 108     Return an ordered (older to newer) list of commits for the bt_version and
 109     cutoff. WARNING: This changes the git repo HEAD.
 110     """
 111     repo = git.Repo(repo_path)
 112     repo.git.fetch()
 113     return repo.git.log(
 114         "{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
 115     ).split("\n")
 116
 117
 118 def parse_result(result_path):
 119     """
 120     Parse the result file. Return a dataset of User time + System time.
 121     """
 122     with open(result_path) as result:
 123         parsed_result = json.load(result)
 124         return list(
 125             map(
 126                 add,
 127                 parsed_result["User time (seconds)"],
 128                 parsed_result["System time (seconds)"],
 129             )
 130         )
 131
 132
 133 def get_benchmark_results(client, commit, workdir):
 134     """
 135     Fetch the benchmark result from a certain commit across all benchmark type.
 136     """
 137     results = {}
 138     benchmark_valid = True
 139     for b_type in BENCHMARK_TYPES:
 140         prefix = "/results/benchmarks/babeltrace/{}/".format(b_type)
 141         result_file = get_file(client, prefix, commit, workdir)
 142         if not result_file:
 143             """
 144             Benchmark is either corrupted or not complete.
 145             """
 146             return None, benchmark_valid
 147         results[b_type] = parse_result(result_file)
 148         if all(i == 0.0 for i in results[b_type]):
 149             benchmark_valid = False
 150             print("Invalid benchmark for {}/{}/{}".format(prefix, b_type, commit))
 151     # The dataset is valid return immediately.
 152     return results, benchmark_valid
 153
 154
 155 def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
 156     """
 157     Plot the graph using the raw value.
 158     """
 159     point_x_data = []
 160     outlier_x_data = []
 161     point_y_data = []
 162     outlier_y_data = []
 163     for pos in range(len(x_data)):
 164         x = x_data[pos]
 165         valid_points, outliers = sanitize_dataset(y_data[pos])
 166         for y in valid_points:
 167             point_x_data.append(x)
 168             point_y_data.append(y)
 169         for y in outliers:
 170             outlier_x_data.append(x)
 171             outlier_y_data.append(y)
 172
 173     plt.plot(
 174         point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
 175     )
 176     plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")
 177
 178     ymax = 1
 179     if y_data:
 180         ymin = 0.8 * min([item for sublist in y_data for item in sublist])
 181         ymax = 1.2 * max([item for sublist in y_data for item in sublist])
 182     # Put latest of other branches for reference as horizontal line.
 183     for l_branch, l_result in latest_values.items():
 184         if not l_result or l_branch == branch:
 185             continue
 186         plt.axhline(
 187             y=l_result,
 188             label="Latest {}".format(l_branch),
 189             color=graph_get_color(l_branch),
 190         )
 191         if l_result >= ymax:
 192             ymax = 1.2 * l_result
 193     ax = plt.gca()
 194     plt.ylim(ymin=0, ymax=ymax)
 195     plt.xticks(x_data, labels, rotation=90, family="monospace")
 196     plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
 197     plt.ylabel("User + system time (s)")
 198     plt.xlabel("Latest commits")
 199     plt.legend()
 200
 201     # Put tick on the right side
 202     ax.tick_params(labeltop=False, labelright=True)
 203
 204     plt.tight_layout()
 205     return
 206
 207
 208 def plot_delta_between_point(
 209     branch, benchmark_type, x_data, y_data, labels, latest_values
 210 ):
 211     """
 212     Plot the graph of delta between each sequential commit.
 213     """
 214     local_abs_max = 100
 215
 216     # Transform y_data to a list of  for which the reference is the first
 217     # element.
 218     local_y_data = []
 219     for pos, y in enumerate(y_data):
 220         if pos == 0:
 221             local_y_data.append(0.0)
 222             continue
 223         local_y_data.append(y - y_data[pos - 1])
 224
 225     plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))
 226
 227     # Get max absolute value to align the y axis with zero in the middle.
 228     if local_y_data:
 229         local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
 230
 231     plt.ylim(ymin=local_abs_max * -1, ymax=local_abs_max)
 232
 233     ax = plt.gca()
 234     plt.xticks(x_data, labels, rotation=90, family="monospace")
 235     plt.title(
 236         graph_get_title(branch, benchmark_type) + " Delta to previous commit",
 237         fontweight="bold",
 238     )
 239     plt.ylabel("Seconds")
 240     plt.xlabel("Latest commits")
 241     plt.legend()
 242
 243     # Put tick on the right side
 244     ax.tick_params(labeltop=False, labelright=True)
 245
 246     plt.tight_layout()
 247     return
 248
 249
 250 def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
 251     """
 252     Plot the graph using a ratio using first point as reference (0%).
 253     """
 254     reference = 0.01
 255     y_abs_max = 100
 256
 257     if y_data:
 258         reference = y_data[0]
 259
 260     # Transform y_data to a list of ratio for which the reference is the first
 261     # element.
 262     local_y_data = list(map(lambda y: ((y / reference) - 1.0) * 100, y_data))
 263
 264     plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))
 265
 266     # Put latest of other branches for reference as horizontal line.
 267     for l_branch, l_result in latest_values.items():
 268         if not l_result or l_branch == branch:
 269             continue
 270         ratio_l_result = ((l_result / reference) - 1.0) * 100.0
 271         print(
 272             "branch {} branch {} value {} l_result {} reference {}".format(
 273                 branch, l_branch, ratio_l_result, l_result, reference
 274             )
 275         )
 276         plt.axhline(
 277             y=ratio_l_result,
 278             label="Latest {}".format(l_branch),
 279             color=graph_get_color(l_branch),
 280         )
 281
 282     # Draw the reference line.
 283     plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")
 284
 285     # Get max absolute value to align the y axis with zero in the middle.
 286     if local_y_data:
 287         local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
 288         if y_abs_max > 100:
 289             y_abs_max = local_abs_max
 290
 291     plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)
 292
 293     ax = plt.gca()
 294     percent_formatter = PercentFormatter()
 295     ax.yaxis.set_major_formatter(percent_formatter)
 296     ax.yaxis.set_minor_formatter(percent_formatter)
 297     plt.xticks(x_data, labels, rotation=90, family="monospace")
 298     plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
 299     plt.ylabel("Ratio")
 300     plt.xlabel("Latest commits")
 301     plt.legend()
 302
 303     # Put tick on the right side
 304     ax.tick_params(labeltop=False, labelright=True)
 305
 306     plt.tight_layout()
 307     return
 308
 309
 310 def generate_graph(branches, report_name, git_path):
 311
 312     # The PDF document
 313     pdf_pages = PdfPages(report_name)
 314
 315     client = get_client()
 316     branch_results = dict()
 317
 318     # Fetch the results for each branch.
 319     for branch, cutoff in branches.items():
 320         commits = get_git_log(branch, cutoff, git_path)
 321         results = []
 322         with tempfile.TemporaryDirectory() as workdir:
 323             for commit in commits:
 324                 b_results, valid = get_benchmark_results(client, commit, workdir)
 325                 if not b_results or not valid:
 326                     continue
 327                 results.append((commit, b_results))
 328         branch_results[branch] = results
 329
 330     for b_type in BENCHMARK_TYPES:
 331         latest_values = {}
 332         max_len = 0
 333
 334         # Find the maximum size for a series inside our series dataset.
 335         # This is used later to compute the size of the actual plot (pdf).
 336         # While there gather the comparison value used to draw comparison line
 337         # between branches.
 338         for branch, results in branch_results.items():
 339             max_len = max([max_len, len(results)])
 340             if results:
 341                 latest_values[branch] = mean(
 342                     sanitize_dataset(results[-1][1][b_type])[0]
 343                 )
 344             else:
 345                 latest_values[branch] = None
 346
 347         for branch, results in branch_results.items():
 348             # Create a figure instance
 349             if max_len and max_len > 10:
 350                 width = 0.16 * max_len
 351             else:
 352                 width = 11.69
 353
 354             x_data = list(range(len(results)))
 355             y_data = [c[1][b_type] for c in results]
 356             labels = [c[0][:8] for c in results]
 357
 358             fig = plt.figure(figsize=(width, 8.27), dpi=100)
 359             plot_raw_value(branch, b_type, x_data, y_data, labels, latest_values)
 360             pdf_pages.savefig(fig)
 361
 362             # Use the mean of each sanitize dataset here, we do not care for
 363             # variance for ratio. At least not yet.
 364             y_data = [mean(sanitize_dataset(c[1][b_type])[0]) for c in results]
 365             fig = plt.figure(figsize=(width, 8.27), dpi=100)
 366             plot_ratio(branch, b_type, x_data, y_data, labels, latest_values)
 367             pdf_pages.savefig(fig)
 368
 369             fig = plt.figure(figsize=(width, 8.27), dpi=100)
 370             plot_delta_between_point(
 371                 branch, b_type, x_data, y_data, labels, latest_values
 372             )
 373             pdf_pages.savefig(fig)
 374
 375     pdf_pages.close()
 376
 377
 378 def launch_jobs(branches, git_path, wait_for_completion, debug, force):
 379     """
 380     Lauch jobs for all missing results.
 381     """
 382     client = get_client()
 383     for branch, cutoff in branches.items():
 384         commits = get_git_log(branch, cutoff, git_path)
 385
 386         with tempfile.TemporaryDirectory() as workdir:
 387             for commit in commits:
 388                 b_results = get_benchmark_results(client, commit, workdir)[0]
 389                 if b_results and not force:
 390                     continue
 391                 lava_submit.submit(
 392                     commit, wait_for_completion=wait_for_completion, debug=debug
 393                 )
 394
 395
 396 def main():
 397     """
 398     Parse arguments and execute as needed.
 399     """
 400     bt_branches = {
 401         "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
 402         "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
 403         "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
 404     }
 405
 406     parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
 407     parser.add_argument(
 408         "--generate-jobs", action="store_true", help="Generate and send jobs"
 409     )
 410     parser.add_argument(
 411         "--force-jobs", action="store_true", help="Force the queueing of jobs to lava"
 412     )
 413     parser.add_argument(
 414         "--do-not-wait-on-completion",
 415         action="store_true",
 416         default=False,
 417         help="Wait for the completion of each jobs sent. This is useful"
 418         "for the ci. Otherwise we could end up spaming the lava instance.",
 419     )
 420     parser.add_argument(
 421         "--generate-report",
 422         action="store_true",
 423         help="Generate graphs and save them to pdf",
 424     )
 425     parser.add_argument(
 426         "--report-name", default="report.pdf", help="The name of the pdf report."
 427     )
 428     parser.add_argument(
 429         "--debug", action="store_true", default=False, help="Do not send jobs to lava."
 430     )
 431     parser.add_argument(
 432         "--repo-path", help="The location of the git repo to use.", required=True
 433     )
 434     parser.add_argument(
 435         "--overwrite-branches-cutoff",
 436         help="A dictionary of the form {"
 437         "'branch_name': 'commit_hash_cutoff',...}. Allow custom graphing and"
 438         "jobs generation.",
 439         required=False, type=json_type
 440     )
 441
 442     args = parser.parse_args()
 443
 444     if args.overwrite_branches_cutoff:
 445         bt_branches = args.overwrite_branches_cutoff
 446
 447     if not os.path.exists(args.repo_path):
 448         print("Repository location does not exists.")
 449         return 1
 450
 451     if args.generate_jobs:
 452         print("Launching jobs for:")
 453
 454         for branch, cutoff in bt_branches.items():
 455             print("\t Branch {} with cutoff {}".format(branch, cutoff))
 456
 457         launch_jobs(
 458             bt_branches,
 459             args.repo_path,
 460             not args.do_not_wait_on_completion,
 461             args.debug,
 462             args.force_jobs,
 463         )
 464
 465     if args.generate_report:
 466         print("Generating pdf report ({}) for:".format(args.report_name))
 467         for branch, cutoff in bt_branches.items():
 468             print("\t Branch {} with cutoff {}".format(branch, cutoff))
 469         generate_graph(bt_branches, args.report_name, args.repo_path)
 470
 471     return 0
 472
 473
 474 def sanitize_dataset(dataset):
 475     """
 476     Use IRQ 1.5 [1] to remove outlier from the dataset. This is useful to get a
 477     representative mean without outlier in it.
 478     [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
 479     """
 480     sorted_data = sorted(dataset)
 481     q1, q3 = numpy.percentile(sorted_data, [25, 75])
 482     iqr = q3 - q1
 483     lower_bound = q1 - (1.5 * iqr)
 484     upper_bound = q3 + (1.5 * iqr)
 485     new_dataset = []
 486     outliers = []
 487     for i in dataset:
 488         if lower_bound <= i <= upper_bound:
 489             new_dataset.append(i)
 490         else:
 491             outliers.append(i)
 492     return new_dataset, outliers
 493
 494
# Script entry point: the value returned by main() is used as the process
# exit status.
if __name__ == "__main__":
    sys.exit(main())