scripts/babeltrace-benchmark/benchmark.py

   1 #!/usr/bin/python3
   2 # Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
   3 #
   4 # This program is free software: you can redistribute it and/or modify
   5 # it under the terms of the GNU General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17 import json
  18 import os
  19 import tempfile
  20 from statistics import mean
  21 import argparse
  22 import sys
  23 from operator import add
  24
  25 import matplotlib.pyplot as plt
  26 from matplotlib.backends.backend_pdf import PdfPages
  27 from matplotlib.ticker import PercentFormatter
  28
  29 import git
  30 import numpy
  31 import lava_submit
  32
  33 from minio import Minio
  34 from minio.error import NoSuchKey
  35 from minio.error import ResponseError
  36
  37
  38 BENCHMARK_TYPES = ["dummy", "text"]
  39 DEFAULT_BUCKET = "lava"
  40
  41 invalid_commits = {
  42         "ec9a9794af488a9accce7708a8b0d8188b498789", # Does not build
  43         "8c99128c640cbce71fb8a6caa15e4c672252b662", # Block on configure
  44         "f3847c753f1b4f12353c38d97b0577d9993d19fb", # Does not build
  45         "e0111295f17ddfcc33ec771a8deac505473a06ad", # Does not build
  46         }
  47
  48 def json_type(string):
  49     """
  50     Argpase type for json args.
  51     We expect a base dictionary.
  52     """
  53     passed_json = json.loads(string)
  54     if not isinstance(passed_json, dict):
  55         msg = "%r is not a dict" % string
  56         raise argparse.ArgumentTypeError(msg)
  57     return passed_json
  58
  59 def graph_get_color(branch):
  60     """
  61     Get the color matching the branch.
  62     """
  63     color = {"stable-1.5": "red", "stable-2.0": "green", "master": "blue"}
  64     return color[branch]
  65
  66
  67 def graph_get_title(branch, benchmark_type):
  68     """
  69     Get title for graph based on benchmark type.
  70     """
  71     string = {"dummy": "Dummy output", "text": "Text output"}
  72     return "{} - {}".format(branch, string[benchmark_type])
  73
  74
  75 def get_client():
  76     """
  77     Return minio client configured.
  78     """
  79     return Minio(
  80         "obj.internal.efficios.com", access_key="jenkins", secret_key="echo123456"
  81     )
  82
  83
  84 def get_file(client, prefix, file_name, workdir_name):
  85     """
  86     Return the path of the downloaded file.
  87     Return None on error
  88     """
  89     destination = os.path.join(workdir_name, file_name)
  90     object_name = "{}/{}".format(prefix, file_name)
  91     try:
  92         client.fget_object(DEFAULT_BUCKET, object_name, destination)
  93     except NoSuchKey:
  94         return None
  95
  96     return destination
  97
  98
  99 def delete_file(client, prefix, file_name):
 100     """
 101     Delete the file on remote.
 102     """
 103     object_name = "{}/{}".format(prefix, file_name)
 104     try:
 105         client.remove_object(DEFAULT_BUCKET, object_name)
 106     except ResponseError as err:
 107         print(err)
 108     except NoSuchKey:
 109         pass
 110
 111
 112 def get_git_log(bt_version, cutoff, repo_path):
 113     """
 114     Return an ordered (older to newer) list of commits for the bt_version and
 115     cutoff. WARNING: This changes the git repo HEAD.
 116     """
 117     repo = git.Repo(repo_path)
 118     repo.git.fetch()
 119     return repo.git.log(
 120         "{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
 121     ).split("\n")
 122
 123
 124 def parse_result(result_path):
 125     """
 126     Parse the result file. Return a dataset of User time + System time.
 127     """
 128     with open(result_path) as result:
 129         parsed_result = json.load(result)
 130         return list(
 131             map(
 132                 add,
 133                 parsed_result["User time (seconds)"],
 134                 parsed_result["System time (seconds)"],
 135             )
 136         )
 137
 138
 139 def get_benchmark_results(client, commit, workdir):
 140     """
 141     Fetch the benchmark result from a certain commit across all benchmark type.
 142     """
 143     results = {}
 144     benchmark_valid = True
 145     for b_type in BENCHMARK_TYPES:
 146         prefix = "/results/benchmarks/babeltrace/{}/".format(b_type)
 147         result_file = get_file(client, prefix, commit, workdir)
 148         if not result_file:
 149             """
 150             Benchmark is either corrupted or not complete.
 151             """
 152             return None, benchmark_valid
 153         results[b_type] = parse_result(result_file)
 154         if all(i == 0.0 for i in results[b_type]):
 155             benchmark_valid = False
 156             print("Invalid benchmark for {}/{}/{}".format(prefix, b_type, commit))
 157     # The dataset is valid return immediately.
 158     return results, benchmark_valid
 159
 160
 161 def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
 162     """
 163     Plot the graph using the raw value.
 164     """
 165     point_x_data = []
 166     outlier_x_data = []
 167     point_y_data = []
 168     outlier_y_data = []
 169     for pos in range(len(x_data)):
 170         x = x_data[pos]
 171         valid_points, outliers = sanitize_dataset(y_data[pos])
 172         for y in valid_points:
 173             point_x_data.append(x)
 174             point_y_data.append(y)
 175         for y in outliers:
 176             outlier_x_data.append(x)
 177             outlier_y_data.append(y)
 178
 179     plt.plot(
 180         point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
 181     )
 182     plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")
 183
 184     ymax = 1
 185     if y_data:
 186         ymin = 0.8 * min([item for sublist in y_data for item in sublist])
 187         ymax = 1.2 * max([item for sublist in y_data for item in sublist])
 188     # Put latest of other branches for reference as horizontal line.
 189     for l_branch, l_result in latest_values.items():
 190         if not l_result or l_branch == branch:
 191             continue
 192         plt.axhline(
 193             y=l_result,
 194             label="Latest {}".format(l_branch),
 195             color=graph_get_color(l_branch),
 196         )
 197         if l_result >= ymax:
 198             ymax = 1.2 * l_result
 199     ax = plt.gca()
 200     plt.ylim(ymin=0, ymax=ymax)
 201     plt.xticks(x_data, labels, rotation=90, family="monospace")
 202     plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
 203     plt.ylabel("User + system time (s)")
 204     plt.xlabel("Latest commits")
 205     plt.legend()
 206     plt.grid(True)
 207
 208     # Put tick on the right side
 209     ax.tick_params(labeltop=False, labelright=True)
 210
 211     plt.tight_layout()
 212     return
 213
 214
 215 def plot_delta_between_point(
 216     branch, benchmark_type, x_data, y_data, labels, latest_values
 217 ):
 218     """
 219     Plot the graph of delta between each sequential commit.
 220     """
 221     local_abs_max = 100
 222
 223     # Transform y_data to a list of  for which the reference is the first
 224     # element.
 225     local_y_data = []
 226     for pos, y in enumerate(y_data):
 227         if pos == 0:
 228             local_y_data.append(0.0)
 229             continue
 230         local_y_data.append(y - y_data[pos - 1])
 231
 232     plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))
 233
 234     # Get max absolute value to align the y axis with zero in the middle.
 235     if local_y_data:
 236         local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
 237
 238     plt.ylim(ymin=local_abs_max * -1, ymax=local_abs_max)
 239
 240     ax = plt.gca()
 241     plt.xticks(x_data, labels, rotation=90, family="monospace")
 242     plt.title(
 243         graph_get_title(branch, benchmark_type) + " Delta to previous commit",
 244         fontweight="bold",
 245     )
 246     plt.ylabel("Seconds")
 247     plt.xlabel("Latest commits")
 248     plt.legend()
 249     plt.grid(True)
 250
 251     # Put tick on the right side
 252     ax.tick_params(labeltop=False, labelright=True)
 253
 254     plt.tight_layout()
 255     return
 256
 257
 258 def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
 259     """
 260     Plot the graph using a ratio using first point as reference (0%).
 261     """
 262     reference = 0.01
 263     y_abs_max = 100
 264
 265     if y_data:
 266         reference = y_data[0]
 267
 268     # Transform y_data to a list of ratio for which the reference is the first
 269     # element.
 270     local_y_data = list(map(lambda y: ((y / reference) - 1.0) * 100, y_data))
 271
 272     plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))
 273
 274     # Put latest of other branches for reference as horizontal line.
 275     for l_branch, l_result in latest_values.items():
 276         if not l_result or l_branch == branch:
 277             continue
 278         ratio_l_result = ((l_result / reference) - 1.0) * 100.0
 279         print(
 280             "branch {} branch {} value {} l_result {} reference {}".format(
 281                 branch, l_branch, ratio_l_result, l_result, reference
 282             )
 283         )
 284         plt.axhline(
 285             y=ratio_l_result,
 286             label="Latest {}".format(l_branch),
 287             color=graph_get_color(l_branch),
 288         )
 289
 290     # Draw the reference line.
 291     plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")
 292
 293     # Get max absolute value to align the y axis with zero in the middle.
 294     if local_y_data:
 295         local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
 296         if y_abs_max > 100:
 297             y_abs_max = local_abs_max
 298
 299     plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)
 300
 301     ax = plt.gca()
 302     percent_formatter = PercentFormatter()
 303     ax.yaxis.set_major_formatter(percent_formatter)
 304     ax.yaxis.set_minor_formatter(percent_formatter)
 305     plt.xticks(x_data, labels, rotation=90, family="monospace")
 306     plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
 307     plt.ylabel("Ratio")
 308     plt.xlabel("Latest commits")
 309     plt.legend()
 310     plt.grid(True)
 311
 312     # Put tick on the right side
 313     ax.tick_params(labeltop=False, labelright=True)
 314
 315     plt.tight_layout()
 316     return
 317
 318
 319 def generate_graph(branches, report_name, git_path):
 320
 321     # The PDF document
 322     pdf_pages = PdfPages(report_name)
 323
 324     client = get_client()
 325     branch_results = dict()
 326
 327     # Fetch the results for each branch.
 328     for branch, cutoff in branches.items():
 329         commits = get_git_log(branch, cutoff, git_path)
 330         results = []
 331         with tempfile.TemporaryDirectory() as workdir:
 332             for commit in commits:
 333                 b_results, valid = get_benchmark_results(client, commit, workdir)
 334                 if not b_results or not valid:
 335                     continue
 336                 results.append((commit, b_results))
 337         branch_results[branch] = results
 338
 339     for b_type in BENCHMARK_TYPES:
 340         latest_values = {}
 341         max_len = 0
 342
 343         # Find the maximum size for a series inside our series dataset.
 344         # This is used later to compute the size of the actual plot (pdf).
 345         # While there gather the comparison value used to draw comparison line
 346         # between branches.
 347         for branch, results in branch_results.items():
 348             max_len = max([max_len, len(results)])
 349             if results:
 350                 latest_values[branch] = mean(
 351                     sanitize_dataset(results[-1][1][b_type])[0]
 352                 )
 353             else:
 354                 latest_values[branch] = None
 355
 356         for branch, results in branch_results.items():
 357             # Create a figure instance
 358             if max_len and max_len > 10:
 359                 width = 0.16 * max_len
 360             else:
 361                 width = 11.69
 362
 363             x_data = list(range(len(results)))
 364             y_data = [c[1][b_type] for c in results]
 365             labels = [c[0][:8] for c in results]
 366
 367             fig = plt.figure(figsize=(width, 8.27), dpi=100)
 368             plot_raw_value(branch, b_type, x_data, y_data, labels, latest_values)
 369             pdf_pages.savefig(fig)
 370
 371             # Use the mean of each sanitize dataset here, we do not care for
 372             # variance for ratio. At least not yet.
 373             y_data = [mean(sanitize_dataset(c[1][b_type])[0]) for c in results]
 374             fig = plt.figure(figsize=(width, 8.27), dpi=100)
 375             plot_ratio(branch, b_type, x_data, y_data, labels, latest_values)
 376             pdf_pages.savefig(fig)
 377
 378             fig = plt.figure(figsize=(width, 8.27), dpi=100)
 379             plot_delta_between_point(
 380                 branch, b_type, x_data, y_data, labels, latest_values
 381             )
 382             pdf_pages.savefig(fig)
 383
 384     pdf_pages.close()
 385
 386
 387 def launch_jobs(branches, git_path, wait_for_completion, debug, force):
 388     """
 389     Lauch jobs for all missing results.
 390     """
 391     client = get_client()
 392     for branch, cutoff in branches.items():
 393         commits = get_git_log(branch, cutoff, git_path)
 394
 395         with tempfile.TemporaryDirectory() as workdir:
 396             for commit in commits:
 397                 if commit in invalid_commits:
 398                     continue
 399                 b_results = get_benchmark_results(client, commit, workdir)[0]
 400                 if b_results and not force:
 401                     continue
 402                 lava_submit.submit(
 403                     commit, wait_for_completion=wait_for_completion, debug=debug
 404                 )
 405
 406
 407 def main():
 408     """
 409     Parse arguments and execute as needed.
 410     """
 411     bt_branches = {
 412         "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
 413         "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
 414         "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
 415     }
 416
 417     parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
 418     parser.add_argument(
 419         "--generate-jobs", action="store_true", help="Generate and send jobs"
 420     )
 421     parser.add_argument(
 422         "--force-jobs", action="store_true", help="Force the queueing of jobs to lava"
 423     )
 424     parser.add_argument(
 425         "--do-not-wait-on-completion",
 426         action="store_true",
 427         default=False,
 428         help="Wait for the completion of each jobs sent. This is useful"
 429         "for the ci. Otherwise we could end up spaming the lava instance.",
 430     )
 431     parser.add_argument(
 432         "--generate-report",
 433         action="store_true",
 434         help="Generate graphs and save them to pdf",
 435     )
 436     parser.add_argument(
 437         "--report-name", default="report.pdf", help="The name of the pdf report."
 438     )
 439     parser.add_argument(
 440         "--debug", action="store_true", default=False, help="Do not send jobs to lava."
 441     )
 442     parser.add_argument(
 443         "--repo-path", help="The location of the git repo to use.", required=True
 444     )
 445     parser.add_argument(
 446         "--overwrite-branches-cutoff",
 447         help="A dictionary of the form {"
 448         "'branch_name': 'commit_hash_cutoff',...}. Allow custom graphing and"
 449         "jobs generation.",
 450         required=False, type=json_type
 451     )
 452
 453     args = parser.parse_args()
 454
 455     if args.overwrite_branches_cutoff:
 456         bt_branches = args.overwrite_branches_cutoff
 457
 458     if not os.path.exists(args.repo_path):
 459         print("Repository location does not exists.")
 460         return 1
 461
 462     if args.generate_jobs:
 463         print("Launching jobs for:")
 464
 465         for branch, cutoff in bt_branches.items():
 466             print("\t Branch {} with cutoff {}".format(branch, cutoff))
 467
 468         launch_jobs(
 469             bt_branches,
 470             args.repo_path,
 471             not args.do_not_wait_on_completion,
 472             args.debug,
 473             args.force_jobs,
 474         )
 475
 476     if args.generate_report:
 477         print("Generating pdf report ({}) for:".format(args.report_name))
 478         for branch, cutoff in bt_branches.items():
 479             print("\t Branch {} with cutoff {}".format(branch, cutoff))
 480         generate_graph(bt_branches, args.report_name, args.repo_path)
 481
 482     return 0
 483
 484
 485 def sanitize_dataset(dataset):
 486     """
 487     Use IRQ 1.5 [1] to remove outlier from the dataset. This is useful to get a
 488     representative mean without outlier in it.
 489     [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
 490     """
 491     sorted_data = sorted(dataset)
 492     q1, q3 = numpy.percentile(sorted_data, [25, 75])
 493     iqr = q3 - q1
 494     lower_bound = q1 - (1.5 * iqr)
 495     upper_bound = q3 + (1.5 * iqr)
 496     new_dataset = []
 497     outliers = []
 498     for i in dataset:
 499         if lower_bound <= i <= upper_bound:
 500             new_dataset.append(i)
 501         else:
 502             outliers.append(i)
 503     return new_dataset, outliers
 504
 505
 506 if __name__ == "__main__":
 507     sys.exit(main())