scripts/babeltrace-benchmark/benchmark.py

   1 #!/usr/bin/python3
   2 # Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
   3 #
   4 # This program is free software: you can redistribute it and/or modify
   5 # it under the terms of the GNU General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17 import json
  18 import os
  19 import tempfile
  20 from statistics import mean
  21 import argparse
  22 import sys
  23 from operator import add
  24
  25 import matplotlib.pyplot as plt
  26 from matplotlib.backends.backend_pdf import PdfPages
  27 from matplotlib.ticker import PercentFormatter
  28
  29 import git
  30 import numpy
  31 import lava_submit
  32
  33 from minio import Minio
  34 from minio.error import NoSuchKey
  35 from minio.error import ResponseError
  36
  37
# Benchmark flavours. Each one has its own result prefix on the object store
# and gets its own set of graphs in the report.
BENCHMARK_TYPES = ["dummy", "text"]
# Bucket used for every object store request made by this script.
DEFAULT_BUCKET = "lava"
  40
  41
  42 def json_type(string):
  43     """
  44     Argpase type for json args.
  45     We expect a base dictionary.
  46     """
  47     passed_json = json.loads(string)
  48     if not isinstance(passed_json, dict):
  49         msg = "%r is not a dict" % string
  50         raise argparse.ArgumentTypeError(msg)
  51     return passed_json
  52
  53 def graph_get_color(branch):
  54     """
  55     Get the color matching the branch.
  56     """
  57     color = {"stable-1.5": "red", "stable-2.0": "green", "master": "blue"}
  58     return color[branch]
  59
  60
  61 def graph_get_title(branch, benchmark_type):
  62     """
  63     Get title for graph based on benchmark type.
  64     """
  65     string = {"dummy": "Dummy output", "text": "Text output"}
  66     return "{} - {}".format(branch, string[benchmark_type])
  67
  68
  69 def get_client():
  70     """
  71     Return minio client configured.
  72     """
  73     return Minio(
  74         "obj.internal.efficios.com", access_key="jenkins", secret_key="echo123456"
  75     )
  76
  77
  78 def get_file(client, prefix, file_name, workdir_name):
  79     """
  80     Return the path of the downloaded file.
  81     Return None on error
  82     """
  83     destination = os.path.join(workdir_name, file_name)
  84     object_name = "{}/{}".format(prefix, file_name)
  85     try:
  86         client.fget_object(DEFAULT_BUCKET, object_name, destination)
  87     except NoSuchKey:
  88         return None
  89
  90     return destination
  91
  92
  93 def delete_file(client, prefix, file_name):
  94     """
  95     Delete the file on remote.
  96     """
  97     object_name = "{}/{}".format(prefix, file_name)
  98     try:
  99         client.remove_object(DEFAULT_BUCKET, object_name)
 100     except ResponseError as err:
 101         print(err)
 102     except NoSuchKey:
 103         pass
 104
 105
 106 def get_git_log(bt_version, cutoff, repo_path):
 107     """
 108     Return an ordered (older to newer) list of commits for the bt_version and
 109     cutoff. WARNING: This changes the git repo HEAD.
 110     """
 111     repo = git.Repo(repo_path)
 112     repo.git.fetch()
 113     return repo.git.log(
 114         "{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
 115     ).split("\n")
 116
 117
 118 def parse_result(result_path):
 119     """
 120     Parse the result file. Return a dataset of User time + System time.
 121     """
 122     with open(result_path) as result:
 123         parsed_result = json.load(result)
 124         return list(
 125             map(
 126                 add,
 127                 parsed_result["User time (seconds)"],
 128                 parsed_result["System time (seconds)"],
 129             )
 130         )
 131
 132
 133 def get_benchmark_results(client, commit, workdir):
 134     """
 135     Fetch the benchmark result from a certain commit across all benchmark type.
 136     """
 137     results = {}
 138     benchmark_valid = True
 139     for b_type in BENCHMARK_TYPES:
 140         prefix = "/results/benchmarks/babeltrace/{}/".format(b_type)
 141         result_file = get_file(client, prefix, commit, workdir)
 142         if not result_file:
 143             """
 144             Benchmark is either corrupted or not complete.
 145             """
 146             return None, benchmark_valid
 147         results[b_type] = parse_result(result_file)
 148         if all(i == 0.0 for i in results[b_type]):
 149             benchmark_valid = False
 150             print("Invalid benchmark for {}/{}/{}".format(prefix, b_type, commit))
 151     # The dataset is valid return immediately.
 152     return results, benchmark_valid
 153
 154
 155 def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
 156     """
 157     Plot the graph using the raw value.
 158     """
 159     point_x_data = []
 160     outlier_x_data = []
 161     point_y_data = []
 162     outlier_y_data = []
 163     for pos in range(len(x_data)):
 164         x = x_data[pos]
 165         valid_points, outliers = sanitize_dataset(y_data[pos])
 166         for y in valid_points:
 167             point_x_data.append(x)
 168             point_y_data.append(y)
 169         for y in outliers:
 170             outlier_x_data.append(x)
 171             outlier_y_data.append(y)
 172
 173     plt.plot(
 174         point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
 175     )
 176     plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")
 177
 178     ymax = 1
 179     if y_data:
 180         ymin = 0.8 * min([item for sublist in y_data for item in sublist])
 181         ymax = 1.2 * max([item for sublist in y_data for item in sublist])
 182     # Put latest of other branches for reference as horizontal line.
 183     for l_branch, l_result in latest_values.items():
 184         if not l_result or l_branch == branch:
 185             continue
 186         plt.axhline(
 187             y=l_result,
 188             label="Latest {}".format(l_branch),
 189             color=graph_get_color(l_branch),
 190         )
 191         if l_result >= ymax:
 192             ymax = 1.2 * l_result
 193     ax = plt.gca()
 194     plt.ylim(ymin=0, ymax=ymax)
 195     plt.xticks(x_data, labels, rotation=90, family="monospace")
 196     plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
 197     plt.ylabel("User + system time (s)")
 198     plt.xlabel("Latest commits")
 199     plt.legend()
 200
 201     # Put tick on the right side
 202     ax.tick_params(labeltop=False, labelright=True)
 203
 204     plt.tight_layout()
 205     return
 206
 207 def plot_delta_between_point(branch, benchmark_type, x_data, y_data, labels, latest_values):
 208     """
 209     Plot the graph of delta between each sequential commit.
 210     """
 211     local_abs_max = 100
 212
 213     # Transform y_data to a list of  for which the reference is the first
 214     # element.
 215     local_y_data = []
 216     for pos, y in enumerate(y_data):
 217         if pos == 0:
 218             local_y_data.append(0.0)
 219             continue
 220         local_y_data.append(y - y_data[pos - 1])
 221
 222     plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))
 223
 224     # Get max absolute value to align the y axis with zero in the middle.
 225     if local_y_data:
 226         local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
 227
 228     plt.ylim(ymin=local_abs_max * -1, ymax=local_abs_max)
 229
 230     ax = plt.gca()
 231     plt.xticks(x_data, labels, rotation=90, family="monospace")
 232     plt.title(graph_get_title(branch, benchmark_type) + " Delta to previous commit", fontweight="bold")
 233     plt.ylabel("Seconds")
 234     plt.xlabel("Latest commits")
 235     plt.legend()
 236
 237     # Put tick on the right side
 238     ax.tick_params(labeltop=False, labelright=True)
 239
 240     plt.tight_layout()
 241     return
 242
 243 def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
 244     """
 245     Plot the graph using a ratio using first point as reference (0%).
 246     """
 247     reference = 0.01
 248     y_abs_max = 100
 249
 250     if y_data:
 251         reference = y_data[0]
 252
 253     # Transform y_data to a list of ratio for which the reference is the first
 254     # element.
 255     local_y_data = list(map(lambda y: ((y / reference) - 1.0) * 100, y_data))
 256
 257     plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))
 258
 259     # Put latest of other branches for reference as horizontal line.
 260     for l_branch, l_result in latest_values.items():
 261         if not l_result or l_branch == branch:
 262             continue
 263         ratio_l_result = ((l_result / reference) - 1.0) * 100.0
 264         print(
 265             "branch {} branch {} value {} l_result {} reference {}".format(
 266                 branch, l_branch, ratio_l_result, l_result, reference
 267             )
 268         )
 269         plt.axhline(
 270             y=ratio_l_result,
 271             label="Latest {}".format(l_branch),
 272             color=graph_get_color(l_branch),
 273         )
 274
 275     # Draw the reference line.
 276     plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")
 277
 278     # Get max absolute value to align the y axis with zero in the middle.
 279     if local_y_data:
 280         local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
 281         if y_abs_max > 100:
 282             y_abs_max = local_abs_max
 283
 284     plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)
 285
 286     ax = plt.gca()
 287     percent_formatter = PercentFormatter()
 288     ax.yaxis.set_major_formatter(percent_formatter)
 289     ax.yaxis.set_minor_formatter(percent_formatter)
 290     plt.xticks(x_data, labels, rotation=90, family="monospace")
 291     plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
 292     plt.ylabel("Ratio")
 293     plt.xlabel("Latest commits")
 294     plt.legend()
 295
 296     # Put tick on the right side
 297     ax.tick_params(labeltop=False, labelright=True)
 298
 299     plt.tight_layout()
 300     return
 301
 302 def generate_graph(branches, report_name, git_path):
 303
 304     # The PDF document
 305     pdf_pages = PdfPages(report_name)
 306
 307     client = get_client()
 308     branch_results = dict()
 309
 310     # Fetch the results for each branch.
 311     for branch, cutoff in branches.items():
 312         commits = get_git_log(branch, cutoff, git_path)
 313         results = []
 314         with tempfile.TemporaryDirectory() as workdir:
 315             for commit in commits:
 316                 b_results, valid = get_benchmark_results(client, commit, workdir)
 317                 if not b_results or not valid:
 318                     continue
 319                 results.append((commit, b_results))
 320         branch_results[branch] = results
 321
 322     for b_type in BENCHMARK_TYPES:
 323         latest_values = {}
 324         max_len = 0
 325
 326         # Find the maximum size for a series inside our series dataset.
 327         # This is used later to compute the size of the actual plot (pdf).
 328         # While there gather the comparison value used to draw comparison line
 329         # between branches.
 330         for branch, results in branch_results.items():
 331             max_len = max([max_len, len(results)])
 332             if results:
 333                 latest_values[branch] = mean(
 334                     sanitize_dataset(results[-1][1][b_type])[0]
 335                 )
 336             else:
 337                 latest_values[branch] = None
 338
 339         for branch, results in branch_results.items():
 340             # Create a figure instance
 341             if max_len and max_len > 10:
 342                 width = 0.16 * max_len
 343             else:
 344                 width = 11.69
 345
 346             x_data = list(range(len(results)))
 347             y_data = [c[1][b_type] for c in results]
 348             labels = [c[0][:8] for c in results]
 349
 350             fig = plt.figure(figsize=(width, 8.27), dpi=100)
 351             plot_raw_value(branch, b_type, x_data, y_data, labels, latest_values)
 352             pdf_pages.savefig(fig)
 353
 354             # Use the mean of each sanitize dataset here, we do not care for
 355             # variance for ratio. At least not yet.
 356             y_data = [mean(sanitize_dataset(c[1][b_type])[0]) for c in results]
 357             fig = plt.figure(figsize=(width, 8.27), dpi=100)
 358             plot_ratio(branch, b_type, x_data, y_data, labels, latest_values)
 359             pdf_pages.savefig(fig)
 360
 361             fig = plt.figure(figsize=(width, 8.27), dpi=100)
 362             plot_delta_between_point(branch, b_type, x_data, y_data, labels, latest_values)
 363             pdf_pages.savefig(fig)
 364
 365     pdf_pages.close()
 366
 367
 368 def launch_jobs(branches, git_path, wait_for_completion, debug, force):
 369     """
 370     Lauch jobs for all missing results.
 371     """
 372     client = get_client()
 373     for branch, cutoff in branches.items():
 374         commits = get_git_log(branch, cutoff, git_path)
 375
 376         with tempfile.TemporaryDirectory() as workdir:
 377             for commit in commits:
 378                 b_results = get_benchmark_results(client, commit, workdir)[0]
 379                 if b_results and not force:
 380                     continue
 381                 lava_submit.submit(
 382                     commit, wait_for_completion=wait_for_completion, debug=debug
 383                 )
 384
 385
 386 def main():
 387     """
 388     Parse arguments and execute as needed.
 389     """
 390     bt_branches = {
 391         "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
 392         "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
 393         "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
 394     }
 395
 396     parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
 397     parser.add_argument(
 398         "--generate-jobs", action="store_true", help="Generate and send jobs"
 399     )
 400     parser.add_argument(
 401         "--force-jobs", action="store_true", help="Force the queueing of jobs to lava"
 402     )
 403     parser.add_argument(
 404         "--do-not-wait-on-completion",
 405         action="store_true",
 406         default=False,
 407         help="Wait for the completion of each jobs sent. This is useful"
 408         "for the ci. Otherwise we could end up spaming the lava instance.",
 409     )
 410     parser.add_argument(
 411         "--generate-report",
 412         action="store_true",
 413         help="Generate graphs and save them to pdf",
 414     )
 415     parser.add_argument(
 416         "--report-name", default="report.pdf", help="The name of the pdf report."
 417     )
 418     parser.add_argument(
 419         "--debug", action="store_true", default=False, help="Do not send jobs to lava."
 420     )
 421     parser.add_argument(
 422         "--repo-path", help="The location of the git repo to use.", required=True
 423     )
 424     parser.add_argument(
 425         "--overwrite-branches-cutoff",
 426         help="A dictionary of the form {"
 427         "'branch_name': 'commit_hash_cutoff',...}. Allow custom graphing and"
 428         "jobs generation.",
 429         required=False, type=json_type
 430     )
 431
 432     args = parser.parse_args()
 433
 434     if args.overwrite_branches_cutoff:
 435         bt_branches = args.overwrite_branches_cutoff
 436
 437     if not os.path.exists(args.repo_path):
 438         print("Repository location does not exists.")
 439         return 1
 440
 441     if args.generate_jobs:
 442         print("Launching jobs for:")
 443
 444         for branch, cutoff in bt_branches.items():
 445             print("\t Branch {} with cutoff {}".format(branch, cutoff))
 446
 447         launch_jobs(
 448             bt_branches,
 449             args.repo_path,
 450             not args.do_not_wait_on_completion,
 451             args.debug,
 452             args.force_jobs,
 453         )
 454
 455     if args.generate_report:
 456         print("Generating pdf report ({}) for:".format(args.report_name))
 457         for branch, cutoff in bt_branches.items():
 458             print("\t Branch {} with cutoff {}".format(branch, cutoff))
 459         generate_graph(bt_branches, args.report_name, args.repo_path)
 460
 461     return 0
 462
 463
 464 def sanitize_dataset(dataset):
 465     """
 466     Use IRQ 1.5 [1] to remove outlier from the dataset. This is useful to get a
 467     representative mean without outlier in it.
 468     [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
 469     """
 470     sorted_data = sorted(dataset)
 471     q1, q3 = numpy.percentile(sorted_data, [25, 75])
 472     iqr = q3 - q1
 473     lower_bound = q1 - (1.5 * iqr)
 474     upper_bound = q3 + (1.5 * iqr)
 475     new_dataset = []
 476     outliers = []
 477     for i in dataset:
 478         if lower_bound <= i <= upper_bound:
 479             new_dataset.append(i)
 480         else:
 481             outliers.append(i)
 482     return new_dataset, outliers
 483
 484
# When run as a script, execute main() and use its return value as the exit
# status of the process.
if __name__ == "__main__":
    sys.exit(main())