benchmark.py

   1 #!/usr/bin/python3
   2 # Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
   3 #
   4 # This program is free software: you can redistribute it and/or modify
   5 # it under the terms of the GNU General Public License as published by
   6 # the Free Software Foundation, either version 3 of the License, or
   7 # (at your option) any later version.
   8 #
   9 # This program is distributed in the hope that it will be useful,
  10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
  11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12 # GNU General Public License for more details.
  13 #
  14 # You should have received a copy of the GNU General Public License
  15 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
  16
  17 import json
  18 import os
  19 import tempfile
  20 from statistics import mean
  21 import argparse
  22 import sys
  23 from operator import add
  24
  25 import matplotlib.pyplot as plt
  26 from matplotlib.backends.backend_pdf import PdfPages
  27 from matplotlib.ticker import PercentFormatter
  28
  29 import git
  30 import numpy
  31 import lava_submit
  32
  33 from minio import Minio
  34 from minio.error import NoSuchKey
  35 from minio.error import ResponseError
  36
  37
# Benchmark flavours. Each name is used as a path component of the remote
# result prefix and as a key of the per-commit result mapping.
BENCHMARK_TYPES = ["dummy", "text"]
# Object-store bucket used for every download and delete done by this file.
DEFAULT_BUCKET = "lava"
  40
  41
  42 def graph_get_color(branch):
  43     """
  44     Get the color matching the branch.
  45     """
  46     color = {"stable-1.5": "red", "stable-2.0": "green", "master": "blue"}
  47     return color[branch]
  48
  49
  50 def graph_get_title(branch, benchmark_type):
  51     """
  52     Get title for graph based on benchmark type.
  53     """
  54     string = {"dummy": "Dummy output", "text": "Text output"}
  55     return "{} - {}".format(branch, string[benchmark_type])
  56
  57
  58 def get_client():
  59     """
  60     Return minio client configured.
  61     """
  62     return Minio(
  63         "obj.internal.efficios.com", access_key="jenkins", secret_key="echo123456"
  64     )
  65
  66
  67 def get_file(client, prefix, file_name, workdir_name):
  68     """
  69     Return the path of the downloaded file.
  70     Return None on error
  71     """
  72     destination = os.path.join(workdir_name, file_name)
  73     object_name = "{}/{}".format(prefix, file_name)
  74     try:
  75         client.fget_object(DEFAULT_BUCKET, object_name, destination)
  76     except NoSuchKey:
  77         return None
  78
  79     return destination
  80
  81
  82 def delete_file(client, prefix, file_name):
  83     """
  84     Delete the file on remote.
  85     """
  86     object_name = "{}/{}".format(prefix, file_name)
  87     try:
  88         client.remove_object(DEFAULT_BUCKET, object_name)
  89     except ResponseError as err:
  90         print(err)
  91     except NoSuchKey:
  92         pass
  93
  94
  95 def get_git_log(bt_version, cutoff, repo_path):
  96     """
  97     Return an ordered (older to newer) list of commits for the bt_version and
  98     cutoff. WARNING: This changes the git repo HEAD.
  99     """
 100     repo = git.Repo(repo_path)
 101     repo.git.fetch()
 102     return repo.git.log(
 103         "{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
 104     ).split("\n")
 105
 106
 107 def parse_result(result_path):
 108     """
 109     Parse the result file. Return a dataset of User time + System time.
 110     """
 111     with open(result_path) as result:
 112         parsed_result = json.load(result)
 113         return list(
 114             map(
 115                 add,
 116                 parsed_result["User time (seconds)"],
 117                 parsed_result["System time (seconds)"],
 118             )
 119         )
 120
 121
 122 def get_benchmark_results(client, commit, workdir):
 123     """
 124     Fetch the benchmark result from a certain commit across all benchmark type.
 125     """
 126     results = {}
 127     benchmark_valid = True
 128     for b_type in BENCHMARK_TYPES:
 129         prefix = "/results/benchmarks/babeltrace/{}/".format(b_type)
 130         result_file = get_file(client, prefix, commit, workdir)
 131         if not result_file:
 132             """
 133             Benchmark is either corrupted or not complete.
 134             """
 135             return None, benchmark_valid
 136         results[b_type] = parse_result(result_file)
 137         if all(i == 0.0 for i in results[b_type]):
 138             benchmark_valid = False
 139             print("Invalid benchmark for {}/{}/{}".format(prefix, b_type, commit))
 140     # The dataset is valid return immediately.
 141     return results, benchmark_valid
 142
 143
 144 def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
 145     """
 146     Plot the graph using the raw value.
 147     """
 148     point_x_data = []
 149     outlier_x_data = []
 150     point_y_data = []
 151     outlier_y_data = []
 152     for pos in range(len(x_data)):
 153         x = x_data[pos]
 154         valid_points, outliers = sanitize_dataset(y_data[pos])
 155         for y in valid_points:
 156             point_x_data.append(x)
 157             point_y_data.append(y)
 158         for y in outliers:
 159             outlier_x_data.append(x)
 160             outlier_y_data.append(y)
 161
 162     plt.plot(
 163         point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
 164     )
 165     plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")
 166
 167     ymax = 1
 168     if y_data:
 169         ymin = 0.8 * min([item for sublist in y_data for item in sublist])
 170         ymax = 1.2 * max([item for sublist in y_data for item in sublist])
 171     # Put latest of other branches for reference as horizontal line.
 172     for l_branch, l_result in latest_values.items():
 173         if not l_result or l_branch == branch:
 174             continue
 175         plt.axhline(
 176             y=l_result,
 177             label="Latest {}".format(l_branch),
 178             color=graph_get_color(l_branch),
 179         )
 180         if l_result >= ymax:
 181             ymax = 1.2 * l_result
 182     ax = plt.gca()
 183     plt.ylim(ymin=0, ymax=ymax)
 184     plt.xticks(x_data, labels, rotation=90, family="monospace")
 185     plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
 186     plt.ylabel("User + system time (s)")
 187     plt.xlabel("Latest commits")
 188     plt.legend()
 189
 190     # Put tick on the right side
 191     ax.tick_params(labeltop=False, labelright=True)
 192
 193     plt.tight_layout()
 194     return
 195
 196 def plot_delta_between_point(branch, benchmark_type, x_data, y_data, labels, latest_values):
 197     """
 198     Plot the graph of delta between each sequential commit.
 199     """
 200     local_abs_max = 100
 201
 202     # Transform y_data to a list of  for which the reference is the first
 203     # element.
 204     local_y_data = []
 205     for pos, y in enumerate(y_data):
 206         if pos == 0:
 207             local_y_data.append(0.0)
 208             continue
 209         local_y_data.append(y - y_data[pos - 1])
 210
 211     plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))
 212
 213     # Get max absolute value to align the y axis with zero in the middle.
 214     if local_y_data:
 215         local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
 216
 217     plt.ylim(ymin=local_abs_max * -1, ymax=local_abs_max)
 218
 219     ax = plt.gca()
 220     plt.xticks(x_data, labels, rotation=90, family="monospace")
 221     plt.title(graph_get_title(branch, benchmark_type) + " Delta to previous commit", fontweight="bold")
 222     plt.ylabel("Seconds")
 223     plt.xlabel("Latest commits")
 224     plt.legend()
 225
 226     # Put tick on the right side
 227     ax.tick_params(labeltop=False, labelright=True)
 228
 229     plt.tight_layout()
 230     return
 231
 232 def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
 233     """
 234     Plot the graph using a ratio using first point as reference (0%).
 235     """
 236     reference = 0.01
 237     y_abs_max = 100
 238
 239     if y_data:
 240         reference = y_data[0]
 241
 242     # Transform y_data to a list of ratio for which the reference is the first
 243     # element.
 244     local_y_data = list(map(lambda y: ((y / reference) - 1.0) * 100, y_data))
 245
 246     plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))
 247
 248     # Put latest of other branches for reference as horizontal line.
 249     for l_branch, l_result in latest_values.items():
 250         if not l_result or l_branch == branch:
 251             continue
 252         ratio_l_result = ((l_result / reference) - 1.0) * 100.0
 253         print(
 254             "branch {} branch {} value {} l_result {} reference {}".format(
 255                 branch, l_branch, ratio_l_result, l_result, reference
 256             )
 257         )
 258         plt.axhline(
 259             y=ratio_l_result,
 260             label="Latest {}".format(l_branch),
 261             color=graph_get_color(l_branch),
 262         )
 263
 264     # Draw the reference line.
 265     plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")
 266
 267     # Get max absolute value to align the y axis with zero in the middle.
 268     if local_y_data:
 269         local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
 270         if y_abs_max > 100:
 271             y_abs_max = local_abs_max
 272
 273     plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)
 274
 275     ax = plt.gca()
 276     percent_formatter = PercentFormatter()
 277     ax.yaxis.set_major_formatter(percent_formatter)
 278     ax.yaxis.set_minor_formatter(percent_formatter)
 279     plt.xticks(x_data, labels, rotation=90, family="monospace")
 280     plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
 281     plt.ylabel("Ratio")
 282     plt.xlabel("Latest commits")
 283     plt.legend()
 284
 285     # Put tick on the right side
 286     ax.tick_params(labeltop=False, labelright=True)
 287
 288     plt.tight_layout()
 289     return
 290
 291 def generate_graph(branches, report_name, git_path):
 292
 293     # The PDF document
 294     pdf_pages = PdfPages(report_name)
 295
 296     client = get_client()
 297     branch_results = dict()
 298
 299     # Fetch the results for each branch.
 300     for branch, cutoff in branches.items():
 301         commits = get_git_log(branch, cutoff, git_path)
 302         results = []
 303         with tempfile.TemporaryDirectory() as workdir:
 304             for commit in commits:
 305                 b_results, valid = get_benchmark_results(client, commit, workdir)
 306                 if not b_results or not valid:
 307                     continue
 308                 results.append((commit, b_results))
 309         branch_results[branch] = results
 310
 311     for b_type in BENCHMARK_TYPES:
 312         latest_values = {}
 313         max_len = 0
 314
 315         # Find the maximum size for a series inside our series dataset.
 316         # This is used later to compute the size of the actual plot (pdf).
 317         # While there gather the comparison value used to draw comparison line
 318         # between branches.
 319         for branch, results in branch_results.items():
 320             max_len = max([max_len, len(results)])
 321             if results:
 322                 latest_values[branch] = mean(
 323                     sanitize_dataset(results[-1][1][b_type])[0]
 324                 )
 325             else:
 326                 latest_values[branch] = None
 327
 328         for branch, results in branch_results.items():
 329             # Create a figure instance
 330             if max_len and max_len > 10:
 331                 width = 0.16 * max_len
 332             else:
 333                 width = 11.69
 334
 335             x_data = list(range(len(results)))
 336             y_data = [c[1][b_type] for c in results]
 337             labels = [c[0][:8] for c in results]
 338
 339             fig = plt.figure(figsize=(width, 8.27), dpi=100)
 340             plot_raw_value(branch, b_type, x_data, y_data, labels, latest_values)
 341             pdf_pages.savefig(fig)
 342
 343             # Use the mean of each sanitize dataset here, we do not care for
 344             # variance for ratio. At least not yet.
 345             y_data = [mean(sanitize_dataset(c[1][b_type])[0]) for c in results]
 346             fig = plt.figure(figsize=(width, 8.27), dpi=100)
 347             plot_ratio(branch, b_type, x_data, y_data, labels, latest_values)
 348             pdf_pages.savefig(fig)
 349
 350             fig = plt.figure(figsize=(width, 8.27), dpi=100)
 351             plot_delta_between_point(branch, b_type, x_data, y_data, labels, latest_values)
 352             pdf_pages.savefig(fig)
 353
 354     pdf_pages.close()
 355
 356
 357 def launch_jobs(branches, git_path, wait_for_completion, debug, force):
 358     """
 359     Lauch jobs for all missing results.
 360     """
 361     client = get_client()
 362     for branch, cutoff in branches.items():
 363         commits = get_git_log(branch, cutoff, git_path)
 364
 365         with tempfile.TemporaryDirectory() as workdir:
 366             for commit in commits:
 367                 b_results = get_benchmark_results(client, commit, workdir)[0]
 368                 if b_results and not force:
 369                     continue
 370                 lava_submit.submit(
 371                     commit, wait_for_completion=wait_for_completion, debug=debug
 372                 )
 373
 374
 375 def main():
 376     """
 377     Parse arguments and execute as needed.
 378     """
 379     bt_branches = {
 380         "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
 381         "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
 382         "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
 383     }
 384
 385     parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
 386     parser.add_argument(
 387         "--generate-jobs", action="store_true", help="Generate and send jobs"
 388     )
 389     parser.add_argument(
 390         "--force-jobs", action="store_true", help="Force the queueing of jobs to lava"
 391     )
 392     parser.add_argument(
 393         "--do-not-wait-on-completion",
 394         action="store_true",
 395         default=False,
 396         help="Wait for the completion of each jobs sent. This is useful"
 397         "for the ci. Otherwise we could end up spaming the lava instance.",
 398     )
 399     parser.add_argument(
 400         "--generate-report",
 401         action="store_true",
 402         help="Generate graphs and save them to pdf",
 403     )
 404     parser.add_argument(
 405         "--report-name", default="report.pdf", help="The name of the pdf report."
 406     )
 407     parser.add_argument(
 408         "--debug", action="store_true", default=False, help="Do not send jobs to lava."
 409     )
 410     parser.add_argument(
 411         "--repo-path", help="The location of the git repo to use.", required=True
 412     )
 413
 414     args = parser.parse_args()
 415
 416     if not os.path.exists(args.repo_path):
 417         print("Repository location does not exists.")
 418         return 1
 419
 420     if args.generate_jobs:
 421         print("Launching jobs for:")
 422
 423         for branch, cutoff in bt_branches.items():
 424             print("\t Branch {} with cutoff {}".format(branch, cutoff))
 425
 426         launch_jobs(
 427             bt_branches,
 428             args.repo_path,
 429             not args.do_not_wait_on_completion,
 430             args.debug,
 431             args.force_jobs,
 432         )
 433
 434     if args.generate_report:
 435         print("Generating pdf report ({}) for:".format(args.report_name))
 436         for branch, cutoff in bt_branches.items():
 437             print("\t Branch {} with cutoff {}".format(branch, cutoff))
 438         generate_graph(bt_branches, args.report_name, args.repo_path)
 439
 440     return 0
 441
 442
 443 def sanitize_dataset(dataset):
 444     """
 445     Use IRQ 1.5 [1] to remove outlier from the dataset. This is useful to get a
 446     representative mean without outlier in it.
 447     [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
 448     """
 449     sorted_data = sorted(dataset)
 450     q1, q3 = numpy.percentile(sorted_data, [25, 75])
 451     iqr = q3 - q1
 452     lower_bound = q1 - (1.5 * iqr)
 453     upper_bound = q3 + (1.5 * iqr)
 454     new_dataset = []
 455     outliers = []
 456     for i in dataset:
 457         if lower_bound <= i <= upper_bound:
 458             new_dataset.append(i)
 459         else:
 460             outliers.append(i)
 461     return new_dataset, outliers
 462
 463
# Run main() only when the file is executed as a script; its return value
# becomes the exit status of the process.
if __name__ == "__main__":
    sys.exit(main())