[lttng-ci.git] / scripts / babeltrace-benchmark / benchmark.py
#!/usr/bin/python3
# Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

import json
import os
import tempfile
from statistics import mean
import argparse
import sys
from operator import add

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
from matplotlib.ticker import PercentFormatter

import git
import numpy
import lava_submit

from minio import Minio
from minio.error import NoSuchKey
from minio.error import ResponseError


BENCHMARK_TYPES = ["dummy", "text"]
DEFAULT_BUCKET = "lava"


def graph_get_color(branch):
    """
    Get the color matching the branch.
    """
    color = {"stable-1.5": "red", "stable-2.0": "green", "master": "blue"}
    return color[branch]


def graph_get_title(branch, benchmark_type):
    """
    Get the graph title based on the branch and benchmark type.
    """
    string = {"dummy": "Dummy output", "text": "Text output"}
    return "{} - {}".format(branch, string[benchmark_type])


def get_client():
    """
    Return a configured minio client.
    """
    return Minio(
        "obj.internal.efficios.com", access_key="jenkins", secret_key="echo123456"
    )


def get_file(client, prefix, file_name, workdir_name):
    """
    Return the path of the downloaded file.
    Return None if the object does not exist on the remote.
    """
    destination = os.path.join(workdir_name, file_name)
    object_name = "{}/{}".format(prefix, file_name)
    try:
        client.fget_object(DEFAULT_BUCKET, object_name, destination)
    except NoSuchKey:
        return None

    return destination


def delete_file(client, prefix, file_name):
    """
    Delete the file on the remote object store.
    """
    object_name = "{}/{}".format(prefix, file_name)
    try:
        client.remove_object(DEFAULT_BUCKET, object_name)
    except ResponseError as err:
        print(err)
    except NoSuchKey:
        pass


def get_git_log(bt_version, cutoff, repo_path):
    """
    Return an ordered (oldest to newest) list of commit hashes between the
    cutoff and the tip of the bt_version branch.
    WARNING: this modifies the git repository (it runs 'git fetch').
    """
    repo = git.Repo(repo_path)
    repo.git.fetch()
    return repo.git.log(
        "{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
    ).split("\n")


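# Note on get_git_log() above: the repo.git.log(...) call is equivalent to
# running
#   git log <cutoff>..origin/<bt_version> --pretty=format:%H --reverse
# which lists only the commit hashes, oldest first.

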
def parse_result(result_path):
    """
    Parse the result file. Return a dataset of User time + System time.
    """
    with open(result_path) as result:
        parsed_result = json.load(result)
        return list(
            map(
                add,
                parsed_result["User time (seconds)"],
                parsed_result["System time (seconds)"],
            )
        )


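# Note on parse_result() above: each result file is assumed to be a JSON
# object holding one list of samples per field, keyed by names matching GNU
# time's verbose output, e.g. (illustrative values only):
#   {"User time (seconds)": [1.2, 1.3], "System time (seconds)": [0.4, 0.5]}
# The two lists are summed element-wise to yield the total CPU time per run.

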
def get_benchmark_results(client, commit, workdir):
    """
    Fetch the benchmark results of a given commit across all benchmark types.
    """
    results = {}
    benchmark_valid = True
    for b_type in BENCHMARK_TYPES:
        prefix = "/results/benchmarks/babeltrace/{}/".format(b_type)
        result_file = get_file(client, prefix, commit, workdir)
        if not result_file:
            # The benchmark is either corrupted or incomplete.
            return None, benchmark_valid
        results[b_type] = parse_result(result_file)
        if all(i == 0.0 for i in results[b_type]):
            benchmark_valid = False
            print("Invalid benchmark for {}/{}/{}".format(prefix, b_type, commit))
    # All benchmark types were found; return the dataset and its validity.
    return results, benchmark_valid


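# Note on get_benchmark_results() above: on success it returns a tuple such as
#   ({"dummy": [total_time_run_0, ...], "text": [...]}, True)
# and (None, <validity so far>) as soon as a result file is missing.

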
def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using the raw values.
    """
    point_x_data = []
    outlier_x_data = []
    point_y_data = []
    outlier_y_data = []
    for pos in range(len(x_data)):
        x = x_data[pos]
        valid_points, outliers = sanitize_dataset(y_data[pos])
        for y in valid_points:
            point_x_data.append(x)
            point_y_data.append(y)
        for y in outliers:
            outlier_x_data.append(x)
            outlier_y_data.append(y)

    plt.plot(
        point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
    )
    plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")

    ymin = 0
    ymax = 1
    if y_data:
        ymin = 0.8 * min([item for sublist in y_data for item in sublist])
        ymax = 1.2 * max([item for sublist in y_data for item in sublist])
    # Draw the latest value of the other branches as horizontal reference lines.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        plt.axhline(
            y=l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )
        if l_result <= ymin:
            ymin = 0.8 * l_result
        if l_result >= ymax:
            ymax = 1.2 * l_result

    plt.ylim(ymin=ymin, ymax=ymax)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("User + system time (s)")
    plt.xlabel("Latest commits")
    plt.legend()

    plt.tight_layout()
    return


def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph as a ratio, using the first point as the reference (0%).
    """
    reference = 0.01
    y_abs_max = 100

    if y_data:
        reference = y_data[0]

    # Transform y_data into a list of ratios for which the reference is the
    # first element.
    local_y_data = list(map(lambda y: ((y / reference) - 1.0) * 100, y_data))

    plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))

    # Draw the latest value of the other branches as horizontal reference lines.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        ratio_l_result = ((l_result / reference) - 1.0) * 100.0
        print(
            "branch {} branch {} value {} l_result {} reference {}".format(
                branch, l_branch, ratio_l_result, l_result, reference
            )
        )
        plt.axhline(
            y=ratio_l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )

    # Draw the reference line.
    plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="black")

    # Get the max absolute value to center the y axis on zero, expanding the
    # default +/- 100% range when the data exceeds it.
    if local_y_data:
        local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
        if local_abs_max > y_abs_max:
            y_abs_max = local_abs_max

    plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)

    ax = plt.gca()
    percent_formatter = PercentFormatter()
    ax.yaxis.set_major_formatter(percent_formatter)
    ax.yaxis.set_minor_formatter(percent_formatter)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("Ratio")
    plt.xlabel("Latest commits")
    plt.legend()

    plt.tight_layout()
    return


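# Ratio example for plot_ratio() above: with reference = 2.0 s, a point of
# 2.2 s plots as ((2.2 / 2.0) - 1.0) * 100 = +10%, while 1.8 s plots as -10%.

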
def generate_graph(branches, report_name, git_path):

    # The PDF document
    pdf_pages = PdfPages(report_name)

    client = get_client()
    branch_results = dict()

    # Fetch the results for each branch.
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)
        results = []
        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                b_results, valid = get_benchmark_results(client, commit, workdir)
                if not b_results or not valid:
                    continue
                results.append((commit, b_results))
        branch_results[branch] = results

    for b_type in BENCHMARK_TYPES:
        latest_values = {}
        max_len = 0

        # Find the maximum length of a series inside our dataset. It is used
        # later to compute the width of the plot (pdf). While iterating, also
        # gather the latest value of each branch, used to draw the comparison
        # lines between branches.
        for branch, results in branch_results.items():
            max_len = max([max_len, len(results)])
            if results:
                latest_values[branch] = mean(
                    sanitize_dataset(results[-1][1][b_type])[0]
                )
            else:
                latest_values[branch] = None

        for branch, results in branch_results.items():
            # Create a figure instance. 11.69 x 8.27 inches is an A4 landscape
            # page; widen it when there are many commits to plot.
            if max_len and max_len > 10:
                width = 0.16 * max_len
            else:
                width = 11.69

            x_data = list(range(len(results)))
            y_data = [c[1][b_type] for c in results]
            labels = [c[0][:8] for c in results]

            fig = plt.figure(figsize=(width, 8.27), dpi=100)
            plot_raw_value(branch, b_type, x_data, y_data, labels, latest_values)
            pdf_pages.savefig(fig)

            fig = plt.figure(figsize=(width, 8.27), dpi=100)
            # Use the mean of each sanitized dataset here; we do not care about
            # the variance for the ratio plot. At least not yet.
            y_data = [mean(sanitize_dataset(c[1][b_type])[0]) for c in results]
            plot_ratio(branch, b_type, x_data, y_data, labels, latest_values)
            pdf_pages.savefig(fig)

    pdf_pages.close()


def launch_jobs(branches, git_path, wait_for_completion, debug):
    """
    Launch jobs for all missing results.
    """
    client = get_client()
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)

        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                b_results = get_benchmark_results(client, commit, workdir)[0]
                if b_results:
                    continue
                lava_submit.submit(
                    commit, wait_for_completion=wait_for_completion, debug=debug
                )


def main():
    """
    Parse arguments and execute as needed.
    """
    bt_branches = {
        "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
        "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
        "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
    }

    parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
    parser.add_argument(
        "--generate-jobs", action="store_true", help="Generate and send jobs"
    )
    parser.add_argument(
        "--do-not-wait-on-completion",
        action="store_true",
        default=False,
        help="Do not wait for the completion of each job sent. Waiting is "
        "useful for the CI; otherwise we could end up spamming the LAVA "
        "instance.",
    )
    parser.add_argument(
        "--generate-report",
        action="store_true",
        help="Generate graphs and save them to pdf",
    )
    parser.add_argument(
        "--report-name", default="report.pdf", help="The name of the pdf report."
    )
    parser.add_argument(
        "--debug", action="store_true", default=False, help="Do not send jobs to lava."
    )
    parser.add_argument(
        "--repo-path", help="The location of the git repo to use.", required=True
    )

    args = parser.parse_args()

    if not os.path.exists(args.repo_path):
        print("Repository location does not exist.")
        return 1

    if args.generate_jobs:
        print("Launching jobs for:")
        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))
        launch_jobs(
            bt_branches, args.repo_path, not args.do_not_wait_on_completion, args.debug
        )

    if args.generate_report:
        print("Generating pdf report ({}) for:".format(args.report_name))
        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))
        generate_graph(bt_branches, args.report_name, args.repo_path)

    return 0


def sanitize_dataset(dataset):
    """
    Use the 1.5 IQR rule [1] to remove outliers from the dataset. This is
    useful to get a representative mean without outliers in it.
    [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
    """
    sorted_data = sorted(dataset)
    q1, q3 = numpy.percentile(sorted_data, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)
    new_dataset = []
    outliers = []
    for i in dataset:
        if lower_bound <= i <= upper_bound:
            new_dataset.append(i)
        else:
            outliers.append(i)
    return new_dataset, outliers


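# Worked example for sanitize_dataset() above: for [1, 2, 3, 100],
# numpy.percentile gives q1 = 1.75 and q3 = 27.25, so iqr = 25.5 and the
# bounds are [-36.5, 65.5]; the function returns ([1, 2, 3], [100]).

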
if __name__ == "__main__":
    sys.exit(main())
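
# Example invocations (paths are hypothetical, for illustration only):
#   ./benchmark.py --repo-path ~/git/babeltrace --generate-jobs
#   ./benchmark.py --repo-path ~/git/babeltrace --generate-report \
#       --report-name report.pdf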