formatting: black benchmark.py
[lttng-ci.git] / scripts / babeltrace-benchmark / benchmark.py
1 #!/usr/bin/python3
2 # Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17 import json
18 import os
19 import tempfile
20 from statistics import mean
21 import argparse
22 import sys
23 from operator import add
24
25 import matplotlib.pyplot as plt
26 from matplotlib.backends.backend_pdf import PdfPages
27 from matplotlib.ticker import PercentFormatter
28
29 import git
30 import numpy
31 import lava_submit
32
33 from minio import Minio
34 from minio.error import NoSuchKey
35 from minio.error import ResponseError
36
37
# Benchmark flavours. Each one has its own result prefix on the object store
# and its own set of graphs in the generated report.
BENCHMARK_TYPES = ["dummy", "text"]
# Bucket used for every object download/removal done by this script.
DEFAULT_BUCKET = "lava"
40
41
def json_type(string):
    """
    Argparse type callback for JSON arguments.

    Decode `string` as JSON and return the decoded value. Only a top-level
    JSON object (a dict) is accepted; any other JSON value is rejected with
    an argparse.ArgumentTypeError.
    """
    decoded = json.loads(string)
    if isinstance(decoded, dict):
        return decoded
    raise argparse.ArgumentTypeError("%r is not a dict" % string)
52
def graph_get_color(branch):
    """
    Get the color matching the branch.

    The three well-known branches keep their fixed colors. Any other branch
    name (custom branches can be supplied through
    --overwrite-branches-cutoff) gets a color picked deterministically from a
    fallback palette instead of raising KeyError.
    """
    color = {"stable-1.5": "red", "stable-2.0": "green", "master": "blue"}
    if branch in color:
        return color[branch]

    # Fallback palette: avoids the colors above as well as black, which the
    # graphs use for outliers and for the reference line.
    fallback = ["orange", "purple", "brown", "magenta", "cyan", "olive"]
    # Sum of code points rather than hash(): str hashes are randomized per
    # process and a branch should keep the same color across runs.
    return fallback[sum(map(ord, str(branch))) % len(fallback)]
59
60
def graph_get_title(branch, benchmark_type):
    """
    Build a graph title: the branch name followed by a readable name for the
    benchmark type. Unknown benchmark types raise KeyError.
    """
    readable = {"dummy": "Dummy output", "text": "Text output"}[benchmark_type]
    return "{} - {}".format(branch, readable)
67
68
def get_client():
    """
    Return a configured minio client.

    The endpoint and credentials default to the values historically
    hard-coded here. Each of them can be overridden through the environment
    variables BT_BENCHMARK_MINIO_ENDPOINT, BT_BENCHMARK_MINIO_ACCESS_KEY and
    BT_BENCHMARK_MINIO_SECRET_KEY.
    """
    # NOTE(review): the default secret below is committed in clear text.
    # Prefer supplying it through the environment and rotating this key.
    env = os.environ
    return Minio(
        env.get("BT_BENCHMARK_MINIO_ENDPOINT", "obj.internal.efficios.com"),
        access_key=env.get("BT_BENCHMARK_MINIO_ACCESS_KEY", "jenkins"),
        secret_key=env.get("BT_BENCHMARK_MINIO_SECRET_KEY", "echo123456"),
    )
76
77
def get_file(client, prefix, file_name, workdir_name):
    """
    Download the object "<prefix>/<file_name>" from the default bucket into
    the directory workdir_name.

    Return the local path of the downloaded file, or None when the object
    does not exist on the remote (NoSuchKey).
    """
    local_path = os.path.join(workdir_name, file_name)
    remote_name = "{}/{}".format(prefix, file_name)
    try:
        client.fget_object(DEFAULT_BUCKET, remote_name, local_path)
    except NoSuchKey:
        local_path = None
    return local_path
91
92
def delete_file(client, prefix, file_name):
    """
    Remove the object "<prefix>/<file_name>" from the default bucket.

    A ResponseError is printed and otherwise ignored; a missing object
    (NoSuchKey) is silently ignored.
    """
    remote_name = "{}/{}".format(prefix, file_name)
    try:
        client.remove_object(DEFAULT_BUCKET, remote_name)
    except ResponseError as error:
        print(error)
    except NoSuchKey:
        # Nothing to delete on the remote.
        pass
104
105
def get_git_log(bt_version, cutoff, repo_path):
    """
    Return an ordered (older to newer) list of commit hashes in the range
    cutoff..origin/bt_version.

    Side effect: runs `git fetch` in the repository at repo_path first.
    An empty range yields an empty list.
    """
    repo = git.Repo(repo_path)
    repo.git.fetch()
    log_output = repo.git.log(
        "{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
    )
    # splitlines() rather than split("\n"): splitting an empty output on "\n"
    # gives [""], which callers would process as a commit with an empty hash.
    return log_output.splitlines()
116
117
def parse_result(result_path):
    """
    Load the JSON result file at result_path and return a list holding, for
    each sample, the sum of its user time and system time.
    """
    with open(result_path) as result_file:
        data = json.load(result_file)
    user_times = data["User time (seconds)"]
    system_times = data["System time (seconds)"]
    return [user + system for user, system in zip(user_times, system_times)]
131
132
def get_benchmark_results(client, commit, workdir):
    """
    Fetch the benchmark results of a commit across all benchmark types.

    Return a (results, valid) tuple:
      - results maps each benchmark type to its dataset as returned by
        parse_result(). It is None as soon as one benchmark type has no
        usable result file (missing on the remote, or not parseable).
      - valid is False when at least one of the datasets parsed so far
        contains only zeros.
    """
    results = {}
    benchmark_valid = True
    for b_type in BENCHMARK_TYPES:
        prefix = "/results/benchmarks/babeltrace/{}/".format(b_type)
        result_file = get_file(client, prefix, commit, workdir)
        if not result_file:
            # Benchmark is not complete.
            return None, benchmark_valid
        try:
            results[b_type] = parse_result(result_file)
        except (ValueError, KeyError, TypeError) as err:
            # Corrupted result file (invalid JSON, missing or malformed
            # timing entries): report it like a missing result instead of
            # aborting the whole run.
            print("Corrupted benchmark for {}/{}: {!r}".format(b_type, commit, err))
            return None, benchmark_valid
        if all(i == 0.0 for i in results[b_type]):
            benchmark_valid = False
            print("Invalid benchmark for {}/{}/{}".format(prefix, b_type, commit))
    # Every benchmark type has a dataset; validity is reported separately.
    return results, benchmark_valid
153
154
def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using the raw value.

    x_data, y_data and labels are parallel sequences: y_data[i] is the list
    of samples for the commit drawn at x_data[i], and labels[i] is its tick
    label. Samples reported as outliers by sanitize_dataset() are drawn as
    black "+" markers. latest_values maps each branch to its latest value
    (or a falsy value when it has none); the values of the other branches
    are drawn as horizontal reference lines.
    """
    point_x_data = []
    point_y_data = []
    outlier_x_data = []
    outlier_y_data = []
    for x, samples in zip(x_data, y_data):
        valid_points, outliers = sanitize_dataset(samples)
        point_x_data.extend([x] * len(valid_points))
        point_y_data.extend(valid_points)
        outlier_x_data.extend([x] * len(outliers))
        outlier_y_data.extend(outliers)

    plt.plot(
        point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
    )
    plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")

    # Top of the y axis: 20% above the largest sample. The axis always starts
    # at zero, so no lower bound is computed.
    ymax = 1
    all_samples = [item for sublist in y_data for item in sublist]
    if all_samples:
        ymax = 1.2 * max(all_samples)

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        plt.axhline(
            y=l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )
        # Keep the reference line inside the visible range.
        if l_result >= ymax:
            ymax = 1.2 * l_result

    ax = plt.gca()
    plt.ylim(ymin=0, ymax=ymax)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("User + system time (s)")
    plt.xlabel("Latest commits")
    plt.legend()

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
206
207
def plot_delta_between_point(
    branch, benchmark_type, x_data, y_data, labels, latest_values
):
    """
    Plot the graph of delta between each sequential commit.

    y_data holds one value per commit, parallel to x_data and labels. The
    first commit has no predecessor and is plotted with a delta of 0.
    latest_values is not used by this graph.
    """
    local_abs_max = 100

    # Delta of each value against the previous one; 0.0 for the first.
    local_y_data = []
    if y_data:
        local_y_data.append(0.0)
        local_y_data.extend(cur - prev for prev, cur in zip(y_data, y_data[1:]))

    plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))

    # Get max absolute value to align the y axis with zero in the middle.
    # When every delta is zero (e.g. a single commit) keep the default range:
    # identical lower and upper limits would give a degenerate axis.
    largest_delta = max(map(abs, local_y_data), default=0)
    if largest_delta:
        local_abs_max = largest_delta * 1.3

    plt.ylim(ymin=local_abs_max * -1, ymax=local_abs_max)

    ax = plt.gca()
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(
        graph_get_title(branch, benchmark_type) + " Delta to previous commit",
        fontweight="bold",
    )
    plt.ylabel("Seconds")
    plt.xlabel("Latest commits")
    plt.legend()

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
248
249
def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using a ratio using first point as reference (0%).

    y_data holds one value per commit, parallel to x_data and labels; every
    value is expressed as a percentage of difference to y_data[0].
    latest_values maps each branch to its latest value (or a falsy value
    when it has none); the values of the other branches are drawn as
    horizontal lines using the same reference.

    The y axis is symmetric around 0% and spans at least +/-100%; it is
    widened when a plotted point falls outside of that range.
    """
    reference = 0.01
    y_abs_max = 100

    if y_data:
        reference = y_data[0]

    # Transform y_data to a list of ratio for which the reference is the first
    # element.
    local_y_data = [((y / reference) - 1.0) * 100 for y in y_data]

    plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        ratio_l_result = ((l_result / reference) - 1.0) * 100.0
        print(
            "branch {} branch {} value {} l_result {} reference {}".format(
                branch, l_branch, ratio_l_result, l_result, reference
            )
        )
        plt.axhline(
            y=ratio_l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )

    # Draw the reference line.
    plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")

    # Get max absolute value to align the y axis with zero in the middle.
    # The comparison must be done on the computed maximum: comparing the
    # default against itself would never widen the axis.
    if local_y_data:
        local_abs_max = max(map(abs, local_y_data)) * 1.3
        if local_abs_max > y_abs_max:
            y_abs_max = local_abs_max

    plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)

    ax = plt.gca()
    percent_formatter = PercentFormatter()
    ax.yaxis.set_major_formatter(percent_formatter)
    ax.yaxis.set_minor_formatter(percent_formatter)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("Ratio")
    plt.xlabel("Latest commits")
    plt.legend()

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
308
309
def generate_graph(branches, report_name, git_path):
    """
    Generate the PDF report report_name.

    branches maps each branch name to its cutoff commit. For every benchmark
    type and every branch, three pages are written: raw values, ratio to the
    first commit, and delta to the previous commit. Only commits with a
    complete and valid benchmark are graphed.
    """
    # The PDF document. The context manager closes it even when fetching or
    # plotting fails.
    with PdfPages(report_name) as pdf_pages:
        client = get_client()
        branch_results = _fetch_branch_results(client, branches, git_path)

        for b_type in BENCHMARK_TYPES:
            latest_values = {}
            max_len = 0

            # Find the maximum size for a series inside our series dataset.
            # This is used later to compute the size of the actual plot (pdf).
            # While there gather the comparison value used to draw comparison
            # line between branches.
            for branch, results in branch_results.items():
                max_len = max(max_len, len(results))
                if results:
                    latest_values[branch] = mean(
                        sanitize_dataset(results[-1][1][b_type])[0]
                    )
                else:
                    latest_values[branch] = None

            # Same page width for every branch of this benchmark type.
            width = 0.16 * max_len if max_len > 10 else 11.69

            for branch, results in branch_results.items():
                x_data = list(range(len(results)))
                labels = [c[0][:8] for c in results]
                raw_y_data = [c[1][b_type] for c in results]
                # Use the mean of each sanitized dataset for the ratio and
                # delta graphs; variance is not represented there.
                mean_y_data = [mean(sanitize_dataset(d)[0]) for d in raw_y_data]

                _save_plot(
                    pdf_pages, width, plot_raw_value,
                    branch, b_type, x_data, raw_y_data, labels, latest_values,
                )
                _save_plot(
                    pdf_pages, width, plot_ratio,
                    branch, b_type, x_data, mean_y_data, labels, latest_values,
                )
                _save_plot(
                    pdf_pages, width, plot_delta_between_point,
                    branch, b_type, x_data, mean_y_data, labels, latest_values,
                )


def _fetch_branch_results(client, branches, git_path):
    """
    Return {branch: [(commit, results), ...]} holding, in commit order, the
    results of every commit that has a complete and valid benchmark.
    """
    branch_results = dict()
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)
        results = []
        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                b_results, valid = get_benchmark_results(client, commit, workdir)
                if not b_results or not valid:
                    continue
                results.append((commit, b_results))
        branch_results[branch] = results
    return branch_results


def _save_plot(pdf_pages, width, plot_function, *plot_args):
    """
    Draw one page with plot_function(*plot_args) and append it to pdf_pages.
    """
    fig = plt.figure(figsize=(width, 8.27), dpi=100)
    try:
        plot_function(*plot_args)
        pdf_pages.savefig(fig)
    finally:
        # pyplot keeps every figure alive until it is explicitly closed.
        plt.close(fig)
376
377
def launch_jobs(branches, git_path, wait_for_completion, debug, force):
    """
    Launch jobs for all missing results.

    For every commit of every branch (branches maps branch name to cutoff
    commit), a job is submitted when no benchmark results could be fetched
    for that commit, or unconditionally when force is set.
    wait_for_completion and debug are forwarded to lava_submit.submit().
    """
    client = get_client()
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)

        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                existing, _valid = get_benchmark_results(client, commit, workdir)
                if force or not existing:
                    lava_submit.submit(
                        commit, wait_for_completion=wait_for_completion, debug=debug
                    )
394
395
def main():
    """
    Parse arguments and execute as needed.

    Return the process exit status: 1 when the --repo-path location does not
    exist, 0 otherwise.
    """
    # Default branches and their cutoff commit; replaced as a whole by
    # --overwrite-branches-cutoff when given.
    bt_branches = {
        "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
        "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
        "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
    }

    parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
    parser.add_argument(
        "--generate-jobs", action="store_true", help="Generate and send jobs"
    )
    parser.add_argument(
        "--force-jobs", action="store_true", help="Force the queueing of jobs to lava"
    )
    parser.add_argument(
        "--do-not-wait-on-completion",
        action="store_true",
        default=False,
        help="Do not wait for the completion of each job sent. Waiting (the "
        "default) is useful for the ci. Otherwise we could end up spamming "
        "the lava instance.",
    )
    parser.add_argument(
        "--generate-report",
        action="store_true",
        help="Generate graphs and save them to pdf",
    )
    parser.add_argument(
        "--report-name", default="report.pdf", help="The name of the pdf report."
    )
    parser.add_argument(
        "--debug", action="store_true", default=False, help="Do not send jobs to lava."
    )
    parser.add_argument(
        "--repo-path", help="The location of the git repo to use.", required=True
    )
    parser.add_argument(
        "--overwrite-branches-cutoff",
        help="A dictionary of the form {"
        "'branch_name': 'commit_hash_cutoff',...}. Allow custom graphing and "
        "jobs generation.",
        required=False,
        type=json_type,
    )

    args = parser.parse_args()

    if args.overwrite_branches_cutoff:
        bt_branches = args.overwrite_branches_cutoff

    if not os.path.exists(args.repo_path):
        print("Repository location does not exists.")
        return 1

    if args.generate_jobs:
        print("Launching jobs for:")

        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))

        launch_jobs(
            bt_branches,
            args.repo_path,
            # The command line flag is the negation of the parameter.
            not args.do_not_wait_on_completion,
            args.debug,
            args.force_jobs,
        )

    if args.generate_report:
        print("Generating pdf report ({}) for:".format(args.report_name))
        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))
        generate_graph(bt_branches, args.report_name, args.repo_path)

    return 0
472
473
def sanitize_dataset(dataset):
    """
    Use IQR 1.5 [1] to remove outlier from the dataset. This is useful to get
    a representative mean without outlier in it.

    Return a (values, outliers) tuple of lists, each keeping the order of the
    input: values are the items inside [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR],
    outliers are the others. An empty dataset yields two empty lists.

    [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
    """
    # No quartile can be computed on an empty dataset.
    if len(dataset) == 0:
        return [], []

    # numpy.percentile does not need its input to be sorted.
    q1, q3 = numpy.percentile(dataset, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)

    new_dataset = []
    outliers = []
    for i in dataset:
        if lower_bound <= i <= upper_bound:
            new_dataset.append(i)
        else:
            outliers.append(i)
    return new_dataset, outliers
493
494
# Run main() only when executed as a script, and use its return value as the
# process exit status.
if __name__ == "__main__":
    sys.exit(main())
This page took 0.04212 seconds and 4 git commands to generate.