lava: Run babeltrace-benchmark jobs with /bin/bash
[lttng-ci.git] / scripts / babeltrace-benchmark / benchmark.py
1 #!/usr/bin/python3
2 # Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17 import json
18 import os
19 import tempfile
20 from statistics import mean
21 import argparse
22 import sys
23 from operator import add
24
25 import matplotlib.pyplot as plt
26 from matplotlib.backends.backend_pdf import PdfPages
27 from matplotlib.ticker import PercentFormatter
28
29 import git
30 import numpy
31 import lava_submit
32
33 from minio import Minio
34 from minio.error import NoSuchKey
35 from minio.error import ResponseError
36
37
# Benchmark flavours. Each name is used to build the remote results prefix
# and as a key of the per-commit results dictionary.
BENCHMARK_TYPES = ["dummy", "text"]
# Bucket used for every object store request made by this script.
DEFAULT_BUCKET = "lava"

# Commits for which launch_jobs() never submits a job.
invalid_commits = {
    "ec9a9794af488a9accce7708a8b0d8188b498789",  # Does not build
    "8c99128c640cbce71fb8a6caa15e4c672252b662",  # Block on configure
    "f3847c753f1b4f12353c38d97b0577d9993d19fb",  # Does not build
    "e0111295f17ddfcc33ec771a8deac505473a06ad",  # Does not build
    "d0d4e0ed487ea23aaf0d023513c0a4d86901b79b",  # Does not build
    "c24f7ab4dd9edeb5e50b0070fd9d9e8691057dde",  # Does not build
    "ce67f5614a4db3b2de4d887eca52135b439b4937",  # Does not build
    "80aff5efc66679fd934cef433c0e698694748385",  # Does not build
    "f4f11e84942d36fcc8a597d226928bce2ccac4b3",  # Does not build
}
52
def json_type(string):
    """
    Argparse type for JSON arguments.

    Decode the command line value and make sure its top-level element is a
    dictionary. Raise argparse.ArgumentTypeError otherwise.
    """
    decoded = json.loads(string)
    if isinstance(decoded, dict):
        return decoded
    raise argparse.ArgumentTypeError("%r is not a dict" % string)
63
def graph_get_color(branch):
    """
    Get the color matching the branch.

    The three known branches keep their fixed color. Any other branch name
    (custom names can be passed with --overwrite-branches-cutoff) gets a color
    from a small fallback palette instead of raising KeyError. The pick only
    depends on the name, so a branch keeps the same color on every graph.
    """
    known_colors = {"stable-1.5": "red", "stable-2.0": "green", "master": "blue"}
    if branch in known_colors:
        return known_colors[branch]

    # Stay away from red/green/blue (known branches) and black (outliers and
    # reference line).
    fallback_palette = ["orange", "purple", "brown", "magenta", "cyan", "olive"]
    # hash() of a str is salted per process; a plain byte sum keeps the choice
    # stable from one run to the next.
    return fallback_palette[sum(branch.encode("utf-8")) % len(fallback_palette)]
70
71
def graph_get_title(branch, benchmark_type):
    """
    Build the graph title: the branch name followed by the readable name of
    the benchmark type.
    """
    output_names = {"dummy": "Dummy output", "text": "Text output"}
    readable_type = output_names[benchmark_type]
    return "{} - {}".format(branch, readable_type)
78
79
def get_client():
    """
    Return a minio client configured for the results object store.
    """
    # NOTE(review): the endpoint and the credentials are hard-coded in the
    # source; consider moving them out of the repository.
    endpoint = "obj.internal.efficios.com"
    credentials = {"access_key": "jenkins", "secret_key": "echo123456"}
    return Minio(endpoint, **credentials)
87
88
def get_file(client, prefix, file_name, workdir_name):
    """
    Download the object "prefix/file_name" of the default bucket into
    workdir_name.

    Return the local path of the downloaded file.
    Return None when the object does not exist on the remote.
    """
    remote_name = "{}/{}".format(prefix, file_name)
    local_path = os.path.join(workdir_name, file_name)
    try:
        client.fget_object(DEFAULT_BUCKET, remote_name, local_path)
    except NoSuchKey:
        return None
    else:
        return local_path
102
103
def delete_file(client, prefix, file_name):
    """
    Delete the object "prefix/file_name" from the default bucket.

    A ResponseError is printed and not propagated. A missing object is
    silently ignored.
    """
    try:
        client.remove_object(DEFAULT_BUCKET, "{}/{}".format(prefix, file_name))
    except ResponseError as error:
        print(error)
    except NoSuchKey:
        # Nothing to delete on the remote.
        pass
115
116
def get_git_log(bt_version, cutoff, repo_path):
    """
    Return an ordered (older to newer) list of the commit hashes found in the
    "cutoff..origin/bt_version" range.

    An empty list is returned when the range does not contain any commit.

    WARNING: This runs "git fetch" in the repository located at repo_path.
    """
    repo = git.Repo(repo_path)
    repo.git.fetch()
    log_output = repo.git.log(
        "{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
    )
    # split("\n") turns an empty output into [""], which callers would treat
    # as a commit with an empty hash. splitlines() yields [] in that case.
    return log_output.splitlines()
127
128
def parse_result(result_path):
    """
    Load the JSON result file and return, sample by sample, the sum of the
    user time and of the system time.
    """
    with open(result_path) as result_file:
        content = json.load(result_file)
    user_times = content["User time (seconds)"]
    system_times = content["System time (seconds)"]
    return [user + system for user, system in zip(user_times, system_times)]
142
143
def get_benchmark_results(client, commit, workdir):
    """
    Fetch the benchmark results of a commit across all benchmark types.

    Return a (results, valid) tuple:
      - results is a dict mapping each benchmark type to its parsed dataset,
        or None as soon as the result file of one type cannot be fetched.
      - valid is False when the dataset of at least one fetched type contains
        only zeros (an empty dataset counts as only zeros).
    """
    results = {}
    benchmark_valid = True
    for b_type in BENCHMARK_TYPES:
        prefix = "/results/benchmarks/babeltrace/{}".format(b_type)
        result_file = get_file(client, prefix, commit, workdir)
        if not result_file:
            # Benchmark is either corrupted or not complete.
            return None, benchmark_valid
        results[b_type] = parse_result(result_file)
        if all(i == 0.0 for i in results[b_type]):
            benchmark_valid = False
            # prefix already ends with the benchmark type, do not repeat it.
            print("Invalid benchmark for {}/{}".format(prefix, commit))
    # Every benchmark type has been fetched and parsed.
    return results, benchmark_valid
164
165
def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using the raw value.

    x_data holds one position per commit and y_data the matching datasets.
    Every sample of a dataset is drawn at the position of its commit; the
    samples flagged as outliers by sanitize_dataset() are drawn as black "+".
    The latest value of every other branch found in latest_values is drawn as
    a horizontal reference line.
    """
    point_x_data = []
    point_y_data = []
    outlier_x_data = []
    outlier_y_data = []
    for x, samples in zip(x_data, y_data):
        valid_points, outliers = sanitize_dataset(samples)
        point_x_data.extend([x] * len(valid_points))
        point_y_data.extend(valid_points)
        outlier_x_data.extend([x] * len(outliers))
        outlier_y_data.extend(outliers)

    plt.plot(
        point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
    )
    plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")

    # Leave 20% of headroom above the highest sample. The y axis always starts
    # at zero, so no lower bound is computed.
    ymax = 1
    all_samples = [item for sublist in y_data for item in sublist]
    if all_samples:
        ymax = 1.2 * max(all_samples)

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        plt.axhline(
            y=l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )
        # Keep the reference line inside the visible area.
        if l_result >= ymax:
            ymax = 1.2 * l_result

    ax = plt.gca()
    plt.ylim(ymin=0, ymax=ymax)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("User + system time (s)")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
218
219
def plot_delta_between_point(
    branch, benchmark_type, x_data, y_data, labels, latest_values
):
    """
    Plot, for each commit, the difference between its value and the value of
    the commit right before it.

    latest_values is not used by this function.
    """
    # The first point has no predecessor: its delta is zero.
    deltas = [
        0.0 if index == 0 else current - y_data[index - 1]
        for index, current in enumerate(y_data)
    ]

    plt.plot(x_data, deltas, "o", label=branch, color=graph_get_color(branch))

    # Use the largest absolute delta (plus some margin) on both sides so that
    # zero sits in the middle of the y axis.
    half_range = abs(max(deltas, key=abs)) * 1.3 if deltas else 100
    plt.ylim(ymin=half_range * -1, ymax=half_range)

    plt.xticks(x_data, labels, rotation=90, family="monospace")
    full_title = graph_get_title(branch, benchmark_type) + " Delta to previous commit"
    plt.title(full_title, fontweight="bold")
    plt.ylabel("Seconds")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Put tick on the right side
    plt.gca().tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
261
262
def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using a ratio using first point as reference (0%).

    y_data holds one value per commit. Each value is drawn as its difference,
    in percent, with the first value. The latest value of every other branch
    found in latest_values is drawn as a horizontal line using the same
    reference.
    """
    # Fallback reference used when there is no point to plot.
    reference = 0.01
    # The y axis covers at least -100% to +100%.
    y_abs_max = 100

    if y_data:
        reference = y_data[0]

    # Transform y_data to a list of ratio for which the reference is the first
    # element.
    local_y_data = [((y / reference) - 1.0) * 100 for y in y_data]

    plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        ratio_l_result = ((l_result / reference) - 1.0) * 100.0
        print(
            "branch {} branch {} value {} l_result {} reference {}".format(
                branch, l_branch, ratio_l_result, l_result, reference
            )
        )
        plt.axhline(
            y=ratio_l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )

    # Draw the reference line.
    plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")

    # Get max absolute value to align the y axis with zero in the middle.
    if local_y_data:
        local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
        # Grow the axis only when a point would not fit in the default range.
        # The test must be done on the computed value: y_abs_max itself is
        # still the default at this point.
        if local_abs_max > y_abs_max:
            y_abs_max = local_abs_max

    plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)

    ax = plt.gca()
    percent_formatter = PercentFormatter()
    ax.yaxis.set_major_formatter(percent_formatter)
    ax.yaxis.set_minor_formatter(percent_formatter)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("Ratio")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
322
323
def generate_graph(branches, report_name, git_path):
    """
    Generate the pdf report.

    branches maps a branch name to its cutoff commit, report_name is the path
    of the pdf to write and git_path the location of the git repository.

    For each benchmark type and each branch, three pages are written: the raw
    values, the ratio to the first commit and the delta between sequential
    commits. Commits without complete and valid results are left out.
    """
    client = get_client()
    branch_results = dict()

    # Fetch the results for each branch.
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)
        results = []
        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                b_results, valid = get_benchmark_results(client, commit, workdir)
                if not b_results or not valid:
                    continue
                results.append((commit, b_results))
        branch_results[branch] = results

    # The PDF document. The context manager closes it even if a plot fails.
    with PdfPages(report_name) as pdf_pages:
        for b_type in BENCHMARK_TYPES:
            latest_values = {}
            max_len = 0

            # Find the maximum size for a series inside our series dataset.
            # This is used later to compute the size of the actual plot (pdf).
            # While there gather the comparison value used to draw comparison
            # line between branches.
            for branch, results in branch_results.items():
                max_len = max(max_len, len(results))
                if results:
                    latest_values[branch] = mean(
                        sanitize_dataset(results[-1][1][b_type])[0]
                    )
                else:
                    latest_values[branch] = None

            # Every page of a benchmark type shares the same width: wide
            # enough for the longest series, with a fixed default for short
            # ones (11.69 x 8.27 inches, presumably A4 landscape).
            if max_len > 10:
                width = 0.16 * max_len
            else:
                width = 11.69

            for branch, results in branch_results.items():
                x_data = list(range(len(results)))
                labels = [c[0][:8] for c in results]
                raw_y_data = [c[1][b_type] for c in results]
                # Use the mean of each sanitize dataset here, we do not care
                # for variance for ratio. At least not yet.
                mean_y_data = [
                    mean(sanitize_dataset(c[1][b_type])[0]) for c in results
                ]

                pages = (
                    (plot_raw_value, raw_y_data),
                    (plot_ratio, mean_y_data),
                    (plot_delta_between_point, mean_y_data),
                )
                for plot_function, y_data in pages:
                    # Create a figure instance
                    fig = plt.figure(figsize=(width, 8.27), dpi=100)
                    plot_function(
                        branch, b_type, x_data, y_data, labels, latest_values
                    )
                    pdf_pages.savefig(fig)
                    # pyplot keeps every figure alive until it is closed;
                    # release it once the page is written.
                    plt.close(fig)
390
391
def launch_jobs(branches, git_path, wait_for_completion, debug, force):
    """
    Launch jobs for all missing results.

    For every branch of branches (branch name -> cutoff commit), a job is
    submitted for each commit which is not listed in invalid_commits and for
    which no result could be fetched. When force is set, a job is submitted
    even if results are already available.
    """
    client = get_client()
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)

        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                if commit in invalid_commits:
                    continue
                b_results, _ = get_benchmark_results(client, commit, workdir)
                if force or not b_results:
                    lava_submit.submit(
                        commit, wait_for_completion=wait_for_completion, debug=debug
                    )
410
411
def main():
    """
    Parse arguments and execute as needed.

    Return 0 on success and 1 when the repository location does not exist.
    """
    # Default branches with the commit from which the history is considered.
    bt_branches = {
        "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
        "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
        "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
    }

    parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
    parser.add_argument(
        "--generate-jobs", action="store_true", help="Generate and send jobs"
    )
    parser.add_argument(
        "--force-jobs", action="store_true", help="Force the queueing of jobs to lava"
    )
    parser.add_argument(
        "--do-not-wait-on-completion",
        action="store_true",
        default=False,
        help="Do not wait for the completion of each job sent. Waiting (the "
        "default) is useful for the ci. Otherwise we could end up spamming "
        "the lava instance.",
    )
    parser.add_argument(
        "--generate-report",
        action="store_true",
        help="Generate graphs and save them to pdf",
    )
    parser.add_argument(
        "--report-name", default="report.pdf", help="The name of the pdf report."
    )
    parser.add_argument(
        "--debug", action="store_true", default=False, help="Do not send jobs to lava."
    )
    parser.add_argument(
        "--repo-path", help="The location of the git repo to use.", required=True
    )
    # The value is decoded with json.loads(), hence the double quotes in the
    # documented form.
    parser.add_argument(
        "--overwrite-branches-cutoff",
        help="A JSON dictionary of the form "
        '{"branch_name": "commit_hash_cutoff",...}. Allow custom graphing and '
        "jobs generation.",
        required=False,
        type=json_type,
    )

    args = parser.parse_args()

    if args.overwrite_branches_cutoff:
        bt_branches = args.overwrite_branches_cutoff

    if not os.path.exists(args.repo_path):
        print("Repository location does not exists.")
        return 1

    if args.generate_jobs:
        print("Launching jobs for:")

        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))

        launch_jobs(
            bt_branches,
            args.repo_path,
            not args.do_not_wait_on_completion,
            args.debug,
            args.force_jobs,
        )

    if args.generate_report:
        print("Generating pdf report ({}) for:".format(args.report_name))
        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))
        generate_graph(bt_branches, args.report_name, args.repo_path)

    return 0
488
489
def sanitize_dataset(dataset):
    """
    Split the dataset in two using the 1.5 IQR rule [1]: the values inside
    [Q1 - 1.5 * IQR, Q3 + 1.5 * IQR] on one side, the outliers on the other.
    This is useful to get a representative mean without outlier in it.

    Return a (kept values, outliers) tuple, both in the order of the dataset.

    [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
    """
    first_quartile, third_quartile = numpy.percentile(sorted(dataset), [25, 75])
    spread = 1.5 * (third_quartile - first_quartile)
    low = first_quartile - spread
    high = third_quartile + spread

    kept = [value for value in dataset if low <= value <= high]
    rejected = [value for value in dataset if not (low <= value <= high)]
    return kept, rejected
509
510
# Run main() only when the file is executed as a script and use its return
# value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
This page took 0.048506 seconds and 4 git commands to generate.