Add invalid commit skip for babeltrace benchmark
[lttng-ci.git] / scripts / babeltrace-benchmark / benchmark.py
1 #!/usr/bin/python3
2 # Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17 import json
18 import os
19 import tempfile
20 from statistics import mean
21 import argparse
22 import sys
23 from operator import add
24
25 import matplotlib.pyplot as plt
26 from matplotlib.backends.backend_pdf import PdfPages
27 from matplotlib.ticker import PercentFormatter
28
29 import git
30 import numpy
31 import lava_submit
32
33 from minio import Minio
34 from minio.error import NoSuchKey
35 from minio.error import ResponseError
36
37
# Benchmark flavours. Each name is also a path component of the location of
# the result files on the object store (see get_benchmark_results).
BENCHMARK_TYPES = ["dummy", "text"]
# Bucket from which results are downloaded and deleted.
DEFAULT_BUCKET = "lava"

# Commits for which launch_jobs never submits a job.
invalid_commits = {
    "ec9a9794af488a9accce7708a8b0d8188b498789",  # Does not build
    "8c99128c640cbce71fb8a6caa15e4c672252b662",  # Block on configure
    "f3847c753f1b4f12353c38d97b0577d9993d19fb",  # Does not build
    "e0111295f17ddfcc33ec771a8deac505473a06ad",  # Does not build
}
47
def json_type(string):
    """
    Argparse ``type=`` callable for JSON arguments.

    Decode ``string`` as JSON and return the result. The top-level value must
    be a dictionary, otherwise argparse.ArgumentTypeError is raised.
    """
    decoded = json.loads(string)
    if isinstance(decoded, dict):
        return decoded
    raise argparse.ArgumentTypeError("%r is not a dict" % string)
58
def graph_get_color(branch):
    """
    Return the plot color associated with ``branch``.

    A branch without an associated color raises KeyError.
    """
    palette = {
        "stable-1.5": "red",
        "stable-2.0": "green",
        "master": "blue",
    }
    return palette[branch]
65
66
def graph_get_title(branch, benchmark_type):
    """
    Build the graph title from the branch name and the benchmark type.

    An unknown ``benchmark_type`` raises KeyError.
    """
    output_names = {"dummy": "Dummy output", "text": "Text output"}
    output_name = output_names[benchmark_type]
    return "{} - {}".format(branch, output_name)
73
74
def get_client():
    """
    Build and return the Minio client used to reach the result store.
    """
    # NOTE(review): the endpoint and the credentials are hard-coded in the
    # source; consider loading them from the environment or a secret store.
    endpoint = "obj.internal.efficios.com"
    credentials = {"access_key": "jenkins", "secret_key": "echo123456"}
    return Minio(endpoint, **credentials)
82
83
def get_file(client, prefix, file_name, workdir_name):
    """
    Download the object ``prefix``/``file_name`` of the default bucket into
    the ``workdir_name`` directory.

    Return the local path of the downloaded file, or None when the object
    does not exist on the remote.
    """
    remote_name = "{}/{}".format(prefix, file_name)
    local_path = os.path.join(workdir_name, file_name)
    try:
        client.fget_object(DEFAULT_BUCKET, remote_name, local_path)
    except NoSuchKey:
        local_path = None
    return local_path
97
98
def delete_file(client, prefix, file_name):
    """
    Remove the object ``prefix``/``file_name`` from the default bucket.

    A ResponseError is printed and otherwise ignored; a missing object
    (NoSuchKey) is silently ignored.
    """
    remote_name = "{}/{}".format(prefix, file_name)
    try:
        client.remove_object(DEFAULT_BUCKET, remote_name)
    except ResponseError as error:
        print(error)
    except NoSuchKey:
        # Nothing to delete.
        pass
110
111
def get_git_log(bt_version, cutoff, repo_path):
    """
    Return the list of commit hashes (older to newer) found in the range
    ``cutoff..origin/<bt_version>`` of the repository at ``repo_path``.

    A ``git fetch`` is run on the repository before the log is computed.
    An empty list is returned when the range contains no commit.
    """
    repo = git.Repo(repo_path)
    repo.git.fetch()
    log_output = repo.git.log(
        "{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
    )
    # str.split("\n") turns an empty output into [""], which callers would
    # handle as a commit with an empty hash (and submit a job for it).
    # splitlines() returns [] in that case.
    return log_output.splitlines()
122
123
def parse_result(result_path):
    """
    Load the JSON result file at ``result_path`` and return the element-wise
    sum of its "User time (seconds)" and "System time (seconds)" lists.
    """
    with open(result_path) as result_file:
        timings = json.load(result_file)
    user_times = timings["User time (seconds)"]
    system_times = timings["System time (seconds)"]
    return [user + system for user, system in zip(user_times, system_times)]
137
138
def get_benchmark_results(client, commit, workdir):
    """
    Fetch and parse the results of ``commit`` for every benchmark type.

    Return a ``(results, valid)`` tuple. ``results`` maps each benchmark type
    to its parsed dataset, or is None as soon as one result file cannot be
    downloaded. ``valid`` is False when at least one parsed dataset contains
    only zeros.
    """
    datasets = {}
    is_valid = True
    for b_type in BENCHMARK_TYPES:
        prefix = "/results/benchmarks/babeltrace/{}/".format(b_type)
        result_file = get_file(client, prefix, commit, workdir)
        if not result_file:
            # Benchmark is either corrupted or not complete.
            return None, is_valid
        dataset = parse_result(result_file)
        datasets[b_type] = dataset
        if not any(value != 0.0 for value in dataset):
            is_valid = False
            print("Invalid benchmark for {}/{}/{}".format(prefix, b_type, commit))
    return datasets, is_valid
159
160
def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using the raw values.

    ``x_data`` holds one x position per commit and ``y_data`` the matching
    list of measurements for each position. Each list of measurements is
    split with sanitize_dataset: regular values are drawn as dots in the
    branch color, outliers as black crosses.

    ``labels`` are the x tick labels. ``latest_values`` maps a branch name to
    its latest value; the entries of the other branches that are set are
    drawn as horizontal reference lines.
    """
    point_x_data = []
    point_y_data = []
    outlier_x_data = []
    outlier_y_data = []
    for x, dataset in zip(x_data, y_data):
        valid_points, outliers = sanitize_dataset(dataset)
        point_x_data.extend([x] * len(valid_points))
        point_y_data.extend(valid_points)
        outlier_x_data.extend([x] * len(outliers))
        outlier_y_data.extend(outliers)

    plt.plot(
        point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
    )
    plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")

    # The y axis always starts at zero; only the upper limit is computed,
    # with 20% of headroom above the largest measurement.
    ymax = 1
    all_values = [value for dataset in y_data for value in dataset]
    if all_values:
        ymax = 1.2 * max(all_values)

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        plt.axhline(
            y=l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )
        # Keep the reference line inside the visible area.
        if l_result >= ymax:
            ymax = 1.2 * l_result

    ax = plt.gca()
    plt.ylim(ymin=0, ymax=ymax)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("User + system time (s)")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
213
214
def plot_delta_between_point(
    branch, benchmark_type, x_data, y_data, labels, latest_values
):
    """
    Plot, for each commit, the difference between its value and the value of
    the previous commit.

    The first point has no predecessor and is plotted as 0.0.
    ``latest_values`` is not used by this function.
    """
    # Delta of each value against the one right before it.
    deltas = [
        0.0 if index == 0 else value - y_data[index - 1]
        for index, value in enumerate(y_data)
    ]

    plt.plot(x_data, deltas, "o", label=branch, color=graph_get_color(branch))

    # Symmetric y axis, sized from the largest absolute delta, so that zero
    # sits in the middle.
    half_range = 100
    if deltas:
        half_range = abs(max(deltas, key=abs)) * 1.3
    plt.ylim(ymin=half_range * -1, ymax=half_range)

    axes = plt.gca()
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    title = graph_get_title(branch, benchmark_type) + " Delta to previous commit"
    plt.title(title, fontweight="bold")
    plt.ylabel("Seconds")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Also show the tick labels on the right side.
    axes.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
    return
256
257
def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph as a ratio, using the first point as reference (0%).

    ``y_data`` holds one value per x position of ``x_data``; each value is
    plotted as its percentage difference from ``y_data[0]``. ``labels`` are
    the x tick labels. ``latest_values`` maps a branch name to its latest
    value; the entries of the other branches that are set are drawn as
    horizontal lines, expressed against the same reference.

    The y axis is symmetric around zero and spans at least +/-100%.
    """
    reference = 0.01
    y_abs_max = 100

    if y_data:
        reference = y_data[0]

    # Transform y_data to a list of ratio for which the reference is the first
    # element.
    local_y_data = [((y / reference) - 1.0) * 100 for y in y_data]

    plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        ratio_l_result = ((l_result / reference) - 1.0) * 100.0
        print(
            "branch {} branch {} value {} l_result {} reference {}".format(
                branch, l_branch, ratio_l_result, l_result, reference
            )
        )
        plt.axhline(
            y=ratio_l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )

    # Draw the reference line.
    plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")

    # Get max absolute value to align the y axis with zero in the middle.
    if local_y_data:
        local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
        # Grow the axis only when the data does not fit in the default
        # +/-100% range. The comparison has to be made on the computed
        # value: y_abs_max itself starts at 100 and can never exceed it.
        if local_abs_max > 100:
            y_abs_max = local_abs_max

    plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)

    ax = plt.gca()
    percent_formatter = PercentFormatter()
    ax.yaxis.set_major_formatter(percent_formatter)
    ax.yaxis.set_minor_formatter(percent_formatter)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("Ratio")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
317
318
def generate_graph(branches, report_name, git_path):
    """
    Generate the PDF report ``report_name``.

    ``branches`` maps a branch name to its cutoff commit and ``git_path`` is
    the location of the git repository used to list the commits. For each
    benchmark type and each branch, three pages are written: raw values,
    ratio against the first commit and delta between sequential commits.
    Commits with missing or invalid results are left out.
    """
    client = get_client()
    branch_results = {}

    # Fetch the results for each branch.
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)
        results = []
        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                b_results, valid = get_benchmark_results(client, commit, workdir)
                if not b_results or not valid:
                    continue
                results.append((commit, b_results))
        branch_results[branch] = results

    # The page width depends on the longest series only, so it is the same
    # for every page: widen the page once there are more than 10 commits.
    # The default 11.69 x 8.27 inches matches A4 landscape.
    max_len = max((len(results) for results in branch_results.values()), default=0)
    width = 0.16 * max_len if max_len > 10 else 11.69
    figsize = (width, 8.27)

    # The context manager closes the PDF document even when plotting fails.
    with PdfPages(report_name) as pdf_pages:
        for b_type in BENCHMARK_TYPES:
            # Gather the comparison value used to draw comparison line
            # between branches.
            latest_values = {}
            for branch, results in branch_results.items():
                if results:
                    latest_values[branch] = mean(
                        sanitize_dataset(results[-1][1][b_type])[0]
                    )
                else:
                    latest_values[branch] = None

            for branch, results in branch_results.items():
                x_data = list(range(len(results)))
                labels = [c[0][:8] for c in results]
                raw_data = [c[1][b_type] for c in results]
                # Use the mean of each sanitized dataset for the ratio and
                # delta graphs: the variance is not represented there.
                mean_data = [mean(sanitize_dataset(dataset)[0]) for dataset in raw_data]

                pages = (
                    (plot_raw_value, raw_data),
                    (plot_ratio, mean_data),
                    (plot_delta_between_point, mean_data),
                )
                for plot, y_data in pages:
                    fig = plt.figure(figsize=figsize, dpi=100)
                    plot(branch, b_type, x_data, y_data, labels, latest_values)
                    pdf_pages.savefig(fig)
                    # pyplot keeps every figure alive until it is closed;
                    # release it once the page is written.
                    plt.close(fig)
385
386
def launch_jobs(branches, git_path, wait_for_completion, debug, force):
    """
    Launch a job for every commit without results.

    ``branches`` maps a branch name to its cutoff commit and ``git_path`` is
    the location of the git repository used to list the commits. Commits
    listed in ``invalid_commits`` are always skipped. When ``force`` is set,
    a job is submitted even if results already exist.
    ``wait_for_completion`` and ``debug`` are passed through to
    lava_submit.submit.
    """
    client = get_client()
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)

        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                if commit not in invalid_commits:
                    b_results, _ = get_benchmark_results(client, commit, workdir)
                    if force or not b_results:
                        lava_submit.submit(
                            commit,
                            wait_for_completion=wait_for_completion,
                            debug=debug,
                        )
405
406
def main():
    """
    Parse arguments and execute as needed.

    Jobs are launched when ``--generate-jobs`` is passed and the report is
    generated when ``--generate-report`` is passed; both can be combined.
    Return the process exit status: 1 when the repository path does not
    exist, 0 otherwise.
    """
    # Default branches, each mapped to its cutoff commit.
    bt_branches = {
        "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
        "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
        "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
    }

    parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
    parser.add_argument(
        "--generate-jobs", action="store_true", help="Generate and send jobs"
    )
    parser.add_argument(
        "--force-jobs", action="store_true", help="Force the queueing of jobs to lava"
    )
    parser.add_argument(
        "--do-not-wait-on-completion",
        action="store_true",
        default=False,
        help="Do not wait for the completion of each job sent. Waiting is "
        "useful for the ci, otherwise we could end up spamming the lava "
        "instance.",
    )
    parser.add_argument(
        "--generate-report",
        action="store_true",
        help="Generate graphs and save them to pdf",
    )
    parser.add_argument(
        "--report-name", default="report.pdf", help="The name of the pdf report."
    )
    parser.add_argument(
        "--debug", action="store_true", default=False, help="Do not send jobs to lava."
    )
    parser.add_argument(
        "--repo-path", help="The location of the git repo to use.", required=True
    )
    parser.add_argument(
        "--overwrite-branches-cutoff",
        # The value is decoded with json_type, hence the JSON syntax
        # (double quotes) in the example.
        help="A JSON dictionary of the form "
        '{"branch_name": "commit_hash_cutoff", ...}. Allow custom graphing '
        "and jobs generation.",
        required=False,
        type=json_type,
    )

    args = parser.parse_args()

    if args.overwrite_branches_cutoff:
        bt_branches = args.overwrite_branches_cutoff

    if not os.path.exists(args.repo_path):
        print("Repository location does not exists.")
        return 1

    if args.generate_jobs:
        print("Launching jobs for:")

        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))

        launch_jobs(
            bt_branches,
            args.repo_path,
            not args.do_not_wait_on_completion,
            args.debug,
            args.force_jobs,
        )

    if args.generate_report:
        print("Generating pdf report ({}) for:".format(args.report_name))
        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))
        generate_graph(bt_branches, args.report_name, args.repo_path)

    return 0
483
484
def sanitize_dataset(dataset):
    """
    Use the 1.5 * IQR rule [1] to separate the outliers from the dataset.
    This is useful to get a representative mean without outlier in it.

    Return a ``(values, outliers)`` tuple of lists, both in the order of
    ``dataset``. A value is kept when it lies within
    ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``. An empty dataset yields two empty
    lists.

    [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
    """
    # Quartiles are undefined for an empty dataset; do not hand it to numpy.
    if len(dataset) == 0:
        return [], []

    # numpy.percentile does not need its input to be sorted.
    q1, q3 = numpy.percentile(dataset, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)

    new_dataset = []
    outliers = []
    for value in dataset:
        if lower_bound <= value <= upper_bound:
            new_dataset.append(value)
        else:
            outliers.append(value)
    return new_dataset, outliers
504
505
# Run main() only when the file is executed as a script and use its return
# value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
This page took 0.040838 seconds and 4 git commands to generate.