c7dc999b7b87eb9e04ba1bb61b168583d68dd056
[lttng-ci.git] / scripts / babeltrace-benchmark / benchmark.py
1 #!/usr/bin/python3
2 # Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17 import json
18 import os
19 import tempfile
20 from statistics import mean
21 import argparse
22 import sys
23 from operator import add
24
25 import matplotlib.pyplot as plt
26 from matplotlib.backends.backend_pdf import PdfPages
27 from matplotlib.ticker import PercentFormatter
28
29 import git
30 import numpy
31 import lava_submit
32
33 from minio import Minio
34 from minio.error import NoSuchKey
35 from minio.error import ResponseError
36
37
# Benchmark flavours. Each one has its own result prefix on the object store
# and gets its own set of graphs in the report.
BENCHMARK_TYPES = ["dummy", "text"]
# Bucket used for every download/removal done through the minio client.
DEFAULT_BUCKET = "lava"
40
41
def json_type(string):
    """
    Argparse type for json arguments.
    The decoded value must be a dictionary, which is returned as is. Anything
    else is rejected with an argparse.ArgumentTypeError.
    """
    decoded = json.loads(string)
    if isinstance(decoded, dict):
        return decoded
    raise argparse.ArgumentTypeError("%r is not a dict" % string)
52
def graph_get_color(branch):
    """
    Get the color matching the branch.

    The well-known branches keep their dedicated color. Any other branch name
    (e.g. one passed with --overwrite-branches-cutoff) gets a color picked from
    a fallback palette instead of raising KeyError. The pick only depends on
    the branch name, hence a branch has the same color on every graph.
    """
    color = {"stable-1.5": "red", "stable-2.0": "green", "master": "blue"}
    if branch in color:
        return color[branch]

    # Colors distinct from the dedicated ones and from the black used for the
    # outliers and the reference line.
    fallback_palette = ["orange", "purple", "brown", "magenta", "cyan", "olive", "gray"]
    # Sum of the encoded bytes: stable across runs, unlike the builtin hash().
    index = sum(str(branch).encode("utf-8")) % len(fallback_palette)
    return fallback_palette[index]
59
60
def graph_get_title(branch, benchmark_type):
    """
    Build the title of a graph from the branch name and the benchmark type.
    """
    descriptions = {"dummy": "Dummy output", "text": "Text output"}
    description = descriptions[benchmark_type]
    return "{} - {}".format(branch, description)
67
68
def get_client(
    endpoint="obj.internal.efficios.com", access_key="jenkins", secret_key="echo123456"
):
    """
    Return a configured minio client.

    Each parameter defaults to the value that used to be hard-coded here, so
    calling get_client() without argument behaves as before.

    endpoint: host of the object storage service.
    access_key: access key used by the client.
    secret_key: secret key used by the client.
    """
    # NOTE(review): the default credentials are committed in clear text. Callers
    # that can get them from a safer place should pass them explicitly.
    return Minio(endpoint, access_key=access_key, secret_key=secret_key)
76
77
def get_file(client, prefix, file_name, workdir_name):
    """
    Download the object "<prefix>/<file_name>" into workdir_name.
    Return the path of the downloaded file, or None when the download raises
    NoSuchKey.
    """
    local_path = os.path.join(workdir_name, file_name)
    remote_name = "{}/{}".format(prefix, file_name)
    try:
        client.fget_object(DEFAULT_BUCKET, remote_name, local_path)
    except NoSuchKey:
        local_path = None
    return local_path
91
92
def delete_file(client, prefix, file_name):
    """
    Remove the object "<prefix>/<file_name>" from the remote bucket.
    A ResponseError is only printed and a NoSuchKey is silently ignored.
    """
    remote_name = "{}/{}".format(prefix, file_name)
    try:
        client.remove_object(DEFAULT_BUCKET, remote_name)
    except ResponseError as error:
        print(error)
    except NoSuchKey:
        # Nothing to remove.
        return
104
105
def get_git_log(bt_version, cutoff, repo_path):
    """
    Return an ordered (older to newer) list of the commit hashes found in the
    "<cutoff>..origin/<bt_version>" range of the repository at repo_path.

    An empty list is returned when the range contains no commit.
    WARNING: This runs `git fetch` in the repository, hence modifies it.
    """
    repo = git.Repo(repo_path)
    repo.git.fetch()
    log_output = repo.git.log(
        "{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
    )
    # "".split("\n") yields [""], i.e. one bogus commit with an empty hash for
    # which a job would then be submitted. splitlines() yields [] instead.
    return log_output.splitlines()
116
117
def parse_result(result_path):
    """
    Load the json result file and return its dataset: the element-wise sum of
    the "User time (seconds)" and "System time (seconds)" series.
    """
    with open(result_path) as result_file:
        timings = json.load(result_file)
    user_times = timings["User time (seconds)"]
    system_times = timings["System time (seconds)"]
    return [user + system for user, system in zip(user_times, system_times)]
131
132
def get_benchmark_results(client, commit, workdir):
    """
    Fetch the benchmark results of a commit across all benchmark types.

    Return a (results, valid) tuple:
      results: dict of benchmark type to dataset, or None as soon as the
               result file of one benchmark type could not be downloaded.
      valid: False when at least one of the datasets parsed so far contains
             only zeros, True otherwise.
    """
    results = {}
    benchmark_valid = True
    for b_type in BENCHMARK_TYPES:
        prefix = "/results/benchmarks/babeltrace/{}/".format(b_type)
        result_file = get_file(client, prefix, commit, workdir)
        if not result_file:
            # Benchmark is either corrupted or not complete.
            return None, benchmark_valid
        results[b_type] = parse_result(result_file)
        if all(i == 0.0 for i in results[b_type]):
            benchmark_valid = False
            # The prefix already embeds the benchmark type, do not repeat it.
            print("Invalid benchmark for {}/{}".format(prefix, commit))
    # All benchmark types were fetched, they might still be flagged invalid.
    return results, benchmark_valid
153
154
def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using the raw value.

    branch: name of the branch being plotted.
    benchmark_type: benchmark type, used for the title.
    x_data: x position of each commit.
    y_data: one dataset (list of values) per commit, aligned with x_data.
    labels: x tick label of each commit.
    latest_values: dict of branch name to its latest value (or None); the
                   other branches are drawn as horizontal reference lines.
    """
    point_x_data = []
    point_y_data = []
    outlier_x_data = []
    outlier_y_data = []
    for x, dataset in zip(x_data, y_data):
        valid_points, outliers = sanitize_dataset(dataset)
        # Every value of a commit shares the x position of that commit.
        point_x_data.extend([x] * len(valid_points))
        point_y_data.extend(valid_points)
        outlier_x_data.extend([x] * len(outliers))
        outlier_y_data.extend(outliers)

    plt.plot(
        point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
    )
    plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")

    ymax = 1
    if y_data:
        # Leave some headroom above the highest value, outliers included.
        ymax = 1.2 * max(item for sublist in y_data for item in sublist)
    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        plt.axhline(
            y=l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )
        # Make sure the reference line stays visible.
        if l_result >= ymax:
            ymax = 1.2 * l_result
    ax = plt.gca()
    plt.ylim(ymin=0, ymax=ymax)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("User + system time (s)")
    plt.xlabel("Latest commits")
    plt.legend()

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
206
def plot_delta_between_point(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot, for each commit, the difference between its value and the value of
    the previous commit. The first commit has no predecessor and is drawn at 0.
    latest_values is not used by this graph.
    """
    deltas = [
        0.0 if index == 0 else value - y_data[index - 1]
        for index, value in enumerate(y_data)
    ]

    plt.plot(x_data, deltas, "o", label=branch, color=graph_get_color(branch))

    # Symmetric y axis so that zero sits in the middle; the range follows the
    # largest absolute delta when there is one.
    half_range = 100
    if deltas:
        half_range = max(abs(delta) for delta in deltas) * 1.3
    plt.ylim(ymin=half_range * -1, ymax=half_range)

    axes = plt.gca()
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    title = graph_get_title(branch, benchmark_type) + " Delta to previous commit"
    plt.title(title, fontweight="bold")
    plt.ylabel("Seconds")
    plt.xlabel("Latest commits")
    plt.legend()

    # Tick labels on the right side as well.
    axes.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
242
def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using a ratio using first point as reference (0%).

    branch: name of the branch being plotted.
    benchmark_type: benchmark type, used for the title.
    x_data: x position of each commit.
    y_data: one value per commit, aligned with x_data.
    labels: x tick label of each commit.
    latest_values: dict of branch name to its latest value (or None); the
                   other branches are drawn as horizontal reference lines.
    """
    reference = 0.01
    y_abs_max = 100

    if y_data:
        reference = y_data[0]

    # Transform y_data to a list of ratio for which the reference is the first
    # element.
    local_y_data = [((y / reference) - 1.0) * 100 for y in y_data]

    plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        ratio_l_result = ((l_result / reference) - 1.0) * 100.0
        print(
            "branch {} branch {} value {} l_result {} reference {}".format(
                branch, l_branch, ratio_l_result, l_result, reference
            )
        )
        plt.axhline(
            y=ratio_l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )

    # Draw the reference line.
    plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")

    # Get max absolute value to align the y axis with zero in the middle.
    if local_y_data:
        local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
        # Keep at least +/-100% and only grow the axis when a point would
        # otherwise fall outside of it. The test must be done on the computed
        # maximum: testing y_abs_max (always 100 here) never grows the axis.
        if local_abs_max > 100:
            y_abs_max = local_abs_max

    plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)

    ax = plt.gca()
    percent_formatter = PercentFormatter()
    ax.yaxis.set_major_formatter(percent_formatter)
    ax.yaxis.set_minor_formatter(percent_formatter)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("Ratio")
    plt.xlabel("Latest commits")
    plt.legend()

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
301
def _save_plot(pdf_pages, width, plot_function, *plot_args):
    """
    Draw plot_function(*plot_args) on a new figure, append it to pdf_pages and
    release the figure.
    """
    fig = plt.figure(figsize=(width, 8.27), dpi=100)
    try:
        plot_function(*plot_args)
        pdf_pages.savefig(fig)
    finally:
        # pyplot keeps every figure it creates alive until it is closed.
        plt.close(fig)


def generate_graph(branches, report_name, git_path):
    """
    Generate the pdf report of the benchmark results.

    For each benchmark type and each branch, three pages are produced: the raw
    values, the ratio to the first commit and the delta between commits.

    branches: dict of branch name to cutoff commit.
    report_name: path of the pdf file to write.
    git_path: path of the git repository used to list the commits.
    """
    client = get_client()
    branch_results = dict()

    # Fetch the results for each branch.
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)
        results = []
        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                b_results, valid = get_benchmark_results(client, commit, workdir)
                if not b_results or not valid:
                    continue
                results.append((commit, b_results))
        branch_results[branch] = results

    # The longest series drives the width of every page so that the graphs
    # of all the branches share the same size.
    max_len = max((len(results) for results in branch_results.values()), default=0)
    width = 0.16 * max_len if max_len > 10 else 11.69

    # The PDF document, closed even when a plot fails.
    with PdfPages(report_name) as pdf_pages:
        for b_type in BENCHMARK_TYPES:
            # Gather the comparison value used to draw the comparison line
            # between branches.
            latest_values = {}
            for branch, results in branch_results.items():
                if results:
                    latest_values[branch] = mean(
                        sanitize_dataset(results[-1][1][b_type])[0]
                    )
                else:
                    latest_values[branch] = None

            for branch, results in branch_results.items():
                x_data = list(range(len(results)))
                raw_y_data = [c[1][b_type] for c in results]
                labels = [c[0][:8] for c in results]

                _save_plot(
                    pdf_pages,
                    width,
                    plot_raw_value,
                    branch,
                    b_type,
                    x_data,
                    raw_y_data,
                    labels,
                    latest_values,
                )

                # Use the mean of each sanitize dataset here, we do not care for
                # variance for ratio. At least not yet.
                mean_y_data = [
                    mean(sanitize_dataset(c[1][b_type])[0]) for c in results
                ]
                _save_plot(
                    pdf_pages,
                    width,
                    plot_ratio,
                    branch,
                    b_type,
                    x_data,
                    mean_y_data,
                    labels,
                    latest_values,
                )
                _save_plot(
                    pdf_pages,
                    width,
                    plot_delta_between_point,
                    branch,
                    b_type,
                    x_data,
                    mean_y_data,
                    labels,
                    latest_values,
                )
366
367
def launch_jobs(branches, git_path, wait_for_completion, debug, force):
    """
    Launch jobs for all missing results.
    A job is submitted for each commit without results; with force, a job is
    submitted for every commit.
    """
    client = get_client()
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)

        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                b_results, _ = get_benchmark_results(client, commit, workdir)
                if force or not b_results:
                    lava_submit.submit(
                        commit, wait_for_completion=wait_for_completion, debug=debug
                    )
384
385
def main():
    """
    Parse arguments and execute as needed.

    Return the exit status of the program: 1 when the repository path does not
    exist, 0 otherwise.
    """
    # Default branches with the commit from which each one is benchmarked.
    bt_branches = {
        "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
        "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
        "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
    }

    parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
    parser.add_argument(
        "--generate-jobs", action="store_true", help="Generate and send jobs"
    )
    parser.add_argument(
        "--force-jobs", action="store_true", help="Force the queueing of jobs to lava"
    )
    parser.add_argument(
        "--do-not-wait-on-completion",
        action="store_true",
        default=False,
        help="Do not wait for the completion of each job sent. Waiting is "
        "useful for the ci, otherwise we could end up spamming the lava "
        "instance.",
    )
    parser.add_argument(
        "--generate-report",
        action="store_true",
        help="Generate graphs and save them to pdf",
    )
    parser.add_argument(
        "--report-name", default="report.pdf", help="The name of the pdf report."
    )
    parser.add_argument(
        "--debug", action="store_true", default=False, help="Do not send jobs to lava."
    )
    parser.add_argument(
        "--repo-path", help="The location of the git repo to use.", required=True
    )
    parser.add_argument(
        "--overwrite-branches-cutoff",
        # The value is decoded with json.loads, hence the double quotes.
        help="A JSON dictionary of the form {"
        '"branch_name": "commit_hash_cutoff",...}. Allow custom graphing and '
        "jobs generation.",
        required=False,
        type=json_type,
    )

    args = parser.parse_args()

    if args.overwrite_branches_cutoff:
        bt_branches = args.overwrite_branches_cutoff

    if not os.path.exists(args.repo_path):
        print("Repository location does not exists.")
        return 1

    if args.generate_jobs:
        print("Launching jobs for:")

        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))

        launch_jobs(
            bt_branches,
            args.repo_path,
            # The option is negative while launch_jobs expects the positive.
            not args.do_not_wait_on_completion,
            args.debug,
            args.force_jobs,
        )

    if args.generate_report:
        print("Generating pdf report ({}) for:".format(args.report_name))
        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))
        generate_graph(bt_branches, args.report_name, args.repo_path)

    return 0
462
463
def sanitize_dataset(dataset):
    """
    Split the dataset using the 1.5 * IQR rule [1]. This is useful to get a
    representative mean without outlier in it.
    Return a (values within the bounds, outliers) tuple of lists, both in the
    order of the dataset.
    [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
    """
    ordered = sorted(dataset)
    first_quartile, third_quartile = numpy.percentile(ordered, [25, 75])
    spread = 1.5 * (third_quartile - first_quartile)
    low = first_quartile - spread
    high = third_quartile + spread
    kept = [value for value in dataset if low <= value <= high]
    rejected = [value for value in dataset if not low <= value <= high]
    return kept, rejected
483
484
# Only run when executed as a script; the value returned by main() is used as
# the exit status of the process.
if __name__ == "__main__":
    sys.exit(main())
This page took 0.039635 seconds and 3 git commands to generate.