Add y scale on the right side of plot
[lttng-ci.git] / scripts / babeltrace-benchmark / benchmark.py
1 #!/usr/bin/python3
2 # Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com>
3 #
4 # This program is free software: you can redistribute it and/or modify
5 # it under the terms of the GNU General Public License as published by
6 # the Free Software Foundation, either version 3 of the License, or
7 # (at your option) any later version.
8 #
9 # This program is distributed in the hope that it will be useful,
10 # but WITHOUT ANY WARRANTY; without even the implied warranty of
11 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 # GNU General Public License for more details.
13 #
14 # You should have received a copy of the GNU General Public License
15 # along with this program. If not, see <http://www.gnu.org/licenses/>.
16
17 import json
18 import os
19 import tempfile
20 from statistics import mean
21 import argparse
22 import sys
23 from operator import add
24
25 import matplotlib.pyplot as plt
26 from matplotlib.backends.backend_pdf import PdfPages
27 from matplotlib.ticker import PercentFormatter
28
29 import git
30 import numpy
31 import lava_submit
32
33 from minio import Minio
34 from minio.error import NoSuchKey
35 from minio.error import ResponseError
36
37
# Benchmark flavours; each one has its own result prefix on the object store
# and its own set of graphs in the report.
BENCHMARK_TYPES = [
    "dummy",
    "text",
]

# Bucket holding the benchmark results on the object store.
DEFAULT_BUCKET = "lava"
40
41
def graph_get_color(branch):
    """
    Return the color name used to draw the given branch.

    Raise KeyError when the branch has no color assigned.
    """
    palette = {
        "stable-1.5": "red",
        "stable-2.0": "green",
        "master": "blue",
    }
    branch_color = palette[branch]
    return branch_color
48
49
def graph_get_title(branch, benchmark_type):
    """
    Build the graph title from the branch name and the benchmark type.

    Raise KeyError when the benchmark type is unknown.
    """
    output_names = {
        "dummy": "Dummy output",
        "text": "Text output",
    }
    output_name = output_names[benchmark_type]
    return "{} - {}".format(branch, output_name)
56
57
def get_client():
    """
    Create the minio client used to talk to the object store.
    """
    # NOTE(review): the endpoint and the credentials are hard-coded in the
    # source; consider moving them to the environment or a config file.
    endpoint = "obj.internal.efficios.com"
    credentials = {"access_key": "jenkins", "secret_key": "echo123456"}
    return Minio(endpoint, **credentials)
65
66
def get_file(client, prefix, file_name, workdir_name):
    """
    Download "<prefix>/<file_name>" from the default bucket into workdir_name.

    Return the local path of the downloaded file, or None when the object
    does not exist on the remote.
    """
    remote_name = "{}/{}".format(prefix, file_name)
    local_path = os.path.join(workdir_name, file_name)
    try:
        client.fget_object(DEFAULT_BUCKET, remote_name, local_path)
    except NoSuchKey:
        # Nothing stored under that name.
        local_path = None
    return local_path
80
81
def delete_file(client, prefix, file_name):
    """
    Remove "<prefix>/<file_name>" from the default bucket.

    A ResponseError is printed and otherwise ignored; a missing object
    (NoSuchKey) is silently ignored.
    """
    remote_name = "{}/{}".format(prefix, file_name)
    try:
        client.remove_object(DEFAULT_BUCKET, remote_name)
    except ResponseError as error:
        print(error)
    except NoSuchKey:
        # Already gone, nothing to delete.
        return
93
94
def get_git_log(bt_version, cutoff, repo_path):
    """
    Return an ordered (older to newer) list of the commit hashes found in the
    range "<cutoff>..origin/<bt_version>".

    An empty list is returned when the range contains no commit.

    WARNING: This runs `git fetch` on the repository located at repo_path.
    """
    repo = git.Repo(repo_path)
    repo.git.fetch()
    revision_range = "{}..origin/{}".format(cutoff, bt_version)
    log_output = repo.git.log(revision_range, "--pretty=format:%H", "--reverse")
    # splitlines() yields [] for an empty output, whereas split("\n") would
    # yield [""] and make callers process a bogus empty commit id.
    return log_output.splitlines()
105
106
def parse_result(result_path):
    """
    Load the JSON result file and return, for each run, the sum of its
    "User time (seconds)" and "System time (seconds)" entries.
    """
    with open(result_path) as result_file:
        content = json.load(result_file)
        user_times = content["User time (seconds)"]
        system_times = content["System time (seconds)"]
        return [user + system for user, system in zip(user_times, system_times)]
120
121
def get_benchmark_results(client, commit, workdir):
    """
    Download and parse the results of a commit for every benchmark type.

    Return a (results, valid) tuple:
      - results maps each benchmark type to its parsed dataset; it is None
        as soon as the result file of one benchmark type cannot be fetched.
      - valid is False when at least one parsed dataset contains only zeros.
    """
    datasets = {}
    is_valid = True
    for b_type in BENCHMARK_TYPES:
        prefix = "/results/benchmarks/babeltrace/{}/".format(b_type)
        local_file = get_file(client, prefix, commit, workdir)
        if not local_file:
            # Benchmark is either corrupted or not complete.
            return None, is_valid

        dataset = parse_result(local_file)
        datasets[b_type] = dataset
        if not any(value != 0.0 for value in dataset):
            is_valid = False
            print("Invalid benchmark for {}/{}/{}".format(prefix, b_type, commit))

    # Every benchmark type has been fetched and parsed.
    return datasets, is_valid
142
143
def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using the raw value.

    branch: name of the branch being plotted.
    benchmark_type: benchmark type, used for the title.
    x_data: x position of each commit.
    y_data: one dataset (list of measurements) per x position.
    labels: x tick label of each commit.
    latest_values: dict mapping a branch name to its latest value (or None);
        the values of the other branches are drawn as horizontal lines.

    The plot is drawn on the current matplotlib figure.
    """
    point_x_data = []
    point_y_data = []
    outlier_x_data = []
    outlier_y_data = []
    for x, samples in zip(x_data, y_data):
        valid_points, outliers = sanitize_dataset(samples)
        point_x_data.extend([x] * len(valid_points))
        point_y_data.extend(valid_points)
        outlier_x_data.extend([x] * len(outliers))
        outlier_y_data.extend(outliers)

    plt.plot(
        point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
    )
    plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")

    # Leave 20% of headroom above the highest measurement (outliers included).
    ymax = 1
    all_samples = [item for sublist in y_data for item in sublist]
    if all_samples:
        ymax = 1.2 * max(all_samples)

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        plt.axhline(
            y=l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )
        # Make sure the reference line stays inside the plot.
        if l_result >= ymax:
            ymax = 1.2 * l_result

    ax = plt.gca()
    plt.ylim(ymin=0, ymax=ymax)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("User + system time (s)")
    plt.xlabel("Latest commits")
    plt.legend()

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
195
196
def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using a ratio using first point as reference (0%).

    branch: name of the branch being plotted.
    benchmark_type: benchmark type, used for the title.
    x_data: x position of each commit.
    y_data: one value per x position; the first one is the reference.
    labels: x tick label of each commit.
    latest_values: dict mapping a branch name to its latest value (or None);
        the values of the other branches are drawn as horizontal lines.

    The plot is drawn on the current matplotlib figure. The y axis is
    symmetric around 0% and spans at least -100% to +100%.
    """
    reference = 0.01
    y_abs_max = 100

    if y_data:
        reference = y_data[0]

    # Transform y_data to a list of ratio for which the reference is the first
    # element.
    local_y_data = [((y / reference) - 1.0) * 100 for y in y_data]

    plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        ratio_l_result = ((l_result / reference) - 1.0) * 100.0
        print(
            "branch {} branch {} value {} l_result {} reference {}".format(
                branch, l_branch, ratio_l_result, l_result, reference
            )
        )
        plt.axhline(
            y=ratio_l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )

    # Draw the reference line.
    plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")

    # Get max absolute value to align the y axis with zero in the middle.
    # The axis only grows past the default +/-100% when a point needs it.
    if local_y_data:
        local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
        if local_abs_max > 100:
            y_abs_max = local_abs_max

    plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)

    ax = plt.gca()
    percent_formatter = PercentFormatter()
    ax.yaxis.set_major_formatter(percent_formatter)
    ax.yaxis.set_minor_formatter(percent_formatter)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("Ratio")
    plt.xlabel("Latest commits")
    plt.legend()

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
255
def generate_graph(branches, report_name, git_path):
    """
    Generate the pdf report holding the graphs of every branch.

    branches: dict mapping a branch name to its cutoff commit.
    report_name: path of the pdf file to write.
    git_path: location of the git repository used to list the commits.

    For each benchmark type and each branch, two pages are produced: the raw
    values and the ratio against the leftmost commit.
    """
    client = get_client()
    branch_results = dict()

    # Fetch the results for each branch.
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)
        results = []
        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                b_results, valid = get_benchmark_results(client, commit, workdir)
                if not b_results or not valid:
                    continue
                results.append((commit, b_results))
        branch_results[branch] = results

    # Find the maximum size for a series inside our series dataset.
    # This is used to compute the size of the actual plot (pdf); it does not
    # depend on the benchmark type so it is computed only once.
    max_len = max((len(results) for results in branch_results.values()), default=0)
    if max_len > 10:
        width = 0.16 * max_len
    else:
        width = 11.69

    # The PDF document. The context manager closes it even when plotting
    # fails.
    with PdfPages(report_name) as pdf_pages:
        for b_type in BENCHMARK_TYPES:
            # Gather the comparison value used to draw comparison line
            # between branches.
            latest_values = {}
            for branch, results in branch_results.items():
                if results:
                    latest_values[branch] = mean(
                        sanitize_dataset(results[-1][1][b_type])[0]
                    )
                else:
                    latest_values[branch] = None

            for branch, results in branch_results.items():
                x_data = list(range(len(results)))
                y_data = [c[1][b_type] for c in results]
                labels = [c[0][:8] for c in results]

                fig = plt.figure(figsize=(width, 8.27), dpi=100)
                plot_raw_value(branch, b_type, x_data, y_data, labels, latest_values)
                pdf_pages.savefig(fig)
                # Release the figure once saved so they do not pile up.
                plt.close(fig)

                fig = plt.figure(figsize=(width, 8.27), dpi=100)
                # Use the mean of each sanitize dataset here, we do not care for
                # variance for ratio. At least not yet.
                y_data = [mean(sanitize_dataset(c[1][b_type])[0]) for c in results]
                plot_ratio(branch, b_type, x_data, y_data, labels, latest_values)
                pdf_pages.savefig(fig)
                plt.close(fig)
316
317
def launch_jobs(branches, git_path, wait_for_completion, debug):
    """
    Launch a job for every commit that has no benchmark results yet.

    branches maps a branch name to its cutoff commit; wait_for_completion and
    debug are forwarded to lava_submit.submit().
    """
    client = get_client()
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)

        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                existing_results, _ = get_benchmark_results(client, commit, workdir)
                if not existing_results:
                    lava_submit.submit(
                        commit, wait_for_completion=wait_for_completion, debug=debug
                    )
334
335
def main():
    """
    Parse arguments and execute as needed.

    Return the process exit code: 1 when the repository path does not exist,
    0 otherwise.
    """
    # Branch name -> cutoff commit (only commits after it are considered).
    bt_branches = {
        "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
        "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
        "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
    }

    parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
    parser.add_argument(
        "--generate-jobs", action="store_true", help="Generate and send jobs"
    )
    parser.add_argument(
        "--do-not-wait-on-completion",
        action="store_true",
        default=False,
        help="Do not wait for the completion of each job sent. By default "
        "each job is waited on, which is useful for the CI; otherwise we "
        "could end up spamming the LAVA instance.",
    )
    parser.add_argument(
        "--generate-report",
        action="store_true",
        help="Generate graphs and save them to pdf",
    )
    parser.add_argument(
        "--report-name", default="report.pdf", help="The name of the pdf report."
    )
    parser.add_argument(
        "--debug", action="store_true", default=False, help="Do not send jobs to lava."
    )
    parser.add_argument(
        "--repo-path", help="The location of the git repo to use.", required=True
    )

    args = parser.parse_args()

    if not os.path.exists(args.repo_path):
        print("Repository location does not exists.")
        return 1

    if args.generate_jobs:
        print("Launching jobs for:")
        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))
        # The command line flag is negative while launch_jobs() expects the
        # positive form.
        launch_jobs(
            bt_branches, args.repo_path, not args.do_not_wait_on_completion, args.debug
        )

    if args.generate_report:
        print("Generating pdf report ({}) for:".format(args.report_name))
        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))
        generate_graph(bt_branches, args.report_name, args.repo_path)

    return 0
393
394
def sanitize_dataset(dataset):
    """
    Use the 1.5 * IQR rule [1] to split the outliers from the dataset. This is
    useful to get a representative mean without outlier in it.

    Return a (valid_points, outliers) tuple of lists; both keep the order of
    the original dataset. An empty dataset yields two empty lists.

    [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
    """
    if len(dataset) == 0:
        # No percentile can be computed on an empty dataset.
        return [], []

    q1, q3 = numpy.percentile(dataset, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)

    new_dataset = []
    outliers = []
    for value in dataset:
        if lower_bound <= value <= upper_bound:
            new_dataset.append(value)
        else:
            outliers.append(value)
    return new_dataset, outliers
414
415
# Script entry point: the value returned by main() is the process exit code.
if __name__ == "__main__":
    exit_code = main()
    sys.exit(exit_code)
This page took 0.038061 seconds and 4 git commands to generate.