| 1 | #!/usr/bin/python3 |
| 2 | # Copyright (C) 2019 - Jonathan Rajotte <jonathan.rajotte-julien@efficios.com> |
| 3 | # |
| 4 | # This program is free software: you can redistribute it and/or modify |
| 5 | # it under the terms of the GNU General Public License as published by |
| 6 | # the Free Software Foundation, either version 3 of the License, or |
| 7 | # (at your option) any later version. |
| 8 | # |
| 9 | # This program is distributed in the hope that it will be useful, |
| 10 | # but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 11 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 12 | # GNU General Public License for more details. |
| 13 | # |
| 14 | # You should have received a copy of the GNU General Public License |
| 15 | # along with this program. If not, see <http://www.gnu.org/licenses/>. |
| 16 | |
| 17 | import json |
| 18 | import os |
| 19 | import tempfile |
| 20 | from statistics import mean |
| 21 | import argparse |
| 22 | import sys |
| 23 | from operator import add |
| 24 | |
| 25 | import matplotlib.pyplot as plt |
| 26 | from matplotlib.backends.backend_pdf import PdfPages |
| 27 | from matplotlib.ticker import PercentFormatter |
| 28 | |
| 29 | import git |
| 30 | import numpy |
| 31 | import lava_submit |
| 32 | |
| 33 | from minio import Minio |
| 34 | from minio.error import NoSuchKey |
| 35 | from minio.error import ResponseError |
| 36 | |
| 37 | |
# Benchmark types; one result object is looked up per type and per commit.
BENCHMARK_TYPES = ["dummy", "text"]
# Bucket used for every object download/removal done by this script.
DEFAULT_BUCKET = "lava"

# Commits that are never submitted as benchmark jobs. Each hash is listed
# exactly once.
invalid_commits = {
    "ec9a9794af488a9accce7708a8b0d8188b498789",  # Does not build
    "8c99128c640cbce71fb8a6caa15e4c672252b662",  # Block on configure
    "f3847c753f1b4f12353c38d97b0577d9993d19fb",  # Does not build
    "e0111295f17ddfcc33ec771a8deac505473a06ad",  # Does not build
    "d0d4e0ed487ea23aaf0d023513c0a4d86901b79b",  # Does not build
    "c24f7ab4dd9edeb5e50b0070fd9d9e8691057dde",  # Does not build
    "ce67f5614a4db3b2de4d887eca52135b439b4937",  # Does not build
    "80aff5efc66679fd934cef433c0e698694748385",  # Does not build
    "f4f11e84942d36fcc8a597d226928bce2ccac4b3",  # Does not build
    "ae466a6e1b856d96cf5112a371b4df2b732503ec",  # Does not build
    "ade5c95e2a4f90f839f222fc1a66175b3b199922",  # Configuration fails
    "30341532906d62808e9d66fb115f5edb4e6f5706",  # Configuration fails
    "006c5ffb42f32e802136e3c27a63accb59b4d6c4",  # Does not build
    "88488ff5bdcd7679ff1f04fe6cff0d24b4f8fc0c",  # Does not build
    # Other errors
    "7c7301d5827bd10ec7c34da7ffc5fe74e5047d38",
    "a0df3abf88616cb0799f87f4eb57c54268e63448",
    "b7045dd71bc0524ad6b5db96df365e98e237d395",
    "cf7b259eaa602abcef308d2b5dd8e6c9ee995d8b",
    "90a55a4ef47cac7b568f5f0a8a78bd760f82d23c",
    "baa5e3aa82a82c9d0fa59e3c586c0168bb5dc267",
    "af9f8da7ba4a9b16fc36d637b8c3a0c7a8774da2",
    "fe748379adbd385efdfc7acae9c2340fb8b7d717",
    "929627965e33e06dc77254d81e8ec1d66cc06590",
    "48a0e52c4632a60cd43423f2f34f10de350bf868",
    "b7fa35fce415b33207a9eba111069ed31ef122a0",
    "828c8a25785e0cedaeb6987256a4dfc3c43b982f",
    "213489680861e4d796173513effac7023312ec2d",
    "430a5ccbbd15782501ca56bb148f3850126277ad",
    "629d19044c43b195498d0a4e002906c54b6186d5",
    "c423217ed1640b4152739f7e5613775d46c25050",
    # Elfutils
    "776a2a252c9875caa1e8b4f41cb8cc12c79611c3",
    "435aa29aff0527d36aafa1b657ae70b9db5f9ea5",
    "95651695473495501fc6b2c4a1cf6a78cfb3cd6a",
    "e0748fb2ba8994c136bcc0b67d3044f09841cf8e",
    "9e632b22e1310fe773edc32ab08a60602f4b2861",
    "271fb6907a6f4705a1c799d925394243eae51d68",
    "328342cd737582216dc7b8b7d558b2a1bf8ea5e8",
    "ae5c1a4481be68fae027910b141354c1d86daa64",
    "e6938018975e45d35dab5fef795fe7344eef7d62",
    "e015bae2ef343b30c890eebb9182a8be13d12ed0",
    "5e8a0751ae0c418a615025d1da10bc84f91b3d97",
    "887d26fa0fd0ae0c5c15e4b885473c4cdc0bf078",
    "e97fe75eac59fc39a6e4f3c4f9f3301835a0315e",
    "8b130e7f1d6a41fb5c64a014c15246ba74b79470",
    "f4f8f79893b18199b38edc3330093a9403c4c737",
}
| 93 | |
def json_type(string):
    """
    Argparse type for JSON arguments.

    Decode ``string`` as JSON and return the resulting dictionary.
    Raise argparse.ArgumentTypeError when the string is not valid JSON or
    when the decoded value is not a dictionary.
    """
    try:
        passed_json = json.loads(string)
    except ValueError as err:
        # json.JSONDecodeError is a ValueError subclass. Re-raise it as an
        # ArgumentTypeError so the user is told what is wrong with the value.
        msg = "%r is not valid JSON: %s" % (string, err)
        raise argparse.ArgumentTypeError(msg) from err
    if not isinstance(passed_json, dict):
        msg = "%r is not a dict" % string
        raise argparse.ArgumentTypeError(msg)
    return passed_json
| 104 | |
def graph_get_color(branch):
    """
    Get the color matching the branch.

    The three known branches keep their fixed color. Any other branch name
    (custom branches can be passed with --overwrite-branches-cutoff) gets a
    color picked deterministically from a fallback palette instead of
    raising KeyError.
    """
    color = {"stable-1.5": "red", "stable-2.0": "green", "master": "blue"}
    if branch in color:
        return color[branch]

    # The palette avoids the colors of the known branches as well as black,
    # which the plots use for outliers and for the reference line.
    fallback = ["orange", "purple", "brown", "magenta", "cyan", "olive"]
    # Sum of the encoded bytes: stable across runs, unlike the builtin hash().
    return fallback[sum(branch.encode("utf-8")) % len(fallback)]
| 111 | |
| 112 | |
def graph_get_title(branch, benchmark_type):
    """
    Build the title of a graph: the branch name followed by the human
    readable description of the benchmark type.
    """
    descriptions = dict(dummy="Dummy output", text="Text output")
    # An unknown benchmark type raises KeyError.
    description = descriptions[benchmark_type]
    return "{} - {}".format(branch, description)
| 119 | |
| 120 | |
def get_client():
    """
    Build and return the Minio client used by this script.
    """
    # NOTE(review): the endpoint and the credentials are hard-coded in the
    # source; consider moving them to configuration.
    endpoint = "obj.internal.efficios.com"
    credentials = {"access_key": "jenkins", "secret_key": "echo123456"}
    return Minio(endpoint, **credentials)
| 128 | |
| 129 | |
def get_file(client, prefix, file_name, workdir_name):
    """
    Download the object "<prefix>/<file_name>" of the default bucket into
    the workdir_name directory.

    Return the local path of the downloaded file, or None when the object
    does not exist on the remote (NoSuchKey).
    """
    local_path = os.path.join(workdir_name, file_name)
    remote_name = "{}/{}".format(prefix, file_name)
    try:
        client.fget_object(DEFAULT_BUCKET, remote_name, local_path)
    except NoSuchKey:
        return None
    else:
        return local_path
| 143 | |
| 144 | |
def delete_file(client, prefix, file_name):
    """
    Remove the object "<prefix>/<file_name>" from the default bucket.

    A ResponseError is printed and otherwise ignored; a missing object
    (NoSuchKey) is silently ignored.
    """
    target = "{}/{}".format(prefix, file_name)
    try:
        client.remove_object(DEFAULT_BUCKET, target)
    except ResponseError as error:
        # Best effort: report the failure and carry on.
        print(error)
    except NoSuchKey:
        # Nothing to remove.
        return
| 156 | |
| 157 | |
def get_git_log(bt_version, cutoff, repo_path):
    """
    Return an ordered (older to newer) list of the commit hashes in the
    range cutoff..origin/bt_version.

    An empty range yields an empty list.
    WARNING: this runs `git fetch` in the repository located at repo_path.
    """
    repo = git.Repo(repo_path)
    repo.git.fetch()
    log = repo.git.log(
        "{}..origin/{}".format(cutoff, bt_version), "--pretty=format:%H", "--reverse"
    )
    # splitlines() returns [] for an empty log, whereas split("\n") would
    # return [""], i.e. one bogus empty commit hash.
    return log.splitlines()
| 168 | |
| 169 | |
def parse_result(result_path):
    """
    Load the JSON result file and return, for each sample, the sum of its
    user time and its system time.
    """
    with open(result_path) as result_file:
        content = json.load(result_file)

    user_times = content["User time (seconds)"]
    system_times = content["System time (seconds)"]
    # Pairing stops at the shorter of the two series.
    return [user + system for user, system in zip(user_times, system_times)]
| 183 | |
| 184 | |
def get_benchmark_results(client, commit, workdir):
    """
    Fetch the benchmark results of a commit across all benchmark types.

    Return a (results, valid) tuple:
    - results maps each benchmark type to its parsed dataset. It is None as
      soon as the result file of one benchmark type cannot be fetched.
    - valid is False when at least one fetched dataset contains only zeros.
    """
    results = {}
    benchmark_valid = True
    for b_type in BENCHMARK_TYPES:
        prefix = "/results/benchmarks/babeltrace/{}".format(b_type)
        result_file = get_file(client, prefix, commit, workdir)
        if not result_file:
            # Benchmark is either corrupted or not complete.
            return None, benchmark_valid
        results[b_type] = parse_result(result_file)
        if all(i == 0.0 for i in results[b_type]):
            benchmark_valid = False
            # prefix already ends with the benchmark type, do not repeat it.
            print("Invalid benchmark for {}/{}".format(prefix, commit))
    return results, benchmark_valid
| 205 | |
| 206 | |
def plot_raw_value(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph using the raw value.

    x_data holds one x position per commit, y_data the matching dataset
    (list of samples) and labels the matching tick label. latest_values maps
    a branch name to its latest value (or None); the values of the other
    branches are drawn as horizontal reference lines.

    The graph is drawn on the current pyplot figure.
    """
    point_x_data = []
    point_y_data = []
    outlier_x_data = []
    outlier_y_data = []
    for x, dataset in zip(x_data, y_data):
        valid_points, outliers = sanitize_dataset(dataset)
        point_x_data.extend([x] * len(valid_points))
        point_y_data.extend(valid_points)
        outlier_x_data.extend([x] * len(outliers))
        outlier_y_data.extend(outliers)

    plt.plot(
        point_x_data, point_y_data, "o", label=branch, color=graph_get_color(branch)
    )
    plt.plot(outlier_x_data, outlier_y_data, "+", label="outlier", color="black")

    # The y axis always starts at zero; only its upper limit depends on the
    # data (20% headroom over the biggest sample, outliers included).
    ymax = 1
    if y_data:
        ymax = 1.2 * max(item for sublist in y_data for item in sublist)

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        plt.axhline(
            y=l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )
        # Keep the reference line inside the visible range.
        if l_result >= ymax:
            ymax = 1.2 * l_result

    ax = plt.gca()
    plt.ylim(ymin=0, ymax=ymax)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("User + system time (s)")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
| 259 | |
| 260 | |
def plot_delta_between_point(
    branch, benchmark_type, x_data, y_data, labels, latest_values
):
    """
    Plot, for each commit, the difference between its value and the value of
    the commit right before it. The first commit has no predecessor, its
    delta is 0.

    latest_values is not used by this graph.
    """
    # One 0.0 for the first point (if any), then the pairwise differences.
    deltas = [0.0 for _ in y_data[:1]]
    deltas.extend(
        current - previous for previous, current in zip(y_data, y_data[1:])
    )

    plt.plot(x_data, deltas, "o", label=branch, color=graph_get_color(branch))

    # Symmetric y axis so that zero sits in the middle: use the biggest
    # absolute delta plus 30% of headroom, or 100 when there is no data.
    if deltas:
        half_range = abs(max(deltas, key=abs)) * 1.3
    else:
        half_range = 100
    plt.ylim(ymin=-half_range, ymax=half_range)

    axes = plt.gca()
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    title = graph_get_title(branch, benchmark_type) + " Delta to previous commit"
    plt.title(title, fontweight="bold")
    plt.ylabel("Seconds")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Also label the ticks on the right side.
    axes.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
| 302 | |
| 303 | |
def plot_ratio(branch, benchmark_type, x_data, y_data, labels, latest_values):
    """
    Plot the graph as a ratio, using the first point as reference (0%).

    x_data holds one x position per commit, y_data the matching value and
    labels the matching tick label. latest_values maps a branch name to its
    latest value (or None); the values of the other branches are drawn as
    horizontal lines, expressed relative to the same reference.

    The graph is drawn on the current pyplot figure.
    """
    reference = 0.01
    y_abs_max = 100

    # Keep the fallback reference when there is no data or when the first
    # point is zero: dividing by a zero reference would raise.
    if y_data and y_data[0]:
        reference = y_data[0]

    # Transform y_data to a list of ratio for which the reference is the first
    # element.
    local_y_data = [((y / reference) - 1.0) * 100 for y in y_data]

    plt.plot(x_data, local_y_data, "o", label=branch, color=graph_get_color(branch))

    # Put latest of other branches for reference as horizontal line.
    for l_branch, l_result in latest_values.items():
        if not l_result or l_branch == branch:
            continue
        ratio_l_result = ((l_result / reference) - 1.0) * 100.0
        print(
            "branch {} branch {} value {} l_result {} reference {}".format(
                branch, l_branch, ratio_l_result, l_result, reference
            )
        )
        plt.axhline(
            y=ratio_l_result,
            label="Latest {}".format(l_branch),
            color=graph_get_color(l_branch),
        )

    # Draw the reference line.
    plt.axhline(y=0, label="Reference (leftmost point)", linestyle="-", color="Black")

    # Get max absolute value to align the y axis with zero in the middle.
    # The axis spans at least +/-100% and grows when the data exceeds it.
    if local_y_data:
        local_abs_max = abs(max(local_y_data, key=abs)) * 1.3
        if local_abs_max > y_abs_max:
            y_abs_max = local_abs_max

    plt.ylim(ymin=y_abs_max * -1, ymax=y_abs_max)

    ax = plt.gca()
    percent_formatter = PercentFormatter()
    ax.yaxis.set_major_formatter(percent_formatter)
    ax.yaxis.set_minor_formatter(percent_formatter)
    plt.xticks(x_data, labels, rotation=90, family="monospace")
    plt.title(graph_get_title(branch, benchmark_type), fontweight="bold")
    plt.ylabel("Ratio")
    plt.xlabel("Latest commits")
    plt.legend()
    plt.grid(True)

    # Put tick on the right side
    ax.tick_params(labeltop=False, labelright=True)

    plt.tight_layout()
| 363 | |
| 364 | |
def generate_graph(branches, report_name, git_path):
    """
    Generate the pdf report named report_name.

    branches maps a branch name to its cutoff commit and git_path is the
    location of the git repository used to list the commits. For each
    benchmark type and each branch, three pages are written: the raw
    values, the ratio to the first commit and the delta between sequential
    commits. Commits without complete and valid results are skipped.
    """
    client = get_client()
    branch_results = dict()

    # Fetch the results for each branch.
    for branch, cutoff in branches.items():
        commits = get_git_log(branch, cutoff, git_path)
        results = []
        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                b_results, valid = get_benchmark_results(client, commit, workdir)
                if not b_results or not valid:
                    continue
                results.append((commit, b_results))
        branch_results[branch] = results

    # The PDF document. The context manager closes it even when plotting
    # fails midway.
    with PdfPages(report_name) as pdf_pages:
        for b_type in BENCHMARK_TYPES:
            latest_values = {}
            max_len = 0

            # Find the maximum size for a series inside our series dataset.
            # This is used later to compute the size of the actual plot (pdf).
            # While there gather the comparison value used to draw comparison
            # line between branches.
            for branch, results in branch_results.items():
                max_len = max(max_len, len(results))
                if results:
                    latest_values[branch] = mean(
                        sanitize_dataset(results[-1][1][b_type])[0]
                    )
                else:
                    latest_values[branch] = None

            # Every page of a benchmark type shares the same width: it grows
            # with the longest series, otherwise the default width is used
            # (11.69 x 8.27 inches, presumably A4 landscape).
            if max_len > 10:
                width = 0.16 * max_len
            else:
                width = 11.69
            figsize = (width, 8.27)

            for branch, results in branch_results.items():
                x_data = list(range(len(results)))
                y_data = [c[1][b_type] for c in results]
                labels = [c[0][:8] for c in results]

                fig = plt.figure(figsize=figsize, dpi=100)
                plot_raw_value(branch, b_type, x_data, y_data, labels, latest_values)
                pdf_pages.savefig(fig)
                # Release the figure once saved, pyplot keeps it alive otherwise.
                plt.close(fig)

                # Use the mean of each sanitize dataset here, we do not care
                # for variance for ratio. At least not yet.
                y_data = [mean(sanitize_dataset(c[1][b_type])[0]) for c in results]
                fig = plt.figure(figsize=figsize, dpi=100)
                plot_ratio(branch, b_type, x_data, y_data, labels, latest_values)
                pdf_pages.savefig(fig)
                plt.close(fig)

                fig = plt.figure(figsize=figsize, dpi=100)
                plot_delta_between_point(
                    branch, b_type, x_data, y_data, labels, latest_values
                )
                pdf_pages.savefig(fig)
                plt.close(fig)
| 431 | |
| 432 | |
def launch_jobs(branches, git_path, wait_for_completion, debug, force):
    """
    Launch jobs for all missing results.

    branches maps a branch name to its cutoff commit and git_path is the
    location of the git repository used to list the commits. A commit is
    submitted when its results cannot be fetched, or unconditionally when
    force is set. Commits listed in invalid_commits are never submitted.
    wait_for_completion and debug are passed through to lava_submit.submit.

    Each commit is submitted once, in the order it was first encountered.
    """
    client = get_client()
    # A list keeps the submission order deterministic; the set makes the
    # "already queued" check cheap.
    commits_to_test = []
    queued = set()
    for branch, cutoff in branches.items():
        commits = [
            x
            for x in get_git_log(branch, cutoff, git_path)
            # Skip blank entries defensively, they are not commits.
            if x and x not in invalid_commits
        ]
        with tempfile.TemporaryDirectory() as workdir:
            for commit in commits:
                if commit in queued:
                    continue
                b_results = get_benchmark_results(client, commit, workdir)[0]
                if b_results and not force:
                    continue
                queued.add(commit)
                commits_to_test.append(commit)

    for index, commit in enumerate(commits_to_test):
        print("Job {}/{}".format(index + 1, len(commits_to_test)))
        lava_submit.submit(
            commit, wait_for_completion=wait_for_completion, debug=debug
        )
| 452 | |
| 453 | |
def main():
    """
    Parse arguments and execute as needed.

    Return 1 when the repository path does not exist, 0 otherwise.
    """
    # Default branches, each with the commit used as cutoff.
    bt_branches = {
        "master": "31976fe2d70a8b6b7f8b31b9e0b3bc004d415575",
        "stable-2.0": "07f585356018b4ddfbd0e09c49a14e38977c6973",
        "stable-1.5": "49e98b837a5667130e0d1e062a6bd7985c7c4582",
    }

    parser = argparse.ArgumentParser(description="Babeltrace benchmark utility")
    parser.add_argument(
        "--generate-jobs", action="store_true", help="Generate and send jobs"
    )
    parser.add_argument(
        "--force-jobs", action="store_true", help="Force the queueing of jobs to lava"
    )
    parser.add_argument(
        "--do-not-wait-on-completion",
        action="store_true",
        default=False,
        help="Do not wait for the completion of each job sent. By default "
        "the script waits, which is useful for the ci. Otherwise we could "
        "end up spamming the lava instance.",
    )
    parser.add_argument(
        "--generate-report",
        action="store_true",
        help="Generate graphs and save them to pdf",
    )
    parser.add_argument(
        "--report-name", default="report.pdf", help="The name of the pdf report."
    )
    parser.add_argument(
        "--debug", action="store_true", default=False, help="Do not send jobs to lava."
    )
    parser.add_argument(
        "--repo-path", help="The location of the git repo to use.", required=True
    )
    parser.add_argument(
        "--overwrite-branches-cutoff",
        # The value is decoded as JSON, hence the double quotes in the example.
        help="A JSON dictionary of the form "
        '{"branch_name": "commit_hash_cutoff", ...}. Allow custom graphing '
        "and jobs generation.",
        required=False,
        type=json_type,
    )

    args = parser.parse_args()

    if args.overwrite_branches_cutoff:
        bt_branches = args.overwrite_branches_cutoff

    if not os.path.exists(args.repo_path):
        print("Repository location does not exists.")
        return 1

    if args.generate_jobs:
        print("Launching jobs for:")

        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))

        launch_jobs(
            bt_branches,
            args.repo_path,
            # launch_jobs expects the positive form of the flag.
            not args.do_not_wait_on_completion,
            args.debug,
            args.force_jobs,
        )

    if args.generate_report:
        print("Generating pdf report ({}) for:".format(args.report_name))
        for branch, cutoff in bt_branches.items():
            print("\t Branch {} with cutoff {}".format(branch, cutoff))
        generate_graph(bt_branches, args.report_name, args.repo_path)

    return 0
| 530 | |
| 531 | |
def sanitize_dataset(dataset):
    """
    Use IQR 1.5 [1] to remove outlier from the dataset. This is useful to get a
    representative mean without outlier in it.

    Return a (valid_values, outliers) tuple of lists, both in the order of
    the input. An empty dataset yields two empty lists.

    [1] https://en.wikipedia.org/wiki/Interquartile_range#Outliers
    """
    dataset = list(dataset)
    if not dataset:
        # There is no percentile to compute on an empty dataset.
        return [], []

    # numpy.percentile does not need sorted input.
    q1, q3 = numpy.percentile(dataset, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (1.5 * iqr)
    upper_bound = q3 + (1.5 * iqr)

    new_dataset = []
    outliers = []
    for i in dataset:
        if lower_bound <= i <= upper_bound:
            new_dataset.append(i)
        else:
            outliers.append(i)
    return new_dataset, outliers
| 551 | |
| 552 | |
# Script entry point: the value returned by main() becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())