Commit | Line | Data |
---|---|---|
9a28bc04 KS |
1 | #!/usr/bin/env python3 |
2 | # | |
3 | # SPDX-FileCopyrightText: Kienan Stewart <kstewart@efficios.com> | |
4 | # SPDX-License-Identifier: GPL-2.0-only | |
5 | ||
6 | """ | |
7 | Test that the consumerd doesn't leak file descriptor allocations in /dev/shm | |
8 | when the relayd exits before instrumented applications start. | |
9 | ||
10 | @see https://bugs.lttng.org/issues/1411 | |
11 | """ | |
12 | ||
13 | import os | |
14 | import pathlib | |
15 | import subprocess | |
16 | import sys | |
fbedc3dc | 17 | import time |
9a28bc04 KS |
18 | |
19 | test_utils_import_path = pathlib.Path(__file__).absolute().parents[3] / "utils" | |
20 | sys.path.append(str(test_utils_import_path)) | |
21 | ||
22 | import lttngtest | |
23 | ||
24 | ||
def get_consumerd_pid(tap, parent, match_string):
    """Return the PID of the child of `parent` whose command line matches.

    The lookup is delegated to ``pgrep -P <parent> -f <match_string>``.

    Args:
        tap: Object exposing ``diagnostic(message)``; it receives a message
            when the lookup fails.
        parent: PID of the parent process whose children are searched.
        match_string: Pattern handed to ``pgrep -f``.

    Returns:
        The matching PID as an ``int``, or ``None`` when nothing matched or
        when the lookup failed (a diagnostic is emitted in the failure case).
    """
    pid = None
    try:
        # subprocess.run() reads the pipe to EOF, waits for the child and
        # closes the pipe, which avoids both the leaked stdout descriptor and
        # the wait-before-read deadlock of a bare Popen()/wait()/read().
        result = subprocess.run(
            ["pgrep", "-P", str(parent), "-f", match_string],
            stdout=subprocess.PIPE,
            check=False,  # pgrep exits non-zero when nothing matches
        )
        output = str(result.stdout, encoding="UTF-8").splitlines()
        if len(output) > 1:
            raise Exception(
                "Unexpected number of output lines (got {}): {}".format(
                    len(output), output
                )
            )
        elif len(output) == 1:
            pid = int(output[0])
    except Exception as e:
        # Deliberately best-effort: any failure (pgrep missing, unparsable
        # output, ambiguous match) is reported and treated as "not found".
        tap.diagnostic(
            "Failed to find child process of '{}' matching '{}': '{}'".format(
                parent, match_string, str(e)
            )
        )
    return pid
49 | ||
50 | ||
def count_process_dev_shm_fds(pid):
    """Count the open FDs of `pid` that refer to UST consumer shm files.

    Every entry found under ``/proc/<pid>/fd`` that is a symlink resolving to
    a path starting with ``/dev/shm/shm-ust-consumer`` is counted.

    Args:
        pid: PID of the process to inspect, or ``None``.

    Returns:
        The number of matching descriptors; ``0`` when `pid` is ``None``.
    """
    if pid is None:
        return 0

    shm_prefix = "/dev/shm/shm-ust-consumer"
    fd_dir = os.path.join("/proc", str(pid), "fd")
    matches = 0
    for current_dir, _subdirs, entries in os.walk(fd_dir):
        for entry in entries:
            link = pathlib.Path(os.path.join(current_dir, entry))
            try:
                # The symlink in /proc/PID may exist, but point to an unlinked
                # file - shm_unlink is called but either the kernel hasn't yet
                # finished the clean-up or the consumer hasn't called close()
                # on the FD yet.
                if not link.is_symlink():
                    continue
                target = str(link.resolve())
            except FileNotFoundError:
                # As /proc/XX/fd/ is being walked, fds may be added or removed
                continue
            if target.startswith(shm_prefix):
                matches += 1
    return matches
72 | ||
73 | ||
def count_dev_shm_fds(tap, test_env):
    """Return the /dev/shm FD counts of the 32-bit and 64-bit UST consumers.

    The consumer daemons are looked up as children of the session daemon
    (``test_env._sessiond.pid``) matching "ustconsumerd32" and
    "ustconsumerd64" respectively.

    Args:
        tap: Object passed through to get_consumerd_pid() for diagnostics.
        test_env: Test environment exposing ``_sessiond.pid``.

    Returns:
        A ``(fds_consumerd32, fds_consumerd64)`` tuple of counts.
    """
    return tuple(
        count_process_dev_shm_fds(
            get_consumerd_pid(tap, test_env._sessiond.pid, consumer_name)
        )
        for consumer_name in ("ustconsumerd32", "ustconsumerd64")
    )
80 | ||
81 | ||
def test_fd_leak(
    tap, test_env, buffer_sharing_policy, kill_relayd=True, destroy_timeout=None
):
    """Check that the consumer daemons do not leak /dev/shm descriptors.

    A live network session with one user-space channel is started, the relay
    daemon is optionally terminated, and the test application is run three
    times. The consumers' /dev/shm FD counts are sampled after session start,
    after each application run, and after the session is destroyed.

    Two TAP results are reported: the post-destroy count equals the
    post-start count, and (per-UID buffers only, skipped otherwise) the count
    is stable across the three application runs.

    Args:
        tap: TAP generator used for diagnostics and results.
        test_env: Test environment providing the session and relay daemons.
        buffer_sharing_policy: Buffer sharing policy for the channel.
        kill_relayd: When true, terminate the relay daemon before the
            applications are launched.
        destroy_timeout: Maximum number of seconds to wait for the
            post-destroy count to return to the post-start level. ``None``
            (the default) waits indefinitely, so a real leak hangs the test
            instead of being reported.
    """
    tap.diagnostic(
        "test_fd_leak with buffer sharing policy {}, kill relayd: {}".format(
            buffer_sharing_policy, kill_relayd
        )
    )
    client = lttngtest.LTTngClient(test_env, log=tap.diagnostic)
    output = lttngtest.NetworkSessionOutputLocation(
        "net://localhost:{}:{}/".format(
            test_env.lttng_relayd_control_port, test_env.lttng_relayd_data_port
        )
    )

    session = client.create_session(output=output, live=True)
    channel = session.add_channel(
        lttngtest.lttngctl.TracingDomain.User,
        buffer_sharing_policy=buffer_sharing_policy,
    )
    channel.add_recording_rule(lttngtest.lttngctl.UserTracepointEventRule())
    session.start()

    count_post_start = count_dev_shm_fds(tap, test_env)

    # Kill the relayd
    if kill_relayd:
        test_env._terminate_relayd()

    # Run the test application three times, sampling the FD counts after
    # each run.
    app_counts = []
    for _ in range(3):
        test_env.launch_wait_trace_test_application(10)
        app_counts.append(count_dev_shm_fds(tap, test_env))
    count_post_app1, count_post_app2, count_post_app3 = app_counts

    session.stop()
    session.destroy()

    # As there is no method to know exactly when the final close of the
    # shm happens (it is timing dependent from an external point of view),
    # this test iterates waiting for the post-destroy count to reach the
    # post-start count. Without a destroy_timeout, a failure makes this loop
    # run forever; with one, the loop gives up and the check below fails.
    tap.diagnostic(
        "Waiting for post-destroy shm count to drop back to post-start level"
    )
    deadline = None
    if destroy_timeout is not None:
        deadline = time.monotonic() + destroy_timeout
    while True:
        count_post_destroy = count_dev_shm_fds(tap, test_env)
        if count_post_destroy == count_post_start:
            break
        if deadline is not None and time.monotonic() >= deadline:
            tap.diagnostic(
                "Timed out after {} seconds waiting for post-destroy shm count".format(
                    destroy_timeout
                )
            )
            break
        time.sleep(0.1)

    tap.diagnostic(
        "FD counts post-start: {}, post-destroy: {}".format(
            count_post_start, count_post_destroy
        )
    )
    tap.test(
        count_post_start == count_post_destroy,
        "Count of consumerd FDs in /dev/shm are equal after session start then after destroy",
    )

    tap.diagnostic(
        "FD counts post-app-1: {}, post-app-2: {}, post-app-3: {}".format(
            count_post_app1, count_post_app2, count_post_app3
        )
    )
    if buffer_sharing_policy == lttngtest.lttngctl.BufferSharingPolicy.PerUID:
        tap.test(
            (count_post_app1 == count_post_app2)
            and (count_post_app2 == count_post_app3),
            "Count of consumerd FDs in /dev/shm doesn't leak over several application invocations",
        )
    else:
        tap.skip(
            "Count of consumerds FDs in /dev/shm doesn't leak over several application invocations - no mechanism is available to guarantee buffer reclamation within a given time frame"
        )
159 | ||
160 | ||
# Each scenario below emits two TAP results (a test plus a test-or-skip), and
# there are 2 x 2 scenarios, hence the 8 passed to the generator (presumably
# the planned number of test points - confirm against lttngtest).
tap = lttngtest.TapGenerator(8)

# Every combination of "kill the relayd or not" and buffer sharing policy,
# with the relayd mode varying slowest.
scenarios = [
    (relayd_mode, policy)
    for relayd_mode in (True, False)
    for policy in (
        lttngtest.lttngctl.BufferSharingPolicy.PerUID,
        lttngtest.lttngctl.BufferSharingPolicy.PerPID,
    )
]

for kill_relayd, buffer_sharing_policy in scenarios:
    # A fresh environment (session daemon + relay daemon) per scenario.
    with lttngtest.test_environment(
        with_sessiond=True, with_relayd=True, log=tap.diagnostic
    ) as test_env:
        test_fd_leak(tap, test_env, buffer_sharing_policy, kill_relayd)

exit_status = 0 if tap.is_successful else 1
sys.exit(exit_status)