# Copyright 2007-2010 Google Inc.  Released under the GPL v2
2e0493a4af57c1a73376a7bafaed542c01f588196Eric Li__author__ = "duanes (Duane Sand), pdahl (Peter Dahl)"
3e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
# A basic cpuset/cgroup container manager for limiting memory use during tests,
#   for use on kernels not running some site-specific container manager.
6e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
7e0493a4af57c1a73376a7bafaed542c01f588196Eric Liimport os, sys, re, glob, fcntl, logging
8e0493a4af57c1a73376a7bafaed542c01f588196Eric Lifrom autotest_lib.client.bin import utils
9e0493a4af57c1a73376a7bafaed542c01f588196Eric Lifrom autotest_lib.client.common_lib import error
10e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
SUPER_ROOT = ''      # root of all containers or cgroups
NO_LIMIT = (1 << 63) - 1   # containername/memory.limit_in_bytes if no limit

# propio service classes:
PROPIO_PRIO = 1
PROPIO_NORMAL = 2
PROPIO_IDLE = 3

# Module state below is filled in lazily by discover_container_style().
# super_root_path == '' means "not looked up yet".
super_root_path = ''    # usually '/dev/cgroup'; '/dev/cpuset' on 2.6.18
cpuset_prefix   = None  # usually 'cpuset.'; '' on 2.6.18
fake_numa_containers = False # container mem via numa=fake mem nodes, else pages
mem_isolation_on = False
node_mbytes = 0         # mbytes in one typical mem node
root_container_bytes = 0  # squishy limit on effective size of root container
25e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
26e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def discover_container_style():
    """Detect which container filesystem is active and cache the result.

    Sets the module globals super_root_path, cpuset_prefix,
    mem_isolation_on, fake_numa_containers, and then either node_mbytes
    (fake-numa containers) or root_container_bytes (memcg containers).
    The lookup runs once: any later call returns immediately because
    super_root_path is no longer ''.
    """
    global super_root_path, cpuset_prefix
    global mem_isolation_on, fake_numa_containers
    global node_mbytes, root_container_bytes

    if super_root_path != '':
        return  # already looked up

    cgroup_active = os.path.exists('/dev/cgroup/tasks')
    if cgroup_active:
        # running on 2.6.26 or later kernel with containers on:
        super_root_path = '/dev/cgroup'
        cpuset_prefix = 'cpuset.'
        fake_numa_containers = bool(get_boot_numa())
        # memcg containers IFF compiled-in & mounted & non-fakenuma boot
        # TODO: handle possibility of where memcg is mounted as its own
        #       cgroup hierarchy, separate from cpuset??
        mem_isolation_on = fake_numa_containers or os.path.exists(
                '/dev/cgroup/memory.limit_in_bytes')
    elif os.path.exists('/dev/cpuset/tasks'):
        # running on 2.6.18 kernel with containers on:
        super_root_path = '/dev/cpuset'
        cpuset_prefix = ''
        booted_fake_numa = get_boot_numa() != ''
        mem_isolation_on = booted_fake_numa
        fake_numa_containers = booted_fake_numa
    else:
        # neither cpuset nor cgroup filesystem active:
        super_root_path = None
        cpuset_prefix = 'no_cpusets_or_cgroups_exist'
        mem_isolation_on = False
        fake_numa_containers = False

    logging.debug('mem_isolation: %s', mem_isolation_on)
    logging.debug('fake_numa_containers: %s', fake_numa_containers)

    if fake_numa_containers:
        node_mbytes = int(mbytes_per_mem_node())
        return
    if not mem_isolation_on:
        return

    # memcg-style containers.
    # For now, limit total of all containers to using just 98% of system's
    #   visible total ram, to avoid oom events at system level, and avoid
    #   page reclaim overhead from going above kswapd highwater mark.
    # NOTE(review): presumably utils.memtotal() is in kbytes, so >> 2
    #   yields 4K pages and << 12 converts back to bytes -- confirm.
    visible_pages = utils.memtotal() >> 2
    capped_pages = int(visible_pages * 0.98)
    root_container_bytes = capped_pages << 12
    logging.debug('root_container_bytes: %s',
                  utils.human_format(root_container_bytes))
69e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
70e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def need_mem_containers():
    """Raise error.AutotestError unless memory-isolating containers are on.

    Runs discover_container_style() first so mem_isolation_on is current.
    """
    discover_container_style()
    if mem_isolation_on:
        return
    raise error.AutotestError('Mem-isolation containers not enabled '
                              'by latest reboot')
76e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def need_fake_numa():
    """Raise error.AutotestError unless fake-numa containers are in use.

    Runs discover_container_style() first so fake_numa_containers is
    current.
    """
    discover_container_style()
    if not fake_numa_containers:
        # The option is written numa=fake elsewhere in this module; the
        # message previously had the two words swapped.
        raise error.AutotestError('numa=fake not enabled by latest reboot')
81e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
82e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def full_path(container_name):
    """Return container_name joined onto the discovered super-root path."""
    discover_container_style()
    root = super_root_path
    return os.path.join(root, container_name)
86e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
87e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def unpath(container_path):
    """Strip the super-root prefix (and its separator) from a full path.

    Inverse of full_path(): '<super_root_path>/a/b' becomes 'a/b'.
    """
    # Make sure super_root_path has been looked up, as the other path
    # helpers do; otherwise the prefix length would be that of ''.
    discover_container_style()
    prefix_len = len(super_root_path) + 1   # +1 skips the path separator
    return container_path[prefix_len:]
90e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
91e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def cpuset_attr(container_name, attr):
    """Return the path of a cpuset attribute file inside a container.

    The file name is attr with the discovered cpuset_prefix prepended.
    """
    discover_container_style()
    attr_file = cpuset_prefix + attr
    return os.path.join(super_root_path, container_name, attr_file)
95e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
96e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def io_attr(container_name, attr):
    """Return the path of the 'io.<attr>' file inside a container."""
    discover_container_style()
    # current version assumes shared cgroup hierarchy
    attr_file = 'io.' + attr
    return os.path.join(super_root_path, container_name, attr_file)
101e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
102e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def tasks_path(container_name):
    """Return the path of the container's 'tasks' file."""
    container_dir = full_path(container_name)
    return os.path.join(container_dir, 'tasks')
105e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
106e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def mems_path(container_name):
    """Return the path of the container's cpuset 'mems' attribute file."""
    path = cpuset_attr(container_name, 'mems')
    return path
109e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
110e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def memory_path(container_name):
    """Return the '<container>/memory' path prefix under the super root.

    Callers append a suffix such as '.limit_in_bytes' to form a file name.
    """
    # Look up super_root_path first, as cpuset_attr() and io_attr() do;
    # without this a first call would join onto the '' placeholder.
    discover_container_style()
    return os.path.join(super_root_path, container_name, 'memory')
113e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
114e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def cpus_path(container_name):
    """Return the path of the container's cpuset 'cpus' attribute file."""
    path = cpuset_attr(container_name, 'cpus')
    return path
117e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
118e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def container_exists(name):
    """Return True if name is not None and its 'tasks' file exists."""
    if name is None:
        return False
    return os.path.exists(tasks_path(name))
121e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
122e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def move_tasks_into_container(name, tasks):
    """Write each task id in tasks into the 'tasks' file of container name.

    A failure is re-raised only if utils.pid_is_alive() says the task
    still exists; failures for tasks that are gone are ignored.
    """
    destination = tasks_path(name)
    for pid in tasks:
        try:
            logging.debug('moving task %s into container "%s"', pid, name)
            utils.write_one_line(destination, pid)
        except Exception:
            if not utils.pid_is_alive(pid):
                continue    # task is gone or zombie so ignore this exception
            raise   # task exists but couldn't move it
133e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
134e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def move_self_into_container(name):
    """Move the current process into container name and log the move."""
    own_pid = '%d' % os.getpid()
    move_tasks_into_container(name, [own_pid])
    logging.debug('running self (pid %s) in container "%s"', own_pid, name)
139e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
140e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def _avail_mbytes_via_nodes(parent):
    """Return mbytes of mem nodes available for new containers in parent.

    Half of one node's size is subtracted (never going below 0).
    """
    unclaimed_nodes = available_exclusive_mem_nodes(parent)
    total_mbytes = nodes_avail_mbytes(unclaimed_nodes)
    # don't have exact model for how container mgr measures mem space
    # better here to underestimate than overestimate
    safety_margin = node_mbytes // 2
    return max(total_mbytes - safety_margin, 0)
149e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
150e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def _avail_bytes_via_pages(parent):
    """Return bytes of parent's memory not yet given to child containers.

    Starts from container_bytes(parent) and subtracts container_bytes()
    of every direct child that has a memory.limit_in_bytes file.
    """
    remaining = container_bytes(parent)
    pattern = os.path.join(full_path(parent), '*', 'memory.limit_in_bytes')
    children = [unpath(os.path.dirname(limit_file))
                for limit_file in glob.glob(pattern)]
    # This excludes mem previously allocated to existing children.
    return remaining - sum(container_bytes(child) for child in children)
162e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
163e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def avail_mbytes(parent=SUPER_ROOT):
    """Return mbytes available in parent for exclusive use by new containers.

    Uses the mem-node accounting when fake-numa containers are in use,
    otherwise the memory.limit_in_bytes accounting.
    """
    # fake_numa_containers is only meaningful after discovery has run;
    # without this a first call would always take the pages branch.
    discover_container_style()
    if fake_numa_containers:
        return _avail_mbytes_via_nodes(parent)
    return _avail_bytes_via_pages(parent) >> 20
170e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
171e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def delete_leftover_test_containers():
    """Release every container nest found directly under the super root."""
    # recover mems and cores tied up by containers of prior failed tests:
    leftovers = inner_containers_of(SUPER_ROOT)
    for leftover in leftovers:
        _release_container_nest(leftover)
176e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
177e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def my_lock(lockname):
    """Take an exclusive flock on $AUTODIR/.cpuset.lock.<lockname>.

    lockname is e.g. 'inner'.  Returns the open lock file; pass it to
    my_unlock() to release.  Raises KeyError if AUTODIR is not set in
    the environment.
    """
    lockdir = os.environ['AUTODIR']
    lockpath = os.path.join(lockdir, '.cpuset.lock.' + lockname)
    lockfile = open(lockpath, 'w')
    try:
        fcntl.flock(lockfile, fcntl.LOCK_EX)
    except Exception:
        # Don't leak the descriptor if the lock could not be taken.
        lockfile.close()
        raise
    return lockfile
185e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
186e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def my_unlock(lockfile):
    """Release the flock held on lockfile and close it.

    lockfile is the open file returned by my_lock().
    """
    try:
        fcntl.flock(lockfile, fcntl.LOCK_UN)
    finally:
        # Close even if the unlock call fails, so the descriptor is
        # never leaked.
        lockfile.close()
190e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
191e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
# Convert '1-3,7,9-12' to set(1,2,3,7,9,10,11,12)
def rangelist_to_set(rangelist):
    """Parse a comma-separated list of ints and int ranges into a set.

    Each item is either 'N' or 'N-M' (inclusive).  Whitespace around the
    whole string or around an item is ignored.  An empty or None
    rangelist gives an empty set.  Raises ValueError for any item that
    is not in one of the two forms.
    """
    result = set()
    text = (rangelist or '').strip()
    if not text:
        return result
    for item in text.split(','):
        # One pattern covers both forms; group 2 is None for a lone number.
        m = re.match(r'^(\d+)(?:-(\d+))?$', item.strip())
        if not m:
            msg = 'Cannot understand data input: %s %s' % (item, rangelist)
            raise ValueError(msg)
        start = int(m.group(1))
        if m.group(2) is None:
            end = start
        else:
            end = int(m.group(2))
        result.update(range(start, end + 1))
    return result
210e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
211e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def my_container_name():
    """Return the current process's container name from /proc/<pid>/cpuset.

    The leading '/' is stripped, so the root container is ''.
    """
    # Get current process's inherited or self-built container name
    #   within /dev/cpuset or /dev/cgroup.
    cpuset_file = '/proc/%i/cpuset' % os.getpid()
    container = utils.read_one_line(cpuset_file)
    return container[1:]   # strip leading /
217e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
218e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def get_mem_nodes(container_name):
    """Return the set of mem nodes listed in the container's mems file.

    These are all mem nodes now available to the container, both
    exclusive & shared.  A missing mems file gives an empty set.
    """
    mems_file = mems_path(container_name)
    if not os.path.exists(mems_file):
        return set()
    return rangelist_to_set(utils.read_one_line(mems_file))
226e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
227e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def _busy_mem_nodes(parent_container):
    """Return the set of numa mem nodes used by children of parent_container.

    Collects the nodes (exclusive or shared) from every direct child that
    has a cpuset mems file.
    """
    busy = set()
    mem_files_pattern = os.path.join(full_path(parent_container),
                                     '*', cpuset_prefix+'mems')
    for mem_file in glob.glob(mem_files_pattern):
        # get_mem_nodes() takes a container name, not a full path; convert
        # with unpath() as _avail_bytes_via_pages() does, instead of
        # relying on os.path.join() discarding the root for absolute paths.
        child_container = unpath(os.path.dirname(mem_file))
        busy |= get_mem_nodes(child_container)
    return busy
238e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
239e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def available_exclusive_mem_nodes(parent_container):
    """Return parent's mem nodes that no existing child container uses.

    These are the nodes which could be allocated exclusively to new
    child containers.  Raises via need_fake_numa() when fake-numa
    containers are not in use.
    """
    need_fake_numa()
    owned = get_mem_nodes(parent_container)
    # This excludes nodes now allocated to existing children.
    return owned - _busy_mem_nodes(parent_container)
248e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
249e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def my_mem_nodes():
    """Return the set of numa mem nodes owned by this process's container.

    Returns an empty set when memory isolation is off.
    """
    discover_container_style()
    if mem_isolation_on:
        return get_mem_nodes(my_container_name())
    return set()    # as expected by vmstress
256e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
257e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def my_available_exclusive_mem_nodes():
    """Return this process's container's mem nodes not used by its children.

    Same as available_exclusive_mem_nodes() applied to the container the
    current process is running in.
    """
    own_container = my_container_name()
    return available_exclusive_mem_nodes(own_container)
264e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
265e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def node_avail_kbytes(node):
    """Return the size of a mem node in kbytes.

    The node argument is ignored: every node is taken to be node_mbytes.
    """
    # crude; fixed numa node size
    kbytes = node_mbytes << 10
    return kbytes
268e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
269e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def nodes_avail_mbytes(nodes):
    """Return the combined user+avail size of nodes, in Mbytes."""
    total_kbytes = 0
    for node in nodes:
        total_kbytes += node_avail_kbytes(node)
    return total_kbytes // 1024
273e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
274e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def container_bytes(name):
    """Return the memory size of container name, in bytes.

    With fake-numa containers this is the combined size of the
    container's mem nodes.  Otherwise it is the container's
    memory.limit_in_bytes; a container with no limit inherits the limit
    of its nearest limited ancestor, and an unlimited root container
    yields root_container_bytes.
    """
    # fake_numa_containers and root_container_bytes are only meaningful
    # after discovery has run.
    discover_container_style()
    if fake_numa_containers:
        return nodes_avail_mbytes(get_mem_nodes(name)) << 20
    while True:
        limit_file = memory_path(name) + '.limit_in_bytes'
        limit = int(utils.read_one_line(limit_file))
        if limit < NO_LIMIT:
            return limit
        if name == SUPER_ROOT:
            return root_container_bytes
        # No limit set here; walk up to the parent container.
        name = os.path.dirname(name)
287e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
288e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def container_mbytes(name):
    """Return the memory size of container name, in whole Mbytes."""
    size_bytes = container_bytes(name)
    return size_bytes >> 20
291e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
292e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def mbytes_per_mem_node():
    """Return the mbyte size of a standard numa mem node, as a float.

    Some nodes are bigger than this.  Replaces utils.node_size().
    """
    numa = get_boot_numa()
    if numa.endswith('M'):
        # mbyte size of fake nodes
        return float(numa[:-1])
    if numa:
        # fake numa mem nodes for container isolation
        node_count = int(numa)
    else:
        # phys mem-controller nodes
        node_count = len(utils.numa_nodes())
    # Use guessed total physical mem size, not kernel's
    #   lesser 'available memory' after various system tables.
    divisor = node_count * 1024.0
    return utils.rounded_memtotal() / divisor
307e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
308e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def get_cpus(container_name):
    """Return the set of cpus listed in the container's cpus file.

    A missing cpus file gives an empty set.
    """
    cpus_file = cpus_path(container_name)
    if not os.path.exists(cpus_file):
        return set()
    return rangelist_to_set(utils.read_one_line(cpus_file))
315e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
316e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def get_tasks(container_name):
    """Return the lines of the container's 'tasks' file, right-stripped.

    Returns [] if the file cannot be read and no longer exists; an
    IOError on a file that still exists is re-raised.
    """
    file_name = tasks_path(container_name)
    try:
        # 'with' closes the handle promptly instead of leaving it to the
        # garbage collector.
        with open(file_name) as tasks_file:
            tasks = [x.rstrip() for x in tasks_file.readlines()]
    except IOError:
        if os.path.exists(file_name):
            raise
        tasks = []   # container doesn't exist anymore
    return tasks
326e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
327e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def inner_containers_of(parent):
    """Return names of the direct child containers of parent.

    A child is any subdirectory of parent that holds a 'tasks' file;
    names are relative to the super root.
    """
    pattern = os.path.join(full_path(parent), '*/tasks')
    children = []
    for task_file in glob.glob(pattern):
        children.append(unpath(os.path.dirname(task_file)))
    return children
332e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
333e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def _release_container_nest(nest):
    """Destroy container nest and, first, any nested sub-containers.

    Tasks still in the container are moved to its parent before the
    container directory is removed.  Does nothing if nest doesn't exist.
    """
    nest_dir = full_path(nest)
    if not os.path.exists(nest_dir):
        return

    # bottom-up walk of tree, releasing all nested sub-containers
    for inner in inner_containers_of(nest):
        _release_container_nest(inner)

    logging.debug("releasing container %s", nest)

    # Transfer any survivor tasks (e.g. self) to parent container
    survivors = get_tasks(nest)
    move_tasks_into_container(os.path.dirname(nest), survivors)

    # remove the now-empty outermost container of this nest
    if os.path.exists(nest_dir):
        os.rmdir(nest_dir)  # nested, or dead manager
352e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
353e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def release_container(container_name=None):
    """Destroy a container; defaults to the current process's own.

    Logs the new container name if the current process ended up in a
    different container as a result.
    """
    before = my_container_name()
    if container_name is None:
        target = before
    else:
        target = container_name
    _release_container_nest(target)
    after = my_container_name()
    if after == before:
        return
    logging.debug('now running self (pid %d) in container "%s"',
                  os.getpid(), after)
364e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
365e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def remove_empty_prio_classes(prios):
    """Drop prio classes whose set of allowed priorities is empty.

    prios is a ';'-separated list of 'class:priorities' items,
       e.g  'no:3;rt:;be:3;id:'  -->  'no:3;be:3'
    Items with no ':' at all (including the empty item produced by an
    empty string or a trailing ';') are dropped too, rather than
    raising IndexError.
    """
    kept = []
    for item in prios.split(';'):
        fields = item.split(':')
        if len(fields) > 1 and fields[1]:
            kept.append(item)
    return ';'.join(kept)
370e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
371e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def all_drive_names():
    """Return names of all disk drives (sda, sdb, ...) under /sys/block.

    Falls back to hd* entries only when no sd* entries exist.
    """
    drive_paths = glob.glob('/sys/block/sd*') or glob.glob('/sys/block/hd*')
    names = []
    for drive_path in drive_paths:
        names.append(os.path.basename(drive_path))
    return names
378e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
379e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def set_io_controls(container_name, disks=[], ioprio_classes=[PROPIO_NORMAL],
                    io_shares=[95], io_limits=[0]):
    # Apply the propio (proportional IO) controls of one container to the
    # selected disks by writing one line per disk directly into the
    # container's io_service_level file (/dev/cgroup/container_name/...),
    #    without using containerd or container.py
    # See wiki ProportionalIOScheduler for definitions
    #
    # disks: list of drive names; empty means every drive reported by
    #    all_drive_names(), in which case only element [0] of each of the
    #    other three lists is used and is applied to every drive
    # ioprio_classes: one service class per disk, using the numeric
    #    propio service classes of the kernel API:
    #       1: RT, Real Time, aka PROPIO_PRIO
    #       2: BE, Best Effort, aka PROPIO_NORMAL   (the default)
    #       3: PROPIO_IDLE
    # io_shares: one disk-time-fraction per disk,
    #       as percentage integer 0..100
    # io_limits: one limit on/off flag per disk
    #       0: no limit, shares use of other containers' unused disk time
    #          (the default)
    #       1: limited, container's use of disk time is capped to given DTF
    #
    # Raises error.AutotestError when the four lists differ in length.
    # Silently does nothing when the io_service_level file is missing.
    if not disks:
        disks = all_drive_names()
        drive_count = len(disks)
        io_shares = [io_shares[0]] * drive_count
        ioprio_classes = [ioprio_classes[0]] * drive_count
        io_limits = [io_limits[0]] * drive_count

    drive_count = len(disks)
    for per_disk_values in (ioprio_classes, io_shares, io_limits):
        if len(per_disk_values) != drive_count:
            raise error.AutotestError(
                    'Unequal number of values for io controls')

    service_level = io_attr(container_name, 'io_service_level')
    if not os.path.exists(service_level):
        # kernel predates propio features
        # or io cgroup is mounted separately from cpusets
        return

    written = []
    settings = zip(disks, ioprio_classes, io_limits, io_shares)
    for disk, ioclass, limit, share in settings:
        # line format is: disk class limit share
        fields = [disk] + [str(value) for value in (ioclass, limit, share)]
        line = ' '.join(fields)
        utils.write_one_line(service_level, line)
        written.append(line)
    logging.debug('set_io_controls of %s to %s',
                  container_name, ', '.join(written))
419e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
420e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def abbrev_list(vals):
    """Condense unsigned (0,4,5,6,7,10) to '0,4-7,10'.

    vals: iterable of non-negative integers, in any order.
    Duplicates are collapsed before condensing, so (1,2,2,3) yields
    '1-3' rather than the overlapping '1-2,2-3'.
    Returns '' when vals is empty.
    """
    runs = []   # [lower, upper] pairs of consecutive values, ascending
    for val in sorted(set(vals)):
        if runs and val == runs[-1][1] + 1:
            runs[-1][1] = val           # extends the current run
        else:
            runs.append([val, val])     # gap: start a new run
    pieces = []
    for lower, upper in runs:
        if lower == upper:
            pieces.append(str(lower))
        else:
            pieces.append('%d-%d' % (lower, upper))
    return ','.join(pieces)
435e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
436e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def create_container_with_specific_mems_cpus(name, mems, cpus):
    # Create the cpuset directory for container `name`, turn on its
    # mem_hardwall attribute, and give it exactly the listed mem nodes
    # and cpus (each written as a comma-separated list).
    #    name: container path, resolved to a directory by full_path()
    #    mems: iterable of mem node numbers
    #    cpus: sized collection of cpu numbers
    # need_fake_numa() runs first; presumably it rejects kernels without
    #    numa=fake support -- it is defined elsewhere, confirm there.
    need_fake_numa()
    os.mkdir(full_path(name))

    hardwall_file = cpuset_attr(name, 'mem_hardwall')
    utils.write_one_line(hardwall_file, '1')

    mem_list = ','.join([str(node) for node in mems])
    utils.write_one_line(mems_path(name), mem_list)

    cpu_list = ','.join([str(cpu) for cpu in cpus])
    utils.write_one_line(cpus_path(name), cpu_list)

    # report what the new container actually ended up with
    cpu_count = len(cpus)
    node_count = len(get_mem_nodes(name))
    size = utils.human_format(container_bytes(name))
    logging.debug('container %s has %d cpus and %d nodes totalling %s bytes',
                  name, cpu_count, node_count, size)
446e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
447e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def create_container_via_memcg(name, parent, bytes, cpus):
    # Create container `name` via direct memcg cgroup writes:
    #    - make its directory
    #    - copy the parent's mem node list into it unchanged
    #    - set its memory limit_in_bytes to `bytes`
    #    - assign the given cpus as a comma-separated list
    os.mkdir(full_path(name))

    # inherit parent's nodes
    parent_nodes = utils.read_one_line(mems_path(parent))
    utils.write_one_line(mems_path(name), parent_nodes)

    limit_file = memory_path(name) + '.limit_in_bytes'
    utils.write_one_line(limit_file, str(bytes))

    cpu_list = ','.join([str(cpu) for cpu in cpus])
    utils.write_one_line(cpus_path(name), cpu_list)

    cpu_count = len(cpus)
    size = utils.human_format(container_bytes(name))
    logging.debug('Created container %s directly via memcg,'
                  ' has %d cpus and %s bytes',
                  name, cpu_count, size)
458e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
459e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def _create_fake_numa_container_directly(name, parent, mbytes, cpus):
    # Create container `name` from fake-numa mem nodes that are still
    # available for exclusive use under `parent`, taking enough nodes to
    # supply at least `mbytes` megabytes.
    # Raises error.AutotestError when the available nodes cannot cover
    #    the request; the message says whether the request exceeds the
    #    parent's size or the memory is held by sibling containers.
    need_fake_numa()
    lockfile = my_lock('inner')   # serialize race between parallel tests
    try:
        needed_kbytes = mbytes * 1024
        nodes = sorted(available_exclusive_mem_nodes(parent))

        # Pick specific mem nodes for new cpuset's exclusive use
        # For now, arbitrarily pick highest available node numbers:
        #    walk down from the top until the request is covered
        kbytes = 0
        nodecnt = 0
        for node in reversed(nodes):
            if kbytes >= needed_kbytes:
                break
            kbytes += node_avail_kbytes(node)
            nodecnt += 1

        if kbytes < needed_kbytes:
            parent_mbytes = container_mbytes(parent)
            if mbytes > parent_mbytes:
                message = ("New container's %d Mbytes exceeds "
                           "parent container's %d Mbyte size"
                           % (mbytes, parent_mbytes))
            else:
                shortfall_mbytes = (needed_kbytes - kbytes)//1024
                message = ("Existing sibling containers hold "
                           "%d Mbytes needed by new container"
                           % shortfall_mbytes)
            raise error.AutotestError(message)

        # NOTE(review): when no node was consumed (needed_kbytes <= 0),
        #    nodes[-0:] is the whole list, so every available node is
        #    handed to the new container -- confirm this is intended.
        mems = nodes[-nodecnt:]
        create_container_with_specific_mems_cpus(name, mems, cpus)
    finally:
        my_unlock(lockfile)
490e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
491e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def create_container_directly(name, mbytes, cpus):
    # Create container `name` beneath its parent (the dirname of `name`),
    # choosing the mechanism from the module flag fake_numa_containers:
    #    set:   build it from fake-numa mem nodes, size given in mbytes
    #    unset: build it via memcg, size converted to bytes (mbytes<<20)
    parent = os.path.dirname(name)
    if not fake_numa_containers:
        create_container_via_memcg(name, parent, mbytes << 20, cpus)
        return
    _create_fake_numa_container_directly(name, parent, mbytes, cpus)
498e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
499e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def create_container_with_mbytes_and_specific_cpus(name, mbytes,
                cpus=None, root=SUPER_ROOT, io={}, move_in=True, timeout=0):
    """\
    Create a cpuset container, apply its io controls, and optionally
    move the current process into it.

            name = arbitrary string tag
            mbytes = requested memory for job in megabytes
            cpus = list of cpu indices to associate with the cpuset
                  defaults to all cpus avail with given root
            root = the parent cpuset to nest this new set within
                   '': unnested top-level container
            io = keyword arguments passed on to set_io_controls()
                 for proportional IO containers
            move_in = True: Move current process into the new container now.
            timeout = must be 0: persist until explicitly deleted.
                      (not read by this function)

    Returns the new container's name joined onto root, i.e. its path
    relative to super_root.
    Raises error.AutotestError if root does not exist, if the cpu set
    is empty, or if the container already exists.
    """
    need_mem_containers()
    if not container_exists(root):
        raise error.AutotestError('Parent container "%s" does not exist'
                                   % root)

    if cpus is not None:
        cpus = set(cpus)  # interface uses list
    else:
        # default to biggest container we can make under root
        cpus = get_cpus(root)
    if not cpus:
        raise error.AutotestError('Creating container with no cpus')

    container = os.path.join(root, name)  # path relative to super_root
    if os.path.exists(full_path(container)):
        raise error.AutotestError('Container %s already exists' % container)

    create_container_directly(container, mbytes, cpus)
    set_io_controls(container, **io)
    if move_in:
        move_self_into_container(container)
    return container
535e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
536e0493a4af57c1a73376a7bafaed542c01f588196Eric Li
def get_boot_numa():
    # Return the value of the numa=fake=xyz option from the current
    # boot's kernel command line (/proc/cmdline)
    #   eg  'nnn' for numa=fake=nnn,  'nnnM' for numa=fake=nnnM
    # Returns '' if the option is absent; the first occurrence wins.
    prefix = 'numa=fake='
    boot_args = utils.read_one_line('/proc/cmdline').split()
    fake_opts = [arg[len(prefix):] for arg in boot_args
                 if arg.startswith(prefix)]
    if fake_opts:
        return fake_opts[0]
    return ''
545