# Copyright 2007-2010 Google Inc.  Released under the GPL v2
__author__ = "duanes (Duane Sand), pdahl (Peter Dahl)"

# A basic cpuset/cgroup container manager for limiting memory use during tests
#   for use on kernels not running some site-specific container manager

import os, sys, re, glob, fcntl, logging
from autotest_lib.client.bin import utils
from autotest_lib.client.common_lib import error

SUPER_ROOT = ''            # root of all containers or cgroups
NO_LIMIT = (1 << 63) - 1   # containername/memory.limit_in_bytes if no limit

# propio service classes:
PROPIO_PRIO = 1
PROPIO_NORMAL = 2
PROPIO_IDLE = 3

super_root_path = ''          # usually '/dev/cgroup'; '/dev/cpuset' on 2.6.18
cpuset_prefix = None          # usually 'cpuset.'; '' on 2.6.18
fake_numa_containers = False  # container mem via numa=fake mem nodes, else pages
mem_isolation_on = False
node_mbytes = 0               # mbytes in one typical mem node
root_container_bytes = 0      # squishy limit on effective size of root container


def discover_container_style():
    global super_root_path, cpuset_prefix
    global mem_isolation_on, fake_numa_containers
    global node_mbytes, root_container_bytes
    if super_root_path != '':
        return  # already looked up
    if os.path.exists('/dev/cgroup/tasks'):
        # running on 2.6.26 or later kernel with containers on:
        super_root_path = '/dev/cgroup'
        cpuset_prefix = 'cpuset.'
        if get_boot_numa():
            mem_isolation_on = fake_numa_containers = True
        else:  # memcg containers IFF compiled-in & mounted & non-fakenuma boot
            fake_numa_containers = False
            mem_isolation_on = os.path.exists(
                    '/dev/cgroup/memory.limit_in_bytes')
            # TODO: handle the possibility that memcg is mounted as its own
            #       cgroup hierarchy, separate from cpuset??
    elif os.path.exists('/dev/cpuset/tasks'):
        # running on 2.6.18 kernel with containers on:
        super_root_path = '/dev/cpuset'
        cpuset_prefix = ''
        mem_isolation_on = fake_numa_containers = get_boot_numa() != ''
    else:
        # neither cpuset nor cgroup filesystem active:
        super_root_path = None
        cpuset_prefix = 'no_cpusets_or_cgroups_exist'
        mem_isolation_on = fake_numa_containers = False

    logging.debug('mem_isolation: %s', mem_isolation_on)
    logging.debug('fake_numa_containers: %s', fake_numa_containers)
    if fake_numa_containers:
        node_mbytes = int(mbytes_per_mem_node())
    elif mem_isolation_on:  # memcg-style containers
        # For now, limit total of all containers to using just 98% of system's
        # visible total ram, to avoid oom events at system level, and avoid
        # page reclaim overhead from going above kswapd highwater mark.
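        # utils.memtotal() reports kbytes, so ">> 2" yields 4 KB pages
        # and "<< 12" converts pages back to bytes.  For example, 8 GB
        # of ram = 8388608 kbytes = 2097152 pages; 98% of that is
        # 2055208 pages, about 7.84 GB.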
        system_visible_pages = utils.memtotal() >> 2
        usable_pages = int(system_visible_pages * 0.98)
        root_container_bytes = usable_pages << 12
        logging.debug('root_container_bytes: %s',
                      utils.human_format(root_container_bytes))


def need_mem_containers():
    discover_container_style()
    if not mem_isolation_on:
        raise error.AutotestError('Mem-isolation containers not enabled '
                                  'by latest reboot')


def need_fake_numa():
    discover_container_style()
    if not fake_numa_containers:
        raise error.AutotestError('numa=fake not enabled by latest reboot')


def full_path(container_name):
    discover_container_style()
    return os.path.join(super_root_path, container_name)


def unpath(container_path):
    return container_path[len(super_root_path)+1:]


def cpuset_attr(container_name, attr):
    discover_container_style()
    return os.path.join(super_root_path, container_name, cpuset_prefix+attr)


def io_attr(container_name, attr):
    discover_container_style()
    # current version assumes shared cgroup hierarchy
    return os.path.join(super_root_path, container_name, 'io.'+attr)


def tasks_path(container_name):
    return os.path.join(full_path(container_name), 'tasks')


def mems_path(container_name):
    return cpuset_attr(container_name, 'mems')


def memory_path(container_name):
    return os.path.join(super_root_path, container_name, 'memory')


def cpus_path(container_name):
    return cpuset_attr(container_name, 'cpus')


def container_exists(name):
    return name is not None and os.path.exists(tasks_path(name))


def move_tasks_into_container(name, tasks):
    task_file = tasks_path(name)
    for task in tasks:
        try:
            logging.debug('moving task %s into container "%s"', task, name)
            utils.write_one_line(task_file, task)
        except Exception:
            if utils.pid_is_alive(task):
                raise  # task exists but couldn't move it
            # task is gone or zombie so ignore this exception


def move_self_into_container(name):
    me = str(os.getpid())
    move_tasks_into_container(name, [me])
    logging.debug('running self (pid %s) in container "%s"', me, name)


def _avail_mbytes_via_nodes(parent):
    # total mbytes of mem nodes available for new containers in parent
    free_nodes = available_exclusive_mem_nodes(parent)
    mbytes = nodes_avail_mbytes(free_nodes)
    # don't have exact model for how container mgr measures mem space
    # better here to underestimate than overestimate
    mbytes = max(mbytes - node_mbytes//2, 0)
    return mbytes


def _avail_bytes_via_pages(parent):
    # Get memory bytes available to parent container which could
    #  be allocated exclusively to new child containers.
    #  This excludes mem previously allocated to existing children.
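    # Worked example with hypothetical numbers: a parent limited to
    #  8 GB holding two children limited to 2 GB each returns 4 GB here.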
    available = container_bytes(parent)
    mem_files_pattern = os.path.join(full_path(parent),
                                     '*', 'memory.limit_in_bytes')
    for mem_file in glob.glob(mem_files_pattern):
        child_container = unpath(os.path.dirname(mem_file))
        available -= container_bytes(child_container)
    return available


def avail_mbytes(parent=SUPER_ROOT):
    # total mbytes available in parent, for exclusive use in new containers
    if fake_numa_containers:
        return _avail_mbytes_via_nodes(parent)
    else:
        return _avail_bytes_via_pages(parent) >> 20


def delete_leftover_test_containers():
    # recover mems and cores tied up by containers of prior failed tests:
    for child in inner_containers_of(SUPER_ROOT):
        _release_container_nest(child)


def my_lock(lockname):
    # lockname is 'inner'
    lockdir = os.environ['AUTODIR']
    lockname = os.path.join(lockdir, '.cpuset.lock.'+lockname)
    lockfile = open(lockname, 'w')
    fcntl.flock(lockfile, fcntl.LOCK_EX)
    return lockfile


def my_unlock(lockfile):
    fcntl.flock(lockfile, fcntl.LOCK_UN)
    lockfile.close()


# Convert '1-3,7,9-12' to set(1,2,3,7,9,10,11,12)
def rangelist_to_set(rangelist):
    result = set()
    if not rangelist:
        return result
    for x in rangelist.split(','):
        if re.match(r'^(\d+)$', x):
            result.add(int(x))
            continue
        m = re.match(r'^(\d+)-(\d+)$', x)
        if m:
            start = int(m.group(1))
            end = int(m.group(2))
            result.update(set(range(start, end+1)))
            continue
        msg = 'Cannot understand data input: %s %s' % (x, rangelist)
        raise ValueError(msg)
    return result


def my_container_name():
    # Get current process's inherited or self-built container name
    #   within /dev/cpuset or /dev/cgroup.  Is '' for root container.
    name = utils.read_one_line('/proc/%i/cpuset' % os.getpid())
    return name[1:]  # strip leading /


def get_mem_nodes(container_name):
    # all mem nodes now available to a container, both exclusive & shared
    file_name = mems_path(container_name)
    if os.path.exists(file_name):
        return rangelist_to_set(utils.read_one_line(file_name))
    else:
        return set()


def _busy_mem_nodes(parent_container):
    # Get set of numa memory nodes now used (exclusively or shared)
    #   by existing children of parent container
    busy = set()
    mem_files_pattern = os.path.join(full_path(parent_container),
                                     '*', cpuset_prefix+'mems')
    for mem_file in glob.glob(mem_files_pattern):
        child_container = unpath(os.path.dirname(mem_file))
        busy |= get_mem_nodes(child_container)
    return busy


def available_exclusive_mem_nodes(parent_container):
    # Get subset of numa memory nodes of parent container which could
    #  be allocated exclusively to new child containers.
    #  This excludes nodes now allocated to existing children.
    need_fake_numa()
    available = get_mem_nodes(parent_container)
    available -= _busy_mem_nodes(parent_container)
    return available


def my_mem_nodes():
    # Get set of numa memory nodes owned by current process's container.
    discover_container_style()
    if not mem_isolation_on:
        return set()  # as expected by vmstress
    return get_mem_nodes(my_container_name())


def my_available_exclusive_mem_nodes():
    # Get subset of numa memory nodes owned by current process's
    #  container, which could be allocated exclusively to new child
    #  containers.  This excludes any nodes now allocated
    #  to existing children.
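    # The returned node set can be fed directly to
    #  create_container_with_specific_mems_cpus() below.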
    return available_exclusive_mem_nodes(my_container_name())


def node_avail_kbytes(node):
    return node_mbytes << 10  # crude; fixed numa node size


def nodes_avail_mbytes(nodes):
    # nodes' combined user+avail size, in Mbytes
    return sum(node_avail_kbytes(n) for n in nodes) // 1024


def container_bytes(name):
    if fake_numa_containers:
        return nodes_avail_mbytes(get_mem_nodes(name)) << 20
    else:
        while True:
            file_name = memory_path(name) + '.limit_in_bytes'
            limit = int(utils.read_one_line(file_name))
            if limit < NO_LIMIT:
                return limit
            if name == SUPER_ROOT:
                return root_container_bytes
            name = os.path.dirname(name)


def container_mbytes(name):
    return container_bytes(name) >> 20


def mbytes_per_mem_node():
    # Get mbyte size of standard numa mem node, as float
    #  (some nodes are bigger than this)
    # Replaces utils.node_size().
    numa = get_boot_numa()
    if numa.endswith('M'):
        return float(numa[:-1])  # mbyte size of fake nodes
    elif numa:
        nodecnt = int(numa)  # fake numa mem nodes for container isolation
    else:
        nodecnt = len(utils.numa_nodes())  # phys mem-controller nodes
    # Use guessed total physical mem size, not kernel's
    #   lesser 'available memory' after various system tables.
    return utils.rounded_memtotal() / (nodecnt * 1024.0)


def get_cpus(container_name):
    file_name = cpus_path(container_name)
    if os.path.exists(file_name):
        return rangelist_to_set(utils.read_one_line(file_name))
    else:
        return set()


def get_tasks(container_name):
    file_name = tasks_path(container_name)
    try:
        tasks = [x.rstrip() for x in open(file_name).readlines()]
    except IOError:
        if os.path.exists(file_name):
            raise
        tasks = []  # container doesn't exist anymore
    return tasks


def inner_containers_of(parent):
    pattern = os.path.join(full_path(parent), '*/tasks')
    return [unpath(os.path.dirname(task_file))
            for task_file in glob.glob(pattern)]


def _release_container_nest(nest):
    # Destroy a container, and any nested sub-containers
    nest_path = full_path(nest)
    if os.path.exists(nest_path):

        # bottom-up walk of tree, releasing all nested sub-containers
        for child in inner_containers_of(nest):
            _release_container_nest(child)

        logging.debug("releasing container %s", nest)

        # Transfer any survivor tasks (e.g. self) to parent container
        parent = os.path.dirname(nest)
        move_tasks_into_container(parent, get_tasks(nest))

        # remove the now-empty outermost container of this nest
        if os.path.exists(nest_path):
            os.rmdir(nest_path)  # nested, or dead manager


def release_container(container_name=None):
    # Destroy a container
    my_container = my_container_name()
    if container_name is None:
        container_name = my_container
    _release_container_nest(container_name)
    displaced = my_container_name()
    if displaced != my_container:
        logging.debug('now running self (pid %d) in container "%s"',
                      os.getpid(), displaced)


def remove_empty_prio_classes(prios):
    # remove prio classes whose set of allowed priorities is empty
    #    e.g. 'no:3;rt:;be:3;id:'  -->  'no:3;be:3'
    return ';'.join(p for p in prios.split(';') if p.split(':')[1])


def all_drive_names():
    # list of all disk drives sda,sdb,...
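    # Prefer libata/SCSI-style names (sd*); fall back to legacy IDE
    #  names (hd*) only when no sd* devices exist.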
    paths = glob.glob('/sys/block/sd*')
    if not paths:
        paths = glob.glob('/sys/block/hd*')
    return [os.path.basename(path) for path in paths]


def set_io_controls(container_name, disks=[], ioprio_classes=[PROPIO_NORMAL],
                    io_shares=[95], io_limits=[0]):
    # set the propio controls for one container, for selected disks
    # writing directly to /dev/cgroup/container_name/io.io_service_level
    #    without using containerd or container.py
    # See wiki ProportionalIOScheduler for definitions
    # ioprio_classes: list of service classes, one per disk
    #    using numeric propio service classes as used by kernel API, namely
    #       1: RT, Real Time, aka PROPIO_PRIO
    #       2: BE, Best Effort, aka PROPIO_NORMAL
    #       3: PROPIO_IDLE
    # io_shares: list of disk-time-fractions, one per disk,
    #    as percentage integer 0..100
    # io_limits: list of limit on/off, one per disk
    #    0: no limit, shares use of other containers' unused disk time
    #    1: limited, container's use of disk time is capped to given DTF
    # ioprio_classes defaults to best-effort
    # io_limit defaults to no limit, use slack time
    if not disks:  # defaults to all drives
        disks = all_drive_names()
        io_shares = [io_shares[0]] * len(disks)
        ioprio_classes = [ioprio_classes[0]] * len(disks)
        io_limits = [io_limits[0]] * len(disks)
    if not (len(disks) == len(ioprio_classes) and len(disks) == len(io_shares)
            and len(disks) == len(io_limits)):
        raise error.AutotestError('Unequal number of values for io controls')
    service_level = io_attr(container_name, 'io_service_level')
    if not os.path.exists(service_level):
        return  # kernel predates propio features
                #   or io cgroup is mounted separately from cpusets
    disk_infos = []
    for disk, ioclass, limit, share in zip(disks, ioprio_classes,
                                           io_limits, io_shares):
        parts = (disk, str(ioclass), str(limit), str(share))
        disk_info = ' '.join(parts)
        utils.write_one_line(service_level, disk_info)
        disk_infos.append(disk_info)
    logging.debug('set_io_controls of %s to %s',
                  container_name, ', '.join(disk_infos))


def abbrev_list(vals):
    """Condense unsigned (0,4,5,6,7,10) to '0,4-7,10'."""
    ranges = []
    lower = 0
    upper = -2
    for val in sorted(vals)+[-1]:
        if val != upper+1:
            if lower == upper:
                ranges.append(str(lower))
            elif lower <= upper:
                ranges.append('%d-%d' % (lower, upper))
            lower = val
        upper = val
    return ','.join(ranges)


def create_container_with_specific_mems_cpus(name, mems, cpus):
    need_fake_numa()
    os.mkdir(full_path(name))
    utils.write_one_line(cpuset_attr(name, 'mem_hardwall'), '1')
    utils.write_one_line(mems_path(name), ','.join(map(str, mems)))
    utils.write_one_line(cpus_path(name), ','.join(map(str, cpus)))
    logging.debug('container %s has %d cpus and %d nodes totalling %s bytes',
                  name, len(cpus), len(get_mem_nodes(name)),
                  utils.human_format(container_bytes(name)))


def create_container_via_memcg(name, parent, bytes, cpus):
    # create container via direct memcg cgroup writes
    os.mkdir(full_path(name))
    nodes = utils.read_one_line(mems_path(parent))
    utils.write_one_line(mems_path(name), nodes)  # inherit parent's nodes
    utils.write_one_line(memory_path(name)+'.limit_in_bytes', str(bytes))
    utils.write_one_line(cpus_path(name), ','.join(map(str, cpus)))
    logging.debug('Created container %s directly via memcg,'
                  ' has %d cpus and %s bytes',
                  name, len(cpus),
                  utils.human_format(container_bytes(name)))


def _create_fake_numa_container_directly(name, parent, mbytes, cpus):
    need_fake_numa()
    lockfile = my_lock('inner')  # serialize race between parallel tests
    try:
        # Pick specific mem nodes for new cpuset's exclusive use
        # For now, arbitrarily pick highest available node numbers
        needed_kbytes = mbytes * 1024
        nodes = sorted(list(available_exclusive_mem_nodes(parent)))
        kbytes = 0
        nodecnt = 0
        while kbytes < needed_kbytes and nodecnt < len(nodes):
            nodecnt += 1
            kbytes += node_avail_kbytes(nodes[-nodecnt])
        if kbytes < needed_kbytes:
            parent_mbytes = container_mbytes(parent)
            if mbytes > parent_mbytes:
                raise error.AutotestError(
                        "New container's %d Mbytes exceeds "
                        "parent container's %d Mbyte size"
                        % (mbytes, parent_mbytes))
            else:
                raise error.AutotestError(
                        "Existing sibling containers hold "
                        "%d Mbytes needed by new container"
                        % ((needed_kbytes - kbytes)//1024))
        mems = nodes[-nodecnt:]

        create_container_with_specific_mems_cpus(name, mems, cpus)
    finally:
        my_unlock(lockfile)


def create_container_directly(name, mbytes, cpus):
    parent = os.path.dirname(name)
    if fake_numa_containers:
        _create_fake_numa_container_directly(name, parent, mbytes, cpus)
    else:
        create_container_via_memcg(name, parent, mbytes<<20, cpus)


def create_container_with_mbytes_and_specific_cpus(name, mbytes,
        cpus=None, root=SUPER_ROOT, io={}, move_in=True, timeout=0):
    """\
    Create a cpuset container and move job's current pid into it
    Allocate the list "cpus" of cpus to that container

    name = arbitrary string tag
    mbytes = requested memory for job in megabytes
    cpus = list of cpu indices to associate with the cpuset
        defaults to all cpus avail with given root
    root = the parent cpuset to nest this new set within
        '': unnested top-level container
    io = arguments for proportional IO containers
    move_in = True: Move current process into the new container now.
    timeout = must be 0: persist until explicitly deleted.
    """
    need_mem_containers()
    if not container_exists(root):
        raise error.AutotestError('Parent container "%s" does not exist'
                                  % root)
    if cpus is None:
        # default to biggest container we can make under root
        cpus = get_cpus(root)
    else:
        cpus = set(cpus)  # interface uses list
    if not cpus:
        raise error.AutotestError('Creating container with no cpus')
    name = os.path.join(root, name)  # path relative to super_root
    if os.path.exists(full_path(name)):
        raise error.AutotestError('Container %s already exists' % name)
    create_container_directly(name, mbytes, cpus)
    set_io_controls(name, **io)
    if move_in:
        move_self_into_container(name)
    return name


def get_boot_numa():
    # get boot-time numa=fake=xyz option for current boot
    #   eg  numa=fake=nnn,  numa=fake=nnnM, or nothing
    label = 'numa=fake='
    for arg in utils.read_one_line('/proc/cmdline').split():
        if arg.startswith(label):
            return arg[len(label):]
    return ''
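

# Typical usage (an illustrative sketch only; the container name, sizes
#  and cpu list below are hypothetical, and the import assumes this module
#  is importable as autotest_lib.client.bin.cpuset):
#
#      from autotest_lib.client.bin import cpuset
#
#      # carve out a 1024-mbyte, 2-cpu container under the top level,
#      #  moving the current test process into it
#      name = cpuset.create_container_with_mbytes_and_specific_cpus(
#              'my_test', mbytes=1024, cpus=[0, 1])
#      try:
#          pass  # run the memory-constrained workload here
#      finally:
#          cpuset.release_container(name)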