#!/usr/bin/env python3

"""This script runs cuda-memcheck on the specified unit test. Each test case
is run in its isolated process with a timeout so that:
1) different test cases won't influence each other, and
2) in case of hang, the script would still finish in a finite amount of time.
The output will be written to a log file result.log

Example usage:
    python run_cuda_memcheck.py ../test_torch.py 600

Note that running cuda-memcheck could be very slow.
"""

import asyncio
import torch
import multiprocessing
import argparse
import subprocess
import tqdm
import os
import sys
import cuda_memcheck_common as cmc

ALL_TESTS = []
GPUS = torch.cuda.device_count()

# parse arguments
parser = argparse.ArgumentParser(description="Run isolated cuda-memcheck on unit tests")
parser.add_argument('filename', help="the python file for a test, such as test_torch.py")
parser.add_argument('timeout', type=int, help='kill the test if it does not terminate in a certain amount of seconds')
parser.add_argument('--strict', action='store_true',
                    help='Whether to show cublas/cudnn errors. These errors are ignored by default because'
                         'cublas/cudnn does not run error-free under cuda-memcheck, and ignoring these errors')
parser.add_argument('--nproc', type=int, default=multiprocessing.cpu_count(),
                    help='Number of processes running tests, default to number of cores in the system')
parser.add_argument('--gpus', default='all',
                    help='GPU assignments for each process, it could be "all", or : separated list like "1,2:3,4:5,6"')
parser.add_argument('--ci', action='store_true',
                    help='Whether this script is executed in CI. When executed inside a CI, this script fails when '
                         'an error is detected. Also, it will not show tqdm progress bar, but directly print the error'
                         'to stdout instead.')
parser.add_argument('--nohang', action='store_true', help='Treat timeout as success')
parser.add_argument('--split', type=int, default=1, help='Split the job into pieces')
parser.add_argument('--rank', type=int, default=0, help='Which piece this process should pick')
args = parser.parse_args()

# Filters that ignores cublas/cudnn errors
# TODO (@zasdfgbnm): When can we remove this? Will cublas/cudnn run error-free under cuda-memcheck?
def is_ignored_only(output):
    try:
        report = cmc.parse(output)
    except cmc.ParseError:
        # in case the simple parser fails parsing the output of cuda memcheck
        # then this error is never ignored.
        return False
    count_ignored_errors = 0
    for e in report.errors:
        if 'libcublas' in ''.join(e.stack) or 'libcudnn' in ''.join(e.stack) or 'libcufft' in ''.join(e.stack):
            count_ignored_errors += 1
    return count_ignored_errors == report.num_errors

# Set environment PYTORCH_CUDA_MEMCHECK=1 to allow skipping some tests
os.environ['PYTORCH_CUDA_MEMCHECK'] = '1'

# Discover tests:
# To get a list of tests, run:
# pytest --setup-only test/test_torch.py
# and then parse the output
proc = subprocess.Popen(['pytest', '--setup-only', args.filename], stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = proc.communicate()
lines = stdout.decode().strip().splitlines()
for line in lines:
    if '(fixtures used:' in line:
        line = line.strip().split()[0]
        line = line[line.find('::') + 2:]
        line = line.replace('::', '.')
        ALL_TESTS.append(line)

# Do a simple filtering:
# if 'cpu' or 'CPU' is in the name and 'cuda' or 'CUDA' is not in the name, then skip it
def is_cpu_only(name):
    name = name.lower()
    return ('cpu' in name) and "cuda" not in name

ALL_TESTS = [x for x in ALL_TESTS if not is_cpu_only(x)]

# Split all tests into chunks, and only on the selected chunk
ALL_TESTS.sort()
chunk_size = (len(ALL_TESTS) + args.split - 1) // args.split
start = chunk_size * args.rank
end = chunk_size * (args.rank + 1)
ALL_TESTS = ALL_TESTS[start:end]

# Run tests:
# Since running cuda-memcheck on PyTorch unit tests is very slow, these tests must be run in parallel.
# This is done by using the coroutine feature in new Python versions.  A number of coroutines are created;
# they create subprocesses and awaiting them to finish. The number of running subprocesses could be
# specified by the user and by default is the same as the number of CPUs in the machine.
# These subprocesses are balanced across different GPUs on the system by assigning one devices per process,
# or as specified by the user
progress = 0
if not args.ci:
    logfile = open('result.log', 'w')
    progressbar = tqdm.tqdm(total=len(ALL_TESTS))
else:
    logfile = sys.stdout

    # create a fake progress bar that does not display anything
    class ProgressbarStub:
        def update(self, *args):
            return
    progressbar = ProgressbarStub()

async def run1(coroutine_id):
    global progress

    if args.gpus == 'all':
        gpuid = coroutine_id % GPUS
    else:
        gpu_assignments = args.gpus.split(':')
        assert args.nproc == len(gpu_assignments), 'Please specify GPU assignment for each process, separated by :'
        gpuid = gpu_assignments[coroutine_id]

    while progress < len(ALL_TESTS):
        test = ALL_TESTS[progress]
        progress += 1
        cmd = f'CUDA_VISIBLE_DEVICES={gpuid} cuda-memcheck --error-exitcode 1 python {args.filename} {test}'
        proc = await asyncio.create_subprocess_shell(cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE)
        try:
            stdout, stderr = await asyncio.wait_for(proc.communicate(), args.timeout)
        except asyncio.TimeoutError:
            print('Timeout:', test, file=logfile)
            proc.kill()
            if args.ci and not args.nohang:
                sys.exit("Hang detected on cuda-memcheck")
        else:
            if proc.returncode == 0:
                print('Success:', test, file=logfile)
            else:
                stdout = stdout.decode()
                stderr = stderr.decode()
                should_display = args.strict or not is_ignored_only(stdout)
                if should_display:
                    print('Fail:', test, file=logfile)
                    print(stdout, file=logfile)
                    print(stderr, file=logfile)
                    if args.ci:
                        sys.exit("Failure detected on cuda-memcheck")
                else:
                    print('Ignored:', test, file=logfile)
        del proc
        progressbar.update(1)

async def main():
    tasks = [asyncio.ensure_future(run1(i)) for i in range(args.nproc)]
    for t in tasks:
        await t

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())