[PATCH v2 1/1][autotest-client-tests] UBUNTU: SAUCE: ubuntu_performance_deep_learning: init deep learning framework performance test

Mon Jul 5 14:48:10 UTC 2021

The purpose of this test is to generate performance data for deep
learning frameworks. Currently it supports TensorFlow testing only.

If the target shell script runs to completion and the target data file is
generated, the job passes.

This test job is able to run and pass on a machine with expected devices
like a GPU supported by the expected drivers and deep learning
frameworks.

The test environment is mostly prepared by MAAS via a customized curtin
preseed. Tasks such as driver installation, and software installation that
is closely tied to a driver or requires a reboot, are set up by the
preseed. The remaining tasks are completed by the autotest framework and
are defined in the corresponding test job.

Signed-off-by: Taihsiang Ho (tai271828) <taihsiang.ho at canonical.com>
---
 ubuntu_performance_deep_learning/control      | 12 +++
 ubuntu_performance_deep_learning/helper.py    | 27 ++++++
 .../ubuntu_performance_deep_learning.py       | 91 +++++++++++++++++++
 .../ubuntu_performance_tensor_flow.sh         | 69 ++++++++++++++
 4 files changed, 199 insertions(+)
 create mode 100644 ubuntu_performance_deep_learning/control
 create mode 100644 ubuntu_performance_deep_learning/helper.py
 create mode 100644 ubuntu_performance_deep_learning/ubuntu_performance_deep_learning.py
 create mode 100755 ubuntu_performance_deep_learning/ubuntu_performance_tensor_flow.sh

diff --git a/ubuntu_performance_deep_learning/control b/ubuntu_performance_deep_learning/control
new file mode 100644
index 00000000..829c2202
--- /dev/null
+++ b/ubuntu_performance_deep_learning/control
@@ -0,0 +1,12 @@
+AUTHOR = 'Taihsiang Ho <taihsiang.ho at canonical.com>'
+TIME = 'MEDIUM'
+NAME = 'Basic TensorFlow Testing'
+TEST_TYPE = 'client'
+TEST_CLASS = 'General'
+TEST_CATEGORY = 'Benchmark'
+
+DOC = """
+Perform basic tensor flow testing
+"""
+# NOTE(review): timeout=60*20 is presumably in seconds (20 minutes) - confirm
+job.run_test_detail('ubuntu_performance_deep_learning', test_name='tensor-flow-cnn-resnet', tag='tensor-flow-cnn-resnet', timeout=60*20)
diff --git a/ubuntu_performance_deep_learning/helper.py b/ubuntu_performance_deep_learning/helper.py
new file mode 100644
index 00000000..526a7e15
--- /dev/null
+++ b/ubuntu_performance_deep_learning/helper.py
@@ -0,0 +1,27 @@
+import re
+
+
+def get_stats(stdout_results):
+    """Return the third number of the second benchmark match, as a string.
+
+    A match is six whitespace-separated numbers, each an integer (300) or
+    a float in x.x format (6776.8), for example
+    "300 300.0  6776.8  0.000  0.960 0.00000", where 6776.8 is the wanted
+    value. IndexError is raised when there are fewer than two matches.
+    """
+    # regular expression:
+    #     1. (\d+(\.\d+)?) for x.x or x
+    #     2. \s+ for one or more whitespace characters
+    #     3. (){n} for n repetitions of group
+    pattern = r"""(\d+(\.\d+)?)         # for x.x or x
+                  (\s+(\d+(\.\d+)?)){2} # 2 repetitions of _x.x or _x
+                  (\s+(\d+(\.\d+)?)){3} # 3 repetitions of _x.x or _x"""
+    rc = re.compile(pattern, re.VERBOSE)
+    # Pattern.findall() takes (string, pos, endpos); a flag such as
+    # re.MULTILINE passed there would be read as pos=8, not as a flag.
+    matches = rc.findall(stdout_results)
+
+    # second match; tuple index 3 is the number captured by the last
+    # repetition of the {2} group, i.e. the third number of that match
+    target_number = matches[1][3]
+    return target_number
diff --git a/ubuntu_performance_deep_learning/ubuntu_performance_deep_learning.py b/ubuntu_performance_deep_learning/ubuntu_performance_deep_learning.py
new file mode 100644
index 00000000..e6ab2e51
--- /dev/null
+++ b/ubuntu_performance_deep_learning/ubuntu_performance_deep_learning.py
@@ -0,0 +1,91 @@
+import os
+import helper
+from autotest.client import test, utils
+
+
+TEST_ITERATION = 3
+
+
+class ubuntu_performance_deep_learning(test.test):
+    version = 1
+
+    def initialize(self):
+        """Nothing to initialize for this test."""
+
+    def install_required_pkgs(self):
+        """
+        Install required packages.
+
+        This installation method assumes the corresponding GPU drivers are
+        ready to load if the drivers are required by the deep learning
+        framework. That being said, the installation will only install packages
+        running in userspace, e.g. the framework itself and the corresponding
+        runtime.
+        """
+        p_dir = os.path.dirname(os.path.abspath(__file__))
+        uptf_cmd = os.path.join(p_dir, "ubuntu_performance_tensor_flow.sh")
+        cmd = "{} setup".format(uptf_cmd)
+        utils.system(cmd)
+
+    def setup(self):
+        self.install_required_pkgs()
+
+    def tensor_flow_cnn_resnet(self, benchmark):
+        """Run the ResNet benchmark TEST_ITERATION times and print stats"""
+        unit = "images/sec"
+        max_error_threshold = 0.05
+        values = {}
+
+        # benchmark is the benchmark item of config.yaml
+        benchmark = benchmark.replace("-", "_")
+        if "TEST_CONFIG" in os.environ:
+            benchmark += "_" + os.environ["TEST_CONFIG"]
+
+        p_dir = os.path.dirname(os.path.abspath(__file__))
+        uptf_cmd = os.path.join(p_dir, "ubuntu_performance_tensor_flow.sh")
+        cmd = "{} test".format(uptf_cmd)
+
+        for i in range(TEST_ITERATION):
+            stdout_result = utils.system_output(cmd, retain_output=True)
+            values[i] = helper.get_stats(stdout_result)
+
+            if values[i]:
+                print("")
+                print("Test %d of %d:" % (i + 1, TEST_ITERATION))
+                print("{}[{}] {} {}".format(benchmark, i, values[i], unit))
+
+        # Summarise only when every iteration produced a value. Checking
+        # the whole dict avoids using the loop variable after the loop,
+        # which is undefined when TEST_ITERATION is 0.
+        if values and all(values.values()):
+            v = [float(values[k]) for k in sorted(values)]
+            maximum = max(v)
+            minimum = min(v)
+            average = sum(v) / float(len(v))
+            max_err = (maximum - minimum) / average
+
+            print("")
+            print(benchmark + "_minimum {:.2f} {}".format(minimum, unit))
+            print(benchmark + "_maximum {:.2f} {}".format(maximum, unit))
+            print(benchmark + "_average {:.2f} {}".format(average, unit))
+            print(benchmark + "_maximum_error {:.2%}".format(max_err))
+            print("")
+
+            if max_err > max_error_threshold:
+                print("FAIL: maximum error is greater than 5%")
+            else:
+                print("PASS: test passes specified performance thresholds")
+        else:
+            print("NOT-RUN or FAIL to PARSE DATA")
+
+    def run_once(self, test_name):
+        """Run the benchmark selected by test_name, if it is known."""
+        if test_name == "tensor-flow-cnn-resnet":
+            self.tensor_flow_cnn_resnet(test_name)
+            print("")
+            print("tensor_flow_cnn_resnet shell script has run.")
+
+        print("")
+
+    def postprocess_iteration(self):
+        """No per-iteration post-processing is needed."""
diff --git a/ubuntu_performance_deep_learning/ubuntu_performance_tensor_flow.sh b/ubuntu_performance_deep_learning/ubuntu_performance_tensor_flow.sh
new file mode 100755
index 00000000..77efaf39
--- /dev/null
+++ b/ubuntu_performance_deep_learning/ubuntu_performance_tensor_flow.sh
@@ -0,0 +1,69 @@
+#!/usr/bin/env bash
+#
+# Perform TensorFlow performance testing and the corresponding pre-setup.
+# Usage: pass "setup" or "test" as the first argument.
+
+set -eo pipefail
+
+CONTAINER_VER="20.12"  # version part of the nvcr.io/nvidia/tensorflow image tag
+
+install_nvidia_docker() {  # add the nvidia-docker apt repository, then install nvidia-docker2
+    local distribution
+    distribution="$(. /etc/os-release; echo "$ID$VERSION_ID")"
+    curl -fsSL https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -  # -f: fail on HTTP errors
+    curl -fsSL "https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list" | \
+        sudo tee /etc/apt/sources.list.d/nvidia-docker.list > /dev/null
+    sudo apt-get update  # apt-get rather than apt: apt warns that its CLI is not stable for scripts
+    sudo apt-get install nvidia-docker2 -y
+    sudo systemctl restart docker
+}
+
+get_num_gpus() {
+    # count the lines nvidia-smi lists (presumably one per GPU) for mpiexec -np
+    nvidia-smi --list-gpus | wc -l
+}
+
+setup() {
+    # prepare the test environment; currently this only installs nvidia-docker
+    install_nvidia_docker
+}
+
+run_test() {
+    # Run the 50-layer fp16 ResNet example inside the
+    # nvcr.io/nvidia/tensorflow container, starting one mpiexec process
+    # per GPU counted by get_num_gpus. Results are printed on stdout.
+    sudo nvidia-docker run \
+         --shm-size=1g \
+         --ulimit memlock=-1 --ulimit stack=67108864 \
+         -ti --rm "nvcr.io/nvidia/tensorflow:${CONTAINER_VER}-tf1-py3" -- \
+         mpiexec --bind-to socket --allow-run-as-root \
+         -np "$(get_num_gpus)" \
+         python -u /workspace/nvidia-examples/cnn/resnet.py \
+         --layers=50 \
+         --precision=fp16 \
+         --batch_size=256 \
+         --num_iter=300 \
+         --iter_unit=batch \
+         --display_every=300
+}
+
+# Dispatch on the first argument. Each step is wrapped in blank-line
+# padded banners on stdout. An unknown or missing argument is an error
+# rather than a silent success.
+case "${1:-}" in
+    setup)
+        printf '\n%s\n\n' "On setting up necessary test environment..."
+        setup
+        printf '\n%s\n\n' "Setting up necessary test environment..."
+        ;;
+    test)
+        printf '\n%s\n\n' "On running test..."
+        run_test
+        printf '\n%s\n\n' "Running test..."
+        ;;
+    *)
+        # reject anything else so a typo cannot pass as a successful run
+        echo "Usage: $0 {setup|test}" >&2
+        exit 1
+        ;;
+esac
-- 
2.32.0