[PATCH 1/1] UBUNTU: SAUCE: ubuntu_performance_deep_learning: init deep learning framework performance test

Thu Jun 24 17:46:33 UTC 2021

The purpose of this test is to generate performance data for deep
learning frameworks. Currently it supports TensorFlow testing only.

If the target shell script runs to completion and the target data file
is generated, the job passes.

The test environment is mostly prepared by MAAS via a customized curtin
preseed. Tasks such as driver installation, installation of software
tightly coupled to the driver, or anything requiring a reboot are set up
by the preseed. The remaining tasks are completed by the autotest
framework and are defined in the corresponding testing job.

Signed-off-by: Taihsiang Ho (tai271828) <taihsiang.ho at canonical.com>
---
 ubuntu_performance_deep_learning/control      | 12 +++
 ubuntu_performance_deep_learning/helper.py    | 27 ++++++
 .../ubuntu_performance_deep_learning.py       | 85 +++++++++++++++++++
 .../ubuntu_performance_tensor_flow.sh         | 63 ++++++++++++++
 4 files changed, 187 insertions(+)
 create mode 100644 ubuntu_performance_deep_learning/control
 create mode 100644 ubuntu_performance_deep_learning/helper.py
 create mode 100644 ubuntu_performance_deep_learning/ubuntu_performance_deep_learning.py
 create mode 100755 ubuntu_performance_deep_learning/ubuntu_performance_tensor_flow.sh

diff --git a/ubuntu_performance_deep_learning/control b/ubuntu_performance_deep_learning/control
new file mode 100644
index 00000000..68a0a626
--- /dev/null
+++ b/ubuntu_performance_deep_learning/control
@@ -0,0 +1,12 @@
+AUTHOR = 'Taihsiang Ho <taihsiang.ho at canonical.com>'
+TIME = 'MEDIUM'
+NAME = 'Basic TensorFlow Testing'
+TEST_TYPE = 'client'
+TEST_CLASS = 'General'
+TEST_CATEGORY = 'Benchmark'
+
+DOC = """
+Perform basic tensor flow testing
+"""
+# test_name is matched by run_once(); timeout is 60*15, presumably seconds
+job.run_test_detail('ubuntu_performance_deep_learning', test_name='tensor-flow-cnn-resnet', tag='tensor-flow-cnn-resnet', timeout=60*15)
diff --git a/ubuntu_performance_deep_learning/helper.py b/ubuntu_performance_deep_learning/helper.py
new file mode 100644
index 00000000..526a7e15
--- /dev/null
+++ b/ubuntu_performance_deep_learning/helper.py
@@ -0,0 +1,27 @@
+import re
+
+
+def get_stats(stdout_results):
+    """Return the third number of the second 6-number run in the output.
+
+    A run is 6 whitespace-separated numbers, each either an integer (300)
+    or a float in x.x format (6776.8), for example
+    "300 300.0  6776.8  0.000  0.960 0.00000", where 6776.8 is the target.
+
+    :param stdout_results: captured stdout of the benchmark run (string)
+    :return: the target number as a string, not converted to float
+    :raises IndexError: when fewer than two 6-number runs are found
+    """
+    # (\d+(\.\d+)?) matches x or x.x and \s+ the gap before each next number;
+    # the nested groups make findall() yield 8-tuples, third number at [3]
+    pattern = r"""(\d+(\.\d+)?)         # for x.x or x
+                  (\s+(\d+(\.\d+)?)){2} # 2 repetitions of _x.x or _x
+                  (\s+(\d+(\.\d+)?)){3} # 3 repetitions of _x.x or _x"""
+    rc = re.compile(pattern, re.VERBOSE)
+    # Pattern.findall() takes (string, pos, endpos): passing re.MULTILINE as
+    # the 2nd argument made the search skip the first 8 characters
+    matches = rc.findall(stdout_results)
+    if len(matches) < 2:
+        raise IndexError("found %d benchmark line(s), need 2" % len(matches))
+    # index 1 is the second match; why the first is skipped is not shown here
+    return matches[1][3]
diff --git a/ubuntu_performance_deep_learning/ubuntu_performance_deep_learning.py b/ubuntu_performance_deep_learning/ubuntu_performance_deep_learning.py
new file mode 100644
index 00000000..00c6074f
--- /dev/null
+++ b/ubuntu_performance_deep_learning/ubuntu_performance_deep_learning.py
@@ -0,0 +1,85 @@
+import os
+import helper
+from autotest.client import test, utils
+from autotest.client.shared import error
+
+
+TEST_ITERATION = 3
+
+
+class ubuntu_performance_deep_learning(test.test):
+    """Autotest client test that runs the TensorFlow ResNet benchmark script."""
+    version = 1
+
+    def initialize(self):
+        pass
+
+    def _script_cmd(self, action):
+        """Return the command line running the helper script with action."""
+        p_dir = os.path.dirname(os.path.abspath(__file__))
+        uptf_cmd = os.path.join(p_dir, "ubuntu_performance_tensor_flow.sh")
+        return "{} {}".format(uptf_cmd, action)
+
+    def install_required_pkgs(self):
+        """Run the script's "setup" action and return its exit code."""
+        return utils.system(self._script_cmd("setup"), ignore_status=True)
+
+    def setup(self):
+        """Prepare the environment; a failed setup is reported, not fatal."""
+        if self.install_required_pkgs():
+            print("WARNING: setup script returned a non-zero exit code")
+
+    def tensor_flow_cnn_resnet(self, benchmark):
+        """Run the benchmark TEST_ITERATION times and print the statistics.
+
+        benchmark is the benchmark item of config.yaml; it labels the output.
+        """
+        unit = "images/sec"
+        max_error_threshold = 0.05
+        values = {}
+        benchmark = benchmark.replace("-", "_")
+        if "TEST_CONFIG" in os.environ:
+            benchmark += "_" + os.environ["TEST_CONFIG"]
+
+        cmd = self._script_cmd("test")
+        for i in range(TEST_ITERATION):
+            stdout_result = utils.system_output(cmd, retain_output=True)
+            values[i] = helper.get_stats(stdout_result)
+            if values[i]:
+                print("")
+                print("Test %d of %d:" % (i + 1, TEST_ITERATION))
+                print("{}[{}] {} {}".format(benchmark, i, values[i], unit))
+
+        # summarize only if every iteration parsed, not just the last one
+        if values and all(values.values()):
+            v = [float(value) for value in values.values()]
+            maximum = max(v)
+            minimum = min(v)
+            average = sum(v) / float(len(v))
+            max_err = (maximum - minimum) / average
+            print("")
+            print(benchmark + "_minimum {:.2f} {}".format(minimum, unit))
+            print(benchmark + "_maximum {:.2f} {}".format(maximum, unit))
+            print(benchmark + "_average {:.2f} {}".format(average, unit))
+            print(benchmark + "_maximum_error {:.2%}".format(max_err))
+            print("")
+
+            if max_err > max_error_threshold:
+                print("FAIL: maximum error is greater than 5%")
+            else:
+                print("PASS: test passes specified performance thresholds")
+        else:
+            print("NOT-RUN or FAIL to PARSE DATA")
+
+    def run_once(self, test_name):
+        """Dispatch test_name to the matching benchmark method."""
+        if test_name == "tensor-flow-cnn-resnet":
+            self.tensor_flow_cnn_resnet(test_name)
+
+            print("")
+            print("tensor_flow_cnn_resnet shell script has run.")
+
+        print("")
+
+    def postprocess_iteration(self):
+        pass
diff --git a/ubuntu_performance_deep_learning/ubuntu_performance_tensor_flow.sh b/ubuntu_performance_deep_learning/ubuntu_performance_tensor_flow.sh
new file mode 100755
index 00000000..cbf9ff0e
--- /dev/null
+++ b/ubuntu_performance_deep_learning/ubuntu_performance_tensor_flow.sh
@@ -0,0 +1,63 @@
+#!/bin/bash
+#
+# perform TensorFlow performance testing and corresponding pre-setup.
+# usage: ubuntu_performance_tensor_flow.sh {setup|test}
+
+set -eo pipefail  # exit when a command or any pipeline stage fails
+
+CONTAINER_VER="20.12"  # release part of the nvcr.io/nvidia/tensorflow image tag
+
+install_nvidia_docker() {  # add NVIDIA apt repo; curl -f aborts on HTTP errors
+    local distribution
+    distribution="$(. /etc/os-release; echo "$ID$VERSION_ID")"
+    curl -fsSL https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
+    curl -fsSL "https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list" | \
+	sudo tee /etc/apt/sources.list.d/nvidia-docker.list > /dev/null
+    sudo apt update
+    sudo apt install nvidia-docker2 -y
+    sudo systemctl restart docker
+}
+
+get_num_gpus() {
+    # print the GPU count (lines of "nvidia-smi -L"); run_test uses it for -np
+    nvidia-smi -L | wc -l
+}
+
+setup() {
+    # "setup" action: prepare the test environment (currently nvidia-docker only)
+    install_nvidia_docker
+}
+
+run_test() {  # run resnet.py in the TensorFlow container, one process per GPU
+    sudo nvidia-docker run \
+	 --shm-size=1g \
+	 --ulimit memlock=-1 \
+	 --ulimit stack=67108864 \
+	 -ti --rm nvcr.io/nvidia/tensorflow:${CONTAINER_VER}-tf1-py3 -- \
+	 mpiexec \
+	 --bind-to socket \
+	 --allow-run-as-root \
+	 -np "$(get_num_gpus)" \
+	 python -u /workspace/nvidia-examples/cnn/resnet.py \
+	 --layers=50 \
+	 --precision=fp16 \
+	 --batch_size=256 \
+	 --num_iter=300 \
+	 --iter_unit=batch \
+	 --display_every=300
+}
+
+case "$1" in  # announce the action first, then run it
+    setup)
+	printf '\n%s\n\n' "Setting up necessary test environment..."
+	setup
+	;;
+    test)
+	printf '\n%s\n\n' "Running test..."
+	run_test
+	;;
+    *)
+	echo "Usage: $0 {setup|test}" >&2
+	exit 1
+	;;
+esac
-- 
2.32.0