Add support for arm64 and Jetson boards.

Closed Pablo Rodriguez requested to merge paroque28/device-plugin:master-arm64 into master

On Jetson boards, Docker already supports NVIDIA containers. With this plugin, Kubernetes could orchestrate the pods as well.

As you may see from the code, I am not an NVIDIA expert, so any feedback is welcome.

CC: @renaudwastaken. The output of my test run is attached as proof:

root@jetson-nano-qspi-sd:~# kubectl describe pod pod1
Name:         pod1
Namespace:    default
Priority:     0
Node:         jetson-nano-qspi-sd/192.168.100.44
Start Time:   Wed, 25 Mar 2020 01:41:36 +0000
Labels:       <none>
Annotations:  kubectl.kubernetes.io/last-applied-configuration:
                {"apiVersion":"v1","kind":"Pod","metadata":{"annotations":{},"name":"pod1","namespace":"default"},"spec":{"containers":[{"args":["100000"]...
Status:       Running
IP:           10.42.0.77
IPs:
  IP:  10.42.0.77
Containers:
  pod1-ctr:
    Container ID:  docker://5ebad034c0f268eadc90745fd128ddc4fbb387e636087dd2cc11539202152f62
    Image:         paroque28/l4t-tensorflow
    Image ID:      docker-pullable://paroque28/l4t-tensorflow@sha256:730b8d8c27ebf06e92b9ce819d611ff99e2df904cb74064cdb817834a10cb45a
    Port:          <none>
    Host Port:     <none>
    Command:
      sleep
    Args:
      100000
    State:          Running
      Started:      Wed, 25 Mar 2020 01:41:40 +0000
    Ready:          True
    Restart Count:  0
    Limits:
      nvidia.com/gpu:  1
    Requests:
      nvidia.com/gpu:  1
    Environment:       <none>
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from default-token-zvplt (ro)
Conditions:
  Type              Status
  Initialized       True 
  Ready             True 
  ContainersReady   True 
  PodScheduled      True 
Volumes:
  default-token-zvplt:
    Type:        Secret (a volume populated by a Secret)
    SecretName:  default-token-zvplt
    Optional:    false
QoS Class:       BestEffort
Node-Selectors:  <none>
Tolerations:     node.kubernetes.io/not-ready:NoExecute for 300s
                 node.kubernetes.io/unreachable:NoExecute for 300s
Events:
  Type    Reason     Age        From                          Message
  ----    ------     ----       ----                          -------
  Normal  Scheduled  <unknown>  default-scheduler             Successfully assigned default/pod1 to jetson-nano-qspi-sd
  Normal  Pulling    11s        kubelet, jetson-nano-qspi-sd  Pulling image "paroque28/l4t-tensorflow"
  Normal  Pulled     10s        kubelet, jetson-nano-qspi-sd  Successfully pulled image "paroque28/l4t-tensorflow"
  Normal  Created    10s        kubelet, jetson-nano-qspi-sd  Created container pod1-ctr
  Normal  Started    9s         kubelet, jetson-nano-qspi-sd  Started container pod1-ctr
root@jetson-nano-qspi-sd:~# kubectl exec -it pod1 bash
root@pod1:/# ls
bin  boot  dev  etc  home  lib  media  mnt  opt  proc  root  run  sbin  srv  sys  tensorflow_demo.py  tmp  usr  var
root@pod1:/# python3 tensorflow_demo.py 
2020-03-25 01:42:13.687199: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudart.so.10.0
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:516: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:517: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:518: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:519: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:520: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/usr/local/lib/python3.6/dist-packages/tensorflow/python/framework/dtypes.py:525: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:541: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:542: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:543: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:544: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:545: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
/usr/local/lib/python3.6/dist-packages/tensorboard/compat/tensorflow_stub/dtypes.py:550: FutureWarning: Passing (type, 1) or '1type' as a synonym of type is deprecated; in a future version of numpy, it will be understood as (type, (1,)) / '(1,)type'.
  np_resource = np.dtype([("resource", np.ubyte, 1)])
WARNING: Logging before flag parsing goes to stderr.
W0325 01:42:24.940386 547524526096 deprecation.py:506] From /usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/init_ops.py:1251: calling VarianceScaling.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
# Fit model on training data
Train on 50000 samples, validate on 10000 samples
2020-03-25 01:42:27.510438: W tensorflow/core/platform/profile_utils/cpu_utils.cc:98] Failed to find bogomips in /proc/cpuinfo; cannot determine CPU frequency
2020-03-25 01:42:27.511017: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x3752bd0 executing computations on platform Host. Devices:
2020-03-25 01:42:27.511074: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): <undefined>, <undefined>
2020-03-25 01:42:27.520490: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcuda.so.1
2020-03-25 01:42:27.640300: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:972] ARM64 does not support NUMA - returning NUMA node zero
2020-03-25 01:42:27.640631: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x49be0e0 executing computations on platform CUDA. Devices:
2020-03-25 01:42:27.640746: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): NVIDIA Tegra X1, Compute Capability 5.3
2020-03-25 01:42:27.641281: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:972] ARM64 does not support NUMA - returning NUMA node zero
2020-03-25 01:42:27.641426: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1640] Found device 0 with properties: 
name: NVIDIA Tegra X1 major: 5 minor: 3 memoryClockRate(GHz): 0.9216
pciBusID: 0000:00:00.0
2020-03-25 01:42:27.641771: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudart.so.10.0
2020-03-25 01:42:27.675381: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcublas.so.10.0
2020-03-25 01:42:27.707481: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcufft.so.10.0
2020-03-25 01:42:27.749666: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcurand.so.10.0
2020-03-25 01:42:27.846545: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcusolver.so.10.0
2020-03-25 01:42:27.879553: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcusparse.so.10.0
2020-03-25 01:42:28.076276: I tensorflow/stream_executor/platform/default/dso_loader.cc:42] Successfully opened dynamic library libcudnn.so.7
2020-03-25 01:42:28.076627: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:972] ARM64 does not support NUMA - returning NUMA node zero
2020-03-25 01:42:28.076960: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:972] ARM64 does not support NUMA - returning NUMA node zero
2020-03-25 01:42:28.077110: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1763] Adding visible gpu devices: 0
2020-03-25 01:42:28.077259: I tensorflow/stream_executor/platform/default/dso_loader.cc:42]