...
 
Commits (47)
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/micro/debug_log.h"
#include <cstdint>
#include <cstdio>
#include <cstring>
// Print to debug console by default. One can define next to extend destinations
// set: EMSDP_LOG_TO_MEMORY
// : fill .debug_log memory region (data section) with passed chars.
// EMSDP_LOG_TO_HOST
// : Use MetaWare HostLink to print output log. Requires Synopsys MetaWare
// debugger
// EMSDP_LOG_TO_UART
// : use default debug UART (out to FTDI channel 0). The same USB Port is used
// for JTAG.
#define EMSDP_LOG_TO_UART
// Memory size for symbols dump in EMSDP_LOG_TO_MEMORY destination
#define EMSDP_LOG_TO_MEMORY_SIZE (2 * 1024)
// EMSDP Debug UART related defines (registers and bits)
#define EMSDP_DBG_UART_BASE (0xF0004000U)
#define DW_UART_CPR_FIFO_STAT (1 << 10)
#define DW_UART_USR_TFNF (0x02)
#define DW_UART_LSR_TXD_EMPTY (0x20)
// EMSDP UART registers map (only necessary fields)
// Memory-mapped register layout of the DesignWare UART as wired on EMSDP.
// The RES* members are padding so that each named register lands at its
// byte offset implied by the counts below: DATA 0x00, LSR 0x14, USR 0x7C,
// CPR 0xF4. The typedef is volatile so every access is a real bus access
// and cannot be cached or elided by the compiler.
typedef volatile struct dw_uart_reg {
uint32_t DATA; /* data in/out and DLL */
uint32_t RES1[4];
uint32_t LSR; /* Line Status Register */
uint32_t RES2[25];
uint32_t USR; /* UART status register */
uint32_t RES3[29];
uint32_t CPR; /* Component parameter register */
} DW_UART_REG;
// For simplicity we assume U-boot has already initialized debug console during
// application loading (or on reset). Hence, we use only status and data
// registers to organize blocking loop for printing symbols. No input and no IRQ
// handling. See embarc_osp repository for full EMSDP uart driver.
// (https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_osp)
void DbgUartSendStr(const char* s) {
DW_UART_REG* uart_reg_ptr = (DW_UART_REG*)(EMSDP_DBG_UART_BASE);
const char* src = s;
while (*src) {
// Check uart status to send char
bool uart_is_ready = false;
if (uart_reg_ptr->CPR & DW_UART_CPR_FIFO_STAT)
uart_is_ready = ((uart_reg_ptr->USR & DW_UART_USR_TFNF) != 0);
else
uart_is_ready = ((uart_reg_ptr->LSR & DW_UART_LSR_TXD_EMPTY) != 0);
// Send char if uart is ready.
if (uart_is_ready) uart_reg_ptr->DATA = *src++;
}
}
// Simple dump of symbols to a pre-allocated memory region.
// When the total log exceeds the memory region size, the cursor wraps to
// its beginning. The memory region can be viewed/read with a debugger
// afterward.
// Append a NUL-terminated string to a static circular log buffer placed in
// the .debug_log data section, then drop a '^' marker at the write cursor
// so the current end of the log is visible in a debugger memory view.
void LogToMem(const char* s) {
  // Index of the next free cell; persists across calls.
  static int cursor = 0;
#pragma Bss(".debug_log")
  static volatile char debug_log_mem[EMSDP_LOG_TO_MEMORY_SIZE];
#pragma Bss()
  for (const char* src = s; *src != '\0'; ++src) {
    debug_log_mem[cursor] = *src;
    // Wrap-around keeping cursor strictly inside [0, SIZE-1]. The previous
    // comparison (cursor < SIZE ? cursor + 1 : 0) let cursor reach SIZE,
    // causing a one-byte out-of-bounds write on the following store.
    cursor = (cursor + 1) % EMSDP_LOG_TO_MEMORY_SIZE;
  }
  debug_log_mem[cursor] = '^';
}
// TFLM debug-log hook for the EMSDP target. Fans the string out to every
// destination enabled via the EMSDP_LOG_TO_* defines above; compiles to a
// no-op when TF_LITE_STRIP_ERROR_STRINGS is set, to minimize code size.
extern "C" void DebugLog(const char* s) {
#ifndef TF_LITE_STRIP_ERROR_STRINGS
#if defined EMSDP_LOG_TO_UART
DbgUartSendStr(s);
#endif
#if defined EMSDP_LOG_TO_MEMORY
#warning \
"EMSDP_LOG_TO_MEMORY is defined. View .debug_log memory region for stdout"
LogToMem(s);
#endif
#if defined EMSDP_LOG_TO_HOST
#warning "EMSDP_LOG_TO_HOST is defined. Ensure hostlib is linked."
// MetaWare HostLink routes stderr output to the debugger console.
fprintf(stderr, "%s", s);
#endif
#endif  // TF_LITE_STRIP_ERROR_STRINGS
}
......@@ -14,6 +14,7 @@ of the device.
## Table of contents
- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp)
- [Deploy to Arduino](#deploy-to-arduino)
- [Deploy to ESP32](#deploy-to-esp32)
- [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge)
......@@ -21,6 +22,78 @@ of the device.
- [Run the tests on a development machine](#run-the-tests-on-a-development-machine)
- [Train your own model](#train-your-own-model)
## Deploy to ARC EM SDP
The following instructions will help you to build and deploy this example to
[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
board. General information and instructions on using the board with TensorFlow
Lite Micro can be found in the common
[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md).
### Initial Setup
Follow the instructions on the
[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
to get and install all required tools for work with ARC EM SDP.
### Generate Example Project
The example project for ARC EM SDP platform can be generated with the following
command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_hello_world_make_project
```
### Build and Run Example
For more detailed information on building and running examples see the
appropriate sections of general descriptions of the
[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP).
In the directory with generated project you can also find a
*README_ARC_EMSDP.md* file with instructions and options on building and
running. Here we only briefly mention main steps which are typically enough to
get it started.
1. You need to
[connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board)
and open a serial connection.
2. Go to the generated example project directory
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/hello_world/make
```
3. Build the example using
```
make app
```
4. To generate artefacts for self-boot of example from the board use
```
make flash
```
5. To run application from the board using microSD card:
* Copy the content of the created /bin folder into the root of microSD
card. Note that the card must be formatted as FAT32 with default cluster
size (but less than 32 Kbytes)
* Plug in the microSD card into the J11 connector.
* Push the RST button. If a red LED is lit beside RST button, push the CFG
button.
6. If you have the MetaWare Debugger installed in your environment:
* To run application from the console using it type `make run`.
* To stop the execution type `Ctrl+C` in the console several times.
In both cases (step 5 and 6) you will see the application output in the serial
terminal.
## Deploy to Arduino
The following instructions will help you build and deploy this sample
......
......@@ -16,6 +16,7 @@ kilobytes of Flash.
## Table of contents
- [Deploy to ARC EM SDP](#deploy-to-arc-em-sdp)
- [Deploy to Arduino](#deploy-to-arduino)
- [Deploy to ESP32](#deploy-to-esp32)
- [Deploy to SparkFun Edge](#deploy-to-sparkfun-edge)
......@@ -25,6 +26,95 @@ kilobytes of Flash.
- [Run the tests on a development machine](#run-the-tests-on-a-development-machine)
- [Train your own model](#train-your-own-model)
## Deploy to ARC EM SDP
The following instructions will help you to build and deploy this example to
[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
board. General information and instructions on using the board with TensorFlow
Lite Micro can be found in the common
[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md).
This example is quantized with symmetric uint8 scheme. As noted in
[kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md),
embARC MLI supports optimized kernels for int8 quantization only. Therefore,
this example will only use TFLM reference kernels.
The ARC EM SDP board contains a rich set of extension interfaces. You can
choose any compatible microphone and modify
[audio_provider.cc](/tensorflow/lite/micro/examples/micro_speech/audio_provider.cc)
file accordingly to use input from your specific microphone. By default, results of
running this example are printed to the console. If you would like to instead
implement some target-specific actions, you need to modify
[command_responder.cc](/tensorflow/lite/micro/examples/micro_speech/command_responder.cc)
accordingly.
The reference implementations of these files are used by default on the EM SDP.
### Initial setup
Follow the instructions on the
[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
to get and install all required tools for work with ARC EM SDP.
### Generate Example Project
As default example doesn’t provide any output without real audio, it is
recommended to get started with example for mock data. The project for ARC EM
SDP platform can be generated with the following command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_micro_speech_mock_make_project
```
### Build and Run Example
For more detailed information on building and running examples see the
appropriate sections of general descriptions of the
[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP).
In the directory with generated project you can also find a
*README_ARC_EMSDP.md* file with instructions and options on building and
running. Here we only briefly mention main steps which are typically enough to
get it started.
1. You need to
[connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board)
and open a serial connection.
2. Go to the generated example project directory
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/micro_speech_mock/make
```
3. Build the example using
```
make app
```
4. To generate artefacts for self-boot of example from the board use
```
make flash
```
5. To run application from the board using microSD card:
* Copy the content of the created /bin folder into the root of microSD
card. Note that the card must be formatted as FAT32 with default cluster
size (but less than 32 Kbytes)
* Plug in the microSD card into the J11 connector.
* Push the RST button. If a red LED is lit beside RST button, push the CFG
button.
6. If you have the MetaWare Debugger installed in your environment:
* To run application from the console using it type `make run`.
* To stop the execution type `Ctrl+C` in the console several times.
In both cases (step 5 and 6) you will see the application output in the serial
terminal.
## Deploy to Arduino
The following instructions will help you build and deploy this sample
......
ifeq ($(TARGET), arc_emsdp)
# Patch of arc make project to adjust it specifically for micro speech example.
# In particular:
# - Extend Heap and stack size for application needs
# - Use Linker command file with better usage of fast memory
# - In case project was generated with MLI usage, reduce scratch buffers.
MICRO_SPEECH_HDRS += \
micro_speech_patch.txt
MICRO_SPEECH_TEST_HDRS += \
micro_speech_patch.txt
MICRO_SPEECH_MOCK_HDRS += \
micro_speech_patch.txt
# Pattern rule: overwrite the generated project's emsdp.lcf ($<) with the
# EMSDP v2 linker file, grow heap/stack to 16K each in the generated
# Makefile ($(word 2, $^)), and zero the MLI scratch-buffer sizes.
# NOTE(review): the "[email protected]" tokens below look like a scrape artifact of
# the make automatic variable $@ -- verify against the upstream Makefile.inc.
%/micro_speech_patch.txt: %/emsdp.lcf %/Makefile
@cp tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf $<
@echo emsdp.lcf > [email protected]
@sed -E -i 's#-Hheap=[^ ]*#\-Hheap=16K \-Hstack=16K#g' $(word 2, $^)
@sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\
CXXFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0\
CCFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0#'\
$(word 2, $^)
@echo Makefile >> [email protected]
endif
......@@ -5,7 +5,9 @@ network to recognize people in images captured by a camera. It is designed to
run on systems with small amounts of memory such as microcontrollers and DSPs.
## Table of contents
- [Getting started](#getting-started)
- [Running on ARC EM SDP](#running-on-arc-em-sdp)
- [Running on Arduino](#running-on-arduino)
- [Running on ESP32](#running-on-esp32)
- [Running on SparkFun Edge](#running-on-sparkfun-edge)
......@@ -13,6 +15,94 @@ run on systems with small amounts of memory such as microcontrollers and DSPs.
- [Debugging image capture](#debugging-image-capture)
- [Training your own model](#training-your-own-model)
## Running on ARC EM SDP
The following instructions will help you to build and deploy this example to
[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
board. General information and instructions on using the board with TensorFlow
Lite Micro can be found in the common
[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md).
This example is quantized with symmetric uint8 scheme. As noted in
[kernels/arc_mli/README.md](/tensorflow/lite/micro/kernels/arc_mli/README.md),
embARC MLI supports optimized kernels for int8 quantization only. Therefore,
this example will only use TFLM reference kernels.
The ARC EM SDP board contains a rich set of extension interfaces. You can
choose any compatible camera and modify
[image_provider.cc](/tensorflow/lite/micro/examples/person_detection/image_provider.cc)
file accordingly to use input from your specific camera. By default, results of
running this example are printed to the console. If you would like to instead
implement some target-specific actions, you need to modify
[detection_responder.cc](/tensorflow/lite/micro/examples/person_detection/detection_responder.cc)
accordingly.
The reference implementations of these files are used by default on the EM SDP.
### Initial setup
Follow the instructions on the
[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
to get and install all required tools for work with ARC EM SDP.
### Generate Example Project
The example project for ARC EM SDP platform can be generated with the following
command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_make_project
```
### Build and Run Example
For more detailed information on building and running examples see the
appropriate sections of general descriptions of the
[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP).
In the directory with generated project you can also find a
*README_ARC_EMSDP.md* file with instructions and options on building and
running. Here we only briefly mention main steps which are typically enough to
get it started.
1. You need to
[connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board)
and open a serial connection.
2. Go to the generated example project directory
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection/make
```
3. Build the example using
```
make app
```
4. To generate artefacts for self-boot of example from the board use
```
make flash
```
5. To run application from the board using microSD card:
* Copy the content of the created /bin folder into the root of microSD
card. Note that the card must be formatted as FAT32 with default cluster
size (but less than 32 Kbytes)
* Plug in the microSD card into the J11 connector.
* Push the RST button. If a red LED is lit beside RST button, push the CFG
button.
6. If you have the MetaWare Debugger installed in your environment:
* To run application from the console using it type `make run`.
* To stop the execution type `Ctrl+C` in the console several times.
In both cases (step 5 and 6) you will see the application output in the serial
terminal.
## Running on Arduino
The following instructions will help you build and deploy this sample
......
ifeq ($(TARGET), arc_emsdp)
# Patch of arc make project to adjust it specifically
# for person detection example. In particular:
# - Use Linker command file with better usage of fast memory
# - In case project was generated with MLI usage, reduce scratch buffers.
person_detection_HDRS += \
person_detection_patch.txt
person_detection_TEST_HDRS += \
person_detection_patch.txt
# Pattern rule: overwrite the generated project's emsdp.lcf ($<) with the
# EMSDP v2 linker file and zero the MLI scratch-buffer sizes in the
# generated Makefile ($(word 2, $^)).
# NOTE(review): "[email protected]" below looks like a scrape artifact of the make
# automatic variable $@ -- verify against the upstream Makefile.inc.
%/person_detection_patch.txt: %/emsdp.lcf %/Makefile
@cp tensorflow/lite/micro/tools/make/targets/arc/emsdp/emsdp_v2.lcf $<
@echo emsdp.lcf > [email protected]
@sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= false\n\
CXXFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0\
CCFLAGS += -DSCRATCH_MEM_X_SIZE=0 -DSCRATCH_MEM_Y_SIZE=0 -DSCRATCH_MEM_Z_SIZE=0#'\
$(word 2, $^)
@echo Makefile >> [email protected]
endif
......@@ -6,13 +6,101 @@ run on systems with small amounts of memory such as microcontrollers and DSPs.
This uses the experimental int8 quantized version of the person detection model.
## Table of contents
- [Getting started](#getting-started)
- [Running on ARC EM SDP](#running-on-arc-em-sdp)
- [Running on Arduino](#running-on-arduino)
- [Running on SparkFun Edge](#running-on-sparkfun-edge)
- [Run the tests on a development machine](#run-the-tests-on-a-development-machine)
- [Debugging image capture](#debugging-image-capture)
- [Training your own model](#training-your-own-model)
## Running on ARC EM SDP
The following instructions will help you to build and deploy this example to
[ARC EM SDP](https://www.synopsys.com/dw/ipdir.php?ds=arc-em-software-development-platform)
board. General information and instructions on using the board with TensorFlow
Lite Micro can be found in the common
[ARC targets description](/tensorflow/lite/micro/tools/make/targets/arc/README.md).
This example uses asymmetric int8 quantization and can therefore leverage
optimized int8 kernels from the embARC MLI library
The ARC EM SDP board contains a rich set of extension interfaces. You can choose
any compatible camera and modify
[image_provider.cc](/tensorflow/lite/micro/examples/person_detection_experimental/image_provider.cc)
file accordingly to use input from your specific camera. By default, results of
running this example are printed to the console. If you would like to instead
implement some target-specific actions, you need to modify
[detection_responder.cc](/tensorflow/lite/micro/examples/person_detection_experimental/detection_responder.cc)
accordingly.
The reference implementations of these files are used by default on the EM SDP.
### Initial setup
Follow the instructions on the
[ARC EM SDP Initial Setup](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP)
to get and install all required tools for work with ARC EM SDP.
### Generate Example Project
The example project for ARC EM SDP platform can be generated with the following
command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp generate_person_detection_int8_make_project
```
### Build and Run Example
For more detailed information on building and running examples see the
appropriate sections of general descriptions of the
[ARC EM SDP usage with TFLM](/tensorflow/lite/micro/tools/make/targets/arc/README.md#ARC-EM-Software-Development-Platform-ARC-EM-SDP).
In the directory with generated project you can also find a
*README_ARC_EMSDP.md* file with instructions and options on building and
running. Here we only briefly mention main steps which are typically enough to
get it started.
1. You need to
[connect the board](/tensorflow/lite/micro/tools/make/targets/arc/README.md#connect-the-board)
and open a serial connection.
2. Go to the generated example project directory
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make
```
3. Build the example using
```
make app
```
4. To generate artefacts for self-boot of example from the board use
```
make flash
```
5. To run application from the board using microSD card:
* Copy the content of the created /bin folder into the root of microSD
card. Note that the card must be formatted as FAT32 with default cluster
size (but less than 32 Kbytes)
* Plug in the microSD card into the J11 connector.
* Push the RST button. If a red LED is lit beside RST button, push the CFG
button.
6. If you have the MetaWare Debugger installed in your environment:
* To run application from the console using it type `make run`.
* To stop the execution type `Ctrl+C` in the console several times.
In both cases (step 5 and 6) you will see the application output in the serial
terminal.
## Running on Arduino
The following instructions will help you build and deploy this sample
......
ifeq ($(TARGET), arc_emsdp)
# Patch of arc make project to adjust it specifically
# for experimental person detection example. In particular:
# - Use Linker command file with better usage of fast memory
# - Stripout TFLM reference code by default.
person_detection_HDRS += \
person_detection_int8_patch.txt
person_detection_TEST_HDRS += \
person_detection_int8_patch.txt
# Fix: the final "echo Makefile" must APPEND (>>), as the sibling patch
# rules for micro_speech and person_detection do; with ">" it truncated the
# file and clobbered the "emsdp.lcf" entry written two lines above.
%/person_detection_int8_patch.txt: %/emsdp.lcf %/Makefile
@cp tensorflow/lite/micro/examples/person_detection_experimental/arc_emsdp/emsdp.lcf $<
@echo emsdp.lcf > [email protected]
@sed -E -i 's#MLI_ONLY *\?= *false#MLI_ONLY \?= true#' $(word 2, $^)
@echo Makefile >> [email protected]
endif
# Copyright 2020 The TensorFlow Authors. All Rights Reserved.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# http://www.apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
#
# Difference with common EMSDP LCF file (to reduce data access time):
# - move data from external PSRAM to on-chip memory
# - move text from SRAM to ICCM
#
# CCMWRAP memory regions indicate unusable portions of the address space
# due to CCM memory wrapping into upper addresses beyond its size
MEMORY {
# External PSRAM: large but slow; lower half only, first 0x400 bytes skipped.
PSRAM : ORIGIN = 0x10000400, LENGTH = (0x01000000 >> 1) - 0x400
SRAM : ORIGIN = 0x20000000, LENGTH = 0x00040000
# Interrupt vector table region at the bottom of the ICCM address range.
IVT : ORIGIN = 0x60000000, LENGTH = 0x400
ICCM0 : ORIGIN = 0x60000400, LENGTH = (0x00020000 - 0x400)
# CCMWRAP0: ORIGIN = 0x60020000, LENGTH = 0x0ffe0000
DCCM : ORIGIN = 0x80000000, LENGTH = 0x00020000
# CCMWRAP1: ORIGIN = 0x80020000, LENGTH = 0x0ffe0000
XCCM : ORIGIN = 0x90000000, LENGTH = 0x00004000
# CCMWRAP2: ORIGIN = 0x90004000, LENGTH = 0x0fffc000
YCCM : ORIGIN = 0xa0000000, LENGTH = 0x00004000
# CCMWRAP3: ORIGIN = 0xa0004000, LENGTH = 0x0fffc000
}
SECTIONS {
GROUP BLOCK(4) : {
.vectors (TEXT) SIZE(DEFINED _IVTSIZE?_IVTSIZE:756): {} = FILL(0xa5a5a5a5,4)
} > IVT
/* All code goes to fast on-chip ICCM (see file header). */
GROUP BLOCK(4): {
.text? : { *('.text$crt*') }
* (TEXT): {}
* (LIT): {}
} > ICCM0
/* Read-only data stays in external PSRAM. */
GROUP BLOCK(4): {
.rodata_in_data? : {}
} > PSRAM
/* Writable data, BSS and the .debug_log region go to on-chip SRAM. */
GROUP BLOCK(4): {
/* _SDA_BASE_ computed implicitly */
.sdata?: {}
.sbss?: {}
* (DATA): {}
* (BSS): {}
.debug_log? : {}
} > SRAM
/* Heap and stack in data closely-coupled memory; sizes overridable via
_HEAPSIZE/_STACKSIZE (set by the -Hheap/-Hstack options patched into the
generated Makefiles above). */
GROUP BLOCK(4): {
.Zdata? : {}
.heap? ALIGN(4) SIZE(DEFINED _HEAPSIZE?_HEAPSIZE:8K): {}
.stack ALIGN(4) SIZE(DEFINED _STACKSIZE?_STACKSIZE:8K): {}
} > DCCM
/* X/Y memory banks hold DSP scratch data (.Xdata/.Ydata). */
GROUP BLOCK(4): {
.Xdata? : {}
} > XCCM
GROUP BLOCK(4): {
.Ydata? : {}
} > YCCM
}
# EmbARC MLI Library Based Optimizations of TensorFlow Lite Micro Kernels for ARC Platforms.
This folder contains kernel implementations which use optimized
[embARC MLI Library](https://github.com/foss-for-synopsys-dwc-arc-processors/embarc_mli).
It allows acceleration of inference operations which use int8 (asymmetric
quantization).
## Usage
embARC MLI Library is used by default to speed up execution of some kernels for
asymmetrically quantized layers. This means that usual project generation for
ARC specific target implies usage of embARC MLI.
For example:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp generate_person_detection_int8_make_project
```
In case MLI implementation can’t be used, kernels in this folder fallback to
TFLM reference implementations. For applications which may not benefit from MLI
library, projects can be generated without these implementations by adding
`TAGS=no_arc_mli` in the command line, which can reduce overall code size:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp TAGS=no_arc_mli generate_person_detection_int8_make_project
```
For ARC EM SDP board, a pre-compiled MLI library is downloaded and used in the
application. For a custom target ARC-based platform, MLI sources are downloaded
and compiled during project generation phase. To build library from sources for
ARC EM SDP platform, add `BUILD_ARC_MLI=true` option to make command:
```
make -f tensorflow/lite/micro/tools/make/Makefile TARGET=arc_emsdp BUILD_ARC_MLI=true generate_person_detection_int8_make_project
```
If an application exclusively uses accelerated MLI kernel implementations, one
can strip out TFLM reference kernel implementations to reduce code size of
application. Build application with `MLI_ONLY=true` option in generated project
(after the project was built):
```
cd tensorflow/lite/micro/tools/make/gen/arc_emsdp_arc/prj/person_detection_int8/make
make app MLI_ONLY=true
```
If you try this and application execution fails, then most probably MLI can’t be
used for some nodes and you need to revert to using TFLM reference kernels.
## Limitations
Currently, the MLI Library provides optimized implementation only for int8
(asymmetric) versions of the following kernels: 1. Convolution 2D – Per axis
quantization only, `dilation_ratio==1` 2. Depthwise Convolution 2D – Per axis
quantization only, `dilation_ratio==1` 3. Average Pooling 4. Max Pooling 5.
Fully Connected
Currently only
[/tensorflow/lite/micro/examples/person_detection_experimental](/tensorflow/lite/micro/examples/person_detection_experimental)
is quantized using this specification. Other examples can be executed on
ARC-based targets, but will only use reference kernels.
## Scratch Buffers and Slicing
The following information applies only for ARC EM SDP and other targets with XY
memory. embARC MLI uses specific optimizations which assumes node operands are
in XY memory and/or DCCM (Data Closely Coupled Memory). As operands might be
quite big and may not fit in available XY memory, special slicing logic is
applied which allows kernel calculations to be split into multiple parts. For
this reason, internal static buffers are allocated in these X, Y and DCCM memory
banks and used to execute sub-calculations.
All this is performed automatically and invisible to the user. Half of the DCCM
memory bank and the full XY banks are occupied for MLI specific needs. If the
user needs space in XY memory for other tasks, these arrays can be reduced by
setting specific sizes. For this, add the following option to build command
replacing **<size[a|b|c]>** with required values:
```
EXT_CFLAGS=”-DSCRATCH_MEM_Z_SIZE=<size_a> -DSCRATCH_MEM_X_SIZE=<size_b> -DSCRATCH_MEM_Y_SIZE=<size_c>”
```
For example, to reduce sizes of arrays placed in DCCM and XCCM to 32k and 8k
respectively, use next command:
```
make app EXT_CFLAGS=”-DSCRATCH_MEM_Z_SIZE=32*1024 -DSCRATCH_MEM_X_SIZE=8*1024”
```
## License
TensorFlow's code is covered by the Apache2 License included in the repository,
and third party dependencies are covered by their respective licenses, in the
third_party folder of this package.
This diff is collapsed.
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "mli_slicers.h" // NOLINT
#include <algorithm>
namespace tflite {
namespace ops {
namespace micro {
TensorSlicer::TensorSlicer(const mli_tensor* full_tensor, int slice_dim,
                           int slice_size, int padding_pre, int padding_post,
                           int overlap, bool interleave_mode)
    : full_tensor_(full_tensor),
      sliceDim_(slice_dim),
      pad_pre_(padding_pre),
      pad_post_(padding_post),
      overlap_(overlap),
      sub_cfg_{0},
      sub_tensor_{0},
      done_(false) {
  // Interleave mode slices from the innermost dimension up to slice_dim
  // (e.g. the C dimension of an HWC tensor); slice data is then no longer
  // contiguous in memory. Non-interleave mode slices from the outermost
  // dimension down to slice_dim (e.g. the H dimension of HWC); slices stay
  // contiguous if the input tensor was contiguous.
  for (int dim = 0; dim < full_tensor->rank; ++dim) {
    // "Collapsed" dimensions are iterated one element at a time.
    const bool collapsed =
        interleave_mode ? (dim > slice_dim) : (dim < slice_dim);
    if (dim == slice_dim) {
      sub_cfg_.size[dim] = slice_size;
    } else if (collapsed) {
      sub_cfg_.size[dim] = 1;
    } else {
      sub_cfg_.size[dim] = full_tensor->shape[dim];
    }
  }
  // In interleave mode the sub-tensor keeps the full rank; otherwise the
  // leading collapsed dimensions are dropped.
  sub_cfg_.sub_tensor_rank =
      interleave_mode ? full_tensor->rank : full_tensor->rank - slice_dim;

  ComputeSubTensor();
}
// Materialize the current slice (at the offsets held in sub_cfg_) into
// sub_tensor_, clipping it to the real tensor extent and recording how much
// of the requested pre/post padding actually applies to this slice.
void TensorSlicer::ComputeSubTensor(void) {
// sub_cfg_ tracks the iteration state; work on a copy so clipping and
// padding adjustments for this slice do not disturb later iterations.
mli_sub_tensor_cfg cfg_new = sub_cfg_;
// begin/end span the complete input region for this slice, including the
// virtual padding areas outside the tensor.
const int begin = (int)sub_cfg_.offset[sliceDim_] - pad_pre_;
// end is clipped to the end of the full padded input region; needed for
// cases where the last slice is smaller than the rest.
const int end = std::min(begin + sub_cfg_.size[sliceDim_] + overlap_,
full_tensor_->shape[sliceDim_] + pad_post_);
// The start coordinate of the subtensor is clipped to zero
cfg_new.offset[sliceDim_] = std::max(begin, 0);
// and the stop coordinate is clipped to the size of the full tensor.
const int stop_coord =
std::min(end, static_cast<int>(full_tensor_->shape[sliceDim_]));
// Size of the clipped subtensor along the slice dimension.
cfg_new.size[sliceDim_] = stop_coord - cfg_new.offset[sliceDim_];
// Whatever was clipped off [begin, end) becomes the effective padding
// reported via GetPaddingPre()/GetPaddingPost().
actual_padding_pre = cfg_new.offset[sliceDim_] - begin;
actual_padding_post = end - stop_coord;
mli_hlp_create_subtensor(full_tensor_, &cfg_new, &sub_tensor_);
}
// Advance the iteration state to the next slice, odometer-style: bump the
// innermost offset and propagate carries outward; a carry out of dimension
// 0 means the whole tensor has been visited.
void TensorSlicer::Next(void) {
  for (int dim = full_tensor_->rank - 1; dim >= 0; --dim) {
    sub_cfg_.offset[dim] += sub_cfg_.size[dim];
    if (sub_cfg_.offset[dim] < full_tensor_->shape[dim]) {
      // No carry out of this dimension; the new position is valid.
      break;
    }
    // Wrap this dimension and carry into the next outer one.
    sub_cfg_.offset[dim] = 0;
    if (dim == 0) done_ = true;
  }
  if (!done_) ComputeSubTensor();
}
// True once Next() has wrapped past the last slice of the tensor.
bool TensorSlicer::Done(void) { return done_; }
// Padding actually applied to the current slice along the slice dimension,
// as computed by the most recent ComputeSubTensor() call.
int TensorSlicer::GetPaddingPre(void) { return actual_padding_pre; }
int TensorSlicer::GetPaddingPost(void) { return actual_padding_post; }
// View of the current slice; contents are valid until Next() is called.
mli_tensor* TensorSlicer::Sub(void) { return &sub_tensor_; }
} // namespace micro
} // namespace ops
} // namespace tflite
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_
#define TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_
#include "mli_api.h" // NOLINT
namespace tflite {
namespace ops {
namespace micro {
// Iterates over an MLI tensor in slices along a single dimension, exposing
// each slice as an MLI sub-tensor that can be processed independently
// (e.g. so it fits in fast local memory). Virtual edge padding and overlap
// between consecutive slices are accounted for when each sub-tensor and its
// effective padding are computed.
class TensorSlicer {
 public:
  // full_tensor:  tensor to slice (not owned; must outlive the slicer).
  // slice_dim:    index of the dimension to slice along.
  // slice_size:   extent of each slice in that dimension.
  // padding_pre:  virtual zero padding before the tensor in slice_dim.
  // padding_post: virtual zero padding after the tensor in slice_dim.
  // overlap:      elements shared between consecutive slices.
  // interleave_mode: NOTE(review): semantics are defined in the .cc file,
  //   which is outside this header — confirm against the implementation.
  TensorSlicer(const mli_tensor* full_tensor, int slice_dim, int slice_size,
               int padding_pre = 0, int padding_post = 0, int overlap = 0,
               bool interleave_mode = false);
  ~TensorSlicer() = default;

  // Advances to the next slice; Done() becomes true once exhausted.
  void Next();
  // True once all slices have been visited.
  bool Done();
  // Padding actually required before/after the current slice after clipping
  // to the bounds of the full tensor.
  int GetPaddingPre();
  int GetPaddingPost();
  // Sub-tensor describing the current slice.
  mli_tensor* Sub();

  // Default constructor is deleted: a slicer is only meaningful when bound
  // to a tensor.
  TensorSlicer() = delete;

 private:
  const mli_tensor* full_tensor_;  // Tensor being sliced (not owned).
  mli_tensor sub_tensor_;          // Current slice.
  mli_sub_tensor_cfg sub_cfg_;     // Offset/size of the current slice.
  bool done_;                      // All slices visited?
  int sliceDim_;                   // Dimension being sliced.
  int pad_pre_, pad_post_, overlap_;
  // Padding actually applied to the current slice after clipping.
  int actual_padding_pre, actual_padding_post;

  void ComputeSubTensor();
};
} // namespace micro
} // namespace ops
} // namespace tflite
#endif // TENSORFLOW_LITE_MICRO_KERNELS_ARC_MLI_SLICERS_H_
This diff is collapsed.
This diff is collapsed.
/* Copyright 2020 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_
#define TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_
#include "mli_api.h" // NOLINT
#include "tensorflow/lite/c/common.h"
namespace tflite {
namespace ops {
namespace micro {
/**
 * @brief Function to allocate scratch buffers for the convolution tensors
 *
 * @details This function will update the data pointers in the 4 tensors with
 * pointers to scratch buffers in fast local memory.
 *
 * @param context [I] pointer to TfLite context (needed for error handling)
 * @param in [IO] pointer to the input tensor
 * @param weights [IO] pointer to the weights tensor
 * @param bias [IO] pointer to the bias tensor
 * @param out [IO] pointer to the output tensor
 *
 * @return Tf Lite status code
 */
TfLiteStatus get_arc_scratch_buffer_for_conv_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* weights,
mli_tensor* bias,
mli_tensor* out);
/**
 * @brief Function to allocate scratch buffers for pooling kernels with only
 * input and output buffers
 *
 * @details This function will update the data pointers in the 2 tensors with
 * pointers to scratch buffers in fast local memory.
 *
 * @param context [I] pointer to TfLite context (needed for error handling)
 * @param in [IO] pointer to the input tensor
 * @param out [IO] pointer to the output tensor
 *
 * @return Tf Lite status code
 */
TfLiteStatus get_arc_scratch_buffer_for_pooling_tensors(TfLiteContext* context,
mli_tensor* in,
mli_tensor* out);
/**
 * @brief Function to allocate scratch buffers for the fully connected tensors
 *
 * @details This function will update the data pointers in the 4 tensors with
 * pointers to scratch buffers in fast local memory.
 *
 * @param context [I] pointer to TfLite context (needed for error handling)
 * @param in [IO] pointer to the input tensor
 * @param weights [IO] pointer to the weights tensor
 * @param bias [IO] pointer to the bias tensor
 * @param out [IO] pointer to the output tensor
 *
 * @return Tf Lite status code
 */
TfLiteStatus get_arc_scratch_buffer_for_fully_connect_tensors(
TfLiteContext* context, mli_tensor* in, mli_tensor* weights,
mli_tensor* bias, mli_tensor* out);
/**
 * @brief Function to calculate slice size for io tensors
 *
 * @details This function will calculate the slice size in the height dimension
 * for input and output tensors. It takes into account the kernel size and the
 * padding. The function will look at the capacity field in the in and out
 * tensor to determine the available buffer size.
 *
 * @param in [I] pointer to the input tensor
 * @param out [I] pointer to the output tensor
 * @param kernelHeight [I] size of the kernel in height dimension
 * @param strideHeight [I] input stride in height dimension
 * @param padding_top [I] number of lines with zeros at the top
 * @param padding_bot [I] number of lines with zeros at the bottom
 * @param in_slice_height [O] slice size in height dimension for the input
 * tensor
 * @param out_slice_height [O] slice size in height dimension for the output
 * tensor
 *
 * @return Tf Lite status code
 */
TfLiteStatus arc_scratch_buffer_calc_slice_size_io(
const mli_tensor* in, const mli_tensor* out, const int kernelHeight,
const int strideHeight, const int padding_top, const int padding_bot,
int* in_slice_height, int* out_slice_height);
/**
 * @brief Function to calculate slice size for weight slicing
 *
 * @details This function will calculate the slice size in the output channel
 * dimension for weight and bias tensors. The function will look at the
 * capacity field in the weights and bias tensor to determine the available
 * buffer size.
 *
 * @param weights [I] pointer to the weights tensor
 * @param bias [I] pointer to the bias tensor
 * @param weight_out_ch_dimension [I] dimension of the output channels in the
 * weights tensor
 * @param slice_channels [O] slice size in output channel dimension
 *
 * @return Tf Lite status code
 */
TfLiteStatus arc_scratch_buffer_calc_slice_size_weights(
const mli_tensor* weights, const mli_tensor* bias,
const int weight_out_ch_dimension, int* slice_channels);
} // namespace micro
} // namespace ops
} // namespace tflite
#endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUF_MGR_H_
This diff is collapsed.
/* Copyright 2019 The TensorFlow Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#ifndef TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_
#define TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_
#include <cstdint>

#include "mli_api.h"  // NOLINT
#include "tensorflow/lite/c/common.h"
namespace tflite {
namespace ops {
namespace micro {
void init_arc_scratch_buffers(void);
void* get_arc_scratch_buffer(
int size); // Function to assign fast memory from one of 3 scratch buffers.
void get_arc_scratch_buffer_max_size(int* size);
void get_arc_scratch_buffer_two_max_sizes(int* size1, int* size2);
// Returns true when 'p' points into the ARC data closely-coupled memory
// (DCCM) region; returns false when the core has no DCCM.
static inline bool inside_arc_dccm(void* p) {
#if core_config_dccm_present
  // Convert through uintptr_t so the pointer-to-integer conversion cannot
  // truncate the address (a plain 'unsigned' is narrower than a pointer on
  // LP64 hosts, e.g. host-side simulation builds).
  return ((uintptr_t)p >= core_config_dccm_base) &&
         ((uintptr_t)p < core_config_dccm_base + core_config_dccm_size);
#else
  return false;
#endif
}
// Returns true when 'p' points into the ARC XCCM (X closely-coupled memory)
// region; returns false when the core has no XY memory.
static inline bool inside_arc_xccm(void* p) {
#if core_config_xy
  // uintptr_t avoids truncating the address on targets where 'unsigned' is
  // narrower than a pointer (e.g. LP64 host/simulation builds).
  return ((uintptr_t)p >= core_config_xy_x_base) &&
         ((uintptr_t)p < core_config_xy_x_base + core_config_xy_size);
#else
  return false;
#endif
}
// Returns true when 'p' points into the ARC YCCM (Y closely-coupled memory)
// region; returns false when the core has no XY memory.
static inline bool inside_arc_yccm(void* p) {
#if core_config_xy
  // uintptr_t avoids truncating the address on targets where 'unsigned' is
  // narrower than a pointer (e.g. LP64 host/simulation builds).
  return ((uintptr_t)p >= core_config_xy_y_base) &&
         ((uintptr_t)p < core_config_xy_y_base + core_config_xy_size);
#else
  return false;
#endif
}
// Returns true when 'p' falls inside any of the ARC closely-coupled memory
// regions (DCCM, XCCM or YCCM).
static inline bool inside_arc_ccm(void* p) {
  if (inside_arc_dccm(p)) {
    return true;
  }
  if (inside_arc_xccm(p)) {
    return true;
  }
  return inside_arc_yccm(p);
}
} // namespace micro
} // namespace ops
} // namespace tflite
#endif // TENSORFLOW_LITE_MICRO_ARC_SCRATCH_BUFFERS_H_
......@@ -409,8 +409,9 @@ TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannel) {
TF_LITE_MICRO_TEST(Kernel1x1QuantizedPerChannelRelu6) {
// conv params:
// padding, stride_<width,height>, dilation_<width, height>, activation
TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1, kTfLiteActRelu6};
// padding, stride_<width,height>, activation, dilation_<width, height>
TfLiteConvParams conv_params = {kTfLitePaddingValid, 1, 1,
kTfLiteActRelu6, 1, 1};
const int kInputShape[] = {4, 1, 2, 2, 4}; // [len,N,H,W,C]
const int kInputElements =
kInputShape[1] * kInputShape[2] * kInputShape[3] * kInputShape[4];
......
......@@ -496,7 +496,7 @@ TF_LITE_MICRO_TEST(SimpleAveragePoolTestInt8PaddingSameStride1ActNone) {
F2QS(8.5, output_min, output_max), F2QS(7., output_min, output_max)},
{4, 1, 2, 4, 1}, // Output shape
output_min, output_max, // output quantization range
kTfLitePaddingValid, kTfLiteActNone, output_data);
kTfLitePaddingSame, kTfLiteActNone, output_data);
}
TF_LITE_MICRO_TEST(SimpleMaxPoolTestFloat) {
......
......@@ -90,7 +90,7 @@ patch_cifar10_dataset() {
}
build_embarc_mli() {
gmake -j 4 -C ${1}/lib/make TCF_FILE=${2}
make -j 4 -C ${1}/lib/make TCF_FILE=${2}
}
# Main function handling the download, verify, extract, and patch process.
......@@ -173,7 +173,12 @@ download_and_extract() {
elif [[ ${action} == "patch_cifar10_dataset" ]]; then
patch_cifar10_dataset ${dir}
elif [[ ${action} == "build_embarc_mli" ]]; then
build_embarc_mli ${dir} ${action_param1}
if [[ "${action_param1}" == *.tcf ]]; then
cp ${action_param1} ${dir}/hw/arc.tcf
build_embarc_mli ${dir} ../../hw/arc.tcf
else
build_embarc_mli ${dir} ${action_param1}
fi
elif [[ ${action} ]]; then
echo "Unknown action '${action}'"
exit 1
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.