Commit 45649a54 authored by pawan-nandakishore's avatar pawan-nandakishore

Initial commit

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
# C extensions
*.so
# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
*.egg-info/
.installed.cfg
*.egg
# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec
# Installer logs
pip-log.txt
pip-delete-this-directory.txt
# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
# Translations
*.mo
*.pot
# Django stuff:
*.log
# Sphinx documentation
docs/_build/
# PyBuilder
target/
# DotEnv configuration
.env
# Database
*.db
*.rdb
# Pycharm
.idea
# VS Code
.vscode/
# Spyder
.spyproject/
# Jupyter NB Checkpoints
.ipynb_checkpoints/
# exclude data from source control by default
/data/
# Mac OS-specific storage files
.DS_Store
# vim
*.swp
*.swo
# Mypy cache
.mypy_cache/
data-science-workflow
==============================
A data science workflow using Docker.
Project Organization
------------
├── LICENSE
├── Makefile           <- Makefile with commands like `make data` or `make train`
├── README.md          <- The top-level README for developers using this project.
├── data
│   ├── external       <- Data from third party sources.
│   ├── interim        <- Intermediate data that has been transformed.
│   ├── processed      <- The final, canonical data sets for modeling.
│   └── raw            <- The original, immutable data dump.
├── docs               <- A default Sphinx project; see sphinx-doc.org for details
├── models             <- Trained and serialized models, model predictions, or model summaries
├── notebooks          <- Jupyter notebooks. Naming convention is a number (for ordering),
│                         the creator's initials, and a short `-` delimited description, e.g.
│                         `1.0-jqp-initial-data-exploration`.
├── references         <- Data dictionaries, manuals, and all other explanatory materials.
├── reports            <- Generated analysis as HTML, PDF, LaTeX, etc.
│   └── figures        <- Generated graphics and figures to be used in reporting
├── requirements.txt   <- The requirements file for reproducing the analysis environment, e.g.
│                         generated with `pip freeze > requirements.txt`
├── setup.py           <- Makes the project pip installable (`pip install -e .`) so src can be
│                         imported; a minimal sketch appears below the tree.
├── src                <- Source code for use in this project.
│   ├── __init__.py    <- Makes src a Python module
│   │
│   ├── data           <- Scripts to download or generate data
│   │   └── make_dataset.py
│   │
│   ├── features       <- Scripts to turn raw data into features for modeling
│   │   └── build_features.py
│   │
│   ├── models         <- Scripts to train models and then use trained models to make
│   │   │                 predictions
│   │   ├── predict_model.py
│   │   └── train_model.py
│   │
│   └── visualization  <- Scripts to create exploratory and results oriented visualizations
│       └── visualize.py
└── tox.ini            <- tox file with settings for running tox; see tox.readthedocs.io
--------
<p><small>Project based on the <a target="_blank" href="https://drivendata.github.io/cookiecutter-data-science/">cookiecutter data science project template</a>. #cookiecutterdatascience</small></p>
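The `setup.py` noted in the tree is what makes `pip install -e .` work so that `src` can be imported from notebooks and scripts. The file itself is not shown in this commit; the following is only a minimal sketch of what such a `setup.py` typically contains (the package name and metadata below are placeholders, not this project's actual values):

    from setuptools import find_packages, setup

    setup(
        name='src',                    # placeholder name for the local package
        packages=find_packages(),      # picks up src/ via its __init__.py
        version='0.1.0',
        description='A short description of the project.',
        author='Your name (or your organization/company/team)',
        license='MIT',
    )

After an editable install, `from src.data import make_dataset` works from anywhere in the environment.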
Default localhost addresses:
Jupyter: always comes up on http://127.0.0.1:8888/?token=<token printed in the container output>
Streamlit: http://172.24.0.3:8501/
Tensorboard: http://0.0.0.0:6006/
If you start an additional container for any of these apps, it will be served on a different port. You can also start the individual containers on their own.
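To check that the three services are actually reachable from the host, a small smoke test can help. This is only a sketch using the Python standard library; the URLs below are the defaults listed above (the compose file also maps Streamlit's port 8501 to the host, so 127.0.0.1:8501 should respond as well), and they may differ on your machine if you run extra containers:

    import urllib.request

    # Default addresses from above; adjust if you start extra containers on other ports.
    services = {
        "jupyter": "http://127.0.0.1:8888",
        "streamlit": "http://127.0.0.1:8501",
        "tensorboard": "http://127.0.0.1:6006",
    }

    for name, url in services.items():
        try:
            with urllib.request.urlopen(url, timeout=3) as resp:
                print(f"{name}: HTTP {resp.status}")
        except Exception as exc:  # connection refused, timeout, HTTP error, ...
            print(f"{name}: not reachable ({exc})")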
{
    "project_name": "project_name",
    "repo_name": "{{ cookiecutter.project_name.lower().replace(' ', '_') }}",
    "author_name": "Your name (or your organization/company/team)",
    "description": "A short description of the project.",
    "open_source_license": ["MIT", "BSD-3-Clause", "No license file"],
    "s3_bucket": "[OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')",
    "aws_profile": "default",
    "python_interpreter": ["python3", "python"]
}
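The `repo_name` default above is rendered from `project_name` by the Jinja expression; the same transformation in plain Python (using a made-up example value) is:

    project_name = "Data Science Workflow"               # example value only
    repo_name = project_name.lower().replace(' ', '_')   # what the Jinja default computes
    print(repo_name)                                      # -> data_science_workflow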
.PHONY: clean data lint requirements sync_data_to_s3 sync_data_from_s3
#################################################################################
# GLOBALS #
#################################################################################
PROJECT_DIR := $(shell dirname $(realpath $(lastword $(MAKEFILE_LIST))))
BUCKET = [OPTIONAL] your-bucket-for-syncing-data (do not include 's3://')
PROFILE = default
PROJECT_NAME = data-science-workflow
PYTHON_INTERPRETER = python3
ifeq (,$(shell which conda))
HAS_CONDA=False
else
HAS_CONDA=True
endif
#################################################################################
# COMMANDS #
#################################################################################
## Install Python Dependencies
requirements: test_environment
	$(PYTHON_INTERPRETER) -m pip install -U pip setuptools wheel
	$(PYTHON_INTERPRETER) -m pip install -r requirements.txt
## Make Dataset
data: requirements
	$(PYTHON_INTERPRETER) src/data/make_dataset.py data/raw data/processed
## Delete all compiled Python files
clean:
	find . -type f -name "*.py[co]" -delete
	find . -type d -name "__pycache__" -delete
## Lint using flake8
lint:
	flake8 src
## Upload Data to S3
sync_data_to_s3:
ifeq (default,$(PROFILE))
	aws s3 sync data/ s3://$(BUCKET)/data/
else
	aws s3 sync data/ s3://$(BUCKET)/data/ --profile $(PROFILE)
endif
## Download Data from S3
sync_data_from_s3:
ifeq (default,$(PROFILE))
	aws s3 sync s3://$(BUCKET)/data/ data/
else
	aws s3 sync s3://$(BUCKET)/data/ data/ --profile $(PROFILE)
endif
## Set up python interpreter environment
create_environment:
ifeq (True,$(HAS_CONDA))
	@echo ">>> Detected conda, creating conda environment."
ifeq (3,$(findstring 3,$(PYTHON_INTERPRETER)))
	conda create --name $(PROJECT_NAME) python=3
else
	conda create --name $(PROJECT_NAME) python=2.7
endif
	@echo ">>> New conda env created. Activate with:\nsource activate $(PROJECT_NAME)"
else
	$(PYTHON_INTERPRETER) -m pip install -q virtualenv virtualenvwrapper
	@echo ">>> Installing virtualenvwrapper if not already installed.\nMake sure the following lines are in shell startup file\n\
	export WORKON_HOME=$$HOME/.virtualenvs\nexport PROJECT_HOME=$$HOME/Devel\nsource /usr/local/bin/virtualenvwrapper.sh\n"
	@bash -c "source `which virtualenvwrapper.sh`;mkvirtualenv $(PROJECT_NAME) --python=$(PYTHON_INTERPRETER)"
	@echo ">>> New virtualenv created. Activate with:\nworkon $(PROJECT_NAME)"
endif
## Test python environment is setup correctly
test_environment:
	$(PYTHON_INTERPRETER) test_environment.py
#################################################################################
# PROJECT RULES #
#################################################################################
#################################################################################
# Self Documenting Commands #
#################################################################################
.DEFAULT_GOAL := help
# Inspired by <http://marmelab.com/blog/2016/02/29/auto-documented-makefile.html>
# sed script explained:
# /^##/:
# * save line in hold space
# * purge line
# * Loop:
# * append newline + line to hold space
# * go to next line
# * if line starts with doc comment, strip comment character off and loop
# * remove target prerequisites
# * append hold space (+ newline) to line
# * replace newline plus comments by `---`
# * print line
# Separate expressions are necessary because labels cannot be delimited by
# semicolon; see <http://stackoverflow.com/a/11799865/1968>
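# A rough Python equivalent of the extraction step (a sketch only, assuming the
# same `## ` doc-comment convention used in this Makefile):
#
#   import re, sys
#   entries, doc = [], []
#   for line in open(sys.argv[1]).read().splitlines():
#       if line.startswith("## "):
#           doc.append(line[3:])                   # collect doc-comment lines
#       elif doc:
#           target = re.sub(r":.*", "", line)      # strip ':' and prerequisites
#           entries.append((target, " ".join(doc)))
#           doc = []
#   for target, text in sorted(entries, key=lambda e: e[0].lower()):
#       print(f"{target:<19} {text}")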
.PHONY: help
help:
@echo "$$(tput bold)Available rules:$$(tput sgr0)"
@echo
@sed -n -e "/^## / { \
h; \
s/.*//; \
:doc" \
-e "H; \
n; \
s/^## //; \
t doc" \
-e "s/:.*//; \
G; \
s/\\n## /---/; \
s/\\n/ /g; \
p; \
}" ${MAKEFILE_LIST} \
| LC_ALL='C' sort --ignore-case \
| awk -F '---' \
-v ncol=$$(tput cols) \
-v indent=19 \
-v col_on="$$(tput setaf 6)" \
-v col_off="$$(tput sgr0)" \
'{ \
printf "%s%*s%s ", col_on, -indent, $$1, col_off; \
n = split($$2, words, " "); \
line_length = ncol - indent; \
for (i = 1; i <= n; i++) { \
line_length -= length(words[i]) + 1; \
if (line_length <= 0) { \
line_length = ncol - indent - length(words[i]) - 1; \
printf "\n%*s ", -indent, " "; \
} \
printf "%s ", words[i]; \
} \
printf "\n"; \
}' \
| more $(shell test $(shell uname) = Darwin && echo '--no-init --raw-control-chars')
###########################################################################################################
## GENERAL COMMANDS
###########################################################################################################
DEFAULT_STREAMLIT_FILE = dashboards/mnist_explorer.py
NOTEBOOK_CONTAINER_NAME = jupyter-notebook
STREAMLIT_CONTAINER_NAME = streamlit_app
TENSORBOARD_CONTAINER_NAME = tensorboard
build: # build the docker images
	docker-compose build
down: # bring docker containers down
	docker-compose down
stop: # stop and exit docker containers
	docker-compose stop
# Start the docker containers. The streamlit container runs the file passed
# via FILE=<path>; if no file is given, the default streamlit file is used.
up:
ifdef FILE
	@echo using file-name ${FILE}
	FILE=${FILE} docker-compose up
else
	FILE=${DEFAULT_STREAMLIT_FILE} docker-compose up
endif
build-run: # combine the build and up steps
	docker-compose build
ifdef FILE
	@echo using file-name ${FILE}
	FILE=${FILE} docker-compose up
else
	FILE=${DEFAULT_STREAMLIT_FILE} docker-compose up
endif
# remove logs from data folder
clean-logs:
	sudo rm -fr data/logs/*
python: # run a python script in a new streamlit container
	docker-compose run --rm explorer python3 local_drive/dashboards/${FILE}
streamlit: # start streamlit app in a new streamlit container
	docker-compose run --rm explorer streamlit run local_drive/dashboards/${FILE}
notebook-container: # step into the jupyter notebook container
	@echo ${NOTEBOOK_CONTAINER_NAME}
	docker exec -it ${NOTEBOOK_CONTAINER_NAME} bash
streamlit-container: # step into the streamlit app container
	docker exec -it ${STREAMLIT_CONTAINER_NAME} bash
tensorboard-container: # step into the tensorboard container
	docker exec -it ${TENSORBOARD_CONTAINER_NAME} bash
delete-containers: # remove the named containers
	docker rm ${NOTEBOOK_CONTAINER_NAME} ${STREAMLIT_CONTAINER_NAME} ${TENSORBOARD_CONTAINER_NAME}
import streamlit as st
import pandas as pd
import numpy as np
st.title('My first app')
st.write("Here's our first attempt at using data to create a table:")
st.write(pd.DataFrame({
    'first column': [1, 2, 3, 4],
    'second column': [10, 20, 30, 40]
}))

if st.checkbox('Show dataframe'):
    chart_data = pd.DataFrame(
        np.random.randn(20, 3),
        columns=['a', 'b', 'c'])
    st.line_chart(chart_data)
import tensorflow as tf
import streamlit as st
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import ImageGrid
from bokeh.io import output_file, show
from bokeh.layouts import gridplot
from bokeh.plotting import figure
mnist = tf.keras.datasets.mnist
(x_train, y_train),(x_test, y_test) = mnist.load_data()
class_value = st.sidebar.selectbox("Class",[0,1,2,3,4,5,6,7,8,9])
random_seed = st.sidebar.number_input("Random seed", value=1)
nums = st.sidebar.number_input("Number of examples", value=20)
@st.cache
def class_examples(class_value, nums=20, fix_seed=1):
    # Fix the random seed (when positive) so the same examples are drawn on every rerun.
    if fix_seed > 0:
        np.random.seed(fix_seed)
    class_indx = np.where(y_train == class_value)
    examples = x_train[class_indx]
    samples = examples[np.random.choice(examples.shape[0], nums, replace=False), :, :]
    return samples

display_samples = class_examples(class_value, nums, fix_seed=random_seed)
# Pressing the button triggers a Streamlit rerun, which redraws the sample grid.
st.sidebar.button("Reload")
num_cols = 4.0
mod_value = display_samples.shape[0] % num_cols
num_rows = (display_samples.shape[0] - mod_value)/num_cols
fig = plt.figure(figsize=(4., 4.))
grid = ImageGrid(fig, 111,  # similar to subplot(111)
                 nrows_ncols=(int(num_rows), int(num_cols)),
                 axes_pad=0.1,  # pad between axes in inch.
                 )
for ax, im in zip(grid, display_samples):
    # Iterating over the grid returns the Axes.
    ax.imshow(im)
st.pyplot()
from plotly.subplots import make_subplots
import plotly.graph_objects as go
fig = make_subplots(rows=1, cols=2)
fig.add_trace(
    go.Scatter(x=[1, 2, 3], y=[4, 5, 6]),
    row=1, col=1
)
fig.add_trace(
    go.Scatter(x=[20, 30, 40], y=[50, 60, 70]),
    row=1, col=2
)
fig.update_layout(height=600, width=800, title_text="Side By Side Subplots")
st.plotly_chart(fig)
version: '2.3'

services:

  notebook:
    container_name: jupyter-notebook
    build:
      context: ./docker/jupyter_docker
    ports:
      - "8888:8888"
    environment:
      - NVIDIA_VISIBLE_DEVICES=all
    runtime: nvidia
    volumes:
      - ./:/local_drive
    command: jupyter notebook --allow-root --notebook-dir=/local_drive --ip=0.0.0.0 --port=8888 --no-browser

  tensorboard:
    build:
      context: ./docker/tensorboard_docker
    container_name: tensorboard
    volumes_from:
      - notebook
    ports:
      - "6006:6006"
    command: tensorboard --port=6006 --logdir=/local_drive/data/logs --host=0.0.0.0
    runtime: nvidia

  explorer:
    container_name: streamlit_app
    build:
      context: ./docker/streamlit_docker
    ports:
      - "8501:8501"
    volumes:
      - ./:/local_drive
    runtime: nvidia
    tty: true
    environment:
      - LC_ALL=C.UTF-8
      - LANG=C.UTF-8
    command: streamlit run local_drive/${FILE}
FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04
RUN apt-get update && apt-get install -y \