...
 
Commits (2)
......@@ -15,7 +15,11 @@ test:
script:
- docker build -t col_img ./src
- docker run -d -it -p 1312:1312 -e "DB_HOST=db" -e "DB_PORT=8086" -e "GA_VIEW_ID=983581608" -e "INFLUXDB_DB=some_name" -e "INFLUXDB_USER=some_user" -e "INFLUXDB_USER_PASSWORD=db_user_password" -v `pwd`/test:/usr/src/test -v `pwd`/test/ga_fake_key.json:/ga_key_file.json --name=collector col_img
- docker exec collector pytest ../test/unit_tests.py
- docker exec collector pip install pytest-cov
- docker exec collector pytest --cov-report term --cov=./ ../test/unit_tests.py
- docker exec collector rm -r __pycache__
- docker exec collector rm .coverage
- docker exec collector rm -r ../test/__pycache__
build:
only:
......
# Ingest
ETL ingestion jobs for Google Analytics data
# Online_visibility_collector
![](https://gitlab.com/data-major/online_visibility_collector/badges/master/pipeline.svg)
![](https://gitlab.com/data-major/online_visibility_collector/badges/master/coverage.svg)
## Description
*Online visibility collector* (**OVC**) is a Python-based, standalone, scheduled ETL designed to aggregate data from various monitoring web services into a single **Time Series Database**. Currently, **OVC** only supports [InfluxDB](https://www.influxdata.com/).
Current API connections:
* Google Analytics V4 (you'll need to generate a **JSON API key file**; see the [documentation](https://developers.google.com/analytics/devguides/reporting/core/v4/authorization))
## Usage
### Testing
The docker image available at [registry.gitlab.com/data-major/online_visibility_collector](registry.gitlab.com/data-major/online_visibility_collector) requires the following environment variables
|variable name | description | example |
|--------------------------:|:-------------------------------------------------------------------------|:----------------:|
| `DB_HOST` | Hostname of the Influx database | somewhere.com |
| `DB_PORT` | Port of the Influx database | 8086 |
| `GA_VIEW_ID` | The ID of the Google Analytics view you want to ingest the data from | 608124608 |
| `INFLUXDB_DB` | Name of the database | some_name |
| `INFLUXDB_USER` | Username on InfluxDB | some_user |
| `INFLUXDB_USER_PASSWORD` | Password of the InfluxDB user | some_password |
Furthermore, you need to mount the Google **JSON API key file** at `/ga_key_file.json`.
### Deployment with docker-compose
Here is an example of a docker-compose based environment
```yml
version: '3.7'
services:
db:
image: influxdb:1.7-alpine
restart: always
container_name: db
networks:
- monitoring
volumes:
- influxdb-volume:/var/lib/influxdb
expose:
- 8086
environment:
- INFLUXDB_DB=some_name
- INFLUXDB_USER=some_user
- INFLUXDB_ADMIN_USER=some_other_user
- INFLUXDB_ADMIN_PASSWORD=some_password
- INFLUXDB_USER_PASSWORD=db_user_password
collector:
container_name: collector
image: registry.gitlab.com/data-major/online_visibility_collector
volumes:
- ./path/to/the/google_analytics_key.json:/ga_key_file.json
networks:
- monitoring
ports:
- 1312:1312
depends_on:
- db
environment:
- DB_HOST=db
- DB_PORT=8086
- GA_VIEW_ID=608124608
- INFLUXDB_DB=some_name
- INFLUXDB_USER=some_user
- INFLUXDB_USER_PASSWORD=db_user_password
networks:
monitoring:
volumes:
influxdb-volume:
```
pytest ../test/test.py
```
\ No newline at end of file
import yaml, requests, os, json, ast, time, argparse
import requests, ast, time, argparse
import schedule
from collections import defaultdict
from apiclient.discovery import build
......@@ -6,30 +6,7 @@ from datetime import datetime, timedelta
from oauth2client.service_account import ServiceAccountCredentials
from abc import ABC, abstractmethod
from influxdb import InfluxDBClient
class ConfigFactory(object):
    """Gather the collector's runtime configuration into one object.

    The configuration is read from three sources, in this order:

    1. the YAML file given as ``file`` (Google Analytics metrics and
       dimensions),
    2. the JSON key file located at ``KEY_FILE_LOCATION``,
    3. environment variables (Google Analytics view id and InfluxDB
       connection settings).

    Attributes:
        ga_metrics: value of the ``google_analytics_metrics`` YAML key.
        ga_dimensions: value of the ``google_analytics_dimensions`` YAML key.
        ga_key_conf: parsed content of the JSON key file.
        ga_view_id: value of the ``GA_VIEW_ID`` environment variable.
        db_host: value of the ``DB_HOST`` environment variable.
        db_port: value of the ``DB_PORT`` environment variable (a string,
            it is not converted).
        db_user: value of the ``INFLUXDB_USER`` environment variable.
        db_password: value of the ``INFLUXDB_USER_PASSWORD`` environment
            variable.
        db_name: value of the ``INFLUXDB_DB`` environment variable.
    """

    # TODO: find a way to do this without volumes (env variables would be great)
    KEY_FILE_LOCATION = '/ga_key_file.json'

    def __init__(self, file):
        """Load the whole configuration.

        Args:
            file: path of the YAML file holding the
                ``google_analytics_metrics`` and
                ``google_analytics_dimensions`` entries.

        Raises:
            OSError: if ``file`` or the key file cannot be opened.
            yaml.YAMLError: if ``file`` is not valid YAML.
            KeyError: if a required YAML entry or environment variable is
                missing (the name of a missing environment variable is
                printed before the error propagates).
            ValueError: if the key file does not contain valid JSON.
        """
        # YAML errors propagate unchanged; no handler is needed for that.
        with open(file, 'r') as stream:
            d = yaml.safe_load(stream)
        self.ga_metrics = d["google_analytics_metrics"]
        self.ga_dimensions = d["google_analytics_dimensions"]

        # Use a context manager so the key file handle is always closed.
        with open(self.KEY_FILE_LOCATION) as key_file:
            self.ga_key_conf = json.load(key_file)

        try:
            self.ga_view_id = os.environ['GA_VIEW_ID']
            self.db_host = os.environ['DB_HOST']
            self.db_port = os.environ['DB_PORT']
            self.db_user = os.environ['INFLUXDB_USER']
            self.db_password = os.environ['INFLUXDB_USER_PASSWORD']
            self.db_name = os.environ['INFLUXDB_DB']
        except KeyError as e:
            print("Missing ENV VARIABLE: {}".format(e))
            # Bare raise keeps the original traceback intact.
            raise
from utils import ConfigFactory
class GA_Batch(object):
"""Google analytics data batch"""
......
import yaml, json, os
class ConfigFactory(object):
    """Gather the collector's runtime configuration into one object.

    The configuration is read from three sources, in this order:

    1. the YAML file given as ``file`` (Google Analytics metrics and
       dimensions),
    2. the JSON key file located at ``KEY_FILE_LOCATION``,
    3. environment variables (Google Analytics view id and InfluxDB
       connection settings).

    Attributes:
        ga_metrics: value of the ``google_analytics_metrics`` YAML key.
        ga_dimensions: value of the ``google_analytics_dimensions`` YAML key.
        ga_key_conf: parsed content of the JSON key file.
        ga_view_id: value of the ``GA_VIEW_ID`` environment variable.
        db_host: value of the ``DB_HOST`` environment variable.
        db_port: value of the ``DB_PORT`` environment variable (a string,
            it is not converted).
        db_user: value of the ``INFLUXDB_USER`` environment variable.
        db_password: value of the ``INFLUXDB_USER_PASSWORD`` environment
            variable.
        db_name: value of the ``INFLUXDB_DB`` environment variable.
    """

    # TODO: find a way to do this without volumes (env variables would be great)
    KEY_FILE_LOCATION = '/ga_key_file.json'

    def __init__(self, file):
        """Load the whole configuration.

        Args:
            file: path of the YAML file holding the
                ``google_analytics_metrics`` and
                ``google_analytics_dimensions`` entries.

        Raises:
            OSError: if ``file`` or the key file cannot be opened.
            yaml.YAMLError: if ``file`` is not valid YAML.
            KeyError: if a required YAML entry or environment variable is
                missing (the name of a missing environment variable is
                printed before the error propagates).
            ValueError: if the key file does not contain valid JSON.
        """
        # YAML errors propagate unchanged; no handler is needed for that.
        with open(file, 'r') as stream:
            d = yaml.safe_load(stream)
        self.ga_metrics = d["google_analytics_metrics"]
        self.ga_dimensions = d["google_analytics_dimensions"]

        # Use a context manager so the key file handle is always closed.
        with open(self.KEY_FILE_LOCATION) as key_file:
            self.ga_key_conf = json.load(key_file)

        try:
            self.ga_view_id = os.environ['GA_VIEW_ID']
            self.db_host = os.environ['DB_HOST']
            self.db_port = os.environ['DB_PORT']
            self.db_user = os.environ['INFLUXDB_USER']
            self.db_password = os.environ['INFLUXDB_USER_PASSWORD']
            self.db_name = os.environ['INFLUXDB_DB']
        except KeyError as e:
            print("Missing ENV VARIABLE: {}".format(e))
            # Bare raise keeps the original traceback intact.
            raise
\ No newline at end of file
......@@ -5,6 +5,7 @@ from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
from unittest.mock import patch, Mock, MagicMock
from collector import *
from utils import *
'''UNIT TESTING'''
class TestConfigFactory:
......