Commit 2c812d38 authored by zar3bski's avatar zar3bski 💬

GA and GS can be ingested independently based on available ENV VAR

parent ad0fdd32
Pipeline #91499408 passed with stage
in 4 minutes and 10 seconds
......@@ -9,9 +9,11 @@
Current API connections:
* Google analytics V4 (you'll need to generate an **json api key file** [documentation](https://developers.google.com/analytics/devguides/reporting/core/v4/authorization))
* Google analytics V4
* Google Search Console V3
For Google Analytics V4 and Google Search Console V3, you'll need to generate a **service account key file** ([documentation](https://developers.google.com/analytics/devguides/reporting/core/v4/authorization)) and grant it access rights to those applications
## Usage
The docker image available at [registry.gitlab.com/data-major/online_visibility_collector](registry.gitlab.com/data-major/online_visibility_collector) requires the following environment variables
......@@ -26,7 +28,9 @@ The docker image available at [registry.gitlab.com/data-major/online_visibility_
| `GA_VIEW_ID` % | The ID of the **Google Analytics** view you want to ingest the data from | 608124608 |
| `GS_URL` % | The url of a domain you own on **Google Search Console** | https://some.url |
Furthermore, you need to mount google **json api key file** on `/ga_key_file.json`
`GA_VIEW_ID` and `GS_URL` are optional, depending on whether you want to ingest data from these services.
However, for both services, you'll need to mount your google **json api key file** on `/g_service_account.json`
### Deployment with docker-compose
......@@ -56,7 +60,7 @@ services:
container_name: collector
image: registry.gitlab.com/data-major/online_visibility_collector
volumes:
- ./path/to/the/google_analytics_key.json:/ga_key_file.json
- ./path/to/the/google_analytics_key.json:/g_service_account.json
networks:
- monitoring
ports:
......@@ -66,10 +70,11 @@ services:
environment:
- DB_HOST=db
- DB_PORT=8086
- GA_VIEW_ID=608124608
- INFLUXDB_DB=some_name
- INFLUXDB_USER=some_user
- INFLUXDB_USER_PASSWORD=db_user_password
- GA_VIEW_ID=608124608
- GS_URL=https://some.verified.domain
networks:
monitoring:
......
......@@ -3,20 +3,23 @@ from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
from abc import abstractmethod
from influxdb import InfluxDBClient
from utils import ConfigFactory
from utils import ConfigFactory, log
from datetime import date, timedelta, datetime
import oauth2client
class Batch(object):
'''structure data obtained through API. Make sure of its integrity with config file'''
def __init__(self, **kwargs):
    '''Structure data obtained through an API call; validate it against the config.

    Keyword args:
        date:     the extraction date for this batch (passed by GoogleExtractor.extract)
        config:   ConfigFactory instance describing expected metrics/dimensions
        raw_data: dict payload as returned by the Google API (defaults to {})
        origin:   source identifier, e.g. "google_analytics" (defaults to "unknown")
    '''
    # Diff residue removed: `self.date` was assigned twice (old hard-coded
    # yesterday vs. new kwargs-driven value); the kwargs value is kept.
    self.date = kwargs.get('date')
    self.config = kwargs.get('config')
    self.raw_data = kwargs.get('raw_data', {})
    self.origin = kwargs.get('origin', "unknown")
    try:
        self.dimensions_names = self._get_dimensions_names()
        self.metrics_names = self._get_metrics_names()
        self.rows = self._get_rows()
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt are no
        # longer swallowed. A malformed payload is logged as best-effort, not fatal;
        # note the instance is left without dimensions/metrics/rows in that case.
        logging.warning("Wrong data format from {}".format(self.origin))
def _get_metrics_names(self):
if self.origin == "google_analytics":
......@@ -49,22 +52,25 @@ class GoogleExtractor(object):
self.end_date = (date.today()-timedelta(days=1))
self.start_date = (date.today()-timedelta(days=2))
@log
def _get_google_search_daily(self):
    """Fetch one day of Google Search Console (webmasters v3) analytics.

    The query window is shifted back one extra day relative to
    start_date/end_date — Search Console data for the most recent day is
    not yet consolidated (this is the behavior introduced by this commit).
    """
    analytics = build('webmasters', 'v3', credentials=self.credentials)
    # Diff residue removed: the request dict carried duplicate
    # 'startDate'/'endDate' keys (old + new revision); in Python the later
    # key silently wins, so only the shifted dates are kept.
    request_body = {
        'startDate': (self.start_date - timedelta(days=1)).strftime("%Y-%m-%d"),
        'endDate': (self.end_date - timedelta(days=1)).strftime("%Y-%m-%d"),
        'dimensions': self.config.gsc_dimensions,
        'searchType': 'web',
        'rowLimit': 10,
    }
    return analytics.searchanalytics().query(siteUrl=self.config.gs_url, body=request_body).execute()
@log
def _get_google_analytics_daily(self):
"""fetch data from google analytics API v4 and EXTRACT relevant data ATTENTION: only returns one report """
analytics = build('analyticsreporting', 'v4', credentials=self.credentials)
return analytics.reports().batchGet(
body={
'reportRequests': [
......@@ -79,7 +85,7 @@ class GoogleExtractor(object):
@abstractmethod
def extract(self, origin):
    """Fetch raw data from *origin* ("google_analytics" or "google_search")
    via the matching `_get_<origin>_daily` method and wrap it in a Batch
    tagged with the extraction date.
    """
    # Diff residue removed: an old duplicate `return` preceded this one,
    # making the date-carrying call unreachable. Only the new form is kept.
    return Batch(raw_data=getattr(self, '_get_{}_daily'.format(origin))(),
                 origin=origin, config=self.config, date=self.end_date)
class DataFormater(object):
def __init__(self, batch: Batch):
......@@ -97,33 +103,29 @@ class DataFormater(object):
"fields": dict(zip(self.batch.metrics_names, row["values"]))} for row in self.batch.rows ))
def main():
    """Ingest Google Analytics and/or Google Search Console data into InfluxDB.

    Each source is optional: it is ingested only when its controlling
    environment variable is set (GA_VIEW_ID for Analytics, GS_URL for
    Search Console); otherwise the skip is logged at INFO level.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s;%(levelname)s;%(message)s")
    logging.getLogger('googleapiclient.discovery_cache').setLevel(logging.ERROR)
    config = ConfigFactory("config.yml")
    # NOTE(review): InfluxDBClient's positional signature is
    # (host, port, username, password, database) but db_password is passed
    # before db_user here — confirm the config attribute names aren't swapped.
    db_client = InfluxDBClient(config.db_host, config.db_port, config.db_password, config.db_user, config.db_name)
    # Diff residue resolved: the old try/os.environ[...]/except KeyError probe
    # and the new membership test were interleaved; the membership test is kept.
    if 'GA_VIEW_ID' in os.environ:
        logging.info("Starting Google Analytics ingestion")
        batch = GoogleExtractor(config).extract("google_analytics")
        DataFormater(batch)>>db_client.write_points
    else:
        logging.info("GA_VIEW_ID not found in ENV var => google_analytics data won't be fetched")
    if 'GS_URL' in os.environ:
        logging.info("Starting Google Search ingestion")
        batch = GoogleExtractor(config).extract("google_search")
        DataFormater(batch)>>db_client.write_points
    else:
        logging.info("GS_URL not found in ENV var => google_search data won't be fetched")
if __name__ == '__main__':
logging.basicConfig(level=logging.INFO, format="%(asctime)s;%(levelname)s;%(message)s")
parser = argparse.ArgumentParser()
parser.add_argument("mode")
args = parser.parse_args()
if args.mode == "deamon":
logging.info('Starting online_visibility_collector in deamon mode')
schedule.every().day.at("06:30").do(main)
while 1:
schedule.run_pending()
......
import yaml, json, os
import yaml, json, os, logging
def log(func):
    """Decorator that logs each completed call to *func* at INFO level.

    The call is logged *after* the wrapped function returns; exceptions
    propagate unlogged.
    """
    def wrapper(*args, **kwargs):
        res = func(*args, **kwargs)
        # Bug fix: the original format string had two placeholders for three
        # arguments, so kwargs were silently dropped from the log line.
        logging.info('Running {} - args={}, kwargs={}'.format(func.__name__, args, kwargs))
        return res
    return wrapper
class ConfigFactory(object):
def __init__(self, file):
......
import sys
sys.path.insert(1, '/usr/src/collector')
import pytest, unittest, os
import pytest, unittest, os, logging
from apiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
from unittest.mock import patch, Mock, MagicMock
from collector import *
from utils import *
# Canned API responses used to patch GoogleExtractor's private fetchers.
# Diff residue removed: ga_api_mock and gsc_api_mock were each defined twice.
ga_api_mock = MagicMock(return_value=json.loads(open('../test/api_responses/ga_normal_answer.json').read()))
gsc_api_mock = MagicMock(return_value=json.loads(open('../test/api_responses/gsc_normal_answer.json').read()))
# Deliberately malformed payload (a JSON *string*, not a dict) used to
# exercise Batch's "Wrong data format" warning path.
shitty_api_mock = MagicMock(return_value=json.dumps({"some":{"random":"bullshit","time":2}}))
@pytest.fixture(scope="module")
def config():
......@@ -47,15 +48,17 @@ class TestConfigFactory(unittest.TestCase):
class Test_GoogleExtractor:
    """test EXTRACT: date window and constructor attributes."""

    def test_data_interval_interval_is_one_day(self, config):
        extractor = GoogleExtractor(config)
        assert hasattr(extractor, "end_date")
        assert hasattr(extractor, "start_date")
        assert (extractor.end_date - extractor.start_date) == timedelta(days=1)

    def test_extractor_has_config_and_credentials(self, config):
        # Diff residue removed: a leftover old `def` header immediately
        # preceded this one, which is a syntax error in the merged text.
        extractor = GoogleExtractor(config)
        assert type(extractor.config) == ConfigFactory
        assert type(extractor.credentials) == ServiceAccountCredentials
class Test_Batch_Google_Analytics:
......@@ -78,6 +81,11 @@ class Test_Batch_Google_Analytics:
assert hasattr(batch, "rows")
assert next(batch.rows) == {'dimensions': ['desktop', 'France'], 'values': [8, 0]}
@patch('collector.GoogleExtractor._get_google_analytics_daily', shitty_api_mock)
def test_wrong_data_format_is_logged(self, config, caplog):
    """A malformed Analytics payload must be logged, not raise out of Batch."""
    batch = GoogleExtractor(config).extract("google_analytics")
    assert "Wrong data format from google_analytics" in caplog.text
class Test_Batch_Google_Search:
@patch('collector.GoogleExtractor._get_google_search_daily', gsc_api_mock)
......@@ -96,6 +104,11 @@ class Test_Batch_Google_Search:
assert hasattr(batch, "rows")
assert next(batch.rows) == {'dimensions': ['data-major','https://www.data-major.com/'], 'values': [2.0,1.0,2.0,6.0]}
@patch('collector.GoogleExtractor._get_google_search_daily', shitty_api_mock)
def test_wrong_data_format_is_logged(self, config, caplog):
    """A malformed Search Console payload must be logged, not raise out of Batch."""
    batch = GoogleExtractor(config).extract("google_search")
    assert "Wrong data format from google_search" in caplog.text
@patch('collector.GoogleExtractor._get_google_search_daily', MagicMock(return_value=json.loads(open('../test/api_responses/gsc_different_metric_order.json').read())))
def test_different_metric_order_should_not_alter_rows(self, config):
batch = GoogleExtractor(config).extract("google_search")
......@@ -125,9 +138,30 @@ class TestDataFormater:
assert len(list(data.get_points(measurement='google_search'))) == 3
assert sum([x["position"] for x in list(data.get_points(measurement='google_search', tags={'query': 'data-major'}))]) == 6.0
'''INTEGRATION TESTS'''
class TestMain:
    """Integration tests for main(): ingestion is toggled per-service by env vars."""

    @patch('collector.GoogleExtractor._get_google_search_daily', gsc_api_mock)
    @patch('collector.GoogleExtractor._get_google_analytics_daily', ga_api_mock)
    def test_ingestion_of_both(self, config, caplog):
        # Diff residue removed: a leftover old `def` header preceded this one.
        caplog.set_level(logging.INFO)
        main()
        assert "Starting Google Analytics ingestion" in caplog.text
        assert "Starting Google Search ingestion" in caplog.text

    @patch('collector.GoogleExtractor._get_google_analytics_daily', ga_api_mock)
    def test_missing_gs_url_prevents_google_search_ingestion(self, config, caplog):
        # Bug fix: this method and the next shared one name, so pytest only
        # collected the last definition and this case never ran.
        caplog.set_level(logging.INFO)
        del os.environ['GS_URL']
        main()
        assert "Starting Google Analytics ingestion" in caplog.text
        assert "Starting Google Search ingestion" not in caplog.text

    @patch('collector.GoogleExtractor._get_google_search_daily', gsc_api_mock)
    def test_missing_ga_view_id_prevents_google_analytics_ingestion(self, config, caplog):
        caplog.set_level(logging.INFO)
        del os.environ['GA_VIEW_ID']
        main()
        assert "Starting Google Analytics ingestion" not in caplog.text
        assert "Starting Google Search ingestion" in caplog.text
\ No newline at end of file
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment