Commit a5ec4027 authored by Henrik tom Wörden's avatar Henrik tom Wörden
Browse files

Refactor Crawler

parent 75b703dc
__pycache__
.tox
.coverage
cache.db
*cache.db
*.egg-info
.docker/cert
......@@ -7,10 +7,12 @@ pip install tox --user
tox
# Run Integration Tests Locally
1. Mount `integrationtests/full_test/extroot` to the folder that will be used as
extroot. E.g. `sudo mount -o bind extroot ../../../caosdb-deploy/profiles/empty/paths/extroot`
2. Start an empty CaosDB instance
3. run test.sh
1. Change directory to `integrationtests/full_test/`.
2. Mount `extroot` to the folder that will be used as extroot. E.g. `sudo mount
-o bind extroot ../../../caosdb-deploy/profiles/empty/paths/extroot`.
3. Start an empty CaosDB instance (with the mounted extroot).
4. Run `test.sh`.
# Code Formatting
autopep8 -i -r ./
......@@ -25,14 +25,13 @@
import argparse
import logging
import sys
from argparse import RawTextHelpFormatter
import caosdb as db
from caosadvancedtools.cfood import fileguide
from caosadvancedtools.crawler import FileCrawler
from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard
from caosadvancedtools.utils import set_log_level
from caosadvancedtools.guard import UPDATE
from scifolder import (AnalysisCFood, ExperimentCFood, ProjectCFood,
PublicationCFood, SimulationCFood)
......@@ -54,8 +53,8 @@ if __name__ == "__main__":
logger = logging.getLogger("caosadvancedtools")
conlogger = logging.getLogger("connection")
conlogger.setLevel(level=logging.ERROR)
logger.setLevel(level=logging.WARN)
logger.setLevel(level=logging.DEBUG)
fileguide.access = access
parser = get_parser()
args = parser.parse_args()
......@@ -63,10 +62,10 @@ if __name__ == "__main__":
files = FileCrawler.query_files(args.path)
logger.info("Query done...")
config = db.configuration.get_config()
c = FileCrawler(files=files, use_cache=True, access=access,
c = FileCrawler(files=files, use_cache=True,
interactive=False, hideKnown=True,
food=[ProjectCFood,
ExperimentCFood, AnalysisCFood,
PublicationCFood, SimulationCFood,
])
cfood_types=[ProjectCFood,
ExperimentCFood, AnalysisCFood,
PublicationCFood, SimulationCFood,
])
c.crawl(security_level=UPDATE)
......@@ -4,6 +4,7 @@ echo "Filling the database"
./filldb.sh
echo "Testing the crawler database"
python3 -m pytest test_crawler.py
python3 test_table.py
# TODO the following test deletes lots of the data inserted by the crawler
echo "Testing im and export"
python3 test_im_und_export.py
#!/usr/bin/env python3
import os
import unittest
from tempfile import TemporaryDirectory
import caosdb as db
......
......@@ -18,17 +18,13 @@
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
import argparse
import logging
import sys
from argparse import RawTextHelpFormatter
import caosdb as db
import pandas as pd
from caosadvancedtools.crawler import TableCrawler
from caosadvancedtools.guard import INSERT, RETRIEVE, UPDATE, Guard
from caosadvancedtools.utils import set_log_level
from caosadvancedtools.guard import UPDATE
if __name__ == "__main__":
logger = logging.getLogger("caosadvancedtools")
......
......@@ -25,18 +25,21 @@
# ** end header
""" Defines how something that shall be inserted into CaosDB is treated.
CaosDB can automatically be filled with Records based on some file structure.
The Crawler will iterate over the files and test for each file whether a CFood
exists that matches the file path. If one does, it is instanciated to treat the
match. This occurs in basically three steps:
1. create a list of identifiables, i.e. unique representation of CaosDB Records
(such as an experiment belonging to a project and a date/time)
2. the identifiables are either found in CaosDB or they are created.
3. the identifiables are update based on the date in the file structure
CaosDB can automatically be filled with Records based on some structure, a file
structure, a table or similar.
The Crawler will iterate over the respective items and test for each item
whether a CFood class exists that matches the file path, i.e. whether a CFood
class wants to treat that particular item. If one does, it is instantiated to
treat the match. This occurs in basically three steps:
1. Create a list of identifiables, i.e. unique representation of CaosDB Records
(such as an experiment belonging to a project and a date/time).
2. The identifiables are either found in CaosDB or they are created.
3. The identifiables are updated based on the data in the file structure.
"""
import logging
import re
from abc import ABCMeta, abstractmethod
import caosdb as db
from caosdb.exceptions import EntityDoesNotExistError
......@@ -62,24 +65,37 @@ def get_entity(name):
return ENTITIES[name]
class AbstractCFood(object):
class FileGuide(object):
def access(path):
""" should be replaced by a function that adds
a prefix to paths to allow to access caosdb files locally"""
def __init__(self):
raise NotImplementedError()
fileguide = FileGuide()
class AbstractCFood(object, metaclass=ABCMeta):
def __init__(self, item):
""" Abstract base class for Crawler food (CFood)."""
self.to_be_updated = db.Container()
self.identifiables = db.Container()
self.item = item
self.attached_items = []
@abstractmethod
def create_identifiables(self):
"""
should set the instance variable Container with the identifiables
"""
raise NotImplementedError()
@abstractmethod
def update_identifiables(self):
""" Changes the identifiables as needed and adds changed identifiables
to self.to_be_updated
"""
raise NotImplementedError()
def push_identifiables_to_CaosDB(self):
""" Updates the self.to_be_updated Container, i.e. pushes the changes
......@@ -108,6 +124,56 @@ class AbstractCFood(object):
logger.debug(self.to_be_updated)
guard.safe_update(self.to_be_updated)
@classmethod
def match_item(cls, item):
    """ Matches an item found by the crawler against this class.

    Returns True if the item shall be treated by this class, i.e. if this
    class matches the item. This base implementation accepts every item.

    To be overwritten by subclasses!

    Parameters
    ----------
    item : object
           The item the crawler is currently iterating over.

    Returns
    -------
    bool
        Always True in this base implementation.
    """
    return True
def collect_information(self):
    """ The CFood collects information for further processing.

    Often CFoods need information from files or even from the database in
    order to make processing decisions. It is intended that this function
    is called after match. Thus match can be used without connecting to
    the database.

    This base implementation does nothing.

    To be overwritten by subclasses
    """
    pass
def attach(self, item):
    """ Appends `item` to the list `self.attached_items` of this CFood. """
    self.attached_items.append(item)
# TODO `looking_for` should `attach` the files itself. This would allow
# grouping them right away and make it unnecessary to check matches again
# later.
def looking_for(self, item):
    """
    Returns True if item can be added to this CFood.

    Typically a CFood exists for a file and defines how to deal with the
    file. However, sometimes additional files "belong" to a CFood. E.g. an
    experiment CFood might match against a README file but labnotes.txt
    also shall be treated by the cfood (and not a special cfood created for
    labnotes.txt).
    This function can be used to define what files shall be 'attached'.

    This base implementation never asks for additional items, i.e. it
    always returns False.

    To be overwritten by subclasses
    """
    return False
@staticmethod
# move to api?
def set_parents(entity, names):
......@@ -140,55 +206,6 @@ class AbstractCFood(object):
entity.add_property(prop, value)
class CMeal(object):
    """
    Groups equivalent files so that they can be inserted collectively.

    Sometimes no single file is the natural trigger for creating a Record.
    E.g. if a collection of images shall be referenced from one Record that
    groups them, it is unclear which image should trigger the creation of
    that Record.

    Grouping is based on the named groups of the regular expression that
    produced the matches. If, in the above example, all images reside in one
    folder, every group except the one for the file name should be equal.
    The names of the groups that have to be equal are listed in the
    `matching_groups` class property, which subclasses overwrite.

    The class is meant to be used together with the cook function of a
    cfood: instead of instantiating a CFood directly, cook is used and, if
    the CFood is also a child of CMeal, get_suitable_cfood tells whether the
    file match should be added to an existing CFood or a new one has to be
    created. For this purpose every instance registers itself in the
    `existing_instances` class member.
    """
    existing_instances = []
    matching_groups = []

    def __init__(self, *args, **kwargs):
        # Positional and keyword arguments are accepted but not used here.
        self.crawled_files = []
        self.__class__.existing_instances.append(self)

    def add(self, crawled_file):
        """ Remembers `crawled_file` as belonging to this meal. """
        self.crawled_files.append(crawled_file)

    @classmethod
    def get_suitable_cfood(cls, match):
        """ Returns the first registered instance whose match agrees with
        `match` in all groups listed in `matching_groups`, or None if no
        such instance exists.
        """

        def agrees_with(candidate):
            # A group only counts as equal if it exists in both matches and
            # captured the same value in both of them.
            return all(
                name in match.groupdict()
                and name in candidate.match.groupdict()
                and match.group(name) == candidate.match.group(name)
                for name in cls.matching_groups
            )

        return next(
            (candidate for candidate in cls.existing_instances
             if agrees_with(candidate)),
            None,
        )
def get_entity_for_path(path):
try:
q = "FIND FILE WHICH IS STORED AT '{}'".format(path)
......@@ -210,7 +227,7 @@ class AbstractFileCFood(AbstractCFood):
# function match()
_pattern = None
def __init__(self, crawled_path, access=lambda x: x):
def __init__(self, crawled_path, *args, **kwargs):
""" Abstract base class for file based Crawler food (CFood).
Parameters
......@@ -218,15 +235,11 @@ class AbstractFileCFood(AbstractCFood):
crawled_path : The file that the crawler is currently matching. Its
path should match against the pattern of this class
access : callable, optional
A function that takes a CaosDB path and returns a local path
"""
super().__init__()
self.access = access
super().__init__(*args, item=crawled_path, **kwargs)
self._crawled_file = None
self.crawled_path = crawled_path
self.match = type(self).match_file(crawled_path)
self.attached_ones = []
self.match = re.match(type(self).get_re(), crawled_path)
self.attached_filenames = []
@property
......@@ -236,18 +249,6 @@ class AbstractFileCFood(AbstractCFood):
return self._crawled_file
def collect_information(self):
""" The CFood collects information for further processing.
Often CFoods need information from files or even from the database in
order to make processing decision. It is intended that this function is
called after match. Thus match can be used without connecting to the
database.
To be overwritten by subclasses
"""
pass
@staticmethod
def get_re():
""" Returns the regular expression used to identify files that shall be
......@@ -258,45 +259,16 @@ class AbstractFileCFood(AbstractCFood):
raise NotImplementedError()
@classmethod
def cook(cls, crawled_file, **kwargs):
""" possibly checks for existing CFoods whether the match should be
added or whether a new CFood instance needs to be returned
This function should typically be used to create CFoods in order to
prevent the creation of unnecessary instances.
This standard implementation does not do a check but may be overwritten
by subclasses.
Retruns
-------------
CFood: if a new instance was created
None: otherwise
"""
return cls(crawled_file, **kwargs)
@classmethod
def match_file(cls, string):
def match_item(cls, path):
""" Matches the regular expression of this class against file names
Parameters
----------
string : str
path : str
The path of the file that shall be matched.
"""
# TODO this does not quite work. Sometimes the wrong expression is in
# _pattern; FIX
# if cls._pattern is None:
# cls._pattern = re.compile(cls.get_re())
# return cls._pattern.match(string)
return re.match(cls.get_re(), string)
def attach(self, crawled_file):
self.attached_ones.append(crawled_file)
return re.match(cls.get_re(), path) is not None
# TODO looking for should `attach` the files itsself. This would allow to
# group them right away and makes it unnecessary to check matches later
......@@ -320,37 +292,6 @@ class AbstractFileCFood(AbstractCFood):
return False
@staticmethod
# move to api?
def set_parents(entity, names):
entity.parents.clear()
for n in names:
entity.add_parent(get_entity(n))
@staticmethod
# move to api?
def remove_property(entity, prop):
# TODO only do something when it is necessary?
if isinstance(prop, db.Entity):
name = prop.name
else:
name = prop
while entity.get_property(name) is not None:
entity.remove_property(name)
@staticmethod
# move to api?
def set_property(entity, prop, value, datatype=None):
AbstractCFood.remove_property(entity, prop)
if datatype is not None:
entity.add_property(prop, value, datatype=datatype)
else:
entity.add_property(prop, value)
def assure_object_is_in_list(obj, containing_object, property_name,
to_be_updated, datatype=None):
......@@ -564,12 +505,11 @@ def get_ids_for_entities_with_names(entities):
class RowCFood(AbstractCFood):
def __init__(self, row, unique_cols, recordtype):
def __init__(self, item, unique_cols, recordtype, **kwargs):
"""
table : pandas table
"""
super().__init__()
self.row = row
super().__init__(item, **kwargs)
self.unique_cols = unique_cols
self.recordtype = recordtype
......@@ -578,13 +518,79 @@ class RowCFood(AbstractCFood):
rec.add_parent(self.recordtype)
for col in self.unique_cols:
rec.add_property(col, self.row.loc[col])
rec.add_property(col, self.item.loc[col])
self.identifiables.append(rec)
def update_identifiables(self):
rec = self.identifiables[0]
for key, value in self.row.iteritems():
for key, value in self.item.iteritems():
if key in self.unique_cols:
continue
rec.add_property(key, value)
class CMeal(object):
    """
    CMeal groups equivalent items and allows their collected insertion.

    Sometimes there is no single item that can be used to trigger the
    creation of some Record. E.g. if a collection of image files shall be
    referenced from one Record that groups them, it is unclear which image
    should trigger the creation of the Record.

    CMeals are grouped based on the named groups of the regular expression
    returned by get_re. If, in the above example, all the images reside in
    one folder, all groups of the match except the one for the file name
    should be equal. The groups that shall be equal need to be listed in the
    matching_groups class property. Subclasses will overwrite this property.

    This allows to use has_suitable_cfood in the match_item function of a
    CFood to check whether the necessary CFood was already created. In order
    to allow this, all instances of a CFood class are tracked in the
    existing_instances class member.

    Subclasses must have a cls.get_re function and a match member variable
    (see AbstractFileCFood).
    """
    # NOTE: __init__ appends to whatever list attribute lookup finds on the
    # instance's class. A subclass that does not define its own
    # `existing_instances` therefore shares this list with CMeal.
    existing_instances = []
    matching_groups = []

    def __init__(self):
        self.__class__.existing_instances.append(self)

    @classmethod
    def all_groups_equal(cls, m1, m2):
        """ Tells whether two regular expression matches agree in all groups
        listed in matching_groups.

        Parameters
        ----------
        m1, m2 : re.Match or None
                 The matches to compare. None (i.e. the regular expression
                 did not match at all) is never considered equal to anything.

        Returns
        -------
        bool
            True if every group in matching_groups exists in both matches
            and captured the same value in both, False otherwise.
        """
        # re.match returns None for strings that do not match. Such an item
        # cannot belong to any meal, so do not try to read groups from it.
        if m1 is None or m2 is None:
            return False

        groups1 = m1.groupdict()
        groups2 = m2.groupdict()

        return all(
            group in groups1
            and group in groups2
            and groups1[group] == groups2[group]
            for group in cls.matching_groups
        )

    @classmethod
    def has_suitable_cfood(cls, item):
        """ Checks whether the required cfood object already exists.

        Parameters
        ----------
        item : str
               The crawled item; it is matched against cls.get_re().

        Returns
        -------
        bool
            True if an instance in existing_instances has a match that
            agrees with the match of `item` in all matching_groups. False
            otherwise, in particular if `item` does not match the regular
            expression at all.
        """
        match = re.match(cls.get_re(), item)

        if match is None:
            return False

        return any(cls.all_groups_equal(match, cfood.match)
                   for cfood in cls.existing_instances)

    def belongs_to_meal(self, item):
        """ Tells whether `item` is an additional item of this meal.

        Returns False for the main item (self.item) itself and for items
        that do not match the regular expression. Otherwise returns whether
        the match of `item` agrees with self.match in all matching_groups.
        """
        # This is already the main item
        if item == self.item:
            return False
        match = re.match(self.get_re(), item)

        return self.all_groups_equal(match, self.match)
......@@ -82,44 +82,56 @@ class UnknownCache(object):
class Crawler(object):
def __init__(self, food=None, access=lambda x: x, use_cache=False,
abort_on_exception=True, interactive=True):
def __init__(self, cfood_types, use_cache=False,
abort_on_exception=True, interactive=True, hideKnown=False):
"""
Parameters
----------
food : list of CFood classes, optional
cfood_types : list of CFood classes
The Crawler will use those CFoods when crawling.
pattern : str
The regex pattern for matching against file names.
use_cache : bool, optional
Whether to use caching (not re-inserting probably existing
objects into CaosDB), defaults to False.
access : callable, optional
A function that takes a CaosDB path and returns a local path
abort_on_exception : if true, exceptions are raised.
Otherwise the crawler continues if an exception occurs.
interactive : boolean, optional
If true, questions will be posed during execution of the
crawl function.
"""
if food is None:
self.food = []
else:
self.food = food
self.cfood_types = cfood_types
self.interactive = interactive
self.access = access
self.report = db.Container()
self.use_cache = use_cache
self.hideKnown = hideKnown
self.abort_on_exception = abort_on_exception
if self.use_cache:
self.cache = Cache()
def iteritems(self):
""" generates items to be crawled with an index"""
yield 0, None
def collect_cfoods(self):
"""
to be overwritten by subclasses.
This is the first phase of the crawl. It collects all cfoods that shall
be processed. The second phase is iterating over cfoods and updating
CaosDB. This separate first step is necessary in order to allow a
single cfood being influenced by multiple crawled items. E.g. the
FileCrawler can have a single cfood treat multiple files.
This is a very basic implementation and this function should be
overwritten by subclasses.
The basic structure of this function should be that whatever is
being processed is iterated and each cfood is checked whether the
item 'matches'. If it does, a cfood is instantiated passing the item
as an argument.
The match can depend on the cfoods already being created, i.e. a file
might no longer match because it is already treated by an earlier
cfood.
should return cfoods, tbs and errors_occured.
# TODO do this via logging?
......@@ -129,27 +141,83 @@ class Crawler(object):
cfoods = []
tbs = []
errors_occured = False
matches = {idx: [] for idx, _ in self.iteritems()}
for food in self.food:
cfoods.append(food())
logger.info(separated("Matching files against CFoods"))
for Cfood in self.food:
try:
cfood = Cfood()
for Cfood in self.cfood_types:
logger.debug("Matching against {}...".format(Cfood.__name__))
if cfood is not None:
cfoods.append(cfood)
except Exception as e:
traceback.print_exc()
print(e)
for idx, item in self.iteritems():
if Cfood.match_item(item):
try:
cfoods.append(Cfood(item))
matches[idx].append(Cfood.__name__)
logger.debug("{} matched\n{}.".format(
Cfood.__name__,
item))
except Exception as e:
traceback.print_exc()
print(e)
if self.abort_on_exception:
raise e