crawler.py 13.8 KB
Newer Older
Henrik tom Wörden's avatar
update    
Henrik tom Wörden committed
1
2
3
4
5
6
7
8
#!/usr/bin/env python
# encoding: utf-8
#
# ** header v3.0
# This file is a part of the CaosDB Project.
#
# Copyright (C) 2018 Research Group Biomedical Physics,
# Max-Planck-Institute for Dynamics and Self-Organization Göttingen
9
# Copyright (C) 2020 Henrik tom Wörden
Henrik tom Wörden's avatar
update    
Henrik tom Wörden committed
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <https://www.gnu.org/licenses/>.
#
# ** end header
#
26
27
28
29
30
31
32
33
34
35
36
37
""" Crawls a file structure and inserts Records into CaosDB based on what is
found.

CaosDB can automatically be filled with Records based on some file structure.
The Crawler will iterate over the files and test for each file whether a CFood
exists that matches the file path. If one does, it is instantiated to treat the
match. This occurs in basically three steps:
1. create a list of identifiables, i.e. unique representations of CaosDB
Records (such as an experiment belonging to a project and a date/time)
2. the identifiables are either found in CaosDB or they are created.
3. the identifiables are updated based on the data in the file structure
"""
Henrik tom Wörden's avatar
Henrik tom Wörden committed
38
39


40
import logging
Henrik tom Wörden's avatar
Henrik tom Wörden committed
41
import os
42
import traceback
43
from datetime import datetime
44

Henrik tom Wörden's avatar
update    
Henrik tom Wörden committed
45
import caosdb as db
46
47
48
from caosdb.exceptions import TransactionError

from .cache import Cache
49
50
from .cfood import RowCFood
from .guard import RETRIEVE
Henrik tom Wörden's avatar
Henrik tom Wörden committed
51
from .guard import global_guard as guard
52
53
54
55
56
57
58
59
60

logger = logging.getLogger(__name__)


def separated(text):
    """Return *text* preceded by a 60-character dashed separator line."""
    divider = "-" * 60
    return "{}\n{}".format(divider, text)


class UnknownCache(object):
    """Persistent record of items that were not matched by any CFood.

    Entries are kept one per line in the file ``known_cache.db`` in the
    current working directory, so that already-reported unmatched items
    can be silenced in later runs.
    """

    def __init__(self, interactive=False, load=False):
        """Create the cache, optionally loading previously saved entries.

        Parameters
        ----------
        interactive : bool, optional
            If True, ask the user whether previously unmatched filenames
            shall be loaded (answering "y" enables loading).
        load : bool, optional
            If True, load entries from ``known_cache.db`` if it exists.
        """
        if interactive and "y" == input(
                "\nDo you want to load filenames that previously were not "
                "matched by any CFood?\nIn that case, they will not show up "
                "again. (y)"):
            load = True

        # start empty; fill from disk only when requested and available
        self.filenames = []

        if load and os.path.exists("known_cache.db"):
            with open("known_cache.db") as fi:
                self.filenames = [line.strip("\n") for line in fi.readlines()]

    def save(self):
        """Write all collected entries back to ``known_cache.db``."""
        with open("known_cache.db", "w") as fi:
            fi.writelines(name + "\n" for name in self.filenames)

    def add(self, el):
        """Record *el* as an item that could not be matched."""
        self.filenames.append(el)
Henrik tom Wörden's avatar
update    
Henrik tom Wörden committed
82

Henrik tom Wörden's avatar
Henrik tom Wörden committed
83
84

class Crawler(object):
    """Base class that crawls a collection of items and updates CaosDB.

    Subclasses supply the items via :meth:`iteritems` (e.g. file paths in
    ``FileCrawler`` or table rows in ``TableCrawler``).  Each item is
    matched against the configured CFood classes; matching CFoods create,
    find and update the corresponding Records in CaosDB.
    """

    def __init__(self, cfood_types, use_cache=False,
                 abort_on_exception=True, interactive=True, hideKnown=False):
        """
        Parameters
        ----------
        cfood_types : list of CFood classes
               The Crawler will use those CFoods when crawling.

        use_cache : bool, optional
                    Whether to use caching (not re-inserting probably existing
                    objects into CaosDB), defaults to False.

        abort_on_exception : bool, optional
                    If True, exceptions are raised.
                    Otherwise the crawler continues if an exception occurs.

        interactive : boolean, optional
                      If true, questions will be posed during execution of the
                      crawl function.

        hideKnown : bool, optional
                    If True, items that already failed to match in a previous
                    run (recorded in the UnknownCache) are only logged at
                    debug level instead of as warnings.
        """

        self.cfood_types = cfood_types
        self.interactive = interactive
        self.report = db.Container()
        self.use_cache = use_cache
        self.hideKnown = hideKnown
        self.abort_on_exception = abort_on_exception

        if self.use_cache:
            self.cache = Cache()

    def iteritems(self):
        """Generate (index, item) pairs to be crawled.

        This basic implementation yields a single dummy item; subclasses
        override it to yield files, table rows, etc.
        """
        yield 0, None

    def collect_cfoods(self):
        """
        This is the first phase of the crawl. It collects all cfoods that shall
        be processed. The second phase is iterating over cfoods and updating
        CaosDB. This separate first step is necessary in order to allow a
        single cfood being influenced by multiple crawled items. E.g. the
        FileCrawler can have a single cfood treat multiple files.

        This is a very basic implementation and this function should be
        overwritten by subclasses.

        The basic structure of this function should be, that what ever is
        being processed is iterated and each cfood is checked whether the
        item 'matches'. If it does, a cfood is instantiated passing the item
        as an argument.
        The match can depend on the cfoods already being created, i.e. a file
        might no longer match because it is already treated by an earlier
        cfood.

        should return cfoods, tbs and errors_occured.
        # TODO do this via logging?
        tbs text returned from traceback
        errors_occured True if at least one error occured
        """
        cfoods = []
        tbs = []
        errors_occured = False
        # one (initially empty) list of matching CFood names per item index
        matches = {idx: [] for idx, _ in self.iteritems()}

        logger.info(separated("Matching files against CFoods"))

        for Cfood in self.cfood_types:
            logger.debug("Matching against {}...".format(Cfood.__name__))

            for idx, item in self.iteritems():
                if Cfood.match_item(item):
                    try:
                        cfoods.append(Cfood(item))
                        matches[idx].append(Cfood.__name__)
                        logger.debug("{} matched\n{}.".format(
                                Cfood.__name__,
                                item))
                    except Exception as e:
                        traceback.print_exc()
                        print(e)

                        if self.abort_on_exception:
                            # bare raise preserves the original traceback
                            raise
                        errors_occured = True
                        tbs.append(e)

        logger.info(separated("CFoods are collecting information..."))

        for cfood in cfoods:
            cfood.collect_information()

        logger.info(separated("Trying to attach further items to created CFoods"))

        for cfood in cfoods:
            # BUG FIX: the original referenced the stale loop variable
            # `Cfood` (the last class of the previous loop) here and below,
            # so the wrong CFood name was logged and recorded in `matches`.
            logger.debug("Matching against {}...".format(
                cfood.__class__.__name__))

            for idx, item in self.iteritems():
                if cfood.looking_for(item):
                    logger.debug("{} matched\n{}.".format(
                            cfood.__class__.__name__,
                            item))
                    cfood.attach(item)
                    matches[idx].append(cfood.__class__.__name__)

        self.check_matches(matches)

        return cfoods, tbs, errors_occured

    def check_matches(self, matches):
        """Warn about items with no or more than one matching CFood.

        Parameters
        ----------
        matches : dict mapping item index -> list of matching CFood names
        """
        # possibly load previously encountered "Missing matches" and
        # "Multiple matches"
        ucache = UnknownCache(interactive=self.interactive, load=self.hideKnown)

        for idx, item in self.iteritems():
            if len(matches[idx]) == 0:
                msg = ("ATTENTION: No matching cfood!\n"
                       "Tried to match {}\n".format(item))

                # previously reported items are demoted to debug level
                if item in ucache.filenames:
                    logger.debug(msg)
                else:
                    logger.warning(msg)
                ucache.add(item)

            if len(matches[idx]) > 1:
                msg = ("Attention: More than one matching cfood!\n"
                       + "Tried to match {}\n".format(item)
                       + "\tRecordTypes:\t" + ", ".join(
                            matches[idx])+"\n")

                if item in ucache.filenames:
                    logger.debug(msg)
                else:
                    logger.warning(msg)
                ucache.add(item)

        # Save the encountered problem matches
        ucache.save()

    def cached_find_identifiables(self, identifiables):
        """Find or insert identifiables, consulting the id cache if enabled.

        When caching is active, known ids are filled in from the cache
        before the CaosDB lookup and the (possibly new) ids are written
        back afterwards.
        """
        if self.use_cache:
            hashes = self.cache.update_ids_from_cache(identifiables)

        self.find_or_insert_identifiables(identifiables)

        if self.use_cache:
            self.cache.insert_list(hashes, identifiables)

    def crawl(self, security_level=RETRIEVE):
        """Run the crawl: collect CFoods, then create/update identifiables.

        Parameters
        ----------
        security_level : the guard level (e.g. RETRIEVE) limiting which
                         write operations are allowed during this crawl.
        """
        guard.set_level(level=security_level)

        cfoods, tbs, errors_occured = self.collect_cfoods()

        if self.interactive and "y" != input("Do you want to continue? (y)"):
            return

        logger.info(separated("Creating and updating Identifiables"))

        for cfood in cfoods:
            try:
                cfood.create_identifiables()

                self.cached_find_identifiables(cfood.identifiables)

                cfood.update_identifiables()
                cfood.push_identifiables_to_CaosDB()
            except Exception as e:
                traceback.print_exc()
                print(e)

                if self.abort_on_exception:
                    # bare raise preserves the original traceback
                    raise
                errors_occured = True
                tbs.append(e)

        if errors_occured:
            logger.warning("Crawler terminated with failures!")
            logger.warning(tbs)
        else:
            logger.info("Crawler terminated successfully!")

    # TODO remove static?
    @staticmethod
    def find_or_insert_identifiables(identifiables):
        """ Sets the ids of identifiables (that do not have already an id from the
        cache) based on searching CaosDB and retrieves those entities.
        The remaining entities (those which can not be retrieved) have no
        correspondence in CaosDB and are thus inserted.
        """
        # looking for matching entities in CaosDB when there is no valid id
        # i.e. there was none set from a cache

        for ent in identifiables:
            if ent.id is None or ent.id < 0:
                logger.debug("Looking for: {}".format(ent))
                existing = Crawler.find_existing(ent)

                if existing is not None:
                    ent.id = existing.id
            else:
                logger.debug("Id is known of: {}".format(ent))

        # insert missing, i.e. those which are not valid
        missing_identifiables = db.Container()
        missing_identifiables.extend([ent for ent in identifiables
                                      if ent.id is None or ent.id < 0])
        # TODO the following should not be necessary. Fix it

        for ent in missing_identifiables:
            ent.id = None

        if len(missing_identifiables) > 0:
            logger.info("Going to insert the following entities:")

            for ent in missing_identifiables:
                logger.info(ent)

        if len(missing_identifiables) == 0:
            logger.debug("No new entities to be inserted.")
        else:
            guard.safe_insert(missing_identifiables)

        logger.debug("Retrieving entities from CaosDB...")
        identifiables.retrieve(unique=True, raise_exception_on_error=False)

    @staticmethod
    def find_existing(entity):
        """searches for an entity that matches the identifiable in CaosDB

        Characteristics of the identifiable like, properties, name or id are
        used for the match.
        """

        if entity.name is None:
            # TODO multiple parents are ignored! Sufficient?
            query_string = "FIND Record " + entity.get_parents()[0].name
            # BUG FIX: the original concatenated "'p'='v' AND " per property
            # and unconditionally chopped four characters off the end, which
            # produced a broken query ("FIND Record X W") for an
            # identifiable without properties.
            conditions = ["'" + p.name + "'='" + str(get_value(p)) + "'"
                          for p in entity.get_properties()]

            if conditions:
                query_string += " WITH " + " AND ".join(conditions)
        else:
            query_string = "FIND '{}'".format(entity.name)

        logger.debug(query_string)
        q = db.Query(query_string)
        # the identifiable should identify an object uniquely. Thus the query
        # is using the unique keyword
        try:
            r = q.execute(unique=True)
        except TransactionError:
            r = None

        return r
Henrik tom Wörden's avatar
Henrik tom Wörden committed
343

344

345
class FileCrawler(Crawler):
    """Crawler that iterates over a fixed set of CaosDB File entities."""

    def __init__(self, files, **kwargs):
        """
        Parameters
        ----------
        files : files to be crawled

        """
        super().__init__(**kwargs)
        self.files = files

    def iteritems(self):
        """Yield (index, path) pairs for all files, sorted by path."""
        sorted_paths = sorted(f.path for f in self.files)

        for idx, path in enumerate(sorted_paths):
            yield idx, path

    @staticmethod
    def query_files(path):
        """Query CaosDB for all File entities stored below *path*."""
        prefix = path if path.endswith("/") else path + "/"
        query_str = "FIND FILE WHICH IS STORED AT " + prefix + "**"
        logger.info("FILES QUERY: " + query_str)
        files = db.execute_query(query_str)
        logger.info("{} FILES TO BE PROCESSED.".format(len(files)))

        return files
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382


class TableCrawler(Crawler):
    """Crawler that treats every row of a pandas DataFrame as one item."""

    def __init__(self, table, unique_cols, recordtype, **kwargs):
        """
        Parameters
        ----------
        table : pandas DataFrame
        unique_cols : the columns that provide the properties for the
                      identifiable
        recordtype : Record Type of the Records to be created
        """
        self.table = table

        # TODO I do not like this yet, but I do not see a better way so far.
        # Bind unique_cols and recordtype into a per-instance CFood subclass
        # so the generic row-matching machinery can instantiate it with the
        # row alone.
        class ThisRowCF(RowCFood):
            def __init__(self, item):
                super().__init__(item, unique_cols, recordtype)

        super().__init__(cfood_types=[ThisRowCF], **kwargs)

    def iteritems(self):
        """Yield (index, row) pairs for every row of the table."""
        yield from self.table.iterrows()
393
394


395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
def get_value(prop):
    """ Returns the value of a Property

    Parameters
    ----------
    prop : The property of which the value shall be returned.

    Returns
    -------
    out : The value of the property; if the value is an entity, its ID.

    """
    value = prop.value

    # referenced entities are represented by their ID
    if isinstance(value, db.Entity):
        return value.id

    # datetimes are rendered in ISO 8601 form for use in query strings
    if isinstance(value, datetime):
        return value.isoformat()

    return value