# This file is distributed under the terms of the 2-clause BSD License.
# Copyright (c) 2017, Almar Klein

"""
Python implementation of the Binary Structured Data Format (BSDF).

BSDF is a binary format for serializing structured (scientific) data.
See http://bsdf.io for more information.

This is a lite (i.e. minimal) variant of the Python implementation.
Intended for easy incorporation in projects, and as a demonstration of
how simple a BSDF implementation can be.

This module has no dependencies and works on Python 3.4+.
"""

import bz2
import hashlib
import logging
import struct
import sys
import zlib
from io import BytesIO

# Module-level logger, named after this module; used to report BSDF warnings.
logger = logging.getLogger(__name__)


# Version of this implementation as a (major, minor, patch) tuple. The major
# and minor numbers are written to and checked against the file header.
VERSION = 2, 2, 1
__version__ = '.'.join(str(i) for i in VERSION)


# %% The encoder and decoder implementation


# Shorthands
spack = struct.pack
strunpack = struct.unpack


def lencode(x):
    """ Encode an unsigned integer into a variable sized blob of bytes.

    Values up to and including 250 take a single byte. Anything larger is
    written as the marker byte 253 followed by the value as an unsigned
    64-bit little-endian integer (9 bytes in total).
    """
    # 16 bit and 32 bit variants are deliberately not supported: the gain
    # is marginal for collections that have more than 250 elements anyway.
    fits_in_one_byte = x <= 250
    return spack('<B', x) if fits_in_one_byte else spack('<BQ', 253, x)


# Include len decoder for completeness; we've inlined it for performance.
def lendecode(f):
    """ Decode an unsigned integer from a file.

    Reads one byte from ``f``; if that byte is the marker 253, the actual
    value is read from the next 8 bytes as an unsigned 64-bit little-endian
    integer. Otherwise the byte itself is the value.
    """
    (size,) = strunpack('<B', f.read(1))
    if size != 253:
        return size
    (size,) = strunpack('<Q', f.read(8))
    return size


def encode_type_id(b, ext_id):
    """ Encode the type identifier, with or without extension id.

    Parameters:
        b (bytes): the single-character type identifier (e.g. ``b'l'``).
        ext_id (str or None): the extension name, or None for plain values.

    Returns:
        bytes: ``b`` unchanged when there is no extension id. Otherwise the
        uppercased identifier, followed by the length of the UTF-8 encoded
        extension id (as produced by ``lencode``) and the encoded id itself.
    """
    if ext_id is None:
        return b
    bb = ext_id.encode('UTF-8')
    # An uppercase identifier signals that an extension id follows.
    return b.upper() + lencode(len(bb)) + bb


class BsdfLiteSerializer(object):
    """ Instances of this class represent a BSDF encoder/decoder.

    This is a lite variant of the Python BSDF serializer. It does not support
    lazy loading or streaming, but is otherwise fully functional, including
    support for custom extensions.

    It acts as a placeholder for a set of extensions and encoding/decoding
    options. Options for encoding:

    * compression (int or str): ``0`` or "no" for no compression (default),
      ``1`` or "zlib" for Zlib compression (same as zip files and PNG), and
      ``2`` or "bz2" for Bz2 compression (more compact but slower writing).
      Note that some BSDF implementations (e.g. JavaScript) may not support
      compression.
    * use_checksum (bool): whether to include a checksum with binary blobs.
    * float64 (bool): Whether to write floats as 64 bit (default) or 32 bit.

    """

    def __init__(self, extensions=None, **options):
        """ Create a serializer.

        Parameters:
            extensions (iterable of Extension subclasses, optional): the
                extensions to register. When None, ``standard_extensions``
                is used.
            **options: encoding options, see the class docstring.
        """
        self._extensions = {}  # name -> extension
        self._extensions_by_cls = {}  # cls -> (name, extension.encode)
        if extensions is None:
            extensions = standard_extensions
        for extension in extensions:
            self.add_extension(extension)
        self._parse_options(**options)

    def _parse_options(self, compression=0, use_checksum=False, float64=True):
        """ Validate the encoding options and store them on the instance.

        Raises TypeError if ``compression`` is not one of 0, 1, 2, "no",
        "zlib" or "bz2" (strings are matched case-insensitively).
        """
        # Validate compression
        if isinstance(compression, str):
            m = {'no': 0, 'zlib': 1, 'bz2': 2}
            compression = m.get(compression.lower(), compression)
        if compression not in (0, 1, 2):
            raise TypeError('Compression must be 0, 1, 2, '
                            '"no", "zlib", or "bz2"')
        self._compression = compression

        # Other encoding args
        self._use_checksum = bool(use_checksum)
        self._float64 = bool(float64)

    def add_extension(self, extension_class):
        """ Add an extension to this serializer instance, which must be
        a subclass of Extension. Can be used as a decorator.

        The class is instantiated here. Raises TypeError if the argument is
        not an Extension subclass, if the extension name is not a str, or if
        one of its ``cls`` entries is not a type. Raises NameError if the
        name is empty or longer than 250 characters. Returns the given class
        unchanged.
        """
        # Check class
        if not (isinstance(extension_class, type) and
                issubclass(extension_class, Extension)):
            raise TypeError('add_extension() expects a Extension class.')
        extension = extension_class()

        # Get name
        name = extension.name
        if not isinstance(name, str):
            raise TypeError('Extension name must be str.')
        if len(name) == 0 or len(name) > 250:
            raise NameError('Extension names must be nonempty and shorter '
                            'than 251 chars.')
        if name in self._extensions:
            logger.warning('BSDF warning: overwriting extension "%s", '
                           'consider removing first', name)

        # Get classes
        cls = extension.cls
        if not cls:
            clss = []
        elif isinstance(cls, (tuple, list)):
            clss = cls
        else:
            clss = [cls]
        for cls in clss:
            if not isinstance(cls, type):
                raise TypeError('Extension classes must be types.')

        # Store
        for cls in clss:
            self._extensions_by_cls[cls] = name, extension.encode
        self._extensions[name] = extension
        return extension_class

    def remove_extension(self, name):
        """ Remove an extension by its unique name.

        Removing a name that is not registered is a no-op. Raises TypeError
        if ``name`` is not a str.
        """
        if not isinstance(name, str):
            raise TypeError('Extension name must be str.')
        if name in self._extensions:
            self._extensions.pop(name)
        # Also drop the class-based shortcuts that point to this extension.
        for cls in list(self._extensions_by_cls.keys()):
            if self._extensions_by_cls[cls][0] == name:
                self._extensions_by_cls.pop(cls)

    def _encode(self, f, value, ext_id):
        """ Main encoder function.

        Writes ``value`` to the file object ``f``. ``ext_id`` is the name of
        the extension that produced ``value`` (or None); it is written along
        with the type identifier. Raises TypeError for values that are
        neither a base BSDF type nor handled by an extension, and ValueError
        if an extension encodes its value to another extension object.
        """
        x = encode_type_id

        if value is None:
            f.write(x(b'v', ext_id))  # V for void
        elif value is True:
            f.write(x(b'y', ext_id))  # Y for yes
        elif value is False:
            f.write(x(b'n', ext_id))  # N for no
        elif isinstance(value, int):
            if -32768 <= value <= 32767:
                # Explicitly little-endian, to match what the decoder reads.
                f.write(x(b'h', ext_id) + spack('<h', value))  # H for ...
            else:
                f.write(x(b'i', ext_id) + spack('<q', value))  # I for int
        elif isinstance(value, float):
            if self._float64:
                f.write(x(b'd', ext_id) + spack('<d', value))  # D for double
            else:
                f.write(x(b'f', ext_id) + spack('<f', value))  # f for float
        elif isinstance(value, str):
            bb = value.encode('UTF-8')
            f.write(x(b's', ext_id) + lencode(len(bb)))  # S for str
            f.write(bb)
        elif isinstance(value, (list, tuple)):
            f.write(x(b'l', ext_id) + lencode(len(value)))  # L for list
            for v in value:
                self._encode(f, v, None)
        elif isinstance(value, dict):
            f.write(x(b'm', ext_id) + lencode(len(value)))  # M for mapping
            for key, v in value.items():
                assert isinstance(key, str)
                name_b = key.encode('UTF-8')
                f.write(lencode(len(name_b)))
                f.write(name_b)
                self._encode(f, v, None)
        elif isinstance(value, bytes):
            f.write(x(b'b', ext_id))  # B for blob
            # Compress
            compression = self._compression
            if compression == 0:
                compressed = value
            elif compression == 1:
                compressed = zlib.compress(value, 9)
            elif compression == 2:
                compressed = bz2.compress(value, 9)
            else:
                assert False, 'Unknown compression identifier'
            # Get sizes
            data_size = len(value)
            used_size = len(compressed)
            extra_size = 0
            allocated_size = used_size + extra_size
            # Write sizes - write at least in a size that allows resizing
            if allocated_size <= 250 and compression == 0:
                f.write(spack('<B', allocated_size))
                f.write(spack('<B', used_size))
                f.write(lencode(data_size))
            else:
                f.write(spack('<BQ', 253, allocated_size))
                f.write(spack('<BQ', 253, used_size))
                f.write(spack('<BQ', 253, data_size))
            # Compression and checksum
            f.write(spack('<B', compression))
            if self._use_checksum:
                f.write(b'\xff' + hashlib.md5(compressed).digest())
            else:
                f.write(b'\x00')
            # Byte alignment (only necessary for uncompressed data)
            if compression == 0:
                alignment = 8 - (f.tell() + 1) % 8  # +1 for the byte to write
                f.write(spack('<B', alignment))  # padding for byte alignment
                f.write(b'\x00' * alignment)
            else:
                f.write(spack('<B', 0))
            # The actual data and extra space
            f.write(compressed)
            f.write(b'\x00' * (allocated_size - used_size))
        elif getattr(value, "shape", None) == () and str(
            getattr(value, "dtype", "")
        ).startswith(("uint", "int", "float")):
            # Implicit conversion of numpy scalars: convert to the builtin
            # equivalent and encode that via the int/float branches above.
            if 'int' in str(value.dtype):
                self._encode(f, int(value), ext_id)
            else:
                self._encode(f, float(value), ext_id)
        else:
            if ext_id is not None:
                raise ValueError(
                    'Extension %s wronfully encodes object to another '
                    'extension object (though it may encode to a list/dict '
                    'that contains other extension objects).' % ext_id)
            # Try if the value is of a type we know
            ex = self._extensions_by_cls.get(value.__class__, None)
            # Maybe its a subclass of a type we know
            if ex is None:
                for name, c in self._extensions.items():
                    if c.match(self, value):
                        ex = name, c.encode
                        break
            # Success or fail
            if ex is not None:
                ext_id2, extension_encode = ex
                self._encode(f, extension_encode(self, value), ext_id2)
            else:
                t = ('Class %r is not a valid base BSDF type, nor is it '
                     'handled by an extension.')
                raise TypeError(t % value.__class__.__name__)

    def _decode(self, f):
        """ Main decoder function.

        Reads one value from the file object ``f`` and returns it. Raises
        EOFError when no more data is available, and RuntimeError for an
        unknown type identifier or compression id. If the value carries an
        extension id for which no extension is registered, a warning is
        logged and the undecoded base value is returned.
        """
        # Get value
        char = f.read(1)
        c = char.lower()

        # Conversion (uppercase value identifiers signify converted values)
        if not char:
            raise EOFError()
        elif char != c:
            n = strunpack('<B', f.read(1))[0]
            # Names are limited to 250 chars, but their UTF-8 encoding can
            # be longer, in which case the encoder uses the 9-byte length.
            if n == 253:
                n = strunpack('<Q', f.read(8))[0]
            ext_id = f.read(n).decode('UTF-8')
        else:
            ext_id = None

        if c == b'v':
            value = None
        elif c == b'y':
            value = True
        elif c == b'n':
            value = False
        elif c == b'h':
            value = strunpack('<h', f.read(2))[0]
        elif c == b'i':
            value = strunpack('<q', f.read(8))[0]
        elif c == b'f':
            value = strunpack('<f', f.read(4))[0]
        elif c == b'd':
            value = strunpack('<d', f.read(8))[0]
        elif c == b's':
            n_s = strunpack('<B', f.read(1))[0]
            if n_s == 253:
                n_s = strunpack('<Q', f.read(8))[0]
            value = f.read(n_s).decode('UTF-8')
        elif c == b'l':
            n = strunpack('<B', f.read(1))[0]
            if n >= 254:
                # Streaming
                closed = n == 254
                n = strunpack('<Q', f.read(8))[0]
                if closed:
                    value = [self._decode(f) for i in range(n)]
                else:
                    # Unclosed stream: read items until the data runs out.
                    value = []
                    try:
                        while True:
                            value.append(self._decode(f))
                    except EOFError:
                        pass
            else:
                # Normal
                if n == 253:
                    n = strunpack('<Q', f.read(8))[0]
                value = [self._decode(f) for i in range(n)]
        elif c == b'm':
            value = dict()
            n = strunpack('<B', f.read(1))[0]
            if n == 253:
                n = strunpack('<Q', f.read(8))[0]
            for i in range(n):
                n_name = strunpack('<B', f.read(1))[0]
                if n_name == 253:
                    n_name = strunpack('<Q', f.read(8))[0]
                assert n_name > 0
                name = f.read(n_name).decode('UTF-8')
                value[name] = self._decode(f)
        elif c == b'b':
            # Read blob header data (5 to 42 bytes)
            # Size
            allocated_size = strunpack('<B', f.read(1))[0]
            if allocated_size == 253:
                allocated_size = strunpack('<Q', f.read(8))[0]
            used_size = strunpack('<B', f.read(1))[0]
            if used_size == 253:
                used_size = strunpack('<Q', f.read(8))[0]
            data_size = strunpack('<B', f.read(1))[0]
            if data_size == 253:
                data_size = strunpack('<Q', f.read(8))[0]  # noqa - not used
            # Compression and checksum
            compression = strunpack('<B', f.read(1))[0]
            has_checksum = strunpack('<B', f.read(1))[0]
            if has_checksum:
                checksum = f.read(16)  # noqa - not used yet
            # Skip alignment
            alignment = strunpack('<B', f.read(1))[0]
            f.read(alignment)
            # Get data
            compressed = f.read(used_size)
            # Skip remaining space
            f.read(allocated_size - used_size)
            # Decompress
            if compression == 0:
                value = compressed
            elif compression == 1:
                value = zlib.decompress(compressed)
            elif compression == 2:
                value = bz2.decompress(compressed)
            else:
                raise RuntimeError('Invalid compression %i' % compression)
        else:
            raise RuntimeError('Parse error %r' % char)

        # Convert value if we have an extension for it
        if ext_id is not None:
            extension = self._extensions.get(ext_id, None)
            if extension is not None:
                value = extension.decode(self, value)
            else:
                logger.warning('BSDF warning: no extension found for %r',
                               ext_id)

        return value

    def encode(self, ob):
        """ Save the given object to bytes.
        """
        f = BytesIO()
        self.save(f, ob)
        return f.getvalue()

    def save(self, f, ob):
        """ Write the given object to the given file object.

        The data is preceded by the magic string ``BSDF`` and the major and
        minor version numbers of this implementation (one byte each).
        """
        f.write(b'BSDF')
        f.write(spack('<B', VERSION[0]))
        f.write(spack('<B', VERSION[1]))

        self._encode(f, ob, None)

    def decode(self, bb):
        """ Load the data structure that is BSDF-encoded in the given bytes.
        """
        f = BytesIO(bb)
        return self.load(f)

    def load(self, f):
        """ Load a BSDF-encoded object from the given file object.

        Raises RuntimeError if the data does not start with the ``BSDF``
        magic string, or if its major version differs from that of this
        implementation. A higher minor version only logs a warning.
        """
        # Check magic string
        if f.read(4) != b'BSDF':
            raise RuntimeError('This does not look a BSDF file.')
        # Check version
        major_version = strunpack('<B', f.read(1))[0]
        minor_version = strunpack('<B', f.read(1))[0]
        file_version = '%i.%i' % (major_version, minor_version)
        if major_version != VERSION[0]:  # major version should be 2
            t = ('Reading file with different major version (%s) '
                 'from the implementation (%s).')
            raise RuntimeError(t % (file_version, __version__))
        if minor_version > VERSION[1]:  # minor should be < ours
            t = ('BSDF warning: reading file with higher minor version (%s) '
                 'than the implementation (%s).')
            logger.warning(t, file_version, __version__)

        return self._decode(f)


# %% Standard extensions

# Defining extensions as a dict would be more compact and feel lighter, but
# that would only allow lambdas, which is too limiting, e.g. for the ndarray
# extension.

class Extension(object):
    """ Base class to implement BSDF extensions for special data types.

    Extension classes are provided to the BSDF serializer, which
    instantiates the class. That way, the extension can be somewhat dynamic:
    e.g. the NDArrayExtension exposes the ndarray class only when numpy
    is imported.

    An extension instance must have two attributes. These can be attributes
    of the class, or of the instance set in ``__init__()``:

    * name (str): the name by which encoded values will be identified.
    * cls (type): the type (or list of types) to match values with.
      This is optional, but it makes the encoder select extensions faster.

    Further, it needs 3 methods:

    * `match(serializer, value) -> bool`: return whether the extension can
      convert the given value. The default is ``isinstance(value, self.cls)``.
    * `encode(serializer, value) -> encoded_value`: the function to encode a
      value to more basic data types.
    * `decode(serializer, encoded_value) -> value`: the function to decode an
      encoded value back to its intended representation.

    """

    name = ''
    cls = ()

    def __repr__(self):
        # hex() already includes the '0x' prefix, so do not add another one.
        return '<BSDF extension %r at %s>' % (self.name, hex(id(self)))

    def match(self, s, v):
        """ Return whether this extension can convert value ``v``. """
        return isinstance(v, self.cls)

    def encode(self, s, v):
        """ Encode ``v`` to more basic data types; subclasses implement this.
        """
        raise NotImplementedError()

    def decode(self, s, v):
        """ Decode an encoded value back; subclasses implement this.
        """
        raise NotImplementedError()


class ComplexExtension(Extension):
    """ Extension for complex numbers, stored under the name ``'c'`` as a
    (real, imag) pair.
    """

    name = 'c'
    cls = complex

    def encode(self, s, v):
        """ Return the real and imaginary parts of ``v`` as a tuple. """
        return (v.real, v.imag)

    def decode(self, s, v):
        """ Rebuild the complex number from the first two items of ``v``. """
        return complex(v[0], v[1])


class NDArrayExtension(Extension):
    """ Extension for n-dimensional arrays, stored under the name
    ``'ndarray'`` as a dict with the keys ``shape``, ``dtype`` and ``data``.
    """

    name = 'ndarray'

    def __init__(self):
        # Only expose the ndarray class if numpy has already been imported
        # by someone else; this module itself never requires numpy.
        if 'numpy' in sys.modules:
            import numpy as np
            self.cls = np.ndarray

    def match(self, s, v):  # e.g. for nd arrays in JS
        """ Match any object that has ``shape``, ``dtype`` and ``tobytes``
        attributes, so array-like objects other than ndarray also qualify.
        """
        return (hasattr(v, 'shape') and
                hasattr(v, 'dtype') and
                hasattr(v, 'tobytes'))

    def encode(self, s, v):
        """ Encode the array as a dict holding its shape, the dtype as a
        string, and the raw bytes.
        """
        return dict(shape=v.shape,
                    dtype=str(v.dtype),
                    data=v.tobytes())

    def decode(self, s, v):
        """ Turn the encoded dict back into a numpy array.

        If numpy cannot be imported, the encoded dict is returned as-is.
        """
        try:
            import numpy as np
        except ImportError:
            return v
        a = np.frombuffer(v['data'], dtype=v['dtype'])
        a.shape = v['shape']
        return a


# The extension classes that BsdfLiteSerializer registers when it is created
# without an explicit ``extensions`` argument.
standard_extensions = [ComplexExtension, NDArrayExtension]