Commit 3e2675a7 authored by hydrargyrum

flatten-json: replace list-guess complex algorithm by a naive list-first impl

Instead of pre-processing the input to look for lists and dicts, just create
lists by default, and turn them into dicts on the fly when we obviously can't
use a list anymore.
parent 4bc36327
......@@ -23,148 +23,89 @@ def flatten(obj, separator):
return ret
def expand(obj, separator, no_lists=False):
# should we expand {"0": 0, "1": 1} to itself or to [0, 1]?
# we try to guess! unless told not to guess
class ImpossibleList(Exception):
if no_lists:
listable = []
listable = sorted(build_list_candidates(obj, separator))
def get_create(sub, path_el, default):
def expand_nolists(obj, separator):
    """Expand a flat mapping of path-like keys into nested dicts.

    Each key of *obj* is split on *separator*.  Every component but the last
    names a nested dict (created on demand); the last component names the
    slot that receives the value.  Numeric components stay plain string keys,
    so no list is ever built.

    Args:
        obj: mapping whose keys are strings, e.g. ``{"foo/bar": 1}``.
        separator: string that delimits path components inside each key.

    Returns:
        A new nested dict, e.g. ``{"foo": {"bar": 1}}``.

    Raises:
        ValueError: if a key has to descend through a path component that
            already holds a non-dict value (for instance both ``"a"`` and
            ``"a/b/c"`` are present, in that order).
    """
    ret = {}
    for key, value in obj.items():
        path = key.split(separator)
        sub = ret
        for path_el in path[:-1]:
            sub = sub.setdefault(path_el, {})
            if not isinstance(sub, dict):
                # A leaf value already lives where a container is needed.
                # Fail with an explicit message instead of an unrelated
                # AttributeError/TypeError further down.
                raise ValueError(
                    'cannot expand key %r: component %r already holds '
                    'a non-dict value' % (key, path_el)
                )
        # Unconditional assignment: a later key replaces whatever was
        # previously stored in the same slot.
        sub[path[-1]] = value
    return ret
def expand_lists(obj, separator):
# should we expand {"0": 0, "1": 1} to itself or to [0, 1]?
# we try to guess!
def generic_setdefault(sub, path_el, default):
if isinstance(sub, dict):
return sub.setdefault(path_el, default)
assert isinstance(sub, list)
path_el = int(path_el)
if isinstance(path_el, str):
if path_el.isdigit():
path_el = int(path_el)
raise ImpossibleList()
if len(sub) > path_el:
return sub[path_el]
assert path_el == len(sub)
return sub[path_el]
elif len(sub) + 1 > path_el:
return sub[path_el]
raise ImpossibleList()
ret = {}
# prepare by building lists first
for path in listable:
sub = ret
for path_el in path[:-1]:
sub = get_create(sub, path_el, {})
get_create(sub, path[-1], [])
def list_to_dict(lis):
    """Return a dict mapping each index of *lis* to the element at that index.

    The keys are ints (``0`` to ``len(lis) - 1``), not strings.  An empty
    list yields an empty dict.
    """
    # Internal sanity check: only lists are expected to be converted.
    assert isinstance(lis, list)
    return dict(enumerate(lis))
# We create each element as a list first
# and as soon as we encounter a key that prevents it from being a list,
# e.g. a non-numeric or too-far key, then we transform it into a dict.
# encapsulate root list so we can change root element type seamlessly
# without making a special case
ret = {'root': []}
for k, value in obj.items():
path = k.split(separator)
sub = ret
for path_el in path[:-1]:
sub = get_create(sub, path_el, {})
get_create(sub, path[-1], value)
# if we don't care about building a list, we just need that:
# for k, value in obj.items():
# path = k.split(separator)
# sub = ret
# for path_el in path[:-1]:
# sub = sub.setdefault(path_el, {})
# sub[path[-1]] = value
return ret
parent = ret
parent_key = 'root'
sub = ret['root']
for n, path_el in enumerate(path, 1):
if n < len(path):
# not the last component, those are containers
# create a new list by default
to_set = []
# last path component, it's not a container but a value
to_set = value
def build_list_candidates(obj, separator):
# guess where are lists
# first, convert all keys to tuples (so we can forget about the separator)
all_keys = set(tuple(key.split(separator)) for key in obj)
# {"foo/bar": 0} -> {("foo", "bar")}
# add intermediate levels
# {("foo", "bar")} -> {("foo",), ("foo", "bar")}
for path in list(all_keys):
for n in range(len(path)):
# Sorting by length is useful to do some kind of breadth-first browsing!
# For example, we will have: foo, foo/bar, foo/baz, foo/bar/qux
# instead of (with lexical sort): foo/bar, foo/bar/qux, foo/baz
# With lexical sort, foo/bar/qux would prevent finding all foo subkeys (bar and baz)
# By sorting by depth, we have foo/bar and foo/baz close to each other
# and so it's breadth-first traversing.
all_keys = sorted(all_keys, key=lambda k: (len(k), k))
# The goal is that if we find a key that's not at the same level
# or that has a different prefix, then we know that we have browsed all siblings.
# When reaching foo/bar/baz, we're sure there won't be any other keys than bar and baz
# directly under foo. So we can check if there's a list and it's complete.
# compat dict: key = path, value = list of direct children of path (the key).
# We only accept integers in those dict-value lists.
# The values will be used later for hole-verification.
# compat contains the candidates that may be lists after the input expand.
compat = {}
# incompat: set of paths that we know that cannot be lists after the input expand.
# For example, if there's a non-numeric key under a path prefix
# then this path prefix cannot be a list, only a dict.
# It's a set because we don't need more info about this path:
# it's incompatible and that's all.
incompat = set()
previous = () # parent of the previous iteration
parent = None # just make sure it's defined
def has_hole(lis):
    """Return True when *lis* is exactly ``[0, 1, ..., len(lis) - 1]``.

    NOTE: despite the name, a True result means the sequence has NO hole
    (every position ``i`` holds the value ``i``); False means a hole or an
    out-of-order index was found.  An empty list yields True.
    """
    return all(value == index for index, value in enumerate(lis))
# Remove `parent` from the list candidates when its child indices are not
# exactly 0..n-1.
def check_holes(parent):
# Meant to be called once all siblings under `parent` have been visited
# and `parent` was not marked incompatible.
# compat[parent] holds the integer child keys seen under `parent`.
# If they are only [0, 2], the input JSON did not represent a list but a
# dict; a real list would have produced [0, 1] or [0, 1, 2].
assert parent in compat
assert parent not in incompat
# NOTE: has_hole() returns True when the indices are exactly 0..n-1,
# i.e. when there is NO hole, hence the `not` below.
if not has_hole(compat[parent]):
# There's a hole, this parent is not a list.
compat.pop(parent, None)
for path in all_keys:
parent = path[:-1]
if parent == previous: # we're traversing siblings of a parent
if parent in incompat:
# This parent is poisoned.
# Nothing interesting here.
else: # this key is not a sibling of previous key's parent
if previous in compat:
previous = parent
# Here, either we're "compatible" so far (under this parent)
# or we're encountering a new parent and haven't seen any child yet
# (so, we're compat so far too, because there was no incompatible
# child yet)
if path[-1].isdigit():
compat.setdefault(parent, []).append(int(path[-1]))
# This key prevents this parent from being a list.
# We have to be a dict. Poison this parent.
compat.pop(parent, None)
if path_el.isdigit():
path_el = int(path_el)
# End of the loop: we need to do the verification of the last parent we had.
# We usually do it when we encounter a different parent
# but the loop ended, so we do the last parent here.
if parent in compat:
new_parent = sub
sub = generic_setdefault(sub, path_el, to_set)
except ImpossibleList:
sub = parent[parent_key] = list_to_dict(sub)
sub = generic_setdefault(sub, path_el, to_set)
parent = new_parent
parent_key = path_el
# The values were used only for the sake of hole-verification.
# Only the valid list paths are useful for caller.
return list(compat)
return ret['root']
def main():
......@@ -221,20 +162,19 @@ def main():
parser.add_argument('--flatten', action='store_const', const=flatten, dest='op')
parser.add_argument('--expand', action='store_const', const=expand, dest='op')
parser.add_argument('--expand', action='store_const', const=expand_lists, dest='op')
parser.add_argument('--no-lists', action='store_true')
parser.add_argument('--separator', default='/')
args = parser.parse_args()
if args.op is flatten and args.no_lists:
parser.error('--no-lists can only be used with --expand')
if args.no_lists:
if args.op is flatten:
parser.error('--no-lists can only be used with --expand')
args.op = expand_nolists
obj = json.load(sys.stdin)
op_cb = (args.op or flatten)
if args.no_lists:
obj = expand(obj, separator=args.separator, no_lists=True)
obj = op_cb(obj, separator=args.separator)
obj = op_cb(obj, separator=args.separator)
json.dump(obj, sys.stdout)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment