Skip to content
Snippets Groups Projects
Commit 7a253f87 authored by Eduardo Bonet's avatar Eduardo Bonet :speech_balloon:
Browse files

Uses OJ Saj Parser to speed up Ipynbdiff

Removes our Ruby implementation of a Json Source Map generator in favor
of OJ Saj parser

Changelog: performance
parent 8498be51
No related branches found
No related tags found
1 merge request!91577Resolve "Further optimise IpynbSymbolMap by using C extensions"
Showing
with 177 additions and 406 deletions
......@@ -15,7 +15,7 @@ PATH
specs:
ipynbdiff (0.4.7)
diffy (~> 3.3)
json (~> 2.5, >= 2.5.1)
oj (~> 3.13.16)
PATH
remote: vendor/gems/mail-smtp_pool
......
......@@ -79,7 +79,7 @@ def notebook_diff
rescue Timeout::Error => e
rendered_timeout.increment(source: Gitlab::Runtime.sidekiq? ? BACKGROUND_EXECUTION : FOREGROUND_EXECUTION)
log_event(LOG_IPYNBDIFF_TIMEOUT, e)
rescue IpynbDiff::InvalidNotebookError, IpynbDiff::InvalidTokenError => e
rescue IpynbDiff::InvalidNotebookError => e
log_event(LOG_IPYNBDIFF_INVALID, e)
end
end
......
......@@ -91,7 +91,7 @@ def source_line_from_block(transformed_line, transformed_blocks)
return 0 unless line_in_source.present?
line_in_source + 1
line_in_source
end
def image_as_rich_text(line_text)
......
......@@ -39,7 +39,7 @@ def make_lines(old_lines, new_lines, texts = nil, types = nil)
where(:case, :transformed_blocks, :result) do
'if transformed diff is empty' | [] | 0
'if the transformed line does not map to any in the original file' | [{ source_line: nil }] | 0
'if the transformed line maps to a line in the source file' | [{ source_line: 2 }] | 3
'if the transformed line maps to a line in the source file' | [{ source_line: 3 }] | 3
end
with_them do
......@@ -81,8 +81,8 @@ def make_lines(old_lines, new_lines, texts = nil, types = nil)
let(:blocks) do
{
from: [0, 2, 1, nil, nil, 3].map { |i| { source_line: i } },
to: [0, 1, nil, 2, nil, 3].map { |i| { source_line: i } }
from: [1, 3, 2, nil, nil, 4].map { |i| { source_line: i } },
to: [1, 2, nil, 3, nil, 4].map { |i| { source_line: i } }
}
end
......
......@@ -3,7 +3,7 @@ PATH
specs:
ipynbdiff (0.4.7)
diffy (~> 3.3)
json (~> 2.5, >= 2.5.1)
oj (~> 3.13.16)
GEM
remote: https://rubygems.org/
......@@ -15,9 +15,9 @@ GEM
coderay (1.1.3)
diff-lcs (1.5.0)
diffy (3.4.2)
json (2.6.2)
memory_profiler (1.0.0)
method_source (1.0.0)
oj (3.13.16)
parser (3.1.2.0)
ast (~> 2.4.1)
proc_to_ast (0.1.0)
......@@ -40,7 +40,7 @@ GEM
rspec-mocks (3.11.1)
diff-lcs (>= 1.2.0, < 2.0)
rspec-support (~> 3.11.0)
rspec-parameterized (0.5.1)
rspec-parameterized (0.5.2)
binding_ninja (>= 0.2.3)
parser
proc_to_ast
......
......@@ -23,7 +23,7 @@ Gem::Specification.new do |s|
s.require_paths = ['lib']
s.add_runtime_dependency 'diffy', '~> 3.3'
s.add_runtime_dependency 'json', '~> 2.5', '>= 2.5.1'
s.add_runtime_dependency 'oj', '~> 3.13.16'
s.add_development_dependency 'bundler', '~> 2.2'
s.add_development_dependency 'pry', '~> 0.14'
......
# frozen_string_literal: true
module IpynbDiff
class InvalidTokenError < StandardError
end
# Creates a symbol map for a ipynb file (JSON format)
class IpynbSymbolMap
class << self
def parse(notebook, objects_to_ignore = [])
IpynbSymbolMap.new(notebook, objects_to_ignore).parse('')
end
end
attr_reader :current_line, :char_idx, :results
WHITESPACE_CHARS = ["\t", "\r", ' ', "\n"].freeze
VALUE_STOPPERS = [',', '[', ']', '{', '}', *WHITESPACE_CHARS].freeze
def initialize(notebook, objects_to_ignore = [])
@chars = notebook.chars
@current_line = 0
@char_idx = 0
@results = {}
@objects_to_ignore = objects_to_ignore
end
def parse(prefix = '.')
raise_if_file_ended
skip_whitespaces
if (c = current_char) == '"'
parse_string
elsif c == '['
parse_array(prefix)
elsif c == '{'
parse_object(prefix)
else
parse_value
end
results
end
def parse_array(prefix)
# [1, 2, {"some": "object"}, [1]]
i = 0
current_should_be '['
loop do
raise_if_file_ended
break if skip_beginning(']')
new_prefix = "#{prefix}.#{i}"
add_result(new_prefix, current_line)
parse(new_prefix)
i += 1
end
end
def parse_object(prefix)
# {"name":"value", "another_name": [1, 2, 3]}
current_should_be '{'
loop do
raise_if_file_ended
break if skip_beginning('}')
prop_name = parse_string(return_value: true)
next_and_skip_whitespaces
current_should_be ':'
next_and_skip_whitespaces
if @objects_to_ignore.include? prop_name
skip
else
new_prefix = "#{prefix}.#{prop_name}"
add_result(new_prefix, current_line)
parse(new_prefix)
end
end
end
def parse_string(return_value: false)
current_should_be '"'
init_idx = @char_idx
loop do
increment_char_index
raise_if_file_ended
if current_char == '"' && !prev_backslash?
init_idx += 1
break
end
end
@chars[init_idx...@char_idx].join if return_value
end
def add_result(key, line_number)
@results[key] = line_number
end
def parse_value
increment_char_index until raise_if_file_ended || VALUE_STOPPERS.include?(current_char)
end
def skip_whitespaces
while WHITESPACE_CHARS.include?(current_char)
raise_if_file_ended
check_for_new_line
increment_char_index
end
end
def increment_char_index
@char_idx += 1
end
def next_and_skip_whitespaces
increment_char_index
skip_whitespaces
end
def current_char
raise_if_file_ended
@chars[@char_idx]
end
def prev_backslash?
@chars[@char_idx - 1] == '\\' && @chars[@char_idx - 2] != '\\'
end
def current_should_be(another_char)
raise InvalidTokenError unless current_char == another_char
end
def check_for_new_line
@current_line += 1 if current_char == "\n"
end
def raise_if_file_ended
@char_idx >= @chars.size && raise(InvalidTokenError)
end
def skip
raise_if_file_ended
skip_whitespaces
if (c = current_char) == '"'
parse_string
elsif c == '['
skip_array
elsif c == '{'
skip_object
else
parse_value
end
end
def skip_array
loop do
raise_if_file_ended
break if skip_beginning(']')
skip
end
end
def skip_object
loop do
raise_if_file_ended
break if skip_beginning('}')
parse_string
next_and_skip_whitespaces
current_should_be ':'
next_and_skip_whitespaces
skip
end
end
def skip_beginning(closing_char)
check_for_new_line
next_and_skip_whitespaces
return true if current_char == closing_char
next_and_skip_whitespaces if current_char == ','
end
end
end
......@@ -14,7 +14,7 @@ class OutputTransformer
'stream' => %w[text]
}.freeze
def initialize(hide_images: false)
def initialize(hide_images = false)
@hide_images = hide_images
end
......
# frozen_string_literal: true
module IpynbDiff
require 'oj'
# Creates a symbol map for a ipynb file (JSON format)
class SymbolMap
class << self
def parser
@parser ||= SymbolMap.new.tap { |p| Oj::Parser.saj.handler = p }
end
def parse(notebook, *args)
parser.parse(notebook)
end
end
attr_accessor :symbols
def hash_start(key, line, column)
add_symbol(key_or_index(key), line)
end
def hash_end(key, line, column)
@current_path.pop
end
def array_start(key, line, column)
@current_array_index << 0
add_symbol(key, line)
end
def array_end(key, line, column)
@current_path.pop
@current_array_index.pop
end
def add_value(value, key, line, column)
add_symbol(key_or_index(key), line)
@current_path.pop
end
def parse(notebook)
reset
Oj::Parser.saj.parse(notebook)
symbols
end
private
def add_symbol(symbol, line)
@symbols[@current_path.append(symbol).join('.')] = line if symbol
end
def key_or_index(key)
if key.nil? || key.empty? # value in an array
if @current_path.empty?
@current_path = ['']
return nil
end
symbol = @current_array_index.last
@current_array_index[-1] += 1
symbol
else
key
end
end
def reset
@current_path = []
@current_path_line_starts = []
@symbols = {}
@current_array_index = []
end
end
end
# frozen_string_literal: true
module IpynbDiff
require 'oj'
class InvalidNotebookError < StandardError
end
......@@ -10,26 +12,25 @@ class Transformer
require 'yaml'
require 'output_transformer'
require 'symbolized_markdown_helper'
require 'ipynb_symbol_map'
require 'symbol_map'
require 'transformed_notebook'
include SymbolizedMarkdownHelper
@include_frontmatter = true
@objects_to_ignore = ['application/javascript', 'application/vnd.holoviews_load.v0+json']
def initialize(include_frontmatter: true, hide_images: false)
@include_frontmatter = include_frontmatter
@hide_images = hide_images
@out_transformer = OutputTransformer.new(hide_images: hide_images)
@out_transformer = OutputTransformer.new(hide_images)
end
def validate_notebook(notebook)
notebook_json = JSON.parse(notebook)
notebook_json = Oj::Parser.usual.parse(notebook)
return notebook_json if notebook_json.key?('cells')
raise InvalidNotebookError
rescue JSON::ParserError
rescue EncodingError
raise InvalidNotebookError
end
......@@ -38,7 +39,7 @@ def transform(notebook)
notebook_json = validate_notebook(notebook)
transformed = transform_document(notebook_json)
symbol_map = IpynbSymbolMap.parse(notebook)
symbol_map = SymbolMap.parse(notebook)
TransformedNotebook.new(transformed, symbol_map)
end
......
# frozen_string_literal: true
require 'rspec'
require 'json'
require 'rspec-parameterized'
require 'ipynb_symbol_map'
describe IpynbDiff::IpynbSymbolMap do
def res(*cases)
cases&.to_h || []
end
describe '#parse_string' do
using RSpec::Parameterized::TableSyntax
let(:mapper) { IpynbDiff::IpynbSymbolMap.new(input) }
where(:input, :result) do
# Empty string
'""' | ''
# Some string with quotes
'"he\nll\"o"' | 'he\nll\"o'
end
with_them do
it { expect(mapper.parse_string(return_value: true)).to eq(result) }
it { expect(mapper.parse_string).to be_nil }
it { expect(mapper.results).to be_empty }
end
it 'raises if invalid string' do
mapper = IpynbDiff::IpynbSymbolMap.new('"')
expect { mapper.parse_string }.to raise_error(IpynbDiff::InvalidTokenError)
end
end
describe '#parse_object' do
using RSpec::Parameterized::TableSyntax
let(:mapper) { IpynbDiff::IpynbSymbolMap.new(notebook, objects_to_ignore) }
before do
mapper.parse_object('')
end
where(:notebook, :objects_to_ignore, :result) do
# Empty object
'{ }' | [] | res
# Object with string
'{ "hello" : "world" }' | [] | res(['.hello', 0])
# Object with boolean
'{ "hello" : true }' | [] | res(['.hello', 0])
# Object with integer
'{ "hello" : 1 }' | [] | res(['.hello', 0])
# Object with 2 properties in the same line
'{ "hello" : "world" , "my" : "bad" }' | [] | res(['.hello', 0], ['.my', 0])
# Object with 2 properties in the different lines line
"{ \"hello\" : \"world\" , \n \n \"my\" : \"bad\" }" | [] | res(['.hello', 0], ['.my', 2])
# Object with 2 properties, but one is ignored
"{ \"hello\" : \"world\" , \n \n \"my\" : \"bad\" }" | ['hello'] | res(['.my', 2])
end
with_them do
it { expect(mapper.results).to include(result) }
end
end
describe '#parse_array' do
using RSpec::Parameterized::TableSyntax
where(:notebook, :result) do
# Empty Array
'[]' | res
# Array with string value
'["a"]' | res(['.0', 0])
# Array with boolean
'[ true ]' | res(['.0', 0])
# Array with integer
'[ 1 ]' | res(['.0', 0])
# Two values on the same line
'["a", "b"]' | res(['.0', 0], ['.1', 0])
# With line breaks'
"[\n \"a\" \n , \n \"b\" ]" | res(['.0', 1], ['.1', 3])
end
let(:mapper) { IpynbDiff::IpynbSymbolMap.new(notebook) }
before do
mapper.parse_array('')
end
with_them do
it { expect(mapper.results).to match_array(result) }
end
end
describe '#skip_object' do
subject { IpynbDiff::IpynbSymbolMap.parse(JSON.pretty_generate(source)) }
end
describe '#parse' do
let(:objects_to_ignore) { [] }
subject { IpynbDiff::IpynbSymbolMap.parse(JSON.pretty_generate(source), objects_to_ignore) }
context 'Empty object' do
let(:source) { {} }
it { is_expected.to be_empty }
end
context 'Object with inner object and number' do
let(:source) { { obj1: { obj2: 1 } } }
it { is_expected.to match_array(res(['.obj1', 1], ['.obj1.obj2', 2])) }
end
context 'Object with inner object and number, string and array with object' do
let(:source) { { obj1: { obj2: [123, 2, true], obj3: "hel\nlo", obj4: true, obj5: 123, obj6: 'a' } } }
it do
is_expected.to match_array(
res(['.obj1', 1],
['.obj1.obj2', 2],
['.obj1.obj2.0', 3],
['.obj1.obj2.1', 4],
['.obj1.obj2.2', 5],
['.obj1.obj3', 7],
['.obj1.obj4', 8],
['.obj1.obj5', 9],
['.obj1.obj6', 10])
)
end
end
context 'When index is exceeded because of failure' do
it 'raises an exception' do
source = '{"\\a": "a\""}'
mapper = IpynbDiff::IpynbSymbolMap.new(source)
expect(mapper).to receive(:prev_backslash?).at_least(1).time.and_return(false)
expect { mapper.parse('') }.to raise_error(IpynbDiff::InvalidTokenError)
end
end
context 'Object with inner object and number, string and array with object' do
let(:source) { { obj1: { obj2: [123, 2, true], obj3: "hel\nlo", obj4: true, obj5: 123, obj6: { obj7: 'a' } } } }
let(:objects_to_ignore) { %w(obj2 obj6) }
it do
is_expected.to match_array(
res(['.obj1', 1],
['.obj1.obj3', 7],
['.obj1.obj4', 8],
['.obj1.obj5', 9],
)
)
end
end
end
end
# frozen_string_literal: true
require 'rspec'
require 'json'
require 'rspec-parameterized'
require 'symbol_map'
describe IpynbDiff::SymbolMap do
def res(*cases)
cases&.to_h || []
end
describe '#parse' do
subject { IpynbDiff::SymbolMap.parse(JSON.pretty_generate(source)) }
context 'Empty object' do
let(:source) { {} }
it { is_expected.to be_empty }
end
context 'Object with inner object and number' do
let(:source) { { obj1: { obj2: 1 } } }
it { is_expected.to match_array(res( ['.obj1', 2], ['.obj1.obj2', 3])) }
end
context 'Object with inner object and number, string and array with object' do
let(:source) { { obj1: { obj2: [123, 2, true], obj3: "hel\nlo", obj4: true, obj5: 123, obj6: 'a' } } }
it do
is_expected.to match_array(
res(['.obj1', 2],
['.obj1.obj2', 3],
['.obj1.obj2.0', 4],
['.obj1.obj2.1', 5],
['.obj1.obj2.2', 6],
['.obj1.obj3', 8],
['.obj1.obj4', 9],
['.obj1.obj5', 10],
['.obj1.obj6', 11])
)
end
end
end
end
......@@ -57,8 +57,7 @@
"tags": [
"senoid"
]
},
"outputs": [
}, "outputs": [
{
"data": {
"text/plain": [
......
3
36
37
38
39
40
12
16
25
......@@ -5,10 +5,10 @@
require 'json'
require 'rspec-parameterized'
BASE_PATH = File.join(File.expand_path(File.dirname(__FILE__)), 'testdata')
TRANSFORMER_BASE_PATH = File.join(File.expand_path(File.dirname(__FILE__)), 'testdata')
def read_file(*paths)
File.read(File.join(BASE_PATH, *paths))
File.read(File.join(TRANSFORMER_BASE_PATH, *paths))
end
def default_config
......@@ -68,12 +68,27 @@ def read_notebook(input_path)
expect(transformed.as_text).to eq expected_md
end
it 'generates the expected symbol map' do
expect(transformed.blocks.map { |b| b[:source_symbol] }.join("\n")).to eq expected_symbols
it 'marks the lines correctly' do
blocks = transformed.blocks.map { |b| b[:source_symbol] }.join("\n")
result = expected_symbols
expect(blocks).to eq result
end
end
end
it 'generates the correct transformed to source line map' do
input = read_file('text_png_output', 'input.ipynb' )
expected_line_numbers = read_file('text_png_output', 'expected_line_numbers.txt' )
transformed = IpynbDiff::Transformer.new(**{ include_frontmatter: false }).transform(input)
line_numbers = transformed.blocks.map { |b| b[:source_line] }.join("\n")
expect(line_numbers).to eq(expected_line_numbers)
end
context 'When the notebook is invalid' do
[
['because the json is invalid', 'a'],
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment