Commit 3f648d35 authored by KitaitiMakoto's avatar KitaitiMakoto

Merge branch 'rexml'

parents 0d306196 0362ac42
Pipeline #34714823 failed with stages
in 40 seconds
......@@ -4,6 +4,8 @@
* [REFACTORING]Add {EPUB::Parser::NokogiriAttributeWithPrefix} and use `Nokogiri::XML::Node#attribute_with_prefix` instead of `EPUB::Parser::Utils#extract_attribute`
* Set default value for detect_encoding argument for {EPUB::Publication::Package::Manifest::Item#read} to false
* Make XML library switchable between REXML and Nokogiri
* Make REXML the default XML backend
== 0.3.7
......
......@@ -159,8 +159,6 @@ Then documentation will be available in `doc` directory.
== REQUIREMENTS
* Ruby 2.3.0 or later
* `patch` command to install Nokogiri
* C compiler to compile Nokogiri
== SIMILAR EFFORTS
......@@ -180,6 +178,8 @@ If you find other gems, please tell me or request a pull request.
* [REFACTORING]Add {EPUB::Parser::NokogiriAttributeWithPrefix} and use `Nokogiri::XML::Node#attribute_with_prefix` instead of `EPUB::Parser::Utils#extract_attribute`
* Set default value for detect_encoding argument for {EPUB::Publication::Package::Manifest::Item#read} to false
* Make XML library switchable between REXML and Nokogiri
* Make REXML the default XML backend
=== 0.3.7
......@@ -193,25 +193,20 @@ If you find other gems, please tell me or request a pull request.
* [BUG FIX]Ignore fragment when find item by relative IRI
* Disable https://github.com/ko1/pretty_backtrace[PrettyBacktrace] by default
=== 0.3.5
* [BUG FIX]Fix a bug that {EPUB::ContentDocument::Navigation::Item#item} is `nil` when `href` includes double dots(`..`)(Thanks https://gitlab.com/aelkiss[aelkiss]!)
See {file:CHANGELOG.adoc} for older changelogs and details.
== TODOS
* Consider implementing an IRI feature instead of using Addressable
* EPUB 3.0.1
* EPUB 3.1
* EPUB 3.2
* Help features for `epub-open` tool
* Vocabulary Association Mechanisms
* Implementing navigation document and so on
* Media Overlays
* Content Document
* Digital Signature
* Using SAX on parsing
* Abstraction of XML parser(making it possible to use REXML, standard bundled XML library of Ruby)
* in `Searcher`
* Handle with encodings other than UTF-8
== DONE
......@@ -225,6 +220,8 @@ See {file:CHANGELOG.adoc} for older changelogs and details.
* Archive library abstraction
* Extracting and organizing common behavior from some classes to modules
* Multiple rootfiles
* Abstraction of XML parser(making it possible to use REXML, standard bundled XML library of Ruby)
* excluding in `Searcher`
== LICENSE
......
......@@ -39,11 +39,21 @@ namespace :test do
File.rename "#{input_dir}.epub.tmp", "#{input_dir}.epub"
end
Rake::TestTask.new do |task|
task.test_files = FileList['test/**/test_*.rb']
task.warning = true
task.options = '--no-show-detail-immediately --verbose'
# TODO: Test with both REXML and Nokogiri in testing framework
# For each XML backend, define:
#   * set_xml_backend_<name>: exports the backend name through the
#     EPUB_PARSER_XML_BACKEND environment variable, and
#   * test_with_<name>: a Rake::TestTask that depends on the task above.
# The backend is deliberately NOT assigned to EPUB::Parser::XMLDocument here:
# the TestTask configuration block runs while the Rakefile is being loaded
# (for every backend, in the rake process), so such an assignment would not
# select the backend for an individual test run and would fail outright when
# EPUB::Parser::XMLDocument is not loaded. The environment variable is the
# channel that carries the choice.
%w[REXML Nokogiri].each do |xml_backend|
  suffix = xml_backend.downcase

  task "set_xml_backend_#{suffix}" do
    ENV["EPUB_PARSER_XML_BACKEND"] = xml_backend
  end

  Rake::TestTask.new "test_with_#{suffix}" do |task|
    task.test_files = FileList['test/**/test_*.rb']
    task.warning = true
    task.options = '--no-show-detail-immediately --verbose'
  end

  # Make sure the environment variable is set before the tests start.
  task "test_with_#{suffix}" => "set_xml_backend_#{suffix}"
end
task :test => [:test_with_rexml, :test_with_nokogiri]
end
task :doc => 'doc:default'
......
......@@ -117,6 +117,14 @@ ret == book # => true; this API is not good I feel... Welcome suggestion!
# do something with your book
----
==== Switching XML Library
EPUB Parser uses https://ruby-doc.org/stdlib-2.5.3/libdoc/rexml/rdoc/index.html[REXML], a standard-bundled library, by default. You can use https://www.nokogiri.org/[Nokogiri], Ruby bindings for http://xmlsoft.org/[Libxml2], http://xmlsoft.org/XSLT/[Libxslt] and more, if you have already installed the Nokogiri gem via RubyGems or Bundler.
----
EPUB::Parser::XMLDocument.backend = :Nokogiri
----
==== Switching ZIP library
EPUB Parser uses https://github.com/javanthropus/archive-zip[Archive::Zip], a pure Ruby ZIP library, by default. You can use https://bitbucket.org/winebarrel/zip-ruby/wiki/Home[Zip/Ruby], Ruby bindings for https://libzip.org/[libzip], if you have already installed the Zip/Ruby gem via RubyGems or Bundler.
......@@ -197,8 +205,6 @@ Then documentation will be available in `doc` directory.
== Requirements
* Ruby 2.2.0 or later
* `patch` command to install Nokogiri
* C compiler to compile Zip/Ruby and Nokogiri
== History
......
......@@ -39,9 +39,9 @@ Gem::Specification.new do |s|
s.add_development_dependency 'pretty_backtrace'
s.add_development_dependency 'epub-maker'
s.add_development_dependency 'asciidoctor'
s.add_development_dependency 'nokogiri', '>= 1.6.0', '< 1.9'
s.add_runtime_dependency 'archive-zip'
s.add_runtime_dependency 'nokogiri', '>= 1.6.0', '< 1.9'
s.add_runtime_dependency 'addressable', '>= 2.3.5'
s.add_runtime_dependency 'rchardet', '>= 1.6.1'
s.add_runtime_dependency 'epub-cfi'
......
......@@ -18,7 +18,7 @@ module EPUB
# @return [String] Returns the value of title element.
# If none, returns empty string
def title
title_elem = nokogiri.search('title').first
title_elem = rexml.get_elements('.//title').first
if title_elem
title_elem.text
else
......@@ -29,12 +29,12 @@ module EPUB
# Lazily loads REXML, parses the raw document once and memoizes the result
# in @rexml so that later calls return the same object.
#
# @return [REXML::Document] content as REXML::Document object
def rexml
  require 'rexml/document'
  @rexml ||= begin
    source = raw_document
    REXML::Document.new(source)
  end
end
# Lazily loads Nokogiri, parses the raw document once and memoizes the result
# in @nokogiri so that later calls return the same object.
#
# @return [Nokogiri::XML::Document] content as Nokogiri::XML::Document object
def nokogiri
  require 'nokogiri'
  @nokogiri ||= begin
    source = raw_document
    Nokogiri.XML(source)
  end
end
end
......
require 'epub'
require 'epub/constants'
require 'epub/book'
require 'nokogiri'
module EPUB
class Parser
......@@ -96,7 +95,7 @@ module EPUB
end
require 'epub/parser/version'
require 'epub/parser/utils'
require 'epub/parser/xml_document'
require 'epub/parser/ocf'
require 'epub/parser/publication'
require 'epub/parser/content_document'
require 'epub/content_document'
require 'epub/constants'
require 'epub/parser/utils'
require 'nokogiri'
require 'epub/parser/xml_document'
module EPUB
class Parser
class ContentDocument
using Parser::NokogiriAttributeWithPrefix
using XMLDocument::Refinements
# @param [EPUB::Publication::Package::Manifest::Item] item
def initialize(item)
......@@ -28,7 +27,7 @@ module EPUB
end
return content_document if content_document.nil?
content_document.item = @item
document = Nokogiri.XML(@item.read)
document = XMLDocument.new(@item.read)
# parse_content_document(document)
if @item.nav?
content_document.navigations = parse_navigations(document)
......@@ -36,13 +35,13 @@ module EPUB
content_document
end
# @param [Nokogiri::HTML::Document] document HTML document or element including nav
# @param [XMLDocument, REXML::Document, Nokogiri::HTML::Document] document HTML document or element including nav
# @return [Array<EPUB::ContentDocument::Navigation::Nav>] navs array of Nav object
def parse_navigations(document)
document.search('/xhtml:html/xhtml:body//xhtml:nav', EPUB::NAMESPACES).collect {|elem| parse_navigation elem}
document.each_element_by_xpath('/xhtml:html/xhtml:body//xhtml:nav', EPUB::NAMESPACES).collect {|elem| parse_navigation elem}
end
# @param [Nokogiri::XML::Element] element nav element
# @param [REXML::Element, Nokogiri::XML::Element] element nav element
# @return [EPUB::ContentDocument::Navigation::Nav] nav Nav object
def parse_navigation(element)
nav = EPUB::ContentDocument::Navigation::Navigation.new
......@@ -50,23 +49,23 @@ module EPUB
hidden = element.attribute_with_prefix('hidden')
nav.hidden = hidden.nil? ? nil : true
nav.type = element.attribute_with_prefix('type', 'epub')
element.xpath('./xhtml:ol/xhtml:li', EPUB::NAMESPACES).map do |elem|
element.each_element_by_xpath('./xhtml:ol/xhtml:li', EPUB::NAMESPACES).map do |elem|
nav.items << parse_navigation_item(elem)
end
nav
end
# @param [Nokogiri::XML::Element] element li element
# @param [REXML::Element, Nokogiri::XML::Element] element li element
def parse_navigation_item(element)
item = EPUB::ContentDocument::Navigation::Item.new
a_or_span = element.xpath('./xhtml:a[1]|xhtml:span[1]', EPUB::NAMESPACES).first
a_or_span = element.each_element_by_xpath('./xhtml:a[1]|xhtml:span[1]', EPUB::NAMESPACES).first
return a_or_span if a_or_span.nil?
item.text = a_or_span.text
item.text = a_or_span.content
if a_or_span.name == 'a'
if item.text.empty?
embedded_content = a_or_span.xpath('./xhtml:audio[1]|xhtml:canvas[1]|xhtml:embed[1]|xhtml:iframe[1]|xhtml:img[1]|xhtml:math[1]|xhtml:object[1]|xhtml:svg[1]|xhtml:video[1]', EPUB::NAMESPACES).first
embedded_content = a_or_span.each_element_by_xpath('./xhtml:audio[1]|xhtml:canvas[1]|xhtml:embed[1]|xhtml:iframe[1]|xhtml:img[1]|xhtml:math[1]|xhtml:object[1]|xhtml:svg[1]|xhtml:video[1]', EPUB::NAMESPACES).first
unless embedded_content.nil?
case embedded_content.name
when 'audio', 'canvas', 'embed', 'iframe'
......@@ -84,22 +83,27 @@ module EPUB
item.href = a_or_span.attribute_with_prefix('href')
item.item = @item.find_item_by_relative_iri(item.href)
end
item.items = element.xpath('./xhtml:ol[1]/xhtml:li', EPUB::NAMESPACES).map {|li| parse_navigation_item(li)}
item.items = element.each_element_by_xpath('./xhtml:ol[1]/xhtml:li', EPUB::NAMESPACES).map {|li| parse_navigation_item(li)}
item
end
private
# Finds the heading text of a nav element.
#
# The first h1-h6 or hgroup child of +element+ is used. For an hgroup, the
# text of its highest-ranked descendant heading (h1 first, then h2 ... h6)
# is returned.
#
# @param [REXML::Element, Nokogiri::XML::Element] element nav element
# @return [String, nil] heading heading text, or nil when no heading exists
def find_heading(element)
  heading = element.each_element_by_xpath('./xhtml:h1|xhtml:h2|xhtml:h3|xhtml:h4|xhtml:h5|xhtml:h6|xhtml:hgroup', EPUB::NAMESPACES).first
  return nil if heading.nil?
  return heading.content unless heading.name == 'hgroup'

  # The result of an XPath query is truthy even when it matched nothing, so
  # the ranks cannot be chained with `||`; probe each rank for an actual node.
  %w[h1 h2 h3 h4 h5 h6].each do |rank|
    ranked = heading.each_element_by_xpath(".//xhtml:#{rank}", EPUB::NAMESPACES).first
    return ranked.content if ranked
  end
  nil
end
end
end
......
module EPUB
class Parser
module Metadata
using NokogiriAttributeWithPrefix
using XMLDocument::Refinements
def parse_metadata(elem, unique_identifier_id, default_namespace)
metadata = EPUB::Publication::Package::Metadata.new
id_map = {}
default_namespace_uri = EPUB::NAMESPACES[default_namespace]
elem.element_children.each do |child|
namespace_uri = child.namespace && child.namespace.href
elem.each_element do |child|
elem_name = child.name
model =
case namespace_uri
case child.namespace_uri
when EPUB::NAMESPACES['dc']
case elem_name
when 'identifier'
......
......@@ -2,12 +2,12 @@ require 'epub/constants'
require 'epub/ocf'
require 'epub/ocf/physical_container'
require 'epub/parser/metadata'
require 'nokogiri'
require "epub/parser/xml_document"
module EPUB
class Parser
class OCF
using NokogiriAttributeWithPrefix
using XMLDocument::Refinements
include Metadata
DIRECTORY = 'META-INF'
......@@ -37,8 +37,8 @@ module EPUB
def parse_container(xml)
container = EPUB::OCF::Container.new
doc = Nokogiri.XML(xml)
doc.xpath('/ocf:container/ocf:rootfiles/ocf:rootfile', EPUB::NAMESPACES).each do |elem|
doc = XMLDocument.new(xml)
doc.each_element_by_xpath "/ocf:container/ocf:rootfiles/ocf:rootfile", EPUB::NAMESPACES do |elem|
rootfile = EPUB::OCF::Container::Rootfile.new
rootfile.full_path = Addressable::URI.parse(elem.attribute_with_prefix('full-path'))
rootfile.media_type = elem.attribute_with_prefix('media-type')
......@@ -59,7 +59,7 @@ module EPUB
end
def parse_metadata(content)
doc = Nokogiri.XML(content)
doc = XMLDocument.new(content)
unless multiple_rendition_metadata?(doc)
warn "Not implemented: #{self.class}##{__method__}" if $VERBOSE
metadata = EPUB::OCF::UnknownFormatMetadata.new
......@@ -82,7 +82,7 @@ module EPUB
def multiple_rendition_metadata?(doc)
doc.root &&
doc.root.name == 'metadata' &&
doc.namespaces['xmlns'] == EPUB::NAMESPACES['metadata']
doc.root.namespaces['xmlns'] == EPUB::NAMESPACES['metadata']
end
end
end
......
require 'strscan'
require 'nokogiri'
require 'epub/publication'
require 'epub/constants'
require 'epub/parser/metadata'
......@@ -7,7 +6,7 @@ require 'epub/parser/metadata'
module EPUB
class Parser
class Publication
using NokogiriAttributeWithPrefix
using XMLDocument::Refinements
include Metadata
class << self
......@@ -19,7 +18,7 @@ module EPUB
end
def initialize(opf)
@doc = Nokogiri.XML(opf)
@doc = XMLDocument.new(opf)
end
def parse
......@@ -45,16 +44,16 @@ module EPUB
end
def parse_metadata(doc)
super(doc.xpath('/opf:package/opf:metadata', EPUB::NAMESPACES).first, doc.root['unique-identifier'], 'opf')
super(doc.each_element_by_xpath('/opf:package/opf:metadata', EPUB::NAMESPACES).first, doc.root['unique-identifier'], 'opf')
end
def parse_manifest(doc)
manifest = EPUB::Publication::Package::Manifest.new
elem = doc.xpath('/opf:package/opf:manifest', EPUB::NAMESPACES).first
elem = doc.each_element_by_xpath('/opf:package/opf:manifest', EPUB::NAMESPACES).first
manifest.id = elem.attribute_with_prefix('id')
fallback_map = {}
elem.xpath('./opf:item', EPUB::NAMESPACES).each do |e|
elem.each_element_by_xpath('./opf:item', EPUB::NAMESPACES).each do |e|
item = EPUB::Publication::Package::Manifest::Item.new
%w[id media-type media-overlay].each do |attr|
item.__send__ "#{attr.gsub(/-/, '_')}=", e.attribute_with_prefix(attr)
......@@ -75,12 +74,12 @@ module EPUB
def parse_spine(doc)
spine = EPUB::Publication::Package::Spine.new
elem = doc.xpath('/opf:package/opf:spine', EPUB::NAMESPACES).first
elem = doc.each_element_by_xpath('/opf:package/opf:spine', EPUB::NAMESPACES).first
%w[id toc page-progression-direction].each do |attr|
spine.__send__ "#{attr.gsub(/-/, '_')}=", elem.attribute_with_prefix(attr)
end
elem.xpath('./opf:itemref', EPUB::NAMESPACES).each do |e|
elem.each_element_by_xpath('./opf:itemref', EPUB::NAMESPACES).each do |e|
itemref = EPUB::Publication::Package::Spine::Itemref.new
%w[idref id].each do |attr|
itemref.__send__ "#{attr}=", e.attribute_with_prefix(attr)
......@@ -96,7 +95,7 @@ module EPUB
def parse_guide(doc)
guide = EPUB::Publication::Package::Guide.new
doc.xpath('/opf:package/opf:guide/opf:reference', EPUB::NAMESPACES).each do |ref|
doc.each_element_by_xpath '/opf:package/opf:guide/opf:reference', EPUB::NAMESPACES do |ref|
reference = EPUB::Publication::Package::Guide::Reference.new
%w[type title].each do |attr|
reference.__send__ "#{attr}=", ref.attribute_with_prefix(attr)
......@@ -110,7 +109,7 @@ module EPUB
def parse_bindings(doc, handler_map)
bindings = EPUB::Publication::Package::Bindings.new
doc.xpath('/opf:package/opf:bindings/opf:mediaType', EPUB::NAMESPACES).each do |elem|
doc.each_element_by_xpath '/opf:package/opf:bindings/opf:mediaType', EPUB::NAMESPACES do |elem|
media_type = EPUB::Publication::Package::Bindings::MediaType.new
media_type.media_type = elem.attribute_with_prefix('media-type')
media_type.handler = handler_map[elem.attribute_with_prefix('handler')]
......
require "nokogiri"
module EPUB
  class Parser
    # Refinement adding a prefix-based attribute reader to Nokogiri nodes.
    module NokogiriAttributeWithPrefix
      refine Nokogiri::XML::Node do
        # Reads an attribute whose namespace is looked up in EPUB::NAMESPACES
        # by +prefix+.
        #
        # @param name [String] local name of the attribute
        # @param prefix [String, nil] key into EPUB::NAMESPACES
        # @return [String, nil] the attribute value, or nil when the attribute
        #   lookup returns nil
        def attribute_with_prefix(name, prefix = nil)
          namespace_uri = EPUB::NAMESPACES[prefix]
          found = attribute_with_ns(name, namespace_uri)
          found&.value
        end
      end
    end
  end
end
require "rexml/document"
begin
require "nokogiri"
rescue LoadError
end
module EPUB
  class Parser
    # Factory plus refinements that give REXML and Nokogiri nodes a common
    # interface, so parser code does not depend on a particular XML library.
    class XMLDocument
      class << self
        # Name of the XML backend used by {.new}. :Nokogiri (or "Nokogiri")
        # selects Nokogiri; any other value, including nil, selects REXML.
        attr_accessor :backend

        # Parses +xml+ with the configured backend.
        #
        # @param xml XML source, handed to the backend parser unchanged
        # @return [REXML::Document, Nokogiri::XML::Document]
        def new(xml)
          # Compare by name so that a String such as "Nokogiri" (for example
          # one read from an environment variable) is honored instead of
          # silently falling back to REXML.
          if backend.to_s == "Nokogiri"
            Nokogiri.XML(xml)
          else
            REXML::Document.new(xml)
          end
        end
      end

      # Activate with `using EPUB::Parser::XMLDocument::Refinements`.
      module Refinements
        refine REXML::Node do
          # @return [Boolean] whether this node is an element
          def element?
            node_type == :element
          end

          # @return [Boolean] whether this node is a text node
          def text?
            node_type == :text
          end
        end

        refine REXML::Element do
          # Evaluates +xpath+ relative to this element.
          #
          # @param xpath [String] XPath expression
          # @param namespaces [Hash, nil] prefix => namespace URI mapping
          # @return the result of REXML::XPath.each (an Enumerator when no
          #   block is given)
          def each_element_by_xpath(xpath, namespaces = nil, &block)
            REXML::XPath.each(self, xpath, namespaces, &block)
          end

          # Reads an attribute whose namespace is looked up in
          # EPUB::NAMESPACES by +prefix+.
          #
          # @return [String, nil] attribute value or nil when absent
          def attribute_with_prefix(name, prefix = nil)
            attribute(name, EPUB::NAMESPACES[prefix])&.value
          end

          alias namespace_uri namespace

          # Concatenated, unescaped text of this element and all of its
          # descendant elements, mirroring Nokogiri's +content+. Comments and
          # processing instructions are ignored.
          #
          # @return [String]
          def content
            children.map { |node|
              case node.node_type
              when :element then node.content # recurse into nested markup
              when :text then node.value      # value is the unescaped text
              else ""
              end
            }.join
          end
        end

        refine REXML::Text do
          alias content value
        end

        # Nokogiri is optional; refine it only when it has been loaded.
        if const_defined? :Nokogiri
          refine Nokogiri::XML::Node do
            # Evaluates +xpath+ relative to this node and iterates the result.
            def each_element_by_xpath(xpath, namespaces = nil, &block)
              xpath(xpath, namespaces).each(&block)
            end

            # Reads an attribute whose namespace is looked up in
            # EPUB::NAMESPACES by +prefix+.
            #
            # @return [String, nil] attribute value or nil when absent
            def attribute_with_prefix(name, prefix = nil)
              attribute_with_ns(name, EPUB::NAMESPACES[prefix])&.value
            end

            # Iterates child elements. The +xpath+ argument is accepted only
            # for signature compatibility and is ignored.
            def each_element(xpath = nil, &block)
              element_children.each(&block)
            end

            # @return [String, nil] namespace URI, or nil when the node has no
            #   namespace
            def namespace_uri
              namespace&.href
            end
          end
        end
      end
    end
  end
end
require 'epub/publication'
require "epub/parser/xml_document"
module EPUB
module Searcher
......@@ -28,7 +29,7 @@ module EPUB
spine_step = Result::Step.new(:element, 2, {:name => 'spine', :id => spine.id})
spine.each_itemref.with_index do |itemref, index|
itemref_step = Result::Step.new(:itemref, index, {:id => itemref.id})
XHTML::ALGORITHMS[algorithm].search_text(Nokogiri.XML(itemref.item.read), word).each do |sub_result|
XHTML::ALGORITHMS[algorithm].search_text(Parser::XMLDocument.new(itemref.item.read), word).each do |sub_result|
results << Result.new([spine_step, itemref_step] + sub_result.parent_steps, sub_result.start_steps, sub_result.end_steps)
end
end
......@@ -38,7 +39,7 @@ module EPUB
# @todo: Refactoring
# @return [Array<Hash>] An array of search results. Each result is composed of:
# :element: [Nokogiri::XML::ELement] Found element
# :element: [REXML::Element, Nokogiri::XML::ELement] Found element
# :itemref: [EPUB::Publication::Package::Spine::Itemref] Itemref that element's document belongs to
# :location: [EPUB::CFI::Location] CFI that indicates the element
# :package: [EPUB::Publication::Package] Package that the element belongs to
......@@ -70,6 +71,8 @@ module EPUB
element: elem
}
end
rescue LoadError
raise "#{self.class}##{__method__} requires Nokogiri gem for now. Install Nokogiri and then try again."
end
results
......
require 'epub'
require 'epub/parser/utils'
require 'epub/parser/xml_document'
module EPUB
module Searcher
class XHTML
using Parser::NokogiriAttributeWithPrefix
using Parser::XMLDocument::Refinements
ALGORITHMS = {}
class << self
# @param element [Nokogiri::XML::Element, Nokogiri::XML::Document]
# @param element [REXML::Element, REXML::Document, Nokogiri::XML::Element, Nokogiri::XML::Document]
# @param word [String]
# @return [Array<Result>]
def search_text(element, word)
......@@ -23,7 +23,7 @@ module EPUB
end
class Restricted < self
# @param element [Nokogiri::XML::Element]
# @param element [REXML::Element, Nokogiri::XML::Element]
# @return [Array<Result>]
def search_text(word, element=nil)
results = []
......
......@@ -15,3 +15,4 @@ if ENV["PRETTY_BACKTRACE"]
end
require 'epub/parser'
# Select the XML backend under test from the environment. Fall back to REXML
# when the variable is unset (e.g. a test file run directly with `ruby`), so
# that loading this helper does not fail with NoMethodError on nil.
EPUB::Parser::XMLDocument.backend = ENV.fetch("EPUB_PARSER_XML_BACKEND", "REXML").to_sym
......@@ -9,7 +9,7 @@ class TestSearcher < Test::Unit::TestCase
super
opf_path = File.expand_path('../fixtures/book/OPS/ルートファイル.opf', __FILE__)
nav_path = File.expand_path('../fixtures/book/OPS/nav.xhtml', __FILE__)
@package = EPUB::Parser::Publication.new(open(opf_path)).parse
@package = EPUB::Parser::Publication.new(File.read(opf_path)).parse
@package.spine.each_itemref do |itemref|
stub(itemref.item).read {
itemref.idref == 'nav' ? File.read(nav_path) : '<html></html>'
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment