diff --git a/benchmark/parse_doctype.yaml b/benchmark/parse_doctype.yaml
new file mode 100644
index 00000000..41a5cc98
--- /dev/null
+++ b/benchmark/parse_doctype.yaml
@@ -0,0 +1,77 @@
+loop_count: 100
+contexts:
+ - gems:
+ rexml: 3.2.6
+ require: false
+ prelude: require 'rexml'
+ - name: master
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+ require 'rexml'
+ - name: 3.2.6(YJIT)
+ gems:
+ rexml: 3.2.6
+ require: false
+ prelude: |
+ require 'rexml'
+ RubyVM::YJIT.enable
+ - name: master(YJIT)
+ prelude: |
+ $LOAD_PATH.unshift(File.expand_path("lib"))
+ require 'rexml'
+ RubyVM::YJIT.enable
+
+prelude: |
+ require 'rexml/document'
+ require 'rexml/parsers/sax2parser'
+ require 'rexml/parsers/streamparser'
+ require 'rexml/streamlistener'
+
+ # Single entity reference: -> &foo;
+ single_entity_xml = <<~XML
+
+ ]>
+ &word;
+ XML
+
+ # Chained entity references: a -> b -> c -> ... -> value (n_depth levels deep)
+ n_depth = 100
+ chained_entities_xml = <<~XML
+ " : "" }.join("\n")}
+ ]>
+ &e1;
+ XML
+
+ # Many distinct entities referenced in document content
+ n_entities = 100
+ many_entities_xml = <<~XML
+ " }.join("\n")}
+ ]>
+ #{(1..n_entities).map {|i| "&ent#{i};" }.join}
+ XML
+
+ # Same entity referenced repeatedly in attribute values
+ n_items = 100
+ repeated_entity_xml = <<~XML
+
+ ]>
+ #{'- &word;
' * n_items}
+ XML
+
+ class Listener
+ include REXML::StreamListener
+ end
+
+benchmark:
+ 'single_entity(dom)' : REXML::Document.new(single_entity_xml).root.text
+ 'single_entity(stream)' : REXML::Parsers::StreamParser.new(single_entity_xml, Listener.new).parse
+ 'chained_entities(dom)' : REXML::Document.new(chained_entities_xml).root.text
+ 'chained_entities(stream)': REXML::Parsers::StreamParser.new(chained_entities_xml, Listener.new).parse
+ 'many_entities(dom)' : REXML::Document.new(many_entities_xml).root.text
+ 'many_entities(stream)' : REXML::Parsers::StreamParser.new(many_entities_xml, Listener.new).parse
+ 'repeated_entity(dom)' : REXML::Document.new(repeated_entity_xml).root.text
+ 'repeated_entity(stream)' : REXML::Parsers::StreamParser.new(repeated_entity_xml, Listener.new).parse
diff --git a/lib/rexml/doctype.rb b/lib/rexml/doctype.rb
index a9cf9f7e..d4f0bbe4 100644
--- a/lib/rexml/doctype.rb
+++ b/lib/rexml/doctype.rb
@@ -174,8 +174,8 @@ def context
@parent&.context
end
- def entity( name )
- @entities[name]&.unnormalized
+ def entity( name, expanding: nil )
+ @entities[name]&.unnormalized(expanding: expanding)
end
def add child
diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb
index 1ba5a7bb..f55088fc 100644
--- a/lib/rexml/entity.rb
+++ b/lib/rexml/entity.rb
@@ -1,5 +1,7 @@
# frozen_string_literal: false
+require 'set'
require_relative 'child'
+require_relative 'parseexception'
require_relative 'source'
require_relative 'xmltokens'
@@ -70,13 +72,20 @@ def Entity::matches? string
# Evaluates to the unnormalized value of this entity; that is, replacing
# &ent; entities.
- def unnormalized
+ def unnormalized(expanding: nil)
document&.record_entity_expansion
return nil if @value.nil?
- @unnormalized = Text::unnormalize(@value, parent,
- entity_expansion_text_limit: document&.entity_expansion_text_limit)
+ expanding ||= Set.new
+ raise REXML::ParseException.new("Detected an entity reference loop: #{@name}") if expanding.include?(@name)
+
+ expanding.add(@name)
+ result = Text::unnormalize(@value, parent,
+ entity_expansion_text_limit: document&.entity_expansion_text_limit,
+ expanding: expanding)
+ expanding.delete(@name)
+ result
end
#once :unnormalized
diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb
index 0b48ec14..af8887e2 100644
--- a/lib/rexml/parsers/baseparser.rb
+++ b/lib/rexml/parsers/baseparser.rb
@@ -535,14 +535,20 @@ def pull_event
end
private :pull_event
- def entity( reference, entities )
+ def entity( reference, entities, expanding: nil )
return unless entities
value = entities[ reference ]
return if value.nil?
+ expanding ||= Set.new
+ raise REXML::ParseException.new("Detected an entity reference loop: #{reference}", @source) if expanding.include?(reference)
+
record_entity_expansion
- unnormalize( value, entities )
+ expanding.add(reference)
+ result = unnormalize( value, entities, nil, expanding: expanding )
+ expanding.delete(reference)
+ result
end
# Escapes all possible entities
@@ -562,7 +568,7 @@ def normalize( input, entities=nil, entity_filter=nil )
end
# Unescapes all possible entities
- def unnormalize( string, entities=nil, filter=nil )
+ def unnormalize( string, entities=nil, filter=nil, expanding: nil )
if string.include?("\r")
rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" )
else
@@ -588,7 +594,7 @@ def unnormalize( string, entities=nil, filter=nil )
if matches.size > 0
matches.tally.each do |entity_reference, n|
entity_expansion_count_before = @entity_expansion_count
- entity_value = entity( entity_reference, entities )
+ entity_value = entity( entity_reference, entities, expanding: expanding )
if entity_value
if n > 1
entity_expansion_count_delta =
diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb
index 8d5281cd..2319cef8 100644
--- a/lib/rexml/text.rb
+++ b/lib/rexml/text.rb
@@ -384,11 +384,11 @@ def Text::normalize( input, doctype=nil, entity_filter=nil )
end
# Unescapes all possible entities
- def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil )
+ def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil, expanding: nil )
entity_expansion_text_limit ||= Security.entity_expansion_text_limit
sum = 0
string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) {
- s = Text.expand($&, doctype, filter)
+ s = Text.expand($&, doctype, filter, expanding: expanding)
if sum + s.bytesize > entity_expansion_text_limit
raise "entity expansion has grown too large"
else
@@ -398,7 +398,7 @@ def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expa
}
end
- def Text.expand(ref, doctype, filter)
+ def Text.expand(ref, doctype, filter, expanding: nil)
if ref[1] == ?#
if ref[2] == ?x
[ref[3...-1].to_i(16)].pack('U*')
@@ -410,7 +410,7 @@ def Text.expand(ref, doctype, filter)
elsif filter and filter.include?( ref[1...-1] )
ref
elsif doctype
- doctype.entity( ref[1...-1] ) or ref
+ doctype.entity( ref[1...-1], expanding: expanding ) or ref
else
entity_value = DocType::DEFAULT_ENTITIES[ ref[1...-1] ]
entity_value ? entity_value.value : ref
diff --git a/test/test_entity.rb b/test/test_entity.rb
index 03f268dd..a96dc504 100644
--- a/test/test_entity.rb
+++ b/test/test_entity.rb
@@ -1,7 +1,10 @@
# frozen_string_literal: false
+require 'rexml/document'
require 'rexml/entity'
require 'rexml/source'
+require 'rexml/parsers/streamparser'
+require 'rexml/streamlistener'
module REXMLTests
class EntityTester < Test::Unit::TestCase
@@ -241,6 +244,114 @@ def test_single_pass_unnormalization # ticket 123
assert_equal '&&', REXML::Text::unnormalize('&&')
end
+ def test_entity_direct_circular_reference
+ source = <<~XML
+
+ ]>
+ &x;
+ XML
+ listener = Class.new { include REXML::StreamListener }.new
+ parser = REXML::Parsers::StreamParser.new(source, listener)
+ exception = assert_raise(REXML::ParseException) do
+ parser.parse
+ end
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
+Detected an entity reference loop: x
+Line: 4
+Position: 57
+Last 80 unconsumed characters:
+\x20
+ DETAIL
+ end
+
+ def test_entity_indirect_circular_reference
+ source = <<~XML
+
+
+ ]>
+ &a;
+ XML
+ listener = Class.new { include REXML::StreamListener }.new
+ parser = REXML::Parsers::StreamParser.new(source, listener)
+ exception = assert_raise(REXML::ParseException) do
+ parser.parse
+ end
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
+Detected an entity reference loop: a
+Line: 5
+Position: 77
+Last 80 unconsumed characters:
+\x20
+ DETAIL
+ end
+
+ def test_entity_circular_reference_with_long_value
+ source = <<~XML
+
+ ]>
+ &x;
+ XML
+ listener = Class.new { include REXML::StreamListener }.new
+ parser = REXML::Parsers::StreamParser.new(source, listener)
+ exception = assert_raise(REXML::ParseException) do
+ parser.parse
+ end
+ assert_equal(<<-DETAIL.chomp, exception.to_s)
+Detected an entity reference loop: x
+Line: 4
+Position: 557
+Last 80 unconsumed characters:
+\x20
+ DETAIL
+ end
+
+ def test_entity_direct_circular_reference_via_dom
+ source = <<~XML
+
+ ]>
+ &x;
+ XML
+ doc = REXML::Document.new(source)
+ exception = assert_raise(REXML::ParseException) do
+ doc.root.text
+ end
+ assert_equal("Detected an entity reference loop: x", exception.to_s)
+ end
+
+ def test_entity_indirect_circular_reference_via_dom
+ source = <<~XML
+
+
+ ]>
+ &a;
+ XML
+ doc = REXML::Document.new(source)
+ exception = assert_raise(REXML::ParseException) do
+ doc.root.text
+ end
+ assert_equal("Detected an entity reference loop: a", exception.to_s)
+ end
+
+ def test_entity_unnormalized_direct_circular_reference
+ source = <<~XML
+
+ ]>
+
+ XML
+ doc = REXML::Document.new(source)
+ entity = doc.doctype.entities["x"]
+ exception = assert_raise(REXML::ParseException) do
+ entity.unnormalized
+ end
+ assert_equal("Detected an entity reference loop: x", exception.to_s)
+ end
+
def test_entity_filter
document = REXML::Document.new(<<-XML)