From 4e968a87ced5743db6febecd59832c34568008c2 Mon Sep 17 00:00:00 2001 From: NAITOH Jun Date: Sun, 10 May 2026 09:54:39 +0900 Subject: [PATCH 1/3] Fix infinite entity expansion by detecting circular references on REXML::Parsers::BaseParser ## Why? Fix an issue where a `stack level too deep (SystemStackError)` error caused by a circular reference was not detected as an appropriate error. ``` require 'rexml/parsers/streamparser' require 'rexml/streamlistener' source = <<~XML ]> &x; XML listener = Class.new { include REXML::StreamListener }.new REXML::Parsers::StreamParser.new(source, listener).parse ``` - before ``` lib/rexml/parsers/baseparser.rb:544:in 'REXML::Parsers::BaseParser#entity': stack level too deep (SystemStackError) ``` - after ``` lib/rexml/parsers/baseparser.rb:545:in 'REXML::Parsers::BaseParser#entity': Detected an entity reference loop: x (REXML::ParseException) Line: 4 Position: 57 Last 80 unconsumed characters: ``` --- lib/rexml/parsers/baseparser.rb | 14 +++++-- test/test_entity.rb | 66 +++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+), 4 deletions(-) diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 0b48ec14..af8887e2 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -535,14 +535,20 @@ def pull_event end private :pull_event - def entity( reference, entities ) + def entity( reference, entities, expanding: nil ) return unless entities value = entities[ reference ] return if value.nil? + expanding ||= Set.new + raise REXML::ParseException.new("Detected an entity reference loop: #{reference}", @source) if expanding.include?(reference) + record_entity_expansion - unnormalize( value, entities ) + expanding.add(reference) + result = unnormalize( value, entities, nil, expanding: expanding ) + expanding.delete(reference) + result end # Escapes all possible entities @@ -562,7 +568,7 @@ def normalize( input, entities=nil, entity_filter=nil ) end # Unescapes all possible entities - def unnormalize( string, entities=nil, filter=nil ) + def unnormalize( string, entities=nil, filter=nil, expanding: nil ) if string.include?("\r") rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) else @@ -588,7 +594,7 @@ def unnormalize( string, entities=nil, filter=nil ) if matches.size > 0 matches.tally.each do |entity_reference, n| entity_expansion_count_before = @entity_expansion_count - entity_value = entity( entity_reference, entities ) + entity_value = entity( entity_reference, entities, expanding: expanding ) if entity_value if n > 1 entity_expansion_count_delta = diff --git a/test/test_entity.rb b/test/test_entity.rb index 03f268dd..cd2b56d1 100644 --- a/test/test_entity.rb +++ b/test/test_entity.rb @@ -2,6 +2,8 @@ require 'rexml/entity' require 'rexml/source' +require 'rexml/parsers/streamparser' +require 'rexml/streamlistener' module REXMLTests class EntityTester < Test::Unit::TestCase @@ -241,6 +243,70 @@ def test_single_pass_unnormalization # ticket 123 assert_equal '&&', REXML::Text::unnormalize('&amp;&') end + def test_entity_direct_circular_reference + source = <<~XML + + ]> + &x; + XML + listener = Class.new { include REXML::StreamListener }.new + parser = REXML::Parsers::StreamParser.new(source, listener) + exception = assert_raise(REXML::ParseException) do + parser.parse + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Detected an entity reference loop: x +Line: 4 +Position: 57 +Last 80 unconsumed characters: +\x20 + DETAIL + end + + def test_entity_indirect_circular_reference + source = <<~XML + + + ]> + &a; + XML + listener = Class.new { include REXML::StreamListener }.new + parser = REXML::Parsers::StreamParser.new(source, listener) + exception = assert_raise(REXML::ParseException) do + parser.parse + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Detected an entity reference loop: a +Line: 5 +Position: 77 +Last 80 unconsumed characters: +\x20 + DETAIL + end + + def test_entity_circular_reference_with_long_value + source = <<~XML + + ]> + &x; + XML + listener = Class.new { include REXML::StreamListener }.new + parser = REXML::Parsers::StreamParser.new(source, listener) + exception = assert_raise(REXML::ParseException) do + parser.parse + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Detected an entity reference loop: x +Line: 4 +Position: 557 +Last 80 unconsumed characters: +\x20 + DETAIL + end + def test_entity_filter document = REXML::Document.new(<<-XML) Date: Sun, 10 May 2026 09:58:41 +0900 Subject: [PATCH 2/3] Fix infinite entity expansion by detecting circular references on REXML::Document ## Why? Fix an issue where a `stack level too deep (SystemStackError)` error caused by a circular reference was not detected as an appropriate error. ``` require 'rexml' source = <<~XML ]> &x; XML REXML::Document.new(source).root.text ``` - before ``` lib/rexml/text.rb:390:in 'String#gsub': stack level too deep (SystemStackError) ``` - after ``` lib/rexml/entity.rb:81:in 'REXML::Entity#unnormalized': Detected an entity reference loop: x (REXML::ParseException) ``` --- lib/rexml/doctype.rb | 4 ++-- lib/rexml/entity.rb | 15 ++++++++++++--- lib/rexml/text.rb | 8 ++++---- test/test_entity.rb | 45 ++++++++++++++++++++++++++++++++++++++++++++ 4 files changed, 63 insertions(+), 9 deletions(-) diff --git a/lib/rexml/doctype.rb b/lib/rexml/doctype.rb index a9cf9f7e..d4f0bbe4 100644 --- a/lib/rexml/doctype.rb +++ b/lib/rexml/doctype.rb @@ -174,8 +174,8 @@ def context @parent&.context end - def entity( name ) - @entities[name]&.unnormalized + def entity( name, expanding: nil ) + @entities[name]&.unnormalized(expanding: expanding) end def add child diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb index 1ba5a7bb..f55088fc 100644 --- a/lib/rexml/entity.rb +++ b/lib/rexml/entity.rb @@ -1,5 +1,7 @@ # frozen_string_literal: false +require 'set' require_relative 'child' +require_relative 'parseexception' require_relative 'source' require_relative 'xmltokens' @@ -70,13 +72,20 @@ def Entity::matches? string # Evaluates to the unnormalized value of this entity; that is, replacing # &ent; entities. - def unnormalized + def unnormalized(expanding: nil) document&.record_entity_expansion return nil if @value.nil? - @unnormalized = Text::unnormalize(@value, parent, - entity_expansion_text_limit: document&.entity_expansion_text_limit) + expanding ||= Set.new + raise REXML::ParseException.new("Detected an entity reference loop: #{@name}") if expanding.include?(@name) + + expanding.add(@name) + result = Text::unnormalize(@value, parent, + entity_expansion_text_limit: document&.entity_expansion_text_limit, + expanding: expanding) + expanding.delete(@name) + result end #once :unnormalized diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 8d5281cd..2319cef8 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -384,11 +384,11 @@ def Text::normalize( input, doctype=nil, entity_filter=nil ) end # Unescapes all possible entities - def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil ) + def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil, expanding: nil ) entity_expansion_text_limit ||= Security.entity_expansion_text_limit sum = 0 string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) { - s = Text.expand($&, doctype, filter) + s = Text.expand($&, doctype, filter, expanding: expanding) if sum + s.bytesize > entity_expansion_text_limit raise "entity expansion has grown too large" else @@ -398,7 +398,7 @@ def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expa } end - def Text.expand(ref, doctype, filter) + def Text.expand(ref, doctype, filter, expanding: nil) if ref[1] == ?# if ref[2] == ?x [ref[3...-1].to_i(16)].pack('U*') @@ -410,7 +410,7 @@ def Text.expand(ref, doctype, filter) elsif filter and filter.include?( ref[1...-1] ) ref elsif doctype - doctype.entity( ref[1...-1] ) or ref + doctype.entity( ref[1...-1], expanding: expanding ) or ref else entity_value = DocType::DEFAULT_ENTITIES[ ref[1...-1] ] entity_value ? entity_value.value : ref diff --git a/test/test_entity.rb b/test/test_entity.rb index cd2b56d1..a96dc504 100644 --- a/test/test_entity.rb +++ b/test/test_entity.rb @@ -1,5 +1,6 @@ # frozen_string_literal: false +require 'rexml/document' require 'rexml/entity' require 'rexml/source' require 'rexml/parsers/streamparser' @@ -307,6 +308,50 @@ def test_entity_circular_reference_with_long_value DETAIL end + def test_entity_direct_circular_reference_via_dom + source = <<~XML + + ]> + &x; + XML + doc = REXML::Document.new(source) + exception = assert_raise(REXML::ParseException) do + doc.root.text + end + assert_equal("Detected an entity reference loop: x", exception.to_s) + end + + def test_entity_indirect_circular_reference_via_dom + source = <<~XML + + + ]> + &a; + XML + doc = REXML::Document.new(source) + exception = assert_raise(REXML::ParseException) do + doc.root.text + end + assert_equal("Detected an entity reference loop: a", exception.to_s) + end + + def test_entity_unnormalized_direct_circular_reference + source = <<~XML + + ]> + + XML + doc = REXML::Document.new(source) + entity = doc.doctype.entities["x"] + exception = assert_raise(REXML::ParseException) do + entity.unnormalized + end + assert_equal("Detected an entity reference loop: x", exception.to_s) + end + def test_entity_filter document = REXML::Document.new(<<-XML) Date: Sun, 10 May 2026 10:15:47 +0900 Subject: [PATCH 3/3] Add benchmark ## Benchmark ``` $ benchmark-driver benchmark/parse_doctype.yaml Calculating ------------------------------------- before after before(YJIT) after(YJIT) single_entity(dom) 22.381k 24.492k 5.098k 5.211k i/s - 100.000 times in 0.004468s 0.004083s 0.019615s 0.019189s single_entity(stream) 46.882k 48.054k 8.796k 8.691k i/s - 100.000 times in 0.002133s 0.002081s 0.011369s 0.011506s chained_entities(dom) 2.665k 2.531k 2.349k 2.340k i/s - 100.000 times in 0.037520s 0.039509s 0.042574s 0.042726s chained_entities(stream) 2.455k 2.387k 2.351k 2.290k i/s - 100.000 times in 0.040732s 0.041888s 0.042538s 0.043660s many_entities(dom) 675.927 679.094 684.308 674.686 i/s - 100.000 times in 0.147945s 0.147255s 0.146133s 0.148217s many_entities(stream) 2.602k 2.501k 2.403k 2.276k i/s - 100.000 times in 0.038427s 0.039986s 0.041607s 0.043944s repeated_entity(dom) 419.090 414.295 445.286 444.575 i/s - 100.000 times in 0.238612s 0.241374s 0.224575s 0.224934s repeated_entity(stream) 1.449k 1.401k 1.590k 1.582k i/s - 100.000 times in 0.069014s 0.071358s 0.062898s 0.063201s Comparison: single_entity(dom) after: 24491.8 i/s before: 22381.4 i/s - 1.09x slower after(YJIT): 5211.3 i/s - 4.70x slower before(YJIT): 5098.1 i/s - 4.80x slower single_entity(stream) after: 48053.8 i/s before: 46882.3 i/s - 1.02x slower before(YJIT): 8795.8 i/s - 5.46x slower after(YJIT): 8691.1 i/s - 5.53x slower chained_entities(dom) before: 2665.2 i/s after: 2531.1 i/s - 1.05x slower before(YJIT): 2348.9 i/s - 1.13x slower after(YJIT): 2340.5 i/s - 1.14x slower chained_entities(stream) before: 2455.1 i/s after: 2387.3 i/s - 1.03x slower before(YJIT): 2350.8 i/s - 1.04x slower after(YJIT): 2290.4 i/s - 1.07x slower many_entities(dom) before(YJIT): 684.3 i/s after: 679.1 i/s - 1.01x slower before: 675.9 i/s - 1.01x slower after(YJIT): 674.7 i/s - 1.01x slower many_entities(stream) before: 2602.3 i/s after: 2500.9 i/s - 1.04x slower before(YJIT): 2403.4 i/s - 1.08x slower after(YJIT): 2275.6 i/s - 1.14x slower repeated_entity(dom) before(YJIT): 445.3 i/s after(YJIT): 444.6 i/s - 1.00x slower before: 419.1 i/s - 1.06x slower after: 414.3 i/s - 1.07x slower repeated_entity(stream) before(YJIT): 1589.9 i/s after(YJIT): 1582.3 i/s - 1.00x slower before: 1449.0 i/s - 1.10x slower after: 1401.4 i/s - 1.13x slower ``` - YJIT=ON : 0.95 - 1.09x faster - YJIT=OFF : 0.94 - 1.02x faster --- benchmark/parse_doctype.yaml | 77 ++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 benchmark/parse_doctype.yaml diff --git a/benchmark/parse_doctype.yaml b/benchmark/parse_doctype.yaml new file mode 100644 index 00000000..41a5cc98 --- /dev/null +++ b/benchmark/parse_doctype.yaml @@ -0,0 +1,77 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + require 'rexml/parsers/sax2parser' + require 'rexml/parsers/streamparser' + require 'rexml/streamlistener' + + # Single entity reference: -> &foo; + single_entity_xml = <<~XML + + ]> + &word; + XML + + # Chained entity references: a -> b -> c -> ... -> value (n_depth levels deep) + n_depth = 100 + chained_entities_xml = <<~XML + " : "" }.join("\n")} + ]> + &e1; + XML + + # Many distinct entities referenced in document content + n_entities = 100 + many_entities_xml = <<~XML + " }.join("\n")} + ]> + #{(1..n_entities).map {|i| "&ent#{i};" }.join} + XML + + # Same entity referenced repeatedly in attribute values + n_items = 100 + repeated_entity_xml = <<~XML + + ]> + #{'&word;' * n_items} + XML + + class Listener + include REXML::StreamListener + end + +benchmark: + 'single_entity(dom)' : REXML::Document.new(single_entity_xml).root.text + 'single_entity(stream)' : REXML::Parsers::StreamParser.new(single_entity_xml, Listener.new).parse + 'chained_entities(dom)' : REXML::Document.new(chained_entities_xml).root.text + 'chained_entities(stream)': REXML::Parsers::StreamParser.new(chained_entities_xml, Listener.new).parse + 'many_entities(dom)' : REXML::Document.new(many_entities_xml).root.text + 'many_entities(stream)' : REXML::Parsers::StreamParser.new(many_entities_xml, Listener.new).parse + 'repeated_entity(dom)' : REXML::Document.new(repeated_entity_xml).root.text + 'repeated_entity(stream)' : REXML::Parsers::StreamParser.new(repeated_entity_xml, Listener.new).parse