diff --git a/benchmark/parse_doctype.yaml b/benchmark/parse_doctype.yaml new file mode 100644 index 00000000..41a5cc98 --- /dev/null +++ b/benchmark/parse_doctype.yaml @@ -0,0 +1,77 @@ +loop_count: 100 +contexts: + - gems: + rexml: 3.2.6 + require: false + prelude: require 'rexml' + - name: master + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + - name: 3.2.6(YJIT) + gems: + rexml: 3.2.6 + require: false + prelude: | + require 'rexml' + RubyVM::YJIT.enable + - name: master(YJIT) + prelude: | + $LOAD_PATH.unshift(File.expand_path("lib")) + require 'rexml' + RubyVM::YJIT.enable + +prelude: | + require 'rexml/document' + require 'rexml/parsers/sax2parser' + require 'rexml/parsers/streamparser' + require 'rexml/streamlistener' + + # Single entity reference: -> &foo; + single_entity_xml = <<~XML + + ]> + &word; + XML + + # Chained entity references: a -> b -> c -> ... -> value (n_depth levels deep) + n_depth = 100 + chained_entities_xml = <<~XML + " : "" }.join("\n")} + ]> + &e1; + XML + + # Many distinct entities referenced in document content + n_entities = 100 + many_entities_xml = <<~XML + " }.join("\n")} + ]> + #{(1..n_entities).map {|i| "&ent#{i};" }.join} + XML + + # Same entity referenced repeatedly in attribute values + n_items = 100 + repeated_entity_xml = <<~XML + + ]> + #{'&word;' * n_items} + XML + + class Listener + include REXML::StreamListener + end + +benchmark: + 'single_entity(dom)' : REXML::Document.new(single_entity_xml).root.text + 'single_entity(stream)' : REXML::Parsers::StreamParser.new(single_entity_xml, Listener.new).parse + 'chained_entities(dom)' : REXML::Document.new(chained_entities_xml).root.text + 'chained_entities(stream)': REXML::Parsers::StreamParser.new(chained_entities_xml, Listener.new).parse + 'many_entities(dom)' : REXML::Document.new(many_entities_xml).root.text + 'many_entities(stream)' : REXML::Parsers::StreamParser.new(many_entities_xml, Listener.new).parse + 'repeated_entity(dom)' : REXML::Document.new(repeated_entity_xml).root.text + 'repeated_entity(stream)' : REXML::Parsers::StreamParser.new(repeated_entity_xml, Listener.new).parse diff --git a/lib/rexml/doctype.rb b/lib/rexml/doctype.rb index a9cf9f7e..d4f0bbe4 100644 --- a/lib/rexml/doctype.rb +++ b/lib/rexml/doctype.rb @@ -174,8 +174,8 @@ def context @parent&.context end - def entity( name ) - @entities[name]&.unnormalized + def entity( name, expanding: nil ) + @entities[name]&.unnormalized(expanding: expanding) end def add child diff --git a/lib/rexml/entity.rb b/lib/rexml/entity.rb index 1ba5a7bb..f55088fc 100644 --- a/lib/rexml/entity.rb +++ b/lib/rexml/entity.rb @@ -1,5 +1,7 @@ # frozen_string_literal: false +require 'set' require_relative 'child' +require_relative 'parseexception' require_relative 'source' require_relative 'xmltokens' @@ -70,13 +72,20 @@ def Entity::matches? string # Evaluates to the unnormalized value of this entity; that is, replacing # &ent; entities. - def unnormalized + def unnormalized(expanding: nil) document&.record_entity_expansion return nil if @value.nil? - @unnormalized = Text::unnormalize(@value, parent, - entity_expansion_text_limit: document&.entity_expansion_text_limit) + expanding ||= Set.new + raise REXML::ParseException.new("Detected an entity reference loop: #{@name}") if expanding.include?(@name) + + expanding.add(@name) + result = Text::unnormalize(@value, parent, + entity_expansion_text_limit: document&.entity_expansion_text_limit, + expanding: expanding) + expanding.delete(@name) + result end #once :unnormalized diff --git a/lib/rexml/parsers/baseparser.rb b/lib/rexml/parsers/baseparser.rb index 0b48ec14..af8887e2 100644 --- a/lib/rexml/parsers/baseparser.rb +++ b/lib/rexml/parsers/baseparser.rb @@ -535,14 +535,20 @@ def pull_event end private :pull_event - def entity( reference, entities ) + def entity( reference, entities, expanding: nil ) return unless entities value = entities[ reference ] return if value.nil? + expanding ||= Set.new + raise REXML::ParseException.new("Detected an entity reference loop: #{reference}", @source) if expanding.include?(reference) + record_entity_expansion - unnormalize( value, entities ) + expanding.add(reference) + result = unnormalize( value, entities, nil, expanding: expanding ) + expanding.delete(reference) + result end # Escapes all possible entities @@ -562,7 +568,7 @@ def normalize( input, entities=nil, entity_filter=nil ) end # Unescapes all possible entities - def unnormalize( string, entities=nil, filter=nil ) + def unnormalize( string, entities=nil, filter=nil, expanding: nil ) if string.include?("\r") rv = string.gsub( Private::CARRIAGE_RETURN_NEWLINE_PATTERN, "\n" ) else @@ -588,7 +594,7 @@ def unnormalize( string, entities=nil, filter=nil ) if matches.size > 0 matches.tally.each do |entity_reference, n| entity_expansion_count_before = @entity_expansion_count - entity_value = entity( entity_reference, entities ) + entity_value = entity( entity_reference, entities, expanding: expanding ) if entity_value if n > 1 entity_expansion_count_delta = diff --git a/lib/rexml/text.rb b/lib/rexml/text.rb index 8d5281cd..2319cef8 100644 --- a/lib/rexml/text.rb +++ b/lib/rexml/text.rb @@ -384,11 +384,11 @@ def Text::normalize( input, doctype=nil, entity_filter=nil ) end # Unescapes all possible entities - def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil ) + def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expansion_text_limit: nil, expanding: nil ) entity_expansion_text_limit ||= Security.entity_expansion_text_limit sum = 0 string.gsub( /\r\n?/, "\n" ).gsub( REFERENCE ) { - s = Text.expand($&, doctype, filter) + s = Text.expand($&, doctype, filter, expanding: expanding) if sum + s.bytesize > entity_expansion_text_limit raise "entity expansion has grown too large" else @@ -398,7 +398,7 @@ def Text::unnormalize( string, doctype=nil, filter=nil, illegal=nil, entity_expa } end - def Text.expand(ref, doctype, filter) + def Text.expand(ref, doctype, filter, expanding: nil) if ref[1] == ?# if ref[2] == ?x [ref[3...-1].to_i(16)].pack('U*') @@ -410,7 +410,7 @@ def Text.expand(ref, doctype, filter) elsif filter and filter.include?( ref[1...-1] ) ref elsif doctype - doctype.entity( ref[1...-1] ) or ref + doctype.entity( ref[1...-1], expanding: expanding ) or ref else entity_value = DocType::DEFAULT_ENTITIES[ ref[1...-1] ] entity_value ? entity_value.value : ref diff --git a/test/test_entity.rb b/test/test_entity.rb index 03f268dd..a96dc504 100644 --- a/test/test_entity.rb +++ b/test/test_entity.rb @@ -1,7 +1,10 @@ # frozen_string_literal: false +require 'rexml/document' require 'rexml/entity' require 'rexml/source' +require 'rexml/parsers/streamparser' +require 'rexml/streamlistener' module REXMLTests class EntityTester < Test::Unit::TestCase @@ -241,6 +244,114 @@ def test_single_pass_unnormalization # ticket 123 assert_equal '&&', REXML::Text::unnormalize('&amp;&') end + def test_entity_direct_circular_reference + source = <<~XML + + ]> + &x; + XML + listener = Class.new { include REXML::StreamListener }.new + parser = REXML::Parsers::StreamParser.new(source, listener) + exception = assert_raise(REXML::ParseException) do + parser.parse + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Detected an entity reference loop: x +Line: 4 +Position: 57 +Last 80 unconsumed characters: +\x20 + DETAIL + end + + def test_entity_indirect_circular_reference + source = <<~XML + + + ]> + &a; + XML + listener = Class.new { include REXML::StreamListener }.new + parser = REXML::Parsers::StreamParser.new(source, listener) + exception = assert_raise(REXML::ParseException) do + parser.parse + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Detected an entity reference loop: a +Line: 5 +Position: 77 +Last 80 unconsumed characters: +\x20 + DETAIL + end + + def test_entity_circular_reference_with_long_value + source = <<~XML + + ]> + &x; + XML + listener = Class.new { include REXML::StreamListener }.new + parser = REXML::Parsers::StreamParser.new(source, listener) + exception = assert_raise(REXML::ParseException) do + parser.parse + end + assert_equal(<<-DETAIL.chomp, exception.to_s) +Detected an entity reference loop: x +Line: 4 +Position: 557 +Last 80 unconsumed characters: +\x20 + DETAIL + end + + def test_entity_direct_circular_reference_via_dom + source = <<~XML + + ]> + &x; + XML + doc = REXML::Document.new(source) + exception = assert_raise(REXML::ParseException) do + doc.root.text + end + assert_equal("Detected an entity reference loop: x", exception.to_s) + end + + def test_entity_indirect_circular_reference_via_dom + source = <<~XML + + + ]> + &a; + XML + doc = REXML::Document.new(source) + exception = assert_raise(REXML::ParseException) do + doc.root.text + end + assert_equal("Detected an entity reference loop: a", exception.to_s) + end + + def test_entity_unnormalized_direct_circular_reference + source = <<~XML + + ]> + + XML + doc = REXML::Document.new(source) + entity = doc.doctype.entities["x"] + exception = assert_raise(REXML::ParseException) do + entity.unnormalized + end + assert_equal("Detected an entity reference loop: x", exception.to_s) + end + def test_entity_filter document = REXML::Document.new(<<-XML)