Skip to content

Commit 5859bde

Browse files
authored
Added XML declaration check & Source#skip_spaces method (#282)
## Why? ### Added XML declaration check - The version attribute is required in XML declaration. - Only version attribute, encoding attribute, and standalone attribute are allowed in XML declaration. - XML declaration is only allowed once. See: https://www.w3.org/TR/xml/#NT-XMLDecl ### Added `Source#skip_spaces` method In the case of `@source.match?(/\s+/um, true)`, if there are no spaces at the beginning, I want to stop reading immediately. However, it continues to read the buffer until it finds a match, but it never finds a match. As a result, it continues reading until the end of the file. In the case of large XML files, drop_parsed_content occur frequently until the buffer is cleared, which may affect performance. ## Benchmark ``` before after before(YJIT) after(YJIT) dom 32.534 35.130 54.559 53.528 i/s - 100.000 times in 3.073715s 2.846540s 1.832883s 1.868189s sax 44.785 44.089 78.303 77.842 i/s - 100.000 times in 2.232907s 2.268138s 1.277093s 1.284657s pull 51.750 51.105 90.819 90.658 i/s - 100.000 times in 1.932351s 1.956759s 1.101094s 1.103050s stream 51.427 51.444 89.820 88.971 i/s - 100.000 times in 1.944502s 1.943855s 1.113340s 1.123960s Comparison: dom before(YJIT): 54.6 i/s after(YJIT): 53.5 i/s - 1.02x slower after: 35.1 i/s - 1.55x slower before: 32.5 i/s - 1.68x slower sax before(YJIT): 78.3 i/s after(YJIT): 77.8 i/s - 1.01x slower before: 44.8 i/s - 1.75x slower after: 44.1 i/s - 1.78x slower pull before(YJIT): 90.8 i/s after(YJIT): 90.7 i/s - 1.00x slower before: 51.8 i/s - 1.75x slower after: 51.1 i/s - 1.78x slower stream before(YJIT): 89.8 i/s after(YJIT): 89.0 i/s - 1.01x slower after: 51.4 i/s - 1.75x slower before: 51.4 i/s - 1.75x slower ``` - YJIT=ON : 0.98x - 1.00x faster - YJIT=OFF : 0.98x - 1.07x faster
1 parent 1d876e3 commit 5859bde

File tree

5 files changed

+244
-57
lines changed

5 files changed

+244
-57
lines changed

lib/rexml/parsers/baseparser.rb

Lines changed: 107 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -144,6 +144,7 @@ module Private
144144
PEREFERENCE_PATTERN = /#{PEREFERENCE}/um
145145
TAG_PATTERN = /((?>#{QNAME_STR}))\s*/um
146146
CLOSE_PATTERN = /(#{QNAME_STR})\s*>/um
147+
EQUAL_PATTERN = /\s*=\s*/um
147148
ATTLISTDECL_END = /\s+#{NAME}(?:#{ATTDEF})*\s*>/um
148149
NAME_PATTERN = /#{NAME}/um
149150
GEDECL_PATTERN = "\\s+#{NAME}\\s+#{ENTITYDEF}\\s*>"
@@ -168,6 +169,7 @@ def initialize( source )
168169
@entity_expansion_limit = Security.entity_expansion_limit
169170
@entity_expansion_text_limit = Security.entity_expansion_text_limit
170171
@source.ensure_buffer
172+
@version = nil
171173
end
172174

173175
def add_listener( listener )
@@ -280,7 +282,7 @@ def pull_event
280282
return [ :comment, process_comment ]
281283
elsif @source.match?("DOCTYPE", true)
282284
base_error_message = "Malformed DOCTYPE"
283-
unless @source.match?(/\s+/um, true)
285+
unless @source.skip_spaces
284286
if @source.match?(">")
285287
message = "#{base_error_message}: name is missing"
286288
else
@@ -290,7 +292,7 @@ def pull_event
290292
raise REXML::ParseException.new(message, @source)
291293
end
292294
name = parse_name(base_error_message)
293-
@source.match?(/\s*/um, true) # skip spaces
295+
@source.skip_spaces
294296
if @source.match?("[", true)
295297
id = [nil, nil, nil]
296298
@document_status = :in_doctype
@@ -306,7 +308,7 @@ def pull_event
306308
# For backward compatibility
307309
id[1], id[2] = id[2], nil
308310
end
309-
@source.match?(/\s*/um, true) # skip spaces
311+
@source.skip_spaces
310312
if @source.match?("[", true)
311313
@document_status = :in_doctype
312314
elsif @source.match?(">", true)
@@ -319,7 +321,7 @@ def pull_event
319321
end
320322
args = [:start_doctype, name, *id]
321323
if @document_status == :after_doctype
322-
@source.match?(/\s*/um, true)
324+
@source.skip_spaces
323325
@stack << [ :end_doctype ]
324326
end
325327
return args
@@ -330,7 +332,7 @@ def pull_event
330332
end
331333
end
332334
if @document_status == :in_doctype
333-
@source.match?(/\s*/um, true) # skip spaces
335+
@source.skip_spaces
334336
start_position = @source.position
335337
if @source.match?("<!", true)
336338
if @source.match?("ELEMENT", true)
@@ -391,7 +393,7 @@ def pull_event
391393
return [ :attlistdecl, element, pairs, contents ]
392394
elsif @source.match?("NOTATION", true)
393395
base_error_message = "Malformed notation declaration"
394-
unless @source.match?(/\s+/um, true)
396+
unless @source.skip_spaces
395397
if @source.match?(">")
396398
message = "#{base_error_message}: name is missing"
397399
else
@@ -404,7 +406,7 @@ def pull_event
404406
id = parse_id(base_error_message,
405407
accept_external_id: true,
406408
accept_public_id: true)
407-
@source.match?(/\s*/um, true) # skip spaces
409+
@source.skip_spaces
408410
unless @source.match?(">", true)
409411
message = "#{base_error_message}: garbage before end >"
410412
raise REXML::ParseException.new(message, @source)
@@ -425,7 +427,7 @@ def pull_event
425427
end
426428
end
427429
if @document_status == :after_doctype
428-
@source.match?(/\s*/um, true)
430+
@source.skip_spaces
429431
end
430432
begin
431433
start_position = @source.position
@@ -642,6 +644,10 @@ def need_source_encoding_update?(xml_declaration_encoding)
642644
true
643645
end
644646

647+
def normalize_xml_declaration_encoding(xml_declaration_encoding)
648+
/\AUTF-16(?:BE|LE)\z/i.match?(xml_declaration_encoding) ? "UTF-16" : nil
649+
end
650+
645651
def parse_name(base_error_message)
646652
md = @source.match(Private::NAME_PATTERN, true)
647653
unless md
@@ -735,37 +741,85 @@ def process_comment
735741

736742
def process_instruction
737743
name = parse_name("Malformed XML: Invalid processing instruction node")
738-
if @source.match?(/\s+/um, true)
739-
match_data = @source.match(/(.*?)\?>/um, true)
740-
unless match_data
741-
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
744+
if name == "xml"
745+
xml_declaration
746+
else # PITarget
747+
if @source.skip_spaces # e.g. <?name content?>
748+
start_position = @source.position
749+
content = @source.read_until("?>")
750+
unless content.chomp!("?>")
751+
@source.position = start_position
752+
raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
753+
end
754+
else # e.g. <?name?>
755+
content = nil
756+
unless @source.match?("?>", true)
757+
raise ParseException.new("Malformed XML: Unclosed processing instruction: <#{name}>", @source)
758+
end
742759
end
743-
content = match_data[1]
744-
else
745-
content = nil
760+
[:processing_instruction, name, content]
761+
end
762+
end
763+
764+
def xml_declaration
765+
unless @version.nil?
766+
raise ParseException.new("Malformed XML: XML declaration is duplicated", @source)
767+
end
768+
if @document_status
769+
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
770+
end
771+
unless @source.skip_spaces
772+
raise ParseException.new("Malformed XML: XML declaration misses spaces before version", @source)
773+
end
774+
unless @source.match?("version", true)
775+
raise ParseException.new("Malformed XML: XML declaration misses version", @source)
776+
end
777+
@version = parse_attribute_value_with_equal("xml")
778+
unless @source.skip_spaces
746779
unless @source.match?("?>", true)
747-
raise ParseException.new("Malformed XML: Unclosed processing instruction", @source)
780+
raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
748781
end
782+
encoding = normalize_xml_declaration_encoding(@source.encoding)
783+
return [ :xmldecl, @version, encoding, nil ] # e.g. <?xml version="1.0"?>
749784
end
750-
if name == "xml"
751-
if @document_status
752-
raise ParseException.new("Malformed XML: XML declaration is not at the start", @source)
753-
end
754-
version = VERSION.match(content)
755-
version = version[1] unless version.nil?
756-
encoding = ENCODING.match(content)
757-
encoding = encoding[1] unless encoding.nil?
758-
if need_source_encoding_update?(encoding)
759-
@source.encoding = encoding
785+
786+
if @source.match?("encoding", true)
787+
encoding = parse_attribute_value_with_equal("xml")
788+
unless @source.skip_spaces
789+
unless @source.match?("?>", true)
790+
raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
791+
end
792+
if need_source_encoding_update?(encoding)
793+
@source.encoding = encoding
794+
end
795+
encoding ||= normalize_xml_declaration_encoding(@source.encoding)
796+
return [ :xmldecl, @version, encoding, nil ] # e.g. <?xml version="1.1" encoding="UTF-8"?>
760797
end
761-
if encoding.nil? and /\AUTF-16(?:BE|LE)\z/i =~ @source.encoding
762-
encoding = "UTF-16"
798+
end
799+
800+
if @source.match?("standalone", true)
801+
standalone = parse_attribute_value_with_equal("xml")
802+
case standalone
803+
when "yes", "no"
804+
else
805+
raise ParseException.new("Malformed XML: XML declaration standalone is not yes or no : <#{standalone}>", @source)
763806
end
764-
standalone = STANDALONE.match(content)
765-
standalone = standalone[1] unless standalone.nil?
766-
return [ :xmldecl, version, encoding, standalone ]
767807
end
768-
[:processing_instruction, name, content]
808+
@source.skip_spaces
809+
unless @source.match?("?>", true)
810+
raise ParseException.new("Malformed XML: Unclosed XML declaration", @source)
811+
end
812+
813+
if need_source_encoding_update?(encoding)
814+
@source.encoding = encoding
815+
end
816+
encoding ||= normalize_xml_declaration_encoding(@source.encoding)
817+
818+
# e.g. <?xml version="1.0" ?>
819+
# <?xml version="1.1" encoding="UTF-8" ?>
820+
# <?xml version="1.1" standalone="yes"?>
821+
# <?xml version="1.1" encoding="UTF-8" standalone="yes" ?>
822+
[ :xmldecl, @version, encoding, standalone ]
769823
end
770824

771825
if StringScanner::Version < "3.1.1"
@@ -787,6 +841,25 @@ def scan_quote
787841
end
788842
end
789843

844+
def parse_attribute_value_with_equal(name)
845+
unless @source.match?(Private::EQUAL_PATTERN, true)
846+
message = "Missing attribute equal: <#{name}>"
847+
raise REXML::ParseException.new(message, @source)
848+
end
849+
unless quote = scan_quote
850+
message = "Missing attribute value start quote: <#{name}>"
851+
raise REXML::ParseException.new(message, @source)
852+
end
853+
start_position = @source.position
854+
value = @source.read_until(quote)
855+
unless value.chomp!(quote)
856+
@source.position = start_position
857+
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
858+
raise REXML::ParseException.new(message, @source)
859+
end
860+
value
861+
end
862+
790863
def parse_attributes(prefixes)
791864
attributes = {}
792865
expanded_names = {}
@@ -801,23 +874,8 @@ def parse_attributes(prefixes)
801874
name = match[1]
802875
prefix = match[2]
803876
local_part = match[3]
804-
805-
unless @source.match?(/\s*=\s*/um, true)
806-
message = "Missing attribute equal: <#{name}>"
807-
raise REXML::ParseException.new(message, @source)
808-
end
809-
unless quote = scan_quote
810-
message = "Missing attribute value start quote: <#{name}>"
811-
raise REXML::ParseException.new(message, @source)
812-
end
813-
start_position = @source.position
814-
value = @source.read_until(quote)
815-
unless value.chomp!(quote)
816-
@source.position = start_position
817-
message = "Missing attribute value end quote: <#{name}>: <#{quote}>"
818-
raise REXML::ParseException.new(message, @source)
819-
end
820-
@source.match?(/\s*/um, true)
877+
value = parse_attribute_value_with_equal(name)
878+
@source.skip_spaces
821879
if prefix == "xmlns"
822880
if local_part == "xml"
823881
if value != Private::XML_PREFIXED_NAMESPACE

lib/rexml/source.rb

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,9 +65,10 @@ class Source
6565
attr_reader :encoding
6666

6767
module Private
68+
SPACES_PATTERN = /\s+/um
6869
SCANNER_RESET_SIZE = 100000
6970
PRE_DEFINED_TERM_PATTERNS = {}
70-
pre_defined_terms = ["'", '"', "<", "]]>"]
71+
pre_defined_terms = ["'", '"', "<", "]]>", "?>"]
7172
if StringScanner::Version < "3.1.1"
7273
pre_defined_terms.each do |term|
7374
PRE_DEFINED_TERM_PATTERNS[term] = /#{Regexp.escape(term)}/
@@ -150,6 +151,10 @@ def match?(pattern, cons=false)
150151
end
151152
end
152153

154+
def skip_spaces
155+
@scanner.skip(Private::SPACES_PATTERN) ? true : false
156+
end
157+
153158
def position
154159
@scanner.pos
155160
end

test/parse/test_document_type_declaration.rb

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,10 @@ def test_no_name
4949
end
5050
assert_equal(<<-DETAIL.chomp, exception.to_s)
5151
Malformed DOCTYPE: name is missing
52-
Line: 3
53-
Position: 17
52+
Line: 1
53+
Position: 10
5454
Last 80 unconsumed characters:
55-
<!DOCTYPE> <r/>
55+
<!DOCTYPE>
5656
DETAIL
5757
end
5858
end

0 commit comments

Comments
 (0)