From 44bf1dd1cae8b04c779b7f6eae324754afde4576 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Thu, 15 May 2014 10:22:05 +0200 Subject: [PATCH] Split up handling of element names/namespaces. This is now split up at the Ragel level, simplifying the corresponding Ruby code. --- ext/ragel/base_lexer.rl | 29 ++++++++++++++++++++++------- lib/oga/xml/lexer.rb | 26 +++++++++++++++++--------- 2 files changed, 39 insertions(+), 16 deletions(-) diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl index 1aec342..7f06ca9 100644 --- a/ext/ragel/base_lexer.rl +++ b/ext/ragel/base_lexer.rl @@ -30,7 +30,8 @@ newline = '\n' | '\r\n'; whitespace = [ \t]; - identifier = [a-zA-Z0-9\-_:]+; + identifier = [a-zA-Z0-9\-_]+; + attribute = [a-zA-Z0-9\-_:]+; # Strings # @@ -183,7 +184,7 @@ }; # Attributes and their values (e.g. version="1.0"). - identifier => { + attribute => { callback("on_attribute", data, encoding, ts, te); }; @@ -202,12 +203,21 @@ # namespace (if any). Remaining work is delegated to a dedicated # machine. action start_element { - callback("on_element_start", data, encoding, ts + 1, te); - + fhold; fcall element_head; } - element_start = '<' identifier; + # Machine used for lexing the name/namespace of an element. + element_name := |* + identifier ':' => { + callback("on_element_ns", data, encoding, ts, te - 1); + }; + + identifier => { + callback("on_element_name", data, encoding, ts, te); + fret; + }; + *|; # Machine used for processing the characters inside a element head. An # element head is everything between ` { + callback_simple("on_element_start"); + fcall element_name; + }; + newline => { callback_simple("on_newline"); }; # Attribute names. 
- identifier => { + attribute => { callback("on_attribute", data, encoding, ts, te); }; @@ -239,7 +254,7 @@ *|; main := |* - element_start => start_element; + '<' => start_element; doctype_start => start_doctype; cdata_start => start_cdata; comment_start => start_comment; diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index 37a6774..fbd0b18 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -248,22 +248,30 @@ module Oga ## # Called on the start of an element. # + def on_element_start + add_token(:T_ELEM_START) + end + + ## + # Called on the name of an element. + # # @param [String] name The name of the element, including namespace. # - def on_element_start(name) - add_token(:T_ELEM_START) - - if name.include?(':') - ns, name = name.split(':') - - add_token(:T_ELEM_NS, ns) - end - + def on_element_name(name) @elements << name if html? add_token(:T_ELEM_NAME, name) end + ## + # Called on the element namespace. + # + # @param [String] namespace + # + def on_element_ns(namespace) + add_token(:T_ELEM_NS, namespace) + end + ## # Called on the closing `>` of the open tag of an element. #