diff --git a/ext/c/lexer.rl b/ext/c/lexer.rl index bd04c06..c4ddc6e 100644 --- a/ext/c/lexer.rl +++ b/ext/c/lexer.rl @@ -2,7 +2,7 @@ VALUE oga_cLexer; -%%machine lexer; +%%machine c_lexer; /** * Calls a method defined in the Ruby side of the lexer. The String value is @@ -85,239 +85,7 @@ VALUE oga_xml_lexer_advance(VALUE self) } %%{ - newline = '\n' | '\r\n'; - whitespace = [ \t]; - identifier = [a-zA-Z0-9\-_:]+; - - # Strings - # - # Strings in HTML can either be single or double quoted. If a string - # starts with one of these quotes it must be closed with the same type - # of quote. - dquote = '"'; - squote = "'"; - - # Machine for processing double quoted strings. - string_dquote := |* - ^dquote+ => { - liboga_xml_lexer_callback(self, "on_string", encoding, ts, te); - }; - - dquote => { fret; }; - *|; - - # Machine for processing single quoted strings. - string_squote := |* - ^squote+ => { - liboga_xml_lexer_callback(self, "on_string", encoding, ts, te); - }; - - squote => { fret; }; - *|; - - # DOCTYPES - # - # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax - # - # These rules support the 3 flavours of doctypes: - # - # 1. Normal doctypes, as introduced in the HTML5 specification. - # 2. Deprecated doctypes, the more verbose ones used prior to HTML5. - # 3. Legacy doctypes - # - doctype_start = ' { - liboga_xml_lexer_callback(self, "on_doctype_type", encoding, ts, te); - }; - - # Lex the public/system IDs as regular strings. - dquote => { fcall string_dquote; }; - squote => { fcall string_squote; }; - - # Whitespace inside doctypes is ignored since there's no point in - # including it. 
- whitespace; - - identifier => { - liboga_xml_lexer_callback(self, "on_doctype_name", encoding, ts, te); - }; - - '>' => { - liboga_xml_lexer_callback_simple(self, "on_doctype_end"); - fret; - }; - *|; - - # CDATA - # - # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections - # - # CDATA tags are broken up into 3 parts: the start, the content and the - # end tag. - # - # In HTML CDATA tags have no meaning/are not supported. Oga does - # support them but treats their contents as plain text. - # - cdata_start = ''; - - action start_cdata { - liboga_xml_lexer_callback_simple(self, "on_cdata_start"); - fcall cdata; - } - - # Machine that for processing the contents of CDATA tags. Everything - # inside a CDATA tag is treated as plain text. - cdata := |* - any* cdata_end => { - liboga_xml_lexer_callback(self, "on_text", encoding, ts, te - 3); - liboga_xml_lexer_callback_simple(self, "on_cdata_end"); - fret; - }; - *|; - - # Comments - # - # http://www.w3.org/TR/html-markup/syntax.html#comments - # - # Comments are lexed into 3 parts: the start tag, the content and the - # end tag. - # - # Unlike the W3 specification these rules *do* allow character - # sequences such as `--` and `->`. Putting extra checks in for these - # sequences would actually make the rules/actions more complex. - # - comment_start = ''; - - action start_comment { - liboga_xml_lexer_callback_simple(self, "on_comment_start"); - fcall comment; - } - - # Machine used for processing the contents of a comment. Everything - # inside a comment is treated as plain text (similar to CDATA tags). 
- comment := |* - any* comment_end => { - liboga_xml_lexer_callback(self, "on_text", encoding, ts, te - 3); - liboga_xml_lexer_callback_simple(self, "on_comment_end"); - fret; - }; - *|; - - # XML declaration tags - # - # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd - # - xml_decl_start = ''; - - action start_xml_decl { - liboga_xml_lexer_callback_simple(self, "on_xml_decl_start"); - fcall xml_decl; - } - - # Machine that processes the contents of an XML declaration tag. - xml_decl := |* - xml_decl_end => { - liboga_xml_lexer_callback_simple(self, "on_xml_decl_end"); - fret; - }; - - # Attributes and their values (e.g. version="1.0"). - identifier => { - liboga_xml_lexer_callback(self, "on_attribute", encoding, ts, te); - }; - - dquote => { fcall string_dquote; }; - squote => { fcall string_squote; }; - - any; - *|; - - # Elements - # - # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements - # - - # Action that creates the tokens for the opening tag, name and - # namespace (if any). Remaining work is delegated to a dedicated - # machine. - action start_element { - liboga_xml_lexer_callback(self, "on_element_start", encoding, ts + 1, te); - - fcall element_head; - } - - element_start = '<' identifier; - - # Machine used for processing the characters inside a element head. An - # element head is everything between ``. - # - # For example, in `

` the element head is ` foo="bar"`. - # - element_head := |* - whitespace | '='; - - newline => { - liboga_xml_lexer_callback_simple(self, "on_newline"); - }; - - # Attribute names. - identifier => { - liboga_xml_lexer_callback(self, "on_attribute", encoding, ts, te); - }; - - # Attribute values. - dquote => { fcall string_dquote; }; - squote => { fcall string_squote; }; - - # The closing character of the open tag. - ('>' | '/') => { - fhold; - fret; - }; - *|; - - main := |* - element_start => start_element; - doctype_start => start_doctype; - cdata_start => start_cdata; - comment_start => start_comment; - xml_decl_start => start_xml_decl; - - # Enter the body of the tag. If HTML mode is enabled and the current - # element is a void element we'll close it and bail out. - '>' => { - liboga_xml_lexer_callback_simple(self, "on_element_open_end"); - }; - - # Regular closing tags. - '' => { - liboga_xml_lexer_callback_simple(self, "on_element_end"); - }; - - # Self closing elements that are not handled by the HTML mode. - '/>' => { - liboga_xml_lexer_callback_simple(self, "on_element_end"); - }; - - # Note that this rule should be declared at the very bottom as it - # will otherwise take precedence over the other rules. - ^('<' | '>')+ => { - liboga_xml_lexer_callback(self, "on_text", encoding, ts, te); - }; - *|; + include base_lexer "base_lexer.rl"; }%% void Init_liboga_xml_lexer() diff --git a/ext/ragel/base_lexer.rl b/ext/ragel/base_lexer.rl new file mode 100644 index 0000000..ae85d42 --- /dev/null +++ b/ext/ragel/base_lexer.rl @@ -0,0 +1,237 @@ +%%machine base_lexer; + +%%{ + newline = '\n' | '\r\n'; + whitespace = [ \t]; + identifier = [a-zA-Z0-9\-_:]+; + + # Strings + # + # Strings in HTML can either be single or double quoted. If a string + # starts with one of these quotes it must be closed with the same type + # of quote. + dquote = '"'; + squote = "'"; + + # Machine for processing double quoted strings. 
+ string_dquote := |* + ^dquote+ => { + liboga_xml_lexer_callback(self, "on_string", encoding, ts, te); + }; + + dquote => { fret; }; + *|; + + # Machine for processing single quoted strings. + string_squote := |* + ^squote+ => { + liboga_xml_lexer_callback(self, "on_string", encoding, ts, te); + }; + + squote => { fret; }; + *|; + + # DOCTYPES + # + # http://www.w3.org/TR/html-markup/syntax.html#doctype-syntax + # + # These rules support the 3 flavours of doctypes: + # + # 1. Normal doctypes, as introduced in the HTML5 specification. + # 2. Deprecated doctypes, the more verbose ones used prior to HTML5. + # 3. Legacy doctypes + # + doctype_start = ' { + liboga_xml_lexer_callback(self, "on_doctype_type", encoding, ts, te); + }; + + # Lex the public/system IDs as regular strings. + dquote => { fcall string_dquote; }; + squote => { fcall string_squote; }; + + # Whitespace inside doctypes is ignored since there's no point in + # including it. + whitespace; + + identifier => { + liboga_xml_lexer_callback(self, "on_doctype_name", encoding, ts, te); + }; + + '>' => { + liboga_xml_lexer_callback_simple(self, "on_doctype_end"); + fret; + }; + *|; + + # CDATA + # + # http://www.w3.org/TR/html-markup/syntax.html#cdata-sections + # + # CDATA tags are broken up into 3 parts: the start, the content and the + # end tag. + # + # In HTML CDATA tags have no meaning/are not supported. Oga does + # support them but treats their contents as plain text. + # + cdata_start = '<![CDATA['; + cdata_end = ']]>'; + + action start_cdata { + liboga_xml_lexer_callback_simple(self, "on_cdata_start"); + fcall cdata; + } + + # Machine for processing the contents of CDATA tags. Everything + # inside a CDATA tag is treated as plain text.
+ cdata := |* + any* cdata_end => { + liboga_xml_lexer_callback(self, "on_text", encoding, ts, te - 3); + liboga_xml_lexer_callback_simple(self, "on_cdata_end"); + fret; + }; + *|; + + # Comments + # + # http://www.w3.org/TR/html-markup/syntax.html#comments + # + # Comments are lexed into 3 parts: the start tag, the content and the + # end tag. + # + # Unlike the W3 specification these rules *do* allow character + # sequences such as `--` and `->`. Putting extra checks in for these + # sequences would actually make the rules/actions more complex. + # + comment_start = '<!--'; + comment_end = '-->'; + + action start_comment { + liboga_xml_lexer_callback_simple(self, "on_comment_start"); + fcall comment; + } + + # Machine used for processing the contents of a comment. Everything + # inside a comment is treated as plain text (similar to CDATA tags). + comment := |* + any* comment_end => { + liboga_xml_lexer_callback(self, "on_text", encoding, ts, te - 3); + liboga_xml_lexer_callback_simple(self, "on_comment_end"); + fret; + }; + *|; + + # XML declaration tags + # + # http://www.w3.org/TR/REC-xml/#sec-prolog-dtd + # + xml_decl_start = '<?xml'; + xml_decl_end = '?>'; + + action start_xml_decl { + liboga_xml_lexer_callback_simple(self, "on_xml_decl_start"); + fcall xml_decl; + } + + # Machine that processes the contents of an XML declaration tag. + xml_decl := |* + xml_decl_end => { + liboga_xml_lexer_callback_simple(self, "on_xml_decl_end"); + fret; + }; + + # Attributes and their values (e.g. version="1.0"). + identifier => { + liboga_xml_lexer_callback(self, "on_attribute", encoding, ts, te); + }; + + dquote => { fcall string_dquote; }; + squote => { fcall string_squote; }; + + any; + *|; + + # Elements + # + # http://www.w3.org/TR/html-markup/syntax.html#syntax-elements + # + + # Action that creates the tokens for the opening tag, name and + # namespace (if any). Remaining work is delegated to a dedicated + # machine.
+ action start_element { + liboga_xml_lexer_callback(self, "on_element_start", encoding, ts + 1, te); + + fcall element_head; + } + + element_start = '<' identifier; + + # Machine used for processing the characters inside an element head. An + # element head is everything between `<NAME` (where NAME is the element + # name) and `>`. + # + # For example, in `

` the element head is ` foo="bar"`. + # + element_head := |* + whitespace | '='; + + newline => { + liboga_xml_lexer_callback_simple(self, "on_newline"); + }; + + # Attribute names. + identifier => { + liboga_xml_lexer_callback(self, "on_attribute", encoding, ts, te); + }; + + # Attribute values. + dquote => { fcall string_dquote; }; + squote => { fcall string_squote; }; + + # The closing character of the open tag. + ('>' | '/') => { + fhold; + fret; + }; + *|; + + main := |* + element_start => start_element; + doctype_start => start_doctype; + cdata_start => start_cdata; + comment_start => start_comment; + xml_decl_start => start_xml_decl; + + # Enter the body of the tag. If HTML mode is enabled and the current + # element is a void element we'll close it and bail out. + '>' => { + liboga_xml_lexer_callback_simple(self, "on_element_open_end"); + }; + + # Regular closing tags. + '' => { + liboga_xml_lexer_callback_simple(self, "on_element_end"); + }; + + # Self closing elements that are not handled by the HTML mode. + '/>' => { + liboga_xml_lexer_callback_simple(self, "on_element_end"); + }; + + # Note that this rule should be declared at the very bottom as it + # will otherwise take precedence over the other rules. + ^('<' | '>')+ => { + liboga_xml_lexer_callback(self, "on_text", encoding, ts, te); + }; + *|; +}%% diff --git a/task/lexer.rake b/task/lexer.rake index 9f4a190..68338bb 100644 --- a/task/lexer.rake +++ b/task/lexer.rake @@ -17,11 +17,11 @@ rule '.rb' => '.rl' do |task| end rule '.c' => '.rl' do |task| - sh "ragel -C -G2 #{task.source} -o #{task.name}" + sh "ragel -I ext/ragel -C -G2 #{task.source} -o #{task.name}" end rule '.java' => '.rl' do |task| - sh "ragel -J #{task.source} -o #{task.name}" + sh "ragel -I ext/ragel -J #{task.source} -o #{task.name}" end desc 'Generates the lexers'