From 2ec91f130fcdfee918578d045b07367aec434260 Mon Sep 17 00:00:00 2001 From: Yorick Peterse Date: Thu, 5 Mar 2015 23:00:43 +0100 Subject: [PATCH] Lazy decoding of XML/HTML entities. Instead of decoding entities in the lexer we'll do this whenever XML::Text#text is called. This removes the overhead from the parsing phase and ensures the process is only triggered when actually needed. Note that calling #to_xml and/or the #inspect methods on a Text (or parent) instance will also trigger the entity conversion process. The new entity decoding API supports both regular entities (e.g. &amp;) as well as codepoint based entities (both regular and hexadecimal codepoints). To allow safe read-only access to Text instances from multiple threads a mutex is used. This mutex ensures that only 1 thread can trigger the conversion process. Fixes #68 --- lib/oga.rb | 2 + lib/oga/html/entities.rb | 2150 +++++++++++++++++++++++++++ lib/oga/xml/entities.rb | 56 +- lib/oga/xml/lexer.rb | 4 +- lib/oga/xml/text.rb | 54 +- spec/oga/html/entities_spec.rb | 15 + spec/oga/xml/entities_spec.rb | 4 + spec/oga/xml/lexer/entities_spec.rb | 55 - spec/oga/xml/text_spec.rb | 73 + 9 files changed, 2337 insertions(+), 76 deletions(-) create mode 100644 lib/oga/html/entities.rb create mode 100644 spec/oga/html/entities_spec.rb delete mode 100644 spec/oga/xml/lexer/entities_spec.rb diff --git a/lib/oga.rb b/lib/oga.rb index a8e940f..5090321 100644 --- a/lib/oga.rb +++ b/lib/oga.rb @@ -3,6 +3,7 @@ gem 'racc' require 'ast' require 'set' require 'stringio' +require 'thread' require_relative 'oga/version' require_relative 'oga/oga' @@ -43,6 +44,7 @@ require_relative 'oga/xml/pull_parser' require_relative 'oga/html/parser' require_relative 'oga/html/sax_parser' +require_relative 'oga/html/entities' require_relative 'oga/xpath/lexer' require_relative 'oga/xpath/parser' diff --git a/lib/oga/html/entities.rb b/lib/oga/html/entities.rb new file mode 100644 index 0000000..f7fada0 --- /dev/null +++ 
b/lib/oga/html/entities.rb @@ -0,0 +1,2150 @@ +module Oga + module HTML + module Entities + ## + # Hash mapping HTML entities to their Unicode character replacements. + # + # Based on the JSON output as listed at + # http://www.w3.org/TR/html5/syntax.html#named-character-references + # + # @return [Hash] + # + DECODE_MAPPING = { + 'Á' => [193].pack('U'), + 'á' => [225].pack('U'), + 'Ă' => [258].pack('U'), + 'ă' => [259].pack('U'), + '∾' => [8766].pack('U'), + '∿' => [8767].pack('U'), + '∾̳' => [8766, 819].pack('U'), + 'Â' => [194].pack('U'), + 'â' => [226].pack('U'), + '´' => [180].pack('U'), + 'А' => [1040].pack('U'), + 'а' => [1072].pack('U'), + 'Æ' => [198].pack('U'), + 'æ' => [230].pack('U'), + '⁡' => [8289].pack('U'), + '𝔄' => [120068].pack('U'), + '𝔞' => [120094].pack('U'), + 'À' => [192].pack('U'), + 'à' => [224].pack('U'), + 'ℵ' => [8501].pack('U'), + 'ℵ' => [8501].pack('U'), + 'Α' => [913].pack('U'), + 'α' => [945].pack('U'), + 'Ā' => [256].pack('U'), + 'ā' => [257].pack('U'), + '⨿' => [10815].pack('U'), + '&' => [38].pack('U'), + '&' => [38].pack('U'), + '⩓' => [10835].pack('U'), + '∧' => [8743].pack('U'), + '⩕' => [10837].pack('U'), + '⩜' => [10844].pack('U'), + '⩘' => [10840].pack('U'), + '⩚' => [10842].pack('U'), + '∠' => [8736].pack('U'), + '⦤' => [10660].pack('U'), + '∠' => [8736].pack('U'), + '∡' => [8737].pack('U'), + '⦨' => [10664].pack('U'), + '⦩' => [10665].pack('U'), + '⦪' => [10666].pack('U'), + '⦫' => [10667].pack('U'), + '⦬' => [10668].pack('U'), + '⦭' => [10669].pack('U'), + '⦮' => [10670].pack('U'), + '⦯' => [10671].pack('U'), + '∟' => [8735].pack('U'), + '⊾' => [8894].pack('U'), + '⦝' => [10653].pack('U'), + '∢' => [8738].pack('U'), + 'Å' => [197].pack('U'), + '⍼' => [9084].pack('U'), + 'Ą' => [260].pack('U'), + 'ą' => [261].pack('U'), + '𝔸' => [120120].pack('U'), + '𝕒' => [120146].pack('U'), + '≈' => [8776].pack('U'), + '⩯' => [10863].pack('U'), + '⩰' => [10864].pack('U'), + '≊' => [8778].pack('U'), + '≋' => [8779].pack('U'), + ''' => 
[39].pack('U'), + '⁡' => [8289].pack('U'), + '≈' => [8776].pack('U'), + '≊' => [8778].pack('U'), + 'Å' => [197].pack('U'), + 'å' => [229].pack('U'), + '𝒜' => [119964].pack('U'), + '𝒶' => [119990].pack('U'), + '≔' => [8788].pack('U'), + '*' => [42].pack('U'), + '≈' => [8776].pack('U'), + '≍' => [8781].pack('U'), + 'Ã' => [195].pack('U'), + 'ã' => [227].pack('U'), + 'Ä' => [196].pack('U'), + 'ä' => [228].pack('U'), + '∳' => [8755].pack('U'), + '⨑' => [10769].pack('U'), + '≌' => [8780].pack('U'), + '϶' => [1014].pack('U'), + '‵' => [8245].pack('U'), + '∽' => [8765].pack('U'), + '⋍' => [8909].pack('U'), + '∖' => [8726].pack('U'), + '⫧' => [10983].pack('U'), + '⊽' => [8893].pack('U'), + '⌆' => [8966].pack('U'), + '⌅' => [8965].pack('U'), + '⌅' => [8965].pack('U'), + '⎵' => [9141].pack('U'), + '⎶' => [9142].pack('U'), + '≌' => [8780].pack('U'), + 'Б' => [1041].pack('U'), + 'б' => [1073].pack('U'), + '„' => [8222].pack('U'), + '∵' => [8757].pack('U'), + '∵' => [8757].pack('U'), + '∵' => [8757].pack('U'), + '⦰' => [10672].pack('U'), + '϶' => [1014].pack('U'), + 'ℬ' => [8492].pack('U'), + 'ℬ' => [8492].pack('U'), + 'Β' => [914].pack('U'), + 'β' => [946].pack('U'), + 'ℶ' => [8502].pack('U'), + '≬' => [8812].pack('U'), + '𝔅' => [120069].pack('U'), + '𝔟' => [120095].pack('U'), + '⋂' => [8898].pack('U'), + '◯' => [9711].pack('U'), + '⋃' => [8899].pack('U'), + '⨀' => [10752].pack('U'), + '⨁' => [10753].pack('U'), + '⨂' => [10754].pack('U'), + '⨆' => [10758].pack('U'), + '★' => [9733].pack('U'), + '▽' => [9661].pack('U'), + '△' => [9651].pack('U'), + '⨄' => [10756].pack('U'), + '⋁' => [8897].pack('U'), + '⋀' => [8896].pack('U'), + '⤍' => [10509].pack('U'), + '⧫' => [10731].pack('U'), + '▪' => [9642].pack('U'), + '▴' => [9652].pack('U'), + '▾' => [9662].pack('U'), + '◂' => [9666].pack('U'), + '▸' => [9656].pack('U'), + '␣' => [9251].pack('U'), + '▒' => [9618].pack('U'), + '░' => [9617].pack('U'), + '▓' => [9619].pack('U'), + '█' => [9608].pack('U'), + '=⃥' => [61, 8421].pack('U'), 
+ '≡⃥' => [8801, 8421].pack('U'), + '⫭' => [10989].pack('U'), + '⌐' => [8976].pack('U'), + '𝔹' => [120121].pack('U'), + '𝕓' => [120147].pack('U'), + '⊥' => [8869].pack('U'), + '⊥' => [8869].pack('U'), + '⋈' => [8904].pack('U'), + '⧉' => [10697].pack('U'), + '╗' => [9559].pack('U'), + '╖' => [9558].pack('U'), + '╕' => [9557].pack('U'), + '┐' => [9488].pack('U'), + '╔' => [9556].pack('U'), + '╓' => [9555].pack('U'), + '╒' => [9554].pack('U'), + '┌' => [9484].pack('U'), + '═' => [9552].pack('U'), + '─' => [9472].pack('U'), + '╦' => [9574].pack('U'), + '╤' => [9572].pack('U'), + '╥' => [9573].pack('U'), + '┬' => [9516].pack('U'), + '╩' => [9577].pack('U'), + '╧' => [9575].pack('U'), + '╨' => [9576].pack('U'), + '┴' => [9524].pack('U'), + '⊟' => [8863].pack('U'), + '⊞' => [8862].pack('U'), + '⊠' => [8864].pack('U'), + '╝' => [9565].pack('U'), + '╜' => [9564].pack('U'), + '╛' => [9563].pack('U'), + '┘' => [9496].pack('U'), + '╚' => [9562].pack('U'), + '╙' => [9561].pack('U'), + '╘' => [9560].pack('U'), + '└' => [9492].pack('U'), + '║' => [9553].pack('U'), + '│' => [9474].pack('U'), + '╬' => [9580].pack('U'), + '╫' => [9579].pack('U'), + '╪' => [9578].pack('U'), + '┼' => [9532].pack('U'), + '╣' => [9571].pack('U'), + '╢' => [9570].pack('U'), + '╡' => [9569].pack('U'), + '┤' => [9508].pack('U'), + '╠' => [9568].pack('U'), + '╟' => [9567].pack('U'), + '╞' => [9566].pack('U'), + '├' => [9500].pack('U'), + '‵' => [8245].pack('U'), + '˘' => [728].pack('U'), + '˘' => [728].pack('U'), + '¦' => [166].pack('U'), + 'ℬ' => [8492].pack('U'), + '𝒷' => [119991].pack('U'), + '⁏' => [8271].pack('U'), + '∽' => [8765].pack('U'), + '⋍' => [8909].pack('U'), + '\' => [92].pack('U'), + '⧅' => [10693].pack('U'), + '⟈' => [10184].pack('U'), + '•' => [8226].pack('U'), + '•' => [8226].pack('U'), + '≎' => [8782].pack('U'), + '⪮' => [10926].pack('U'), + '≏' => [8783].pack('U'), + '≎' => [8782].pack('U'), + '≏' => [8783].pack('U'), + 'Ć' => [262].pack('U'), + 'ć' => [263].pack('U'), + '⋒' => 
[8914].pack('U'), + '∩' => [8745].pack('U'), + '⩄' => [10820].pack('U'), + '⩉' => [10825].pack('U'), + '⩋' => [10827].pack('U'), + '⩇' => [10823].pack('U'), + '⩀' => [10816].pack('U'), + 'ⅅ' => [8517].pack('U'), + '∩︀' => [8745, 65024].pack('U'), + '⁁' => [8257].pack('U'), + 'ˇ' => [711].pack('U'), + 'ℭ' => [8493].pack('U'), + '⩍' => [10829].pack('U'), + 'Č' => [268].pack('U'), + 'č' => [269].pack('U'), + 'Ç' => [199].pack('U'), + 'ç' => [231].pack('U'), + 'Ĉ' => [264].pack('U'), + 'ĉ' => [265].pack('U'), + '∰' => [8752].pack('U'), + '⩌' => [10828].pack('U'), + '⩐' => [10832].pack('U'), + 'Ċ' => [266].pack('U'), + 'ċ' => [267].pack('U'), + '¸' => [184].pack('U'), + '¸' => [184].pack('U'), + '⦲' => [10674].pack('U'), + '¢' => [162].pack('U'), + '·' => [183].pack('U'), + '·' => [183].pack('U'), + 'ℭ' => [8493].pack('U'), + '𝔠' => [120096].pack('U'), + 'Ч' => [1063].pack('U'), + 'ч' => [1095].pack('U'), + '✓' => [10003].pack('U'), + '✓' => [10003].pack('U'), + 'Χ' => [935].pack('U'), + 'χ' => [967].pack('U'), + '○' => [9675].pack('U'), + 'ˆ' => [710].pack('U'), + '≗' => [8791].pack('U'), + '↺' => [8634].pack('U'), + '↻' => [8635].pack('U'), + '⊛' => [8859].pack('U'), + '⊚' => [8858].pack('U'), + '⊝' => [8861].pack('U'), + '⊙' => [8857].pack('U'), + '®' => [174].pack('U'), + 'Ⓢ' => [9416].pack('U'), + '⊖' => [8854].pack('U'), + '⊕' => [8853].pack('U'), + '⊗' => [8855].pack('U'), + '⧃' => [10691].pack('U'), + '≗' => [8791].pack('U'), + '⨐' => [10768].pack('U'), + '⫯' => [10991].pack('U'), + '⧂' => [10690].pack('U'), + '∲' => [8754].pack('U'), + '”' => [8221].pack('U'), + '’' => [8217].pack('U'), + '♣' => [9827].pack('U'), + '♣' => [9827].pack('U'), + '∷' => [8759].pack('U'), + ':' => [58].pack('U'), + '⩴' => [10868].pack('U'), + '≔' => [8788].pack('U'), + '≔' => [8788].pack('U'), + ',' => [44].pack('U'), + '@' => [64].pack('U'), + '∁' => [8705].pack('U'), + '∘' => [8728].pack('U'), + '∁' => [8705].pack('U'), + 'ℂ' => [8450].pack('U'), + '≅' => [8773].pack('U'), + '⩭' => 
[10861].pack('U'), + '≡' => [8801].pack('U'), + '∯' => [8751].pack('U'), + '∮' => [8750].pack('U'), + '∮' => [8750].pack('U'), + 'ℂ' => [8450].pack('U'), + '𝕔' => [120148].pack('U'), + '∐' => [8720].pack('U'), + '∐' => [8720].pack('U'), + '©' => [169].pack('U'), + '©' => [169].pack('U'), + '℗' => [8471].pack('U'), + '∳' => [8755].pack('U'), + '↵' => [8629].pack('U'), + '⨯' => [10799].pack('U'), + '✗' => [10007].pack('U'), + '𝒞' => [119966].pack('U'), + '𝒸' => [119992].pack('U'), + '⫏' => [10959].pack('U'), + '⫑' => [10961].pack('U'), + '⫐' => [10960].pack('U'), + '⫒' => [10962].pack('U'), + '⋯' => [8943].pack('U'), + '⤸' => [10552].pack('U'), + '⤵' => [10549].pack('U'), + '⋞' => [8926].pack('U'), + '⋟' => [8927].pack('U'), + '↶' => [8630].pack('U'), + '⤽' => [10557].pack('U'), + '⋓' => [8915].pack('U'), + '∪' => [8746].pack('U'), + '⩈' => [10824].pack('U'), + '≍' => [8781].pack('U'), + '⩆' => [10822].pack('U'), + '⩊' => [10826].pack('U'), + '⊍' => [8845].pack('U'), + '⩅' => [10821].pack('U'), + '∪︀' => [8746, 65024].pack('U'), + '↷' => [8631].pack('U'), + '⤼' => [10556].pack('U'), + '⋞' => [8926].pack('U'), + '⋟' => [8927].pack('U'), + '⋎' => [8910].pack('U'), + '⋏' => [8911].pack('U'), + '¤' => [164].pack('U'), + '↶' => [8630].pack('U'), + '↷' => [8631].pack('U'), + '⋎' => [8910].pack('U'), + '⋏' => [8911].pack('U'), + '∲' => [8754].pack('U'), + '∱' => [8753].pack('U'), + '⌭' => [9005].pack('U'), + '‡' => [8225].pack('U'), + '†' => [8224].pack('U'), + 'ℸ' => [8504].pack('U'), + '↡' => [8609].pack('U'), + '⇓' => [8659].pack('U'), + '↓' => [8595].pack('U'), + '‐' => [8208].pack('U'), + '⫤' => [10980].pack('U'), + '⊣' => [8867].pack('U'), + '⤏' => [10511].pack('U'), + '˝' => [733].pack('U'), + 'Ď' => [270].pack('U'), + 'ď' => [271].pack('U'), + 'Д' => [1044].pack('U'), + 'д' => [1076].pack('U'), + 'ⅅ' => [8517].pack('U'), + 'ⅆ' => [8518].pack('U'), + '‡' => [8225].pack('U'), + '⇊' => [8650].pack('U'), + '⤑' => [10513].pack('U'), + '⩷' => [10871].pack('U'), + '°' => 
[176].pack('U'), + '∇' => [8711].pack('U'), + 'Δ' => [916].pack('U'), + 'δ' => [948].pack('U'), + '⦱' => [10673].pack('U'), + '⥿' => [10623].pack('U'), + '𝔇' => [120071].pack('U'), + '𝔡' => [120097].pack('U'), + '⥥' => [10597].pack('U'), + '⇃' => [8643].pack('U'), + '⇂' => [8642].pack('U'), + '´' => [180].pack('U'), + '˙' => [729].pack('U'), + '˝' => [733].pack('U'), + '`' => [96].pack('U'), + '˜' => [732].pack('U'), + '⋄' => [8900].pack('U'), + '⋄' => [8900].pack('U'), + '⋄' => [8900].pack('U'), + '♦' => [9830].pack('U'), + '♦' => [9830].pack('U'), + '¨' => [168].pack('U'), + 'ⅆ' => [8518].pack('U'), + 'ϝ' => [989].pack('U'), + '⋲' => [8946].pack('U'), + '÷' => [247].pack('U'), + '÷' => [247].pack('U'), + '⋇' => [8903].pack('U'), + '⋇' => [8903].pack('U'), + 'Ђ' => [1026].pack('U'), + 'ђ' => [1106].pack('U'), + '⌞' => [8990].pack('U'), + '⌍' => [8973].pack('U'), + '$' => [36].pack('U'), + '𝔻' => [120123].pack('U'), + '𝕕' => [120149].pack('U'), + '¨' => [168].pack('U'), + '˙' => [729].pack('U'), + '⃜' => [8412].pack('U'), + '≐' => [8784].pack('U'), + '≑' => [8785].pack('U'), + '≐' => [8784].pack('U'), + '∸' => [8760].pack('U'), + '∔' => [8724].pack('U'), + '⊡' => [8865].pack('U'), + '⌆' => [8966].pack('U'), + '∯' => [8751].pack('U'), + '¨' => [168].pack('U'), + '⇓' => [8659].pack('U'), + '⇐' => [8656].pack('U'), + '⇔' => [8660].pack('U'), + '⫤' => [10980].pack('U'), + '⟸' => [10232].pack('U'), + '⟺' => [10234].pack('U'), + '⟹' => [10233].pack('U'), + '⇒' => [8658].pack('U'), + '⊨' => [8872].pack('U'), + '⇑' => [8657].pack('U'), + '⇕' => [8661].pack('U'), + '∥' => [8741].pack('U'), + '↓' => [8595].pack('U'), + '⇓' => [8659].pack('U'), + '↓' => [8595].pack('U'), + '⤓' => [10515].pack('U'), + '⇵' => [8693].pack('U'), + '̑' => [785].pack('U'), + '⇊' => [8650].pack('U'), + '⇃' => [8643].pack('U'), + '⇂' => [8642].pack('U'), + '⥐' => [10576].pack('U'), + '⥞' => [10590].pack('U'), + '↽' => [8637].pack('U'), + '⥖' => [10582].pack('U'), + '⥟' => [10591].pack('U'), + '⇁' => 
[8641].pack('U'), + '⥗' => [10583].pack('U'), + '⊤' => [8868].pack('U'), + '↧' => [8615].pack('U'), + '⤐' => [10512].pack('U'), + '⌟' => [8991].pack('U'), + '⌌' => [8972].pack('U'), + '𝒟' => [119967].pack('U'), + '𝒹' => [119993].pack('U'), + 'Ѕ' => [1029].pack('U'), + 'ѕ' => [1109].pack('U'), + '⧶' => [10742].pack('U'), + 'Đ' => [272].pack('U'), + 'đ' => [273].pack('U'), + '⋱' => [8945].pack('U'), + '▿' => [9663].pack('U'), + '▾' => [9662].pack('U'), + '⇵' => [8693].pack('U'), + '⥯' => [10607].pack('U'), + '⦦' => [10662].pack('U'), + 'Џ' => [1039].pack('U'), + 'џ' => [1119].pack('U'), + '⟿' => [10239].pack('U'), + 'É' => [201].pack('U'), + 'é' => [233].pack('U'), + '⩮' => [10862].pack('U'), + 'Ě' => [282].pack('U'), + 'ě' => [283].pack('U'), + '≖' => [8790].pack('U'), + 'Ê' => [202].pack('U'), + 'ê' => [234].pack('U'), + '≕' => [8789].pack('U'), + 'Э' => [1069].pack('U'), + 'э' => [1101].pack('U'), + '⩷' => [10871].pack('U'), + 'Ė' => [278].pack('U'), + '≑' => [8785].pack('U'), + 'ė' => [279].pack('U'), + 'ⅇ' => [8519].pack('U'), + '≒' => [8786].pack('U'), + '𝔈' => [120072].pack('U'), + '𝔢' => [120098].pack('U'), + '⪚' => [10906].pack('U'), + 'È' => [200].pack('U'), + 'è' => [232].pack('U'), + '⪖' => [10902].pack('U'), + '⪘' => [10904].pack('U'), + '⪙' => [10905].pack('U'), + '∈' => [8712].pack('U'), + '⏧' => [9191].pack('U'), + 'ℓ' => [8467].pack('U'), + '⪕' => [10901].pack('U'), + '⪗' => [10903].pack('U'), + 'Ē' => [274].pack('U'), + 'ē' => [275].pack('U'), + '∅' => [8709].pack('U'), + '∅' => [8709].pack('U'), + '◻' => [9723].pack('U'), + '∅' => [8709].pack('U'), + '▫' => [9643].pack('U'), + ' ' => [8195].pack('U'), + ' ' => [8196].pack('U'), + ' ' => [8197].pack('U'), + 'Ŋ' => [330].pack('U'), + 'ŋ' => [331].pack('U'), + ' ' => [8194].pack('U'), + 'Ę' => [280].pack('U'), + 'ę' => [281].pack('U'), + '𝔼' => [120124].pack('U'), + '𝕖' => [120150].pack('U'), + '⋕' => [8917].pack('U'), + '⧣' => [10723].pack('U'), + '⩱' => [10865].pack('U'), + 'ε' => [949].pack('U'), + 
'Ε' => [917].pack('U'), + 'ε' => [949].pack('U'), + 'ϵ' => [1013].pack('U'), + '≖' => [8790].pack('U'), + '≕' => [8789].pack('U'), + '≂' => [8770].pack('U'), + '⪖' => [10902].pack('U'), + '⪕' => [10901].pack('U'), + '⩵' => [10869].pack('U'), + '=' => [61].pack('U'), + '≂' => [8770].pack('U'), + '≟' => [8799].pack('U'), + '⇌' => [8652].pack('U'), + '≡' => [8801].pack('U'), + '⩸' => [10872].pack('U'), + '⧥' => [10725].pack('U'), + '⥱' => [10609].pack('U'), + '≓' => [8787].pack('U'), + 'ℰ' => [8496].pack('U'), + 'ℯ' => [8495].pack('U'), + '≐' => [8784].pack('U'), + '⩳' => [10867].pack('U'), + '≂' => [8770].pack('U'), + 'Η' => [919].pack('U'), + 'η' => [951].pack('U'), + 'Ð' => [208].pack('U'), + 'ð' => [240].pack('U'), + 'Ë' => [203].pack('U'), + 'ë' => [235].pack('U'), + '€' => [8364].pack('U'), + '!' => [33].pack('U'), + '∃' => [8707].pack('U'), + '∃' => [8707].pack('U'), + 'ℰ' => [8496].pack('U'), + 'ⅇ' => [8519].pack('U'), + 'ⅇ' => [8519].pack('U'), + '≒' => [8786].pack('U'), + 'Ф' => [1060].pack('U'), + 'ф' => [1092].pack('U'), + '♀' => [9792].pack('U'), + 'ffi' => [64259].pack('U'), + 'ff' => [64256].pack('U'), + 'ffl' => [64260].pack('U'), + '𝔉' => [120073].pack('U'), + '𝔣' => [120099].pack('U'), + 'fi' => [64257].pack('U'), + '◼' => [9724].pack('U'), + '▪' => [9642].pack('U'), + 'fj' => [102, 106].pack('U'), + '♭' => [9837].pack('U'), + 'fl' => [64258].pack('U'), + '▱' => [9649].pack('U'), + 'ƒ' => [402].pack('U'), + '𝔽' => [120125].pack('U'), + '𝕗' => [120151].pack('U'), + '∀' => [8704].pack('U'), + '∀' => [8704].pack('U'), + '⋔' => [8916].pack('U'), + '⫙' => [10969].pack('U'), + 'ℱ' => [8497].pack('U'), + '⨍' => [10765].pack('U'), + '½' => [189].pack('U'), + '⅓' => [8531].pack('U'), + '¼' => [188].pack('U'), + '⅕' => [8533].pack('U'), + '⅙' => [8537].pack('U'), + '⅛' => [8539].pack('U'), + '⅔' => [8532].pack('U'), + '⅖' => [8534].pack('U'), + '¾' => [190].pack('U'), + '⅗' => [8535].pack('U'), + '⅜' => [8540].pack('U'), + '⅘' => [8536].pack('U'), + '⅚' => 
[8538].pack('U'), + '⅝' => [8541].pack('U'), + '⅞' => [8542].pack('U'), + '⁄' => [8260].pack('U'), + '⌢' => [8994].pack('U'), + 'ℱ' => [8497].pack('U'), + '𝒻' => [119995].pack('U'), + 'ǵ' => [501].pack('U'), + 'Γ' => [915].pack('U'), + 'γ' => [947].pack('U'), + 'Ϝ' => [988].pack('U'), + 'ϝ' => [989].pack('U'), + '⪆' => [10886].pack('U'), + 'Ğ' => [286].pack('U'), + 'ğ' => [287].pack('U'), + 'Ģ' => [290].pack('U'), + 'Ĝ' => [284].pack('U'), + 'ĝ' => [285].pack('U'), + 'Г' => [1043].pack('U'), + 'г' => [1075].pack('U'), + 'Ġ' => [288].pack('U'), + 'ġ' => [289].pack('U'), + '≧' => [8807].pack('U'), + '≥' => [8805].pack('U'), + '⪌' => [10892].pack('U'), + '⋛' => [8923].pack('U'), + '≥' => [8805].pack('U'), + '≧' => [8807].pack('U'), + '⩾' => [10878].pack('U'), + '⩾' => [10878].pack('U'), + '⪩' => [10921].pack('U'), + '⪀' => [10880].pack('U'), + '⪂' => [10882].pack('U'), + '⪄' => [10884].pack('U'), + '⋛︀' => [8923, 65024].pack('U'), + '⪔' => [10900].pack('U'), + '𝔊' => [120074].pack('U'), + '𝔤' => [120100].pack('U'), + '⋙' => [8921].pack('U'), + '≫' => [8811].pack('U'), + '⋙' => [8921].pack('U'), + 'ℷ' => [8503].pack('U'), + 'Ѓ' => [1027].pack('U'), + 'ѓ' => [1107].pack('U'), + '≷' => [8823].pack('U'), + '⪥' => [10917].pack('U'), + '⪒' => [10898].pack('U'), + '⪤' => [10916].pack('U'), + '⪊' => [10890].pack('U'), + '⪊' => [10890].pack('U'), + '≩' => [8809].pack('U'), + '⪈' => [10888].pack('U'), + '⪈' => [10888].pack('U'), + '≩' => [8809].pack('U'), + '⋧' => [8935].pack('U'), + '𝔾' => [120126].pack('U'), + '𝕘' => [120152].pack('U'), + '`' => [96].pack('U'), + '≥' => [8805].pack('U'), + '⋛' => [8923].pack('U'), + '≧' => [8807].pack('U'), + '⪢' => [10914].pack('U'), + '≷' => [8823].pack('U'), + '⩾' => [10878].pack('U'), + '≳' => [8819].pack('U'), + '𝒢' => [119970].pack('U'), + 'ℊ' => [8458].pack('U'), + '≳' => [8819].pack('U'), + '⪎' => [10894].pack('U'), + '⪐' => [10896].pack('U'), + '>' => [62].pack('U'), + '≫' => [8811].pack('U'), + '>' => [62].pack('U'), + '⪧' => 
[10919].pack('U'), + '⩺' => [10874].pack('U'), + '⋗' => [8919].pack('U'), + '⦕' => [10645].pack('U'), + '⩼' => [10876].pack('U'), + '⪆' => [10886].pack('U'), + '⥸' => [10616].pack('U'), + '⋗' => [8919].pack('U'), + '⋛' => [8923].pack('U'), + '⪌' => [10892].pack('U'), + '≷' => [8823].pack('U'), + '≳' => [8819].pack('U'), + '≩︀' => [8809, 65024].pack('U'), + '≩︀' => [8809, 65024].pack('U'), + 'ˇ' => [711].pack('U'), + ' ' => [8202].pack('U'), + '½' => [189].pack('U'), + 'ℋ' => [8459].pack('U'), + 'Ъ' => [1066].pack('U'), + 'ъ' => [1098].pack('U'), + '⇔' => [8660].pack('U'), + '↔' => [8596].pack('U'), + '⥈' => [10568].pack('U'), + '↭' => [8621].pack('U'), + '^' => [94].pack('U'), + 'ℏ' => [8463].pack('U'), + 'Ĥ' => [292].pack('U'), + 'ĥ' => [293].pack('U'), + '♥' => [9829].pack('U'), + '♥' => [9829].pack('U'), + '…' => [8230].pack('U'), + '⊹' => [8889].pack('U'), + 'ℌ' => [8460].pack('U'), + '𝔥' => [120101].pack('U'), + 'ℋ' => [8459].pack('U'), + '⤥' => [10533].pack('U'), + '⤦' => [10534].pack('U'), + '⇿' => [8703].pack('U'), + '∻' => [8763].pack('U'), + '↩' => [8617].pack('U'), + '↪' => [8618].pack('U'), + 'ℍ' => [8461].pack('U'), + '𝕙' => [120153].pack('U'), + '―' => [8213].pack('U'), + '─' => [9472].pack('U'), + 'ℋ' => [8459].pack('U'), + '𝒽' => [119997].pack('U'), + 'ℏ' => [8463].pack('U'), + 'Ħ' => [294].pack('U'), + 'ħ' => [295].pack('U'), + '≎' => [8782].pack('U'), + '≏' => [8783].pack('U'), + '⁃' => [8259].pack('U'), + '‐' => [8208].pack('U'), + 'Í' => [205].pack('U'), + 'í' => [237].pack('U'), + '⁣' => [8291].pack('U'), + 'Î' => [206].pack('U'), + 'î' => [238].pack('U'), + 'И' => [1048].pack('U'), + 'и' => [1080].pack('U'), + 'İ' => [304].pack('U'), + 'Е' => [1045].pack('U'), + 'е' => [1077].pack('U'), + '¡' => [161].pack('U'), + '⇔' => [8660].pack('U'), + 'ℑ' => [8465].pack('U'), + '𝔦' => [120102].pack('U'), + 'Ì' => [204].pack('U'), + 'ì' => [236].pack('U'), + 'ⅈ' => [8520].pack('U'), + '⨌' => [10764].pack('U'), + '∭' => [8749].pack('U'), + '⧜' => 
[10716].pack('U'), + '℩' => [8489].pack('U'), + 'IJ' => [306].pack('U'), + 'ij' => [307].pack('U'), + 'ℑ' => [8465].pack('U'), + 'Ī' => [298].pack('U'), + 'ī' => [299].pack('U'), + 'ℑ' => [8465].pack('U'), + 'ⅈ' => [8520].pack('U'), + 'ℐ' => [8464].pack('U'), + 'ℑ' => [8465].pack('U'), + 'ı' => [305].pack('U'), + '⊷' => [8887].pack('U'), + 'Ƶ' => [437].pack('U'), + '⇒' => [8658].pack('U'), + '∈' => [8712].pack('U'), + '℅' => [8453].pack('U'), + '∞' => [8734].pack('U'), + '⧝' => [10717].pack('U'), + 'ı' => [305].pack('U'), + '∬' => [8748].pack('U'), + '∫' => [8747].pack('U'), + '⊺' => [8890].pack('U'), + 'ℤ' => [8484].pack('U'), + '∫' => [8747].pack('U'), + '⊺' => [8890].pack('U'), + '⋂' => [8898].pack('U'), + '⨗' => [10775].pack('U'), + '⨼' => [10812].pack('U'), + '⁣' => [8291].pack('U'), + '⁢' => [8290].pack('U'), + 'Ё' => [1025].pack('U'), + 'ё' => [1105].pack('U'), + 'Į' => [302].pack('U'), + 'į' => [303].pack('U'), + '𝕀' => [120128].pack('U'), + '𝕚' => [120154].pack('U'), + 'Ι' => [921].pack('U'), + 'ι' => [953].pack('U'), + '⨼' => [10812].pack('U'), + '¿' => [191].pack('U'), + 'ℐ' => [8464].pack('U'), + '𝒾' => [119998].pack('U'), + '∈' => [8712].pack('U'), + '⋵' => [8949].pack('U'), + '⋹' => [8953].pack('U'), + '⋴' => [8948].pack('U'), + '⋳' => [8947].pack('U'), + '∈' => [8712].pack('U'), + '⁢' => [8290].pack('U'), + 'Ĩ' => [296].pack('U'), + 'ĩ' => [297].pack('U'), + 'І' => [1030].pack('U'), + 'і' => [1110].pack('U'), + 'Ï' => [207].pack('U'), + 'ï' => [239].pack('U'), + 'Ĵ' => [308].pack('U'), + 'ĵ' => [309].pack('U'), + 'Й' => [1049].pack('U'), + 'й' => [1081].pack('U'), + '𝔍' => [120077].pack('U'), + '𝔧' => [120103].pack('U'), + 'ȷ' => [567].pack('U'), + '𝕁' => [120129].pack('U'), + '𝕛' => [120155].pack('U'), + '𝒥' => [119973].pack('U'), + '𝒿' => [119999].pack('U'), + 'Ј' => [1032].pack('U'), + 'ј' => [1112].pack('U'), + 'Є' => [1028].pack('U'), + 'є' => [1108].pack('U'), + 'Κ' => [922].pack('U'), + 'κ' => [954].pack('U'), + 'ϰ' => [1008].pack('U'), + 'Ķ' 
=> [310].pack('U'), + 'ķ' => [311].pack('U'), + 'К' => [1050].pack('U'), + 'к' => [1082].pack('U'), + '𝔎' => [120078].pack('U'), + '𝔨' => [120104].pack('U'), + 'ĸ' => [312].pack('U'), + 'Х' => [1061].pack('U'), + 'х' => [1093].pack('U'), + 'Ќ' => [1036].pack('U'), + 'ќ' => [1116].pack('U'), + '𝕂' => [120130].pack('U'), + '𝕜' => [120156].pack('U'), + '𝒦' => [119974].pack('U'), + '𝓀' => [120000].pack('U'), + '⇚' => [8666].pack('U'), + 'Ĺ' => [313].pack('U'), + 'ĺ' => [314].pack('U'), + '⦴' => [10676].pack('U'), + 'ℒ' => [8466].pack('U'), + 'Λ' => [923].pack('U'), + 'λ' => [955].pack('U'), + '⟪' => [10218].pack('U'), + '⟨' => [10216].pack('U'), + '⦑' => [10641].pack('U'), + '⟨' => [10216].pack('U'), + '⪅' => [10885].pack('U'), + 'ℒ' => [8466].pack('U'), + '«' => [171].pack('U'), + '↞' => [8606].pack('U'), + '⇐' => [8656].pack('U'), + '←' => [8592].pack('U'), + '⇤' => [8676].pack('U'), + '⤟' => [10527].pack('U'), + '⤝' => [10525].pack('U'), + '↩' => [8617].pack('U'), + '↫' => [8619].pack('U'), + '⤹' => [10553].pack('U'), + '⥳' => [10611].pack('U'), + '↢' => [8610].pack('U'), + '⪫' => [10923].pack('U'), + '⤛' => [10523].pack('U'), + '⤙' => [10521].pack('U'), + '⪭' => [10925].pack('U'), + '⪭︀' => [10925, 65024].pack('U'), + '⤎' => [10510].pack('U'), + '⤌' => [10508].pack('U'), + '❲' => [10098].pack('U'), + '{' => [123].pack('U'), + '[' => [91].pack('U'), + '⦋' => [10635].pack('U'), + '⦏' => [10639].pack('U'), + '⦍' => [10637].pack('U'), + 'Ľ' => [317].pack('U'), + 'ľ' => [318].pack('U'), + 'Ļ' => [315].pack('U'), + 'ļ' => [316].pack('U'), + '⌈' => [8968].pack('U'), + '{' => [123].pack('U'), + 'Л' => [1051].pack('U'), + 'л' => [1083].pack('U'), + '⤶' => [10550].pack('U'), + '“' => [8220].pack('U'), + '„' => [8222].pack('U'), + '⥧' => [10599].pack('U'), + '⥋' => [10571].pack('U'), + '↲' => [8626].pack('U'), + '≦' => [8806].pack('U'), + '≤' => [8804].pack('U'), + '⟨' => [10216].pack('U'), + '←' => [8592].pack('U'), + '⇐' => [8656].pack('U'), + '←' => [8592].pack('U'), + '⇤' 
=> [8676].pack('U'), + '⇆' => [8646].pack('U'), + '↢' => [8610].pack('U'), + '⌈' => [8968].pack('U'), + '⟦' => [10214].pack('U'), + '⥡' => [10593].pack('U'), + '⇃' => [8643].pack('U'), + '⥙' => [10585].pack('U'), + '⌊' => [8970].pack('U'), + '↽' => [8637].pack('U'), + '↼' => [8636].pack('U'), + '⇇' => [8647].pack('U'), + '↔' => [8596].pack('U'), + '⇔' => [8660].pack('U'), + '↔' => [8596].pack('U'), + '⇆' => [8646].pack('U'), + '⇋' => [8651].pack('U'), + '↭' => [8621].pack('U'), + '⥎' => [10574].pack('U'), + '⊣' => [8867].pack('U'), + '↤' => [8612].pack('U'), + '⥚' => [10586].pack('U'), + '⋋' => [8907].pack('U'), + '⊲' => [8882].pack('U'), + '⧏' => [10703].pack('U'), + '⊴' => [8884].pack('U'), + '⥑' => [10577].pack('U'), + '⥠' => [10592].pack('U'), + '↿' => [8639].pack('U'), + '⥘' => [10584].pack('U'), + '↼' => [8636].pack('U'), + '⥒' => [10578].pack('U'), + '⪋' => [10891].pack('U'), + '⋚' => [8922].pack('U'), + '≤' => [8804].pack('U'), + '≦' => [8806].pack('U'), + '⩽' => [10877].pack('U'), + '⩽' => [10877].pack('U'), + '⪨' => [10920].pack('U'), + '⩿' => [10879].pack('U'), + '⪁' => [10881].pack('U'), + '⪃' => [10883].pack('U'), + '⋚︀' => [8922, 65024].pack('U'), + '⪓' => [10899].pack('U'), + '⪅' => [10885].pack('U'), + '⋖' => [8918].pack('U'), + '⋚' => [8922].pack('U'), + '⪋' => [10891].pack('U'), + '⋚' => [8922].pack('U'), + '≦' => [8806].pack('U'), + '≶' => [8822].pack('U'), + '≶' => [8822].pack('U'), + '⪡' => [10913].pack('U'), + '≲' => [8818].pack('U'), + '⩽' => [10877].pack('U'), + '≲' => [8818].pack('U'), + '⥼' => [10620].pack('U'), + '⌊' => [8970].pack('U'), + '𝔏' => [120079].pack('U'), + '𝔩' => [120105].pack('U'), + '≶' => [8822].pack('U'), + '⪑' => [10897].pack('U'), + '⥢' => [10594].pack('U'), + '↽' => [8637].pack('U'), + '↼' => [8636].pack('U'), + '⥪' => [10602].pack('U'), + '▄' => [9604].pack('U'), + 'Љ' => [1033].pack('U'), + 'љ' => [1113].pack('U'), + '⋘' => [8920].pack('U'), + '≪' => [8810].pack('U'), + '⇇' => [8647].pack('U'), + '⌞' => 
[8990].pack('U'), + '⇚' => [8666].pack('U'), + '⥫' => [10603].pack('U'), + '◺' => [9722].pack('U'), + 'Ŀ' => [319].pack('U'), + 'ŀ' => [320].pack('U'), + '⎰' => [9136].pack('U'), + '⎰' => [9136].pack('U'), + '⪉' => [10889].pack('U'), + '⪉' => [10889].pack('U'), + '≨' => [8808].pack('U'), + '⪇' => [10887].pack('U'), + '⪇' => [10887].pack('U'), + '≨' => [8808].pack('U'), + '⋦' => [8934].pack('U'), + '⟬' => [10220].pack('U'), + '⇽' => [8701].pack('U'), + '⟦' => [10214].pack('U'), + '⟵' => [10229].pack('U'), + '⟸' => [10232].pack('U'), + '⟵' => [10229].pack('U'), + '⟷' => [10231].pack('U'), + '⟺' => [10234].pack('U'), + '⟷' => [10231].pack('U'), + '⟼' => [10236].pack('U'), + '⟶' => [10230].pack('U'), + '⟹' => [10233].pack('U'), + '⟶' => [10230].pack('U'), + '↫' => [8619].pack('U'), + '↬' => [8620].pack('U'), + '⦅' => [10629].pack('U'), + '𝕃' => [120131].pack('U'), + '𝕝' => [120157].pack('U'), + '⨭' => [10797].pack('U'), + '⨴' => [10804].pack('U'), + '∗' => [8727].pack('U'), + '_' => [95].pack('U'), + '↙' => [8601].pack('U'), + '↘' => [8600].pack('U'), + '◊' => [9674].pack('U'), + '◊' => [9674].pack('U'), + '⧫' => [10731].pack('U'), + '(' => [40].pack('U'), + '⦓' => [10643].pack('U'), + '⇆' => [8646].pack('U'), + '⌟' => [8991].pack('U'), + '⇋' => [8651].pack('U'), + '⥭' => [10605].pack('U'), + '‎' => [8206].pack('U'), + '⊿' => [8895].pack('U'), + '‹' => [8249].pack('U'), + 'ℒ' => [8466].pack('U'), + '𝓁' => [120001].pack('U'), + '↰' => [8624].pack('U'), + '↰' => [8624].pack('U'), + '≲' => [8818].pack('U'), + '⪍' => [10893].pack('U'), + '⪏' => [10895].pack('U'), + '[' => [91].pack('U'), + '‘' => [8216].pack('U'), + '‚' => [8218].pack('U'), + 'Ł' => [321].pack('U'), + 'ł' => [322].pack('U'), + '<' => [60].pack('U'), + '≪' => [8810].pack('U'), + '<' => [60].pack('U'), + '⪦' => [10918].pack('U'), + '⩹' => [10873].pack('U'), + '⋖' => [8918].pack('U'), + '⋋' => [8907].pack('U'), + '⋉' => [8905].pack('U'), + '⥶' => [10614].pack('U'), + '⩻' => [10875].pack('U'), + '◃' => 
[9667].pack('U'), + '⊴' => [8884].pack('U'), + '◂' => [9666].pack('U'), + '⦖' => [10646].pack('U'), + '⥊' => [10570].pack('U'), + '⥦' => [10598].pack('U'), + '≨︀' => [8808, 65024].pack('U'), + '≨︀' => [8808, 65024].pack('U'), + '¯' => [175].pack('U'), + '♂' => [9794].pack('U'), + '✠' => [10016].pack('U'), + '✠' => [10016].pack('U'), + '⤅' => [10501].pack('U'), + '↦' => [8614].pack('U'), + '↦' => [8614].pack('U'), + '↧' => [8615].pack('U'), + '↤' => [8612].pack('U'), + '↥' => [8613].pack('U'), + '▮' => [9646].pack('U'), + '⨩' => [10793].pack('U'), + 'М' => [1052].pack('U'), + 'м' => [1084].pack('U'), + '—' => [8212].pack('U'), + '∺' => [8762].pack('U'), + '∡' => [8737].pack('U'), + ' ' => [8287].pack('U'), + 'ℳ' => [8499].pack('U'), + '𝔐' => [120080].pack('U'), + '𝔪' => [120106].pack('U'), + '℧' => [8487].pack('U'), + 'µ' => [181].pack('U'), + '∣' => [8739].pack('U'), + '*' => [42].pack('U'), + '⫰' => [10992].pack('U'), + '·' => [183].pack('U'), + '−' => [8722].pack('U'), + '⊟' => [8863].pack('U'), + '∸' => [8760].pack('U'), + '⨪' => [10794].pack('U'), + '∓' => [8723].pack('U'), + '⫛' => [10971].pack('U'), + '…' => [8230].pack('U'), + '∓' => [8723].pack('U'), + '⊧' => [8871].pack('U'), + '𝕄' => [120132].pack('U'), + '𝕞' => [120158].pack('U'), + '∓' => [8723].pack('U'), + 'ℳ' => [8499].pack('U'), + '𝓂' => [120002].pack('U'), + '∾' => [8766].pack('U'), + 'Μ' => [924].pack('U'), + 'μ' => [956].pack('U'), + '⊸' => [8888].pack('U'), + '⊸' => [8888].pack('U'), + '∇' => [8711].pack('U'), + 'Ń' => [323].pack('U'), + 'ń' => [324].pack('U'), + '∠⃒' => [8736, 8402].pack('U'), + '≉' => [8777].pack('U'), + '⩰̸' => [10864, 824].pack('U'), + '≋̸' => [8779, 824].pack('U'), + 'ʼn' => [329].pack('U'), + '≉' => [8777].pack('U'), + '♮' => [9838].pack('U'), + '♮' => [9838].pack('U'), + 'ℕ' => [8469].pack('U'), + ' ' => [160].pack('U'), + '≎̸' => [8782, 824].pack('U'), + '≏̸' => [8783, 824].pack('U'), + '⩃' => [10819].pack('U'), + 'Ň' => [327].pack('U'), + 'ň' => [328].pack('U'), + 'Ņ' 
=> [325].pack('U'), + 'ņ' => [326].pack('U'), + '≇' => [8775].pack('U'), + '⩭̸' => [10861, 824].pack('U'), + '⩂' => [10818].pack('U'), + 'Н' => [1053].pack('U'), + 'н' => [1085].pack('U'), + '–' => [8211].pack('U'), + '≠' => [8800].pack('U'), + '⤤' => [10532].pack('U'), + '⇗' => [8663].pack('U'), + '↗' => [8599].pack('U'), + '↗' => [8599].pack('U'), + '≐̸' => [8784, 824].pack('U'), + '​' => [8203].pack('U'), + '​' => [8203].pack('U'), + '​' => [8203].pack('U'), + '​' => [8203].pack('U'), + '≢' => [8802].pack('U'), + '⤨' => [10536].pack('U'), + '≂̸' => [8770, 824].pack('U'), + '≫' => [8811].pack('U'), + '≪' => [8810].pack('U'), + ' ' => [10].pack('U'), + '∄' => [8708].pack('U'), + '∄' => [8708].pack('U'), + '𝔑' => [120081].pack('U'), + '𝔫' => [120107].pack('U'), + '≧̸' => [8807, 824].pack('U'), + '≱' => [8817].pack('U'), + '≱' => [8817].pack('U'), + '≧̸' => [8807, 824].pack('U'), + '⩾̸' => [10878, 824].pack('U'), + '⩾̸' => [10878, 824].pack('U'), + '⋙̸' => [8921, 824].pack('U'), + '≵' => [8821].pack('U'), + '≫⃒' => [8811, 8402].pack('U'), + '≯' => [8815].pack('U'), + '≯' => [8815].pack('U'), + '≫̸' => [8811, 824].pack('U'), + '⇎' => [8654].pack('U'), + '↮' => [8622].pack('U'), + '⫲' => [10994].pack('U'), + '∋' => [8715].pack('U'), + '⋼' => [8956].pack('U'), + '⋺' => [8954].pack('U'), + '∋' => [8715].pack('U'), + 'Њ' => [1034].pack('U'), + 'њ' => [1114].pack('U'), + '⇍' => [8653].pack('U'), + '↚' => [8602].pack('U'), + '‥' => [8229].pack('U'), + '≦̸' => [8806, 824].pack('U'), + '≰' => [8816].pack('U'), + '⇍' => [8653].pack('U'), + '↚' => [8602].pack('U'), + '⇎' => [8654].pack('U'), + '↮' => [8622].pack('U'), + '≰' => [8816].pack('U'), + '≦̸' => [8806, 824].pack('U'), + '⩽̸' => [10877, 824].pack('U'), + '⩽̸' => [10877, 824].pack('U'), + '≮' => [8814].pack('U'), + '⋘̸' => [8920, 824].pack('U'), + '≴' => [8820].pack('U'), + '≪⃒' => [8810, 8402].pack('U'), + '≮' => [8814].pack('U'), + '⋪' => [8938].pack('U'), + '⋬' => [8940].pack('U'), + '≪̸' => [8810, 824].pack('U'), + 
'∤' => [8740].pack('U'), + '⁠' => [8288].pack('U'), + ' ' => [160].pack('U'), + 'ℕ' => [8469].pack('U'), + '𝕟' => [120159].pack('U'), + '⫬' => [10988].pack('U'), + '¬' => [172].pack('U'), + '≢' => [8802].pack('U'), + '≭' => [8813].pack('U'), + '∦' => [8742].pack('U'), + '∉' => [8713].pack('U'), + '≠' => [8800].pack('U'), + '≂̸' => [8770, 824].pack('U'), + '∄' => [8708].pack('U'), + '≯' => [8815].pack('U'), + '≱' => [8817].pack('U'), + '≧̸' => [8807, 824].pack('U'), + '≫̸' => [8811, 824].pack('U'), + '≹' => [8825].pack('U'), + '⩾̸' => [10878, 824].pack('U'), + '≵' => [8821].pack('U'), + '≎̸' => [8782, 824].pack('U'), + '≏̸' => [8783, 824].pack('U'), + '∉' => [8713].pack('U'), + '⋵̸' => [8949, 824].pack('U'), + '⋹̸' => [8953, 824].pack('U'), + '∉' => [8713].pack('U'), + '⋷' => [8951].pack('U'), + '⋶' => [8950].pack('U'), + '⋪' => [8938].pack('U'), + '⧏̸' => [10703, 824].pack('U'), + '⋬' => [8940].pack('U'), + '≮' => [8814].pack('U'), + '≰' => [8816].pack('U'), + '≸' => [8824].pack('U'), + '≪̸' => [8810, 824].pack('U'), + '⩽̸' => [10877, 824].pack('U'), + '≴' => [8820].pack('U'), + '⪢̸' => [10914, 824].pack('U'), + '⪡̸' => [10913, 824].pack('U'), + '∌' => [8716].pack('U'), + '∌' => [8716].pack('U'), + '⋾' => [8958].pack('U'), + '⋽' => [8957].pack('U'), + '⊀' => [8832].pack('U'), + '⪯̸' => [10927, 824].pack('U'), + '⋠' => [8928].pack('U'), + '∌' => [8716].pack('U'), + '⋫' => [8939].pack('U'), + '⧐̸' => [10704, 824].pack('U'), + '⋭' => [8941].pack('U'), + '⊏̸' => [8847, 824].pack('U'), + '⋢' => [8930].pack('U'), + '⊐̸' => [8848, 824].pack('U'), + '⋣' => [8931].pack('U'), + '⊂⃒' => [8834, 8402].pack('U'), + '⊈' => [8840].pack('U'), + '⊁' => [8833].pack('U'), + '⪰̸' => [10928, 824].pack('U'), + '⋡' => [8929].pack('U'), + '≿̸' => [8831, 824].pack('U'), + '⊃⃒' => [8835, 8402].pack('U'), + '⊉' => [8841].pack('U'), + '≁' => [8769].pack('U'), + '≄' => [8772].pack('U'), + '≇' => [8775].pack('U'), + '≉' => [8777].pack('U'), + '∤' => [8740].pack('U'), + '∦' => [8742].pack('U'), + 
'∦' => [8742].pack('U'), + '⫽⃥' => [11005, 8421].pack('U'), + '∂̸' => [8706, 824].pack('U'), + '⨔' => [10772].pack('U'), + '⊀' => [8832].pack('U'), + '⋠' => [8928].pack('U'), + '⪯̸' => [10927, 824].pack('U'), + '⊀' => [8832].pack('U'), + '⪯̸' => [10927, 824].pack('U'), + '⇏' => [8655].pack('U'), + '↛' => [8603].pack('U'), + '⤳̸' => [10547, 824].pack('U'), + '↝̸' => [8605, 824].pack('U'), + '⇏' => [8655].pack('U'), + '↛' => [8603].pack('U'), + '⋫' => [8939].pack('U'), + '⋭' => [8941].pack('U'), + '⊁' => [8833].pack('U'), + '⋡' => [8929].pack('U'), + '⪰̸' => [10928, 824].pack('U'), + '𝒩' => [119977].pack('U'), + '𝓃' => [120003].pack('U'), + '∤' => [8740].pack('U'), + '∦' => [8742].pack('U'), + '≁' => [8769].pack('U'), + '≄' => [8772].pack('U'), + '≄' => [8772].pack('U'), + '∤' => [8740].pack('U'), + '∦' => [8742].pack('U'), + '⋢' => [8930].pack('U'), + '⋣' => [8931].pack('U'), + '⊄' => [8836].pack('U'), + '⫅̸' => [10949, 824].pack('U'), + '⊈' => [8840].pack('U'), + '⊂⃒' => [8834, 8402].pack('U'), + '⊈' => [8840].pack('U'), + '⫅̸' => [10949, 824].pack('U'), + '⊁' => [8833].pack('U'), + '⪰̸' => [10928, 824].pack('U'), + '⊅' => [8837].pack('U'), + '⫆̸' => [10950, 824].pack('U'), + '⊉' => [8841].pack('U'), + '⊃⃒' => [8835, 8402].pack('U'), + '⊉' => [8841].pack('U'), + '⫆̸' => [10950, 824].pack('U'), + '≹' => [8825].pack('U'), + 'Ñ' => [209].pack('U'), + 'ñ' => [241].pack('U'), + '≸' => [8824].pack('U'), + '⋪' => [8938].pack('U'), + '⋬' => [8940].pack('U'), + '⋫' => [8939].pack('U'), + '⋭' => [8941].pack('U'), + 'Ν' => [925].pack('U'), + 'ν' => [957].pack('U'), + '#' => [35].pack('U'), + '№' => [8470].pack('U'), + ' ' => [8199].pack('U'), + '≍⃒' => [8781, 8402].pack('U'), + '⊯' => [8879].pack('U'), + '⊮' => [8878].pack('U'), + '⊭' => [8877].pack('U'), + '⊬' => [8876].pack('U'), + '≥⃒' => [8805, 8402].pack('U'), + '>⃒' => [62, 8402].pack('U'), + '⤄' => [10500].pack('U'), + '⧞' => [10718].pack('U'), + '⤂' => [10498].pack('U'), + '≤⃒' => [8804, 8402].pack('U'), + '<⃒' => 
[60, 8402].pack('U'), + '⊴⃒' => [8884, 8402].pack('U'), + '⤃' => [10499].pack('U'), + '⊵⃒' => [8885, 8402].pack('U'), + '∼⃒' => [8764, 8402].pack('U'), + '⤣' => [10531].pack('U'), + '⇖' => [8662].pack('U'), + '↖' => [8598].pack('U'), + '↖' => [8598].pack('U'), + '⤧' => [10535].pack('U'), + 'Ó' => [211].pack('U'), + 'ó' => [243].pack('U'), + '⊛' => [8859].pack('U'), + '⊚' => [8858].pack('U'), + 'Ô' => [212].pack('U'), + 'ô' => [244].pack('U'), + 'О' => [1054].pack('U'), + 'о' => [1086].pack('U'), + '⊝' => [8861].pack('U'), + 'Ő' => [336].pack('U'), + 'ő' => [337].pack('U'), + '⨸' => [10808].pack('U'), + '⊙' => [8857].pack('U'), + '⦼' => [10684].pack('U'), + 'Œ' => [338].pack('U'), + 'œ' => [339].pack('U'), + '⦿' => [10687].pack('U'), + '𝔒' => [120082].pack('U'), + '𝔬' => [120108].pack('U'), + '˛' => [731].pack('U'), + 'Ò' => [210].pack('U'), + 'ò' => [242].pack('U'), + '⧁' => [10689].pack('U'), + '⦵' => [10677].pack('U'), + 'Ω' => [937].pack('U'), + '∮' => [8750].pack('U'), + '↺' => [8634].pack('U'), + '⦾' => [10686].pack('U'), + '⦻' => [10683].pack('U'), + '‾' => [8254].pack('U'), + '⧀' => [10688].pack('U'), + 'Ō' => [332].pack('U'), + 'ō' => [333].pack('U'), + 'Ω' => [937].pack('U'), + 'ω' => [969].pack('U'), + 'Ο' => [927].pack('U'), + 'ο' => [959].pack('U'), + '⦶' => [10678].pack('U'), + '⊖' => [8854].pack('U'), + '𝕆' => [120134].pack('U'), + '𝕠' => [120160].pack('U'), + '⦷' => [10679].pack('U'), + '“' => [8220].pack('U'), + '‘' => [8216].pack('U'), + '⦹' => [10681].pack('U'), + '⊕' => [8853].pack('U'), + '⩔' => [10836].pack('U'), + '∨' => [8744].pack('U'), + '↻' => [8635].pack('U'), + '⩝' => [10845].pack('U'), + 'ℴ' => [8500].pack('U'), + 'ℴ' => [8500].pack('U'), + 'ª' => [170].pack('U'), + 'º' => [186].pack('U'), + '⊶' => [8886].pack('U'), + '⩖' => [10838].pack('U'), + '⩗' => [10839].pack('U'), + '⩛' => [10843].pack('U'), + 'Ⓢ' => [9416].pack('U'), + '𝒪' => [119978].pack('U'), + 'ℴ' => [8500].pack('U'), + 'Ø' => [216].pack('U'), + 'ø' => [248].pack('U'), + '⊘' 
=> [8856].pack('U'), + 'Õ' => [213].pack('U'), + 'õ' => [245].pack('U'), + '⨷' => [10807].pack('U'), + '⊗' => [8855].pack('U'), + '⨶' => [10806].pack('U'), + 'Ö' => [214].pack('U'), + 'ö' => [246].pack('U'), + '⌽' => [9021].pack('U'), + '‾' => [8254].pack('U'), + '⏞' => [9182].pack('U'), + '⎴' => [9140].pack('U'), + '⏜' => [9180].pack('U'), + '∥' => [8741].pack('U'), + '¶' => [182].pack('U'), + '∥' => [8741].pack('U'), + '⫳' => [10995].pack('U'), + '⫽' => [11005].pack('U'), + '∂' => [8706].pack('U'), + '∂' => [8706].pack('U'), + 'П' => [1055].pack('U'), + 'п' => [1087].pack('U'), + '%' => [37].pack('U'), + '.' => [46].pack('U'), + '‰' => [8240].pack('U'), + '⊥' => [8869].pack('U'), + '‱' => [8241].pack('U'), + '𝔓' => [120083].pack('U'), + '𝔭' => [120109].pack('U'), + 'Φ' => [934].pack('U'), + 'φ' => [966].pack('U'), + 'ϕ' => [981].pack('U'), + 'ℳ' => [8499].pack('U'), + '☎' => [9742].pack('U'), + 'Π' => [928].pack('U'), + 'π' => [960].pack('U'), + '⋔' => [8916].pack('U'), + 'ϖ' => [982].pack('U'), + 'ℏ' => [8463].pack('U'), + 'ℎ' => [8462].pack('U'), + 'ℏ' => [8463].pack('U'), + '+' => [43].pack('U'), + '⨣' => [10787].pack('U'), + '⊞' => [8862].pack('U'), + '⨢' => [10786].pack('U'), + '∔' => [8724].pack('U'), + '⨥' => [10789].pack('U'), + '⩲' => [10866].pack('U'), + '±' => [177].pack('U'), + '±' => [177].pack('U'), + '⨦' => [10790].pack('U'), + '⨧' => [10791].pack('U'), + '±' => [177].pack('U'), + 'ℌ' => [8460].pack('U'), + '⨕' => [10773].pack('U'), + 'ℙ' => [8473].pack('U'), + '𝕡' => [120161].pack('U'), + '£' => [163].pack('U'), + '⪻' => [10939].pack('U'), + '≺' => [8826].pack('U'), + '⪷' => [10935].pack('U'), + '≼' => [8828].pack('U'), + '⪳' => [10931].pack('U'), + '⪯' => [10927].pack('U'), + '≺' => [8826].pack('U'), + '⪷' => [10935].pack('U'), + '≼' => [8828].pack('U'), + '≺' => [8826].pack('U'), + '⪯' => [10927].pack('U'), + '≼' => [8828].pack('U'), + '≾' => [8830].pack('U'), + '⪯' => [10927].pack('U'), + '⪹' => [10937].pack('U'), + '⪵' => [10933].pack('U'), + 
'⋨' => [8936].pack('U'), + '≾' => [8830].pack('U'), + '″' => [8243].pack('U'), + '′' => [8242].pack('U'), + 'ℙ' => [8473].pack('U'), + '⪹' => [10937].pack('U'), + '⪵' => [10933].pack('U'), + '⋨' => [8936].pack('U'), + '∏' => [8719].pack('U'), + '∏' => [8719].pack('U'), + '⌮' => [9006].pack('U'), + '⌒' => [8978].pack('U'), + '⌓' => [8979].pack('U'), + '∝' => [8733].pack('U'), + '∷' => [8759].pack('U'), + '∝' => [8733].pack('U'), + '∝' => [8733].pack('U'), + '≾' => [8830].pack('U'), + '⊰' => [8880].pack('U'), + '𝒫' => [119979].pack('U'), + '𝓅' => [120005].pack('U'), + 'Ψ' => [936].pack('U'), + 'ψ' => [968].pack('U'), + ' ' => [8200].pack('U'), + '𝔔' => [120084].pack('U'), + '𝔮' => [120110].pack('U'), + '⨌' => [10764].pack('U'), + 'ℚ' => [8474].pack('U'), + '𝕢' => [120162].pack('U'), + '⁗' => [8279].pack('U'), + '𝒬' => [119980].pack('U'), + '𝓆' => [120006].pack('U'), + 'ℍ' => [8461].pack('U'), + '⨖' => [10774].pack('U'), + '?' => [63].pack('U'), + '≟' => [8799].pack('U'), + '"' => [34].pack('U'), + '"' => [34].pack('U'), + '⇛' => [8667].pack('U'), + '∽̱' => [8765, 817].pack('U'), + 'Ŕ' => [340].pack('U'), + 'ŕ' => [341].pack('U'), + '√' => [8730].pack('U'), + '⦳' => [10675].pack('U'), + '⟫' => [10219].pack('U'), + '⟩' => [10217].pack('U'), + '⦒' => [10642].pack('U'), + '⦥' => [10661].pack('U'), + '⟩' => [10217].pack('U'), + '»' => [187].pack('U'), + '↠' => [8608].pack('U'), + '⇒' => [8658].pack('U'), + '→' => [8594].pack('U'), + '⥵' => [10613].pack('U'), + '⇥' => [8677].pack('U'), + '⤠' => [10528].pack('U'), + '⤳' => [10547].pack('U'), + '⤞' => [10526].pack('U'), + '↪' => [8618].pack('U'), + '↬' => [8620].pack('U'), + '⥅' => [10565].pack('U'), + '⥴' => [10612].pack('U'), + '⤖' => [10518].pack('U'), + '↣' => [8611].pack('U'), + '↝' => [8605].pack('U'), + '⤜' => [10524].pack('U'), + '⤚' => [10522].pack('U'), + '∶' => [8758].pack('U'), + 'ℚ' => [8474].pack('U'), + '⤐' => [10512].pack('U'), + '⤏' => [10511].pack('U'), + '⤍' => [10509].pack('U'), + '❳' => 
[10099].pack('U'), + '}' => [125].pack('U'), + ']' => [93].pack('U'), + '⦌' => [10636].pack('U'), + '⦎' => [10638].pack('U'), + '⦐' => [10640].pack('U'), + 'Ř' => [344].pack('U'), + 'ř' => [345].pack('U'), + 'Ŗ' => [342].pack('U'), + 'ŗ' => [343].pack('U'), + '⌉' => [8969].pack('U'), + '}' => [125].pack('U'), + 'Р' => [1056].pack('U'), + 'р' => [1088].pack('U'), + '⤷' => [10551].pack('U'), + '⥩' => [10601].pack('U'), + '”' => [8221].pack('U'), + '”' => [8221].pack('U'), + '↳' => [8627].pack('U'), + 'ℜ' => [8476].pack('U'), + 'ℜ' => [8476].pack('U'), + 'ℛ' => [8475].pack('U'), + 'ℜ' => [8476].pack('U'), + 'ℝ' => [8477].pack('U'), + '▭' => [9645].pack('U'), + '®' => [174].pack('U'), + '®' => [174].pack('U'), + '∋' => [8715].pack('U'), + '⇋' => [8651].pack('U'), + '⥯' => [10607].pack('U'), + '⥽' => [10621].pack('U'), + '⌋' => [8971].pack('U'), + 'ℜ' => [8476].pack('U'), + '𝔯' => [120111].pack('U'), + '⥤' => [10596].pack('U'), + '⇁' => [8641].pack('U'), + '⇀' => [8640].pack('U'), + '⥬' => [10604].pack('U'), + 'Ρ' => [929].pack('U'), + 'ρ' => [961].pack('U'), + 'ϱ' => [1009].pack('U'), + '⟩' => [10217].pack('U'), + '→' => [8594].pack('U'), + '⇒' => [8658].pack('U'), + '→' => [8594].pack('U'), + '⇥' => [8677].pack('U'), + '⇄' => [8644].pack('U'), + '↣' => [8611].pack('U'), + '⌉' => [8969].pack('U'), + '⟧' => [10215].pack('U'), + '⥝' => [10589].pack('U'), + '⇂' => [8642].pack('U'), + '⥕' => [10581].pack('U'), + '⌋' => [8971].pack('U'), + '⇁' => [8641].pack('U'), + '⇀' => [8640].pack('U'), + '⇄' => [8644].pack('U'), + '⇌' => [8652].pack('U'), + '⇉' => [8649].pack('U'), + '↝' => [8605].pack('U'), + '⊢' => [8866].pack('U'), + '↦' => [8614].pack('U'), + '⥛' => [10587].pack('U'), + '⋌' => [8908].pack('U'), + '⊳' => [8883].pack('U'), + '⧐' => [10704].pack('U'), + '⊵' => [8885].pack('U'), + '⥏' => [10575].pack('U'), + '⥜' => [10588].pack('U'), + '↾' => [8638].pack('U'), + '⥔' => [10580].pack('U'), + '⇀' => [8640].pack('U'), + '⥓' => [10579].pack('U'), + '˚' => [730].pack('U'), + 
'≓' => [8787].pack('U'), + '⇄' => [8644].pack('U'), + '⇌' => [8652].pack('U'), + '‏' => [8207].pack('U'), + '⎱' => [9137].pack('U'), + '⎱' => [9137].pack('U'), + '⫮' => [10990].pack('U'), + '⟭' => [10221].pack('U'), + '⇾' => [8702].pack('U'), + '⟧' => [10215].pack('U'), + '⦆' => [10630].pack('U'), + 'ℝ' => [8477].pack('U'), + '𝕣' => [120163].pack('U'), + '⨮' => [10798].pack('U'), + '⨵' => [10805].pack('U'), + '⥰' => [10608].pack('U'), + ')' => [41].pack('U'), + '⦔' => [10644].pack('U'), + '⨒' => [10770].pack('U'), + '⇉' => [8649].pack('U'), + '⇛' => [8667].pack('U'), + '›' => [8250].pack('U'), + 'ℛ' => [8475].pack('U'), + '𝓇' => [120007].pack('U'), + '↱' => [8625].pack('U'), + '↱' => [8625].pack('U'), + ']' => [93].pack('U'), + '’' => [8217].pack('U'), + '’' => [8217].pack('U'), + '⋌' => [8908].pack('U'), + '⋊' => [8906].pack('U'), + '▹' => [9657].pack('U'), + '⊵' => [8885].pack('U'), + '▸' => [9656].pack('U'), + '⧎' => [10702].pack('U'), + '⧴' => [10740].pack('U'), + '⥨' => [10600].pack('U'), + '℞' => [8478].pack('U'), + 'Ś' => [346].pack('U'), + 'ś' => [347].pack('U'), + '‚' => [8218].pack('U'), + '⪼' => [10940].pack('U'), + '≻' => [8827].pack('U'), + '⪸' => [10936].pack('U'), + 'Š' => [352].pack('U'), + 'š' => [353].pack('U'), + '≽' => [8829].pack('U'), + '⪴' => [10932].pack('U'), + '⪰' => [10928].pack('U'), + 'Ş' => [350].pack('U'), + 'ş' => [351].pack('U'), + 'Ŝ' => [348].pack('U'), + 'ŝ' => [349].pack('U'), + '⪺' => [10938].pack('U'), + '⪶' => [10934].pack('U'), + '⋩' => [8937].pack('U'), + '⨓' => [10771].pack('U'), + '≿' => [8831].pack('U'), + 'С' => [1057].pack('U'), + 'с' => [1089].pack('U'), + '⋅' => [8901].pack('U'), + '⊡' => [8865].pack('U'), + '⩦' => [10854].pack('U'), + '⤥' => [10533].pack('U'), + '⇘' => [8664].pack('U'), + '↘' => [8600].pack('U'), + '↘' => [8600].pack('U'), + '§' => [167].pack('U'), + ';' => [59].pack('U'), + '⤩' => [10537].pack('U'), + '∖' => [8726].pack('U'), + '∖' => [8726].pack('U'), + '✶' => [10038].pack('U'), + '𝔖' => 
[120086].pack('U'), + '𝔰' => [120112].pack('U'), + '⌢' => [8994].pack('U'), + '♯' => [9839].pack('U'), + 'Щ' => [1065].pack('U'), + 'щ' => [1097].pack('U'), + 'Ш' => [1064].pack('U'), + 'ш' => [1096].pack('U'), + '↓' => [8595].pack('U'), + '←' => [8592].pack('U'), + '∣' => [8739].pack('U'), + '∥' => [8741].pack('U'), + '→' => [8594].pack('U'), + '↑' => [8593].pack('U'), + '­' => [173].pack('U'), + 'Σ' => [931].pack('U'), + 'σ' => [963].pack('U'), + 'ς' => [962].pack('U'), + 'ς' => [962].pack('U'), + '∼' => [8764].pack('U'), + '⩪' => [10858].pack('U'), + '≃' => [8771].pack('U'), + '≃' => [8771].pack('U'), + '⪞' => [10910].pack('U'), + '⪠' => [10912].pack('U'), + '⪝' => [10909].pack('U'), + '⪟' => [10911].pack('U'), + '≆' => [8774].pack('U'), + '⨤' => [10788].pack('U'), + '⥲' => [10610].pack('U'), + '←' => [8592].pack('U'), + '∘' => [8728].pack('U'), + '∖' => [8726].pack('U'), + '⨳' => [10803].pack('U'), + '⧤' => [10724].pack('U'), + '∣' => [8739].pack('U'), + '⌣' => [8995].pack('U'), + '⪪' => [10922].pack('U'), + '⪬' => [10924].pack('U'), + '⪬︀' => [10924, 65024].pack('U'), + 'Ь' => [1068].pack('U'), + 'ь' => [1100].pack('U'), + '/' => [47].pack('U'), + '⧄' => [10692].pack('U'), + '⌿' => [9023].pack('U'), + '𝕊' => [120138].pack('U'), + '𝕤' => [120164].pack('U'), + '♠' => [9824].pack('U'), + '♠' => [9824].pack('U'), + '∥' => [8741].pack('U'), + '⊓' => [8851].pack('U'), + '⊓︀' => [8851, 65024].pack('U'), + '⊔' => [8852].pack('U'), + '⊔︀' => [8852, 65024].pack('U'), + '√' => [8730].pack('U'), + '⊏' => [8847].pack('U'), + '⊑' => [8849].pack('U'), + '⊏' => [8847].pack('U'), + '⊑' => [8849].pack('U'), + '⊐' => [8848].pack('U'), + '⊒' => [8850].pack('U'), + '⊐' => [8848].pack('U'), + '⊒' => [8850].pack('U'), + '□' => [9633].pack('U'), + '□' => [9633].pack('U'), + '□' => [9633].pack('U'), + '⊓' => [8851].pack('U'), + '⊏' => [8847].pack('U'), + '⊑' => [8849].pack('U'), + '⊐' => [8848].pack('U'), + '⊒' => [8850].pack('U'), + '⊔' => [8852].pack('U'), + '▪' => [9642].pack('U'), 
+ '▪' => [9642].pack('U'), + '→' => [8594].pack('U'), + '𝒮' => [119982].pack('U'), + '𝓈' => [120008].pack('U'), + '∖' => [8726].pack('U'), + '⌣' => [8995].pack('U'), + '⋆' => [8902].pack('U'), + '⋆' => [8902].pack('U'), + '☆' => [9734].pack('U'), + '★' => [9733].pack('U'), + 'ϵ' => [1013].pack('U'), + 'ϕ' => [981].pack('U'), + '¯' => [175].pack('U'), + '⋐' => [8912].pack('U'), + '⊂' => [8834].pack('U'), + '⪽' => [10941].pack('U'), + '⫅' => [10949].pack('U'), + '⊆' => [8838].pack('U'), + '⫃' => [10947].pack('U'), + '⫁' => [10945].pack('U'), + '⫋' => [10955].pack('U'), + '⊊' => [8842].pack('U'), + '⪿' => [10943].pack('U'), + '⥹' => [10617].pack('U'), + '⋐' => [8912].pack('U'), + '⊂' => [8834].pack('U'), + '⊆' => [8838].pack('U'), + '⫅' => [10949].pack('U'), + '⊆' => [8838].pack('U'), + '⊊' => [8842].pack('U'), + '⫋' => [10955].pack('U'), + '⫇' => [10951].pack('U'), + '⫕' => [10965].pack('U'), + '⫓' => [10963].pack('U'), + '≻' => [8827].pack('U'), + '⪸' => [10936].pack('U'), + '≽' => [8829].pack('U'), + '≻' => [8827].pack('U'), + '⪰' => [10928].pack('U'), + '≽' => [8829].pack('U'), + '≿' => [8831].pack('U'), + '⪰' => [10928].pack('U'), + '⪺' => [10938].pack('U'), + '⪶' => [10934].pack('U'), + '⋩' => [8937].pack('U'), + '≿' => [8831].pack('U'), + '∋' => [8715].pack('U'), + '∑' => [8721].pack('U'), + '∑' => [8721].pack('U'), + '♪' => [9834].pack('U'), + '⋑' => [8913].pack('U'), + '⊃' => [8835].pack('U'), + '¹' => [185].pack('U'), + '²' => [178].pack('U'), + '³' => [179].pack('U'), + '⪾' => [10942].pack('U'), + '⫘' => [10968].pack('U'), + '⫆' => [10950].pack('U'), + '⊇' => [8839].pack('U'), + '⫄' => [10948].pack('U'), + '⊃' => [8835].pack('U'), + '⊇' => [8839].pack('U'), + '⟉' => [10185].pack('U'), + '⫗' => [10967].pack('U'), + '⥻' => [10619].pack('U'), + '⫂' => [10946].pack('U'), + '⫌' => [10956].pack('U'), + '⊋' => [8843].pack('U'), + '⫀' => [10944].pack('U'), + '⋑' => [8913].pack('U'), + '⊃' => [8835].pack('U'), + '⊇' => [8839].pack('U'), + '⫆' => [10950].pack('U'), + 
'⊋' => [8843].pack('U'), + '⫌' => [10956].pack('U'), + '⫈' => [10952].pack('U'), + '⫔' => [10964].pack('U'), + '⫖' => [10966].pack('U'), + '⤦' => [10534].pack('U'), + '⇙' => [8665].pack('U'), + '↙' => [8601].pack('U'), + '↙' => [8601].pack('U'), + '⤪' => [10538].pack('U'), + 'ß' => [223].pack('U'), + ' ' => [9].pack('U'), + '⌖' => [8982].pack('U'), + 'Τ' => [932].pack('U'), + 'τ' => [964].pack('U'), + '⎴' => [9140].pack('U'), + 'Ť' => [356].pack('U'), + 'ť' => [357].pack('U'), + 'Ţ' => [354].pack('U'), + 'ţ' => [355].pack('U'), + 'Т' => [1058].pack('U'), + 'т' => [1090].pack('U'), + '⃛' => [8411].pack('U'), + '⌕' => [8981].pack('U'), + '𝔗' => [120087].pack('U'), + '𝔱' => [120113].pack('U'), + '∴' => [8756].pack('U'), + '∴' => [8756].pack('U'), + '∴' => [8756].pack('U'), + 'Θ' => [920].pack('U'), + 'θ' => [952].pack('U'), + 'ϑ' => [977].pack('U'), + 'ϑ' => [977].pack('U'), + '≈' => [8776].pack('U'), + '∼' => [8764].pack('U'), + '  ' => [8287, 8202].pack('U'), + ' ' => [8201].pack('U'), + ' ' => [8201].pack('U'), + '≈' => [8776].pack('U'), + '∼' => [8764].pack('U'), + 'Þ' => [222].pack('U'), + 'þ' => [254].pack('U'), + '∼' => [8764].pack('U'), + '˜' => [732].pack('U'), + '≃' => [8771].pack('U'), + '≅' => [8773].pack('U'), + '≈' => [8776].pack('U'), + '×' => [215].pack('U'), + '⊠' => [8864].pack('U'), + '⨱' => [10801].pack('U'), + '⨰' => [10800].pack('U'), + '∭' => [8749].pack('U'), + '⤨' => [10536].pack('U'), + '⊤' => [8868].pack('U'), + '⌶' => [9014].pack('U'), + '⫱' => [10993].pack('U'), + '𝕋' => [120139].pack('U'), + '𝕥' => [120165].pack('U'), + '⫚' => [10970].pack('U'), + '⤩' => [10537].pack('U'), + '‴' => [8244].pack('U'), + '™' => [8482].pack('U'), + '™' => [8482].pack('U'), + '▵' => [9653].pack('U'), + '▿' => [9663].pack('U'), + '◃' => [9667].pack('U'), + '⊴' => [8884].pack('U'), + '≜' => [8796].pack('U'), + '▹' => [9657].pack('U'), + '⊵' => [8885].pack('U'), + '◬' => [9708].pack('U'), + '≜' => [8796].pack('U'), + '⨺' => [10810].pack('U'), + '⃛' => 
[8411].pack('U'), + '⨹' => [10809].pack('U'), + '⧍' => [10701].pack('U'), + '⨻' => [10811].pack('U'), + '⏢' => [9186].pack('U'), + '𝒯' => [119983].pack('U'), + '𝓉' => [120009].pack('U'), + 'Ц' => [1062].pack('U'), + 'ц' => [1094].pack('U'), + 'Ћ' => [1035].pack('U'), + 'ћ' => [1115].pack('U'), + 'Ŧ' => [358].pack('U'), + 'ŧ' => [359].pack('U'), + '≬' => [8812].pack('U'), + '↞' => [8606].pack('U'), + '↠' => [8608].pack('U'), + 'Ú' => [218].pack('U'), + 'ú' => [250].pack('U'), + '↟' => [8607].pack('U'), + '⇑' => [8657].pack('U'), + '↑' => [8593].pack('U'), + '⥉' => [10569].pack('U'), + 'Ў' => [1038].pack('U'), + 'ў' => [1118].pack('U'), + 'Ŭ' => [364].pack('U'), + 'ŭ' => [365].pack('U'), + 'Û' => [219].pack('U'), + 'û' => [251].pack('U'), + 'У' => [1059].pack('U'), + 'у' => [1091].pack('U'), + '⇅' => [8645].pack('U'), + 'Ű' => [368].pack('U'), + 'ű' => [369].pack('U'), + '⥮' => [10606].pack('U'), + '⥾' => [10622].pack('U'), + '𝔘' => [120088].pack('U'), + '𝔲' => [120114].pack('U'), + 'Ù' => [217].pack('U'), + 'ù' => [249].pack('U'), + '⥣' => [10595].pack('U'), + '↿' => [8639].pack('U'), + '↾' => [8638].pack('U'), + '▀' => [9600].pack('U'), + '⌜' => [8988].pack('U'), + '⌜' => [8988].pack('U'), + '⌏' => [8975].pack('U'), + '◸' => [9720].pack('U'), + 'Ū' => [362].pack('U'), + 'ū' => [363].pack('U'), + '¨' => [168].pack('U'), + '_' => [95].pack('U'), + '⏟' => [9183].pack('U'), + '⎵' => [9141].pack('U'), + '⏝' => [9181].pack('U'), + '⋃' => [8899].pack('U'), + '⊎' => [8846].pack('U'), + 'Ų' => [370].pack('U'), + 'ų' => [371].pack('U'), + '𝕌' => [120140].pack('U'), + '𝕦' => [120166].pack('U'), + '↑' => [8593].pack('U'), + '⇑' => [8657].pack('U'), + '↑' => [8593].pack('U'), + '⤒' => [10514].pack('U'), + '⇅' => [8645].pack('U'), + '↕' => [8597].pack('U'), + '⇕' => [8661].pack('U'), + '↕' => [8597].pack('U'), + '⥮' => [10606].pack('U'), + '↿' => [8639].pack('U'), + '↾' => [8638].pack('U'), + '⊎' => [8846].pack('U'), + '↖' => [8598].pack('U'), + '↗' => [8599].pack('U'), + 'ϒ' => 
[978].pack('U'), + 'υ' => [965].pack('U'), + 'ϒ' => [978].pack('U'), + 'Υ' => [933].pack('U'), + 'υ' => [965].pack('U'), + '⊥' => [8869].pack('U'), + '↥' => [8613].pack('U'), + '⇈' => [8648].pack('U'), + '⌝' => [8989].pack('U'), + '⌝' => [8989].pack('U'), + '⌎' => [8974].pack('U'), + 'Ů' => [366].pack('U'), + 'ů' => [367].pack('U'), + '◹' => [9721].pack('U'), + '𝒰' => [119984].pack('U'), + '𝓊' => [120010].pack('U'), + '⋰' => [8944].pack('U'), + 'Ũ' => [360].pack('U'), + 'ũ' => [361].pack('U'), + '▵' => [9653].pack('U'), + '▴' => [9652].pack('U'), + '⇈' => [8648].pack('U'), + 'Ü' => [220].pack('U'), + 'ü' => [252].pack('U'), + '⦧' => [10663].pack('U'), + '⦜' => [10652].pack('U'), + 'ϵ' => [1013].pack('U'), + 'ϰ' => [1008].pack('U'), + '∅' => [8709].pack('U'), + 'ϕ' => [981].pack('U'), + 'ϖ' => [982].pack('U'), + '∝' => [8733].pack('U'), + '⇕' => [8661].pack('U'), + '↕' => [8597].pack('U'), + 'ϱ' => [1009].pack('U'), + 'ς' => [962].pack('U'), + '⊊︀' => [8842, 65024].pack('U'), + '⫋︀' => [10955, 65024].pack('U'), + '⊋︀' => [8843, 65024].pack('U'), + '⫌︀' => [10956, 65024].pack('U'), + 'ϑ' => [977].pack('U'), + '⊲' => [8882].pack('U'), + '⊳' => [8883].pack('U'), + '⫫' => [10987].pack('U'), + '⫨' => [10984].pack('U'), + '⫩' => [10985].pack('U'), + 'В' => [1042].pack('U'), + 'в' => [1074].pack('U'), + '⊫' => [8875].pack('U'), + '⊩' => [8873].pack('U'), + '⊨' => [8872].pack('U'), + '⊢' => [8866].pack('U'), + '⫦' => [10982].pack('U'), + '⋁' => [8897].pack('U'), + '∨' => [8744].pack('U'), + '⊻' => [8891].pack('U'), + '≚' => [8794].pack('U'), + '⋮' => [8942].pack('U'), + '‖' => [8214].pack('U'), + '|' => [124].pack('U'), + '‖' => [8214].pack('U'), + '|' => [124].pack('U'), + '∣' => [8739].pack('U'), + '|' => [124].pack('U'), + '❘' => [10072].pack('U'), + '≀' => [8768].pack('U'), + ' ' => [8202].pack('U'), + '𝔙' => [120089].pack('U'), + '𝔳' => [120115].pack('U'), + '⊲' => [8882].pack('U'), + '⊂⃒' => [8834, 8402].pack('U'), + '⊃⃒' => [8835, 8402].pack('U'), + '𝕍' => 
[120141].pack('U'), + '𝕧' => [120167].pack('U'), + '∝' => [8733].pack('U'), + '⊳' => [8883].pack('U'), + '𝒱' => [119985].pack('U'), + '𝓋' => [120011].pack('U'), + '⫋︀' => [10955, 65024].pack('U'), + '⊊︀' => [8842, 65024].pack('U'), + '⫌︀' => [10956, 65024].pack('U'), + '⊋︀' => [8843, 65024].pack('U'), + '⊪' => [8874].pack('U'), + '⦚' => [10650].pack('U'), + 'Ŵ' => [372].pack('U'), + 'ŵ' => [373].pack('U'), + '⩟' => [10847].pack('U'), + '⋀' => [8896].pack('U'), + '∧' => [8743].pack('U'), + '≙' => [8793].pack('U'), + '℘' => [8472].pack('U'), + '𝔚' => [120090].pack('U'), + '𝔴' => [120116].pack('U'), + '𝕎' => [120142].pack('U'), + '𝕨' => [120168].pack('U'), + '℘' => [8472].pack('U'), + '≀' => [8768].pack('U'), + '≀' => [8768].pack('U'), + '𝒲' => [119986].pack('U'), + '𝓌' => [120012].pack('U'), + '⋂' => [8898].pack('U'), + '◯' => [9711].pack('U'), + '⋃' => [8899].pack('U'), + '▽' => [9661].pack('U'), + '𝔛' => [120091].pack('U'), + '𝔵' => [120117].pack('U'), + '⟺' => [10234].pack('U'), + '⟷' => [10231].pack('U'), + 'Ξ' => [926].pack('U'), + 'ξ' => [958].pack('U'), + '⟸' => [10232].pack('U'), + '⟵' => [10229].pack('U'), + '⟼' => [10236].pack('U'), + '⋻' => [8955].pack('U'), + '⨀' => [10752].pack('U'), + '𝕏' => [120143].pack('U'), + '𝕩' => [120169].pack('U'), + '⨁' => [10753].pack('U'), + '⨂' => [10754].pack('U'), + '⟹' => [10233].pack('U'), + '⟶' => [10230].pack('U'), + '𝒳' => [119987].pack('U'), + '𝓍' => [120013].pack('U'), + '⨆' => [10758].pack('U'), + '⨄' => [10756].pack('U'), + '△' => [9651].pack('U'), + '⋁' => [8897].pack('U'), + '⋀' => [8896].pack('U'), + 'Ý' => [221].pack('U'), + 'ý' => [253].pack('U'), + 'Я' => [1071].pack('U'), + 'я' => [1103].pack('U'), + 'Ŷ' => [374].pack('U'), + 'ŷ' => [375].pack('U'), + 'Ы' => [1067].pack('U'), + 'ы' => [1099].pack('U'), + '¥' => [165].pack('U'), + '𝔜' => [120092].pack('U'), + '𝔶' => [120118].pack('U'), + 'Ї' => [1031].pack('U'), + 'ї' => [1111].pack('U'), + '𝕐' => [120144].pack('U'), + '𝕪' => [120170].pack('U'), + '𝒴' => 
[119988].pack('U'), + '𝓎' => [120014].pack('U'), + 'Ю' => [1070].pack('U'), + 'ю' => [1102].pack('U'), + 'Ÿ' => [376].pack('U'), + 'ÿ' => [255].pack('U'), + 'Ź' => [377].pack('U'), + 'ź' => [378].pack('U'), + 'Ž' => [381].pack('U'), + 'ž' => [382].pack('U'), + 'З' => [1047].pack('U'), + 'з' => [1079].pack('U'), + 'Ż' => [379].pack('U'), + 'ż' => [380].pack('U'), + 'ℨ' => [8488].pack('U'), + '​' => [8203].pack('U'), + 'Ζ' => [918].pack('U'), + 'ζ' => [950].pack('U'), + 'ℨ' => [8488].pack('U'), + '𝔷' => [120119].pack('U'), + 'Ж' => [1046].pack('U'), + 'ж' => [1078].pack('U'), + '⇝' => [8669].pack('U'), + 'ℤ' => [8484].pack('U'), + '𝕫' => [120171].pack('U'), + '𝒵' => [119989].pack('U'), + '𝓏' => [120015].pack('U'), + '‍' => [8205].pack('U'), + '‌' => [8204].pack('U'), + } + + ## + # Decodes HTML entities. + # + # @see [decode] + # + def self.decode(input) + return XML::Entities.decode(input, DECODE_MAPPING) + end + end # Entities + end # HTML +end # Oga diff --git a/lib/oga/xml/entities.rb b/lib/oga/xml/entities.rb index 6a6849d..5f62025 100644 --- a/lib/oga/xml/entities.rb +++ b/lib/oga/xml/entities.rb @@ -1,5 +1,9 @@ module Oga module XML + ## + # Module for encoding/decoding XML and HTML entities. The mapping of HTML + # entities can be found in {Oga::HTML::Entities::DECODE_MAPPING}. + # module Entities ## # Hash containing XML entities and the corresponding characters. @@ -11,15 +15,10 @@ module Oga # DECODE_MAPPING = { '<' => '<', - '<' => '<', '>' => '>', - '>' => '>', ''' => "'", - ''' => "'", '"' => '"', - '"' => '"', '&' => '&', - '&' => '&', } ## @@ -35,16 +34,46 @@ module Oga '<' => '<', } + ## + # @return [String] + # + AMPERSAND = '&'.freeze + + ## + # Regexp for matching XML/HTML entities such as " ". + # + # @return [Regexp] + # + REGULAR_ENTITY = /&[a-zA-Z]+;/ + + ## + # Regexp for matching XML/HTML entities such as "&". 
+ # + # @return [Regexp] + # + CODEPOINT_ENTITY = /&#(x)?([a-zA-Z0-9]+);/ + + ## + # @return [Regexp] + # + ENCODE_REGEXP = Regexp.new(ENCODE_MAPPING.keys.join('|')) + ## # Decodes XML entities. # # @param [String] input + # @param [Array] keys + # @param [Hash] mapping # @return [String] # - def self.decode(input) - if input.include?('&') - DECODE_MAPPING.each do |find, replace| - input = input.gsub(find, replace) + def self.decode(input, mapping = DECODE_MAPPING) + return input unless input.include?(AMPERSAND) + + input = input.gsub(REGULAR_ENTITY, mapping) + + if input.include?(AMPERSAND) + input = input.gsub(CODEPOINT_ENTITY) do |match| + [$1 ? Integer($2, 16) : Integer($2)].pack('U') end end @@ -55,14 +84,11 @@ module Oga # Encodes special characters as XML entities. # # @param [String] input + # @param [Hash] mapping # @return [String] # - def self.encode(input) - ENCODE_MAPPING.each do |from, to| - input = input.gsub(from, to) if input.include?(from) - end - - return input + def self.encode(input, mapping = ENCODE_MAPPING) + return input.gsub(ENCODE_REGEXP, mapping) end end # Entities end # XML diff --git a/lib/oga/xml/lexer.rb b/lib/oga/xml/lexer.rb index 3d2495e..c84e90f 100644 --- a/lib/oga/xml/lexer.rb +++ b/lib/oga/xml/lexer.rb @@ -217,7 +217,7 @@ module Oga # @param [String] value The data between the quotes. # def on_string_body(value) - add_token(:T_STRING_BODY, Entities.decode(value)) + add_token(:T_STRING_BODY, value) end ## @@ -373,7 +373,7 @@ module Oga def on_text(value) return if value.empty? - add_token(:T_TEXT, Entities.decode(value)) + add_token(:T_TEXT, value) end ## diff --git a/lib/oga/xml/text.rb b/lib/oga/xml/text.rb index 7e67b9e..edb585a 100644 --- a/lib/oga/xml/text.rb +++ b/lib/oga/xml/text.rb @@ -5,19 +5,65 @@ module Oga # have any children, attributes and the likes; just text. 
# class Text < CharacterNode + def initialize(*args) + super + + @mutex = Mutex.new + @decoded = false + end + + ## + # @param [String] value + # + def text=(value) + # In case of concurrent text/text= calls. + @mutex.synchronize do + @decoded = false + @text = value + end + end + + ## + # Returns the text as a String. Upon the first call any XML/HTML entities + # are decoded. + # + # @return [String] + # + def text + @mutex.synchronize do + unless @decoded + decoder = html? ? HTML::Entities : Entities + @text = decoder.decode(@text) + @decoded = true + end + end + + return @text + end + ## # @see [Oga::XML::CharacterNode#to_xml] # def to_xml node = parent - root = root_node - if root.is_a?(Document) and node.is_a?(Element) and root.html? \ + if node.is_a?(Element) and html? \ and Lexer::LITERAL_HTML_ELEMENTS.include?(node.name) return super - else - return Entities.encode(super) end + + return Entities.encode(super) + end + + private + + ## + # @return [TrueClass|FalseClass] + # + def html? + root = root_node + + return root.is_a?(Document) && root.html? 
end end # Text end # XML diff --git a/spec/oga/html/entities_spec.rb b/spec/oga/html/entities_spec.rb new file mode 100644 index 0000000..7ee2204 --- /dev/null +++ b/spec/oga/html/entities_spec.rb @@ -0,0 +1,15 @@ +# encoding: utf-8 + +require 'spec_helper' + +describe Oga::HTML::Entities do + describe 'decode' do + it 'decodes & into &' do + described_class.decode('&').should == '&' + end + + it 'decodes λ into λ' do + described_class.decode('λ').should == 'λ' + end + end +end diff --git a/spec/oga/xml/entities_spec.rb b/spec/oga/xml/entities_spec.rb index 84346f3..68bf4d3 100644 --- a/spec/oga/xml/entities_spec.rb +++ b/spec/oga/xml/entities_spec.rb @@ -65,6 +65,10 @@ describe Oga::XML::Entities do it 'decodes &&lt; into &<' do described_class.decode('&&lt;').should == '&<' end + + it 'decodes < into <' do + described_class.decode('<').should == '<' + end end describe 'encode' do diff --git a/spec/oga/xml/lexer/entities_spec.rb b/spec/oga/xml/lexer/entities_spec.rb deleted file mode 100644 index f124623..0000000 --- a/spec/oga/xml/lexer/entities_spec.rb +++ /dev/null @@ -1,55 +0,0 @@ -require 'spec_helper' - -describe Oga::XML::Lexer do - describe 'converting XML entities in text tokens' do - it 'converts & into &' do - lex('&').should == [[:T_TEXT, '&', 1]] - end - - it 'converts < into <' do - lex('<').should == [[:T_TEXT, '<', 1]] - end - - it 'converts > into >' do - lex('>').should == [[:T_TEXT, '>', 1]] - end - end - - describe 'converting XML entities in string tokens' do - it 'converts & into &' do - lex('').should == [ - [:T_ELEM_START, nil, 1], - [:T_ELEM_NAME, 'foo', 1], - [:T_ATTR, 'class', 1], - [:T_STRING_DQUOTE, nil, 1], - [:T_STRING_BODY, '&', 1], - [:T_STRING_DQUOTE, nil, 1], - [:T_ELEM_END, nil, 1] - ] - end - - it 'converts < into <' do - lex('').should == [ - [:T_ELEM_START, nil, 1], - [:T_ELEM_NAME, 'foo', 1], - [:T_ATTR, 'class', 1], - [:T_STRING_DQUOTE, nil, 1], - [:T_STRING_BODY, '<', 1], - [:T_STRING_DQUOTE, nil, 1], - [:T_ELEM_END, nil, 
1] - ] - end - - it 'converts > into >' do - lex('').should == [ - [:T_ELEM_START, nil, 1], - [:T_ELEM_NAME, 'foo', 1], - [:T_ATTR, 'class', 1], - [:T_STRING_DQUOTE, nil, 1], - [:T_STRING_BODY, '>', 1], - [:T_STRING_DQUOTE, nil, 1], - [:T_ELEM_END, nil, 1] - ] - end - end -end diff --git a/spec/oga/xml/text_spec.rb b/spec/oga/xml/text_spec.rb index cda54ce..b29ef2a 100644 --- a/spec/oga/xml/text_spec.rb +++ b/spec/oga/xml/text_spec.rb @@ -14,6 +14,79 @@ describe Oga::XML::Text do end end + describe '#text' do + describe 'with XML entities' do + it 'converts & to &' do + described_class.new(:text => '&').text.should == '&' + end + + it 'converts < to <' do + described_class.new(:text => '<').text.should == '<' + end + + it 'converts > to >' do + described_class.new(:text => '>').text.should == '>' + end + + it 'caches the converted text' do + node = described_class.new(:text => '&') + + Oga::XML::Entities.should_receive(:decode).once.and_call_original + + node.text.should == '&' + node.text.should == '&' + end + + it 'converts new text set using text=' do + node = described_class.new(:text => '&') + + node.text.should == '&' + + node.text = '<' + + node.text.should == '<' + end + end + + describe 'with HTML entities' do + before do + @document = Oga::XML::Document.new(:type => :html) + end + + it 'converts & to &' do + node = described_class.new(:text => '&') + + @document.children << node + + node.text.should == '&' + end + + it 'converts < to <' do + node = described_class.new(:text => '<') + + @document.children << node + + node.text.should == '<' + end + + it 'converts > to >' do + node = described_class.new(:text => '>') + + @document.children << node + + node.text.should == '>' + end + + it 'converts   into a space' do + node = described_class.new(:text => ' ') + + @document.children << node + + node.text.should == [160].pack('U') + end + end + end + describe '#to_xml' do it 'generates the corresponding XML' do node = described_class.new(:text => 'foo')