From 196765788f6f03fb0754e71c7038669377797374 Mon Sep 17 00:00:00 2001 From: murphy Date: Sun, 9 Jul 2006 23:18:40 +0000 Subject: Fixed another bug in the Ruby scanner, this time it was unfinished heredocs with empty delimiter. Fixed documentation uploading. --- lib/coderay/scanners/ruby.rb | 729 +++++++++++++++++----------------- lib/coderay/scanners/ruby/patterns.rb | 420 ++++++++++---------- 2 files changed, 575 insertions(+), 574 deletions(-) (limited to 'lib/coderay/scanners') diff --git a/lib/coderay/scanners/ruby.rb b/lib/coderay/scanners/ruby.rb index 2a415eb..7ba3029 100644 --- a/lib/coderay/scanners/ruby.rb +++ b/lib/coderay/scanners/ruby.rb @@ -1,397 +1,398 @@ module CodeRay module Scanners - # This scanner is really complex, since Ruby _is_ a complex language! - # - # It tries to highlight 100% of all common code, - # and 90% of strange codes. - # - # It is optimized for HTML highlighting, and is not very useful for - # parsing or pretty printing. - # - # For now, I think it's better than the scanners in VIM or Syntax, or - # any highlighter I was able to find, except Caleb's RubyLexer. - # - # I hope it's also better than the rdoc/irb lexer. - class Ruby < Scanner + # This scanner is really complex, since Ruby _is_ a complex language! + # + # It tries to highlight 100% of all common code, + # and 90% of strange codes. + # + # It is optimized for HTML highlighting, and is not very useful for + # parsing or pretty printing. + # + # For now, I think it's better than the scanners in VIM or Syntax, or + # any highlighter I was able to find, except Caleb's RubyLexer. + # + # I hope it's also better than the rdoc/irb lexer. + class Ruby < Scanner - include Streamable + include Streamable - register_for :ruby + register_for :ruby - helper :patterns - - DEFAULT_OPTIONS = { - :parse_regexps => true, - } + helper :patterns + + DEFAULT_OPTIONS = { + :parse_regexps => true, + } - private - def scan_tokens tokens, options - parse_regexp = false # options[:parse_regexps] - first_bake = saved_tokens = nil - last_token_dot = false - fancy_allowed = regexp_allowed = true - heredocs = nil - last_state = nil - state = :initial - depth = nil - states = [] + private + def scan_tokens tokens, options + parse_regexp = false # options[:parse_regexps] + first_bake = saved_tokens = nil + last_token_dot = false + fancy_allowed = regexp_allowed = true + heredocs = nil + last_state = nil + state = :initial + depth = nil + states = [] - patterns = Patterns # avoid constant lookup + patterns = Patterns # avoid constant lookup - until eos? - type = :error - match = nil - kind = nil + until eos? + type = :error + match = nil + kind = nil - if state.instance_of? patterns::StringState + if state.instance_of? patterns::StringState # {{{ - match = scan_until(state.pattern) || scan_until(/\z/) - tokens << [match, :content] unless match.empty? - break if eos? - - if state.heredoc and self[1] - match = getch + scan_until(/$/) - tokens << [match, :delimiter] - tokens << [:close, state.type] - state = state.next_state - next - end - - case match = getch - - when state.delim - if state.paren - state.paren_depth -= 1 - if state.paren_depth > 0 - tokens << [match, :nesting_delimiter] - next - end - end - tokens << [match, :delimiter] - if state.type == :regexp and not eos? - modifiers = scan(/#{patterns::REGEXP_MODIFIERS}/ox) - tokens << [modifiers, :modifier] unless modifiers.empty? - if parse_regexp - extended = modifiers.index ?x - tokens = saved_tokens - regexp = tokens - for text, type in regexp - if text.is_a? ::String - case type - when :content - text.scan(/([^#]+)|(#.*)/) do |plain, comment| - if plain - tokens << [plain, :content] - else - tokens << [comment, :comment] - end - end - when :character - if text[/\\(?:[swdSWDAzZbB]|\d+)/] - tokens << [text, :modifier] - else - tokens << [text, type] - end - else - tokens << [text, type] - end - else - tokens << [text, type] - end - end - first_bake = saved_tokens = nil - end - end - tokens << [:close, state.type] - fancy_allowed = regexp_allowed = false - state = state.next_state - - when '\\' - if state.interpreted - if esc = scan(/ #{patterns::ESCAPE} /ox) - tokens << [match + esc, :char] - else - tokens << [match, :error] - end - else - case m = getch - when state.delim, '\\' - tokens << [match + m, :char] + match = scan_until(state.pattern) || scan_until(/\z/) + tokens << [match, :content] unless match.empty? + break if eos? + + if state.heredoc and self[1] # end of heredoc + match = getch.to_s + match << scan_until(/$/) unless eos? + tokens << [match, :delimiter] + tokens << [:close, state.type] + state = state.next_state + next + end + + case match = getch + + when state.delim + if state.paren + state.paren_depth -= 1 + if state.paren_depth > 0 + tokens << [match, :nesting_delimiter] + next + end + end + tokens << [match, :delimiter] + if state.type == :regexp and not eos? + modifiers = scan(/#{patterns::REGEXP_MODIFIERS}/ox) + tokens << [modifiers, :modifier] unless modifiers.empty? + if parse_regexp + extended = modifiers.index ?x + tokens = saved_tokens + regexp = tokens + for text, type in regexp + if text.is_a? ::String + case type + when :content + text.scan(/([^#]+)|(#.*)/) do |plain, comment| + if plain + tokens << [plain, :content] + else + tokens << [comment, :comment] + end + end + when :character + if text[/\\(?:[swdSWDAzZbB]|\d+)/] + tokens << [text, :modifier] + else + tokens << [text, type] + end + else + tokens << [text, type] + end + else + tokens << [text, type] + end + end + first_bake = saved_tokens = nil + end + end + tokens << [:close, state.type] + fancy_allowed = regexp_allowed = false + state = state.next_state + + when '\\' + if state.interpreted + if esc = scan(/ #{patterns::ESCAPE} /ox) + tokens << [match + esc, :char] + else + tokens << [match, :error] + end + else + case m = getch + when state.delim, '\\' + tokens << [match + m, :char] when nil tokens << [match, :error] - else - tokens << [match + m, :content] - end - end - - when '#' - case peek(1)[0] - when ?{ - states.push [state, depth, heredocs] - fancy_allowed = regexp_allowed = true - state = :initial - depth = 1 - tokens << [:open, :inline] - tokens << [match + getch, :delimiter] - when ?$, ?@ - tokens << [match, :escape] - last_state = state # scan one token as normal code, then return here - state = :initial - else - raise_inspect 'else-case # reached; #%p not handled' % peek(1), tokens - end - - when state.paren - state.paren_depth += 1 - tokens << [match, :nesting_delimiter] + else + tokens << [match + m, :content] + end + end + + when '#' + case peek(1)[0] + when ?{ + states.push [state, depth, heredocs] + fancy_allowed = regexp_allowed = true + state = :initial + depth = 1 + tokens << [:open, :inline] + tokens << [match + getch, :delimiter] + when ?$, ?@ + tokens << [match, :escape] + last_state = state # scan one token as normal code, then return here + state = :initial + else + raise_inspect 'else-case # reached; #%p not handled' % peek(1), tokens + end + + when state.paren + state.paren_depth += 1 + tokens << [match, :nesting_delimiter] - when /#{patterns::REGEXP_SYMBOLS}/ox - tokens << [match, :function] - - else - raise_inspect 'else-case " reached; %p not handled, state = %p' % [match, state], tokens - - end - next + when /#{patterns::REGEXP_SYMBOLS}/ox + tokens << [match, :function] + + else + raise_inspect 'else-case " reached; %p not handled, state = %p' % [match, state], tokens + + end + next # }}} - else -# {{{ - if match = scan(/ [ \t\f]+ | \\? \n | \# .* /x) or - ( bol? and match = scan(/#{patterns::RUBYDOC_OR_DATA}/o) ) - fancy_allowed = true - case m = match[0] - when ?\s, ?\t, ?\f - match << scan(/\s*/) unless eos? or heredocs - type = :space - when ?\n, ?\\ - type = :space - if m == ?\n - regexp_allowed = true - state = :initial if state == :undef_comma_expected - end - if heredocs - unscan # heredoc scanning needs \n at start - state = heredocs.shift - tokens << [:open, state.type] - heredocs = nil if heredocs.empty? - next - else - match << scan(/\s*/) unless eos? - end - when ?#, ?=, ?_ - type = :comment - regexp_allowed = true - else - raise_inspect 'else-case _ reached, because case %p was not handled' % [matched[0].chr], tokens - end - tokens << [match, type] - next + else +# {{{ + if match = scan(/ [ \t\f]+ | \\? \n | \# .* /x) or + ( bol? and match = scan(/#{patterns::RUBYDOC_OR_DATA}/o) ) + fancy_allowed = true + case m = match[0] + when ?\s, ?\t, ?\f + match << scan(/\s*/) unless eos? or heredocs + type = :space + when ?\n, ?\\ + type = :space + if m == ?\n + regexp_allowed = true + state = :initial if state == :undef_comma_expected + end + if heredocs + unscan # heredoc scanning needs \n at start + state = heredocs.shift + tokens << [:open, state.type] + heredocs = nil if heredocs.empty? + next + else + match << scan(/\s*/) unless eos? + end + when ?#, ?=, ?_ + type = :comment + regexp_allowed = true + else + raise_inspect 'else-case _ reached, because case %p was not handled' % [matched[0].chr], tokens + end + tokens << [match, type] + next - elsif state == :initial - - # IDENTS # - if match = scan(/#{patterns::METHOD_NAME}/o) - if last_token_dot - type = if match[/^[A-Z]/] and not match?(/\(/) then :constant else :ident end - else - type = patterns::IDENT_KIND[match] - if type == :ident and match[/^[A-Z]/] and not match[/[!?]$/] and not match?(/\(/) - type = :constant - elsif type == :reserved - state = patterns::DEF_NEW_STATE[match] - end - end - ## experimental! - fancy_allowed = regexp_allowed = :set if patterns::REGEXP_ALLOWED[match] or check(/\s+(?:%\S|\/\S)/) - - # OPERATORS # - elsif (not last_token_dot and match = scan(/ ==?=? | \.\.?\.? | [\(\)\[\]\{\}] | :: | , /x)) or - (last_token_dot and match = scan(/#{patterns::METHOD_NAME_OPERATOR}/o)) - if match !~ / [.\)\]\}] /x or match =~ /\.\.\.?/ - regexp_allowed = fancy_allowed = :set - end - last_token_dot = :set if match == '.' or match == '::' - type = :operator - unless states.empty? - case match - when '{' - depth += 1 - when '}' - depth -= 1 - if depth == 0 - state, depth, heredocs = states.pop - tokens << [match, :delimiter] - type = :inline - match = :close - end - end - end - - elsif match = scan(/ ['"] /mx) - tokens << [:open, :string] - type = :delimiter - state = patterns::StringState.new :string, match == '"', match # important for streaming - - elsif match = scan(/#{patterns::INSTANCE_VARIABLE}/o) - type = :instance_variable + elsif state == :initial + + # IDENTS # + if match = scan(/#{patterns::METHOD_NAME}/o) + if last_token_dot + type = if match[/^[A-Z]/] and not match?(/\(/) then :constant else :ident end + else + type = patterns::IDENT_KIND[match] + if type == :ident and match[/^[A-Z]/] and not match[/[!?]$/] and not match?(/\(/) + type = :constant + elsif type == :reserved + state = patterns::DEF_NEW_STATE[match] + end + end + ## experimental! + fancy_allowed = regexp_allowed = :set if patterns::REGEXP_ALLOWED[match] or check(/\s+(?:%\S|\/\S)/) + + # OPERATORS # + elsif (not last_token_dot and match = scan(/ ==?=? | \.\.?\.? | [\(\)\[\]\{\}] | :: | , /x)) or + (last_token_dot and match = scan(/#{patterns::METHOD_NAME_OPERATOR}/o)) + if match !~ / [.\)\]\}] /x or match =~ /\.\.\.?/ + regexp_allowed = fancy_allowed = :set + end + last_token_dot = :set if match == '.' or match == '::' + type = :operator + unless states.empty? + case match + when '{' + depth += 1 + when '}' + depth -= 1 + if depth == 0 + state, depth, heredocs = states.pop + tokens << [match, :delimiter] + type = :inline + match = :close + end + end + end + + elsif match = scan(/ ['"] /mx) + tokens << [:open, :string] + type = :delimiter + state = patterns::StringState.new :string, match == '"', match # important for streaming + + elsif match = scan(/#{patterns::INSTANCE_VARIABLE}/o) + type = :instance_variable - elsif regexp_allowed and match = scan(/\//) - tokens << [:open, :regexp] - type = :delimiter - interpreted = true - state = patterns::StringState.new :regexp, interpreted, match - if parse_regexp - tokens = [] - saved_tokens = tokens - end - - elsif match = scan(/#{patterns::NUMERIC}/o) - type = if self[1] then :float else :integer end + elsif regexp_allowed and match = scan(/\//) + tokens << [:open, :regexp] + type = :delimiter + interpreted = true + state = patterns::StringState.new :regexp, interpreted, match + if parse_regexp + tokens = [] + saved_tokens = tokens + end + + elsif match = scan(/#{patterns::NUMERIC}/o) + type = if self[1] then :float else :integer end - elsif match = scan(/#{patterns::SYMBOL}/o) - case delim = match[1] - when ?', ?" - tokens << [:open, :symbol] - tokens << [':', :symbol] - match = delim.chr - type = :delimiter - state = patterns::StringState.new :symbol, delim == ?", match - else - type = :symbol - end - - elsif match = scan(/ [-+!~^]=? | [*|&]{1,2}=? | >>? /x) - regexp_allowed = fancy_allowed = :set - type = :operator - - elsif fancy_allowed and match = scan(/#{patterns::HEREDOC_OPEN}/o) - indented = self[1] == '-' - quote = self[3] - delim = self[quote ? 4 : 2] - type = patterns::QUOTE_TO_TYPE[quote] - tokens << [:open, type] - tokens << [match, :delimiter] - match = :close - heredoc = patterns::StringState.new type, quote != '\'', delim, (indented ? :indented : :linestart ) - heredocs ||= [] # create heredocs if empty - heredocs << heredoc - - elsif fancy_allowed and match = scan(/#{patterns::FANCY_START_SAVE}/o) - type, interpreted = *patterns::FancyStringType.fetch(self[1]) do - raise_inspect 'Unknown fancy string: %%%p' % k, tokens - end - tokens << [:open, type] - state = patterns::StringState.new type, interpreted, self[2] - type = :delimiter + elsif match = scan(/#{patterns::SYMBOL}/o) + case delim = match[1] + when ?', ?" + tokens << [:open, :symbol] + tokens << [':', :symbol] + match = delim.chr + type = :delimiter + state = patterns::StringState.new :symbol, delim == ?", match + else + type = :symbol + end + + elsif match = scan(/ [-+!~^]=? | [*|&]{1,2}=? | >>? /x) + regexp_allowed = fancy_allowed = :set + type = :operator + + elsif fancy_allowed and match = scan(/#{patterns::HEREDOC_OPEN}/o) + indented = self[1] == '-' + quote = self[3] + delim = self[quote ? 4 : 2] + type = patterns::QUOTE_TO_TYPE[quote] + tokens << [:open, type] + tokens << [match, :delimiter] + match = :close + heredoc = patterns::StringState.new type, quote != '\'', delim, (indented ? :indented : :linestart ) + heredocs ||= [] # create heredocs if empty + heredocs << heredoc + + elsif fancy_allowed and match = scan(/#{patterns::FANCY_START_SAVE}/o) + type, interpreted = *patterns::FancyStringType.fetch(self[1]) do + raise_inspect 'Unknown fancy string: %%%p' % k, tokens + end + tokens << [:open, type] + state = patterns::StringState.new type, interpreted, self[2] + type = :delimiter - elsif fancy_allowed and match = scan(/#{patterns::CHARACTER}/o) - type = :integer + elsif fancy_allowed and match = scan(/#{patterns::CHARACTER}/o) + type = :integer - elsif match = scan(/ [\/%]=? | <(?:<|=>?)? | [?:;] /x) - regexp_allowed = fancy_allowed = :set - type = :operator + elsif match = scan(/ [\/%]=? | <(?:<|=>?)? | [?:;] /x) + regexp_allowed = fancy_allowed = :set + type = :operator - elsif match = scan(/`/) - if last_token_dot - type = :operator - else - tokens << [:open, :shell] - type = :delimiter - state = patterns::StringState.new :shell, true, match - end - - elsif match = scan(/#{patterns::GLOBAL_VARIABLE}/o) - type = :global_variable - - elsif match = scan(/#{patterns::CLASS_VARIABLE}/o) - type = :class_variable - - else - match = getch - - end - - elsif state == :def_expected - state = :initial - if match = scan(/(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o) - type = :method - else - next - end + elsif match = scan(/`/) + if last_token_dot + type = :operator + else + tokens << [:open, :shell] + type = :delimiter + state = patterns::StringState.new :shell, true, match + end + + elsif match = scan(/#{patterns::GLOBAL_VARIABLE}/o) + type = :global_variable + + elsif match = scan(/#{patterns::CLASS_VARIABLE}/o) + type = :class_variable + + else + match = getch + + end + + elsif state == :def_expected + state = :initial + if match = scan(/(?>#{patterns::METHOD_NAME_EX})(?!\.|::)/o) + type = :method + else + next + end - elsif state == :undef_expected - state = :undef_comma_expected - if match = scan(/#{patterns::METHOD_NAME_EX}/o) - type = :method - elsif match = scan(/#{patterns::SYMBOL}/o) - case delim = match[1] - when ?', ?" - tokens << [:open, :symbol] - tokens << [':', :symbol] - match = delim.chr - type = :delimiter - state = patterns::StringState.new :symbol, delim == ?", match - state.next_state = :undef_comma_expected - else - type = :symbol - end - else - state = :initial - next - end - - elsif state == :undef_comma_expected - if match = scan(/,/) - type = :operator - state = :undef_expected - else - state = :initial - next - end + elsif state == :undef_expected + state = :undef_comma_expected + if match = scan(/#{patterns::METHOD_NAME_EX}/o) + type = :method + elsif match = scan(/#{patterns::SYMBOL}/o) + case delim = match[1] + when ?', ?" + tokens << [:open, :symbol] + tokens << [':', :symbol] + match = delim.chr + type = :delimiter + state = patterns::StringState.new :symbol, delim == ?", match + state.next_state = :undef_comma_expected + else + type = :symbol + end + else + state = :initial + next + end + + elsif state == :undef_comma_expected + if match = scan(/,/) + type = :operator + state = :undef_expected + else + state = :initial + next + end - elsif state == :module_expected - if match = scan(/<> # append or shift left, shift right - | <=?>? | >=? # comparison, rocket operator - | ===? # simple equality and case equality - /ox - METHOD_NAME_EX = / #{IDENT} (?:[?!]|=(?!>))? | #{METHOD_NAME_OPERATOR} /ox - INSTANCE_VARIABLE = / @ #{IDENT} /ox - CLASS_VARIABLE = / @@ #{IDENT} /ox - OBJECT_VARIABLE = / @@? #{IDENT} /ox - GLOBAL_VARIABLE = / \$ (?: #{IDENT} | [1-9]\d* | 0\w* | [~&+`'=\/,;_.<>!@$?*":\\] | -[a-zA-Z_0-9] ) /ox - PREFIX_VARIABLE = / #{GLOBAL_VARIABLE} |#{OBJECT_VARIABLE} /ox - VARIABLE = / @?@? #{IDENT} | #{GLOBAL_VARIABLE} /ox - - QUOTE_TO_TYPE = { - '`' => :shell, - '/'=> :regexp, - } - QUOTE_TO_TYPE.default = :string - - REGEXP_MODIFIERS = /[mixounse]*/ - REGEXP_SYMBOLS = /[|?*+?(){}\[\].^$]/ - - DECIMAL = /\d+(?:_\d+)*/ - OCTAL = /0_?[0-7]+(?:_[0-7]+)*/ - HEXADECIMAL = /0x[0-9A-Fa-f]+(?:_[0-9A-Fa-f]+)*/ - BINARY = /0b[01]+(?:_[01]+)*/ - - EXPONENT = / [eE] [+-]? #{DECIMAL} /ox - FLOAT_SUFFIX = / #{EXPONENT} | \. #{DECIMAL} #{EXPONENT}? /ox - FLOAT_OR_INT = / #{DECIMAL} (?: #{FLOAT_SUFFIX} () )? /ox - NUMERIC = / [-+]? (?: (?=0) (?: #{OCTAL} | #{HEXADECIMAL} | #{BINARY} ) | #{FLOAT_OR_INT} ) /ox - - SYMBOL = / - : - (?: - #{METHOD_NAME_EX} - | #{PREFIX_VARIABLE} - | ['"] - ) - /ox - - # TODO investigste \M, \c and \C escape sequences - # (?: M-\\C-|C-\\M-|M-\\c|c\\M-|c|C-|M-)? (?: \\ (?: [0-7]{3} | x[0-9A-Fa-f]{2} | . ) ) - # assert_equal(225, ?\M-a) - # assert_equal(129, ?\M-\C-a) - ESCAPE = / - [abefnrstv] - | M-\\C-|C-\\M-|M-\\c|c\\M-|c|C-|M- - | [0-7]{1,3} - | x[0-9A-Fa-f]{1,2} - | . - /mx - - CHARACTER = / - \? - (?: - [^\s\\] - | \\ #{ESCAPE} - ) - /mx - - # NOTE: This is not completely correct, but - # nobody needs heredoc delimiters ending with \n. - HEREDOC_OPEN = / - << (-)? # $1 = float - (?: - ( [A-Za-z_0-9]+ ) # $2 = delim - | - ( ["'`] ) # $3 = quote, type - ( [^\n]*? ) \3 # $4 = delim - ) - /mx - - RUBYDOC = / - =begin (?!\S) - .*? - (?: \Z | ^=end (?!\S) [^\n]* ) - /mx - - DATA = / - __END__$ - .*? - (?: \Z | (?=^\#CODE) ) - /mx - - RUBYDOC_OR_DATA = / #{RUBYDOC} | #{DATA} /xo - - RDOC_DATA_START = / ^=begin (?!\S) | ^__END__$ /x - - # FIXME: \s and = are only a workaround, they are still allowed - # as delimiters. - FANCY_START_SAVE = / % ( [qQwWxsr] | (?![a-zA-Z0-9\s=]) ) ([^a-zA-Z0-9]) /mx - FANCY_START_CORRECT = / % ( [qQwWxsr] | (?![a-zA-Z0-9]) ) ([^a-zA-Z0-9]) /mx - - FancyStringType = { - 'q' => [:string, false], - 'Q' => [:string, true], - 'r' => [:regexp, true], - 's' => [:symbol, false], - 'x' => [:shell, true] - } - FancyStringType['w'] = FancyStringType['q'] - FancyStringType['W'] = FancyStringType[''] = FancyStringType['Q'] - - class StringState < Struct.new :type, :interpreted, :delim, :heredoc, - :paren, :paren_depth, :pattern, :next_state - - CLOSING_PAREN = Hash[ *%w[ - ( ) - [ ] - < > - { } - ] ] - - CLOSING_PAREN.values.each { |o| o.freeze } # debug, if I try to change it with << - OPENING_PAREN = CLOSING_PAREN.invert - - STRING_PATTERN = Hash.new { |h, k| - delim, interpreted = *k - delim_pattern = Regexp.escape(delim.dup) - if closing_paren = CLOSING_PAREN[delim] - delim_pattern << Regexp.escape(closing_paren) - end - - - special_escapes = - case interpreted - when :regexp_symbols - '| ' + REGEXP_SYMBOLS.source - when :words - '| \s' - end - - h[k] = - if interpreted and not delim == '#' - / (?= [#{delim_pattern}\\] | \# [{$@] #{special_escapes} ) /mx - else - / (?= [#{delim_pattern}\\] #{special_escapes} ) /mx - end - } - - HEREDOC_PATTERN = Hash.new { |h, k| - delim, interpreted, indented = *k - delim_pattern = Regexp.escape(delim.dup) - delim_pattern = / \n #{ '(?>[\ \t]*)' if indented } #{ Regexp.new delim_pattern } $ /x - h[k] = - if interpreted - / (?= #{delim_pattern}() | \\ | \# [{$@] ) /mx # $1 set == end of heredoc - else - / (?= #{delim_pattern}() | \\ ) /mx - end - } - - def initialize kind, interpreted, delim, heredoc = false - if heredoc - pattern = HEREDOC_PATTERN[ [delim, interpreted, heredoc == :indented] ] - delim = nil - else - pattern = STRING_PATTERN[ [delim, interpreted] ] - if paren = CLOSING_PAREN[delim] - delim, paren = paren, delim - paren_depth = 1 - end - end - super kind, interpreted, delim, heredoc, paren, paren_depth, pattern, :initial - end - end unless defined? StringState - - end + module Ruby::Patterns # :nodoc: + + RESERVED_WORDS = %w[ + and def end in or unless begin + defined? ensure module redo super until + BEGIN break do next rescue then + when END case else for retry + while alias class elsif if not return + undef yield + ] + + DEF_KEYWORDS = %w[ def ] + UNDEF_KEYWORDS = %w[ undef ] + MODULE_KEYWORDS = %w[class module] + DEF_NEW_STATE = WordList.new(:initial). + add(DEF_KEYWORDS, :def_expected). + add(UNDEF_KEYWORDS, :undef_expected). + add(MODULE_KEYWORDS, :module_expected) + + IDENTS_ALLOWING_REGEXP = %w[ + and or not while until unless if then elsif when sub sub! gsub gsub! scan slice slice! split + ] + REGEXP_ALLOWED = WordList.new(false). + add(IDENTS_ALLOWING_REGEXP, :set) + + PREDEFINED_CONSTANTS = %w[ + nil true false self + DATA ARGV ARGF __FILE__ __LINE__ + ] + + IDENT_KIND = WordList.new(:ident). + add(RESERVED_WORDS, :reserved). + add(PREDEFINED_CONSTANTS, :pre_constant) + + IDENT = /[a-z_][\w_]*/i + + METHOD_NAME = / #{IDENT} [?!]? /ox + METHOD_NAME_OPERATOR = / + \*\*? # multiplication and power + | [-+]@? # plus, minus + | [\/%&|^`~] # division, modulo or format strings, &and, |or, ^xor, `system`, tilde + | \[\]=? # array getter and setter + | << | >> # append or shift left, shift right + | <=?>? | >=? # comparison, rocket operator + | ===? # simple equality and case equality + /ox + METHOD_NAME_EX = / #{IDENT} (?:[?!]|=(?!>))? | #{METHOD_NAME_OPERATOR} /ox + INSTANCE_VARIABLE = / @ #{IDENT} /ox + CLASS_VARIABLE = / @@ #{IDENT} /ox + OBJECT_VARIABLE = / @@? #{IDENT} /ox + GLOBAL_VARIABLE = / \$ (?: #{IDENT} | [1-9]\d* | 0\w* | [~&+`'=\/,;_.<>!@$?*":\\] | -[a-zA-Z_0-9] ) /ox + PREFIX_VARIABLE = / #{GLOBAL_VARIABLE} |#{OBJECT_VARIABLE} /ox + VARIABLE = / @?@? #{IDENT} | #{GLOBAL_VARIABLE} /ox + + QUOTE_TO_TYPE = { + '`' => :shell, + '/'=> :regexp, + } + QUOTE_TO_TYPE.default = :string + + REGEXP_MODIFIERS = /[mixounse]*/ + REGEXP_SYMBOLS = /[|?*+?(){}\[\].^$]/ + + DECIMAL = /\d+(?:_\d+)*/ + OCTAL = /0_?[0-7]+(?:_[0-7]+)*/ + HEXADECIMAL = /0x[0-9A-Fa-f]+(?:_[0-9A-Fa-f]+)*/ + BINARY = /0b[01]+(?:_[01]+)*/ + + EXPONENT = / [eE] [+-]? #{DECIMAL} /ox + FLOAT_SUFFIX = / #{EXPONENT} | \. #{DECIMAL} #{EXPONENT}? /ox + FLOAT_OR_INT = / #{DECIMAL} (?: #{FLOAT_SUFFIX} () )? /ox + NUMERIC = / [-+]? (?: (?=0) (?: #{OCTAL} | #{HEXADECIMAL} | #{BINARY} ) | #{FLOAT_OR_INT} ) /ox + + SYMBOL = / + : + (?: + #{METHOD_NAME_EX} + | #{PREFIX_VARIABLE} + | ['"] + ) + /ox + + # TODO investigste \M, \c and \C escape sequences + # (?: M-\\C-|C-\\M-|M-\\c|c\\M-|c|C-|M-)? (?: \\ (?: [0-7]{3} | x[0-9A-Fa-f]{2} | . ) ) + # assert_equal(225, ?\M-a) + # assert_equal(129, ?\M-\C-a) + ESCAPE = / + [abefnrstv] + | M-\\C-|C-\\M-|M-\\c|c\\M-|c|C-|M- + | [0-7]{1,3} + | x[0-9A-Fa-f]{1,2} + | . + /mx + + CHARACTER = / + \? + (?: + [^\s\\] + | \\ #{ESCAPE} + ) + /mx + + # NOTE: This is not completely correct, but + # nobody needs heredoc delimiters ending with \n. + HEREDOC_OPEN = / + << (-)? # $1 = float + (?: + ( [A-Za-z_0-9]+ ) # $2 = delim + | + ( ["'`] ) # $3 = quote, type + ( [^\n]*? ) \3 # $4 = delim + ) + /mx + + RUBYDOC = / + =begin (?!\S) + .*? + (?: \Z | ^=end (?!\S) [^\n]* ) + /mx + + DATA = / + __END__$ + .*? + (?: \Z | (?=^\#CODE) ) + /mx + + RUBYDOC_OR_DATA = / #{RUBYDOC} | #{DATA} /xo + + RDOC_DATA_START = / ^=begin (?!\S) | ^__END__$ /x + + # FIXME: \s and = are only a workaround, they are still allowed + # as delimiters. + FANCY_START_SAVE = / % ( [qQwWxsr] | (?![a-zA-Z0-9\s=]) ) ([^a-zA-Z0-9]) /mx + FANCY_START_CORRECT = / % ( [qQwWxsr] | (?![a-zA-Z0-9]) ) ([^a-zA-Z0-9]) /mx + + FancyStringType = { + 'q' => [:string, false], + 'Q' => [:string, true], + 'r' => [:regexp, true], + 's' => [:symbol, false], + 'x' => [:shell, true] + } + FancyStringType['w'] = FancyStringType['q'] + FancyStringType['W'] = FancyStringType[''] = FancyStringType['Q'] + + class StringState < Struct.new :type, :interpreted, :delim, :heredoc, + :paren, :paren_depth, :pattern, :next_state + + CLOSING_PAREN = Hash[ *%w[ + ( ) + [ ] + < > + { } + ] ] + + CLOSING_PAREN.values.each { |o| o.freeze } # debug, if I try to change it with << + OPENING_PAREN = CLOSING_PAREN.invert + + STRING_PATTERN = Hash.new { |h, k| + delim, interpreted = *k + delim_pattern = Regexp.escape(delim.dup) + if closing_paren = CLOSING_PAREN[delim] + delim_pattern << Regexp.escape(closing_paren) + end + + + special_escapes = + case interpreted + when :regexp_symbols + '| ' + REGEXP_SYMBOLS.source + when :words + '| \s' + end + + h[k] = + if interpreted and not delim == '#' + / (?= [#{delim_pattern}\\] | \# [{$@] #{special_escapes} ) /mx + else + / (?= [#{delim_pattern}\\] #{special_escapes} ) /mx + end + } + + HEREDOC_PATTERN = Hash.new { |h, k| + delim, interpreted, indented = *k + delim_pattern = Regexp.escape(delim.dup) + delim_pattern = / \n #{ '(?>[\ \t]*)' if indented } #{ Regexp.new delim_pattern } $ /x + h[k] = + if interpreted + / (?= #{delim_pattern}() | \\ | \# [{$@] ) /mx # $1 set == end of heredoc + else + / (?= #{delim_pattern}() | \\ ) /mx + end + } + + def initialize kind, interpreted, delim, heredoc = false + if heredoc + pattern = HEREDOC_PATTERN[ [delim, interpreted, heredoc == :indented] ] + delim = nil + else + pattern = STRING_PATTERN[ [delim, interpreted] ] + if paren = CLOSING_PAREN[delim] + delim, paren = paren, delim + paren_depth = 1 + end + end + super kind, interpreted, delim, heredoc, paren, paren_depth, pattern, :initial + end + end unless defined? StringState + + end end end -- cgit v1.2.1