###############
# tokenizer.rb
###############
require "strscan"

# Splits an XPath expression string into tokens, one per call to
# #next_token.
#
# Tokens are arrays. Punctuation ("(", ")", "[", "]", "@", ",", ".", "..",
# "::") is returned as a one-element array holding the matched text; every
# other token is a two-element array of [kind, text], where kind is one of
# :Number, :Literal, :Operator, :NameTest, :AxisName, :NodeType,
# :FunctionName or :VariableReference.
class Tokenizer
  # Regexp source fragments (plain strings, interpolated into the patterns
  # below).
  NCName = "[a-zA-Z_][-a-zA-Z0-9._]*"
  QName = "(#{NCName}:)?#{NCName}"
  AxisName = "(ancestor-or-self|ancestor|attribute|child|descendant-or-self" +
             "|descendant|following-sibling|following|namespace" +
             "|parent|preceding-sibling|preceding|self)"
  NodeType = "(comment|text|processing-instruction|node)"
  OperatorName = "(and|or|mod|div)"
  Digits = "([0-9]+)"
  Literal = %{("[^"]*"|'[^']*')}

  # Pre-compiled patterns. StringScanner#scan and #skip only match at the
  # scan pointer, so no explicit anchor is needed.
  WHITESPACE_RE = /\s+/
  PUNCTUATION_RE = /[()@,\]\[]/
  NUMBER_RE = /(#{Digits}([.]#{Digits}?)?|[.]#{Digits})/
  DOTS_RE = /(::|[.][.]?)/
  LITERAL_RE = /#{Literal}/
  SYMBOL_OPERATOR_RE = %r{([+-]|//?|!=|=|[<]=?|[>]=?)}
  STAR_RE = /[*]/
  # An operator name must not run straight into further name characters
  # (so "android" is not read as "and" + "roid").
  OPERATOR_NAME_RE = /#{OperatorName}(?![-a-zA-Z0-9._])/
  # The lookaheads keep the "::" / "(" out of the matched text.
  AXIS_NAME_RE = /#{AxisName}(?=\s*::)/
  NODE_TYPE_RE = /#{NodeType}(?=\s*\()/
  FUNCTION_NAME_RE = /#{QName}(?=\s*\()/
  VARIABLE_RE = /[\$]#{QName}/
  NAME_TEST_RE = /((#{NCName}:)?[*]|#{QName})/

  # Punctuation after which an operand (not an operator) must follow.
  OPERAND_OPENERS = %w{@ :: ( [ ,}

  private_constant :WHITESPACE_RE, :PUNCTUATION_RE, :NUMBER_RE, :DOTS_RE,
                   :LITERAL_RE, :SYMBOL_OPERATOR_RE, :STAR_RE,
                   :OPERATOR_NAME_RE, :AXIS_NAME_RE, :NODE_TYPE_RE,
                   :FUNCTION_NAME_RE, :VARIABLE_RE, :NAME_TEST_RE,
                   :OPERAND_OPENERS

  # str - the expression text to tokenize.
  def initialize( str )
    @scanner = StringScanner.new( str )
    @last_token = nil
  end

  # Consumes and returns the next token, skipping leading whitespace.
  #
  # Returns nil once the input is exhausted.
  # Raises RuntimeError ("Syntax Error!") when the remaining input does not
  # start with any recognised token.
  def next_token
    s = @scanner
    s.skip( WHITESPACE_RE )

    token =
      if s.scan( PUNCTUATION_RE )
        [s.matched]
      # numbers must be tried before "." and ".."
      elsif s.scan( NUMBER_RE )
        [:Number, s.matched]
      elsif s.scan( DOTS_RE )
        [s.matched]
      elsif s.scan( LITERAL_RE )
        [:Literal, s.matched]
      elsif s.scan( SYMBOL_OPERATOR_RE )
        [:Operator, s.matched]
      # "*" is multiplication only where an operator can appear;
      # otherwise it is a wildcard name test.
      elsif s.scan( STAR_RE )
        [operator_expected? ? :Operator : :NameTest, s.matched]
      # Likewise and/or/mod/div are operators only in operator position;
      # elsewhere they fall through and are treated as ordinary names.
      elsif operator_expected? && s.scan( OPERATOR_NAME_RE )
        [:Operator, s.matched]
      elsif s.scan( AXIS_NAME_RE )
        [:AxisName, s.matched]
      elsif s.scan( NODE_TYPE_RE )
        [:NodeType, s.matched]
      elsif s.scan( FUNCTION_NAME_RE )
        [:FunctionName, s.matched]
      elsif s.scan( VARIABLE_RE )
        [:VariableReference, s.matched]
      elsif s.scan( NAME_TEST_RE )
        [:NameTest, s.matched]
      elsif s.eos?
        nil
      else
        raise "Syntax Error!"
      end

    @last_token = token
  end

  private

  # True when the previous token can end an operand, i.e. there is a
  # previous token and it is neither an :Operator nor one of
  # "@", "::", "(", "[", ",".
  def operator_expected?
    return false if @last_token.nil?

    # The first element is the kind symbol for typed tokens and the
    # matched text itself for punctuation tokens.
    head = @last_token[0]
    !( head == :Operator || OPERAND_OPENERS.include?( head ) )
  end
end
############