net.sf.saxon.expr
Class Tokenizer

java.lang.Object
  extended by net.sf.saxon.expr.Tokenizer

public final class Tokenizer
extends Object

Tokenizer for expressions and inputs. This code was originally derived from James Clark's xt, though it has been greatly modified since. See copyright notice at end of file.


Field Summary
static int AND
          Operator "and"
static int ASSIGN
          ":=" symbol (XQuery only)
static int AT
          At token, "@"
static int ATTRIBUTE_QNAME
          composite token <'attribute' QNAME> (XQuery only)
static int AXIS
          Token representing the name of an axis and the following "::" symbol
static int BARE_NAME_STATE
          State in which a name is NOT to be merged with what comes next, for example "("
static int CASE
          Keyword "case"
static int CAST_AS
          operator "cast as"
static int CASTABLE_AS
          operator "castable as"
static int COLONCOLON
          "::" symbol
static int COLONSTAR
          ":*" symbol
static int COMMA
          Comma token
 int currentToken
          The number identifying the most recently read token
 int currentTokenStartIndex
          The position in the input expression where the current token starts
 String currentTokenValue
          The string value of the most recently read token
static int DECLARE_BASEURI
          "declare base-uri"
static int DECLARE_DEFAULT
          "declare default"
static int DECLARE_FUNCTION
          "define function"
static int DECLARE_NAMESPACE
          "declare namespace"
static int DECLARE_VALIDATION
          "declare validation"
static int DECLARE_VARIABLE
          "define variable"
static int DECLARE_XMLSPACE
          "declare xmlspace"
static int DEFAULT
          Keyword "default"
static int DEFAULT_STATE
          Initial default state of the Tokenizer
static int DIV
          Operator "div"
static int DOLLAR
          "$" symbol
static int DOT
          "." symbol
static int DOTDOT
          ".." symbol
static HashMap doubleKeywords
          Lookup table for composite (two-keyword) tokens
static int ELEMENT_QNAME
          composite token <'element' QNAME> (XQuery only)
static int ELSE
          Keyword "else"
static int EOF
          Pseudo-token representing the end of the expression
static int EQUALS
          Equals token ("=")
static int EVERY
          Keyword "every"
static int EXCEPT
          Operator "except"
static int FEQ
          operator "eq"
static int FGE
          operator "ge"
static int FGT
          operator "gt"
static int FLE
          operator "le"
static int FLT
          operator "lt"
static int FNE
          operator "ne"
static int FOLLOWS
          Operator ">>"
static int FOR
          "for" keyword
static int FUNCTION
          Token representing the name of a function and the following "(" symbol
static int GE
          Operator ">="
static int GT
          Operator ">"
static int IDIV
          operator "idiv"
static int IF
          Keyword "if"
static int IMPORT_MODULE
          "import module"
static int IMPORT_SCHEMA
          "import schema"
static int IN
          Keyword "in"
 String input
          The string being parsed
 int inputIndex
          The current position within the input string
static int INSTANCE_OF
          operator "instance of"
static int INTERSECT
          Operator "intersect"
static int IS
          Operator "is"
static int KEYWORD_CURLY
          composite token: <keyword "{"> (XQuery only)
(package private) static int LAST_OPERATOR
          Constant identifying the token number of the last token to be classified as an operator
static int LCURLY
          "{" symbol (XQuery only)
static int LE
          Operator "<="
static int LET
          "let" keyword (XQuery only)
static int LPAR
          Left parenthesis
static int LSQB
          Left square bracket
static int LT
          Operator "<"
static int MINUS
          Binary minus operator
static int MOD
          Operator "mod"
static int MODULE_NAMESPACE
          "module namespace"
static int MULT
          Multiply operator, "*" when used in an operator context
static int NAME
          Name token (a QName, in general)
static int NE
          Operator "!="
static int NEGATE
          Unary minus sign
static int NODEKIND
          Node kind, e.g. "node()" or "comment()"
static int NUMBER
          Numeric literal
static int OPERATOR_STATE
          State in which the next thing to be read is an operator
static int OR
          Operator "or"
static int PI_QNAME
          composite token <'pi' QNAME> (XQuery only)
static int PLUS
          Operator "+"
static int PRECEDES
          Operator "<<"
static int PREFIX
          "prefix:*" token
static int QMARK
          "?" symbol
static int RCURLY
          "}" symbol (XQuery only)
static int RETURN
          Keyword "return"
static int RPAR
          Right parenthesis
static int RSQB
          Right square bracket
static int SATISFIES
          Keyword "satisfies"
static int SEMICOLON
          semicolon separator
static int SEQUENCE_TYPE_STATE
          State in which the next thing to be read is a SequenceType
static int SLASH
          Forwards "/"
static int SLSL
          Double forwards slash, "//"
static int SOME
          Keyword "some"
static int STAR
          "*" symbol when used as a wildcard
static int STRING_LITERAL
          String literal
static int SUFFIX
          "*:local-name" token
static int TAG
          "<" at the start of a tag (XQuery only).
static int THEN
          Keyword "then"
static int TO
          Operator "to"
static String[] tokens
          The following strings are used to represent tokens in error messages
static int TREAT_AS
          operator "treat as"
static int TYPESWITCH
          Keyword "typeswitch"
static int TYPETEST
          "type("
static int UNION
          "union" or "|" token
static int UNKNOWN
          Pseudo-token representing the start of the expression
static int VALIDATE
          Various compound symbols supporting XQuery validation expression
static int VALIDATE_CONTEXT
           
static int VALIDATE_GLOBAL
           
static int VALIDATE_LAX
           
static int VALIDATE_SKIP
           
static int VALIDATE_STRICT
           
static int WHERE
          Keyword "where"
static int XQUERY_VERSION
          "xquery version"
 
Constructor Summary
Tokenizer()
           
 
Method Summary
 String getLexicalSubstring(int start, int end)
          Get a slice of the input expression
 int getLineNumber()
          Get the line number of the current token
 int getState()
           
 void lookAhead()
          Look ahead by one token.
 void next()
          Get the next token from the input expression.
 char nextChar()
          Read next character directly.
 String recentText()
          Get the most recently read text (for use in an error message)
 void setState(int state)
           
 void tokenize(String input, int start, int end)
          Prepare a string for tokenization.
 void treatCurrentAsOperator()
          Force the current token to be treated as an operator if possible
 void unreadChar()
          Step back one character
 
Methods inherited from class java.lang.Object
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
 

Field Detail

DEFAULT_STATE

public static final int DEFAULT_STATE
Initial default state of the Tokenizer

See Also:
Constant Field Values

BARE_NAME_STATE

public static final int BARE_NAME_STATE
State in which a name is NOT to be merged with what comes next, for example "("

See Also:
Constant Field Values

SEQUENCE_TYPE_STATE

public static final int SEQUENCE_TYPE_STATE
State in which the next thing to be read is a SequenceType

See Also:
Constant Field Values

OPERATOR_STATE

public static final int OPERATOR_STATE
State in which the next thing to be read is an operator

See Also:
Constant Field Values

UNKNOWN

public static final int UNKNOWN
Pseudo-token representing the start of the expression

See Also:
Constant Field Values

EOF

public static final int EOF
Pseudo-token representing the end of the expression

See Also:
Constant Field Values

UNION

public static final int UNION
"union" or "|" token

See Also:
Constant Field Values

SLASH

public static final int SLASH
Forwards "/"

See Also:
Constant Field Values

AT

public static final int AT
At token, "@"

See Also:
Constant Field Values

LSQB

public static final int LSQB
Left square bracket

See Also:
Constant Field Values

LPAR

public static final int LPAR
Left parenthesis

See Also:
Constant Field Values

EQUALS

public static final int EQUALS
Equals token ("=")

See Also:
Constant Field Values

COMMA

public static final int COMMA
Comma token

See Also:
Constant Field Values

SLSL

public static final int SLSL
Double forwards slash, "//"

See Also:
Constant Field Values

OR

public static final int OR
Operator "or"

See Also:
Constant Field Values

AND

public static final int AND
Operator "and"

See Also:
Constant Field Values

GT

public static final int GT
Operator ">"

See Also:
Constant Field Values

LT

public static final int LT
Operator "<"

See Also:
Constant Field Values

GE

public static final int GE
Operator ">="

See Also:
Constant Field Values

LE

public static final int LE
Operator "<="

See Also:
Constant Field Values

PLUS

public static final int PLUS
Operator "+"

See Also:
Constant Field Values

MINUS

public static final int MINUS
Binary minus operator

See Also:
Constant Field Values

MULT

public static final int MULT
Multiply operator, "*" when used in an operator context

See Also:
Constant Field Values

DIV

public static final int DIV
Operator "div"

See Also:
Constant Field Values

MOD

public static final int MOD
Operator "mod"

See Also:
Constant Field Values

IS

public static final int IS
Operator "is"

See Also:
Constant Field Values

DOLLAR

public static final int DOLLAR
"$" symbol

See Also:
Constant Field Values

NE

public static final int NE
Operator "!="

See Also:
Constant Field Values

INTERSECT

public static final int INTERSECT
Operator "intersect"

See Also:
Constant Field Values

EXCEPT

public static final int EXCEPT
Operator "except"

See Also:
Constant Field Values

RETURN

public static final int RETURN
Keyword "return"

See Also:
Constant Field Values

THEN

public static final int THEN
Keyword "then"

See Also:
Constant Field Values

ELSE

public static final int ELSE
Keyword "else"

See Also:
Constant Field Values

WHERE

public static final int WHERE
Keyword "where"

See Also:
Constant Field Values

TO

public static final int TO
Operator "to"

See Also:
Constant Field Values

IN

public static final int IN
Keyword "in"

See Also:
Constant Field Values

SOME

public static final int SOME
Keyword "some"

See Also:
Constant Field Values

EVERY

public static final int EVERY
Keyword "every"

See Also:
Constant Field Values

SATISFIES

public static final int SATISFIES
Keyword "satisfies"

See Also:
Constant Field Values

FUNCTION

public static final int FUNCTION
Token representing the name of a function and the following "(" symbol

See Also:
Constant Field Values

AXIS

public static final int AXIS
Token representing the name of an axis and the following "::" symbol

See Also:
Constant Field Values

IF

public static final int IF
Keyword "if"

See Also:
Constant Field Values

PRECEDES

public static final int PRECEDES
Operator "<<"

See Also:
Constant Field Values

FOLLOWS

public static final int FOLLOWS
Operator ">>"

See Also:
Constant Field Values

COLONCOLON

public static final int COLONCOLON
"::" symbol

See Also:
Constant Field Values

COLONSTAR

public static final int COLONSTAR
":*" symbol

See Also:
Constant Field Values

INSTANCE_OF

public static final int INSTANCE_OF
operator "instance of"

See Also:
Constant Field Values

CAST_AS

public static final int CAST_AS
operator "cast as"

See Also:
Constant Field Values

TREAT_AS

public static final int TREAT_AS
operator "treat as"

See Also:
Constant Field Values

FEQ

public static final int FEQ
operator "eq"

See Also:
Constant Field Values

FNE

public static final int FNE
operator "ne"

See Also:
Constant Field Values

FGT

public static final int FGT
operator "gt"

See Also:
Constant Field Values

FLT

public static final int FLT
operator "lt"

See Also:
Constant Field Values

FGE

public static final int FGE
operator "ge"

See Also:
Constant Field Values

FLE

public static final int FLE
operator "le"

See Also:
Constant Field Values

IDIV

public static final int IDIV
operator "idiv"

See Also:
Constant Field Values

CASTABLE_AS

public static final int CASTABLE_AS
operator "castable as"

See Also:
Constant Field Values

ASSIGN

public static final int ASSIGN
":=" symbol (XQuery only)

See Also:
Constant Field Values

LCURLY

public static final int LCURLY
"{" symbol (XQuery only)

See Also:
Constant Field Values

KEYWORD_CURLY

public static final int KEYWORD_CURLY
composite token: <keyword "{"> (XQuery only)

See Also:
Constant Field Values

ELEMENT_QNAME

public static final int ELEMENT_QNAME
composite token <'element' QNAME> (XQuery only)

See Also:
Constant Field Values

ATTRIBUTE_QNAME

public static final int ATTRIBUTE_QNAME
composite token <'attribute' QNAME> (XQuery only)

See Also:
Constant Field Values

PI_QNAME

public static final int PI_QNAME
composite token <'pi' QNAME> (XQuery only)

See Also:
Constant Field Values

TYPESWITCH

public static final int TYPESWITCH
Keyword "typeswitch"

See Also:
Constant Field Values

CASE

public static final int CASE
Keyword "case"

See Also:
Constant Field Values

DEFAULT

public static final int DEFAULT
Keyword "default"

See Also:
Constant Field Values

XQUERY_VERSION

public static final int XQUERY_VERSION
"xquery version"

See Also:
Constant Field Values

DECLARE_NAMESPACE

public static final int DECLARE_NAMESPACE
"declare namespace"

See Also:
Constant Field Values

DECLARE_DEFAULT

public static final int DECLARE_DEFAULT
"declare default"

See Also:
Constant Field Values

DECLARE_VALIDATION

public static final int DECLARE_VALIDATION
"declare validation"

See Also:
Constant Field Values

DECLARE_BASEURI

public static final int DECLARE_BASEURI
"declare base-uri"

See Also:
Constant Field Values

DECLARE_XMLSPACE

public static final int DECLARE_XMLSPACE
"declare xmlspace"

See Also:
Constant Field Values

IMPORT_SCHEMA

public static final int IMPORT_SCHEMA
"import schema"

See Also:
Constant Field Values

IMPORT_MODULE

public static final int IMPORT_MODULE
"import module"

See Also:
Constant Field Values

DECLARE_VARIABLE

public static final int DECLARE_VARIABLE
"define variable"

See Also:
Constant Field Values

DECLARE_FUNCTION

public static final int DECLARE_FUNCTION
"define function"

See Also:
Constant Field Values

MODULE_NAMESPACE

public static final int MODULE_NAMESPACE
"module namespace"

See Also:
Constant Field Values

VALIDATE

public static final int VALIDATE
Various compound symbols supporting XQuery validation expression

See Also:
Constant Field Values

VALIDATE_STRICT

public static final int VALIDATE_STRICT
See Also:
Constant Field Values

VALIDATE_LAX

public static final int VALIDATE_LAX
See Also:
Constant Field Values

VALIDATE_SKIP

public static final int VALIDATE_SKIP
See Also:
Constant Field Values

VALIDATE_GLOBAL

public static final int VALIDATE_GLOBAL
See Also:
Constant Field Values

VALIDATE_CONTEXT

public static final int VALIDATE_CONTEXT
See Also:
Constant Field Values

SEMICOLON

public static final int SEMICOLON
semicolon separator

See Also:
Constant Field Values

LAST_OPERATOR

static int LAST_OPERATOR
Constant identifying the token number of the last token to be classified as an operator


NAME

public static final int NAME
Name token (a QName, in general)

See Also:
Constant Field Values

STRING_LITERAL

public static final int STRING_LITERAL
String literal

See Also:
Constant Field Values

RSQB

public static final int RSQB
Right square bracket

See Also:
Constant Field Values

RPAR

public static final int RPAR
Right parenthesis

See Also:
Constant Field Values

DOT

public static final int DOT
"." symbol

See Also:
Constant Field Values

DOTDOT

public static final int DOTDOT
".." symbol

See Also:
Constant Field Values

STAR

public static final int STAR
"*" symbol when used as a wildcard

See Also:
Constant Field Values

PREFIX

public static final int PREFIX
"prefix:*" token

See Also:
Constant Field Values

NUMBER

public static final int NUMBER
Numeric literal

See Also:
Constant Field Values

NODEKIND

public static final int NODEKIND
Node kind, e.g. "node()" or "comment()"

See Also:
Constant Field Values

FOR

public static final int FOR
"for" keyword

See Also:
Constant Field Values

SUFFIX

public static final int SUFFIX
"*:local-name" token

See Also:
Constant Field Values

QMARK

public static final int QMARK
"?" symbol

See Also:
Constant Field Values

TYPETEST

public static final int TYPETEST
"type("

See Also:
Constant Field Values

RCURLY

public static final int RCURLY
"}" symbol (XQuery only)

See Also:
Constant Field Values

LET

public static final int LET
"let" keyword (XQuery only)

See Also:
Constant Field Values

TAG

public static final int TAG
"<" at the start of a tag (XQuery only). The pseudo-XML syntax that follows is read character-by-character by the XQuery parser

See Also:
Constant Field Values

NEGATE

public static final int NEGATE
Unary minus sign

See Also:
Constant Field Values

tokens

public static String[] tokens
The following strings are used to represent tokens in error messages


doubleKeywords

public static HashMap doubleKeywords
Lookup table for composite (two-keyword) tokens


currentToken

public int currentToken
The number identifying the most recently read token


currentTokenValue

public String currentTokenValue
The string value of the most recently read token


currentTokenStartIndex

public int currentTokenStartIndex
The position in the input expression where the current token starts


input

public String input
The string being parsed


inputIndex

public int inputIndex
The current position within the input string

Constructor Detail

Tokenizer

public Tokenizer()
Method Detail

getState

public int getState()

setState

public void setState(int state)

tokenize

public void tokenize(String input,
                     int start,
                     int end)
              throws XPathException
Prepare a string for tokenization. The actual tokens are obtained by calls on next()

Parameters:
input - the string to be tokenized
start - start point within the string
end - end point within the string (last character not read): -1 means end of string
Throws:
XPathException - if a lexical error occurs, e.g. unmatched string quotes

next

public void next()
          throws XPathException
Get the next token from the input expression. The type of token is returned in the currentToken variable, the string value of the token in currentTokenValue.

Throws:
XPathException - if a lexical error is detected

treatCurrentAsOperator

public void treatCurrentAsOperator()
Force the current token to be treated as an operator if possible


lookAhead

public void lookAhead()
               throws XPathException
Look ahead by one token. This method does the real tokenization work. The method is normally called internally, but the XQuery parser also calls it to resume normal tokenization after dealing with pseudo-XML syntax.

Throws:
XPathException - if a lexical error occurs

nextChar

public char nextChar()
              throws StringIndexOutOfBoundsException
Read next character directly. Used by the XQuery parser when parsing pseudo-XML syntax

Returns:
the next character from the input
Throws:
StringIndexOutOfBoundsException - if an attempt is made to read beyond the end of the string. This will only occur in the event of a syntax error in the input.

unreadChar

public void unreadChar()
Step back one character


recentText

public String recentText()
Get the most recently read text (for use in an error message)


getLineNumber

public int getLineNumber()
Get the line number of the current token


getLexicalSubstring

public String getLexicalSubstring(int start,
                                  int end)
Get a slice of the input expression