Tokenizer

/*
 * #%L
 * prolobjectlink-jpi-jtrolog
 * %%
 * Copyright (C) 2012 - 2018 WorkLogic Project
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 *
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package jTrolog.parser;

import jTrolog.errors.InvalidTermException;
import jTrolog.parser.Parser;
import jTrolog.parser.Token;

import java.io.*;
import java.util.LinkedList;
import java.util.Arrays;

/**
 * BNF for jTrolog
 *
 * from the super class, the super.nextToken() returns and updates the following
 * relevant fields: - if the next token is a collection of wordChars, the type
 * returned is TT_WORD and the value is put into the field sval. - if the next
 * token is an ordinary char, the type returned is the same as the unicode int
 * value of the ordinary character - other characters should be handled as
 * ordinary characters.
 */
@SuppressWarnings({ "rawtypes", "unchecked","serial" })
class Tokenizer extends StreamTokenizer implements Serializable {

	static final char[] GRAPHIC_CHARS = { '\\', '$', '&', '?', '^', '@', '#', '.', ',', ':', ';', '=', '<', '>', '+', '-', '*', '/', '~' };
	static {
		Arrays.sort(Tokenizer.GRAPHIC_CHARS); // must be done to ensure correct
												// behavior of
												// Arrays.binarySearch
	}

	// used to enable pushback from the parser. Not in any way connected with
	// pushBack2 and super.pushBack().
	private LinkedList tokenList = new LinkedList();

	// used in the double lookahead check that . following ints is a fraction
	// marker or end marker (pushback() only works on one level)
	private PushBack pushBack2 = null;

	public Tokenizer(String text) {
		this(new StringReader(text));
	}

	/**
	 * creating a tokenizer for the source stream
	 */
	public Tokenizer(Reader text) {
		super(text);

		// Prepare the tokenizer for Prolog-style tokenizing rules
		resetSyntax();

		// letters
		wordChars('a', 'z');
		wordChars('A', 'Z');
		wordChars('_', '_');
		wordChars('0', '9'); // need to parse numbers as special words

		ordinaryChar('!');

		// symbols
		ordinaryChar('\\');
		ordinaryChar('$');
		ordinaryChar('&');
		ordinaryChar('^');
		ordinaryChar('@');
		ordinaryChar('#');
		ordinaryChar(',');
		ordinaryChar('.');
		ordinaryChar(':');
		ordinaryChar(';');
		ordinaryChar('=');
		ordinaryChar('<');
		ordinaryChar('>');
		ordinaryChar('+');
		ordinaryChar('-');
		ordinaryChar('*');
		ordinaryChar('/');
		ordinaryChar('~');

		// quotes
		ordinaryChar('\''); // must be parsed individually to handles \\ in
							// quotes and character code constants
		ordinaryChar('\"'); // same as above?

		// comments
		ordinaryChar('%');
		// it is not possible to enable StreamTokenizer#slashStarComments and %
		// as a StreamTokenizer#commentChar
		// and it is also not possible to use StreamTokenizer#whitespaceChars
		// for ' '
	}

	/**
	 * reads next available token
	 */
	Token readToken() throws InvalidTermException, IOException {
		return !tokenList.isEmpty() ? (Token) tokenList.removeFirst() : readNextToken();
	}

	/**
	 * puts back token to be read again
	 */
	void unreadToken(Token token) {
		tokenList.addFirst(token);
	}

	Token readNextToken() throws IOException, InvalidTermException {
		int typea;
		String svala;
		if (pushBack2 != null) {
			typea = pushBack2.typea;
			svala = pushBack2.svala;
			pushBack2 = null;
		} else {
			typea = super.nextToken();
			svala = sval;
		}

		// skips whitespace
		// could be simplified if lookahead for blank space in functors wasn't
		// necessary
		// and if '.' in numbers could be written with blank space
		while (Tokenizer.isWhite(typea)) {
			typea = super.nextToken();
			svala = sval;
		}

		// skips single line comments
		// could be simplified if % was not a legal character in quotes
		if (typea == '%') {
			do {
				typea = super.nextToken();
			} while (typea != '\r' && typea != '\n' && typea != TT_EOF);
			pushBack(); // pushes back \r or \n. These are whitespace, so when
						// readNextToken() finds them, they are marked as
						// whitespace
			return readNextToken();
		}

		// skips /* comments */
		if (typea == '/') {
			int typeb = super.nextToken();
			if (typeb == '*') {
				do {
					typea = typeb;
					typeb = super.nextToken();
				} while (typea != '*' || typeb != '/');
				return readNextToken();
			} else {
				pushBack();
			}
		}

		// syntactic charachters
		if (typea == TT_EOF)
			return new Token("", Token.EOF);
		if (typea == '(')
			return new Token("(", '(');
		if (typea == ')')
			return new Token(")", ')');
		if (typea == '{')
			return new Token("{", '{');
		if (typea == '}')
			return new Token("}", '}');
		if (typea == '[')
			return new Token("[", '[');
		if (typea == ']')
			return new Token("]", ']');
		if (typea == '|')
			return new Token("|", '|');

		if (typea == '!')
			return new Token("!", Token.ATOM);
		if (typea == ',')
			return new Token(",", Token.OPERATOR);

		// check that '.' as end token is followed by a layout character, see
		// ISO Standard 6.4.8 endnote
		if (typea == '.') {
			int typeb = super.nextToken();
			pushBack();
			if (Tokenizer.isWhite(typeb) || typeb == '%' || typeb == StreamTokenizer.TT_EOF)
				return new Token(".", '.');
		}

		boolean isNumber = false;

		// variable, atom or number
		if (typea == TT_WORD) {
			char firstChar = svala.charAt(0);
			// variable
			if (Character.isUpperCase(firstChar) || '_' == firstChar)
				return new Token(svala, Token.VARIABLE);

			else if (firstChar >= '0' && firstChar <= '9') // all words starting
															// with 0 or 9 must
															// be a number
				isNumber = true; // set type to number and handle later

			else { // otherwise, it must be an atom (or wrong)
				int typeb = super.nextToken(); // lookahead 1 to identify what
												// type of atom
				pushBack(); // this does not skip whitespaces, only readNext
							// does so.
				if (typeb == '(')
					return new Token(svala, Token.ATOM_FUNCTOR);
				if (Tokenizer.isWhite(typeb))
					return new Token(svala, Token.ATOM_OPERATOR);
				return new Token(svala, Token.ATOM);
			}
		}

		// quotes
		if (typea == '\'' || typea == '\"' || typea == '`') {
			int qType = typea;
			StringBuffer quote = new StringBuffer();
			while (true) { // run through entire quote and added body to quote
							// buffer
				typea = super.nextToken();
				svala = sval;
				// double back slash and continuation escape sequence
				if (typea == '\\') {
					int typeb = super.nextToken();
					if (typeb == '\\') { // double back slash '... \\\\ ...'
						quote.append((char) typeb);
						continue;
					}
					if (typeb == '\n') // continuation escape sequence marker
										// \\n
						continue;
					if (typeb == '\r') {
						int typec = super.nextToken();
						if (typec == '\n')
							continue; // continuation escape sequence marker
										// \\r\n
						pushBack();
						continue; // continuation escape sequence marker \\r
					}
					pushBack(); // pushback typeb
				}
				// double '' or "" or ``
				if (typea == qType) {
					int typeb = super.nextToken();
					if (typeb == qType) { // escaped '' or "" or ``
						quote.append((char) qType);
						continue;
					} else {
						pushBack();
						break; // otherwise, break on single quote
					}
				}
				if (typea == '\n' || typea == '\r')
					throw new InvalidTermException("line break in quote not allowed (unless they are escaped \\ first)");

				if (svala != null)
					quote.append(svala);
				else
					quote.append((char) typea);
			}

			String quoteBody = quote.toString();

			qType = qType == '\'' ? Token.SQ_SEQUENCE : qType == '\"' ? Token.DQ_SEQUENCE : Token.SQ_SEQUENCE;
			if (qType == Token.SQ_SEQUENCE) {
				if (Parser.isAtom(quoteBody))
					qType = Token.ATOM;
				int typeb = super.nextToken(); // lookahead 1 to identify what
												// type of quote
				pushBack(); // nextToken() does not skip whitespaces, only
							// readNext does so.
				if (typeb == '(')
					return new Token(quoteBody, Token.SQ_FUNCTOR);
			}
			return new Token(quoteBody, qType);
		}

		// symbols
		if (Arrays.binarySearch(Tokenizer.GRAPHIC_CHARS, (char) typea) >= 0) {

			// the symbols are parsed individually by the super.nextToken(), so
			// accumulate symbollist
			StringBuffer symbols = new StringBuffer();
			int typeb = typea;
			// String svalb = null;
			while (Arrays.binarySearch(Tokenizer.GRAPHIC_CHARS, (char) typeb) >= 0) {
				symbols.append((char) typeb);
				typeb = super.nextToken();
				// svalb = sval;
			}
			pushBack();

			// special symbols: unary + and unary -
			// try {
			// if (symbols.length() == 1 && typeb == TT_WORD &&
			// java.lang.Long.parseLong(svalb) > 0) {
			// if (typea == '+') //todo, issue of handling + and -. I don't
			// think this is ISO..
			// return readNextToken(); //skips + and returns the next number
			// if (typea == '-') {
			// Token t = readNextToken(); //read the next number
			// t.seq = "-" + t.seq; //add minus to value
			// return t; //return token
			// }
			// } //ps. the reason why the number isn't returned right away, but
			// through nextToken(), is because the number might be for instance
			// a float
			// } catch (NumberFormatException e) {
			// }
			if (typeb == '(') // applies to . as well
				return new Token(symbols.toString(), Token.OPERATOR_FUNCTOR);
			return new Token(symbols.toString(), Token.OPERATOR);
		}

		// numbers: 1. integer, 2. float
		if (isNumber) {
			try { // the various parseInt checks will throw exceptions when
					// parts of numbers are written illegally

				// 1.a. complex integers
				if (svala.startsWith("0")) {
					if (svala.indexOf('b') == 1)
						return new Token("" + java.lang.Long.parseLong(svala.substring(2), 2), Token.INTEGER); // try
																												// binary
					if (svala.indexOf('o') == 1)
						return new Token("" + java.lang.Long.parseLong(svala.substring(2), 8), Token.INTEGER); // try
																												// octal
					if (svala.indexOf('x') == 1)
						return new Token("" + java.lang.Long.parseLong(svala.substring(2), 16), Token.INTEGER); // try
																												// hex
				}

				// lookahead 1
				int typeb = super.nextToken();
				String svalb = sval;

				// 1.b ordinary integers
				if (typeb != '.' && typeb != '\'') { // i.e. not float or
														// character constant
					pushBack(); // lookahead 0
					return new Token("" + java.lang.Long.parseLong(svala), Token.INTEGER);
				}

				// 1.c character code constant
				if (typeb == '\'' && "0".equals(svala)) {
					int typec = super.nextToken(); // lookahead 2
					String svalc = sval;
					int intVal;
					if ((intVal = isCharacterCodeConstantToken(typec, svalc)) != -1)
						return new Token("" + intVal, Token.INTEGER);

					// this is an invalid character code constant int
					throw new InvalidTermException("Character code constant starting with 0'<X> at line: " + super.lineno() + " cannot be recognized.");
				}

				// 2.a check that the value of the word prior to period is a
				// valid long
				java.lang.Long.parseLong(svala); // throws an exception if not

				// 2.b first int is followed by a period
				if (typeb != '.')
					throw new InvalidTermException("A number starting with 0-9 cannot be rcognized as an int and does not have a fraction '.' at line: " + super.lineno());

				// lookahead 2
				int typec = super.nextToken();
				String svalc = sval;

				// 2.c check that the next token after '.' is a possible
				// fraction
				if (typec != TT_WORD) { // if its not, the period is an End
										// period
					pushBack(); // pushback 1 the token after period
					pushBack2 = new PushBack(typeb, svalb); // pushback 2 the
															// period token
					return new Token(svala, Token.INTEGER); // return what must
															// be an int
				}

				// 2.d checking for exponent
				int exponent = svalc.indexOf("E");
				if (exponent == -1)
					exponent = svalc.indexOf("e");

				if (exponent >= 1) { // the float must have a valid exponent
					if (exponent == svalc.length() - 1) { // the exponent must
															// be signed
															// exponent
						int typeb2 = super.nextToken();
						if (typeb2 == '+' || typeb2 == '-') {
							int typec2 = super.nextToken();
							String svalc2 = sval;
							if (typec2 == TT_WORD) {
								// verify the remaining parts of the float and
								// return
								java.lang.Long.parseLong(svalc.substring(0, exponent));
								java.lang.Integer.parseInt(svalc2);
								return new Token(svala + "." + svalc + (char) typeb2 + svalc2, Token.FLOAT);
							}
						}
					}
				}
				// 2.e verify lastly that ordinary floats and unsigned exponent
				// floats are Java legal and return them
				java.lang.Double.parseDouble(svala + "." + svalc);
				return new Token(svala + "." + svalc, Token.FLOAT);

			} catch (NumberFormatException e) {
				// TODO return more info on what was wrong with the number given
				throw new InvalidTermException("A term starting with 0-9 cannot be parsed as a number at line: " + lineno());
			}
		}
		throw new InvalidTermException("Unknown Unicode character: " + typea + "  (" + svala + ")");
	}

	/**
	 *
	 *
	 * @param typec
	 * @param svalc
	 * @return the intValue of the next character token, -1 if invalid todo
	 *         needs a lookahead if typec is \
	 */
	private static int isCharacterCodeConstantToken(int typec, String svalc) {
		if (svalc != null) {
			if (svalc.length() == 1)
				return (int) svalc.charAt(0);
			if (svalc.length() > 1) {
				// TODO the following charachters is not implemented:
				// * 1 meta escape sequence (* 6.4.2.1 *) todo
				// * 1 control escape sequence (* 6.4.2.1 *)
				// * 1 octal escape sequence (* 6.4.2.1 *)
				// * 1 hexadecimal escape sequence (* 6.4.2.1 *)
				return -1;
			}
		}
		if (typec == ' ' || // space char (* 6.5.4 *)
				typec == '(' || typec == ')' || typec == '{' || typec == '}' || typec == '[' || typec == ']' || Arrays.binarySearch(GRAPHIC_CHARS, (char) typec) >= 0) // graphic
																																										// char
																																										// (*
																																										// 6.5.1
																																										// *)
		// TODO solo char (* 6.5.3 *)
			return typec;

		return -1;
	}

	private static boolean isWhite(int type) {
		return type == ' ' || type == '\r' || type == '\n' || type == '\t' || type == '\f';
	}

	/**
	 * used to implement lookahead for two tokens, super.pushBack() only handles
	 * one pushBack..
	 */
	private static class PushBack {
		int typea;
		String svala;

		public PushBack(int i, String s) {
			typea = i;
			svala = s;
		}
	}
}