Parser.java

/*
 * #%L
 * prolobjectlink-jpi-jtrolog
 * %%
 * Copyright (C) 2012 - 2018 WorkLogic Project
 * %%
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU Lesser General Public License as
 * published by the Free Software Foundation, either version 2.1 of the
 * License, or (at your option) any later version.
 * 
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Lesser Public License for more details.
 * 
 * You should have received a copy of the GNU General Lesser Public
 * License along with this program.  If not, see
 * <http://www.gnu.org/licenses/lgpl-2.1.html>.
 * #L%
 */
package jTrolog.parser;

import jTrolog.engine.Prolog;
import jTrolog.errors.InvalidTermException;
import jTrolog.terms.Double;
import jTrolog.terms.Int;
import jTrolog.terms.Number;
import jTrolog.terms.Struct;
import jTrolog.terms.StructAtom;
import jTrolog.terms.Term;
import jTrolog.terms.Var;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Pattern;

/**
 * This class defines a parser of prolog terms and sentences.
 * BNF for jTrolog part 2: Parser term ::= expr(1200) expr(n) ::= exprC(n-1) {
 * op(xfx,n) expr(n-1) | op(xfy,n) expr(n) | op(xf,n) | op(yfx,n) expr(n-1) |
 * op(yf,n) }* // exprC is called parseLeftSide in the code exprC(n) ::= '-'
 * integer | '-' float | op( fx,n ) exprA(n-1) | op( fy,n ) exprA(n) | exprA(n)
 * exprA(0) ::= integer | float | atom | variable | atom'(' expr(1200) { ','
 * expr(1200) }* ')' | '[' [ expr(1200) { ',' expr(1200) }* [ '|' expr(1200) ] ]
 * ']' | '(' { expr(1200) }* ')' '{' { expr(1200) }* '}' op(type,n) ::= atom | {
 * symbol }+
 */
@SuppressWarnings({ "rawtypes", "unchecked","serial" })
public class Parser implements Serializable {
	int dqFlag = 0;
	private static final int DQ_ATOMS = 0;
	private static final int DQ_CHARS = 1;
	private static final int DQ_CODES = 2;

	public static final String floatSignature = "float/1".intern();
	public static final String listSignature = "'.'/2".intern();
	public static final String commaSignature = "','/2".intern();
	public static final String cutSignature = "!/0".intern();
	public static final String singleClauseSignature = "':-'/1".intern();
	public static final String doubleClauseSignature = "':-'/2".intern();
	public static final String semiColonSignature = "';'/2".intern();
	public static final String ifSignature = "'->'/2".intern();
	public static final String callSignature = "call/1".intern();
	public static final String throwSignature = "throw/1".intern();
	public static final String catchSignature = "catch/3".intern();
	public static final String trueSignature = "true/0".intern();
	public static final String failSignature = "fail/0".intern();

	private Tokenizer tokenizer;
	private Prolog engine;
	private List variableList;

	/**
	 * creating a Parser specifing how to handle operators and what text to
	 * parse
	 */
	public Parser(InputStream theoryText, Prolog p) {
		this(p, new Tokenizer(new BufferedReader(new InputStreamReader(theoryText))));
	}

	/**
	 * creating a Parser specifing how to handle operators and what text to
	 * parse
	 */
	public Parser(String theoryText, Prolog engine) {
		this(engine, new Tokenizer(theoryText));
	}

	/**
	 * creating a parser with default operator interpretation
	 */
	public Parser(String theoryText) {
		this(null, new Tokenizer(theoryText));
	}

	/**
	 * creating a parser with default operator interpretation
	 */
	public Parser(InputStream theoryText) {
		this(null, new Tokenizer(new BufferedReader(new InputStreamReader(theoryText))));
	}

	private Parser(Prolog p, Tokenizer lexer) {
		tokenizer = lexer;
		variableList = new ArrayList();
		if (p == null) {
			engine = Prolog.defaultMachine;
		} else {
			engine = p;
			Term dqFlag = p.getFlagValue("double_quotes");
			if (dqFlag != null) {
				if ("chars".equals(dqFlag.toString()))
					this.dqFlag = DQ_CHARS;
				else if ("codes".equals(dqFlag.toString()))
					this.dqFlag = DQ_CODES;
				else
					this.dqFlag = DQ_ATOMS;
			}
		}
	}

	// user interface

	public Iterator iterator() throws InvalidTermException {
		return new TermIterator(this);
	}

	/**
	 * Parses next term from the stream built on string.
	 * 
	 * @param endNeeded
	 *            <tt>true</tt> if it is required to parse the end token (a
	 *            period), <tt>false</tt> otherwise.
	 * @throws InvalidTermException
	 *             if a syntax error is found.
	 */
	public Term nextTerm(boolean endNeeded) throws InvalidTermException {
		try {
			variableList.clear();
			Token t = tokenizer.readToken();
			if (t.isEOF())
				return null;

			tokenizer.unreadToken(t);
			Term term = expr(Prolog.OP_HIGH, false);
			if (term == null)
				throw new InvalidTermException("The parser is unable to finish.");

			if (endNeeded && !tokenizer.readToken().isType('.'))
				throw new InvalidTermException("The term " + term + " is not ended with a period.");
			return term;
		} catch (IOException ex) {
			throw new InvalidTermException("An I/O error occured.");
		}
	}

	// internal parsing procedures

	private Term expr(int maxPriority, boolean commaIsEndMarker) throws InvalidTermException, IOException {

		// 1. op(fx,n) expr(n-1) | op(fy,n) exprA(n) | expr0
		Term leftRes = parseLeftSide(commaIsEndMarker, maxPriority);
		// todo should minPriority come from parseLeftSide??
		int minPriority = 0;

		// 2.left is followed by either xfx, xfy or xf operators, parse these
		Token operator = tokenizer.readToken();
		for (; operator.isOperator(commaIsEndMarker); operator = tokenizer.readToken()) {
			int XFX = engine.getOperatorPriority(operator.seq, Prolog.XFX);
			int XFY = engine.getOperatorPriority(operator.seq, Prolog.XFY);
			int XF = engine.getOperatorPriority(operator.seq, Prolog.XF);
			int YFX = engine.getOperatorPriority(operator.seq, Prolog.YFX);
			int YF = engine.getOperatorPriority(operator.seq, Prolog.YF);

			// check that no operator has a priority higher than permitted
			// or a lower priority than the left side expression
			if (XFX > maxPriority || XFX < Prolog.OP_LOW)
				XFX = -1;
			if (XFY > maxPriority || XFY < Prolog.OP_LOW)
				XFY = -1;
			if (XF > maxPriority || XF < Prolog.OP_LOW)
				XF = -1;
			if (YF < minPriority || YF > maxPriority)
				YF = -1;
			if (YFX < minPriority || YFX > maxPriority)
				YFX = -1;

			// XFX
			if (XFX >= XFY && XFX >= XF && XFX >= minPriority) { // XFX has
																	// priority
				Term found = expr(XFX - 1, commaIsEndMarker);
				if (found != null) {
					minPriority = XFX;
					leftRes = new Struct(operator.seq, new Term[] { leftRes, found }, Prolog.XFX);
					continue;
				}
			}
			// XFY
			else if (XFY >= XF && XFY >= minPriority) { // XFY has priority, or
														// XFX has failed
				Term found = expr(XFY, commaIsEndMarker);
				if (found != null) {
					minPriority = XFY;
					leftRes = new Struct(operator.seq, new Term[] { leftRes, found }, Prolog.XFY);
					continue;
				}
			}
			// XF
			else if (XF >= minPriority) // XF has priority, or XFX and/or XFY
										// has failed
				return new Struct(operator.seq, new Term[] { leftRes }, Prolog.XF);

			// XFX did not have top priority, but XFY failed
			else if (XFX >= minPriority) {
				Term found = expr(XFX - 1, commaIsEndMarker);
				if (found != null) {
					minPriority = XFX;
					leftRes = new Struct(operator.seq, new Term[] { leftRes, found }, Prolog.XFX);
					continue;
				}
			}
			// YFX
			else if (YFX >= YF && YFX >= Prolog.OP_LOW) {
				Term found = expr(YFX - 1, commaIsEndMarker);
				if (found != null) {
					minPriority = YFX;
					leftRes = new Struct(operator.seq, new Term[] { leftRes, found }, Prolog.YFX);
					continue;
				}
			}
			// either YF has priority over YFX or YFX failed
			else if (YF >= Prolog.OP_LOW) {
				minPriority = YF;
				leftRes = new Struct(operator.seq, new Term[] { leftRes }, Prolog.YF);
				continue;
			}
			break;
		}
		tokenizer.unreadToken(operator);
		return leftRes;
	}

	/**
	 * Parses and returns a valid 'leftside' of an expression. If the left side
	 * starts with a prefix, it consumes other expressions with a lower priority
	 * than itself. If the left side does not have a prefix it must be an expr0.
	 * 
	 * @param commaIsEndMarker
	 *            used when the leftside is part of and argument list of
	 *            expressions
	 * @param maxPriority
	 *            operators with a higher priority than this will effectivly end
	 *            the expression
	 * @return a wrapper of: 1. term correctly structured and 2. the priority of
	 *         its root operator
	 * @throws InvalidTermException
	 */
	private Term parseLeftSide(boolean commaIsEndMarker, int maxPriority) throws InvalidTermException, IOException {
		// 1. prefix expression
		Token f = tokenizer.readToken();
		if (f.isOperator(commaIsEndMarker) && !f.isFunctor()) {
			int FX = engine.getOperatorPriority(f.seq, Prolog.FX);
			int FY = engine.getOperatorPriority(f.seq, Prolog.FY);

			if (f.seq.equals("-")) {
				Token t = tokenizer.readToken();
				if (t.isNumber())
					return jTrolog.terms.Number.create("-" + t.seq);
				else
					tokenizer.unreadToken(t);
			}

			// check that no operator has a priority higher than permitted
			if (FY > maxPriority)
				FY = -1;
			if (FX > maxPriority)
				FX = -1;

			// FX has priority over FY
			if (FX >= FY && FX >= Prolog.OP_LOW) {
				Term found = expr(FX - 1, commaIsEndMarker); // op(fx, n)
																// exprA(n - 1)
				if (found != null)
					return new Struct(f.seq, new Term[] { found }, Prolog.FX);
			}
			// FY has priority over FX, or FX has failed
			else if (FY >= Prolog.OP_LOW) {
				Term found = expr(FY, commaIsEndMarker); // op(fy,n) exprA(1200)
															// or op(fy,n)
															// expr(n)
				if (found != null)
					return new Struct(f.seq, new Term[] { found }, Prolog.FY);
			}
			// FY has priority over FX, but FY failed
			else if (FX >= Prolog.OP_LOW) {
				Term found = expr(FX - 1, commaIsEndMarker); // op(fx, n)
																// exprA(n - 1)
				if (found != null)
					return new Struct(f.seq, new Term[] { found }, Prolog.FX);
			}
		}
		tokenizer.unreadToken(f);
		// 2. expr0
		return expr0();
	}

	/**
	 * exprA(0) ::= integer | float | variable | atom | atom( expr(1200) { ,
	 * exprA(1200) }* ) | '[' expr(1200) { , expr(1200) }* [ | exprA(1200) ] ']'
	 * | '{' [ exprA(1200) ] '}' | '(' exprA(1200) ')'
	 */
	private Term expr0() throws InvalidTermException, IOException {
		Token t1 = tokenizer.readToken();

		if (t1.isType(Token.INTEGER))
			return Int.create(t1.seq);

		if (t1.isType(Token.FLOAT))
			return new Double(t1.seq);

		if (t1.isType(Token.VARIABLE)) {
			int pos = variableList.indexOf(t1.seq);
			if (pos != -1 && t1.seq != Var.ANY)
				return new Var(t1.seq, pos + 1);
			variableList.add(t1.seq);
			return new Var(t1.seq, variableList.size());
		}

		if (t1.isFunctor()) {
			String functor = t1.seq;
			Token t = tokenizer.readToken(); // reading left par
			LinkedList l = new LinkedList();
			do {
				l.add(expr(Prolog.OP_HIGH, true)); // adding argument
				t = tokenizer.readToken();
				if (")".equals(t.seq)) // if right par, return
					return new Struct(functor, (Term[]) l.toArray(new Term[0]));
			} while (",".equals(t.seq)); // if comma, read next arg
			throw new InvalidTermException("Error in argument list syntax.\n" + "Token: " + t + " not expected at line " + tokenizer.lineno() + ".");
		}

		if (t1.isType(Token.DQ_SEQUENCE)) {
			if (dqFlag == Parser.DQ_ATOMS) // DQ as atoms
				return new StructAtom(t1.seq);
			if (dqFlag == Parser.DQ_CHARS) // DQ as char list
				return stringToStructList(t1.seq);
			if (dqFlag == Parser.DQ_CODES) { // DQ as int list
				char[] chars = t1.seq.toCharArray();
				int[] codes = new int[chars.length];
				for (int i = 0; i < chars.length; i++)
					codes[i] = chars[i];
				return intsToStructList(codes);
			}
		}

		if (t1.isAtom())
			return new StructAtom(t1.seq);

		// todo review handling of ( ... ). Error in test set and it should have
		// consequences for how toString in Struct performs for operators that
		// have strange priorities
		if (t1.isType('(')) {
			Term term = expr(Prolog.OP_HIGH, false);
			if (tokenizer.readToken().isType(')'))
				return term;
			throw new InvalidTermException("Missing right parenthesis: (" + term + " -> here <-");
		}

		if (t1.isType('[')) {
			Token t2 = tokenizer.readToken();
			if (t2.isType(']'))
				return Term.emptyList;
			tokenizer.unreadToken(t2);

			LinkedList elems = new LinkedList();
			do {
				elems.add(expr(Prolog.OP_HIGH, true));
				t2 = tokenizer.readToken();
				if ("|".equals(t2.seq)) {
					elems.add(expr(Prolog.OP_HIGH, true));
					t2 = tokenizer.readToken();
					if ("]".equals(t2.seq))
						return createStructList(elems);
					throw new InvalidTermException("Missing ']' after: " + elems.getLast());
				}
				if ("]".equals(t2.seq)) {
					elems.add(Term.emptyList);
					return createStructList(elems);
				}
			} while (",".equals(t2.seq));
			throw new InvalidTermException("Error in list syntax after: " + elems.getLast());
		}

		if (t1.isType('{')) {
			Token t2 = tokenizer.readToken();
			if (t2.isType('}'))
				return new StructAtom("{}");

			tokenizer.unreadToken(t2);
			Term arg = expr(Prolog.OP_HIGH, false);
			t2 = tokenizer.readToken();
			if (t2.isType('}'))
				return new Struct("{}", new Term[] { arg });
			throw new InvalidTermException("Missing right braces: {" + arg + " -> here <-");
		}

		throw new InvalidTermException("The following token could not be identified: " + t1.seq);
	}

	public int getCurrentLine() {
		return tokenizer.lineno();
	}

	public static Struct createListContainingAnyVars(int lengthInt) {
		LinkedList vars = new LinkedList();
		for (int i = 0; i < lengthInt; i++)
			vars.add(new Var("_", i + 1));
		vars.add(Term.emptyList);
		return createStructList(vars);
	}

	public static Struct createStructList(LinkedList complete) {
		if (complete.isEmpty())
			return Term.emptyList;
		if (complete.size() == 2)
			return new Struct(".", new Term[] { (Term) complete.getFirst(), (Term) complete.getLast() });
		if (complete.size() > 2) {
			Term head = (Term) complete.removeFirst();
			return new Struct(".", new Term[] { head, createStructList(complete) });
		}
		throw new RuntimeException("omg-..");
	}

	public static Struct stringToStructList(String charList) {
		Struct t = StructAtom.emptyList;
		for (int i = charList.length() - 1; i >= 0; i--)
			t = new Struct(".", new Term[] { new StructAtom(Character.toString(charList.charAt(i))), t });
		return t;
	}

	public static Struct intsToStructList(int[] numbers) {
		Struct t = StructAtom.emptyList;
		for (int i = numbers.length - 1; i >= 0; i--)
			t = new Struct(".", new Term[] { new Int(numbers[i]), t });
		return t;
	}

	public static boolean isSemiAndNotIf(Struct struct) {
		if (struct == null || struct.predicateIndicator != semiColonSignature)
			return false;
		final Term left = struct.getArg(0);
		return !(left instanceof Struct) || ((Struct) left).predicateIndicator != ifSignature;
	}

	/**
	 * @param atom
	 * @return atom wrapped in 'atom' if necessary
	 */
	public static String wrapAtom(final String atom) {
		return isAtom(atom) || (atom.startsWith("'") && atom.endsWith("'")) ? atom : "'" + atom + "'";
	}

	/**
	 * @return true if the String could be a prolog atom
	 */
	public static boolean isAtom(String s) {
		return atom.matcher(s).matches();
	}

	static private Pattern atom = Pattern.compile("(!|[a-z][a-zA-Z_0-9]*)");

	public static Number parseNumber(String s) throws InvalidTermException {
		Term t = new Parser(s).nextTerm(false);
		if (t instanceof Number)
			return (Number) t;
		throw new InvalidTermException("Term " + t + " is not a number.");
	}

	public static String removeApices(String st) {
		if (st.startsWith("'") && st.endsWith("'") && st.length() > 2)
			return st.substring(1, st.length() - 1);
		return st;
	}
}