View Javadoc

1   /*
2    * #%L
3    * prolobjectlink-jpi-jtrolog
4    * %%
5    * Copyright (C) 2012 - 2018 WorkLogic Project
6    * %%
7    * This program is free software: you can redistribute it and/or modify
8    * it under the terms of the GNU Lesser General Public License as
9    * published by the Free Software Foundation, either version 2.1 of the
10   * License, or (at your option) any later version.
11   * 
12   * This program is distributed in the hope that it will be useful,
13   * but WITHOUT ANY WARRANTY; without even the implied warranty of
14   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15   * GNU General Lesser Public License for more details.
16   * 
17   * You should have received a copy of the GNU General Lesser Public
18   * License along with this program.  If not, see
19   * <http://www.gnu.org/licenses/lgpl-2.1.html>.
20   * #L%
21   */
22  package jTrolog.parser;
23  
24  import jTrolog.errors.InvalidTermException;
25  import jTrolog.parser.Parser;
26  import jTrolog.parser.Token;
27  
28  import java.io.*;
29  import java.util.LinkedList;
30  import java.util.Arrays;
31  
32  /**
33   * BNF for jTrolog
34   * 
35   * from the super class, the super.nextToken() returns and updates the following
36   * relevant fields: - if the next token is a collection of wordChars, the type
37   * returned is TT_WORD and the value is put into the field sval. - if the next
38   * token is an ordinary char, the type returned is the same as the unicode int
39   * value of the ordinary character - other characters should be handled as
40   * ordinary characters.
41   */
42  @SuppressWarnings({ "rawtypes", "unchecked","serial" })
43  class Tokenizer extends StreamTokenizer implements Serializable {
44  
45  	static final char[] GRAPHIC_CHARS = { '\\', '$', '&', '?', '^', '@', '#', '.', ',', ':', ';', '=', '<', '>', '+', '-', '*', '/', '~' };
46  	static {
47  		Arrays.sort(Tokenizer.GRAPHIC_CHARS); // must be done to ensure correct
48  												// behavior of
49  												// Arrays.binarySearch
50  	}
51  
52  	// used to enable pushback from the parser. Not in any way connected with
53  	// pushBack2 and super.pushBack().
54  	private LinkedList tokenList = new LinkedList();
55  
56  	// used in the double lookahead check that . following ints is a fraction
57  	// marker or end marker (pushback() only works on one level)
58  	private PushBack pushBack2 = null;
59  
60  	public Tokenizer(String text) {
61  		this(new StringReader(text));
62  	}
63  
64  	/**
65  	 * creating a tokenizer for the source stream
66  	 */
67  	public Tokenizer(Reader text) {
68  		super(text);
69  
70  		// Prepare the tokenizer for Prolog-style tokenizing rules
71  		resetSyntax();
72  
73  		// letters
74  		wordChars('a', 'z');
75  		wordChars('A', 'Z');
76  		wordChars('_', '_');
77  		wordChars('0', '9'); // need to parse numbers as special words
78  
79  		ordinaryChar('!');
80  
81  		// symbols
82  		ordinaryChar('\\');
83  		ordinaryChar('$');
84  		ordinaryChar('&');
85  		ordinaryChar('^');
86  		ordinaryChar('@');
87  		ordinaryChar('#');
88  		ordinaryChar(',');
89  		ordinaryChar('.');
90  		ordinaryChar(':');
91  		ordinaryChar(';');
92  		ordinaryChar('=');
93  		ordinaryChar('<');
94  		ordinaryChar('>');
95  		ordinaryChar('+');
96  		ordinaryChar('-');
97  		ordinaryChar('*');
98  		ordinaryChar('/');
99  		ordinaryChar('~');
100 
101 		// quotes
102 		ordinaryChar('\''); // must be parsed individually to handles \\ in
103 							// quotes and character code constants
104 		ordinaryChar('\"'); // same as above?
105 
106 		// comments
107 		ordinaryChar('%');
108 		// it is not possible to enable StreamTokenizer#slashStarComments and %
109 		// as a StreamTokenizer#commentChar
110 		// and it is also not possible to use StreamTokenizer#whitespaceChars
111 		// for ' '
112 	}
113 
114 	/**
115 	 * reads next available token
116 	 */
117 	Token readToken() throws InvalidTermException, IOException {
118 		return !tokenList.isEmpty() ? (Token) tokenList.removeFirst() : readNextToken();
119 	}
120 
121 	/**
122 	 * puts back token to be read again
123 	 */
124 	void unreadToken(Token token) {
125 		tokenList.addFirst(token);
126 	}
127 
128 	Token readNextToken() throws IOException, InvalidTermException {
129 		int typea;
130 		String svala;
131 		if (pushBack2 != null) {
132 			typea = pushBack2.typea;
133 			svala = pushBack2.svala;
134 			pushBack2 = null;
135 		} else {
136 			typea = super.nextToken();
137 			svala = sval;
138 		}
139 
140 		// skips whitespace
141 		// could be simplified if lookahead for blank space in functors wasn't
142 		// necessary
143 		// and if '.' in numbers could be written with blank space
144 		while (Tokenizer.isWhite(typea)) {
145 			typea = super.nextToken();
146 			svala = sval;
147 		}
148 
149 		// skips single line comments
150 		// could be simplified if % was not a legal character in quotes
151 		if (typea == '%') {
152 			do {
153 				typea = super.nextToken();
154 			} while (typea != '\r' && typea != '\n' && typea != TT_EOF);
155 			pushBack(); // pushes back \r or \n. These are whitespace, so when
156 						// readNextToken() finds them, they are marked as
157 						// whitespace
158 			return readNextToken();
159 		}
160 
161 		// skips /* comments */
162 		if (typea == '/') {
163 			int typeb = super.nextToken();
164 			if (typeb == '*') {
165 				do {
166 					typea = typeb;
167 					typeb = super.nextToken();
168 				} while (typea != '*' || typeb != '/');
169 				return readNextToken();
170 			} else {
171 				pushBack();
172 			}
173 		}
174 
175 		// syntactic charachters
176 		if (typea == TT_EOF)
177 			return new Token("", Token.EOF);
178 		if (typea == '(')
179 			return new Token("(", '(');
180 		if (typea == ')')
181 			return new Token(")", ')');
182 		if (typea == '{')
183 			return new Token("{", '{');
184 		if (typea == '}')
185 			return new Token("}", '}');
186 		if (typea == '[')
187 			return new Token("[", '[');
188 		if (typea == ']')
189 			return new Token("]", ']');
190 		if (typea == '|')
191 			return new Token("|", '|');
192 
193 		if (typea == '!')
194 			return new Token("!", Token.ATOM);
195 		if (typea == ',')
196 			return new Token(",", Token.OPERATOR);
197 
198 		// check that '.' as end token is followed by a layout character, see
199 		// ISO Standard 6.4.8 endnote
200 		if (typea == '.') {
201 			int typeb = super.nextToken();
202 			pushBack();
203 			if (Tokenizer.isWhite(typeb) || typeb == '%' || typeb == StreamTokenizer.TT_EOF)
204 				return new Token(".", '.');
205 		}
206 
207 		boolean isNumber = false;
208 
209 		// variable, atom or number
210 		if (typea == TT_WORD) {
211 			char firstChar = svala.charAt(0);
212 			// variable
213 			if (Character.isUpperCase(firstChar) || '_' == firstChar)
214 				return new Token(svala, Token.VARIABLE);
215 
216 			else if (firstChar >= '0' && firstChar <= '9') // all words starting
217 															// with 0 or 9 must
218 															// be a number
219 				isNumber = true; // set type to number and handle later
220 
221 			else { // otherwise, it must be an atom (or wrong)
222 				int typeb = super.nextToken(); // lookahead 1 to identify what
223 												// type of atom
224 				pushBack(); // this does not skip whitespaces, only readNext
225 							// does so.
226 				if (typeb == '(')
227 					return new Token(svala, Token.ATOM_FUNCTOR);
228 				if (Tokenizer.isWhite(typeb))
229 					return new Token(svala, Token.ATOM_OPERATOR);
230 				return new Token(svala, Token.ATOM);
231 			}
232 		}
233 
234 		// quotes
235 		if (typea == '\'' || typea == '\"' || typea == '`') {
236 			int qType = typea;
237 			StringBuffer quote = new StringBuffer();
238 			while (true) { // run through entire quote and added body to quote
239 							// buffer
240 				typea = super.nextToken();
241 				svala = sval;
242 				// double back slash and continuation escape sequence
243 				if (typea == '\\') {
244 					int typeb = super.nextToken();
245 					if (typeb == '\\') { // double back slash '... \\\\ ...'
246 						quote.append((char) typeb);
247 						continue;
248 					}
249 					if (typeb == '\n') // continuation escape sequence marker
250 										// \\n
251 						continue;
252 					if (typeb == '\r') {
253 						int typec = super.nextToken();
254 						if (typec == '\n')
255 							continue; // continuation escape sequence marker
256 										// \\r\n
257 						pushBack();
258 						continue; // continuation escape sequence marker \\r
259 					}
260 					pushBack(); // pushback typeb
261 				}
262 				// double '' or "" or ``
263 				if (typea == qType) {
264 					int typeb = super.nextToken();
265 					if (typeb == qType) { // escaped '' or "" or ``
266 						quote.append((char) qType);
267 						continue;
268 					} else {
269 						pushBack();
270 						break; // otherwise, break on single quote
271 					}
272 				}
273 				if (typea == '\n' || typea == '\r')
274 					throw new InvalidTermException("line break in quote not allowed (unless they are escaped \\ first)");
275 
276 				if (svala != null)
277 					quote.append(svala);
278 				else
279 					quote.append((char) typea);
280 			}
281 
282 			String quoteBody = quote.toString();
283 
284 			qType = qType == '\'' ? Token.SQ_SEQUENCE : qType == '\"' ? Token.DQ_SEQUENCE : Token.SQ_SEQUENCE;
285 			if (qType == Token.SQ_SEQUENCE) {
286 				if (Parser.isAtom(quoteBody))
287 					qType = Token.ATOM;
288 				int typeb = super.nextToken(); // lookahead 1 to identify what
289 												// type of quote
290 				pushBack(); // nextToken() does not skip whitespaces, only
291 							// readNext does so.
292 				if (typeb == '(')
293 					return new Token(quoteBody, Token.SQ_FUNCTOR);
294 			}
295 			return new Token(quoteBody, qType);
296 		}
297 
298 		// symbols
299 		if (Arrays.binarySearch(Tokenizer.GRAPHIC_CHARS, (char) typea) >= 0) {
300 
301 			// the symbols are parsed individually by the super.nextToken(), so
302 			// accumulate symbollist
303 			StringBuffer symbols = new StringBuffer();
304 			int typeb = typea;
305 			// String svalb = null;
306 			while (Arrays.binarySearch(Tokenizer.GRAPHIC_CHARS, (char) typeb) >= 0) {
307 				symbols.append((char) typeb);
308 				typeb = super.nextToken();
309 				// svalb = sval;
310 			}
311 			pushBack();
312 
313 			// special symbols: unary + and unary -
314 			// try {
315 			// if (symbols.length() == 1 && typeb == TT_WORD &&
316 			// java.lang.Long.parseLong(svalb) > 0) {
317 			// if (typea == '+') //todo, issue of handling + and -. I don't
318 			// think this is ISO..
319 			// return readNextToken(); //skips + and returns the next number
320 			// if (typea == '-') {
321 			// Token t = readNextToken(); //read the next number
322 			// t.seq = "-" + t.seq; //add minus to value
323 			// return t; //return token
324 			// }
325 			// } //ps. the reason why the number isn't returned right away, but
326 			// through nextToken(), is because the number might be for instance
327 			// a float
328 			// } catch (NumberFormatException e) {
329 			// }
330 			if (typeb == '(') // applies to . as well
331 				return new Token(symbols.toString(), Token.OPERATOR_FUNCTOR);
332 			return new Token(symbols.toString(), Token.OPERATOR);
333 		}
334 
335 		// numbers: 1. integer, 2. float
336 		if (isNumber) {
337 			try { // the various parseInt checks will throw exceptions when
338 					// parts of numbers are written illegally
339 
340 				// 1.a. complex integers
341 				if (svala.startsWith("0")) {
342 					if (svala.indexOf('b') == 1)
343 						return new Token("" + java.lang.Long.parseLong(svala.substring(2), 2), Token.INTEGER); // try
344 																												// binary
345 					if (svala.indexOf('o') == 1)
346 						return new Token("" + java.lang.Long.parseLong(svala.substring(2), 8), Token.INTEGER); // try
347 																												// octal
348 					if (svala.indexOf('x') == 1)
349 						return new Token("" + java.lang.Long.parseLong(svala.substring(2), 16), Token.INTEGER); // try
350 																												// hex
351 				}
352 
353 				// lookahead 1
354 				int typeb = super.nextToken();
355 				String svalb = sval;
356 
357 				// 1.b ordinary integers
358 				if (typeb != '.' && typeb != '\'') { // i.e. not float or
359 														// character constant
360 					pushBack(); // lookahead 0
361 					return new Token("" + java.lang.Long.parseLong(svala), Token.INTEGER);
362 				}
363 
364 				// 1.c character code constant
365 				if (typeb == '\'' && "0".equals(svala)) {
366 					int typec = super.nextToken(); // lookahead 2
367 					String svalc = sval;
368 					int intVal;
369 					if ((intVal = isCharacterCodeConstantToken(typec, svalc)) != -1)
370 						return new Token("" + intVal, Token.INTEGER);
371 
372 					// this is an invalid character code constant int
373 					throw new InvalidTermException("Character code constant starting with 0'<X> at line: " + super.lineno() + " cannot be recognized.");
374 				}
375 
376 				// 2.a check that the value of the word prior to period is a
377 				// valid long
378 				java.lang.Long.parseLong(svala); // throws an exception if not
379 
380 				// 2.b first int is followed by a period
381 				if (typeb != '.')
382 					throw new InvalidTermException("A number starting with 0-9 cannot be rcognized as an int and does not have a fraction '.' at line: " + super.lineno());
383 
384 				// lookahead 2
385 				int typec = super.nextToken();
386 				String svalc = sval;
387 
388 				// 2.c check that the next token after '.' is a possible
389 				// fraction
390 				if (typec != TT_WORD) { // if its not, the period is an End
391 										// period
392 					pushBack(); // pushback 1 the token after period
393 					pushBack2 = new PushBack(typeb, svalb); // pushback 2 the
394 															// period token
395 					return new Token(svala, Token.INTEGER); // return what must
396 															// be an int
397 				}
398 
399 				// 2.d checking for exponent
400 				int exponent = svalc.indexOf("E");
401 				if (exponent == -1)
402 					exponent = svalc.indexOf("e");
403 
404 				if (exponent >= 1) { // the float must have a valid exponent
405 					if (exponent == svalc.length() - 1) { // the exponent must
406 															// be signed
407 															// exponent
408 						int typeb2 = super.nextToken();
409 						if (typeb2 == '+' || typeb2 == '-') {
410 							int typec2 = super.nextToken();
411 							String svalc2 = sval;
412 							if (typec2 == TT_WORD) {
413 								// verify the remaining parts of the float and
414 								// return
415 								java.lang.Long.parseLong(svalc.substring(0, exponent));
416 								java.lang.Integer.parseInt(svalc2);
417 								return new Token(svala + "." + svalc + (char) typeb2 + svalc2, Token.FLOAT);
418 							}
419 						}
420 					}
421 				}
422 				// 2.e verify lastly that ordinary floats and unsigned exponent
423 				// floats are Java legal and return them
424 				java.lang.Double.parseDouble(svala + "." + svalc);
425 				return new Token(svala + "." + svalc, Token.FLOAT);
426 
427 			} catch (NumberFormatException e) {
428 				// TODO return more info on what was wrong with the number given
429 				throw new InvalidTermException("A term starting with 0-9 cannot be parsed as a number at line: " + lineno());
430 			}
431 		}
432 		throw new InvalidTermException("Unknown Unicode character: " + typea + "  (" + svala + ")");
433 	}
434 
435 	/**
436 	 * 
437 	 * 
438 	 * @param typec
439 	 * @param svalc
440 	 * @return the intValue of the next character token, -1 if invalid todo
441 	 *         needs a lookahead if typec is \
442 	 */
443 	private static int isCharacterCodeConstantToken(int typec, String svalc) {
444 		if (svalc != null) {
445 			if (svalc.length() == 1)
446 				return (int) svalc.charAt(0);
447 			if (svalc.length() > 1) {
448 				// TODO the following charachters is not implemented:
449 				// * 1 meta escape sequence (* 6.4.2.1 *) todo
450 				// * 1 control escape sequence (* 6.4.2.1 *)
451 				// * 1 octal escape sequence (* 6.4.2.1 *)
452 				// * 1 hexadecimal escape sequence (* 6.4.2.1 *)
453 				return -1;
454 			}
455 		}
456 		if (typec == ' ' || // space char (* 6.5.4 *)
457 				typec == '(' || typec == ')' || typec == '{' || typec == '}' || typec == '[' || typec == ']' || Arrays.binarySearch(GRAPHIC_CHARS, (char) typec) >= 0) // graphic
458 																																										// char
459 																																										// (*
460 																																										// 6.5.1
461 																																										// *)
462 		// TODO solo char (* 6.5.3 *)
463 			return typec;
464 
465 		return -1;
466 	}
467 
468 	private static boolean isWhite(int type) {
469 		return type == ' ' || type == '\r' || type == '\n' || type == '\t' || type == '\f';
470 	}
471 
472 	/**
473 	 * used to implement lookahead for two tokens, super.pushBack() only handles
474 	 * one pushBack..
475 	 */
476 	private static class PushBack {
477 		int typea;
478 		String svala;
479 
480 		public PushBack(int i, String s) {
481 			typea = i;
482 			svala = s;
483 		}
484 	}
485 }