diff src/main/java/nl/cwi/monetdb/mcl/parser/TupleLineParser.java @ 0:a5a898f6886c

Copy of MonetDB java directory changeset e6e32756ad31.
author Sjoerd Mullender <sjoerd@acm.org>
date Wed, 21 Sep 2016 09:34:48 +0200 (2016-09-21)
parents
children 57978db4ee57 b9b35ca2eec2
line wrap: on
line diff
new file mode 100644
--- /dev/null
+++ b/src/main/java/nl/cwi/monetdb/mcl/parser/TupleLineParser.java
@@ -0,0 +1,207 @@
+/*
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0.  If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ * Copyright 1997 - July 2008 CWI, August 2008 - 2016 MonetDB B.V.
+ */
+
+package nl.cwi.monetdb.mcl.parser;
+
+/**
+ * The TupleLineParser extracts the values from a given tuple.  The
+ * number of values that are expected are known upfront to speed up
+ * allocation and validation.
+ *
+ * @author Fabian Groffen <Fabian.Groffen>
+ */
+public class TupleLineParser extends MCLParser {
+	/**
+	 * Constructs a TupleLineParser which expects columncount columns.
+	 *
+	 * @param columncount the number of columns in the to be parsed string
+	 */
+	public TupleLineParser(int columncount) {
+		super(columncount);
+	}
+
+	/**
+	 * Parses the given String source as tuple line and stores the
+	 * extracted values in the values array.  A source starting with
+	 * '=' is a single value line: everything after the '=' becomes the
+	 * one and only value.  Any other source is treated as a tuple of
+	 * the form "[ v1,\tv2,\t...,\tvn\t]": fields start at offset 2, are
+	 * separated by a comma followed by a tab, and the last field is
+	 * terminated by a tab followed by the closing ']'.  Fields wrapped
+	 * in double quotes are stored unquoted and unescaped, the unquoted
+	 * field NULL is stored as null, all other fields are stored as is.
+	 *
+	 * @param source a String which should be parsed
+	 * @return 0, as there is no 'type' of TupleLine
+	 * @throws MCLParseException if source is null or empty, or if the
+	 *         number of values found differs from the number of values
+	 *         this parser expects
+	 */
+	@Override
+	public int parse(String source) throws MCLParseException {
+		if (source == null || source.isEmpty())
+			throw new MCLParseException("Missing tuple data");
+
+		// first detect whether this is a single value line (=) or a
+		// real tuple ([)
+		if (source.charAt(0) == '=') {
+			if (values.length != 1)
+				throw new MCLParseException(values.length +
+						" columns expected, but only single value found");
+
+			// return the whole string but the leading =
+			values[0] = source.substring(1);
+
+			// reset colnr
+			reset();
+
+			return 0;
+		}
+
+		final char[] chrLine = source.toCharArray();
+		final int len = chrLine.length;
+
+		// extract separate fields by examining string, char for char
+		boolean inString = false, escaped = false;
+		int cursor = 2, column = 0;
+		StringBuilder uesc = new StringBuilder();
+		for (int i = 2; i < len; i++) {
+			switch (chrLine[i]) {
+				default:
+					escaped = false;
+				break;
+				case '\\':
+					escaped = !escaped;
+				break;
+				case '"':
+					/*
+					 * Strings are always wrapped between two quotes, so a
+					 * quote found outside a string opens one.  Inside a
+					 * string a quote closes it, unless it is escaped by a
+					 * backslash.  Looking at the previous character alone
+					 * does not tell: in "test \\" the quote ends the
+					 * string, in "test \\\"" the quote following the
+					 * backslashes does not.  Hence the escaped flag.
+					 */
+					if (!inString) {
+						inString = true;
+					} else if (!escaped) {
+						inString = false;
+					}
+
+					// reset escaped flag
+					escaped = false;
+				break;
+				case '\t': {
+					// reset escaped flag
+					escaped = false;
+
+					// a tab inside a string never ends a field
+					if (inString)
+						break;
+
+					// a field ends at ",\t", the last field at "\t]" at
+					// the very end of the line
+					final int end;	// index just past the field
+					if (chrLine[i - 1] == ',') {
+						end = i - 1;
+					} else if (i + 2 == len && chrLine[i + 1] == ']') {
+						end = i;
+						i++;	// skip the closing bracket
+					} else {
+						break;
+					}
+
+					final String field;
+					if (chrLine[cursor] == '"' && chrLine[end - 1] == '"') {
+						field = unescape(chrLine, cursor + 1, end - 1, uesc);
+					} else if (end - cursor == 4 &&
+							source.startsWith("NULL", cursor))
+					{
+						field = null;
+					} else {
+						field = source.substring(cursor, end);
+					}
+
+					// more fields than expected must not overflow values
+					if (column >= values.length)
+						throw new MCLParseException("illegal result length: more than " +
+								values.length + " values found");
+					values[column++] = field;
+					cursor = i + 1;
+				}
+				break;
+			}
+		}
+		// check if this result is of the size we expected it to be
+		if (column != values.length)
+			throw new MCLParseException("illegal result length: " + column + "\nlast read: " + (column > 0 ? values[column - 1] : "<none>"));
+
+		// reset colnr
+		reset();
+
+		return 0;
+	}
+
+	/**
+	 * Returns the characters chrLine[from] up to, but not including,
+	 * chrLine[to] with the escape sequences \\, \", \n, \t and three
+	 * digit octal numbers (\000 - \377) replaced by the character they
+	 * denote.  A backslash followed by any other character is dropped
+	 * and that character is kept; a backslash which is the last
+	 * character of the range is kept as is.
+	 *
+	 * @param chrLine the characters of the tuple line
+	 * @param from index of the first character to unescape
+	 * @param to index just past the last character to unescape
+	 * @param uesc a scratch buffer, its previous contents are discarded
+	 * @return the unescaped String
+	 */
+	private static String unescape(char[] chrLine, int from, int to, StringBuilder uesc) {
+		// reuse the StringBuilder by cleaning it
+		uesc.setLength(0);
+		// prevent capacity increasements
+		uesc.ensureCapacity(to - from);
+		for (int pos = from; pos < to; pos++) {
+			char chr = chrLine[pos];
+			if (chr == '\\' && pos + 1 < to) {
+				chr = chrLine[++pos];
+				// strToStr and strFromStr in gdk_atoms.mx only
+				// support \t \n \\ \" and \377
+				switch (chr) {
+					case 'n':
+						chr = '\n';
+					break;
+					case 't':
+						chr = '\t';
+					break;
+					case '0': case '1': case '2': case '3':
+						// this could be an octal number, let's check it out
+						if (pos + 2 < to &&
+							chrLine[pos + 1] >= '0' && chrLine[pos + 1] <= '7' &&
+							chrLine[pos + 2] >= '0' && chrLine[pos + 2] <= '7')
+						{
+							// we got the number!
+							chr = (char)(((chr - '0') << 6) |
+									((chrLine[pos + 1] - '0') << 3) |
+									(chrLine[pos + 2] - '0'));
+							pos += 2;
+						}
+					break;
+					default:
+						// \\ and \" denote the escaped character itself;
+						// any other escape is wrong, just ignore the
+						// escape and keep the character
+					break;
+				}
+			}
+			uesc.append(chr);
+		}
+		return uesc.toString();
+	}
+}