Mercurial > hg > monetdb-java

--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,10 @@
 # ChangeLog file for monetdb-java
 # This file is updated with Maddlog

+* Wed Sep 11 2019 Martin van Dinther <martin.van.dinther@monetdbsolutions.com>
+- Optimized parse() method of TupleLineParser by creating fewer helper objects
+  and replacing method calls with direct operations on variables.
+
 * Wed Sep  4 2019 Martin van Dinther <martin.van.dinther@monetdbsolutions.com>
 - The jar files are now named according to Java version compatibility.
   For example, the monetdb-jdbc-2.29.jre7.jar file should be used with
--- a/src/main/java/nl/cwi/monetdb/mcl/parser/MCLParser.java
+++ b/src/main/java/nl/cwi/monetdb/mcl/parser/MCLParser.java
@@ -26,7 +26,7 @@ package nl.cwi.monetdb.mcl.parser;
 public abstract class MCLParser {
 	/** The String values found while parsing.  Public, you may touch it. */
 	public final String values[];
-	protected int colnr;
+	protected int colnr = 0;

 	/**
 	 * Creates an MCLParser targetted at a given number of field values.
--- a/src/main/java/nl/cwi/monetdb/mcl/parser/TupleLineParser.java
+++ b/src/main/java/nl/cwi/monetdb/mcl/parser/TupleLineParser.java
@@ -9,15 +9,21 @@
 package nl.cwi.monetdb.mcl.parser;

 /**
- * The TupleLineParser extracts the values from a given tuple.  The
- * number of values that are expected are known upfront to speed up
+ * The TupleLineParser extracts the values from a given tuple.
+ * The number of values that are expected is known upfront to speed up
  * allocation and validation.
  *
  * @author Fabian Groffen
+ * @author Martin van Dinther
  */
 public final class TupleLineParser extends MCLParser {
+	private StringBuilder uesc = null;	// used for building field string value when an escape is present in the field value
+
 	/**
 	 * Constructs a TupleLineParser which expects columncount columns.
+	 * The columncount argument is used for allocation of the public values array.
+	 * While this seems illogical, the caller should know this size, since the
+	 * StartOfHeader contains this information.
 	 *
 	 * @param columncount the number of columns in the to be parsed string
 	 */
@@ -26,22 +32,22 @@ public final class TupleLineParser exten
 	}

 	/**
-	 * Parses the given String source as tuple line.  If source cannot
-	 * be parsed, a ParseException is thrown.  The columncount argument
-	 * is used for allocation of the returned array.  While this seems
-	 * illogical, the caller should know this size, since the
-	 * StartOfHeader contains this information.
+	 * Parses the given String source as tuple line.
+	 * If source cannot be parsed, an MCLParseException is thrown.
 	 *
-	 * @param source a String which should be parsed
+	 * @param source a String representing a tuple line which should be parsed
 	 * @return 0, as there is no 'type' of TupleLine
-	 * @throws MCLParseException if an error occurs during parsing
+	 * @throws MCLParseException if source does not comply with the expected tuple/single value format
 	 */
 	@Override
 	public int parse(final String source) throws MCLParseException {
 		final int len = source.length();
-		// first detect whether this is a single value line (=) or a
-		// real tuple ([)
-		if (len >= 1 && source.charAt(0) == '=') {
+		if (len <= 0)
+			throw new MCLParseException("Missing tuple data");
+
+		// first detect whether this is a single value line (=) or a real tuple ([)
+		char chr = source.charAt(0);
+		if (chr == '=') {
 			if (values.length != 1)
 				throw new MCLParseException(values.length +
 						" columns expected, but only single value found");
@@ -50,20 +56,19 @@ public final class TupleLineParser exten
 			values[0] = source.substring(1);

 			// reset colnr
-			reset();
+			colnr = 0;
 			return 0;
 		}

-		if (!source.startsWith("["))
+		if (chr != '[')
 			throw new MCLParseException("Expected a data row starting with [");

 		// It is a tuple. Extract separate fields by examining the string data char for char
-		// convert whole string to char[] to avoid overhead of source.charAt(i) calls
-		// TODO: measure the source.charAt(i) overhead and whether it is faster to eliminate the source.toCharArray(); copy
+		// For parsing it is faster to use a char[] to avoid overhead of source.charAt(i) method calls
 		final char[] chrLine = source.toCharArray();
 		boolean inString = false, escaped = false, fieldHasEscape = false;
-		final StringBuilder uesc = new StringBuilder(128);	// used for building field string value when an escape is present in the field value
 		int column = 0, cursor = 2;
+		// scan the characters, when a field separator is found extract the field value as String dealing with possible escape characters
 		for (int i = 2; i < len; i++) {
 			switch(chrLine[i]) {
 				case '\\':
@@ -102,14 +107,23 @@ public final class TupleLineParser exten
 						if (chrLine[cursor] == '"' &&
 						    chrLine[endpos] == '"')	// field is surrounded by double quotes, so a string with possible escape codes
 						{
+							cursor++;
+							final int fieldlen = endpos - cursor;
 							if (fieldHasEscape) {
-								// reuse the StringBuilder by cleaning it
-								uesc.setLength(0);
-								// prevent multiple capacity increments during the append()'s in the inner loop
-								uesc.ensureCapacity(endpos - (cursor + 1));
+								if (uesc == null) {
+									// first time use, create it with enough capacity, minimum 1024
+									uesc = new StringBuilder(fieldlen > 1024 ? fieldlen : 1024);
+								} else {
+									// reuse the StringBuilder by cleaning it
+									uesc.setLength(0);
+									if (fieldlen > 1024) {
+										// prevent multiple capacity increments during the append()'s in the inner loop
+										uesc.ensureCapacity(fieldlen);
+									}
+								}
 								// parse the field value (excluding the double quotes) and convert it to a string without any escape characters
-								for (int pos = cursor + 1; pos < endpos; pos++) {
-									char chr = chrLine[pos];
+								for (int pos = cursor; pos < endpos; pos++) {
+									chr = chrLine[pos];
 									if (chr == '\\' && pos + 1 < endpos) {
 										// we detected an escape
 										// escapedStr and GDKstrFromStr in gdk_atoms.c only
@@ -117,9 +131,6 @@ public final class TupleLineParser exten
 										pos++;
 										chr = chrLine[pos];
 										switch (chr) {
-											case '\\':
-												uesc.append('\\');
-												break;
 											case 'f':
 												uesc.append('\f');
 												break;
@@ -132,9 +143,6 @@ public final class TupleLineParser exten
 											case 't':
 												uesc.append('\t');
 												break;
-											case '"':
-												uesc.append('"');
-												break;
 											case '0': case '1': case '2': case '3':
 												// this could be an octal number, let's check it out
 												if (pos + 2 < endpos) {
@@ -158,8 +166,16 @@ public final class TupleLineParser exten
 													uesc.append(chr);
 												}
 												break;
+											/* case '\\':	optimisation: this code does the same as the default case, so not needed
+												uesc.append('\\');
+												break;
+											*/
+											/* case '"':	optimisation: this code does the same as the default case, so not needed
+												uesc.append('"');
+												break;
+											*/
 											default:
-												// this is wrong usage of escape, just ignore the \-escape and print the char
+												// this is wrong usage of escape (except for '\\' and '"'), just ignore the \-escape and print the char
 												uesc.append(chr);
 												break;
 										}
@@ -171,23 +187,24 @@ public final class TupleLineParser exten
 								values[column] = uesc.toString();
 							} else {
 								// the field is a string surrounded by double quotes and without escape chars
-								cursor++;
-								final String fieldVal = new String(chrLine, cursor, endpos - cursor);
-								// if (fieldVal.contains("\\")) {
+								values[column] = new String(chrLine, cursor, fieldlen);
+								// if (values[column].contains("\\")) {
 								//	throw new MCLParseException("Invalid parsing: detected a \\ in double quoted string: " + fieldVal);
 								// }
-								values[column] = fieldVal;
 							}
-						} else if (((i - 1 - cursor) == 4) && source.indexOf("NULL", cursor) == cursor) {
-							// the field contains NULL, so no value
-							values[column] = null;
 						} else {
-							// the field is a string NOT surrounded by double quotes and thus without escape chars
-							final String fieldVal = new String(chrLine, cursor, i - 1 - cursor);
-							// if (fieldVal.contains("\\")) {
-							//	throw new MCLParseException("Invalid parsing: detected a \\ in unquoted string: " + fieldVal);
-							// }
-							values[column] = fieldVal;
+							final int vlen = i - 1 - cursor;
+							if (vlen == 4 &&
+							    chrLine[cursor] == 'N' && chrLine[cursor+1] == 'U' && chrLine[cursor+2] == 'L' && chrLine[cursor+3] == 'L') {
+								// the field contains NULL, so no value
+								values[column] = null;
+							} else {
+								// the field is a string NOT surrounded by double quotes and thus without escape chars
+								values[column] = new String(chrLine, cursor, vlen);
+								// if (values[column].contains("\\")) {
+								//	throw new MCLParseException("Invalid parsing: detected a \\ in unquoted string: " + values[column]);
+								// }
+							}
 						}
 						cursor = i + 1;
 						fieldHasEscape = false;		// reset for next field scan
@@ -207,7 +224,7 @@ public final class TupleLineParser exten
 			throw new MCLParseException("illegal result length: " + column + "\nlast read: " + (column > 0 ? values[column - 1] : "<none>"));

 		// reset colnr
-		reset();
+		colnr = 0;
 		return 0;
 	}
 }