changeset 322:0fcf338ce0b4

Optimized the parse() method of TupleLineParser by creating fewer helper objects and replacing method calls with direct operations on variables. We now only create the StringBuilder object uesc when it is needed (when an escaped character is detected in the tuple line). In most tuple data lines no escape characters are used and thus the StringBuilder object was not used/needed. Also increased the default capacity of StringBuilder uesc so that it is not enlarged so often. Also made the StringBuilder object uesc part of the TupleLineParser object, so that it can be reused by many parse() calls, again reducing the number of created StringBuilder objects.
author Martin van Dinther <martin.van.dinther@monetdbsolutions.com>
date Wed, 11 Sep 2019 17:18:00 +0200 (2019-09-11)
parents dbd8e9a8566f
children 8701024a9bb0
files ChangeLog src/main/java/nl/cwi/monetdb/mcl/parser/MCLParser.java src/main/java/nl/cwi/monetdb/mcl/parser/TupleLineParser.java
diffstat 3 files changed, 66 insertions(+), 45 deletions(-) [+]
line wrap: on
line diff
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,6 +1,10 @@
 # ChangeLog file for monetdb-java
 # This file is updated with Maddlog
 
+* Wed Sep 11 2019 Martin van Dinther <martin.van.dinther@monetdbsolutions.com>
+- Optimized parse() method of TupleLineParser by creating fewer helper objects
+  and replacing method calls with direct operations on variables.
+
 * Wed Sep  4 2019 Martin van Dinther <martin.van.dinther@monetdbsolutions.com>
 - The jar files are now named according to Java version compatibility.
   For example, the monetdb-jdbc-2.29.jre7.jar file should be used with
--- a/src/main/java/nl/cwi/monetdb/mcl/parser/MCLParser.java
+++ b/src/main/java/nl/cwi/monetdb/mcl/parser/MCLParser.java
@@ -26,7 +26,7 @@ package nl.cwi.monetdb.mcl.parser;
 public abstract class MCLParser {
 	/** The String values found while parsing.  Public, you may touch it. */
 	public final String values[];
-	protected int colnr;
+	protected int colnr = 0;
 
 	/**
 	 * Creates an MCLParser targetted at a given number of field values.
--- a/src/main/java/nl/cwi/monetdb/mcl/parser/TupleLineParser.java
+++ b/src/main/java/nl/cwi/monetdb/mcl/parser/TupleLineParser.java
@@ -9,15 +9,21 @@
 package nl.cwi.monetdb.mcl.parser;
 
 /**
- * The TupleLineParser extracts the values from a given tuple.  The
- * number of values that are expected are known upfront to speed up
+ * The TupleLineParser extracts the values from a given tuple.
+ * The number of values that are expected are known upfront to speed up
  * allocation and validation.
  *
  * @author Fabian Groffen
+ * @author Martin van Dinther
  */
 public final class TupleLineParser extends MCLParser {
+	private StringBuilder uesc = null;	// used for building field string value when an escape is present in the field value
+
 	/**
 	 * Constructs a TupleLineParser which expects columncount columns.
+	 * The columncount argument is used for allocation of the public values array.
+	 * While this seems illogical, the caller should know this size, since the
+	 * StartOfHeader contains this information.
 	 *
 	 * @param columncount the number of columns in the to be parsed string
 	 */
@@ -26,22 +32,22 @@ public final class TupleLineParser exten
 	}
 
 	/**
-	 * Parses the given String source as tuple line.  If source cannot
-	 * be parsed, a ParseException is thrown.  The columncount argument
-	 * is used for allocation of the returned array.  While this seems
-	 * illogical, the caller should know this size, since the
-	 * StartOfHeader contains this information.
+	 * Parses the given String source as tuple line.
+	 * If source cannot be parsed, a MCLParseException is thrown.
 	 *
-	 * @param source a String which should be parsed
+	 * @param source a String representing a tuple line which should be parsed
 	 * @return 0, as there is no 'type' of TupleLine
-	 * @throws MCLParseException if an error occurs during parsing
+	 * @throws MCLParseException if source is not compliant to expected tuple/single value format
 	 */
 	@Override
 	public int parse(final String source) throws MCLParseException {
 		final int len = source.length();
-		// first detect whether this is a single value line (=) or a
-		// real tuple ([)
-		if (len >= 1 && source.charAt(0) == '=') {
+		if (len <= 0)
+			throw new MCLParseException("Missing tuple data");
+
+		// first detect whether this is a single value line (=) or a real tuple ([)
+		char chr = source.charAt(0);
+		if (chr == '=') {
 			if (values.length != 1)
 				throw new MCLParseException(values.length +
 						" columns expected, but only single value found");
@@ -50,20 +56,19 @@ public final class TupleLineParser exten
 			values[0] = source.substring(1);
 
 			// reset colnr
-			reset();
+			colnr = 0;
 			return 0;
 		}
 
-		if (!source.startsWith("["))
+		if (chr != '[')
 			throw new MCLParseException("Expected a data row starting with [");
 
 		// It is a tuple. Extract separate fields by examining the string data char for char
-		// convert whole string to char[] to avoid overhead of source.charAt(i) calls
-		// TODO: measure the source.charAt(i) overhead and whether it is faster to eliminate the source.toCharArray(); copy
+		// For parsing it is faster to use an char[] to avoid overhead of source.charAt(i) method calls
 		final char[] chrLine = source.toCharArray();
 		boolean inString = false, escaped = false, fieldHasEscape = false;
-		final StringBuilder uesc = new StringBuilder(128);	// used for building field string value when an escape is present in the field value
 		int column = 0, cursor = 2;
+		// scan the characters, when a field separator is found extract the field value as String dealing with possible escape characters
 		for (int i = 2; i < len; i++) {
 			switch(chrLine[i]) {
 				case '\\':
@@ -102,14 +107,23 @@ public final class TupleLineParser exten
 						if (chrLine[cursor] == '"' &&
 						    chrLine[endpos] == '"')	// field is surrounded by double quotes, so a string with possible escape codes
 						{
+							cursor++;
+							final int fieldlen = endpos - cursor;
 							if (fieldHasEscape) {
-								// reuse the StringBuilder by cleaning it
-								uesc.setLength(0);
-								// prevent multiple capacity increments during the append()'s in the inner loop
-								uesc.ensureCapacity(endpos - (cursor + 1));
+								if (uesc == null) {
+									// first time use, create it with enough capacity, minimum 1024
+									uesc = new StringBuilder(fieldlen > 1024 ? fieldlen : 1024);
+								} else {
+									// reuse the StringBuilder by cleaning it
+									uesc.setLength(0);
+									if (fieldlen > 1024) {
+										// prevent multiple capacity increments during the append()'s in the inner loop
+										uesc.ensureCapacity(fieldlen);
+									}
+								}
 								// parse the field value (excluding the double quotes) and convert it to a string without any escape characters
-								for (int pos = cursor + 1; pos < endpos; pos++) {
-									char chr = chrLine[pos];
+								for (int pos = cursor; pos < endpos; pos++) {
+									chr = chrLine[pos];
 									if (chr == '\\' && pos + 1 < endpos) {
 										// we detected an escape
 										// escapedStr and GDKstrFromStr in gdk_atoms.c only
@@ -117,9 +131,6 @@ public final class TupleLineParser exten
 										pos++;
 										chr = chrLine[pos];
 										switch (chr) {
-											case '\\':
-												uesc.append('\\');
-												break;
 											case 'f':
 												uesc.append('\f');
 												break;
@@ -132,9 +143,6 @@ public final class TupleLineParser exten
 											case 't':
 												uesc.append('\t');
 												break;
-											case '"':
-												uesc.append('"');
-												break;
 											case '0': case '1': case '2': case '3':
 												// this could be an octal number, let's check it out
 												if (pos + 2 < endpos) {
@@ -158,8 +166,16 @@ public final class TupleLineParser exten
 													uesc.append(chr);
 												}
 												break;
+											/* case '\\':	optimisation: this code does the same as the default case, so not needed
+												uesc.append('\\');
+												break;
+											*/
+											/* case '"':	optimisation: this code does the same as the default case, so not needed
+												uesc.append('"');
+												break;
+											*/
 											default:
-												// this is wrong usage of escape, just ignore the \-escape and print the char
+												// this is wrong usage of escape (except for '\\' and '"'), just ignore the \-escape and print the char
 												uesc.append(chr);
 												break;
 										}
@@ -171,23 +187,24 @@ public final class TupleLineParser exten
 								values[column] = uesc.toString();
 							} else {
 								// the field is a string surrounded by double quotes and without escape chars
-								cursor++;
-								final String fieldVal = new String(chrLine, cursor, endpos - cursor);
-								// if (fieldVal.contains("\\")) {
+								values[column] = new String(chrLine, cursor, fieldlen);
+								// if (values[column].contains("\\")) {
 								//	throw new MCLParseException("Invalid parsing: detected a \\ in double quoted string: " + fieldVal);
 								// }
-								values[column] = fieldVal;
 							}
-						} else if (((i - 1 - cursor) == 4) && source.indexOf("NULL", cursor) == cursor) {
-							// the field contains NULL, so no value
-							values[column] = null;
 						} else {
-							// the field is a string NOT surrounded by double quotes and thus without escape chars
-							final String fieldVal = new String(chrLine, cursor, i - 1 - cursor);
-							// if (fieldVal.contains("\\")) {
-							//	throw new MCLParseException("Invalid parsing: detected a \\ in unquoted string: " + fieldVal);
-							// }
-							values[column] = fieldVal;
+							final int vlen = i - 1 - cursor;
+							if (vlen == 4 &&
+							    chrLine[cursor] == 'N' && chrLine[cursor+1] == 'U' && chrLine[cursor+2] == 'L' && chrLine[cursor+3] == 'L') {
+								// the field contains NULL, so no value
+								values[column] = null;
+							} else {
+								// the field is a string NOT surrounded by double quotes and thus without escape chars
+								values[column] = new String(chrLine, cursor, vlen);
+								// if (values[column].contains("\\")) {
+								//	throw new MCLParseException("Invalid parsing: detected a \\ in unquoted string: " + fieldVal);
+								// }
+							}
 						}
 						cursor = i + 1;
 						fieldHasEscape = false;		// reset for next field scan
@@ -207,7 +224,7 @@ public final class TupleLineParser exten
 			throw new MCLParseException("illegal result length: " + column + "\nlast read: " + (column > 0 ? values[column - 1] : "<none>"));
 
 		// reset colnr
-		reset();
+		colnr = 0;
 		return 0;
 	}
 }