Mercurial > hg > monetdb-java
view src/main/java/nl/cwi/monetdb/mcl/parser/TupleLineParser.java @ 261:d4baf8a4b43a
Update Copyright year to 2019
author | Martin van Dinther <martin.van.dinther@monetdbsolutions.com> |
---|---|
date | Thu, 03 Jan 2019 14:43:44 +0100 (2019-01-03) |
parents | ae1d0d1c2f0f |
children | bb273e9c7e09 |
line wrap: on
line source
/* * This Source Code Form is subject to the terms of the Mozilla Public * License, v. 2.0. If a copy of the MPL was not distributed with this * file, You can obtain one at http://mozilla.org/MPL/2.0/. * * Copyright 1997 - July 2008 CWI, August 2008 - 2019 MonetDB B.V. */ package nl.cwi.monetdb.mcl.parser; /** * The TupleLineParser extracts the values from a given tuple. The * number of values that are expected are known upfront to speed up * allocation and validation. * * @author Fabian Groffen */ public class TupleLineParser extends MCLParser { /** * Constructs a TupleLineParser which expects columncount columns. * * @param columncount the number of columns in the to be parsed string */ public TupleLineParser(int columncount) { super(columncount); } /** * Parses the given String source as tuple line. If source cannot * be parsed, a ParseException is thrown. The columncount argument * is used for allocation of the returned array. While this seems * illogical, the caller should know this size, since the * StartOfHeader contains this information. * * @param source a String which should be parsed * @return 0, as there is no 'type' of TupleLine * @throws MCLParseException if an error occurs during parsing */ @Override public int parse(String source) throws MCLParseException { final int len = source.length(); // first detect whether this is a single value line (=) or a // real tuple ([) if (len >= 1 && source.charAt(0) == '=') { if (values.length != 1) throw new MCLParseException(values.length + " columns expected, but only single value found"); // return the whole string but without the leading = values[0] = source.substring(1); // reset colnr reset(); return 0; } if (!source.startsWith("[")) throw new MCLParseException("Expected a data row starting with ["); // It is a tuple. Extract separate fields by examining the string data char for char final char[] chrLine = source.toCharArray(); // convert whole string to char[] to avoid overhead of source.charAt(i) calls TODO: measure the overhead boolean inString = false, escaped = false, fieldHasEscape = false; final StringBuilder uesc = new StringBuilder(128); // used for building field string value when an escape is present in the field value int column = 0, cursor = 2; for (int i = 2; i < len; i++) { switch(chrLine[i]) { case '\\': escaped = !escaped; fieldHasEscape = true; break; case '"': /** * If all strings are wrapped between two quotes, a \" can * never exist outside a string. Thus if we believe that we * are not within a string, we can safely assume we're about * to enter a string if we find a quote. * If we are in a string we should stop being in a string if * we find a quote which is not prefixed by a \, for that * would be an escaped quote. However, a nasty situation can * occur where the string is like "test \\" as obvious, a * test for a \ in front of a " doesn't hold here for all * cases. Because "test \\\"" can exist as well, we need to * know if a quote is prefixed by an escaping slash or not. */ if (!inString) { inString = true; } else if (!escaped) { inString = false; } // reset escaped flag escaped = false; break; case '\t': // potential field separator found if (!inString && ((chrLine[i - 1] == ',') || // found field separator: ,\t ((i + 1 == len - 1) && chrLine[++i] == ']'))) // found last field: \t] { // extract the field value as a string, without the potential escape codes final int endpos = i - 2; // minus the tab and the comma or ] if (chrLine[cursor] == '"' && chrLine[endpos] == '"') // field is surrounded by double quotes, so a string with possible escape codes { if (fieldHasEscape) { // reuse the StringBuilder by cleaning it uesc.delete(0, uesc.length()); // prevent capacity increasements uesc.ensureCapacity(endpos - (cursor + 1)); // parse the field value (excluding the double quotes) and convert it to a string without any escape characters for (int pos = cursor + 1; pos < endpos; pos++) { char chr = chrLine[pos]; if (chr == '\\' && pos + 1 < endpos) { // we detected an escape // escapedStr and GDKstrFromStr in gdk_atoms.c only // support \\ \f \n \r \t \" and \377 pos++; chr = chrLine[pos]; switch (chr) { case '\\': uesc.append('\\'); break; case 'f': uesc.append('\f'); break; case 'n': uesc.append('\n'); break; case 'r': uesc.append('\r'); break; case 't': uesc.append('\t'); break; case '"': uesc.append('"'); break; case '0': case '1': case '2': case '3': // this could be an octal number, let's check it out if (pos + 2 < endpos) { char chr2 = chrLine[pos + 1]; char chr3 = chrLine[pos + 2]; if (chr2 >= '0' && chr2 <= '7' && chr3 >= '0' && chr3 <= '7') { // we got an octal number between \000 and \377 try { uesc.append((char)(Integer.parseInt("" + chr + chr2 + chr3, 8))); pos += 2; } catch (NumberFormatException e) { // hmmm, this point should never be reached actually... throw new AssertionError("Flow error, should never try to parse non-number"); } } else { // do default action if number seems not to be an octal number uesc.append(chr); } } else { // do default action if number seems not to be an octal number uesc.append(chr); } break; default: // this is wrong usage of escape, just ignore the \-escape and print the char uesc.append(chr); break; } } else { uesc.append(chr); } } // put the unescaped string in the right place values[column] = uesc.toString(); } else { // the field is a string surrounded by double quotes and without escape chars cursor++; String fieldVal = new String(chrLine, cursor, endpos - cursor); // if (fieldVal.contains("\\")) { // throw new MCLParseException("Invalid parsing: detected a \\ in double quoted string: " + fieldVal); // } values[column] = fieldVal; } } else if (((i - 1 - cursor) == 4) && source.indexOf("NULL", cursor) == cursor) { // the field contains NULL, so no value values[column] = null; } else { // the field is a string NOT surrounded by double quotes and thus without escape chars String fieldVal = new String(chrLine, cursor, i - 1 - cursor); // if (fieldVal.contains("\\")) { // throw new MCLParseException("Invalid parsing: detected a \\ in unquoted string: " + fieldVal); // } values[column] = fieldVal; } cursor = i + 1; fieldHasEscape = false; // reset for next field scan column++; } // reset escaped flag escaped = false; break; default: escaped = false; break; } // end of switch() } // end of for() // check if this result is of the size we expected it to be if (column != values.length) throw new MCLParseException("illegal result length: " + column + "\nlast read: " + (column > 0 ? values[column - 1] : "<none>")); // reset colnr reset(); return 0; } }