Mercurial > hg > monetdb-java
diff src/main/java/nl/cwi/monetdb/mcl/parser/TupleLineParser.java @ 0:a5a898f6886c
Copy of MonetDB java directory changeset e6e32756ad31.
author | Sjoerd Mullender <sjoerd@acm.org> |
---|---|
date | Wed, 21 Sep 2016 09:34:48 +0200 (2016-09-21) |
parents | |
children | 57978db4ee57 b9b35ca2eec2 |
line wrap: on
line diff
new file mode 100644 --- /dev/null +++ b/src/main/java/nl/cwi/monetdb/mcl/parser/TupleLineParser.java @@ -0,0 +1,183 @@ +/* + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * Copyright 1997 - July 2008 CWI, August 2008 - 2016 MonetDB B.V. + */ + +package nl.cwi.monetdb.mcl.parser; + +/** + * The TupleLineParser extracts the values from a given tuple. The + * number of values that are expected are known upfront to speed up + * allocation and validation. + * + * @author Fabian Groffen <Fabian.Groffen> + */ +public class TupleLineParser extends MCLParser { + /** + * Constructs a TupleLineParser which expects columncount columns. + * + * @param columncount the number of columns in the to be parsed string + */ + public TupleLineParser(int columncount) { + super(columncount); + } + + /** + * Parses the given String source as tuple line. If source cannot + * be parsed, a ParseException is thrown. The columncount argument + * is used for allocation of the returned array. While this seems + * illogical, the caller should know this size, since the + * StartOfHeader contains this information. + * + * @param source a String which should be parsed + * @return 0, as there is no 'type' of TupleLine + * @throws ParseException if an error occurs during parsing + */ + @Override + public int parse(String source) throws MCLParseException { + int len = source.length(); + char[] chrLine = new char[len]; + source.getChars(0, len, chrLine, 0); + + // first detect whether this is a single value line (=) or a + // real tuple ([) + if (chrLine[0] == '=') { + if (values.length != 1) + throw new MCLParseException(values.length + + " columns expected, but only single value found"); + + // return the whole string but the leading = + values[0] = source.substring(1); + + // reset colnr + reset(); + + return 0; + } + + // extract separate fields by examining string, char for char + boolean inString = false, escaped = false; + int cursor = 2, column = 0, i = 2; + StringBuilder uesc = new StringBuilder(); + for (; i < len; i++) { + switch(chrLine[i]) { + default: + escaped = false; + break; + case '\\': + escaped = !escaped; + break; + case '"': + /** + * If all strings are wrapped between two quotes, a \" can + * never exist outside a string. Thus if we believe that we + * are not within a string, we can safely assume we're about + * to enter a string if we find a quote. + * If we are in a string we should stop being in a string if + * we find a quote which is not prefixed by a \, for that + * would be an escaped quote. However, a nasty situation can + * occur where the string is like "test \\" as obvious, a + * test for a \ in front of a " doesn't hold here for all + * cases. Because "test \\\"" can exist as well, we need to + * know if a quote is prefixed by an escaping slash or not. + */ + if (!inString) { + inString = true; + } else if (!escaped) { + inString = false; + } + + // reset escaped flag + escaped = false; + break; + case '\t': + if (!inString && + (i > 0 && chrLine[i - 1] == ',') || + (i + 1 == len - 1 && chrLine[++i] == ']')) // dirty + { + // split! + if (chrLine[cursor] == '"' && + chrLine[i - 2] == '"') + { + // reuse the StringBuilder by cleaning it + uesc.delete(0, uesc.length()); + // prevent capacity increasements + uesc.ensureCapacity((i - 2) - (cursor + 1)); + for (int pos = cursor + 1; pos < i - 2; pos++) { + if (chrLine[pos] == '\\' && pos + 1 < i - 2) { + pos++; + // strToStr and strFromStr in gdk_atoms.mx only + // support \t \n \\ \" and \377 + switch (chrLine[pos]) { + case '\\': + uesc.append('\\'); + break; + case 'n': + uesc.append('\n'); + break; + case 't': + uesc.append('\t'); + break; + case '"': + uesc.append('"'); + break; + case '0': case '1': case '2': case '3': + // this could be an octal number, let's check it out + if (pos + 2 < i - 2 && + chrLine[pos + 1] >= '0' && chrLine[pos + 1] <= '7' && + chrLine[pos + 2] >= '0' && chrLine[pos + 2] <= '7' + ) { + // we got the number! + try { + uesc.append((char)(Integer.parseInt("" + chrLine[pos] + chrLine[pos + 1] + chrLine[pos + 2], 8))); + pos += 2; + } catch (NumberFormatException e) { + // hmmm, this point should never be reached actually... + throw new AssertionError("Flow error, should never try to parse non-number"); + } + } else { + // do default action if number seems not to be correct + uesc.append(chrLine[pos]); + } + break; + default: + // this is wrong, just ignore the escape, and print the char + uesc.append(chrLine[pos]); + break; + } + } else { + uesc.append(chrLine[pos]); + } + } + + // put the unescaped string in the right place + values[column++] = uesc.toString(); + } else if ((i - 1) - cursor == 4 && + source.indexOf("NULL", cursor) == cursor) + { + values[column++] = null; + } else { + values[column++] = + source.substring(cursor, i - 1); + } + cursor = i + 1; + } + + // reset escaped flag + escaped = false; + break; + } + } + // check if this result is of the size we expected it to be + if (column != values.length) + throw new MCLParseException("illegal result length: " + column + "\nlast read: " + (column > 0 ? values[column - 1] : "<none>")); + + // reset colnr + reset(); + + return 0; + } +}