comparison src/main/java/nl/cwi/monetdb/mcl/parser/TupleLineParser.java @ 0:a5a898f6886c

Copy of MonetDB java directory changeset e6e32756ad31.
author Sjoerd Mullender <sjoerd@acm.org>
date Wed, 21 Sep 2016 09:34:48 +0200 (2016-09-21)
parents
children 57978db4ee57 b9b35ca2eec2
comparison
equal deleted inserted replaced
-1:000000000000 0:a5a898f6886c
1 /*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2016 MonetDB B.V.
7 */
8
9 package nl.cwi.monetdb.mcl.parser;
10
11 /**
12 * The TupleLineParser extracts the values from a given tuple. The
13 * number of values that are expected are known upfront to speed up
14 * allocation and validation.
15 *
16 * @author Fabian Groffen <Fabian.Groffen>
17 */
18 public class TupleLineParser extends MCLParser {
19 /**
20 * Constructs a TupleLineParser which expects columncount columns.
21 *
22 * @param columncount the number of columns in the to be parsed string
23 */
24 public TupleLineParser(int columncount) {
25 super(columncount);
26 }
27
28 /**
29 * Parses the given String source as tuple line. If source cannot
30 * be parsed, a ParseException is thrown. The columncount argument
31 * is used for allocation of the returned array. While this seems
32 * illogical, the caller should know this size, since the
33 * StartOfHeader contains this information.
34 *
35 * @param source a String which should be parsed
36 * @return 0, as there is no 'type' of TupleLine
37 * @throws ParseException if an error occurs during parsing
38 */
39 @Override
40 public int parse(String source) throws MCLParseException {
41 int len = source.length();
42 char[] chrLine = new char[len];
43 source.getChars(0, len, chrLine, 0);
44
45 // first detect whether this is a single value line (=) or a
46 // real tuple ([)
47 if (chrLine[0] == '=') {
48 if (values.length != 1)
49 throw new MCLParseException(values.length +
50 " columns expected, but only single value found");
51
52 // return the whole string but the leading =
53 values[0] = source.substring(1);
54
55 // reset colnr
56 reset();
57
58 return 0;
59 }
60
61 // extract separate fields by examining string, char for char
62 boolean inString = false, escaped = false;
63 int cursor = 2, column = 0, i = 2;
64 StringBuilder uesc = new StringBuilder();
65 for (; i < len; i++) {
66 switch(chrLine[i]) {
67 default:
68 escaped = false;
69 break;
70 case '\\':
71 escaped = !escaped;
72 break;
73 case '"':
74 /**
75 * If all strings are wrapped between two quotes, a \" can
76 * never exist outside a string. Thus if we believe that we
77 * are not within a string, we can safely assume we're about
78 * to enter a string if we find a quote.
79 * If we are in a string we should stop being in a string if
80 * we find a quote which is not prefixed by a \, for that
81 * would be an escaped quote. However, a nasty situation can
82 * occur where the string is like "test \\" as obvious, a
83 * test for a \ in front of a " doesn't hold here for all
84 * cases. Because "test \\\"" can exist as well, we need to
85 * know if a quote is prefixed by an escaping slash or not.
86 */
87 if (!inString) {
88 inString = true;
89 } else if (!escaped) {
90 inString = false;
91 }
92
93 // reset escaped flag
94 escaped = false;
95 break;
96 case '\t':
97 if (!inString &&
98 (i > 0 && chrLine[i - 1] == ',') ||
99 (i + 1 == len - 1 && chrLine[++i] == ']')) // dirty
100 {
101 // split!
102 if (chrLine[cursor] == '"' &&
103 chrLine[i - 2] == '"')
104 {
105 // reuse the StringBuilder by cleaning it
106 uesc.delete(0, uesc.length());
107 // prevent capacity increasements
108 uesc.ensureCapacity((i - 2) - (cursor + 1));
109 for (int pos = cursor + 1; pos < i - 2; pos++) {
110 if (chrLine[pos] == '\\' && pos + 1 < i - 2) {
111 pos++;
112 // strToStr and strFromStr in gdk_atoms.mx only
113 // support \t \n \\ \" and \377
114 switch (chrLine[pos]) {
115 case '\\':
116 uesc.append('\\');
117 break;
118 case 'n':
119 uesc.append('\n');
120 break;
121 case 't':
122 uesc.append('\t');
123 break;
124 case '"':
125 uesc.append('"');
126 break;
127 case '0': case '1': case '2': case '3':
128 // this could be an octal number, let's check it out
129 if (pos + 2 < i - 2 &&
130 chrLine[pos + 1] >= '0' && chrLine[pos + 1] <= '7' &&
131 chrLine[pos + 2] >= '0' && chrLine[pos + 2] <= '7'
132 ) {
133 // we got the number!
134 try {
135 uesc.append((char)(Integer.parseInt("" + chrLine[pos] + chrLine[pos + 1] + chrLine[pos + 2], 8)));
136 pos += 2;
137 } catch (NumberFormatException e) {
138 // hmmm, this point should never be reached actually...
139 throw new AssertionError("Flow error, should never try to parse non-number");
140 }
141 } else {
142 // do default action if number seems not to be correct
143 uesc.append(chrLine[pos]);
144 }
145 break;
146 default:
147 // this is wrong, just ignore the escape, and print the char
148 uesc.append(chrLine[pos]);
149 break;
150 }
151 } else {
152 uesc.append(chrLine[pos]);
153 }
154 }
155
156 // put the unescaped string in the right place
157 values[column++] = uesc.toString();
158 } else if ((i - 1) - cursor == 4 &&
159 source.indexOf("NULL", cursor) == cursor)
160 {
161 values[column++] = null;
162 } else {
163 values[column++] =
164 source.substring(cursor, i - 1);
165 }
166 cursor = i + 1;
167 }
168
169 // reset escaped flag
170 escaped = false;
171 break;
172 }
173 }
174 // check if this result is of the size we expected it to be
175 if (column != values.length)
176 throw new MCLParseException("illegal result length: " + column + "\nlast read: " + (column > 0 ? values[column - 1] : "<none>"));
177
178 // reset colnr
179 reset();
180
181 return 0;
182 }
183 }