comparison src/main/java/org/monetdb/mcl/parser/HeaderLineParser.java @ 391:f523727db392

Moved Java classes from packages starting with nl.cwi.monetdb.* to package org.monetdb.*. This naming complies with the Java Package Naming convention, as MonetDB's main website is www.monetdb.org.
author Martin van Dinther <martin.van.dinther@monetdbsolutions.com>
date Thu, 12 Nov 2020 22:02:01 +0100 (2020-11-12)
parents src/main/java/nl/cwi/monetdb/mcl/parser/HeaderLineParser.java@f15d2ac35932
children bf9f6b6ecf40
comparison
equal deleted inserted replaced
390:6199e0be3c6e 391:f523727db392
1 /*
2 * This Source Code Form is subject to the terms of the Mozilla Public
3 * License, v. 2.0. If a copy of the MPL was not distributed with this
4 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
5 *
6 * Copyright 1997 - July 2008 CWI, August 2008 - 2020 MonetDB B.V.
7 */
8
9 package org.monetdb.mcl.parser;
10
11
12 /**
13 * The HeaderLineParser is a generic MCLParser that extracts values from
14 * a metadata header in the MCL protocol either as string or integer
15 * values.
16 *
17 * @author Fabian Groffen
18 */
19 public final class HeaderLineParser extends MCLParser {
20 /* types of meta data supported by MCL protocol */
21 public final static int NAME = 1; // name of column
22 public final static int LENGTH = 2;
23 public final static int TABLE = 3; // may include the schema name
24 public final static int TYPE = 4;
25
26 /** The int values found while parsing. Public, you may touch it. */
27 public final int intValues[];
28
29 /**
30 * Constructs a HeaderLineParser which expects columncount columns.
31 *
32 * @param columncount the number of columns in the to be parsed string
33 */
34 public HeaderLineParser(final int columncount) {
35 super(columncount);
36 intValues = new int[columncount];
37 }
38
39 /**
40 * Parses the given String source as header line. If source cannot
41 * be parsed, an MCLParseException is thrown. The columncount argument
42 * given during construction is used for allocation of the backing array.
43 *
44 * @param source a String which should be parsed
45 * @return the type of then parsed header line
46 * @throws MCLParseException if an error occurs during parsing
47 */
48 @Override
49 public int parse(final String source) throws MCLParseException {
50 final char[] chrLine = source.toCharArray();
51 int len = chrLine.length;
52 int pos = 0;
53 boolean foundChar = false;
54 boolean nameFound = false;
55 int i;
56
57 // find header name searching from the end of the line
58 for (i = len - 1; i >= 0; i--) {
59 switch (chrLine[i]) {
60 case ' ':
61 case '\n':
62 case '\t':
63 case '\r':
64 if (!foundChar) {
65 len = i - 1;
66 } else {
67 pos = i + 1;
68 }
69 break;
70 case '#':
71 // found!
72 nameFound = true;
73 if (pos == 0)
74 pos = i + 1;
75 i = 0; // force the loop to terminate
76 break;
77 default:
78 foundChar = true;
79 pos = 0;
80 break;
81 }
82 }
83 if (!nameFound)
84 throw new MCLParseException("invalid header, no header name found", pos);
85
86 // depending on the name of the header, we continue
87 int type = 0;
88 i = pos;
89 switch (len - pos) {
90 case 4:
91 // source.regionMatches(pos + 1, "name", 0, 4)
92 if (chrLine[i] == 'n' && chrLine[++i] == 'a' && chrLine[++i] == 'm' && chrLine[++i] == 'e') {
93 getValues(chrLine, 2, pos - 3);
94 type = NAME;
95 } else
96 // source.regionMatches(pos + 1, "type", 0, 4)
97 if (chrLine[i] == 't' && chrLine[++i] == 'y' && chrLine[++i] == 'p' && chrLine[++i] == 'e') {
98 getValues(chrLine, 2, pos - 3);
99 type = TYPE;
100 }
101 break;
102 case 6:
103 // source.regionMatches(pos + 1, "length", 0, 6)
104 if (chrLine[ i ] == 'l' && chrLine[++i] == 'e' && chrLine[++i] == 'n' && chrLine[++i] == 'g'
105 && chrLine[++i] == 't' && chrLine[++i] == 'h') {
106 getIntValues(chrLine, 2, pos - 3);
107 type = LENGTH;
108 }
109 break;
110 case 10:
111 // source.regionMatches(pos + 1, "table_name", 0, 10)
112 if (chrLine[ i ] == 't' && chrLine[++i] == 'a' && chrLine[++i] == 'b' && chrLine[++i] == 'l' && chrLine[++i] == 'e'
113 && chrLine[++i] == '_' && chrLine[++i] == 'n' && chrLine[++i] == 'a' && chrLine[++i] == 'm' && chrLine[++i] == 'e') {
114 getValues(chrLine, 2, pos - 3);
115 type = TABLE;
116 }
117 break;
118 default:
119 throw new MCLParseException("unknown header: " + (new String(chrLine, pos, len - pos)));
120 }
121
122 // adjust colno
123 colnr = 0;
124
125 return type;
126 }
127
128 /**
129 * Fills an array of Strings containing the values between
130 * ',\t' separators.
131 *
132 * As of Oct2014-SP1 release MAPI adds double quotes around names when
133 * the name contains a comma or a tab or a space or a # or " or \ escape character.
134 * See issue: https://www.monetdb.org/bugzilla/show_bug.cgi?id=3616
135 * If the parsed name string part has a " as first and last character,
136 * we remove those added double quotes here.
137 *
138 * @param chrLine a character array holding the input data
139 * @param start where the relevant data starts
140 * @param stop where the relevant data stops
141 */
142 private final void getValues(final char[] chrLine, int start, final int stop) {
143 int elem = 0;
144 boolean inString = false, escaped = false;
145
146 for (int i = start; i < stop; i++) {
147 switch(chrLine[i]) {
148 case '\\':
149 escaped = !escaped;
150 break;
151 case '"':
152 /**
153 * If all strings are wrapped between two quotes, a \" can
154 * never exist outside a string. Thus if we believe that we
155 * are not within a string, we can safely assume we're about
156 * to enter a string if we find a quote.
157 * If we are in a string we should stop being in a string if
158 * we find a quote which is not prefixed by a \, for that
159 * would be an escaped quote. However, a nasty situation can
160 * occur where the string is like "test \\" as obvious, a
161 * test for a \ in front of a " doesn't hold here for all
162 * cases. Because "test \\\"" can exist as well, we need to
163 * know if a quote is prefixed by an escaping slash or not.
164 */
165 if (!inString) {
166 inString = true;
167 } else if (!escaped) {
168 inString = false;
169 }
170 // reset escaped flag
171 escaped = false;
172 break;
173 case ',':
174 if (!inString && chrLine[i + 1] == '\t') {
175 // we found the field separator
176 if (chrLine[start] == '"')
177 start++; // skip leading double quote
178 if (elem < values.length) {
179 // TODO: also deal with escape characters as done in TupleLineParser.parse()
180 values[elem++] = new String(chrLine, start, i - (chrLine[i - 1] == '"' ? 1 : 0) - start);
181 }
182 i++;
183 start = i + 1; // reset start for the next name, skipping the field separator (a comma and tab)
184 }
185 // reset escaped flag
186 escaped = false;
187 break;
188 default:
189 escaped = false;
190 break;
191 }
192 }
193 // add the left over part (last column)
194 if (chrLine[start] == '"')
195 start++; // skip leading double quote
196 if (elem < values.length)
197 values[elem] = new String(chrLine, start, stop - (chrLine[stop - 1] == '"' ? 1 : 0) - start);
198 }
199
200 /**
201 * Fills an array of ints containing the values between
202 * ',\t' separators.
203 *
204 * Feb2017 note - This integer parser doesn't have to parse negative
205 * numbers, because it is only used to parse column lengths
206 * which are always greater than 0.
207 *
208 * @param chrLine a character array holding the input data
209 * @param start where the relevant data starts
210 * @param stop where the relevant data stops
211 */
212 private final void getIntValues(final char[] chrLine, final int start, final int stop) throws MCLParseException {
213 int elem = 0;
214 int tmp = 0;
215
216 for (int i = start; i < stop; i++) {
217 if (chrLine[i] == ',' && chrLine[i + 1] == '\t') {
218 if (elem < intValues.length) {
219 intValues[elem++] = tmp;
220 }
221 tmp = 0;
222 i++;
223 } else {
224 // note: don't use Character.isDigit() here, because
225 // we only want ISO-LATIN-1 digits
226 if (chrLine[i] >= '0' && chrLine[i] <= '9') {
227 tmp *= 10;
228 tmp += (int)chrLine[i] - (int)'0';
229 } else {
230 throw new MCLParseException("expected a digit in " + new String(chrLine) + " at " + i);
231 }
232 }
233 }
234 // add the left over part (last column)
235 if (elem < intValues.length)
236 intValues[elem] = tmp;
237 }
238 }