001 package org.apache.fulcrum.parser;
002
003
004 /*
005 * Licensed to the Apache Software Foundation (ASF) under one
006 * or more contributor license agreements. See the NOTICE file
007 * distributed with this work for additional information
008 * regarding copyright ownership. The ASF licenses this file
009 * to you under the Apache License, Version 2.0 (the
010 * "License"); you may not use this file except in compliance
011 * with the License. You may obtain a copy of the License at
012 *
013 * http://www.apache.org/licenses/LICENSE-2.0
014 *
015 * Unless required by applicable law or agreed to in writing,
016 * software distributed under the License is distributed on an
017 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
018 * KIND, either express or implied. See the License for the
019 * specific language governing permissions and limitations
020 * under the License.
021 */
022
023
024 import java.io.BufferedReader;
025 import java.io.IOException;
026 import java.io.InputStreamReader;
027 import java.io.Reader;
028 import java.io.StreamTokenizer;
029 import java.util.ArrayList;
030 import java.util.Iterator;
031 import java.util.List;
032 import java.util.NoSuchElementException;
033
034 import org.apache.avalon.framework.logger.LogEnabled;
035 import org.apache.avalon.framework.logger.Logger;
036
037 /**
038 * DataStreamParser is used to parse a stream with a fixed format and
039 * generate ValueParser objects which can be used to extract the values
040 * in the desired type.
041 *
042 * <p>The class itself is abstract - a concrete subclass which implements
043 * the initTokenizer method such as CSVParser or TSVParser is required
044 * to use the functionality.
045 *
046 * <p>The class implements the java.util.Iterator interface for convenience.
047 * This allows simple use in a Velocity template for example:
048 *
049 * <pre>
050 * #foreach ($row in $datastream)
051 * Name: $row.Name
052 * Description: $row.Description
053 * #end
054 * </pre>
055 *
056 * @author <a href="mailto:sean@informage.net">Sean Legassick</a>
057 * @version $Id: DataStreamParser.java 732115 2009-01-06 20:54:04Z tv $
058 */
059 public abstract class DataStreamParser
060 implements Iterator, LogEnabled
061 {
062 /**
063 * The list of column names.
064 */
065 private List columnNames;
066
067 /**
068 * The stream tokenizer for reading values from the input reader.
069 */
070 private StreamTokenizer tokenizer;
071
072 /**
073 * The parameter parser holding the values of columns for the current line.
074 */
075 private ValueParser lineValues;
076
077 /**
078 * Indicates whether or not the tokenizer has read anything yet.
079 */
080 private boolean neverRead = true;
081
082 /**
083 * The character encoding of the input
084 */
085 private String characterEncoding;
086
087 /**
088 * Logger to use
089 */
090 protected Logger log;
091
092 /**
093 * Create a new DataStreamParser instance. Requires a Reader to read the
094 * comma-separated values from, a list of column names and a
095 * character encoding.
096 *
097 * @param in the input reader.
098 * @param columnNames a list of column names.
099 * @param characterEncoding the character encoding of the input.
100 */
101 public DataStreamParser(Reader in, List columnNames,
102 String characterEncoding)
103 {
104 this.columnNames = columnNames;
105 this.characterEncoding = characterEncoding;
106
107 if (this.characterEncoding == null)
108 {
109 // try and get the characterEncoding from the reader
110 this.characterEncoding = "US-ASCII";
111 try
112 {
113 this.characterEncoding = ((InputStreamReader)in).getEncoding();
114 }
115 catch (ClassCastException e)
116 {
117 // ignore
118 }
119 }
120
121 tokenizer = new StreamTokenizer(new BufferedReader(in));
122 initTokenizer(tokenizer);
123 }
124
125 /**
126 * Initialize the StreamTokenizer instance used to read the lines
127 * from the input reader. This must be implemented in subclasses to
128 * set up the tokenizing properties.
129 */
130 protected abstract void initTokenizer(StreamTokenizer tokenizer);
131
132 /**
133 * Provide a logger
134 *
135 * @see org.apache.avalon.framework.logger.LogEnabled#enableLogging(org.apache.avalon.framework.logger.Logger)
136 */
137 public void enableLogging(Logger logger)
138 {
139 this.log = logger.getChildLogger("DataStreamParser");
140 }
141
142 /**
143 * Set the list of column names explicitly.
144 *
145 * @param columnNames A list of column names.
146 */
147 public void setColumnNames(List columnNames)
148 {
149 this.columnNames = columnNames;
150 }
151
152 /**
153 * Read the list of column names from the input reader using the
154 * tokenizer.
155 *
156 * @exception IOException an IOException occurred.
157 */
158 public void readColumnNames()
159 throws IOException
160 {
161 columnNames = new ArrayList();
162
163 neverRead = false;
164 tokenizer.nextToken();
165 while (tokenizer.ttype == StreamTokenizer.TT_WORD
166 || tokenizer.ttype == '"')
167 {
168 columnNames.add(tokenizer.sval);
169 tokenizer.nextToken();
170 }
171 }
172
173 /**
174 * Determine whether a further row of values exists in the input.
175 *
176 * @return true if the input has more rows.
177 * @exception IOException an IOException occurred.
178 */
179 public boolean hasNextRow()
180 throws IOException
181 {
182 // check for end of line ensures that an empty last line doesn't
183 // give a false positive for hasNextRow
184 if (neverRead || tokenizer.ttype == StreamTokenizer.TT_EOL)
185 {
186 tokenizer.nextToken();
187 tokenizer.pushBack();
188 neverRead = false;
189 }
190 return tokenizer.ttype != StreamTokenizer.TT_EOF;
191 }
192
193 /**
194 * Returns a ValueParser object containing the next row of values.
195 *
196 * @return a ValueParser object.
197 * @exception IOException an IOException occurred.
198 * @exception NoSuchElementException there are no more rows in the input.
199 */
200 public ValueParser nextRow()
201 throws IOException, NoSuchElementException
202 {
203 if (!hasNextRow())
204 {
205 throw new NoSuchElementException();
206 }
207
208 if (lineValues == null)
209 {
210 lineValues = new BaseValueParser(characterEncoding);
211 }
212 else
213 {
214 lineValues.clear();
215 }
216
217 Iterator it = columnNames.iterator();
218 tokenizer.nextToken();
219 while (tokenizer.ttype == StreamTokenizer.TT_WORD
220 || tokenizer.ttype == '"')
221 {
222 // note this means that if there are more values than
223 // column names, the extra values are discarded.
224 if (it.hasNext())
225 {
226 String colname = it.next().toString();
227 String colval = tokenizer.sval;
228 if (log.isDebugEnabled())
229 {
230 log.debug("DataStreamParser.nextRow(): " +
231 colname + '=' + colval);
232 }
233 lineValues.add(colname, colval);
234 }
235 tokenizer.nextToken();
236 }
237
238 return lineValues;
239 }
240
241 /**
242 * Determine whether a further row of values exists in the input.
243 *
244 * @return true if the input has more rows.
245 */
246 public boolean hasNext()
247 {
248 boolean hasNext = false;
249
250 try
251 {
252 hasNext = hasNextRow();
253 }
254 catch (IOException e)
255 {
256 log.error("IOException in CSVParser.hasNext", e);
257 }
258
259 return hasNext;
260 }
261
262 /**
263 * Returns a ValueParser object containing the next row of values.
264 *
265 * @return a ValueParser object as an Object.
266 * @exception NoSuchElementException there are no more rows in the input
267 * or an IOException occurred.
268 */
269 public Object next()
270 throws NoSuchElementException
271 {
272 Object nextRow = null;
273
274 try
275 {
276 nextRow = nextRow();
277 }
278 catch (IOException e)
279 {
280 log.error("IOException in CSVParser.next", e);
281 throw new NoSuchElementException();
282 }
283
284 return nextRow;
285 }
286
287 /**
288 * The optional Iterator.remove method is not supported.
289 *
290 * @exception UnsupportedOperationException the operation is not supported.
291 */
292 public void remove()
293 throws UnsupportedOperationException
294 {
295 throw new UnsupportedOperationException();
296 }
297 }