001/**
002 * Copyright (c) 2001, Sergey A. Samokhodkin
003 * All rights reserved.
004 * <p>
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 * <p>
008 * - Redistributions of source code must retain the above copyright notice,
009 * this list of conditions and the following disclaimer.
010 * - Redistributions in binary form
011 * must reproduce the above copyright notice, this list of conditions and the following
012 * disclaimer in the documentation and/or other materials provided with the distribution.
013 * - Neither the name of jregex nor the names of its contributors may be used
014 * to endorse or promote products derived from this software without specific prior
015 * written permission.
016 * <p>
017 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
018 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
019 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
020 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
021 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
022 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
023 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
024 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
025 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
026 *
027 * @version 1.2_01
028 */
029
030package regexodus;
031
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
036
037/**
038 * The Tokenizer class suggests a methods to break a text into tokens using
039 * occurrences of a pattern as delimiters.
040 * There are two ways to obtain a text tokenizer for some pattern:<pre>
041 * Pattern p=new Pattern("\\s+"); //any number of space characters
042 * String text="blah blah blah";
043 * //by factory method
044 * RETokenizer tok1=p.tokenizer(text);
045 * //or by constructor
046 * RETokenizer tok2=new RETokenizer(p,text);
047 * </pre>
048 * Now the one way is to use the tokenizer as a token enumeration/iterator:<pre>
049 * while(tok1.hasMore()) System.out.println(tok1.nextToken());
050 * </pre>
051 * and another way is to split it into a String array:
052 * {@code
053 * String[] arr=tok2.split();
054 * for(int i=0;i<tok2.length;i++) System.out.println(arr[i]);}
055 *
056 * @see Pattern#tokenizer(java.lang.String)
057 */
058
059public class RETokenizer implements Iterator<String> {
060    private Matcher matcher;
061    private boolean checked;
062    private boolean hasToken;
063    private String token;
064    private int pos = 0;
065    private boolean endReached = false;
066    private boolean emptyTokensEnabled = false;
067
068    public RETokenizer(Pattern pattern, String text) {
069        this(pattern.matcher(text), false);
070    }
071
072    public RETokenizer(Pattern pattern, char[] chars, int off, int len) {
073        this(pattern.matcher(chars, off, len), false);
074    }
075
076    @GwtIncompatible
077    public RETokenizer(Pattern pattern, Reader r, int len) throws IOException {
078        this(pattern.matcher(r, len), false);
079    }
080
081    private RETokenizer(Matcher m, boolean emptyEnabled) {
082        matcher = m;
083        emptyTokensEnabled = emptyEnabled;
084    }
085
086    public void setEmptyEnabled(boolean b) {
087        emptyTokensEnabled = b;
088    }
089
090    public boolean isEmptyEnabled() {
091        return emptyTokensEnabled;
092    }
093
094    private boolean hasMore() {
095        if (!checked) check();
096        return hasToken;
097    }
098
099    private String nextToken() {
100        if (!checked) check();
101        if (!hasToken) throw new NoSuchElementException();
102        checked = false;
103        return token;
104    }
105
106    public String[] split() {
107        return collect(this, null, 0);
108    }
109
110    public void reset() {
111        matcher.setPosition(0);
112    }
113
114    private static String[] collect(RETokenizer tok, String[] arr, int count) {
115        if (tok.hasMore()) {
116            String s = tok.nextToken();
117//System.out.println("collect(,,"+count+"): token="+s);
118            arr = collect(tok, arr, count + 1);
119            arr[count] = s;
120        } else {
121            arr = new String[count];
122        }
123        return arr;
124    }
125
126    private void check() {
127        final boolean emptyOk = this.emptyTokensEnabled;
128        checked = true;
129        if (endReached) {
130            hasToken = false;
131            return;
132        }
133        Matcher m = matcher;
134        boolean hasMatch = false;
135        while (m.find()) {
136            if (m.start() > 0) {
137                hasMatch = true;
138                break;
139            } else if (m.end() > 0) {
140                if (emptyOk) {
141                    hasMatch = true;
142                    break;
143                } else m.setTarget(m, MatchResult.SUFFIX);
144            }
145        }
146        if (!hasMatch) {
147            endReached = true;
148            if (m.length(MatchResult.TARGET) == 0 && !emptyOk) {
149                hasToken = false;
150            } else {
151                hasToken = true;
152                token = m.target();
153            }
154            return;
155        }
156
157        hasToken = true;
158        token = m.prefix();
159        m.setTarget(m, MatchResult.SUFFIX);
160        //m.setTarget(m.suffix());
161    }
162
163    /**
164     * Removes from the underlying collection the last element returned
165     * by this iterator (optional operation).  This method can be called
166     * only once per call to {@link #next}.  The behavior of an iterator
167     * is unspecified if the underlying collection is modified while the
168     * iteration is in progress in any way other than by calling this
169     * method.
170     *
171     * @throws UnsupportedOperationException if the {@code remove}
172     *                                       operation is not supported by this iterator
173     * @throws IllegalStateException         if the {@code next} method has not
174     *                                       yet been called, or the {@code remove} method has already
175     *                                       been called after the last call to the {@code next}
176     *                                       method
177     */
178    @Override
179    public void remove() {
180        throw new UnsupportedOperationException("remove() not supported on RETokenizer");
181    }
182
183    @Override
184    public boolean hasNext() {
185        return hasMore();
186    }
187
188    /**
189     * @return a next token as a String
190     */
191    @Override
192    public String next() {
193        return nextToken();
194    }
195
196    @Override
197    public boolean equals(Object o) {
198        if (this == o) return true;
199        if (o == null || getClass() != o.getClass()) return false;
200
201        RETokenizer that = (RETokenizer) o;
202
203        if (checked != that.checked) return false;
204        if (hasToken != that.hasToken) return false;
205        if (pos != that.pos) return false;
206        if (endReached != that.endReached) return false;
207        if (emptyTokensEnabled != that.emptyTokensEnabled) return false;
208        if (matcher != null ? !matcher.equals(that.matcher) : that.matcher != null) return false;
209        return token != null ? token.equals(that.token) : that.token == null;
210
211    }
212
213    @Override
214    public int hashCode() {
215        int result = matcher != null ? matcher.hashCode() : 0;
216        result = 31 * result + (checked ? 1 : 0);
217        result = 31 * result + (hasToken ? 1 : 0);
218        result = 31 * result + (token != null ? token.hashCode() : 0);
219        result = 31 * result + pos;
220        result = 31 * result + (endReached ? 1 : 0);
221        result = 31 * result + (emptyTokensEnabled ? 1 : 0);
222        return result;
223    }
224
225    @Override
226    public String toString() {
227        return "RETokenizer{" +
228                "matcher=" + matcher +
229                ", checked=" + checked +
230                ", hasToken=" + hasToken +
231                ", token='" + token + '\'' +
232                ", pos=" + pos +
233                ", endReached=" + endReached +
234                ", emptyTokensEnabled=" + emptyTokensEnabled +
235                '}';
236    }
237}