001/** 002 * Copyright (c) 2001, Sergey A. Samokhodkin 003 * All rights reserved. 004 * <p> 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * <p> 008 * - Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * - Redistributions in binary form 011 * must reproduce the above copyright notice, this list of conditions and the following 012 * disclaimer in the documentation and/or other materials provided with the distribution. 013 * - Neither the name of jregex nor the names of its contributors may be used 014 * to endorse or promote products derived from this software without specific prior 015 * written permission. 016 * <p> 017 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY 018 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 019 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 020 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 021 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 022 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 023 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 024 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 025 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 026 * 027 * @version 1.2_01 028 */ 029 030package regexodus; 031 032import java.io.IOException; 033import java.io.Reader; 034import java.util.Iterator; 035import java.util.NoSuchElementException; 036 037/** 038 * The Tokenizer class suggests a methods to break a text into tokens using 039 * occurrences of a pattern as delimiters. 040 * There are two ways to obtain a text tokenizer for some pattern:<pre> 041 * Pattern p=new Pattern("\\s+"); //any number of space characters 042 * String text="blah blah blah"; 043 * //by factory method 044 * RETokenizer tok1=p.tokenizer(text); 045 * //or by constructor 046 * RETokenizer tok2=new RETokenizer(p,text); 047 * </pre> 048 * Now the one way is to use the tokenizer as a token enumeration/iterator:<pre> 049 * while(tok1.hasMore()) System.out.println(tok1.nextToken()); 050 * </pre> 051 * and another way is to split it into a String array: 052 * {@code 053 * String[] arr=tok2.split(); 054 * for(int i=0;i<tok2.length;i++) System.out.println(arr[i]);} 055 * 056 * @see Pattern#tokenizer(java.lang.String) 057 */ 058 059public class RETokenizer implements Iterator<String> { 060 private Matcher matcher; 061 private boolean checked; 062 private boolean hasToken; 063 private String token; 064 private int pos = 0; 065 private boolean endReached = false; 066 private boolean emptyTokensEnabled = false; 067 068 public RETokenizer(Pattern pattern, String text) { 069 this(pattern.matcher(text), false); 070 } 071 072 public RETokenizer(Pattern pattern, char[] chars, int off, int len) { 073 this(pattern.matcher(chars, off, len), false); 074 } 075 076 @GwtIncompatible 077 public RETokenizer(Pattern pattern, Reader r, int len) throws IOException { 078 this(pattern.matcher(r, len), false); 079 } 080 081 private RETokenizer(Matcher m, boolean emptyEnabled) { 082 matcher = m; 083 emptyTokensEnabled = emptyEnabled; 084 } 085 086 public void setEmptyEnabled(boolean b) { 087 emptyTokensEnabled = b; 088 } 089 090 public boolean isEmptyEnabled() { 091 return emptyTokensEnabled; 092 } 093 094 private boolean hasMore() { 095 if (!checked) check(); 096 return hasToken; 097 } 098 099 private String nextToken() { 100 if (!checked) check(); 101 if (!hasToken) throw new NoSuchElementException(); 102 checked = false; 103 return token; 104 } 105 106 public String[] split() { 107 return collect(this, null, 0); 108 } 109 110 public void reset() { 111 matcher.setPosition(0); 112 } 113 114 private static String[] collect(RETokenizer tok, String[] arr, int count) { 115 if (tok.hasMore()) { 116 String s = tok.nextToken(); 117//System.out.println("collect(,,"+count+"): token="+s); 118 arr = collect(tok, arr, count + 1); 119 arr[count] = s; 120 } else { 121 arr = new String[count]; 122 } 123 return arr; 124 } 125 126 private void check() { 127 final boolean emptyOk = this.emptyTokensEnabled; 128 checked = true; 129 if (endReached) { 130 hasToken = false; 131 return; 132 } 133 Matcher m = matcher; 134 boolean hasMatch = false; 135 while (m.find()) { 136 if (m.start() > 0) { 137 hasMatch = true; 138 break; 139 } else if (m.end() > 0) { 140 if (emptyOk) { 141 hasMatch = true; 142 break; 143 } else m.setTarget(m, MatchResult.SUFFIX); 144 } 145 } 146 if (!hasMatch) { 147 endReached = true; 148 if (m.length(MatchResult.TARGET) == 0 && !emptyOk) { 149 hasToken = false; 150 } else { 151 hasToken = true; 152 token = m.target(); 153 } 154 return; 155 } 156 157 hasToken = true; 158 token = m.prefix(); 159 m.setTarget(m, MatchResult.SUFFIX); 160 //m.setTarget(m.suffix()); 161 } 162 163 /** 164 * Removes from the underlying collection the last element returned 165 * by this iterator (optional operation). This method can be called 166 * only once per call to {@link #next}. The behavior of an iterator 167 * is unspecified if the underlying collection is modified while the 168 * iteration is in progress in any way other than by calling this 169 * method. 170 * 171 * @throws UnsupportedOperationException if the {@code remove} 172 * operation is not supported by this iterator 173 * @throws IllegalStateException if the {@code next} method has not 174 * yet been called, or the {@code remove} method has already 175 * been called after the last call to the {@code next} 176 * method 177 */ 178 @Override 179 public void remove() { 180 throw new UnsupportedOperationException("remove() not supported on RETokenizer"); 181 } 182 183 @Override 184 public boolean hasNext() { 185 return hasMore(); 186 } 187 188 /** 189 * @return a next token as a String 190 */ 191 @Override 192 public String next() { 193 return nextToken(); 194 } 195 196 @Override 197 public boolean equals(Object o) { 198 if (this == o) return true; 199 if (o == null || getClass() != o.getClass()) return false; 200 201 RETokenizer that = (RETokenizer) o; 202 203 if (checked != that.checked) return false; 204 if (hasToken != that.hasToken) return false; 205 if (pos != that.pos) return false; 206 if (endReached != that.endReached) return false; 207 if (emptyTokensEnabled != that.emptyTokensEnabled) return false; 208 if (matcher != null ? !matcher.equals(that.matcher) : that.matcher != null) return false; 209 return token != null ? token.equals(that.token) : that.token == null; 210 211 } 212 213 @Override 214 public int hashCode() { 215 int result = matcher != null ? matcher.hashCode() : 0; 216 result = 31 * result + (checked ? 1 : 0); 217 result = 31 * result + (hasToken ? 1 : 0); 218 result = 31 * result + (token != null ? token.hashCode() : 0); 219 result = 31 * result + pos; 220 result = 31 * result + (endReached ? 1 : 0); 221 result = 31 * result + (emptyTokensEnabled ? 1 : 0); 222 return result; 223 } 224 225 @Override 226 public String toString() { 227 return "RETokenizer{" + 228 "matcher=" + matcher + 229 ", checked=" + checked + 230 ", hasToken=" + hasToken + 231 ", token='" + token + '\'' + 232 ", pos=" + pos + 233 ", endReached=" + endReached + 234 ", emptyTokensEnabled=" + emptyTokensEnabled + 235 '}'; 236 } 237}