001/** 002 * Copyright (c) 2001, Sergey A. Samokhodkin 003 * All rights reserved. 004 * <p> 005 * Redistribution and use in source and binary forms, with or without modification, 006 * are permitted provided that the following conditions are met: 007 * <p> 008 * - Redistributions of source code must retain the above copyright notice, 009 * this list of conditions and the following disclaimer. 010 * - Redistributions in binary form 011 * must reproduce the above copyright notice, this list of conditions and the following 012 * disclaimer in the documentation and/or other materials provided with the distribution. 013 * - Neither the name of jregex nor the names of its contributors may be used 014 * to endorse or promote products derived from this software without specific prior 015 * written permission. 016 * <p> 017 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY 018 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES 019 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 020 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, 021 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 022 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; 023 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 024 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY 025 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 026 * 027 * @version 1.2_01 028 */ 029 030package regexodus; 031 032import java.io.IOException; 033import java.io.Reader; 034import java.io.Serializable; 035import java.util.HashMap; 036 037/** 038 * A handle for a precompiled regular expression; core operations should be identical to java.util.regex.Pattern . 039 * Pattern should be no different. 040 * <br> 041 * To match a regular expression <code>myExpr</code> against a text <code>myString</code> one should first 042 * create a Pattern object:<pre> 043 * Pattern p = new Pattern(myExpr); 044 * </pre> 045 * or <pre> 046 * Pattern p = Pattern.compile(myExpr); 047 * </pre> 048 * then obtain a Matcher object:<pre> 049 * Matcher matcher=p.matcher(myText); 050 * </pre> 051 * The latter is an automaton that actually performs a search. It provides the following methods: 052 * <ul> 053 * <li> search for matching substrings : matcher.find() or matcher.findAll();</li> 054 * <li> test whether the text matches the whole pattern : matcher.matches();</li> 055 * <li> test whether the text matches the beginning of the pattern : matcher.matchesPrefix();</li> 056 * <li> search with custom options : matcher.find(int options)</li> 057 * </ul> 058 * <p> 059 * <b>Flags</b> 060 * <br> 061 * Flags (see REFlags interface) change the meaning of some regular expression elements at compiletime. 062 * These flags may be passed both as string(see Pattern(String,String)) and as bitwise OR of: 063 * <ul> 064 * <li><b>REFlags.IGNORE_CASE</b> - enables case insensitivity</li> 065 * <li><b>REFlags.MULTILINE</b> - forces "^" and "$" to match both at the start and the end of line;</li> 066 * <li><b>REFlags.DOTALL</b> - forces "." to match eols('\r' and '\n' in ASCII);</li> 067 * <li><b>REFlags.IGNORE_SPACES</b> - literal spaces in expression are ignored for better readability;</li> 068 * <li><b>REFlags.UNICODE</b> - the predefined classes('\w','\d',etc) are referenced to Unicode;</li> 069 * <li><b>REFlags.XML_SCHEMA</b> - permits XML Schema regular expressions syntax extensions.</li> 070 * </ul> 071 * <p> 072 * <b>Multithreading</b><br> 073 * Pattern instances are not thread-safe, and neither are Matcher objects. 074 * 075 * @see REFlags 076 * @see Matcher 077 * @see Matcher#setTarget(java.lang.CharSequence) 078 * @see Matcher#setTarget(java.lang.CharSequence, int, int) 079 * @see Matcher#setTarget(char[], int, int) 080 * @see Matcher#setTarget(java.io.Reader, int) 081 * @see MatchResult 082 * @see MatchResult#group(int) 083 * @see MatchResult#start(int) 084 * @see MatchResult#end(int) 085 * @see MatchResult#length(int) 086 * @see MatchResult#charAt(int, int) 087 * @see MatchResult#prefix() 088 * @see MatchResult#suffix() 089 */ 090 091public class Pattern implements Serializable, REFlags { 092 private static final long serialVersionUID = -3628346657932720807L; 093 094 String stringRepr; 095 096 // tree entry 097 Term root, root0; 098 099 // required number of memory slots 100 int memregs; 101 102 // required number of iteration counters 103 int counters; 104 105 // number of lookahead groups 106 int lookaheads; 107 108 HashMap<String, Integer> namedGroupMap; 109 110 boolean caseless = false; 111 112 protected Pattern() throws PatternSyntaxException { 113 } 114 115 /** 116 * Compiles an expression with default flags. 117 * 118 * @param regex the Perl5-compatible regular expression string. 119 * @throws PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax. 120 * @see Pattern#Pattern(java.lang.String, java.lang.String) 121 * @see Pattern#Pattern(java.lang.String, int) 122 */ 123 public Pattern(String regex) throws PatternSyntaxException { 124 this(regex, DEFAULT); 125 } 126 127 /** 128 * Compiles a regular expression using Perl5-style flags. 129 * The flag string should consist of letters 'i','m','s','x','u','X'(the case is significant) and a hyphen or plus. 130 * The meaning of letters: 131 * <ul> 132 * <li><b>i</b> - case insensitivity, corresponds to REFlags.IGNORE_CASE;</li> 133 * <li><b>m</b> - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to REFlags.MULTILINE flag;</li> 134 * <li><b>s</b> - single line treatment('.' matches \r's and \n's),corresponds to REFlags.DOTALL;</li> 135 * <li><b>x</b> - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to REFlags.IGNORE_SPACES.</li> 136 * <li><b>u</b> - predefined classes are regarded as belonging to Unicode, corresponds to REFlags.UNICODE; this may yield some performance penalty.</li> 137 * <li><b>X</b> - compatibility with XML Schema, corresponds to REFlags.XML_SCHEMA.</li> 138 * <li><b>-</b> - turn off the specified flags; normally has no effect unless something adds the flags.</li> 139 * <li><b>+</b> - turn on the specified flags; normally is no different from just using the letters.</li> 140 * </ul> 141 * 142 * @param regex the Perl5-compatible regular expression string. 143 * @param flags the Perl5-compatible flags. 144 * @throws PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax. 145 * see REFlags 146 */ 147 public Pattern(String regex, String flags) throws PatternSyntaxException { 148 internalCompile(regex, parseFlags(flags)); 149 } 150 151 /** 152 * Compiles a regular expression using REFlags. 153 * The <code>flags</code> parameter is a bitwise OR of the following values: 154 * <ul> 155 * <li><b>REFlags.IGNORE_CASE</b> - case insensitivity, corresponds to '<b>i</b>' letter;</li> 156 * <li><b>REFlags.MULTILINE</b> - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to '<b>m</b>';</li> 157 * <li><b>REFlags.DOTALL</b> - single line treatment('.' matches \r's and \n's),corresponds to '<b>s</b>';</li> 158 * <li><b>REFlags.IGNORE_SPACES</b> - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to '<b>x</b>'.</li> 159 * <li><b>REFlags.UNICODE</b> - predefined classes are regarded as belonging to Unicode, corresponds to '<b>u</b>'; this may yield some performance penalty.</li> 160 * <li><b>REFlags.XML_SCHEMA</b> - compatibility with XML Schema, corresponds to '<b>X</b>'.</li> 161 * </ul> 162 * 163 * @param regex the Perl5-compatible regular expression string. 164 * @param flags the Perl5-compatible flags. 165 * @throws PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax. 166 * see REFlags 167 */ 168 private Pattern(String regex, int flags) throws PatternSyntaxException { 169 internalCompile(regex, flags); 170 } 171 172 173 //java.util.regex.* compatibility 174 175 /** 176 * Compiles the given String into a Pattern that can be used to match text. 177 * The syntax is normal for Java, including backslashes as part of regex syntax, like the digit shorthand "\d", 178 * escaped twice to "\\d" (so the double-quoted String itself doesn't try to interpret the backslash). 179 * @param regex a String in normal Java regular expression format 180 * @return a newly constructed Pattern object that can be used to match text that fits the given regular expression 181 * @throws PatternSyntaxException 182 */ 183 public static Pattern compile(String regex) throws PatternSyntaxException{ 184 return new Pattern(regex, DEFAULT); 185 } 186 //java.util.regex.* compatibility 187 188 /** 189 * Compiles the given String into a Pattern that can be used to match text. 190 * The syntax is normal for Java, including backslashes as part of regex syntax, like the digit shorthand "\d", 191 * escaped twice to "\\d" (so the double-quoted String itself doesn't try to interpret the backslash). 192 * <br> 193 * This variant allows flags to be passed as an int constructed via bitwise OR from REFlags constants. You may prefer 194 * the variant that takes a String for clarity. 195 * @param regex a String in normal Java regular expression format 196 * @param flags integer flags that are constructed via bitwise OR from the flag constants in REFlags. 197 * @return a newly constructed Pattern object that can be used to match text that fits the given regular expression 198 * @throws PatternSyntaxException 199 */ 200 public static Pattern compile(String regex,int flags) throws PatternSyntaxException{ 201 return new Pattern(regex, flags); 202 } 203 //java.util.regex.* compatibility 204 /** 205 * Compiles the given String into a Pattern that can be used to match text. 206 * The syntax is normal for Java, including backslashes as part of regex syntax, like the digit shorthand "\d", 207 * escaped twice to "\\d" (so the double-quoted String itself doesn't try to interpret the backslash). 208 * <br> 209 * This variant allows flags to be passed as an String. 210 * The flag string should consist of letters 'i','m','s','x','u','X'(the case is significant) and a hyphen or plus. 211 * The meaning of letters: 212 * <ul> 213 * <li><b>i</b> - case insensitivity, corresponds to REFlags.IGNORE_CASE;</li> 214 * <li><b>m</b> - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to REFlags.MULTILINE flag;</li> 215 * <li><b>s</b> - single line treatment('.' matches \r's and \n's),corresponds to REFlags.DOTALL;</li> 216 * <li><b>x</b> - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to REFlags.IGNORE_SPACES.</li> 217 * <li><b>u</b> - predefined classes are regarded as belonging to Unicode, corresponds to REFlags.UNICODE; this may yield some performance penalty.</li> 218 * <li><b>X</b> - compatibility with XML Schema, corresponds to REFlags.XML_SCHEMA.</li> 219 * <li><b>-</b> - turn off the specified flags; normally has no effect unless something adds the flags.</li> 220 * <li><b>+</b> - turn on the specified flags; normally is no different from just using the letters.</li> 221 * </ul> 222 * 223 * @param regex a String in normal Java regular expression format 224 * @param flags integer flags that are constructed via bitwise OR from the flag constants in REFlags. 225 * @return a newly constructed Pattern object that can be used to match text that fits the given regular expression 226 * @throws PatternSyntaxException 227 */ 228 public static Pattern compile(String regex,String flags) throws PatternSyntaxException{ 229 return new Pattern(regex, flags); 230 } 231 232 233 private void internalCompile(String regex, int flags) throws PatternSyntaxException { 234 stringRepr = regex; 235 caseless = (flags & IGNORE_CASE) == IGNORE_CASE; 236 Term.makeTree(regex, flags, this); 237 } 238 239 /** 240 * How many capturing groups does this expression include? 241 */ 242 public int groupCount() { 243 return memregs; 244 } 245 246 /** 247 * Get numeric id for a group name. 248 * 249 * @return <code>null</code> if no such name found. 250 * @see MatchResult#group(java.lang.String) 251 * @see MatchResult#isCaptured(java.lang.String) 252 */ 253 public Integer groupId(String name) { 254 return (namedGroupMap.get(name)); 255 } 256 257 /** 258 * A shorthand for Pattern.matcher(String).matches().<br> 259 * 260 * @param s the target 261 * @return true if the entire target matches the pattern 262 * @see Matcher#matches() 263 * @see Matcher#matches(String) 264 */ 265 public boolean matches(String s) { 266 return matcher(s).matches(); 267 } 268 269 /** 270 * A shorthand for Pattern.matcher(String).matchesPrefix().<br> 271 * 272 * @param s the target 273 * @return true if the entire target matches the beginning of the pattern 274 * @see Matcher#matchesPrefix() 275 */ 276 public boolean startsWith(String s) { 277 return matcher(s).matchesPrefix(); 278 } 279 280 /** 281 * Returns a target-less matcher. 282 * Don't forget to supply a target. 283 */ 284 public Matcher matcher() { 285 return new Matcher(this); 286 } 287 288 /** 289 * Returns a matcher for a specified string. 290 */ 291 public Matcher matcher(CharSequence s) { 292 Matcher m = new Matcher(this); 293 m.setTarget(s); 294 return m; 295 } 296 297 /** 298 * Returns a matcher for a specified region. 299 */ 300 public Matcher matcher(char[] data, int start, int end) { 301 Matcher m = new Matcher(this); 302 m.setTarget(data, start, end); 303 return m; 304 } 305 306 /** 307 * Returns a matcher for a match result (in a performance-friendly way). 308 * <code>groupId</code> parameter specifies which group is a target. 309 * 310 * @param groupId which group is a target; either positive integer(group id), or one of MatchResult.MATCH,MatchResult.PREFIX,MatchResult.SUFFIX,MatchResult.TARGET. 311 */ 312 public Matcher matcher(MatchResult res, int groupId) { 313 Matcher m = new Matcher(this); 314 if (res instanceof Matcher) { 315 m.setTarget((Matcher) res, groupId); 316 } else { 317 m.setTarget(res.targetChars(), res.start(groupId) + res.targetStart(), res.length(groupId)); 318 } 319 return m; 320 } 321 322 /** 323 * Just as above, yet with symbolic group name. 324 * 325 * @throws NullPointerException if there is no group with such name 326 */ 327 public Matcher matcher(MatchResult res, String groupName) { 328 Integer id = res.pattern().groupId(groupName); 329 if (id == null) throw new IllegalArgumentException("group not found:" + groupName); 330 int group = id; 331 return matcher(res, group); 332 } 333 334 /** 335 * Returns a matcher taking a text stream as target. 336 * <b>Note that this is not a true POSIX-style stream matching</b>, i.e. the whole length of the text is preliminary read and stored in a char array. 337 * 338 * @param text a text stream 339 * @param length the length to read from a stream; if <code>len</code> is <code>-1</code>, the whole stream is read in. 340 * @throws IOException indicates an IO problem 341 */ 342 @GwtIncompatible 343 public Matcher matcher(Reader text, int length) throws IOException { 344 Matcher m = new Matcher(this); 345 m.setTarget(text, length); 346 return m; 347 } 348 349 /** 350 * Returns a replacer of a pattern by specified perl-like expression. 351 * Such replacer will substitute all occurrences of a pattern by an evaluated expression 352 * ("$&" and "$0" will substitute by the whole match, "$1" will substitute by group#1, etc). 353 * Example:<pre> 354 * String text="The quick brown fox jumped over the lazy dog"; 355 * Pattern word=new Pattern("\\w+"); 356 * System.out.println(word.replacer("[$&]").replace(text)); 357 * //prints "[The] [quick] [brown] [fox] [jumped] [over] [the] [lazy] [dog]" 358 * Pattern swap=new Pattern("(fox|dog)(.*?)(fox|dog)"); 359 * System.out.println(swap.replacer("$3$2$1").replace(text)); 360 * //prints "The quick brown dog jumped over the lazy fox" 361 * Pattern scramble=new Pattern("(\\w+)(.*?)(\\w+)"); 362 * System.out.println(scramble.replacer("$3$2$1").replace(text)); 363 * //prints "quick The fox brown over jumped lazy the dog" 364 * </pre> 365 * 366 * @param expr a perl-like expression, the "$&" and "${&}" standing for whole match, the "$N" and "${N}" standing for group#N, and "${Foo}" standing for named group Foo. 367 * @see Replacer 368 */ 369 public Replacer replacer(String expr) { 370 return new Replacer(this, expr); 371 } 372 373 /** 374 * Returns a replacer will substitute all occurrences of a pattern 375 * through applying a user-defined substitution model. 376 * 377 * @param model a Substitution object which is in charge for match substitution 378 * @see Replacer 379 */ 380 public Replacer replacer(Substitution model) { 381 return new Replacer(this, model); 382 } 383 384 /** 385 * Tokenizes a text by an occurrences of the pattern. 386 * Note that a series of adjacent matches are regarded as a single separator. 387 * The same as new RETokenizer(Pattern,String); 388 * 389 * @see RETokenizer 390 * @see RETokenizer#RETokenizer(regexodus.Pattern, java.lang.String) 391 */ 392 public RETokenizer tokenizer(String text) { 393 return new RETokenizer(this, text); 394 } 395 396 /** 397 * Tokenizes a specified region by an occurrences of the pattern. 398 * Note that a series of adjacent matches are regarded as a single separator. 399 * The same as new RETokenizer(Pattern,char[],int,int); 400 * 401 * @see RETokenizer 402 * @see RETokenizer#RETokenizer(regexodus.Pattern, char[], int, int) 403 */ 404 public RETokenizer tokenizer(char[] data, int off, int len) { 405 return new RETokenizer(this, data, off, len); 406 } 407 408 /** 409 * Tokenizes a specified region by an occurrences of the pattern. 410 * Note that a series of adjacent matches are regarded as a single separator. 411 * The same as new RETokenizer(Pattern,Reader,int); 412 * 413 * @see RETokenizer 414 * @see RETokenizer#RETokenizer(regexodus.Pattern, java.io.Reader, int) 415 */ 416 @GwtIncompatible 417 public RETokenizer tokenizer(Reader in, int length) throws IOException { 418 return new RETokenizer(this, in, length); 419 } 420 421 public String toString() { 422 return stringRepr; 423 } 424 425 /** 426 * Returns a less or more readable representation of a bytecode for the pattern. 427 */ 428 public String toString_d() { 429 return root.toStringAll(); 430 } 431 432 private static int parseFlags(String flags) throws PatternSyntaxException { 433 boolean enable = true; 434 int len = flags.length(); 435 int result = DEFAULT; 436 for (int i = 0; i < len; i++) { 437 char c = flags.charAt(i); 438 switch (c) { 439 case '+': 440 enable = true; 441 break; 442 case '-': 443 enable = false; 444 break; 445 default: 446 int flag = getFlag(c); 447 if (enable) result |= flag; 448 else result &= (~flag); 449 } 450 } 451 return result; 452 } 453 454 static int parseFlags(char[] data, int start, int len) throws PatternSyntaxException { 455 boolean enable = true; 456 int result = DEFAULT; 457 for (int i = 0; i < len; i++) { 458 char c = data[start + i]; 459 switch (c) { 460 case '+': 461 enable = true; 462 break; 463 case '-': 464 enable = false; 465 break; 466 default: 467 int flag = getFlag(c); 468 if (enable) result |= flag; 469 else result &= (~flag); 470 } 471 } 472 return result; 473 } 474 475 private static int getFlag(char c) throws PatternSyntaxException { 476 switch (c) { 477 case 'i': 478 return IGNORE_CASE; 479 case 'm': 480 return MULTILINE; 481 case 's': 482 return DOTALL; 483 case 'x': 484 return IGNORE_SPACES; 485 case 'u': 486 return UNICODE; 487 case 'X': 488 return XML_SCHEMA; 489 } 490 throw new PatternSyntaxException("unknown flag: " + c); 491 } 492 493 @Override 494 public boolean equals(Object o) { 495 if (this == o) return true; 496 if (o == null || getClass() != o.getClass()) return false; 497 498 Pattern pattern = (Pattern) o; 499 500 if (memregs != pattern.memregs) return false; 501 if (counters != pattern.counters) return false; 502 if (lookaheads != pattern.lookaheads) return false; 503 if (stringRepr != null ? !stringRepr.equals(pattern.stringRepr) : pattern.stringRepr != null) return false; 504 return root != null ? root.equals(pattern.root) : pattern.root == null && (root0 != null ? root0.equals(pattern.root0) : pattern.root0 == null && (namedGroupMap != null ? namedGroupMap.equals(pattern.namedGroupMap) : pattern.namedGroupMap == null)); 505 506 } 507 508 @Override 509 public int hashCode() { 510 int result = stringRepr != null ? stringRepr.hashCode() : 0; 511 result = 31 * result + (root != null ? root.hashCode() : 0); 512 result = 31 * result + (root0 != null ? root0.hashCode() : 0); 513 result = 31 * result + memregs; 514 result = 31 * result + counters; 515 result = 31 * result + lookaheads; 516 result = 31 * result + (namedGroupMap != null ? namedGroupMap.hashCode() : 0); 517 return result; 518 } 519}