001/**
002 * Copyright (c) 2001, Sergey A. Samokhodkin
003 * All rights reserved.
004 * <p>
005 * Redistribution and use in source and binary forms, with or without modification,
006 * are permitted provided that the following conditions are met:
007 * <p>
008 * - Redistributions of source code must retain the above copyright notice,
009 * this list of conditions and the following disclaimer.
010 * - Redistributions in binary form
011 * must reproduce the above copyright notice, this list of conditions and the following
012 * disclaimer in the documentation and/or other materials provided with the distribution.
013 * - Neither the name of jregex nor the names of its contributors may be used
014 * to endorse or promote products derived from this software without specific prior
015 * written permission.
016 * <p>
017 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY
018 * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
019 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
020 * IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
021 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
022 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
023 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
024 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
025 * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
026 *
027 * @version 1.2_01
028 */
029
030package regexodus;
031
032import java.io.IOException;
033import java.io.Reader;
034import java.io.Serializable;
035import java.util.HashMap;
036
037/**
038 * A handle for a precompiled regular expression; core operations should be identical to java.util.regex.Pattern .
039 * Pattern should be no different.
040 * <br>
041 * To match a regular expression <code>myExpr</code> against a text <code>myString</code> one should first
042 * create a Pattern object:<pre>
043 * Pattern p = new Pattern(myExpr);
044 * </pre>
045 * or <pre>
046 * Pattern p = Pattern.compile(myExpr);
047 * </pre>
048 * then obtain a Matcher object:<pre>
049 * Matcher matcher=p.matcher(myText);
050 * </pre>
051 * The latter is an automaton that actually performs a search. It provides the following methods:
052 * <ul>
053 * <li> search for matching substrings : matcher.find() or matcher.findAll();</li>
054 * <li> test whether the text matches the whole pattern : matcher.matches();</li>
055 * <li> test whether the text matches the beginning of the pattern : matcher.matchesPrefix();</li>
056 * <li> search with custom options : matcher.find(int options)</li>
057 * </ul>
058 * <p>
059 * <b>Flags</b>
060 * <br>
061 * Flags (see REFlags interface) change the meaning of some regular expression elements at compiletime.
062 * These flags may be passed both as string(see Pattern(String,String)) and as bitwise OR of:
063 * <ul>
064 * <li><b>REFlags.IGNORE_CASE</b> - enables case insensitivity</li>
065 * <li><b>REFlags.MULTILINE</b> - forces "^" and "$" to match both at the start and the end of line;</li>
066 * <li><b>REFlags.DOTALL</b> - forces "." to match eols('\r' and '\n' in ASCII);</li>
067 * <li><b>REFlags.IGNORE_SPACES</b> - literal spaces in expression are ignored for better readability;</li>
068 * <li><b>REFlags.UNICODE</b> - the predefined classes('\w','\d',etc) are referenced to Unicode;</li>
069 * <li><b>REFlags.XML_SCHEMA</b> - permits XML Schema regular expressions syntax extensions.</li>
070 * </ul>
071 * <p>
072 * <b>Multithreading</b><br>
073 * Pattern instances are not thread-safe, and neither are Matcher objects.
074 *
075 * @see REFlags
076 * @see Matcher
077 * @see Matcher#setTarget(java.lang.CharSequence)
078 * @see Matcher#setTarget(java.lang.CharSequence, int, int)
079 * @see Matcher#setTarget(char[], int, int)
080 * @see Matcher#setTarget(java.io.Reader, int)
081 * @see MatchResult
082 * @see MatchResult#group(int)
083 * @see MatchResult#start(int)
084 * @see MatchResult#end(int)
085 * @see MatchResult#length(int)
086 * @see MatchResult#charAt(int, int)
087 * @see MatchResult#prefix()
088 * @see MatchResult#suffix()
089 */
090
091public class Pattern implements Serializable, REFlags {
092    private static final long serialVersionUID = -3628346657932720807L;
093
094    String stringRepr;
095
096    // tree entry
097    Term root, root0;
098
099    // required number of memory slots
100    int memregs;
101
102    // required number of iteration counters
103    int counters;
104
105    // number of lookahead groups
106    int lookaheads;
107
108    HashMap<String, Integer> namedGroupMap;
109
110    boolean caseless = false;
111
112    protected Pattern() throws PatternSyntaxException {
113    }
114
115    /**
116     * Compiles an expression with default flags.
117     *
118     * @param regex the Perl5-compatible regular expression string.
119     * @throws PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax.
120     * @see Pattern#Pattern(java.lang.String, java.lang.String)
121     * @see Pattern#Pattern(java.lang.String, int)
122     */
123    public Pattern(String regex) throws PatternSyntaxException {
124        this(regex, DEFAULT);
125    }
126
127    /**
128     * Compiles a regular expression using Perl5-style flags.
129     * The flag string should consist of letters 'i','m','s','x','u','X'(the case is significant) and a hyphen or plus.
130     * The meaning of letters:
131     * <ul>
132     * <li><b>i</b> - case insensitivity, corresponds to REFlags.IGNORE_CASE;</li>
133     * <li><b>m</b> - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to REFlags.MULTILINE flag;</li>
134     * <li><b>s</b> - single line treatment('.' matches \r's and \n's),corresponds to REFlags.DOTALL;</li>
135     * <li><b>x</b> - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to REFlags.IGNORE_SPACES.</li>
136     * <li><b>u</b> - predefined classes are regarded as belonging to Unicode, corresponds to REFlags.UNICODE; this may yield some performance penalty.</li>
137     * <li><b>X</b> - compatibility with XML Schema, corresponds to REFlags.XML_SCHEMA.</li>
138     * <li><b>-</b> - turn off the specified flags; normally has no effect unless something adds the flags.</li>
139     * <li><b>+</b> - turn on the specified flags; normally is no different from just using the letters.</li>
140     * </ul>
141     *
142     * @param regex the Perl5-compatible regular expression string.
143     * @param flags the Perl5-compatible flags.
144     * @throws PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax.
145     *                                see REFlags
146     */
147    public Pattern(String regex, String flags) throws PatternSyntaxException {
148        internalCompile(regex, parseFlags(flags));
149    }
150
151    /**
152     * Compiles a regular expression using REFlags.
153     * The <code>flags</code> parameter is a bitwise OR of the following values:
154     * <ul>
155     * <li><b>REFlags.IGNORE_CASE</b> - case insensitivity, corresponds to '<b>i</b>' letter;</li>
156     * <li><b>REFlags.MULTILINE</b> - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to '<b>m</b>';</li>
157     * <li><b>REFlags.DOTALL</b> - single line treatment('.' matches \r's and \n's),corresponds to '<b>s</b>';</li>
158     * <li><b>REFlags.IGNORE_SPACES</b> - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to '<b>x</b>'.</li>
159     * <li><b>REFlags.UNICODE</b> - predefined classes are regarded as belonging to Unicode, corresponds to '<b>u</b>'; this may yield some performance penalty.</li>
160     * <li><b>REFlags.XML_SCHEMA</b> - compatibility with XML Schema, corresponds to '<b>X</b>'.</li>
161     * </ul>
162     *
163     * @param regex the Perl5-compatible regular expression string.
164     * @param flags the Perl5-compatible flags.
165     * @throws PatternSyntaxException if the argument doesn't correspond to perl5 regex syntax.
166     *                                see REFlags
167     */
168    private Pattern(String regex, int flags) throws PatternSyntaxException {
169        internalCompile(regex, flags);
170    }
171
172
173    //java.util.regex.* compatibility
174
175    /**
176     * Compiles the given String into a Pattern that can be used to match text.
177     * The syntax is normal for Java, including backslashes as part of regex syntax, like the digit shorthand "\d",
178     * escaped twice to "\\d" (so the double-quoted String itself doesn't try to interpret the backslash).
179     * @param regex a String in normal Java regular expression format
180     * @return a newly constructed Pattern object that can be used to match text that fits the given regular expression
181     * @throws PatternSyntaxException
182     */
183    public static Pattern compile(String regex) throws PatternSyntaxException{
184        return new Pattern(regex, DEFAULT);
185    }
186    //java.util.regex.* compatibility
187
188    /**
189     * Compiles the given String into a Pattern that can be used to match text.
190     * The syntax is normal for Java, including backslashes as part of regex syntax, like the digit shorthand "\d",
191     * escaped twice to "\\d" (so the double-quoted String itself doesn't try to interpret the backslash).
192     * <br>
193     * This variant allows flags to be passed as an int constructed via bitwise OR from REFlags constants. You may prefer
194     * the variant that takes a String for clarity.
195     * @param regex a String in normal Java regular expression format
196     * @param flags integer flags that are constructed via bitwise OR from the flag constants in REFlags.
197     * @return a newly constructed Pattern object that can be used to match text that fits the given regular expression
198     * @throws PatternSyntaxException
199     */
200    public static Pattern compile(String regex,int flags) throws PatternSyntaxException{
201        return new Pattern(regex, flags);
202    }
203    //java.util.regex.* compatibility
204    /**
205     * Compiles the given String into a Pattern that can be used to match text.
206     * The syntax is normal for Java, including backslashes as part of regex syntax, like the digit shorthand "\d",
207     * escaped twice to "\\d" (so the double-quoted String itself doesn't try to interpret the backslash).
208     * <br>
209     * This variant allows flags to be passed as an String.
210     * The flag string should consist of letters 'i','m','s','x','u','X'(the case is significant) and a hyphen or plus.
211     * The meaning of letters:
212     * <ul>
213     * <li><b>i</b> - case insensitivity, corresponds to REFlags.IGNORE_CASE;</li>
214     * <li><b>m</b> - multiline treatment(BOLs and EOLs affect the '^' and '$'), corresponds to REFlags.MULTILINE flag;</li>
215     * <li><b>s</b> - single line treatment('.' matches \r's and \n's),corresponds to REFlags.DOTALL;</li>
216     * <li><b>x</b> - extended whitespace comments (spaces and eols in the expression are ignored), corresponds to REFlags.IGNORE_SPACES.</li>
217     * <li><b>u</b> - predefined classes are regarded as belonging to Unicode, corresponds to REFlags.UNICODE; this may yield some performance penalty.</li>
218     * <li><b>X</b> - compatibility with XML Schema, corresponds to REFlags.XML_SCHEMA.</li>
219     * <li><b>-</b> - turn off the specified flags; normally has no effect unless something adds the flags.</li>
220     * <li><b>+</b> - turn on the specified flags; normally is no different from just using the letters.</li>
221     * </ul>
222     *
223     * @param regex a String in normal Java regular expression format
224     * @param flags integer flags that are constructed via bitwise OR from the flag constants in REFlags.
225     * @return a newly constructed Pattern object that can be used to match text that fits the given regular expression
226     * @throws PatternSyntaxException
227     */
228    public static Pattern compile(String regex,String flags) throws PatternSyntaxException{
229        return new Pattern(regex, flags);
230    }
231
232
233    private void internalCompile(String regex, int flags) throws PatternSyntaxException {
234        stringRepr = regex;
235        caseless = (flags & IGNORE_CASE) == IGNORE_CASE;
236        Term.makeTree(regex, flags, this);
237    }
238
239    /**
240     * How many capturing groups does this expression include?
241     */
242    public int groupCount() {
243        return memregs;
244    }
245
246    /**
247     * Get numeric id for a group name.
248     *
249     * @return <code>null</code> if no such name found.
250     * @see MatchResult#group(java.lang.String)
251     * @see MatchResult#isCaptured(java.lang.String)
252     */
253    public Integer groupId(String name) {
254        return (namedGroupMap.get(name));
255    }
256
257    /**
258     * A shorthand for Pattern.matcher(String).matches().<br>
259     *
260     * @param s the target
261     * @return true if the entire target matches the pattern
262     * @see Matcher#matches()
263     * @see Matcher#matches(String)
264     */
265    public boolean matches(String s) {
266        return matcher(s).matches();
267    }
268
269    /**
270     * A shorthand for Pattern.matcher(String).matchesPrefix().<br>
271     *
272     * @param s the target
273     * @return true if the entire target matches the beginning of the pattern
274     * @see Matcher#matchesPrefix()
275     */
276    public boolean startsWith(String s) {
277        return matcher(s).matchesPrefix();
278    }
279
280    /**
281     * Returns a target-less matcher.
282     * Don't forget to supply a target.
283     */
284    public Matcher matcher() {
285        return new Matcher(this);
286    }
287
288    /**
289     * Returns a matcher for a specified string.
290     */
291    public Matcher matcher(CharSequence s) {
292        Matcher m = new Matcher(this);
293        m.setTarget(s);
294        return m;
295    }
296
297    /**
298     * Returns a matcher for a specified region.
299     */
300    public Matcher matcher(char[] data, int start, int end) {
301        Matcher m = new Matcher(this);
302        m.setTarget(data, start, end);
303        return m;
304    }
305
306    /**
307     * Returns a matcher for a match result (in a performance-friendly way).
308     * <code>groupId</code> parameter specifies which group is a target.
309     *
310     * @param groupId which group is a target; either positive integer(group id), or one of MatchResult.MATCH,MatchResult.PREFIX,MatchResult.SUFFIX,MatchResult.TARGET.
311     */
312    public Matcher matcher(MatchResult res, int groupId) {
313        Matcher m = new Matcher(this);
314        if (res instanceof Matcher) {
315            m.setTarget((Matcher) res, groupId);
316        } else {
317            m.setTarget(res.targetChars(), res.start(groupId) + res.targetStart(), res.length(groupId));
318        }
319        return m;
320    }
321
322    /**
323     * Just as above, yet with symbolic group name.
324     *
325     * @throws NullPointerException if there is no group with such name
326     */
327    public Matcher matcher(MatchResult res, String groupName) {
328        Integer id = res.pattern().groupId(groupName);
329        if (id == null) throw new IllegalArgumentException("group not found:" + groupName);
330        int group = id;
331        return matcher(res, group);
332    }
333
334    /**
335     * Returns a matcher taking a text stream as target.
336     * <b>Note that this is not a true POSIX-style stream matching</b>, i.e. the whole length of the text is preliminary read and stored in a char array.
337     *
338     * @param text   a text stream
339     * @param length the length to read from a stream; if <code>len</code> is <code>-1</code>, the whole stream is read in.
340     * @throws IOException indicates an IO problem
341     */
342    @GwtIncompatible
343    public Matcher matcher(Reader text, int length) throws IOException {
344        Matcher m = new Matcher(this);
345        m.setTarget(text, length);
346        return m;
347    }
348
349    /**
350     * Returns a replacer of a pattern by specified perl-like expression.
351     * Such replacer will substitute all occurrences of a pattern by an evaluated expression
352     * ("$&amp;" and "$0" will substitute by the whole match, "$1" will substitute by group#1, etc).
353     * Example:<pre>
354     * String text="The quick brown fox jumped over the lazy dog";
355     * Pattern word=new Pattern("\\w+");
356     * System.out.println(word.replacer("[$&amp;]").replace(text));
357     * //prints "[The] [quick] [brown] [fox] [jumped] [over] [the] [lazy] [dog]"
358     * Pattern swap=new Pattern("(fox|dog)(.*?)(fox|dog)");
359     * System.out.println(swap.replacer("$3$2$1").replace(text));
360     * //prints "The quick brown dog jumped over the lazy fox"
361     * Pattern scramble=new Pattern("(\\w+)(.*?)(\\w+)");
362     * System.out.println(scramble.replacer("$3$2$1").replace(text));
363     * //prints "quick The fox brown over jumped lazy the dog"
364     * </pre>
365     *
366     * @param expr a perl-like expression, the "$&amp;" and "${&amp;}" standing for whole match, the "$N" and "${N}" standing for group#N, and "${Foo}" standing for named group Foo.
367     * @see Replacer
368     */
369    public Replacer replacer(String expr) {
370        return new Replacer(this, expr);
371    }
372
373    /**
374     * Returns a replacer will substitute all occurrences of a pattern
375     * through applying a user-defined substitution model.
376     *
377     * @param model a Substitution object which is in charge for match substitution
378     * @see Replacer
379     */
380    public Replacer replacer(Substitution model) {
381        return new Replacer(this, model);
382    }
383
384    /**
385     * Tokenizes a text by an occurrences of the pattern.
386     * Note that a series of adjacent matches are regarded as a single separator.
387     * The same as new RETokenizer(Pattern,String);
388     *
389     * @see RETokenizer
390     * @see RETokenizer#RETokenizer(regexodus.Pattern, java.lang.String)
391     */
392    public RETokenizer tokenizer(String text) {
393        return new RETokenizer(this, text);
394    }
395
396    /**
397     * Tokenizes a specified region by an occurrences of the pattern.
398     * Note that a series of adjacent matches are regarded as a single separator.
399     * The same as new RETokenizer(Pattern,char[],int,int);
400     *
401     * @see RETokenizer
402     * @see RETokenizer#RETokenizer(regexodus.Pattern, char[], int, int)
403     */
404    public RETokenizer tokenizer(char[] data, int off, int len) {
405        return new RETokenizer(this, data, off, len);
406    }
407
408    /**
409     * Tokenizes a specified region by an occurrences of the pattern.
410     * Note that a series of adjacent matches are regarded as a single separator.
411     * The same as new RETokenizer(Pattern,Reader,int);
412     *
413     * @see RETokenizer
414     * @see RETokenizer#RETokenizer(regexodus.Pattern, java.io.Reader, int)
415     */
416    @GwtIncompatible
417    public RETokenizer tokenizer(Reader in, int length) throws IOException {
418        return new RETokenizer(this, in, length);
419    }
420
421    public String toString() {
422        return stringRepr;
423    }
424
425    /**
426     * Returns a less or more readable representation of a bytecode for the pattern.
427     */
428    public String toString_d() {
429        return root.toStringAll();
430    }
431
432    private static int parseFlags(String flags) throws PatternSyntaxException {
433        boolean enable = true;
434        int len = flags.length();
435        int result = DEFAULT;
436        for (int i = 0; i < len; i++) {
437            char c = flags.charAt(i);
438            switch (c) {
439                case '+':
440                    enable = true;
441                    break;
442                case '-':
443                    enable = false;
444                    break;
445                default:
446                    int flag = getFlag(c);
447                    if (enable) result |= flag;
448                    else result &= (~flag);
449            }
450        }
451        return result;
452    }
453
454    static int parseFlags(char[] data, int start, int len) throws PatternSyntaxException {
455        boolean enable = true;
456        int result = DEFAULT;
457        for (int i = 0; i < len; i++) {
458            char c = data[start + i];
459            switch (c) {
460                case '+':
461                    enable = true;
462                    break;
463                case '-':
464                    enable = false;
465                    break;
466                default:
467                    int flag = getFlag(c);
468                    if (enable) result |= flag;
469                    else result &= (~flag);
470            }
471        }
472        return result;
473    }
474
475    private static int getFlag(char c) throws PatternSyntaxException {
476        switch (c) {
477            case 'i':
478                return IGNORE_CASE;
479            case 'm':
480                return MULTILINE;
481            case 's':
482                return DOTALL;
483            case 'x':
484                return IGNORE_SPACES;
485            case 'u':
486                return UNICODE;
487            case 'X':
488                return XML_SCHEMA;
489        }
490        throw new PatternSyntaxException("unknown flag: " + c);
491    }
492
493    @Override
494    public boolean equals(Object o) {
495        if (this == o) return true;
496        if (o == null || getClass() != o.getClass()) return false;
497
498        Pattern pattern = (Pattern) o;
499
500        if (memregs != pattern.memregs) return false;
501        if (counters != pattern.counters) return false;
502        if (lookaheads != pattern.lookaheads) return false;
503        if (stringRepr != null ? !stringRepr.equals(pattern.stringRepr) : pattern.stringRepr != null) return false;
504        return root != null ? root.equals(pattern.root) : pattern.root == null && (root0 != null ? root0.equals(pattern.root0) : pattern.root0 == null && (namedGroupMap != null ? namedGroupMap.equals(pattern.namedGroupMap) : pattern.namedGroupMap == null));
505
506    }
507
508    @Override
509    public int hashCode() {
510        int result = stringRepr != null ? stringRepr.hashCode() : 0;
511        result = 31 * result + (root != null ? root.hashCode() : 0);
512        result = 31 * result + (root0 != null ? root0.hashCode() : 0);
513        result = 31 * result + memregs;
514        result = 31 * result + counters;
515        result = 31 * result + lookaheads;
516        result = 31 * result + (namedGroupMap != null ? namedGroupMap.hashCode() : 0);
517        return result;
518    }
519}