
/*
 * de.unkrig.commons - A general-purpose Java class library
 *
 * Copyright (c) 2012, Arno Unkrig
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without modification, are permitted provided that the
 * following conditions are met:
 *
 *    1. Redistributions of source code must retain the above copyright notice, this list of conditions and the
 *       following disclaimer.
 *    2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the
 *       following disclaimer in the documentation and/or other materials provided with the distribution.
 *    3. The name of the author may not be used to endorse or promote products derived from this software without
 *       specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
 * THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

package de.unkrig.commons.text.scanner;

import de.unkrig.commons.lang.protocol.Predicate;
import de.unkrig.commons.nullanalysis.Nullable;
import de.unkrig.commons.text.scanner.AbstractScanner.Token;

/**
 * A scanner for the JAVA programming language.
 * <p>
 *   This class is a pure factory: it cannot be instantiated; use {@link #rawStringScanner()} or {@link
 *   #stringScanner()} to obtain a scanner.
 * </p>
 */
public final
class JavaScanner {

    private JavaScanner() {}

    // PUBLIC INTERFACE

    /**
     * Token types of the JAVA programming language.
     * <p>
     *   The declaration order is significant: All constants declared <i>before</i> {@link #END_OF_IGNORABLES}
     *   designate white space and comments, and are suppressed by {@link JavaScanner#stringScanner()}, which
     *   compares ordinals against that marker.
     * </p>
     */
    public
    enum TokenType {

        // CHECKSTYLE VariableCheck:OFF

        /** A run of white space. */
        SPACE,

        /** A "{@code // ...}" comment. */
        CXX_COMMENT,
        /** A "{@code /* ... *}{@code /}" comment that begins and ends within the same match. */
        SINGLE_LINE_C_COMMENT,
        /** The pieces of a "{@code /* ... *}{@code /}" comment that is not closed where it was opened. */
        MULTI_LINE_C_COMMENT_BEGINNING, MULTI_LINE_C_COMMENT_MIDDLE, MULTI_LINE_C_COMMENT_END,

        /** Marker that separates the ignorable token types (above) from the significant ones (below). */
        END_OF_IGNORABLES,

        KEYWORD,
        IDENTIFIER,
        SEPARATOR,
        OPERATOR,

        STRING_LITERAL, CHARACTER_LITERAL, INTEGER_LITERAL, FLOATING_POINT_LITERAL,
        // CHECKSTYLE VariableCheck:ON
    }

    /**
     * Returns a Java scanner that also produces SPACE and COMMENT tokens.
     * <p>
     *   The order in which the rules are added matters (see the comments on the floating-point and the operator
     *   rules), so new rules must be inserted with care.
     * </p>
     *
     * @return A scanner that produces tokens of all {@link TokenType}s, including white space and comments
     */
    public static StringScanner<TokenType>
    rawStringScanner() {
        StatefulScanner<TokenType, State> scanner = new StatefulScanner<TokenType, State>(State.class);

        scanner.addRule("\\s+", TokenType.SPACE);

        // Comments. A C-style comment that is not closed switches the scanner into the state
        // IN_MULTI_LINE_C_COMMENT, in which only the two "IN_MULTI_LINE_C_COMMENT" rules are registered.
        // NOTE(review): '.' does not match line terminators, so these rules presumably expect the input to be
        // fed line by line - verify against the callers.
        scanner.addRule("//.*", TokenType.CXX_COMMENT);
        scanner.addRule("/\\*.*?\\*/", TokenType.SINGLE_LINE_C_COMMENT);
        scanner.addRule("/\\*.*", TokenType.MULTI_LINE_C_COMMENT_BEGINNING, State.IN_MULTI_LINE_C_COMMENT);
        scanner.addRule(State.IN_MULTI_LINE_C_COMMENT, ".*?\\*/", TokenType.MULTI_LINE_C_COMMENT_END);
        scanner.addRule(
            State.IN_MULTI_LINE_C_COMMENT,
            ".*",
            TokenType.MULTI_LINE_C_COMMENT_MIDDLE,
            State.IN_MULTI_LINE_C_COMMENT
        );

        // The negative lookahead prevents that a keyword matches the beginning of an identifier, e.g. "do" in
        // "done".
        scanner.addRule((
            "(?:abstract|assert|boolean|break|byte|case|catch|char|class|const|continue|default|do|double|else|enum"
            + "|extends|final|finally|float|for|goto|if|implements|import|instanceof|int|interface|long|native|new"
            + "|package|private|protected|public|return|short|static|strictfp|super|switch|synchronized|this|throw"
            + "|throws|transient|try|void|volatile|while)(?![\\p{L}\\p{Nd}_$])"
        ), TokenType.KEYWORD);

        // See:
        //   http://en.wikipedia.org/wiki/Mapping_of_Unicode_characters#General_Category
        //   http://docs.oracle.com/javase/7/docs/api/java/lang/Character.html#isJavaIdentifierStart%28char%29
        //   http://docs.oracle.com/javase/7/docs/api/java/lang/Character.html#isJavaIdentifierPart%28char%29
        scanner.addRule((
            "[\\p{L}\\p{Nl}\\p{Sc}\\p{Pc}]"
            + "[\\p{L}\\p{Nl}\\p{Sc}\\p{Pc}\\p{Nd}\\p{Mn}\\p{Mc}\\x00-\\x08\\x0E-\\x1B\\x7F-\\x9F]*"
        ), TokenType.IDENTIFIER);

        // Must come before separator '.' because of ambiguity with '.9'.
        scanner.addRule("\\d+\\.\\d*(?:[eE][+\\-]?\\d+)?[fFdD]?", TokenType.FLOATING_POINT_LITERAL);  // 9.
        scanner.addRule("\\.\\d+(?:[eE][+\\-]?\\d+)?[fFdD]?",     TokenType.FLOATING_POINT_LITERAL);  // .9
        scanner.addRule("\\d+[eE][+\\-]?\\d+[fFdD]?",             TokenType.FLOATING_POINT_LITERAL);  // 9e1
        scanner.addRule("\\d+([eE][+\\-]?\\d+)?[fFdD]",           TokenType.FLOATING_POINT_LITERAL);  // 9f

        // The hex prefix is case-insensitive (JLS 3.10.1): Both "0x1F" and "0X1F" are hexadecimal literals.
        // The hex alternative must precede the octal alternative, which would otherwise match the leading "0".
        scanner.addRule("(?:[1-9]\\d*|0[xX]\\p{XDigit}+|0[0-7]*)(L|l)?", TokenType.INTEGER_LITERAL);

        // Alternatives, in this order: Simple escape, octal escapes, Unicode escape (backslash, one or more
        // 'u', four hex digits; JLS 3.3), any other single character.
        scanner.addRule(
            "'(?:\\\\[btnfr\"'\\\\]|\\\\[0-7]|\\\\[0-7][0-7]|\\\\[0-3][0-7][0-7]|\\\\u+\\p{XDigit}{4}|[^'])'",
            TokenType.CHARACTER_LITERAL
        );
        scanner.addRule(
            "\\\"(?:\\\\[btnfr\"'\\\\]|\\\\[0-7]|\\\\[0-7][0-7]|\\\\[0-3][0-7][0-7]|[^\"])*+\\\"",
            TokenType.STRING_LITERAL
        );

        scanner.addRule("\\(|\\)|\\{|\\}|\\[|]|;|\\.|,", TokenType.SEPARATOR); // ( ) { } [ ] ; . ,

        // Seems like '|' implements a first-match search (and not a greedy search, as one may expect).
        // So longer operators need to appear before the shorter ones.
        scanner.addRule((
            ">>>="                                     // >>>=
            + "|<<=|>>=|>>>"                           // <<= >>= >>>
            + "|\\+=|-=|\\*=|/=|&=|\\|=|\\^=|%="       // += -= *= /= &= |= ^= %=
            + "|==|<=|>=|!=|&&|\\|\\||\\+\\+|--|<<|>>" // == <= >= != && || ++ -- << >>
            + "|=|>|<|!|~|\\?|:"                       // = > < ! ~ ? :
            + "|\\+|-|\\*|/|&|\\||\\^|%"               // + - * / & | ^ %
            + "|@"                                     // @
        ), TokenType.OPERATOR);

        return scanner;
    }

    /**
     * Wraps {@link #rawStringScanner()} in a filter that lets only the significant tokens pass.
     *
     * @return A scanner that swallows SPACE and COMMENT tokens
     */
    public static StringScanner<TokenType>
    stringScanner() {

        return ScannerUtil.filter(JavaScanner.rawStringScanner(), new Predicate<Token<TokenType>>() {

            /**
             * @return {@code true} for {@code null} (presumably the end-of-input indication - verify against
             *         {@code ScannerUtil.filter()}) and for every token whose type is declared after {@link
             *         TokenType#END_OF_IGNORABLES}
             */
            @Override public boolean
            evaluate(@Nullable Token<TokenType> token) {

                // Relies on the declaration order of "TokenType": Everything up to and including
                // END_OF_IGNORABLES is white space or comment.
                return token == null || token.type.ordinal() > TokenType.END_OF_IGNORABLES.ordinal();
            }
        });
    }

    // IMPLEMENTATION

    /**
     * The non-default states of the scanner returned by {@link #rawStringScanner()}.
     */
    private
    enum State {

        /** A C-style comment was opened, but not yet closed. */
        IN_MULTI_LINE_C_COMMENT,
    }
}
