/*-------------------------------------------------------------------------
 Copyright 2009 Olivier Berlanger

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at

 http://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.
 -------------------------------------------------------------------------*/
package net.sf.sfac.file;


import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import net.sf.sfac.utils.Comparison;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;


/**
 * An implementation of LineReader filtering out all HTML tags to transform HTML to plain text.
 * <p>
 * The conversion rules implemented here are:
 * <ul>
 * <li>Every character with a code of 32 or less is replaced by a single space.</li>
 * <li>Everything between '&lt;' and the next '&gt;' on the same source line is dropped.</li>
 * <li>A paragraph tag (<code>p</code>) ends the current output line and adds an empty one.</li>
 * <li>The tags <code>br</code>, <code>body</code>, <code>/h1</code>, <code>/h2</code> and <code>/title</code> end the
 * current output line.</li>
 * <li>Between <code>pre</code> and <code>/pre</code> each source line ends the current output line.</li>
 * <li>Known named entities and numeric character references are replaced by their text; unknown ones are logged and
 * copied unchanged.</li>
 * </ul>
 * Known limitation: a tag is only scanned on the source line where it starts, so the remaining part of a tag
 * spanning several source lines ends up in the output text.
 */
public class LineReaderFilteringHtml extends LineReader {


    private static final Log log = LogFactory.getLog(LineReaderFilteringHtml.class);

    /** Replacement text of the supported named entities, keyed by entity name (without '&amp;' and ';'). */
    private static final Map<String, String> ENTITIES = new HashMap<String, String>();

    /** Output lines already produced; the last element is the line still under construction. */
    private final List<String> lineBuffer = new ArrayList<String>();
    /** Set once the underlying reader has returned <code>null</code>. */
    private boolean eof = false;
    /** True while inside a block where each source line must end an output line. */
    private boolean keepNewLine = false;

    static {
        ENTITIES.put("amp", "&");
        ENTITIES.put("gt", ">");
        ENTITIES.put("lt", "<");
        ENTITIES.put("apos", "'");
        ENTITIES.put("quot", "\"");
        // Accented letters are written as unicode escapes so they survive any source file encoding.
        ENTITIES.put("eacute", "\u00e9");
        ENTITIES.put("egrave", "\u00e8");
        ENTITIES.put("ecirc", "\u00ea");
        ENTITIES.put("agrave", "\u00e0");
        ENTITIES.put("acirc", "\u00e2");
        ENTITIES.put("icirc", "\u00ee");
        ENTITIES.put("ugrave", "\u00f9");
        ENTITIES.put("ocirc", "\u00f4");
        ENTITIES.put("nbsp", " ");
        ENTITIES.put("laquo", "\"");
        ENTITIES.put("raquo", "\"");
        ENTITIES.put("ntilde", "\u00f1");
    }


    /**
     * Creates a filtering reader on top of the given reader.
     * 
     * @param reader
     *            the reader providing the HTML text.
     * @throws IOException
     *             if thrown by the super-class constructor.
     */
    public LineReaderFilteringHtml(BufferedReader reader) throws IOException {
        super(reader);
    }


    /**
     * Creates a filtering reader for a resource.
     * 
     * @param loader
     *            the class passed to the super-class constructor to locate the resource.
     * @param resourceName
     *            the name of the resource containing the HTML text.
     * @throws IOException
     *             if thrown by the super-class constructor.
     */
    public LineReaderFilteringHtml(Class<?> loader, String resourceName) throws IOException {
        super(loader, resourceName);
    }


    /**
     * Creates a filtering reader for a file.
     * 
     * @param src
     *            the file containing the HTML text.
     * @throws IOException
     *             if thrown by the super-class constructor.
     */
    public LineReaderFilteringHtml(File src) throws IOException {
        super(src);
    }


    /**
     * Creates a filtering reader for an URL.
     * 
     * @param src
     *            the URL of the HTML text.
     * @throws IOException
     *             if thrown by the super-class constructor.
     */
    public LineReaderFilteringHtml(URL src) throws IOException {
        super(src);
    }


    /**
     * Returns the next line of plain text.
     * 
     * @return the next filtered line, or <code>null</code> when the source is exhausted and no buffered line is left.
     * @throws IOException
     *             if reading the underlying source fails.
     */
    @Override
    protected String readLineImpl() throws IOException {
        fillBuffer();
        return lineBuffer.isEmpty() ? null : lineBuffer.remove(0);
    }


    /**
     * Reads source lines until at least two output lines are buffered or the source is exhausted.
     * <p>
     * The last buffered line is always the one still under construction, so having two lines guarantees that the first
     * one is complete.
     * 
     * @throws IOException
     *             if reading the underlying source fails.
     */
    private void fillBuffer() throws IOException {
        StringBuilder currentBuffer = null;
        int currentIndex = 0;
        while ((!eof) && (lineBuffer.size() < 2)) {
            String line = super.readLineImpl();
            if (line == null) {
                eof = true;
            } else {
                if (currentBuffer == null) {
                    // resume the unfinished last line, or start the very first one
                    currentIndex = lineBuffer.size() - 1;
                    if (currentIndex < 0) {
                        currentIndex = 0;
                        lineBuffer.add("");
                        currentBuffer = new StringBuilder();
                    } else {
                        currentBuffer = new StringBuilder(lineBuffer.get(currentIndex));
                    }
                }
                currentIndex = addLine(line, currentBuffer, currentIndex);
            }
        }
        // store the text under construction back so that the next call can resume it
        if (currentBuffer != null) lineBuffer.set(currentIndex, currentBuffer.toString());
    }


    /**
     * Filters one source line and appends the resulting text to the line under construction.
     * <p>
     * NOTE(review): outside a <code>pre</code> block nothing is inserted between two consecutive source lines, so the
     * last word of a line and the first word of the next one are joined unless one of them carries white space.
     * Presumably the super-class returns lines without their terminator - confirm and consider adding a space.
     * 
     * @param line
     *            the source line to filter.
     * @param currentBuffer
     *            the text of the output line under construction.
     * @param currentIndex
     *            the index of the output line under construction in the line buffer.
     * @return the index of the output line under construction after this source line is processed.
     */
    private int addLine(String line, StringBuilder currentBuffer, int currentIndex) {
        int len = line.length();
        for (int i = 0; i < len; i++) {
            char ch = line.charAt(i);
            if (ch <= 32) {
                currentBuffer.append(' ');
            } else if (ch == '<') {
                String tag = getTag(line, i + 1, len);
                if (isParagraphTag(tag)) {
                    // a paragraph is rendered as an empty line followed by a new line
                    currentIndex = newLine(currentBuffer, currentIndex);
                    currentIndex = newLine(currentBuffer, currentIndex);
                } else if (isNewLineTag(tag)) {
                    currentIndex = newLine(currentBuffer, currentIndex);
                } else if (isKeepNewLineStartTag(tag)) {
                    keepNewLine = true;
                } else if (isKeepNewLineEndTag(tag)) {
                    keepNewLine = false;
                }
                // skip the tag text; the loop increment then skips the closing '>'
                i += tag.length() + 1;
            } else if (ch == '&') {
                i = appendEntity(line, i, len, currentBuffer);
            } else {
                currentBuffer.append(ch);
            }
        }
        if (keepNewLine) currentIndex = newLine(currentBuffer, currentIndex);
        return currentIndex;
    }


    /**
     * Handles the character reference starting at the given '&amp;'.
     * <p>
     * A known named entity or a valid numeric reference is replaced by its text. Anything else is logged and the
     * '&amp;' is copied literally, so that the following characters are processed as normal text and nothing is lost.
     * 
     * @param line
     *            the source line being filtered.
     * @param ampIndex
     *            the index of the '&amp;' character in the line.
     * @param len
     *            the length of the line.
     * @param currentBuffer
     *            the buffer receiving the text.
     * @return the index of the last character consumed in the line.
     */
    private int appendEntity(String line, int ampIndex, int len, StringBuilder currentBuffer) {
        String entity = getEntity(line, ampIndex + 1, len);
        String replacement = ENTITIES.get(entity);
        if (replacement == null) replacement = decodeNumericEntity(entity);
        if (replacement == null) {
            log.warn("Unknown entity &" + entity + "; in '" + getSourceName() + "' at line " + getLineIndex());
            currentBuffer.append('&');
            return ampIndex;
        }
        currentBuffer.append(replacement);
        // position on the terminating ';' so that the loop increment moves past it
        return ampIndex + entity.length() + 1;
    }


    /**
     * Decodes a numeric character reference body such as <code>#233</code> or <code>#xE9</code>.
     * 
     * @param entity
     *            the text found between '&amp;' and ';'.
     * @return the referenced character as a string (a space for code points up to 32 and for the non-breaking space),
     *         or <code>null</code> if the text is not a valid numeric reference.
     */
    private static String decodeNumericEntity(String entity) {
        if ((entity.length() < 2) || (entity.charAt(0) != '#')) return null;
        boolean hexadecimal = (entity.charAt(1) == 'x') || (entity.charAt(1) == 'X');
        String digits = entity.substring(hexadecimal ? 2 : 1);
        // Integer.parseInt accepts a sign, a character reference does not
        if ((digits.length() == 0) || (Character.digit(digits.charAt(0), hexadecimal ? 16 : 10) < 0)) return null;
        int codePoint;
        try {
            codePoint = Integer.parseInt(digits, hexadecimal ? 16 : 10);
        } catch (NumberFormatException e) {
            return null;
        }
        if ((codePoint < 1) || !Character.isValidCodePoint(codePoint)) return null;
        // same treatment as literal white space / control characters and as the named nbsp entity
        if ((codePoint <= 32) || (codePoint == 160)) return " ";
        return new String(Character.toChars(codePoint));
    }


    /**
     * Checks if the tag starts a block where source line breaks are kept.
     * 
     * @param tag
     *            the tag text found between '&lt;' and '&gt;'.
     * @return true if the tag starts with "pre" (ignoring case).
     */
    private boolean isKeepNewLineStartTag(String tag) {
        return Comparison.startsWithIgnoreCase(tag, "pre");
    }


    /**
     * Checks if the tag ends a block where source line breaks are kept.
     * <p>
     * NOTE(review): the "body" case can never be reached from addLine because isNewLineTag already matches "body" and
     * is tested first - confirm whether a body tag is meant to reset the keep-new-line mode.
     * 
     * @param tag
     *            the tag text found between '&lt;' and '&gt;'.
     * @return true if the tag starts with "/pre", "body" or "html" (ignoring case).
     */
    private boolean isKeepNewLineEndTag(String tag) {
        return Comparison.startsWithIgnoreCase(tag, "/pre") //
                || Comparison.startsWithIgnoreCase(tag, "body") //
                || Comparison.startsWithIgnoreCase(tag, "html");
    }


    /**
     * Ends the output line under construction and starts a new empty one.
     * 
     * @param currentBuffer
     *            the text of the line under construction; it is stored trimmed in the line buffer and then cleared.
     * @param currentIndex
     *            the index of the line under construction in the line buffer.
     * @return the index of the new line under construction.
     */
    private int newLine(StringBuilder currentBuffer, int currentIndex) {
        lineBuffer.set(currentIndex, currentBuffer.toString().trim());
        lineBuffer.add("");
        currentBuffer.setLength(0);
        return currentIndex + 1;
    }


    /**
     * Checks if the tag ends the current output line.
     * 
     * @param tag
     *            the tag text found between '&lt;' and '&gt;'.
     * @return true if the tag starts with "br", "body", "/h1", "/h2" or "/title" (ignoring case).
     */
    private boolean isNewLineTag(String tag) {
        return Comparison.startsWithIgnoreCase(tag, "br") //
                || Comparison.startsWithIgnoreCase(tag, "body") //
                || Comparison.startsWithIgnoreCase(tag, "/h1") //
                || Comparison.startsWithIgnoreCase(tag, "/h2") //
                || Comparison.startsWithIgnoreCase(tag, "/title");
    }


    /**
     * Checks if the tag is a paragraph start tag, with or without attributes.
     * 
     * @param tag
     *            the tag text found between '&lt;' and '&gt;'.
     * @return true if the tag is "p" or starts with "p " (ignoring case).
     */
    private boolean isParagraphTag(String tag) {
        return tag.equalsIgnoreCase("p") || Comparison.startsWithIgnoreCase(tag, "p ");
    }


    /**
     * Extracts the text of a tag.
     * 
     * @param line
     *            the source line being filtered.
     * @param i
     *            the index of the first character after the '&lt;'.
     * @param len
     *            the length of the line.
     * @return the text from the given index up to (excluding) the next '&gt;', or up to the end of the line if there is
     *         no '&gt;'.
     */
    private String getTag(String line, int i, int len) {
        int end = i;
        while ((end < len) && (line.charAt(end) != '>')) {
            end++;
        }
        return line.substring(i, end);
    }


    /**
     * Extracts the name of an entity.
     * 
     * @param line
     *            the source line being filtered.
     * @param i
     *            the index of the first character after the '&amp;'.
     * @param len
     *            the length of the line.
     * @return the text from the given index up to (excluding) the next ';' or the end of the line; an empty string if
     *         no ';' is found within the first nine characters while the line goes on.
     */
    private String getEntity(String line, int i, int len) {
        StringBuilder sb = new StringBuilder();
        for (int j = i; j < len; j++) {
            // too long to be one of the supported entities: give up
            if (j - i > 8) return "";
            char ch = line.charAt(j);
            if (ch == ';') break;
            sb.append(ch);
        }
        return sb.toString();
    }


}
