package cn.elwy.common.util.io;

import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.List;

import org.mozilla.intl.chardet.nsDetector;
import org.mozilla.intl.chardet.nsICharsetDetectionObserver;
import org.mozilla.intl.chardet.nsPSMDetector;

import cn.elwy.common.Constant;
import cn.elwy.common.util.CloseUtil;

/**
 * Detects the character encoding of an input stream using the Mozilla
 * charset detector (jchardet, {@code org.mozilla.intl.chardet}).
 * <p>
 * All detection state is kept local to each {@link #detectCharsets(InputStream)}
 * call; the only instance state is the immutable language hint, so one
 * instance can be reused for any number of streams.
 *
 * @author huangsq
 * @version 1.0, 2018-02-19
 */
public class CharsetDetectorMozilla extends CharsetDetector {

	/** Marker checked by {@link #getCharsetName(InputStream)} to recognise "no charset matched". */
	private static final String NO_MATCH = "nomatch";

	/**
	 * Charset names in order of preference. When several candidates are
	 * detected, the first entry of this table that is among them wins.
	 */
	private static final String[] PREFERRED_CHARSETS = {
			Constant.ENCODING_UTF8,
			Constant.ENCODING_GB18030,
			Constant.ENCODING_GBK,
			Constant.ENCODING_ASCII,
			Constant.ENCODING_ISO88591,
			Constant.ENCODING_UTF16BE,
			Constant.ENCODING_UTF16LE,
			Constant.ENCODING_UTF32BE,
			Constant.ENCODING_UTF32LE
	};

	/** Lazily created shared instance, see {@link #getInstance()}. */
	private static volatile CharsetDetector instance;

	/** Language hint handed to every {@code nsDetector} this instance creates. */
	private final int lang;

	/**
	 * Creates a detector that uses the simplified Chinese language hint.
	 */
	public CharsetDetectorMozilla() {
		// Original author's note: with the simplified Chinese hint, content that
		// is not GB2312 is reported as GB18030; other hints may yield an encoding
		// that produces garbled text.
		this(nsPSMDetector.SIMPLIFIED_CHINESE);
	}

	/**
	 * Creates a detector with an explicit language hint.
	 *
	 * @param lang language hint passed to the {@code nsDetector} constructor,
	 *             e.g. one of the {@code nsPSMDetector} constants
	 */
	public CharsetDetectorMozilla(int lang) {
		this.lang = lang;
	}

	/**
	 * Returns the shared detector instance, creating it on first use.
	 *
	 * @return the shared {@code CharsetDetectorMozilla}, typed as {@link CharsetDetector}
	 */
	public static CharsetDetector getInstance() {
		CharsetDetector current = instance;
		if (current == null) {
			// Lock on this class and build a Mozilla detector; the previous code
			// instantiated the base CharsetDetector here.
			synchronized (CharsetDetectorMozilla.class) {
				current = instance;
				if (current == null) {
					current = new CharsetDetectorMozilla();
					instance = current;
				}
			}
		}
		return current;
	}

	/**
	 * Determines the charset name of the given input stream. The stream is
	 * closed by the underlying detection step.
	 *
	 * @param inputStream stream whose content is inspected
	 * @return the first charset of the preference table that was detected;
	 *         otherwise the first detected candidate; or {@code null} when
	 *         nothing matched
	 * @throws IOException if reading the stream fails
	 */
	public String getCharsetName(InputStream inputStream) throws IOException {
		List<String> detectCharsets = detectCharsets(inputStream);
		for (int i = 0; i < PREFERRED_CHARSETS.length; i++) {
			String preferred = PREFERRED_CHARSETS[i];
			if (detectCharsets.contains(preferred)) {
				return preferred;
			}
		}
		// Guard against an empty candidate list before taking the first entry.
		if (detectCharsets.isEmpty() || detectCharsets.contains(NO_MATCH)) {
			return null;
		}
		return detectCharsets.get(0);
	}

	/**
	 * Collects every charset candidate for the given input stream. The stream
	 * is always closed before this method returns, whether or not reading
	 * succeeded.
	 *
	 * @param inputStream stream whose content is inspected
	 * @return a single-element list holding the ASCII charset name when only
	 *         ASCII bytes were read, a single-element list holding the charset
	 *         reported by the detector when it reached a decision, otherwise
	 *         the detector's list of probable charsets
	 * @throws IOException if reading the stream fails
	 */
	protected List<String> detectCharsets(InputStream inputStream) throws IOException {
		// Per-call holder filled by the observer callback. Keeping it local
		// (instead of in instance fields) prevents a result from an earlier
		// call leaking into this one.
		final String[] reported = new String[1];

		nsDetector det = new nsDetector(lang);
		det.Init(new nsICharsetDetectionObserver() {

			public void Notify(String charset) {
				reported[0] = charset;
			}
		});

		BufferedInputStream imp = null;
		boolean isAscii = true;
		try {
			imp = new BufferedInputStream(inputStream);
			byte[] buffer = new byte[1024];
			int len;
			while ((len = imp.read(buffer, 0, buffer.length)) != -1) {
				// Keep testing for pure ASCII until the first non-ASCII chunk shows up.
				if (isAscii) {
					isAscii = det.isAscii(buffer, len);
				}
				// Once non-ASCII data has been seen, feed the detector and stop
				// reading as soon as it signals that it is done.
				if (!isAscii && det.DoIt(buffer, len, false)) {
					break;
				}
			}
		} finally {
			CloseUtil.close(imp);
			CloseUtil.close(inputStream);
		}
		// Signal end of data; the observer may still be notified from here.
		det.DataEnd();

		String[] charsetNames;
		if (isAscii) {
			charsetNames = new String[] { Constant.ENCODING_ASCII };
		} else if (reported[0] != null) {
			charsetNames = new String[] { reported[0] };
		} else {
			charsetNames = det.getProbableCharsets();
		}
		return Arrays.asList(charsetNames);
	}

}