package jp.go.nict.langrid.wrapper.workflowsupport;

import java.lang.Character;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import jp.go.nict.langrid.language.Language;
import jp.go.nict.langrid.service_1_2.AccessLimitExceededException;
import jp.go.nict.langrid.service_1_2.InvalidParameterException;
import jp.go.nict.langrid.service_1_2.LanguageNotUniquelyDecidedException;
import jp.go.nict.langrid.service_1_2.NoAccessPermissionException;
import jp.go.nict.langrid.service_1_2.NoValidEndpointsException;
import jp.go.nict.langrid.service_1_2.ProcessFailedException;
import jp.go.nict.langrid.service_1_2.ServerBusyException;
import jp.go.nict.langrid.service_1_2.ServiceNotActiveException;
import jp.go.nict.langrid.service_1_2.ServiceNotFoundException;
import jp.go.nict.langrid.service_1_2.UnsupportedLanguageException;
import jp.go.nict.langrid.service_1_2.morphologicalanalysis.Morpheme;
import jp.go.nict.langrid.service_1_2.morphologicalanalysis.MorphologicalAnalysisService;
import jp.go.nict.langrid.service_1_2.typed.PartOfSpeech;
import jp.go.nict.langrid.service_1_2.util.validator.StringValidator;

/* loaded from: input_file:jp/go/nict/langrid/wrapper/workflowsupport/DefaultMorphologicalAnalysis.class */
/**
 * Fallback morphological analyzer that does no linguistic analysis: it merely
 * tokenizes the input text and reports every token as a {@link Morpheme} whose
 * word and lemma are the token itself and whose part of speech is
 * {@code PartOfSpeech.unknown}.
 *
 * <p>Tokenization rules (see {@link #split(String)}):
 * <ul>
 *   <li>text is cut at every character matched by {@link #pattern};
 *       non-blank delimiters become tokens of their own, blank ones are dropped;</li>
 *   <li>spans matched by {@link #shortPattern} (contractions such as
 *       {@code don't}, decimals such as {@code 1.5}) are never cut;</li>
 *   <li>every code point belonging to one of the CJK/kana Unicode blocks listed
 *       in {@code treatAsKanji} becomes a single-character token.</li>
 * </ul>
 */
public class DefaultMorphologicalAnalysis implements MorphologicalAnalysisService {
    private static final Logger logger = Logger.getLogger(DefaultMorphologicalAnalysis.class.getName());

    /**
     * Spans that must survive tokenization intact: {@code letters'letters}
     * and {@code digits.digits}. Left non-final for backward compatibility
     * with subclasses.
     */
    protected static Pattern shortPattern = Pattern.compile("([a-zA-Z]+'[a-zA-Z]+)|((\\d)+\\.(\\d)+)");

    /**
     * Delimiter pattern. Note that, as written, the whole expression is one
     * character class (the outer {@code [...]} wraps everything), so it matches
     * a single character: the listed punctuation marks, whitespace, and the
     * literal characters {@code | ^ + '}. Left non-final for backward
     * compatibility with subclasses.
     */
    protected static Pattern pattern = Pattern.compile("[[,、。．\\.！？!?<>\"”’]|^\\s+\\.|']");

    /**
     * Filler written over protected spans before delimiter/kanji detection.
     * It must be a character that is neither matched by {@link #pattern} nor
     * located in one of the {@code treatAsKanji} blocks.
     */
    private static final char MASK = 'a';

    /** Unicode blocks whose characters are each emitted as a separate token. */
    private static final Set<Character.UnicodeBlock> treatAsKanji = new HashSet<Character.UnicodeBlock>();

    static {
        treatAsKanji.add(Character.UnicodeBlock.HIRAGANA);
        treatAsKanji.add(Character.UnicodeBlock.KATAKANA);
        treatAsKanji.add(Character.UnicodeBlock.KATAKANA_PHONETIC_EXTENSIONS);
        treatAsKanji.add(Character.UnicodeBlock.HALFWIDTH_AND_FULLWIDTH_FORMS);
        treatAsKanji.add(Character.UnicodeBlock.CJK_COMPATIBILITY);
        treatAsKanji.add(Character.UnicodeBlock.CJK_COMPATIBILITY_FORMS);
        treatAsKanji.add(Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS);
        treatAsKanji.add(Character.UnicodeBlock.CJK_COMPATIBILITY_IDEOGRAPHS_SUPPLEMENT);
        treatAsKanji.add(Character.UnicodeBlock.CJK_RADICALS_SUPPLEMENT);
        treatAsKanji.add(Character.UnicodeBlock.CJK_SYMBOLS_AND_PUNCTUATION);
        treatAsKanji.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS);
        treatAsKanji.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_A);
        treatAsKanji.add(Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS_EXTENSION_B);
    }

    /**
     * Validates {@code text} and tokenizes it via
     * {@link #doAnalyze(Language, String)}.
     *
     * @param language language code supplied by the caller; this implementation
     *                 only uses it in a diagnostic log message and passes
     *                 {@code null} as the language to {@code doAnalyze}
     * @param text     text to analyze; run through
     *                 {@code StringValidator.notNull().trim().notEmpty()}
     *                 (presumably rejecting null/blank input with
     *                 {@link InvalidParameterException} - confirm against the validator)
     * @return one morpheme per token, in input order
     * @throws InvalidParameterException rethrown unchanged from validation or analysis
     * @throws ProcessFailedException    rethrown unchanged, or wrapping any other failure
     */
    @Override
    public Morpheme[] analyze(String language, String text) throws AccessLimitExceededException, InvalidParameterException, LanguageNotUniquelyDecidedException, NoAccessPermissionException, NoValidEndpointsException, ProcessFailedException, ServerBusyException, ServiceNotActiveException, ServiceNotFoundException, UnsupportedLanguageException {
        try {
            String validated = (String) new StringValidator("text", text).notNull().trim().notEmpty().getValue();
            return doAnalyze(null, validated);
        } catch (InvalidParameterException e) {
            throw e;
        } catch (ProcessFailedException e) {
            throw e;
        } catch (Throwable t) {
            logger.log(Level.SEVERE, "unknown error occurred.", t);
            // The raw arguments are only dumped for runtime exceptions.
            if (t instanceof RuntimeException) {
                logger.severe("language: " + language + ", text: " + text);
            }
            throw new ProcessFailedException(t);
        }
    }

    /**
     * Tokenizes {@code str} and wraps each token in a {@link Morpheme}.
     *
     * @param language not used by this implementation
     * @param str      text to tokenize; ideographic spaces (U+3000) are
     *                 converted to ASCII spaces first
     * @return one morpheme per token (word == lemma == token, part of speech unknown)
     */
    protected Morpheme[] doAnalyze(Language language, String str) throws InvalidParameterException, ProcessFailedException {
        String normalized = str.replace('\u3000', ' ');
        List<Morpheme> morphemes = new ArrayList<Morpheme>();
        for (String token : split(normalized)) {
            morphemes.add(new Morpheme(token, token, PartOfSpeech.unknown.getExpression()));
        }
        return morphemes.toArray(new Morpheme[0]);
    }

    /**
     * Splits {@code str} into tokens.
     *
     * <p>Spans matched by {@link #shortPattern} are overwritten with
     * {@link #MASK} in a same-length working copy, so that the delimiter
     * pattern and the kanji check cannot cut them. Token text is always taken
     * from the original string by offset, so no placeholder can leak into the
     * output (e.g. {@code "1.5kg"} yields {@code "1.5kg"}) and the input may
     * freely contain any literal text.
     *
     * @param str text to split
     * @return tokens in input order; never {@code null}
     */
    protected Collection<String> split(String str) {
        List<String> tokens = new ArrayList<String>();

        // Build the masked working copy (same length as str, offsets line up 1:1).
        char[] masked = str.toCharArray();
        Matcher protectedSpans = shortPattern.matcher(str);
        while (protectedSpans.find()) {
            for (int i = protectedSpans.start(); i < protectedSpans.end(); i++) {
                masked[i] = MASK;
            }
        }

        Matcher delimiters = pattern.matcher(new String(masked));
        int segmentStart = 0;
        while (delimiters.find()) {
            chopNonAsciiAndAdd(str, masked, segmentStart, delimiters.start(), tokens);
            String delimiter = str.substring(delimiters.start(), delimiters.end());
            // Whitespace delimiters separate tokens but are not tokens themselves.
            if (!delimiter.trim().equals("")) {
                tokens.add(delimiter);
            }
            segmentStart = delimiters.end();
        }
        chopNonAsciiAndAdd(str, masked, segmentStart, str.length(), tokens);
        return tokens;
    }

    /**
     * Appends the tokens of the segment {@code [from, to)} to {@code out}:
     * each code point in a {@code treatAsKanji} block becomes its own token,
     * and every maximal run of other characters becomes one token.
     *
     * <p>Iteration is by code point rather than by {@code char} so that
     * supplementary-plane ideographs (stored as surrogate pairs) are
     * classified by their real Unicode block.
     *
     * @param text   original text, source of the emitted token strings
     * @param masked working copy of {@code text} with protected spans masked;
     *               used only for classification
     * @param from   inclusive start offset of the segment
     * @param to     exclusive end offset of the segment
     * @param out    list receiving the tokens
     */
    private void chopNonAsciiAndAdd(String text, char[] masked, int from, int to, List<String> out) {
        int runStart = from;
        int i = from;
        while (i < to) {
            int codePoint = Character.codePointAt(masked, i, to);
            int width = Character.charCount(codePoint);
            if (treatAsKanji.contains(Character.UnicodeBlock.of(codePoint))) {
                if (i > runStart) {
                    out.add(text.substring(runStart, i));
                }
                out.add(text.substring(i, i + width));
                runStart = i + width;
            }
            i += width;
        }
        if (to > runStart) {
            out.add(text.substring(runStart, to));
        }
    }
}
