/*
 * Decompiled with CFR 0.152.
 */
package edu.northwestern.at.morphadorner.examples;

import edu.northwestern.at.utils.FileUtils;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.DefaultSuffixLexicon;
import edu.northwestern.at.utils.corpuslinguistics.lexicon.DefaultWordLexicon;
import edu.northwestern.at.utils.corpuslinguistics.postagger.guesser.DefaultPartOfSpeechGuesser;
import edu.northwestern.at.utils.corpuslinguistics.sentencesplitter.DefaultSentenceSplitter;
import edu.northwestern.at.utils.corpuslinguistics.tokenizer.DefaultWordTokenizer;
import java.io.BufferedOutputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.util.List;

/**
 * Example program that splits a text file into sentences and word tokens and
 * prints the character offsets of each sentence and of each word within its
 * sentence.
 *
 * <p>Usage: {@code SentenceAndTokenOffsets inputFileName}</p>
 */
public class SentenceAndTokenOffsets {
    /**
     * Command-line entry point.
     *
     * <p>Processes the file named by the first argument. When no argument is
     * given, a usage line is written to standard error. Any exception raised
     * while processing is caught here and its stack trace is printed.</p>
     *
     * @param args command-line arguments; only {@code args[0]} is used
     */
    public static void main(String[] args) {
        try {
            if (args.length == 0) {
                System.err.println("Usage: SentenceAndTokenOffsets inputFileName");
            } else {
                SentenceAndTokenOffsets.displayOffsets(args[0]);
            }
        }
        catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Prints sentence and word offsets for the text held in a file.
     *
     * <p>For every sentence, one line is written containing the sentence index,
     * its inclusive {@code [start,end]} offsets in the whitespace-normalized
     * text, and the sentence text. Each word of the sentence then gets its own
     * indented line with the word index, its inclusive offsets <em>within the
     * sentence text</em>, and the matching substring of the sentence.</p>
     *
     * @param inputFileName name of the text file to read (requested as "utf-8")
     * @throws Exception if reading, splitting, or tokenizing fails
     */
    public static void displayOffsets(String inputFileName) throws Exception {
        // Auto-flushing stream that writes to standard output encoded as utf-8.
        PrintStream utf8Out = new PrintStream(new BufferedOutputStream(System.out), true, "utf-8");

        // Replace every whitespace character with a single blank so that the
        // text no longer contains line breaks or tabs.
        String text = FileUtils.readTextFile(inputFileName, "utf-8").replaceAll("\\s", " ");

        DefaultSentenceSplitter sentenceSplitter = SentenceAndTokenOffsets.newSentenceSplitter();
        DefaultWordTokenizer wordTokenizer = new DefaultWordTokenizer();

        List<List<String>> tokenizedSentences = sentenceSplitter.extractSentences(text, wordTokenizer);
        int[] sentenceBounds = sentenceSplitter.findSentenceOffsets(text, tokenizedSentences);

        for (int sentenceIndex = 0; sentenceIndex < tokenizedSentences.size(); sentenceIndex++) {
            // Entry k is used as the start of sentence k and entry k + 1 as its
            // exclusive end, which is the form substring() expects.
            int sentenceStart = sentenceBounds[sentenceIndex];
            int sentenceEnd = sentenceBounds[sentenceIndex + 1];
            String sentenceText = text.substring(sentenceStart, sentenceEnd);
            int sentenceLast = sentenceEnd - 1;
            utf8Out.println(sentenceIndex + " [" + sentenceStart + "," + sentenceLast + "]: " + sentenceText);

            // Word offsets are looked up against the sentence text, so they
            // index into sentenceText rather than into the whole input text.
            List<String> sentenceWords = tokenizedSentences.get(sentenceIndex);
            int[] wordStarts = wordTokenizer.findWordOffsets(sentenceText, sentenceWords);
            for (int wordIndex = 0; wordIndex < sentenceWords.size(); wordIndex++) {
                int wordStart = wordStarts[wordIndex];
                int wordEnd = wordStart + sentenceWords.get(wordIndex).length();
                int wordLast = wordEnd - 1;
                String wordText = sentenceText.substring(wordStart, wordEnd);
                utf8Out.println("          " + wordIndex + " [" + wordStart + "," + wordLast + "]: " + wordText);
            }
        }
    }

    /**
     * Builds a sentence splitter whose part-of-speech guesser is configured
     * with a default word lexicon and a default suffix lexicon.
     *
     * @return the configured sentence splitter
     * @throws Exception if any of the default components fails to initialize
     */
    private static DefaultSentenceSplitter newSentenceSplitter() throws Exception {
        DefaultSentenceSplitter sentenceSplitter = new DefaultSentenceSplitter();
        DefaultPartOfSpeechGuesser guesser = new DefaultPartOfSpeechGuesser();
        guesser.setWordLexicon(new DefaultWordLexicon());
        guesser.setSuffixLexicon(new DefaultSuffixLexicon());
        sentenceSplitter.setPartOfSpeechGuesser(guesser);
        return sentenceSplitter;
    }
}

