package dragon.onlinedb.trec;

import dragon.nlp.Token;
import dragon.onlinedb.Article;
import dragon.onlinedb.ArticleParser;
import dragon.onlinedb.BasicArticle;
import dragon.util.SortedArray;
import java.util.Date;
import org.apache.maven.scm.ChangeSet;
import ucar.nc2.iosp.grads.GradsDataDescriptorFile;

/* loaded from: input_file:dragon/onlinedb/trec/SgmArticleParser.class */
/**
 * Parses a single SGML-style document (e.g. {@code <DOC> ... <DOCNO>..</DOCNO> ... <TEXT>..</TEXT>})
 * held in a string into an {@link Article}.
 *
 * <p>Parsing is two-phase: {@link #collectTagInformation(String)} first indexes every opening tag
 * found in the raw text into {@link #tagList}; the {@code extract*} methods then look tags up in
 * that index and cut the corresponding content out of the raw text.
 *
 * <p>The tag index is stored in an instance field that is overwritten by every call to
 * {@link #parse(String)}.
 */
public class SgmArticleParser implements ArticleParser {
    /** Characters accepted as the end of a sentence. */
    private static final String SENTENCE_TERMINATORS = ".!;?";

    /** Titles and abstracts shorter than this many characters are discarded. */
    private static final int MIN_SUMMARY_LENGTH = 5;

    /** A body is only returned when it is longer than this many characters. */
    private static final int MIN_BODY_LENGTH = 40;

    /** Text fragments of at most this many characters are dropped by {@link #removeTag(String)}. */
    private static final int MAX_NOISE_LENGTH = 10;

    /** Fragments longer than this get a terminating period appended when they lack one. */
    private static final int MIN_SENTENCE_LENGTH = 40;

    /** Size/offset threshold used to decide whether a fragment contains sentence breaks. */
    private static final int SENTENCE_WINDOW = 400;

    /** Candidate tag names, in priority order, for each article field. */
    private static final String[] TITLE_TAGS = {"HEAD", "HEADLINE", "HL", "TITLE", "TI"};
    private static final String[] ABSTRACT_TAGS = {"LP", "LEADPARA"};
    private static final String[] META_TAGS = {"DESCRIPT", "IN"};

    /** Index of the opening tags of the document currently being parsed. */
    protected SortedArray tagList;

    /**
     * Not implemented.
     *
     * @param article ignored
     * @return always {@code null}
     */
    @Override // dragon.onlinedb.ArticleParser
    public String assemble(Article article) {
        return null;
    }

    /**
     * Builds an article from the raw document text.
     *
     * @param str raw document text
     * @return the parsed article; {@code null} when no tag could be indexed, or when an exception
     *     occurred before a non-null key had been set on the article. If an exception occurs after
     *     the key was set, the partially filled article is returned.
     */
    @Override // dragon.onlinedb.ArticleParser
    public Article parse(String str) {
        BasicArticle article = null;
        try {
            this.tagList = collectTagInformation(str);
            if (this.tagList == null || this.tagList.size() == 0) {
                return null;
            }
            article = new BasicArticle();
            article.setKey(extractDocNo(str));
            article.setTitle(extractTitle(str));
            article.setAbstract(extractAbstract(str));
            article.setMeta(extractMeta(str));
            article.setLength(extractLength(str));
            article.setDate(extractDate(str));
            article.setBody(extractBody(str));
            return article;
        } catch (Exception e) {
            e.printStackTrace();
            // The article may not have been created yet when the failure happened.
            if (article != null && article.getKey() != null) {
                return article;
            }
            return null;
        }
    }

    /**
     * Hook for subclasses; this implementation does not extract a length.
     *
     * @param str raw document text (unused)
     * @return always {@code 0}
     */
    protected int extractLength(String str) {
        return 0;
    }

    /**
     * Hook for subclasses; this implementation does not extract a date.
     *
     * @param str raw document text (unused)
     * @return always {@code null}
     */
    protected Date extractDate(String str) {
        return null;
    }

    /**
     * Extracts the trimmed content of the {@code DOCNO} tag.
     *
     * @param str raw document text
     * @return the document number, or {@code null} when the tag is absent or has no closing tag
     */
    protected String extractDocNo(String str) {
        Token docNoTag = getDocNoTag();
        if (docNoTag == null) {
            return null;
        }
        String docNo = getTagContent(str, docNoTag, false);
        // getTagContent yields null when the closing tag is missing; do not dereference it.
        return docNo == null ? null : docNo.trim();
    }

    /**
     * @return the indexed {@code DOCNO} tag, or {@code null} when the document has none
     */
    protected Token getDocNoTag() {
        return findFirstTag(new String[] {"DOCNO"});
    }

    /**
     * Extracts the title. For an {@code HL} tag everything from the first {@code "----"} onwards is
     * cut off. A period is appended when the title does not end with a sentence terminator.
     *
     * @param str raw document text
     * @return the title, or {@code null} when no title tag exists or the text is shorter than five
     *     characters
     */
    protected String extractTitle(String str) {
        Token titleTag = getTitleTag();
        if (titleTag == null) {
            return null;
        }
        StringBuffer title = new StringBuffer();
        getTagContent(str, titleTag.getName(), titleTag.getIndex(), title);
        if ("HL".equals(titleTag.getName())) {
            int separator = title.indexOf("----");
            if (separator >= 0) {
                title.delete(separator, title.length());
            }
        }
        return terminateSentence(title);
    }

    /**
     * @return the first indexed tag among {@code HEAD}, {@code HEADLINE}, {@code HL},
     *     {@code TITLE} and {@code TI}, or {@code null} when none is present
     */
    protected Token getTitleTag() {
        return findFirstTag(TITLE_TAGS);
    }

    /**
     * Extracts the abstract (lead paragraph). A period is appended when the text does not end with
     * a sentence terminator.
     *
     * @param str raw document text
     * @return the abstract, or {@code null} when no abstract tag exists or the text is shorter than
     *     five characters
     */
    protected String extractAbstract(String str) {
        Token abstractTag = getAbstractTag();
        if (abstractTag == null) {
            return null;
        }
        StringBuffer summary = new StringBuffer();
        getTagContent(str, abstractTag.getName(), abstractTag.getIndex(), summary);
        return terminateSentence(summary);
    }

    /**
     * @return the first indexed tag among {@code LP} and {@code LEADPARA}, or {@code null}
     */
    protected Token getAbstractTag() {
        return findFirstTag(ABSTRACT_TAGS);
    }

    /**
     * Extracts the meta information of the document.
     *
     * @param str raw document text
     * @return the meta text, or {@code null} when no meta tag exists or its content is empty
     */
    protected String extractMeta(String str) {
        Token metaTag = getMetaTag();
        if (metaTag == null) {
            return null;
        }
        StringBuffer meta = new StringBuffer();
        getTagContent(str, metaTag.getName(), metaTag.getIndex(), meta);
        return meta.length() >= 1 ? meta.toString() : null;
    }

    /**
     * @return the first indexed tag among {@code DESCRIPT} and {@code IN}, or {@code null}
     */
    protected Token getMetaTag() {
        return findFirstTag(META_TAGS);
    }

    /**
     * Extracts the body by concatenating the content of every occurrence of the body tag, starting
     * at the indexed occurrence.
     *
     * @param str raw document text
     * @return the body, or {@code null} when no body tag exists or the collected text is not longer
     *     than 40 characters
     */
    protected String extractBody(String str) {
        Token bodyTag = getBodyTag();
        if (bodyTag == null) {
            return null;
        }
        StringBuffer body = new StringBuffer();
        String tagName = bodyTag.getName();
        int searchFrom = bodyTag.getIndex();
        int next = getTagContent(str, tagName, searchFrom, body);
        // getTagContent returns a position past searchFrom as long as another opening tag was
        // found, and a negative value once there is none left.
        while (next > searchFrom) {
            searchFrom = next;
            next = getTagContent(str, tagName, searchFrom, body);
        }
        return body.length() > MIN_BODY_LENGTH ? body.toString() : null;
    }

    /**
     * @return the indexed {@code TEXT} tag, or {@code null} when the document has none
     */
    protected Token getBodyTag() {
        return findFirstTag(new String[] {"TEXT"});
    }

    /**
     * Appends the tag-stripped content of the first {@code <str2>...</str2>} element found at or
     * after position {@code i} to {@code stringBuffer}, separated from existing buffer content by a
     * single space.
     *
     * @param str raw document text
     * @param str2 tag name, without angle brackets
     * @param i position in {@code str} at which to start searching for the opening tag
     * @param stringBuffer receives the content
     * @return a negative value when no opening tag is found; the position right after the opening
     *     tag when the closing tag is missing (nothing is appended); otherwise the position right
     *     after the closing tag
     */
    protected int getTagContent(String str, String str2, int i, StringBuffer stringBuffer) {
        int openPos = str.indexOf("<" + str2 + ">", i);
        if (openPos < 0) {
            return openPos;
        }
        int contentStart = openPos + 2 + str2.length();
        int closePos = str.indexOf("</" + str2 + ">", contentStart);
        if (closePos < 0) {
            return contentStart;
        }
        if (stringBuffer.length() > 0) {
            stringBuffer.append(' ');
        }
        stringBuffer.append(removeTag(str.substring(contentStart, closePos)));
        return closePos + 3 + str2.length();
    }

    /**
     * Returns the content of the indexed occurrence of the named tag.
     *
     * @param str raw document text
     * @param str2 tag name, without angle brackets
     * @param z whether nested markup should be stripped with {@link #removeTag(String)}
     * @return the content, or {@code null} when the tag is not indexed or has no closing tag
     */
    public String getTagContent(String str, String str2, boolean z) {
        int position = this.tagList.binarySearch(new Token(str2));
        if (position < 0) {
            return null;
        }
        return contentOf(str, (Token) this.tagList.get(position), z);
    }

    /**
     * Returns the content of the given indexed tag.
     *
     * @param str raw document text
     * @param token indexed tag; may be {@code null}
     * @param z whether nested markup should be stripped with {@link #removeTag(String)}
     * @return the content, or {@code null} when {@code token} is {@code null} or the tag has no
     *     closing tag
     */
    protected String getTagContent(String str, Token token, boolean z) {
        if (token == null) {
            return null;
        }
        return contentOf(str, token, z);
    }

    /**
     * Strips markup from a text fragment. Every run of text between tags is passed through the
     * fragment clean-up (short runs are dropped, see the private helper) and the surviving runs are
     * joined with spaces.
     *
     * @param str text possibly containing {@code <...>} markup
     * @return the cleaned text
     */
    public String removeTag(String str) {
        StringBuffer cleaned = new StringBuffer();
        int textStart = 0;
        int tagStart;
        while ((tagStart = str.indexOf('<', textStart)) >= 0) {
            if (tagStart > textStart) {
                cleaned.append(processTagContent(str.substring(textStart, tagStart)));
                cleaned.append(' ');
            }
            int tagEnd = str.indexOf('>', tagStart);
            if (tagEnd < 0) {
                // Unclosed '<': the text before it has already been emitted, so the trailing
                // segment must start here rather than repeat that text.
                textStart = tagStart;
                break;
            }
            textStart = tagEnd + 1;
        }
        if (textStart < str.length()) {
            cleaned.append(processTagContent(str.substring(textStart).trim()));
        }
        return cleaned.toString();
    }

    /**
     * Indexes every opening tag of the document. The text between {@code <} and the next
     * {@code >} is used as the tag name; closing tags (starting with {@code </}) are skipped. When
     * a name is already present in the index, the frequency of the existing entry is incremented
     * instead.
     *
     * @param str raw document text
     * @return the tag index, or {@code null} (after printing {@code "Invalid SGM format!"}) when the
     *     text is {@code null}, ends with a dangling {@code <}, contains an opening tag without
     *     {@code >}, or indexing fails with an exception
     */
    protected SortedArray collectTagInformation(String str) {
        try {
            SortedArray tags = new SortedArray(30);
            int open = str.indexOf('<');
            while (open >= 0) {
                if (open + 1 >= str.length()) {
                    return reportInvalidFormat();
                }
                int resumeAt;
                if (str.charAt(open + 1) != '/') {
                    int close = str.indexOf('>', open);
                    if (close < 0) {
                        return reportInvalidFormat();
                    }
                    if (!tags.add(new Token(str.substring(open + 1, close), open, 1))) {
                        ((Token) tags.get(tags.insertedPos())).addFrequency(1);
                    }
                    resumeAt = close + 1;
                } else {
                    resumeAt = open + 1;
                }
                open = str.indexOf('<', resumeAt);
            }
            return tags;
        } catch (Exception e) {
            return reportInvalidFormat();
        }
    }

    /** Prints the invalid-format message and yields the {@code null} result used by the indexer. */
    private SortedArray reportInvalidFormat() {
        System.out.println("Invalid SGM format!");
        return null;
    }

    /** Returns the indexed tag for the first of {@code names} present in the index, else null. */
    private Token findFirstTag(String[] names) {
        for (int k = 0; k < names.length; k++) {
            int position = this.tagList.binarySearch(new Token(names[k]));
            if (position >= 0) {
                return (Token) this.tagList.get(position);
            }
        }
        return null;
    }

    /** Cuts the text between the token's opening tag and the next matching closing tag. */
    private String contentOf(String str, Token token, boolean stripTags) {
        String name = token.getName();
        int contentStart = token.getIndex() + 2 + name.length();
        // The closing tag is built from the same name that sized the opening-tag offset.
        int closePos = str.indexOf("</" + name + ">", contentStart);
        if (closePos < 0) {
            return null;
        }
        String content = str.substring(contentStart, closePos);
        return stripTags ? removeTag(content) : content;
    }

    /** Returns null for too-short text; otherwise the text, ending in a sentence terminator. */
    private String terminateSentence(StringBuffer text) {
        if (text.length() < MIN_SUMMARY_LENGTH) {
            return null;
        }
        if (SENTENCE_TERMINATORS.indexOf(text.charAt(text.length() - 1)) < 0) {
            text.append('.');
        }
        return text.toString();
    }

    /**
     * Cleans one run of text found between tags: runs of at most ten characters are dropped, long
     * runs without sentence breaks get their newlines turned into sentence breaks, and runs longer
     * than 40 characters are terminated with a period when needed.
     */
    private String processTagContent(String str) {
        if (str.length() <= MAX_NOISE_LENGTH) {
            return "";
        }
        String text = str;
        if (text.length() >= SENTENCE_WINDOW && !containSentence(text)) {
            text = text.replace("\n", ". ");
        }
        String normalized = replacement(text);
        if (normalized.length() > MIN_SENTENCE_LENGTH
                && SENTENCE_TERMINATORS.indexOf(normalized.charAt(normalized.length() - 1)) < 0) {
            normalized = normalized + ".";
        }
        return normalized;
    }

    /** Decodes {@code &amp;}, normalizes `` and '' quotes, flattens line breaks and trims. */
    private String replacement(String str) {
        return str.replace("&amp;", "&")
                .replace("''", "\"")
                .replace("``", "\"")
                .replace('\r', ' ')
                .replace('\n', ' ')
                .trim();
    }

    /** Tells whether the first ". ", ".\r" or ".\n" occurs at an offset of at most 400. */
    private boolean containSentence(String str) {
        if (str == null) {
            return false;
        }
        String[] breaks = {". ", ".\r", ".\n"};
        for (int k = 0; k < breaks.length; k++) {
            int position = str.indexOf(breaks[k]);
            if (position >= 0 && position <= SENTENCE_WINDOW) {
                return true;
            }
        }
        return false;
    }
}
