package edu.pitt.dbmi.nlp.noble.coder.processor;

import edu.pitt.dbmi.nlp.noble.coder.model.Document;
import edu.pitt.dbmi.nlp.noble.coder.model.Processor;
import edu.pitt.dbmi.nlp.noble.coder.model.Section;
import edu.pitt.dbmi.nlp.noble.coder.model.Sentence;
import edu.pitt.dbmi.nlp.noble.coder.model.Spannable;
import edu.pitt.dbmi.nlp.noble.tools.SentenceDetector;
import edu.pitt.dbmi.nlp.noble.tools.SynopticReportDetector;
import edu.pitt.dbmi.nlp.noble.tools.TextTools;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/pitt/dbmi/nlp/noble/coder/processor/DocumentProcessor.class */
/**
 * Splits the raw text of a {@link Document} into {@link Section}s and {@link Sentence}s.
 *
 * <p>Sectioning is driven by a regular expression selected by the document type. Each
 * sectioning pattern must expose the section title as group 1 and the section body as
 * group 2. Text is then cut into sentences: chunks that look like prose are handed to
 * {@link SentenceDetector}, everything else is kept as one sentence per line/chunk.
 *
 * <p>Instances keep the duration of the last {@link #process(Document)} call in a field and
 * are therefore not safe for concurrent use.
 */
public class DocumentProcessor implements Processor<Document> {
    /** Suffix of the sectioning-map key used for the abstract ("AB") field of a Medline record. */
    private static final String AB = "AB";

    /** Key under which the sub-sectioning pattern for Medline abstracts is registered. */
    private static final String MEDLINE_ABSTRACT_KEY = "Medline Record-" + AB;

    /** Flags shared by the line-anchored patterns whose body group must span line breaks. */
    private static final int MULTILINE_DOTALL = Pattern.MULTILINE | Pattern.DOTALL;

    /** Matches single-line text containing a lower-case word, a period and a capitalized word. */
    private static final Pattern PROSE_PATTERN = Pattern.compile(".*\\b[a-z]+\\.\\s+[A-Z][a-z]+\\b.*");

    /** Looser prose test applied to the last chunk of a report: "x. Y" anywhere in the text. */
    private static final Pattern SENTENCE_BREAK_PATTERN = Pattern.compile("[a-z]\\.\\s*[A-Z]");

    /** Text that starts with a two-letter upper-case tag followed by two spaces and a dash. */
    private static final Pattern MEDLINE_START_PATTERN = Pattern.compile("(?s)^[A-Z]{2}  - .*");

    /** Captures leading whitespace (group 1) of text whose first non-blank character is a word character. */
    private static final Pattern LEADING_WHITESPACE_PATTERN = Pattern.compile("^(\\s+)\\w.*", MULTILINE_DOTALL);

    /** "Name: value" or "Name.... value" pairs; group 1 is the name, group 2 the value. */
    private static final Pattern PROPERTY_PATTERN =
            Pattern.compile("([A-Z][A-Za-z /]{3,25})(?:\\.{2,}|\\:)(.{2,25})");

    /** A line that ends in a word or a number, optionally followed by a comma. */
    private static final Pattern UNFINISHED_LINE_PATTERN = Pattern.compile(".+\\s([A-Z]?[a-z]+|\\d+),?");

    /** A line that starts with a word and has more text after it. */
    private static final Pattern CONTINUATION_LINE_PATTERN = Pattern.compile("([A-Z]?[a-z]+)\\b.+");

    private String documentType;
    private final Map<String, Pattern> sectioningMap;
    private long time;

    /** Creates a processor for documents of type {@code Document.TYPE_MEDICAL_REPORT}. */
    public DocumentProcessor() {
        this(Document.TYPE_MEDICAL_REPORT);
    }

    /**
     * Creates a processor for the given document type and registers the built-in
     * sectioning patterns.
     *
     * @param str document type assigned to documents created by this processor
     */
    public DocumentProcessor(String str) {
        // Assigned directly so that construction does not depend on an overridable method.
        this.documentType = str;
        this.sectioningMap = new HashMap<String, Pattern>();
        this.sectioningMap.put(Document.TYPE_MEDICAL_REPORT,
                Pattern.compile("^([A-Z/\\- ]{5,40}:)\\s+(.*)", MULTILINE_DOTALL));
        this.sectioningMap.put(Document.TYPE_MEDLINE_RECORD,
                Pattern.compile("^([A-Z]{2})\\s+\\-\\s+(.*)", MULTILINE_DOTALL));
        this.sectioningMap.put(MEDLINE_ABSTRACT_KEY, Pattern.compile("(?:,\\s*)?([A-Z ]+\\:)\\s+(.*)"));
    }

    /**
     * @return the document type assigned to documents created by this processor
     */
    public String getDocumentType() {
        return this.documentType;
    }

    /**
     * @param str the document type to assign to documents created by this processor
     */
    public void setDocumentType(String str) {
        this.documentType = str;
    }

    /**
     * Guesses the document type from the beginning of the text.
     *
     * @param str document text, must not be {@code null}
     * @return {@code Document.TYPE_MEDLINE_RECORD} when the text starts with a two-letter
     *         upper-case tag followed by two spaces, a dash and a space; otherwise
     *         {@code Document.TYPE_MEDICAL_REPORT}
     */
    public static String suggestDocumentType(String str) {
        return MEDLINE_START_PATTERN.matcher(str).matches()
                ? Document.TYPE_MEDLINE_RECORD
                : Document.TYPE_MEDICAL_REPORT;
    }

    /**
     * Processes a {@code .txt} file, or every {@code .txt} file below a directory, and writes
     * the detected sections next to each input as {@code <name>.sectioned}.
     *
     * @param file a file or directory
     * @throws Exception if reading the input or writing the output fails
     */
    public void processFile(File file) throws Exception {
        if (file.isDirectory()) {
            // listFiles() returns null when the directory cannot be read.
            File[] children = file.listFiles();
            if (children != null) {
                for (File child : children) {
                    processFile(child);
                }
            }
            return;
        }
        if (!file.getName().endsWith(".txt")) {
            return;
        }
        System.out.println(file.getName());
        File output = new File(file.getAbsolutePath() + ".sectioned");
        // try-with-resources closes the writer even when processing or writing fails.
        try (BufferedWriter writer = new BufferedWriter(new FileWriter(output))) {
            for (Section section : process(file).getSections()) {
                writer.write("--------------\n[" + section.getTitle() + "]\n--------------\n"
                        + section.getBody() + "--------------\n");
            }
        }
    }

    /**
     * Reads a file and processes its content. The file name becomes the document title and
     * the absolute path its location.
     *
     * @param file file to read
     * @return the processed document
     * @throws FileNotFoundException if the file cannot be opened
     * @throws IOException if reading the file fails
     */
    public Document process(File file) throws FileNotFoundException, IOException {
        String text;
        // The stream is opened here, so it is closed here as well.
        try (InputStream input = new FileInputStream(file)) {
            text = TextTools.getText(input);
        }
        // Build the document first and run the pipeline exactly once; previously the text
        // was processed, the metadata set, and the same document processed a second time.
        Document document = new Document();
        document.setText(text);
        document.setTitle(file.getName());
        document.setLocation(file.getAbsolutePath());
        document.setDocumentType(getDocumentType());
        return process(document);
    }

    /**
     * Reads all text from the stream and processes it.
     *
     * @param inputStream stream to read; this method does not close it itself
     * @return the processed document
     * @throws IOException if reading the stream fails
     */
    public Document process(InputStream inputStream) throws IOException {
        return process(TextTools.getText(inputStream));
    }

    /**
     * Wraps the text into a new document of this processor's document type and processes it.
     *
     * @param str document text
     * @return the processed document
     */
    public Document process(String str) {
        Document document = new Document();
        document.setText(str);
        document.setDocumentType(getDocumentType());
        return process(document);
    }

    /**
     * Sections the document with the pattern registered for its document type, if any.
     * Leaves the document's sections untouched when the type is {@code null} or unknown.
     */
    private void applySectioning(Document document, String text) {
        String type = document.getDocumentType();
        Pattern pattern = type != null ? this.sectioningMap.get(type) : null;
        if (pattern != null) {
            document.setSections(section(text, 0, pattern, pattern.matcher(text), new ArrayList<Section>()));
        }
    }

    /**
     * Sections a Medline record and parses each section body: bodies that look like prose go
     * through sentence detection, all others are parsed line by line.
     */
    private void processMedline(Document document) {
        applySectioning(document, document.getText());
        for (Section section : document.getSections()) {
            String body = section.getBody();
            if (PROSE_PATTERN.matcher(body.trim()).matches()) {
                parseSentences(document, body, section.getBodyOffset(), Sentence.TYPE_PROSE);
            } else {
                int lineOffset = section.getBodyOffset();
                for (String line : body.split("\n")) {
                    parseSentences(document, line, lineOffset, Sentence.TYPE_LINE);
                    // +1 accounts for the line break removed by split().
                    lineOffset += line.length() + 1;
                }
            }
        }
        // NOTE(review): the list passed here is always empty; the call is kept only because
        // Document.addSections is not visible from this file - confirm it is a no-op before removing.
        document.addSections(new ArrayList<Section>());
    }

    /**
     * Sections a report and parses its text chunk by chunk. Consecutive lines are collected
     * into one chunk for as long as {@link #mergeLines(String, String)} accepts them; a chunk
     * is parsed as prose when it spans several lines or looks like prose, otherwise as a line.
     */
    private void processReport(Document document) {
        String text = document.getText();
        applySectioning(document, text);

        int position = 0;   // offset of the line currently being read
        int chunkStart = 0; // offset at which the pending chunk begins
        StringBuilder chunk = new StringBuilder();
        String previousLine = null;
        for (String line : text.split("\n")) {
            if (!mergeLines(previousLine, line)) {
                String pending = chunk.toString();
                String trimmed = pending.trim();
                // A blank chunk is not flushed; it stays in front of the following lines.
                if (!trimmed.isEmpty()) {
                    boolean prose = trimmed.contains("\n") || PROSE_PATTERN.matcher(trimmed).matches();
                    parseSentences(document, pending, chunkStart,
                            prose ? Sentence.TYPE_PROSE : Sentence.TYPE_LINE);
                    chunk.setLength(0);
                    chunkStart = position;
                }
            }
            chunk.append(line).append('\n');
            position += line.length() + 1;
            previousLine = line;
        }

        if (chunk.length() > 0) {
            String pending = chunk.toString();
            // The last chunk deliberately uses the looser "x. Y" test, as the original code did.
            boolean prose = pending.trim().contains("\n") || SENTENCE_BREAK_PATTERN.matcher(pending).find();
            parseSentences(document, pending, chunkStart, prose ? Sentence.TYPE_PROSE : Sentence.TYPE_LINE);
        }
    }

    /**
     * Parses the document text into sections and sentences, marks the document as parsed and
     * records the elapsed time in the document's process-time map under this class's simple name.
     *
     * @param document document whose text and document type are already set
     * @return the same document instance
     */
    @Override // edu.pitt.dbmi.nlp.noble.coder.model.Processor
    public Document process(Document document) {
        // A local start time keeps getProcessTime() from exposing a timestamp mid-run.
        long start = System.currentTimeMillis();
        if (Document.TYPE_MEDLINE_RECORD.equals(document.getDocumentType())) {
            processMedline(document);
        } else {
            processReport(document);
        }
        document.setDocumentStatus(Document.STATUS_PARSED);
        this.time = System.currentTimeMillis() - start;
        document.getProcessTime().put(getClass().getSimpleName(), Long.valueOf(this.time));
        return document;
    }

    /**
     * Turns a chunk of text into sentences and attaches them to the document and, when one is
     * found for the first sentence, to its section.
     *
     * @param document document being processed
     * @param rawText chunk of document text
     * @param rawOffset offset of {@code rawText} within the document text
     * @param sentenceType {@code Sentence.TYPE_PROSE} to run sentence detection; any other
     *        value keeps the chunk as a single sentence
     */
    private void parseSentences(Document document, String rawText, int rawOffset, String sentenceType) {
        String text = rawText;
        int offset = rawOffset;

        // Drop leading whitespace and shift the offset so positions still point into the document.
        Matcher indent = LEADING_WHITESPACE_PATTERN.matcher(text);
        if (indent.matches()) {
            int skipped = indent.group(1).length();
            text = text.substring(skipped);
            offset += skipped;
        }

        List<Sentence> sentences;
        if (Sentence.TYPE_PROSE.equals(sentenceType)) {
            sentences = SentenceDetector.getSentences(text, offset);
        } else {
            Sentence line = new Sentence(text, offset, Sentence.TYPE_LINE);
            parseProperties(document, text);
            if (SynopticReportDetector.detect(text)) {
                line.setSentenceType(Sentence.TYPE_WORKSHEET);
            }
            sentences = new ArrayList<Sentence>();
            sentences.add(line);
        }

        if (!sentences.isEmpty()) {
            Sentence first = sentences.get(0);
            Section section = document.getSection(first);
            if (section != null) {
                if (first.contains(section.getTitleSpan())) {
                    // Position of the end of the section title, relative to the chunk start.
                    int titleEnd = section.getTitleSpan().getEndPosition() - offset;
                    String firstText = first.getText();
                    String header = firstText.substring(0, titleEnd);
                    String remainder = firstText.substring(titleEnd);
                    if (remainder.trim().length() > 0) {
                        // Split the first sentence into a header sentence and the text after it.
                        sentences.set(0, new Sentence(remainder, offset + titleEnd, first.getSentenceType()));
                        sentences.add(0, new Sentence(header, offset, Sentence.TYPE_HEADER));
                    } else {
                        first.setSentenceType(Sentence.TYPE_HEADER);
                    }
                }
                section.addSentences(sentences);
            }
        }
        document.addSentences(sentences);
    }

    /**
     * Stores every "Name: value" / "Name.... value" pair found in the text as a document
     * property; names and values are trimmed.
     */
    private void parseProperties(Document document, String text) {
        Matcher property = PROPERTY_PATTERN.matcher(text);
        while (property.find()) {
            document.getProperties().put(property.group(1).trim(), property.group(2).trim());
        }
    }

    /**
     * @return duration in milliseconds of the most recent {@link #process(Document)} call
     */
    @Override // edu.pitt.dbmi.nlp.noble.coder.model.Processor
    public long getProcessTime() {
        return this.time;
    }

    /**
     * Decides whether a line continues the previous one.
     *
     * @param previous the preceding line, or {@code null} for the first line
     * @param current the line being examined
     * @return {@code true} when {@code previous} is not detected as a synoptic report line,
     *         ends in a word or number (optionally followed by a comma) and {@code current}
     *         starts with a word followed by more text
     */
    private boolean mergeLines(String previous, String current) {
        if (previous == null || SynopticReportDetector.detect(previous)) {
            return false;
        }
        return UNFINISHED_LINE_PATTERN.matcher(previous).matches()
                && CONTINUATION_LINE_PATTERN.matcher(current).matches();
    }

    /**
     * Collects sections from the matches of a sectioning pattern.
     *
     * <p>For each match a section is created from group 1 (title) and group 2 (body). If the
     * body itself contains a further match of the pattern, the section text and body are cut
     * off where that match begins. The body is then scanned recursively, so headers found
     * inside it are appended as sections of their own.
     *
     * @param text the complete document text; all offsets stored in sections refer to it
     * @param offset position within {@code text} of the input that {@code matcher} runs on
     * @param pattern the sectioning pattern
     * @param matcher matcher of {@code pattern} over the input to scan
     * @param sections list that receives the sections
     * @return {@code sections}
     */
    private List<Section> section(String text, int offset, Pattern pattern, Matcher matcher, List<Section> sections) {
        while (matcher.find()) {
            int titleStart = offset + matcher.start();
            int bodyStart = offset + matcher.start(2);
            String sectionText = matcher.group();
            String title = matcher.group(1);
            String body = matcher.group(2);

            Matcher nested = pattern.matcher(body);
            if (nested.find()) {
                // The body runs into the next header: end this section where that header starts.
                int nextHeaderStart = bodyStart + nested.start();
                sectionText = text.substring(titleStart, nextHeaderStart);
                body = text.substring(bodyStart, nextHeaderStart);
            }

            Section section = new Section();
            section.setText(sectionText);
            section.setTitle(title);
            section.setTitleOffset(titleStart);
            section.setBody(body);
            section.setBodyOffset(bodyStart);
            sections.add(section);

            // Rewind and walk the matched body again to pick up the sections it contains.
            nested.reset();
            section(text, bodyStart, pattern, nested, sections);
        }
        return sections;
    }
}
