package uk.ac.shef.dcs.jate.util;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.PrintWriter;
import java.text.Normalizer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.pdfbox.pdmodel.documentinterchange.taggedpdf.StandardStructureTypes;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationText;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
import uk.ac.shef.dcs.jate.JATEException;
import uk.ac.shef.dcs.jate.model.JATEDocument;

/* loaded from: input_file:uk/ac/shef/dcs/jate/util/ACLRDCorpusParser.class */
/**
 * Static helpers that read corpus XML files into {@link JATEDocument}s, extract gold-standard
 * terms from an XML term list, and repair words whose letters were separated by whitespace
 * (e.g. {@code "T h e o r y "}).
 */
public class ACLRDCorpusParser {
    /** One upper-case letter plus 3-10 lower-case letters, each followed by a single whitespace character. */
    private static final Pattern BROKEN_CAPITALISED_WORD = Pattern.compile("([A-Z]\\s([a-z]\\s){3,10})");

    /** One upper-case letter plus 3-10 further upper-case letters, each followed by a single whitespace character. */
    private static final Pattern BROKEN_UPPERCASE_WORD = Pattern.compile("([A-Z]\\s([A-Z]\\s){3,10})");

    /**
     * Converts every file below an input directory to a plain-text file in an output directory.
     * Each output file is named after the input file with {@code .txt} appended; a running
     * counter is printed to standard output for every file processed.
     *
     * @param strArr {@code strArr[0]} is the input directory, {@code strArr[1]} the output directory
     * @throws IllegalArgumentException if fewer than two arguments are supplied
     * @throws FileNotFoundException if an input file cannot be opened or an output file cannot be created
     * @throws JATEException if an input file cannot be read or parsed
     */
    public static void main(String[] strArr) throws FileNotFoundException, JATEException {
        if (strArr == null || strArr.length < 2) {
            throw new IllegalArgumentException("Usage: ACLRDCorpusParser <input directory> <output directory>");
        }
        String inputDir = strArr[0];
        String outputDir = strArr[1];
        int processed = 0;
        for (File file : listFileTree(new File(inputDir))) {
            processed++;
            System.out.println(processed);
            JATEDocument document;
            // try-with-resources so the input stream is released even when parsing fails.
            try (InputStream in = new FileInputStream(file)) {
                document = loadJATEDocFromXML(in);
            } catch (FileNotFoundException e) {
                throw e;
            } catch (IOException e) {
                // Only close() can end up here; keep the declared exception types unchanged.
                throw wrap("I/O Exception when closing input file!", e);
            }
            try (PrintWriter writer = new PrintWriter(outputDir + File.separator + file.getName() + ".txt")) {
                writer.println(document.getContent());
            }
        }
    }

    /**
     * Recursively collects all regular files below a directory.
     *
     * @param file the directory to scan; may be {@code null}
     * @return a mutable set of the files found; empty when {@code file} is {@code null} or its
     *         children cannot be listed (for example because it is not a directory)
     */
    public static Collection<File> listFileTree(File file) {
        Set<File> files = new HashSet<>();
        if (file == null) {
            return files;
        }
        // List once: a second call could return null if the directory vanished in between.
        File[] children = file.listFiles();
        if (children == null) {
            return files;
        }
        for (File child : children) {
            if (child.isFile()) {
                files.add(child);
            } else {
                files.addAll(listFileTree(child));
            }
        }
        return files;
    }

    /**
     * Parses one XML document with SAX and builds a {@link JATEDocument} from it.
     * The document id is the {@code id} attribute of the {@code Paper} element. The content is
     * the text of the {@code title} elements (one per line) followed by the text of all
     * paragraph elements, normalised to NFD, XML-unescaped, cleaned with
     * {@link #cleanText(String)} and trimmed. The first {@code title} that follows the start of
     * a reference element is skipped.
     *
     * @param inputStream the XML to parse; it is not closed by this method
     * @return the parsed document
     * @throws JATEException if the parser cannot be created or the input cannot be read or parsed
     */
    private static JATEDocument loadJATEDocFromXML(InputStream inputStream) throws JATEException {
        final StringBuilder paragraphs = new StringBuilder();
        final StringBuilder paperId = new StringBuilder();
        final StringBuilder titles = new StringBuilder();
        try {
            SAXParser parser = SAXParserFactory.newInstance().newSAXParser();
            parser.parse(inputStream, new DefaultHandler() {
                private boolean title = false;
                private boolean paragraph = false;
                private boolean reference = false;

                // NOTE(review): the PDFBox constants used below as element names look like a
                // decompiler substitution for plain string literals - confirm their values and
                // replace them with local literals to drop the accidental PDFBox coupling.

                @Override
                public void startElement(String uri, String localName, String qName, Attributes attributes)
                        throws SAXException {
                    if (qName.equalsIgnoreCase("Paper")) {
                        paperId.append(attributes.getValue("id"));
                    }
                    if (qName.equalsIgnoreCase("title")) {
                        this.title = true;
                    }
                    if (qName.equalsIgnoreCase(PDAnnotationText.NAME_PARAGRAPH)) {
                        this.paragraph = true;
                    }
                    if (qName.equalsIgnoreCase(StandardStructureTypes.REFERENCE)) {
                        this.reference = true;
                    }
                }

                @Override
                public void endElement(String uri, String localName, String qName) throws SAXException {
                    if (this.title && qName.equalsIgnoreCase("title")) {
                        // Terminate the title line here rather than in characters(): SAX may
                        // deliver one text node in several chunks.
                        if (!this.reference) {
                            titles.append("\n");
                        }
                        this.title = false;
                        this.reference = false;
                    }
                    if (qName.equalsIgnoreCase(PDAnnotationText.NAME_PARAGRAPH)) {
                        this.paragraph = false;
                    }
                }

                @Override
                public void characters(char[] ch, int start, int length) throws SAXException {
                    if (this.title && !this.reference) {
                        titles.append(ch, start, length);
                    }
                    if (this.paragraph) {
                        paragraphs.append(ch, start, length);
                    }
                }
            });
            String rawText = new StringBuilder().append(titles).append("\n").append(paragraphs).toString();
            String cleaned = cleanText(StringEscapeUtils.unescapeXml(Normalizer.normalize(rawText, Normalizer.Form.NFD)));
            JATEDocument document = new JATEDocument(paperId.toString());
            document.setContent(cleaned.trim());
            return document;
        } catch (IOException e) {
            throw wrap("I/O Exception when parsing input file!", e);
        } catch (ParserConfigurationException e) {
            throw wrap("Failed to initialise SAXParser!", e);
        } catch (SAXException e) {
            throw wrap("Failed to parse input XML!", e);
        }
    }

    /** Builds a {@link JATEException} whose message ends with the failure's text and whose cause is the failure. */
    private static JATEException wrap(String message, Exception cause) {
        JATEException wrapped = new JATEException(message + cause.toString());
        wrapped.initCause(cause);
        return wrapped;
    }

    /**
     * Parses one XML file with DOM and builds a {@link JATEDocument} from the direct children of
     * the root element: each {@code title} contributes its text followed by {@code ". \n"}, and
     * each {@code S} child of a {@code section} contributes its text followed by a space.
     * The document id is {@code file.toString()} and its path is {@code file.getPath()}.
     *
     * @param file the XML file to read
     * @return the parsed document with trimmed content
     * @throws IOException if the file cannot be read or closed
     * @throws SAXException if the file is not well-formed XML
     * @throws ParserConfigurationException if no DOM parser can be created
     */
    private static JATEDocument loadJATEDocFromXMLV2(File file) throws JATEException, IOException, SAXException, ParserConfigurationException {
        DocumentBuilder builder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        try (FileInputStream in = new FileInputStream(file)) {
            JATEDocument document = new JATEDocument(file.toString());
            NodeList topLevel = builder.parse(in).getDocumentElement().getChildNodes();
            StringBuilder content = new StringBuilder();
            for (int i = 0; i < topLevel.getLength(); i++) {
                Node node = topLevel.item(i);
                if (node.getNodeName().equalsIgnoreCase("title")) {
                    content.append(node.getTextContent()).append(". \n");
                } else if (node.getNodeName().equalsIgnoreCase("section")) {
                    NodeList sectionChildren = node.getChildNodes();
                    for (int j = 0; j < sectionChildren.getLength(); j++) {
                        Node sentence = sectionChildren.item(j);
                        if (sentence.getNodeName().equalsIgnoreCase("S")) {
                            content.append(sentence.getTextContent()).append(" ");
                        }
                    }
                }
            }
            document.setContent(content.toString().trim());
            document.setPath(file.getPath());
            return document;
        }
    }

    /**
     * Collects the trimmed text of every {@code term} element in an XML document.
     * Terms of two characters or fewer are dropped.
     *
     * @param str system identifier (URI) of the XML document, as accepted by {@link InputSource}
     * @return the distinct terms longer than two characters
     * @throws XPathExpressionException if the document cannot be evaluated
     */
    public static Set<String> extractGoldstandardTermsV2(String str) throws ParserConfigurationException, IOException, SAXException, XPathExpressionException {
        NodeList terms = (NodeList) XPathFactory.newInstance().newXPath().evaluate("//term", new InputSource(str), XPathConstants.NODESET);
        Set<String> result = new HashSet<>();
        for (int i = 0; i < terms.getLength(); i++) {
            String term = terms.item(i).getTextContent().trim();
            if (term.length() > 2) {
                result.add(term);
            }
        }
        return result;
    }

    /**
     * Repairs every broken word reported by {@link #extractBrokenWords(String)}.
     *
     * @param str the text to clean
     * @return the text with each broken word joined by {@link #fixBrokenWords(String, String)}
     */
    public static String cleanText(String str) {
        String cleaned = str;
        for (String brokenWord : extractBrokenWords(str)) {
            cleaned = fixBrokenWords(cleaned, brokenWord);
        }
        return cleaned;
    }

    /**
     * Finds words whose letters are separated by single whitespace characters.
     *
     * @param str the text to scan
     * @return the matched fragments (including their trailing whitespace): first all
     *         capitalised matches, then all upper-case matches, each in order of appearance
     */
    public static List<String> extractBrokenWords(String str) {
        List<String> brokenWords = new ArrayList<>();
        Matcher capitalised = BROKEN_CAPITALISED_WORD.matcher(str);
        while (capitalised.find()) {
            brokenWords.add(capitalised.group());
        }
        Matcher upperCase = BROKEN_UPPERCASE_WORD.matcher(str);
        while (upperCase.find()) {
            brokenWords.add(upperCase.group());
        }
        return brokenWords;
    }

    /**
     * Replaces every occurrence of a broken word with its joined form: the broken word with all
     * space characters removed, followed by one space.
     *
     * @param str the text to repair
     * @param str2 the broken word, matched literally
     * @return the repaired text
     */
    public static String fixBrokenWords(String str, String str2) {
        // Literal replace: the broken word is text, not a regular expression, and the
        // replacement must not be interpreted as a regex replacement template.
        String joined = str2.replace(" ", "").concat(" ");
        return str.replace(str2, joined);
    }
}
