package com.mdfromhtml.markdown.transform;

import com.api.json.JSON;
import com.api.json.JSONArray;
import com.api.json.JSONArtifact;
import com.api.json.JSONObject;
import com.mdfromhtml.core.MDfromHTMLUtils;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.nio.file.FileSystems;
import java.nio.file.InvalidPathException;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import org.apache.commons.text.StringSubstitutor;
import org.apache.commons.text.lookup.StringLookupFactory;

/* loaded from: input_file:com/mdfromhtml/markdown/transform/ExtractHTMLJSON.class */
/**
 * Splits html capture files into individual htmljson files.
 *
 * <p>Each input file (selected by extension {@link #_ext} from the input
 * directory) is read line by line. Lines are accumulated until a line that
 * starts with <code>}</code> is seen; the accumulated text is then parsed as a
 * JSON object, filtered by {@link #filterContent(JSONObject)} and written to
 * the output directory as <code>htmljson_NNNN.json</code> (or with an added
 * <code>.rejected</code> suffix when nothing usable remains).
 *
 * <p>Command line arguments (all optional, missing ones are prompted for):
 * <ol>
 * <li>input directory</li>
 * <li>output directory (must already exist)</li>
 * <li>starting file suffix (positive integer)</li>
 * <li>verbose flag ({@code true}/{@code false})</li>
 * </ol>
 */
public class ExtractHTMLJSON {
    /** Prefix of a line that opens a captured JSON record. */
    private static final String RECORD_START = "{";

    /** Prefix of a line that closes a captured JSON record. */
    private static final String RECORD_END = "}";

    /** JSON key holding the URL of a capture. */
    private static final String URL_KEY = "url";

    /** Name of the filter file expected in the input directory. */
    private static final String REJECT_STRINGS_FILE = "RejectStrings.txt";

    /** Extension of the capture files to read. */
    String _ext = "text";

    /** Directory the capture files are read from. */
    Path _inputPath = null;

    /** True when at least one parameter had to be prompted for. */
    boolean _interactive = false;

    /** Directory (with trailing separator) the htmljson files are written to. */
    String _outputPath = ".";

    /** When true, progress messages are printed to standard output. */
    boolean _thumbsucker = false;

    /** File name prefix of every generated file. */
    String _filePrefix = "htmljson_";

    /** Numeric suffix used for the next generated file. */
    int _fileCounter = 1;

    /** Lower-cased reject strings loaded from {@value #REJECT_STRINGS_FILE}. */
    List<String> _filters = new ArrayList<>();

    /** URLs (without trailing slash) that have already been accepted for filtering. */
    Set<String> _processedURLs = new HashSet<>();

    /**
     * Program entry point. Exits with status 0 on success and -1 when the
     * parameters are invalid or a file could not be processed.
     *
     * @param strArr command line arguments, see the class description
     */
    public static void main(String[] strArr) {
        int exitCode = 0;
        ExtractHTMLJSON extractor = new ExtractHTMLJSON();
        if (extractor.getParams(strArr)) {
            if (extractor._thumbsucker) {
                System.out.println("\nFiles ending with ." + extractor._ext + " will be read from " + extractor._inputPath + "\nand the generated htmljson files (.json) saved in " + extractor._outputPath);
                System.out.println("\nFilter strings used to check html for bad pages:");
                for (String filter : extractor._filters) {
                    System.out.println(filter);
                }
                System.out.println();
            }
            if (extractor._interactive) {
                String answer = MDfromHTMLUtils.prompt("Press q to quit or press Enter to continue...");
                // Only an empty answer (plain Enter) continues; a missing answer is
                // treated the same way the parameter prompts treat it (use default).
                if (answer == null || answer.length() == 0) {
                    extractor._interactive = false;
                }
            }
            if (!extractor._interactive) {
                try {
                    for (Path file : MDfromHTMLUtils.listSourceFiles(extractor._inputPath, extractor._ext)) {
                        exitCode = extractor.doWork(file);
                        if (exitCode != 0) {
                            // Stop at the first file that fails.
                            break;
                        }
                    }
                } catch (Exception e) {
                    System.out.println("Error: Can not reference files with extension " + extractor._ext + " in directory " + extractor._inputPath + " reason: " + e.getLocalizedMessage());
                    exitCode = -1;
                }
            }
            if (extractor._thumbsucker) {
                System.out.println();
            }
        } else {
            exitCode = -1;
        }
        if (extractor._thumbsucker) {
            System.out.println("Goodbye");
        }
        System.exit(exitCode);
    }

    /**
     * Reads one capture file and saves every JSON record found in it.
     *
     * <p>If an {@link OutOfMemoryError} occurs while a record is being read or
     * saved, that record is discarded and reading resumes at the next line
     * that starts with <code>{</code>.
     *
     * @param path the capture file to process
     * @return 0 on success, -1 if an exception prevented processing
     */
    int doWork(Path path) {
        int status = 0;
        try {
            String fileName = path.toString();
            if (this._thumbsucker) {
                System.out.println("Processing: " + fileName);
            }
            BufferedReader reader = MDfromHTMLUtils.openTextFile(fileName);
            try {
                StringBuilder record = new StringBuilder();
                String line = reader.readLine();
                int lineNumber = 0;
                while (line != null) {
                    lineNumber++;
                    try {
                        record.append(line).append("\n");
                        if (line.startsWith(RECORD_END)) {
                            saveFile(record.toString());
                            record = new StringBuilder();
                        }
                        line = reader.readLine();
                    } catch (OutOfMemoryError e) {
                        // Drop the partial record to release memory, then skip
                        // forward to the start of the next record (or end of file).
                        record = new StringBuilder();
                        System.out.println("Error reading line " + lineNumber);
                        line = reader.readLine();
                        while (line != null && !line.startsWith(RECORD_START)) {
                            lineNumber++;
                            if (line.contains("\"url\":")) {
                                System.out.println("Skipping: " + line);
                            }
                            line = reader.readLine();
                        }
                        System.out.println("Resuming at line " + (lineNumber + 1));
                    }
                }
                // Flush whatever is left when the file does not end with a closing line.
                if (record.length() > 0) {
                    try {
                        saveFile(record.toString());
                    } catch (Exception e) {
                        System.out.println("\n\nError: " + e.getLocalizedMessage() + "\n");
                        System.out.println(record.toString());
                        System.out.println("\n\nEnd Error: " + e.getLocalizedMessage() + "\n");
                    }
                }
            } finally {
                // Close the reader even when reading or saving fails.
                MDfromHTMLUtils.closeTextFile(reader);
            }
        } catch (Exception e) {
            e.printStackTrace();
            status = -1;
        }
        return status;
    }

    /**
     * Parses the text as a JSON object, filters it and writes it to the next
     * numbered output file. The file counter is advanced on every call, even
     * when parsing or saving fails. Errors are reported on standard output.
     *
     * @param str the JSON text of one record
     */
    void saveFile(String str) {
        int suffix = this._fileCounter;
        this._fileCounter = suffix + 1;
        String fileName = this._outputPath + this._filePrefix + MDfromHTMLUtils.padLeft(suffix, 4, '0') + ".json";
        try {
            JSONArtifact parsed = JSON.parse(str);
            if (!(parsed instanceof JSONObject)) {
                System.out.println("Error: got a non-JSONObject from parse: " + parsed);
                return;
            }
            JSONObject record = (JSONObject) parsed;
            try {
                if (filterContent(record)) {
                    MDfromHTMLUtils.saveJSONFile(fileName + ".rejected", record);
                } else {
                    MDfromHTMLUtils.saveJSONFile(fileName, record);
                }
            } catch (Exception e) {
                System.out.println("Can not save file " + fileName + "  Error: " + e.getLocalizedMessage());
            }
        } catch (IOException e) {
            System.out.println("Error: Can not transform to JSON: " + e.getLocalizedMessage() + "\n" + str);
        } catch (ClassCastException e) {
            System.out.println("Error: Can not parse to JSON: " + e.getLocalizedMessage() + "\n" + str);
        }
    }

    /**
     * Removes unwanted entries from the record's <code>captureArray</code> and
     * records the reason for each rejection under the <code>rejected</code> key.
     *
     * <p>An entry is removed when its URL was already processed, when its html
     * has no <code>&lt;body</code> tag, or when its html or content contains
     * one of the reject strings. An entry without a url or html is reported
     * but left in place.
     *
     * @param jSONObject the parsed record; modified in place
     * @return true when the record should be rejected as a whole (it is null,
     *         has no captureArray, or no captures remain after filtering)
     */
    boolean filterContent(JSONObject jSONObject) {
        if (jSONObject == null) {
            return true;
        }
        boolean rejectAll = true;
        JSONArray rejected = new JSONArray();
        JSONArray captureArray = (JSONArray) jSONObject.get("captureArray");
        if (captureArray != null) {
            Iterator<Object> captures = captureArray.iterator();
            while (captures.hasNext()) {
                JSONObject capture = (JSONObject) captures.next();
                String url = (String) capture.get(URL_KEY);
                // Normalize so that "x/" and "x" count as the same URL.
                if (url != null && url.endsWith("/")) {
                    url = url.substring(0, url.length() - 1);
                }
                if (url != null && this._processedURLs.contains(url)) {
                    rejected.add(rejection(url, "duplicate url"));
                    captures.remove();
                    continue;
                }
                String html = (String) capture.get("html");
                if (url == null || html == null) {
                    String reason = "Filter: Content at " + this._fileCounter + " does not have a url or html elements.";
                    rejected.add(rejection(null, reason));
                    System.out.println(reason);
                    continue;
                }
                String lowerHtml = html.toLowerCase();
                if (lowerHtml.indexOf("<body") == -1) {
                    rejected.add(rejection(url, "no <body tag in html"));
                    captures.remove();
                    continue;
                }
                this._processedURLs.add(url);
                String content = (String) capture.get("content");
                String lowerContent = content == null ? "" : content.toLowerCase();
                for (String filter : this._filters) {
                    String location = null;
                    if (lowerHtml.contains(filter)) {
                        location = "HTML";
                    } else if (lowerContent.contains(filter)) {
                        location = "Content";
                    }
                    if (location != null) {
                        String reason = "Filter: \"" + filter + "\" found in " + location;
                        System.out.println(reason + " for URL " + url);
                        rejected.add(rejection(url, reason));
                        captures.remove();
                        // The first matching filter is enough to reject the capture.
                        break;
                    }
                }
            }
            rejectAll = captureArray.size() <= 0;
        } else {
            String reason = "Filter: Content at " + this._fileCounter + " does not have a captureArray. message: " + jSONObject.get("message");
            rejected.add(rejection(null, reason));
            System.out.println(reason);
        }
        if (rejected.size() > 0) {
            jSONObject.put("rejected", (Object) rejected);
        }
        return rejectAll;
    }

    /**
     * Builds one entry for the <code>rejected</code> array.
     *
     * @param url the rejected URL, or null when it is unknown
     * @param reason why the capture was rejected
     * @return a JSON object with the reason and, when known, the url
     */
    private static JSONObject rejection(String url, String reason) {
        JSONObject entry = new JSONObject();
        if (url != null) {
            entry.put(URL_KEY, (Object) url);
        }
        entry.put("reason", (Object) reason);
        return entry;
    }

    /**
     * Reads the run parameters from the command line, prompting for the
     * input directory, output directory and starting suffix when they are not
     * supplied, and loads the reject strings from the input directory.
     *
     * @param strArr command line arguments, see the class description
     * @return true when all parameters are valid; false when the user quit or
     *         a parameter was invalid (a message is printed in that case)
     */
    boolean getParams(String[] strArr) {
        String inputDir;
        if (strArr.length >= 1) {
            inputDir = strArr[0];
        } else {
            this._interactive = true;
            this._thumbsucker = true;
            inputDir = MDfromHTMLUtils.prompt("Enter the fully qualified path to directory containing " + this._ext + " html capture files, or q to exit (./data/):");
            if (inputDir == null || inputDir.length() == 0) {
                inputDir = "./data/";
            }
            if ("q".equalsIgnoreCase(inputDir)) {
                return false;
            }
        }
        if (!inputDir.endsWith(File.separator)) {
            inputDir = inputDir + File.separator;
        }
        try {
            this._inputPath = FileSystems.getDefault().getPath(inputDir);
        } catch (InvalidPathException e) {
            // Report the string that was rejected; it may have come from the
            // prompt rather than from the argument array.
            System.out.println("Error: " + e.getInput() + " is not a valid directory to form a path.");
            return false;
        }

        String outputDir;
        if (strArr.length >= 2) {
            outputDir = strArr[1];
        } else {
            this._interactive = true;
            this._thumbsucker = true;
            outputDir = MDfromHTMLUtils.prompt("Enter the fully qualified path to the htmljson output directory, or q to exit (./data/htmljson):");
            if (outputDir == null || outputDir.length() == 0) {
                outputDir = "./data/htmljson";
            }
            if ("q".equalsIgnoreCase(outputDir)) {
                return false;
            }
        }
        if (!outputDir.endsWith(File.separator)) {
            outputDir = outputDir + File.separator;
        }
        File outputDirectory = new File(outputDir);
        if (!outputDirectory.exists()) {
            System.out.println("Error: The output directory \"" + outputDir + "\" must exist.");
            return false;
        }
        if (!outputDirectory.isDirectory()) {
            System.out.println("Error: The output directory \"" + outputDir + "\" must be a directory.");
            return false;
        }
        this._outputPath = outputDir;

        String suffix;
        if (strArr.length >= 3) {
            suffix = strArr[2];
        } else {
            suffix = MDfromHTMLUtils.prompt("Enter the starting file suffix or q to quit (" + this._fileCounter + "):");
            if (suffix == null || suffix.length() == 0) {
                suffix = "" + this._fileCounter;
            }
            if ("q".equalsIgnoreCase(suffix)) {
                return false;
            }
        }
        // The suffix is validated the same way whether it was typed or passed in.
        try {
            int start = Integer.parseInt(suffix);
            if (start < 1) {
                System.out.println("File suffix must be a positive number.");
                return false;
            }
            this._fileCounter = start;
        } catch (NumberFormatException e) {
            System.out.println("File suffix must be a positive number. Got \"" + suffix + "\"");
            return false;
        }

        if (strArr.length >= 4) {
            this._thumbsucker = Boolean.parseBoolean(strArr[3]);
        }

        try {
            List<String> lines = MDfromHTMLUtils.loadTextFile(this._inputPath + File.separator + REJECT_STRINGS_FILE);
            List<String> filters = new ArrayList<>();
            for (String line : lines) {
                // Lines starting with '#' are comments; blank lines are ignored.
                if (line.startsWith("#")) {
                    continue;
                }
                String trimmed = line.trim();
                if (trimmed.length() != 0) {
                    filters.add(trimmed.toLowerCase());
                }
            }
            this._filters = filters;
            return true;
        } catch (Exception e) {
            e.printStackTrace();
            return false;
        }
    }
}
