package com.github.laohyx.usaddress;

import com.github.jcrfsuite.CrfTagger;
import com.github.jcrfsuite.util.Pair;
import com.github.laohyx.usaddress.feature.BoolFeature;
import com.github.laohyx.usaddress.feature.DictFeature;
import com.github.laohyx.usaddress.feature.Feature;
import com.github.laohyx.usaddress.feature.StringFeature;
import java.io.File;
import java.io.IOException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import third_party.org.chokkan.crfsuite.ItemSequence;

/* loaded from: input_file:com/github/laohyx/usaddress/USAddressParser.class */
public class USAddressParser {
    /**
     * Label name for a whole address string. Not referenced elsewhere in this class;
     * presumably mirrors a tag name used by the labelled training data (TODO confirm).
     */
    public static final String PARENT_LABEL = "AddressString";
    /**
     * Label name for a collection of addresses. Not referenced elsewhere in this class;
     * presumably mirrors a tag name used by the labelled training data (TODO confirm).
     */
    public static final String GROUP_LABEL = "AddressCollection";
    /** Classpath resource name of the CRF model that the static initializer loads. */
    public static final String MODEL_FILE = "usaddr.crfsuite";
    /** Shared tagger used by {@link #parse(String)}; assigned once by the static initializer. */
    public static CrfTagger tagger;
    /**
     * Names of the address-component tags. The identifier is a misspelling of "LABELS"
     * that is kept because the field is public. Not referenced elsewhere in this class;
     * presumably the tag set the model can emit (TODO confirm against the model).
     */
    public static final String[] LABLES = {"AddressNumberPrefix", "AddressNumber", "AddressNumberSuffix", "StreetNamePreModifier", "StreetNamePreDirectional", "StreetNamePreType", "StreetName", "StreetNamePostType", "StreetNamePostDirectional", "SubaddressType", "SubaddressIdentifier", "BuildingName", "OccupancyType", "OccupancyIdentifier", "CornerOf", "LandmarkName", "PlaceName", "StateName", "ZipCode", "USPSBoxType", "USPSBoxID", "USPSBoxGroupType", "USPSBoxGroupID", "IntersectionSeparator", "Recipient", "NotAddress"};
    /**
     * Lower-case compass directions and their abbreviations. Membership of the
     * normalised token in this set drives the {@code "directional"} feature in
     * {@link #tokenFeatures(String)}.
     */
    public static final Set<String> DIRECTIONS = new HashSet<String>(Arrays.asList("n", "s", "e", "w", "ne", "nw", "se", "sw", "north", "south", "east", "west", "northeast", "northwest", "southeast", "southwest"));
    /**
     * Lower-case street-type words and abbreviations (for example "ave", "blvd",
     * "street"). Membership of the normalised token in this set drives the
     * {@code "street_name"} feature in {@link #tokenFeatures(String)}.
     */
    public static final Set<String> STREET_NAMES = new HashSet(Arrays.asList("allee", "alley", "ally", "aly", "anex", "annex", "annx", "anx", "arc", "arcade", "av", "ave", "aven", "avenu", "avenue", "avn", "avnue", "bayoo", "bayou", "bch", "beach", "bend", "bg", "bgs", "bl", "blf", "blfs", "bluf", "bluff", "bluffs", "blvd", "bnd", "bot", "bottm", "bottom", "boul", "boulevard", "boulv", "br", "branch", "brdge", "brg", "bridge", "brk", "brks", "brnch", "brook", "brooks", "btm", "burg", "burgs", "byp", "bypa", "bypas", "bypass", "byps", "byu", "camp", "canyn", "canyon", "cape", "causeway", "causwa", "causway", "cen", "cent", "center", "centers", "centr", "centre", "ci", "cir", "circ", "circl", "circle", "circles", "cirs", "ck", "clb", "clf", "clfs", "cliff", "cliffs", "club", "cmn", "cmns", "cmp", "cnter", "cntr", "cnyn", "common", "commons", "cor", "corner", "corners", "cors", "course", "court", "courts", "cove", "coves", "cp", "cpe", "cr", "crcl", "crcle", "crecent", "creek", "cres", "crescent", "cresent", "crest", "crk", "crossing", "crossroad", "crossroads", "crscnt", "crse", "crsent", "crsnt", "crssing", "crssng", "crst", "crt", "cswy", "ct", "ctr", "ctrs", "cts", "curv", "curve", "cv", "cvs", "cyn", "dale", "dam", "div", "divide", "dl", "dm", "dr", "driv", "drive", "drives", "drs", "drv", "dv", "dvd", "est", "estate", "estates", "ests", "ex", "exp", "expr", "express", "expressway", "expw", "expy", "ext", "extension", "extensions", "extn", "extnsn", "exts", "fall", "falls", "ferry", "field", "fields", "flat", "flats", "fld", "flds", "fls", "flt", "flts", "ford", "fords", "forest", "forests", "forg", "forge", "forges", "fork", "forks", "fort", "frd", "frds", "freeway", "freewy", "frg", "frgs", "frk", "frks", "frry", "frst", "frt", "frway", "frwy", "fry", "ft", "fwy", "garden", "gardens", "gardn", "gateway", "gatewy", "gatway", "gdn", "gdns", "glen", "glens", "gln", "glns", "grden", "grdn", "grdns", "green", "greens", "grn", "grns", "grov", "grove", "groves", 
"grv", "grvs", "gtway", "gtwy", "harb", "harbor", "harbors", "harbr", "haven", "havn", "hbr", "hbrs", "height", "heights", "hgts", "highway", "highwy", "hill", "hills", "hiway", "hiwy", "hl", "hllw", "hls", "hollow", "hollows", "holw", "holws", "hrbor", "ht", "hts", "hvn", "hway", "hwy", "inlet", "inlt", "is", "island", "islands", "isle", "isles", "islnd", "islnds", "iss", "jct", "jction", "jctn", "jctns", "jcts", "junction", "junctions", "junctn", "juncton", "key", "keys", "knl", "knls", "knol", "knoll", "knolls", "ky", "kys", "la", "lake", "lakes", "land", "landing", "lane", "lanes", "lck", "lcks", "ldg", "ldge", "lf", "lgt", "lgts", "light", "lights", "lk", "lks", "ln", "lndg", "lndng", "loaf", "lock", "locks", "lodg", "lodge", "loop", "loops", "lp", "mall", "manor", "manors", "mdw", "mdws", "meadow", "meadows", "medows", "mews", "mi", "mile", "mill", "mills", "mission", "missn", "ml", "mls", "mn", "mnr", "mnrs", "mnt", "mntain", "mntn", "mntns", "motorway", "mount", "mountain", "mountains", "mountin", "msn", "mssn", "mt", "mtin", "mtn", "mtns", "mtwy", "nck", "neck", "opas", "orch", "orchard", "orchrd", "oval", "overlook", "overpass", "ovl", "ovlk", "park", "parks", "parkway", "parkways", "parkwy", "pass", "passage", "path", "paths", "pike", "pikes", "pine", "pines", "pk", "pkway", "pkwy", "pkwys", "pky", "pl", "place", "plain", "plaines", "plains", "plaza", "pln", "plns", "plz", "plza", "pne", "pnes", "point", "points", "port", "ports", "pr", "prairie", "prarie", "prk", "prr", "prt", "prts", "psge", "pt", "pts", "pw", "pwy", "rad", "radial", "radiel", "radl", "ramp", "ranch", "ranches", "rapid", "rapids", "rd", "rdg", "rdge", "rdgs", "rds", "rest", "ri", "ridge", "ridges", "rise", "riv", "river", "rivr", "rn", "rnch", "rnchs", "road", "roads", "route", "row", "rpd", "rpds", "rst", "rte", "rue", "run", "rvr", "shl", "shls", "shoal", "shoals", "shoar", "shoars", "shore", "shores", "shr", "shrs", "skwy", "skyway", "smt", "spg", "spgs", "spng", "spngs", "spring", 
"springs", "sprng", "sprngs", "spur", "spurs", "sq", "sqr", "sqre", "sqrs", "sqs", "squ", "square", "squares", "st", "sta", "station", "statn", "stn", "str", "stra", "strav", "strave", "straven", "stravenue", "stravn", "stream", "street", "streets", "streme", "strm", "strt", "strvn", "strvnue", "sts", "sumit", "sumitt", "summit", "te", "ter", "terr", "terrace", "throughway", "tl", "tpk", "tpke", "tr", "trace", "traces", "track", "tracks", "trafficway", "trail", "trailer", "trails", "trak", "trce", "trfy", "trk", "trks", "trl", "trlr", "trlrs", "trls", "trnpk", "trpk", "trwy", "tunel", "tunl", "tunls", "tunnel", "tunnels", "tunnl", "turn", "turnpike", "turnpk", "un", "underpass", "union", "unions", "uns", "upas", "valley", "valleys", "vally", "vdct", "via", "viadct", "viaduct", "view", "views", "vill", "villag", "village", "villages", "ville", "villg", "villiage", "vis", "vist", "vista", "vl", "vlg", "vlgs", "vlly", "vly", "vlys", "vst", "vsta", "vw", "vws", "walk", "walks", "wall", "way", "ways", "well", "wells", "wl", "wls", "wy", "xc", "xg", "xing", "xrd", "xrds"));

    /**
     * Tags every token of a free-form address string.
     *
     * <p>The text is split with {@link #tokenize(String)}, each token is turned into a
     * feature dictionary by {@link #tokens2features(List)}, and the resulting sequence is
     * labelled by the shared {@link #tagger}.
     *
     * @param str address text; {@code null} is treated like text without tokens
     * @return one (token, tag) pair per token, in input order; an empty list when the
     *         input yields no tokens
     */
    public static List<Pair<String, String>> parse(String str) {
        List<Pair<String, String>> tagged = new ArrayList<Pair<String, String>>();
        if (str == null) {
            return tagged;
        }
        List<String> tokens = tokenize(str);
        if (tokens.isEmpty()) {
            return tagged;
        }

        ItemSequence sequence = new ItemSequence();
        for (DictFeature features : tokens2features(tokens)) {
            sequence.add(features.toItem());
        }

        // Only the first component of each prediction (the tag) is used; the second is ignored.
        List<? extends Pair<String, ?>> predictions = tagger.tag(sequence);
        for (int i = 0; i < predictions.size(); i++) {
            tagged.add(new Pair<String, String>(tokens.get(i), predictions.get(i).getFirst()));
        }
        return tagged;
    }

    /** HTML-escaped ampersand spellings that are normalised to a bare {@code &} before tokenizing. */
    private static final Pattern ESCAPED_AMPERSAND = Pattern.compile("(&#38;)|(&amp;)");

    /**
     * A single token: either an optional run of "(" followed, at a word boundary, by one or
     * more characters that are not whitespace or one of {@code , ; # & ( )}, plus any trailing
     * {@code . , ; )} or newline characters; or a lone {@code #} / {@code &}.
     */
    private static final Pattern TOKEN = Pattern.compile("\\(*\\b[^\\s,;#&()]+[.,;)\\n]*|[#&]");

    /**
     * Splits an address string into tokens, keeping trailing punctuation attached to the
     * token it follows.
     *
     * <p>NOTE(review): the pattern relies on {@code \b}, whose treatment of non-ASCII letters
     * depends on the JDK version and regex flags; verify on the target JDK if non-ASCII
     * addresses matter.
     *
     * @param str address text; {@code null} yields an empty list
     * @return the tokens in order of appearance (never {@code null})
     */
    public static List<String> tokenize(String str) {
        List<String> tokens = new ArrayList<String>();
        if (str == null) {
            return tokens;
        }
        String normalised = ESCAPED_AMPERSAND.matcher(str).replaceAll("&");
        Matcher matcher = TOKEN.matcher(normalised);
        while (matcher.find()) {
            tokens.add(matcher.group());
        }
        return tokens;
    }

    /**
     * Builds one feature dictionary per token and links each to its neighbours.
     *
     * <p>Every entry receives a {@code "next"} and/or {@code "previous"} sub-dictionary holding
     * a copy of the neighbouring token's own features. Each copy is taken before any
     * neighbour link is attached to the token it was copied from. The first entry is flagged
     * {@code "address.start"} and the last {@code "address.end"}; the same flags are set on the
     * neighbour copies that describe the first and last token.
     *
     * @param list tokens in address order
     * @return feature dictionaries in the same order; an empty list when {@code list} is
     *         {@code null} or empty
     */
    public static List<DictFeature> tokens2features(List<String> list) {
        List<DictFeature> features = new ArrayList<DictFeature>();
        if (list == null || list.isEmpty()) {
            // Nothing to describe; avoids the out-of-bounds access on get(0).
            return features;
        }

        DictFeature current = tokenFeatures(list.get(0));
        features.add(current);
        DictFeature previousSnapshot = current.copy();

        for (int i = 1; i < list.size(); i++) {
            DictFeature following = tokenFeatures(list.get(i));
            // Snapshot first, so the copy does not carry the "previous" link added below.
            DictFeature followingSnapshot = following.copy();
            current.put("next", followingSnapshot);
            following.put("previous", previousSnapshot);
            features.add(following);
            current = following;
            previousSnapshot = followingSnapshot;
        }

        features.get(0).put("address.start", true);
        features.get(features.size() - 1).put("address.end", true);
        if (features.size() > 1) {
            features.get(1).getDictByKey("previous").put("address.start", true);
            features.get(features.size() - 2).getDictByKey("next").put("address.end", true);
        }
        return features;
    }

    /**
     * Computes the features of a single token, without any neighbour context.
     *
     * <p>The token is first cleaned: unless it is exactly {@code &}, {@code #} or {@code ½},
     * leading non-word characters and trailing characters that are neither word characters nor
     * dots are removed. A lower-cased, dot-free form of the cleaned token is then used for
     * most features:
     * <ul>
     *   <li>{@code abbrev} - the cleaned token ends with a dot</li>
     *   <li>{@code digits} - result of {@link #digits(String)} on the cleaned token</li>
     *   <li>{@code word} - the normalised form, or {@code false} when it is numeric</li>
     *   <li>{@code trailing.zeros} - result of {@link #trailingZeros(String)} when numeric,
     *       otherwise {@code false}</li>
     *   <li>{@code length} - {@code "d:"} or {@code "w:"} (numeric or not) plus the length of
     *       the normalised form</li>
     *   <li>{@code endsinpunc} - delegated to {@code Feature.createFromEndsinpunc} on the raw
     *       token</li>
     *   <li>{@code directional}, {@code street_name} - membership in {@link #DIRECTIONS} and
     *       {@link #STREET_NAMES}</li>
     *   <li>{@code has.vowels} - any of {@code aeiou} occurs after the first character of the
     *       normalised form</li>
     * </ul>
     *
     * <p>A token that cleans down to the empty string (for example {@code "."}) no longer
     * throws; it yields {@code abbrev = false} and {@code has.vowels = false}.
     *
     * @param str raw token; must not be {@code null}
     * @return the feature dictionary for the token
     */
    public static DictFeature tokenFeatures(String str) {
        boolean keepVerbatim = str.equals("&") || str.equals("#") || str.equals("½");
        String clean = keepVerbatim ? str : str.replaceAll("(^[\\W]*)|([^.\\w]*$)", "");
        // Locale.ROOT keeps the lower-casing stable regardless of the JVM default locale, so
        // lookups in the lower-case word sets below behave the same everywhere.
        String normalised = clean.toLowerCase(java.util.Locale.ROOT).replaceAll("[.]", "");
        boolean numeric = StringUtils.isNumeric(normalised);

        // Guard both index-based accesses: the cleaned/normalised token may be empty.
        boolean endsWithDot = !clean.isEmpty() && clean.charAt(clean.length() - 1) == '.';
        String afterFirstChar = normalised.isEmpty() ? "" : normalised.substring(1);

        DictFeature dictFeature = new DictFeature();
        dictFeature.put("abbrev", endsWithDot);
        dictFeature.put("digits", digits(clean));
        dictFeature.put("word", !numeric ? new StringFeature(normalised) : new BoolFeature(false));
        dictFeature.put("trailing.zeros", numeric ? new StringFeature(trailingZeros(normalised)) : new BoolFeature(false));
        dictFeature.put("length", numeric ? "d:" + String.valueOf(normalised.length()) : "w:" + String.valueOf(normalised.length()));
        dictFeature.put("endsinpunc", Feature.createFromEndsinpunc(str));
        dictFeature.put("directional", DIRECTIONS.contains(normalised));
        dictFeature.put("street_name", STREET_NAMES.contains(normalised));
        dictFeature.put("has.vowels", stringIntersect(afterFirstChar, "aeiou").size() > 0);
        return dictFeature;
    }

    /** Matches the run of {@code 0} characters at the end of the input, compiled once. */
    private static final Pattern TRAILING_ZEROS = Pattern.compile("(0+)$");

    /**
     * Returns the run of zeros that ends the given string.
     *
     * @param str text to inspect; must not be {@code null}
     * @return the trailing zeros (for example {@code "00"} for {@code "100"}), or the empty
     *         string when the text does not end in a zero
     */
    public static String trailingZeros(String str) {
        Matcher matcher = TRAILING_ZEROS.matcher(str);
        if (matcher.find()) {
            return matcher.group(1);
        }
        return "";
    }

    /**
     * Classifies how many of a token's characters are digits.
     *
     * @param str token to classify
     * @return {@code "all_digits"} when {@code StringUtils.isNumeric} accepts the token,
     *         {@code "some_digits"} when it contains at least one of {@code 0}-{@code 9},
     *         otherwise {@code "no_digits"}
     */
    public static String digits(String str) {
        if (StringUtils.isNumeric(str)) {
            return "all_digits";
        }
        if (stringIntersect(str, "0123456789").size() > 0) {
            return "some_digits";
        }
        return "no_digits";
    }

    /**
     * Returns the distinct characters that occur in both strings.
     *
     * @param str  first string; must not be {@code null}
     * @param str2 second string; must not be {@code null}
     * @return a new mutable set holding every character of {@code str} that also appears in
     *         {@code str2}; empty when they share none
     */
    public static Set<Character> stringIntersect(String str, String str2) {
        char[] candidates = str.toCharArray();
        char[] allowedChars = str2.toCharArray();

        Set<Character> allowed = new HashSet<Character>();
        for (int i = 0; i < allowedChars.length; i++) {
            allowed.add(allowedChars[i]);
        }

        Set<Character> shared = new HashSet<Character>();
        for (int i = 0; i < candidates.length; i++) {
            if (allowed.contains(candidates[i])) {
                shared.add(candidates[i]);
            }
        }
        return shared;
    }

    // Loads the bundled CRF model once per class load: the classpath resource is copied to a
    // temporary file because the tagger is constructed from a file path.
    static {
        URL resource = USAddressParser.class.getClassLoader().getResource(MODEL_FILE);
        if (resource == null) {
            // Fail with a clear message instead of an opaque NullPointerException from the copy.
            throw new IllegalStateException("Cannot initialize the model! Resource not found on classpath: " + MODEL_FILE);
        }
        try {
            File modelCopy = File.createTempFile("crfsuite", ".model");
            // Ask the JVM to remove the extracted copy on exit so temp files do not pile up.
            modelCopy.deleteOnExit();
            FileUtils.copyURLToFile(resource, modelCopy);
            tagger = new CrfTagger(modelCopy.getPath());
        } catch (IOException e) {
            throw new RuntimeException("Cannot initialize the model!", e);
        }
    }
}
