package edu.uci.ics.crawler4j.robotstxt;

import java.util.Arrays;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:edu/uci/ics/crawler4j/robotstxt/RobotstxtParser.class */
/**
 * Parses the text of a robots.txt file into a {@link HostDirectives} object.
 *
 * <p>The content is read line by line. Everything after a {@code #} is dropped, tag-like
 * {@code <...>} sequences are removed, and each remaining {@code name: value} line whose name is
 * one of the recognized rules is either collected as a user agent or added to the directives of
 * the current user-agent group.
 */
public class RobotstxtParser {
    private static final Logger logger = LoggerFactory.getLogger(RobotstxtParser.class);

    /** A {@code name: value} line; group 1 is the rule name, group 2 the raw value. */
    private static final Pattern RULE_PATTERN = Pattern.compile("(?i)^([A-Za-z\\-]+):(.*)");

    /** Tag-like markup to strip from a line; compiled once rather than on every line. */
    private static final Pattern MARKUP_PATTERN = Pattern.compile("<[^>]+>");

    /** Lower-case rule names that are accepted; any other name is logged and skipped. */
    private static final Set<String> VALID_RULES = new HashSet<String>(
            Arrays.asList("allow", "disallow", "user-agent", "crawl-delay", "host", "sitemap"));

    /**
     * Builds the directives described by a robots.txt document.
     *
     * <p>Consecutive {@code User-agent} lines accumulate into one set of agents. The first rule
     * line that follows creates a {@link UserAgentDirectives} for that set (or for {@code "*"}
     * when no agent was named). A {@code User-agent} line appearing after rule lines closes the
     * current group and starts a new one.
     *
     * @param str raw robots.txt content, split on {@code \n} and {@code \r}; must not be null
     * @param robotstxtConfig configuration passed to the {@link HostDirectives} constructor
     * @return the directives collected from {@code str}; no group is added when the content
     *         contains no recognized rule line other than {@code User-agent}
     */
    public static HostDirectives parse(String str, RobotstxtConfig robotstxtConfig) {
        HostDirectives hostDirectives = new HostDirectives(robotstxtConfig);
        StringTokenizer lines = new StringTokenizer(str, "\n\r");
        Set<String> userAgents = new HashSet<String>();
        UserAgentDirectives currentGroup = null;

        while (lines.hasMoreTokens()) {
            String line = stripCommentAndMarkup(lines.nextToken());
            if (line.isEmpty()) {
                continue;
            }

            Matcher matcher = RULE_PATTERN.matcher(line);
            if (!matcher.matches()) {
                logger.debug("Unrecognized line in robots.txt: {}", line);
                continue;
            }

            // Locale.ROOT keeps the lookup against the ASCII names in VALID_RULES stable; with a
            // Turkish default locale "DISALLOW".toLowerCase() would not produce "disallow".
            String rule = matcher.group(1).toLowerCase(Locale.ROOT);
            String value = matcher.group(2).trim();

            if (!VALID_RULES.contains(rule)) {
                logger.info("Unrecognized rule in robots.txt: {}", rule);
            } else if (rule.equals("user-agent")) {
                // NOTE(review): left on the default locale because the code that compares these
                // names is not visible here - confirm before switching to Locale.ROOT.
                String agent = value.toLowerCase();
                if (currentGroup != null) {
                    // Rules were already attached to the previous agent set, so this
                    // User-agent line opens a new group instead of extending the old one.
                    userAgents = new HashSet<String>();
                    hostDirectives.addDirectives(currentGroup);
                    currentGroup = null;
                }
                userAgents.add(agent);
            } else {
                if (currentGroup == null) {
                    // A rule with no preceding User-agent line applies to every agent.
                    if (userAgents.isEmpty()) {
                        userAgents.add("*");
                    }
                    currentGroup = new UserAgentDirectives(userAgents);
                }
                currentGroup.add(rule, value);
            }
        }

        // currentGroup is still null when no rule line was seen (empty file, comments only, or
        // only User-agent lines); skip the call rather than hand null to addDirectives.
        if (currentGroup != null) {
            hostDirectives.addDirectives(currentGroup);
        }
        return hostDirectives;
    }

    /** Removes a trailing {@code #} comment and tag-like markup from a line, then trims it. */
    private static String stripCommentAndMarkup(String rawLine) {
        String line = rawLine;
        int commentStart = line.indexOf('#');
        if (commentStart > -1) {
            line = line.substring(0, commentStart);
        }
        return MARKUP_PATTERN.matcher(line).replaceAll("").trim();
    }
}
