/*
 * Decompiled with CFR 0.152.
 */
package de.pangaea.metadataportal.harvester;

import de.pangaea.metadataportal.Package;
import de.pangaea.metadataportal.config.HarvesterConfig;
import de.pangaea.metadataportal.harvester.RetryAfterIOException;
import de.pangaea.metadataportal.harvester.SingleFileEntitiesHarvester;
import de.pangaea.metadataportal.utils.HttpClientUtils;
import de.pangaea.metadataportal.utils.StaticFactories;
import java.io.IOException;
import java.io.InputStream;
import java.net.CookieManager;
import java.net.CookiePolicy;
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;
import java.nio.charset.StandardCharsets;
import java.time.Duration;
import java.time.Instant;
import java.time.format.DateTimeFormatter;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.Locale;
import java.util.Optional;
import java.util.Set;
import java.util.SortedSet;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.xml.transform.Source;
import javax.xml.transform.sax.SAXSource;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

public class WebCrawlingHarvester
extends SingleFileEntitiesHarvester {
    /** Seconds to wait before a retry; same value the constructor falls back to for {@code retryAfterSeconds}. */
    public static final int DEFAULT_RETRY_TIME = 60;

    /** Number of retries; same value the constructor falls back to for {@code retryCount}. */
    public static final int DEFAULT_RETRY_COUNT = 5;

    /** Timeout in seconds; same value the constructor falls back to for {@code timeoutAfterSeconds}. */
    public static final int DEFAULT_TIMEOUT = 180;

    /** Class name of the HTML SAX parser that is loaded reflectively by the constructor. */
    public static final String HTML_SAX_PARSER_CLASS = "org.cyberneko.html.parsers.SAXParser";

    /** Content types whose responses are scanned for links instead of being harvested as documents. */
    public static final Set<String> HTML_CONTENT_TYPES = new HashSet<>(Arrays.asList("text/html", "application/xhtml+xml"));

    /** Value sent in the {@code User-Agent} request header. */
    public static final String USER_AGENT = "Java/" + Runtime.version() + " (" + Package.getProductName() + '/' + Package.getVersion() + "; WebCrawlingHarvester)";

    /**
     * Start URL before harvesting; during {@code harvest()} it is replaced by the URL prefix
     * that queued URLs must start with.
     */
    private String baseURL;

    /** Pattern the last path segment of a document URL must fully match; {@code null} accepts every file. */
    private final Pattern filenameFilter;

    /** URLs in which this pattern finds a match are never queued; {@code null} disables the check. */
    private final Pattern excludeUrlPattern;

    /** Lower-cased content types that are harvested as documents. */
    private final Set<String> contentTypes = new HashSet<>();

    /** Maximum number of retries per URL. */
    private final int retryCount;

    /** Default number of seconds to wait before a retry. */
    private final int retryTime;

    /** Used both as connect timeout of the client and as timeout of every request. */
    private final Duration timeout;

    /** Sent verbatim as {@code Authorization} header when not {@code null}. */
    private final String authorizationHeader;

    /** Milliseconds slept before each queued request; values {@code <= 0} disable the pause. */
    private final long pauseBetweenRequests;

    /** HTTP client shared by all requests of this harvester. */
    private final HttpClient httpClient;

    /** URLs that were already requested (or are being requested right now). */
    private final Set<String> harvested = new HashSet<>();

    /** URLs still waiting to be requested, processed in sorted order. */
    private final SortedSet<String> needsHarvest = new TreeSet<>();

    /** HTML parser class resolved from {@link #HTML_SAX_PARSER_CLASS}. */
    private Class<? extends XMLReader> htmlReaderClass = null;

    /**
     * Creates a crawler configured from the given harvester configuration.
     *
     * <p>Evaluates the properties {@code baseUrl} (mandatory, absolute HTTP(S) URL),
     * {@code contentTypes}, {@code retryCount}, {@code retryAfterSeconds},
     * {@code timeoutAfterSeconds}, {@code authorizationHeader}, {@code pauseBetweenRequests},
     * {@code filenameFilter} and {@code excludeUrlPattern}, builds the HTTP client and loads the
     * HTML SAX parser class.
     *
     * @param iconfig harvester configuration whose {@code properties} are read
     * @throws IllegalArgumentException if {@code baseUrl} is missing or is not an absolute HTTP(S) URL
     * @throws ClassNotFoundException if the HTML parser class cannot be loaded
     * @throws Exception for any other configuration problem (URI syntax, number format, regex syntax, ...)
     */
    public WebCrawlingHarvester(HarvesterConfig iconfig) throws Exception {
        super(iconfig);
        String s = iconfig.properties.getProperty("baseUrl");
        if (s == null) {
            throw new IllegalArgumentException("Missing base URL to start harvesting (property \"baseUrl\")");
        }
        URI u = new URI(s);
        // A relative URL has no scheme; reject it like any other non-HTTP(S) URL instead of failing with an NPE.
        String scheme = u.getScheme();
        String proto = scheme == null ? null : scheme.toLowerCase(Locale.ROOT);
        if (!"http".equals(proto) && !"https".equals(proto)) {
            throw new IllegalArgumentException("WebCrawlingHarvester only allows HTTP(S) as network protocol!");
        }
        this.baseURL = u.toString();

        // Content types may be separated by commas, semicolons and/or whitespace.
        s = iconfig.properties.getProperty("contentTypes", "text/xml,application/xml");
        for (String c : s.split("[\\,\\;\\s]+")) {
            c = c.trim().toLowerCase(Locale.ROOT);
            if (!c.isEmpty()) {
                this.contentTypes.add(c);
            }
        }

        this.retryCount = Integer.parseInt(iconfig.properties.getProperty("retryCount", Integer.toString(DEFAULT_RETRY_COUNT)));
        this.retryTime = Integer.parseInt(iconfig.properties.getProperty("retryAfterSeconds", Integer.toString(DEFAULT_RETRY_TIME)));
        this.timeout = Duration.ofSeconds(Integer.parseInt(iconfig.properties.getProperty("timeoutAfterSeconds", Integer.toString(DEFAULT_TIMEOUT))));
        this.authorizationHeader = iconfig.properties.getProperty("authorizationHeader");
        this.pauseBetweenRequests = Long.parseLong(iconfig.properties.getProperty("pauseBetweenRequests", "0"));

        s = iconfig.properties.getProperty("filenameFilter");
        this.filenameFilter = s == null ? null : Pattern.compile(s);
        s = iconfig.properties.getProperty("excludeUrlPattern");
        this.excludeUrlPattern = s == null ? null : Pattern.compile(s);

        this.httpClient = HttpClient.newBuilder()
                .followRedirects(HttpClient.Redirect.NORMAL)
                .connectTimeout(this.timeout)
                .cookieHandler(new CookieManager(null, CookiePolicy.ACCEPT_ORIGINAL_SERVER))
                .build();

        try {
            this.htmlReaderClass = Class.forName(HTML_SAX_PARSER_CLASS).asSubclass(XMLReader.class);
        } catch (ClassNotFoundException cfe) {
            // Keep the original failure as cause so the missing class name is not lost.
            throw new ClassNotFoundException(this.getClass().getName() + " needs the NekoHTML parser in classpath!", cfe);
        }
    }

    /**
     * Crawls the configured base URL and every URL queued while processing it.
     *
     * <p>The start URL is requested first while the prefix check is disabled (empty prefix).
     * The URL it finally resolves to (after redirects) defines the directory prefix that
     * queued URLs must start with; entries queued before that outside the prefix are dropped.
     * The queue is then drained in sorted order, optionally pausing before each request.
     *
     * @throws IOException if the start URL cannot be found
     * @throws Exception if requesting or processing any URL fails
     */
    @Override
    public void harvest() throws Exception {
        String urlStr = this.baseURL;
        // Disable the prefix check while the start URL itself is processed.
        this.baseURL = "";
        this.harvested.add(urlStr);
        URI newbaseURL = this.processURL(new URI(urlStr));
        if (newbaseURL == null) {
            throw new IOException("Cannot find base URL: " + urlStr);
        }
        // A URL without path is its own prefix; otherwise the prefix is the "directory" of the resolved URL.
        this.baseURL = newbaseURL.getPath().isEmpty() ? newbaseURL.toString() : newbaseURL.resolve("./").toString();
        this.log.debug("URL directory which harvesting may not escape: " + this.baseURL);
        this.needsHarvest.removeIf(s -> !s.startsWith(this.baseURL));
        while (!this.needsHarvest.isEmpty()) {
            if (this.pauseBetweenRequests > 0L) {
                try {
                    Thread.sleep(this.pauseBetweenRequests);
                } catch (InterruptedException ie) {
                    // Do not lose the interrupt request: restore the flag so that interruptible
                    // operations further down (and the caller) can react to it.
                    Thread.currentThread().interrupt();
                }
            }
            urlStr = this.needsHarvest.first();
            this.needsHarvest.remove(urlStr);
            this.harvested.add(urlStr);
            this.processURL(new URI(urlStr));
        }
    }

    /**
     * Adds the property names evaluated by this harvester to the names already
     * contributed by the superclass.
     *
     * @param props set that receives the valid property names
     */
    @Override
    protected void enumerateValidHarvesterPropertyNames(Set<String> props) {
        super.enumerateValidHarvesterPropertyNames(props);
        final String[] ownPropertyNames = {
            "baseUrl",
            "retryCount",
            "retryAfterSeconds",
            "timeoutAfterSeconds",
            "filenameFilter",
            "contentTypes",
            "excludeUrlPattern",
            "pauseBetweenRequests",
            "authorizationHeader",
        };
        for (String propertyName : ownPropertyNames) {
            props.add(propertyName);
        }
    }

    /**
     * Queues a URL for harvesting unless it must be skipped.
     *
     * <p>The fragment part (everything from the first {@code '#'}) is removed first. The URL is
     * ignored when it does not start with the current base URL prefix, when it was already
     * harvested, or when the exclude pattern finds a match in it.
     *
     * @param url absolute URL found while crawling
     */
    void queueURL(String url) {
        final int fragmentStart = url.indexOf('#');
        final String candidate = fragmentStart >= 0 ? url.substring(0, fragmentStart) : url;

        final boolean insideBase = candidate.startsWith(this.baseURL);
        if (!insideBase || this.harvested.contains(candidate)) {
            return;
        }

        final boolean excluded = this.excludeUrlPattern != null
                && this.excludeUrlPattern.matcher(candidate).find();
        if (!excluded) {
            this.needsHarvest.add(candidate);
        }
    }

    /**
     * Parses an HTML document and queues every link found in it.
     *
     * <p>Links are taken from {@code href} of {@code A}/{@code AREA} and {@code src} of
     * {@code IFRAME} inside {@code BODY}, and from {@code src} of {@code FRAME} inside
     * {@code FRAMESET}. A {@code BASE href} inside {@code HEAD} replaces the URL that relative
     * links are resolved against. Links that cannot be resolved are logged and skipped.
     *
     * @param baseURL URL of the document, used to resolve relative links
     * @param source input source delivering the HTML
     * @throws SAXException if parsing fails, including the case of an invalid {@code BASE} URL
     * @throws Exception if the parser cannot be instantiated/configured or reading fails
     */
    private void analyzeHTML(final URI baseURL, InputSource source) throws Exception {
        XMLReader r = this.htmlReaderClass.getConstructor().newInstance();
        r.setFeature("http://xml.org/sax/features/namespaces", true);
        r.setFeature("http://cyberneko.org/html/features/balance-tags", true);
        r.setFeature("http://cyberneko.org/html/features/report-errors", false);
        // Element names are reported upper-case and attribute names lower-case,
        // which is what the comparisons in the handler below rely on.
        r.setProperty("http://cyberneko.org/html/properties/names/elems", "upper");
        r.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
        DefaultHandler handler = new DefaultHandler() {
            /** URL against which relative links are resolved; may be replaced by a BASE element. */
            private URI base = baseURL;
            /** Nesting counters telling in which section of the document the parser currently is. */
            private int inBODY = 0;
            private int inFRAMESET = 0;
            private int inHEAD = 0;

            @Override
            public void startElement(String namespaceURI, String localName, String qName, Attributes atts) throws SAXException {
                String url = null;
                if ("BODY".equals(localName)) {
                    ++this.inBODY;
                } else if ("FRAMESET".equals(localName)) {
                    ++this.inFRAMESET;
                } else if ("HEAD".equals(localName)) {
                    ++this.inHEAD;
                } else if (this.inHEAD > 0) {
                    String newBase;
                    if ("BASE".equals(localName) && (newBase = atts.getValue("href")) != null) {
                        try {
                            this.base = this.base.resolve(newBase);
                        } catch (IllegalArgumentException mue) {
                            // Log the offending BASE value (not the link variable, which is still null here).
                            WebCrawlingHarvester.this.log.debug("Found invalid BASE-URL: " + newBase);
                            throw new SAXException("#panFMP#HTML_INVALID_BASE", mue);
                        }
                    }
                } else {
                    if (this.inBODY > 0) {
                        if ("A".equals(localName) || "AREA".equals(localName)) {
                            url = atts.getValue("href");
                        } else if ("IFRAME".equals(localName)) {
                            url = atts.getValue("src");
                        }
                    }
                    if (this.inFRAMESET > 0 && "FRAME".equals(localName)) {
                        url = atts.getValue("src");
                    }
                }
                if (url != null) {
                    try {
                        WebCrawlingHarvester.this.queueURL(this.base.resolve(url).toString());
                    } catch (IllegalArgumentException mue) {
                        // A single broken link must not stop the crawl.
                        WebCrawlingHarvester.this.log.debug("Found invalid URL: " + url);
                    }
                }
            }

            @Override
            public void endElement(String namespaceURI, String localName, String qName) throws SAXException {
                if ("BODY".equals(localName)) {
                    --this.inBODY;
                } else if ("FRAMESET".equals(localName)) {
                    --this.inFRAMESET;
                } else if ("HEAD".equals(localName)) {
                    --this.inHEAD;
                }
            }
        };
        r.setContentHandler(handler);
        r.setErrorHandler(handler);
        try {
            r.parse(source);
        } catch (SAXException saxe) {
            if ("#panFMP#HTML_INVALID_BASE".equals(saxe.getMessage())) {
                this.log.warn("HTMLParser detected an invalid URL in HTML 'BASE' tag. Stopped link parsing for this document!");
            }
            // NOTE(review): the exception is rethrown even for the invalid-BASE marker, although the
            // warning reads as if only this document were skipped - confirm the intended behaviour.
            throw saxe;
        }
    }

    /**
     * Checks a document URL against the configured file name filter.
     *
     * @param url URL of the document
     * @return {@code true} if no filter is configured or if the part of the URL path after the
     *         last {@code '/'} (the whole path if there is none) fully matches the filter
     */
    private boolean acceptFile(URI url) {
        if (this.filenameFilter != null) {
            final String path = url.getPath();
            // lastIndexOf yields -1 without a slash, so "+ 1" keeps the whole path in that case.
            final String fileName = path.substring(path.lastIndexOf('/') + 1);
            return this.filenameFilter.matcher(fileName).matches();
        }
        return true;
    }

    /*
     * WARNING - Removed try catching itself - possible behaviour change.
     * Enabled aggressive block sorting
     * Enabled unnecessary exception pruning
     * Enabled aggressive exception aggregation
     */
    /**
     * Requests a single URL and processes the response.
     *
     * <p>HTML responses are scanned for further links; responses with one of the configured
     * content types whose file name passes the filter are handed over as documents (with content
     * only when outdated according to {@code Last-Modified}). Connection problems and 5xx
     * responses are retried up to {@code retryCount} times; a {@code 503} response may dictate
     * the waiting time through its {@code Retry-After} header.
     *
     * @param uri absolute HTTP(S) URL to request
     * @return the URL that was finally processed (differs from {@code uri} after redirects),
     *         or {@code null} if the server answered 404 or 410
     * @throws IllegalArgumentException if {@code uri} is not an HTTP(S) URL
     * @throws IOException if the server returns an unexpected status or all retries failed
     * @throws Exception if parsing or adding the document fails
     */
    private URI processURL(URI uri) throws Exception {
        int retry = 0;
        while (retry <= this.retryCount) {
            this.log.info("Requesting props of '" + uri + "'...");
            String scheme = uri.getScheme();
            String proto = scheme == null ? null : scheme.toLowerCase(Locale.ROOT);
            if (!"http".equals(proto) && !"https".equals(proto)) {
                throw new IllegalArgumentException("WebCrawlingHarvester only allows HTTP(S) as network protocol!");
            }
            // setHeader replaces earlier values, so the Accept header is set exactly once.
            String accept = Stream.of(this.contentTypes, HTML_CONTENT_TYPES, Set.of("*;q=0.1"))
                    .flatMap(Collection::stream)
                    .distinct()
                    .collect(Collectors.joining(", "));
            HttpRequest.Builder reqBuilder = HttpRequest.newBuilder(uri)
                    .GET()
                    .timeout(this.timeout)
                    .setHeader("User-Agent", USER_AGENT)
                    .setHeader("Accept-Charset", StandardCharsets.UTF_8.name() + ", *;q=0.5")
                    .setHeader("Accept", accept);
            HttpClientUtils.sendCompressionHeaders(reqBuilder);
            if (this.authorizationHeader != null) {
                reqBuilder.header("Authorization", this.authorizationHeader);
            }
            this.log.debug("Opening connection...");
            try {
                HttpResponse<InputStream> resp;
                try {
                    resp = HttpClientUtils.sendHttpRequestWithRetry(this.httpClient, reqBuilder.build(), HttpResponse.BodyHandlers.ofInputStream());
                } catch (IOException ioe) {
                    throw new RetryAfterIOException(this.retryTime, ioe);
                }

                // Until the status is accepted, the body must be closed on every exit path.
                boolean success = false;
                try {
                    int statusCode = resp.statusCode();
                    if (statusCode == 503) {
                        IOException ioe1 = new IOException("Webserver returned '503 Service Unavailable'");
                        // Retry-After may be delta-seconds or an HTTP-date; unusable values fall back to the default.
                        Optional<Integer> retryAfter = resp.headers().firstValue("Retry-After")
                                .flatMap(WebCrawlingHarvester::parseRetryAfterSeconds);
                        if (!retryAfter.isPresent()) {
                            throw new RetryAfterIOException(this.retryTime, "Webserver returned error code, repeating after " + this.retryTime + "s: " + statusCode, ioe1);
                        }
                        throw new RetryAfterIOException(retryAfter.get(), "Webserver returned '503 Service Unavailable', repeating after " + retryAfter.get() + "s.", ioe1);
                    }
                    if (statusCode == 404 || statusCode == 410) {
                        this.log.warn("Cannot find URL '" + resp.uri() + "'.");
                        return null;
                    }
                    if (statusCode != 200) {
                        IOException ioe2 = new IOException("Webserver returned invalid status code: " + statusCode);
                        if (statusCode < 500) {
                            throw ioe2;
                        }
                        throw new RetryAfterIOException(this.retryTime, "Webserver returned error code, repeating after " + this.retryTime + "s: " + statusCode, ioe2);
                    }
                    success = true;
                } finally {
                    if (!success) {
                        try {
                            resp.body().close();
                        } catch (IOException ignored) {
                            // best effort: the response is discarded anyway
                        }
                    }
                }

                try (InputStream in = HttpClientUtils.getDecompressingInputStream(resp)) {
                    // Split the Content-Type header into the media type and the optional charset parameter.
                    String contentType = resp.headers().firstValue("Content-Type").orElse(null);
                    String charset = null;
                    if (contentType != null) {
                        contentType = contentType.toLowerCase(Locale.ROOT);
                        int charsetStart = contentType.indexOf("charset=");
                        if (charsetStart >= 0) {
                            int charsetEnd = contentType.indexOf(';', charsetStart);
                            if (charsetEnd == -1) {
                                charsetEnd = contentType.length();
                            }
                            charset = contentType.substring(charsetStart + "charset=".length(), charsetEnd).trim();
                            // The parameter value may be a quoted string, e.g. charset="utf-8".
                            if (charset.length() >= 2 && charset.startsWith("\"") && charset.endsWith("\"")) {
                                charset = charset.substring(1, charset.length() - 1).trim();
                            }
                            if (charset.isEmpty()) {
                                charset = null;
                            }
                        }
                        int contentEnd = contentType.indexOf(';');
                        if (contentEnd >= 0) {
                            contentType = contentType.substring(0, contentEnd);
                        }
                        contentType = contentType.trim();
                    }
                    this.log.debug("Charset from Content-Type: '" + charset + "'; Type from Content-Type: '" + contentType + "'");
                    if (contentType == null) {
                        this.log.warn("Connection to URL '" + uri + "' did not return a content-type, skipping.");
                        return uri;
                    }

                    // After a redirect, continue with the final URL unless it leaves the base or is already done.
                    URI newurl = resp.uri();
                    if (!uri.toString().equals(newurl.toString())) {
                        this.log.debug("Got redirect to: " + newurl);
                        uri = newurl;
                        if (!uri.toString().startsWith(this.baseURL)) {
                            return uri;
                        }
                        if (this.harvested.contains(uri.toString())) {
                            return uri;
                        }
                        this.needsHarvest.remove(uri.toString());
                        this.harvested.add(uri.toString());
                    }

                    if (HTML_CONTENT_TYPES.contains(contentType)) {
                        this.log.info("Analyzing HTML links in '" + uri + "'...");
                        InputSource src = new InputSource(in);
                        src.setSystemId(uri.toString());
                        src.setEncoding(charset);
                        this.analyzeHTML(uri, src);
                    } else if (this.contentTypes.contains(contentType) && this.acceptFile(uri)) {
                        // NOTE(review): a malformed Last-Modified header makes this parse throw and
                        // aborts the harvest - confirm whether that is intended.
                        Instant lastModified = resp.headers().firstValue("Last-Modified")
                                .map(DateTimeFormatter.RFC_1123_DATE_TIME::parse)
                                .map(Instant::from)
                                .orElse(null);
                        if (this.isDocumentOutdated(lastModified)) {
                            this.log.info("Harvesting '" + uri + "'...");
                            InputSource src = new InputSource(in);
                            src.setSystemId(uri.toString());
                            src.setEncoding(charset);
                            SAXSource saxsrc = new SAXSource(StaticFactories.saxFactory.newSAXParser().getXMLReader(), src);
                            this.addDocument(uri.toString(), lastModified, (Source) saxsrc);
                        } else {
                            this.addDocument(uri.toString(), lastModified, null);
                        }
                    }
                    return uri;
                }
            } catch (RetryAfterIOException ioe) {
                if (retry >= this.retryCount) {
                    // Out of retries: report the underlying problem.
                    throw ioe.getCause();
                }
                this.log.warn(ioe.getMessage());
                int after = ioe.getRetryAfter();
                this.log.info("Retrying after " + after + " seconds (" + (this.retryCount - retry) + " retries left)...");
                try {
                    Thread.sleep(1000L * (long) after);
                } catch (InterruptedException ie) {
                    // Keep the interrupt request visible instead of silently dropping it.
                    Thread.currentThread().interrupt();
                }
                this.log.debug("Recreating digester instances to recover from incomplete parsers...");
                ++retry;
            }
        }
        throw new IOException("Unable to properly connect HTTP server.");
    }

    /**
     * Interprets the value of a {@code Retry-After} header.
     *
     * @param value raw header value, either non-negative delta-seconds or an RFC 1123 HTTP-date
     * @return the number of seconds to wait (never negative), or empty if the value is unusable
     */
    private static Optional<Integer> parseRetryAfterSeconds(String value) {
        String v = value.trim();
        try {
            int seconds = Integer.parseInt(v);
            return seconds >= 0 ? Optional.of(seconds) : Optional.empty();
        } catch (NumberFormatException nfe) {
            // not delta-seconds; try the HTTP-date form below
        }
        try {
            Instant when = Instant.from(DateTimeFormatter.RFC_1123_DATE_TIME.parse(v));
            long seconds = Duration.between(Instant.now(), when).getSeconds();
            // A date in the past means "retry now"; clamp to the int range expected by the caller.
            return Optional.of((int) Math.max(0L, Math.min((long) Integer.MAX_VALUE, seconds)));
        } catch (java.time.DateTimeException dte) {
            return Optional.empty();
        }
    }
}

