package de.dfki.km.leech.io;

import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.util.CookieManager;
import de.dfki.km.leech.util.LeechException;
import de.dfki.km.leech.util.UrlUtil;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.net.URLConnection;
import java.util.Date;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.zip.GZIPInputStream;
import javax.mail.URLName;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;

/* loaded from: input_file:de/dfki/km/leech/io/HttpURLStreamProvider.class */
/**
 * {@link URLStreamProvider} for the {@code http} and {@code https} protocols.
 *
 * <p>{@link #addFirstMetadata} probes a URL with a {@code HEAD} request, following redirects
 * manually so that the finally reached URL is known, and records basic response headers in the
 * Tika {@link Metadata}. {@link #getStream} returns a stream whose connection is opened only
 * when the stream is first accessed.
 */
public class HttpURLStreamProvider extends URLStreamProvider {
    /** Connect timeout in milliseconds applied to every connection opened by this provider. */
    protected static final int connectTimeout = 20000;
    /** Maximum number of redirects followed before {@link #addFirstMetadata} gives up. */
    protected static final int MAX_REDIRECTIONS = 20;
    /** Read timeout in milliseconds applied to every connection opened by this provider. */
    protected static final int readTimeout = 20000;

    /**
     * Resolves the target of a redirect response.
     *
     * @param url the URL that was requested; used as base for a relative {@code Location}
     * @param uRLConnection the connection holding the redirect response
     * @return the redirect target as an absolute URL string
     * @throws IOException if the response has no {@code Location} header or the resulting URL is
     *         malformed
     */
    protected static String getRedirectedUrl(URL url, URLConnection uRLConnection) throws IOException {
        String location = uRLConnection.getHeaderField("Location");
        if (location == null) {
            throw new IOException("missing redirection location");
        }
        return new URL(url, location).toString();
    }

    /**
     * Tells whether an HTTP status code is a redirect that should be followed.
     *
     * @param i the HTTP response code
     * @return {@code true} for 300, 301, 302, 303, 307 and 308
     */
    protected static boolean isRedirected(int i) {
        switch (i) {
            case HttpURLConnection.HTTP_MULT_CHOICE:
            case HttpURLConnection.HTTP_MOVED_PERM:
            case HttpURLConnection.HTTP_MOVED_TEMP:
            case HttpURLConnection.HTTP_SEE_OTHER:
            case 307: // Temporary Redirect - HttpURLConnection declares no constant for it
            case 308: // Permanent Redirect - HttpURLConnection declares no constant for it
                return true;
            default:
                return false;
        }
    }

    /**
     * Fills {@code metadata} with first information about the given URL, obtained via a
     * {@code HEAD} request.
     *
     * <p>If all of {@code source}, {@code resourceName}, {@code Content-Encoding},
     * {@code Content-Type}, {@code Content-Location}, the content fingerprint and the data entity
     * id are already present, the metadata is returned untouched and no request is made.
     * Otherwise redirects are followed manually (at most {@link #MAX_REDIRECTIONS}), and the
     * metadata describes the URL finally reached. When that differs from the requested URL, the
     * requested URL is kept under {@code originalsource}.
     *
     * <p>If the crawling history holds an all-digit fingerprint for a URL, it is sent as
     * {@code If-Modified-Since}; on a 304 response that value is kept as the fingerprint.
     * Otherwise a missing fingerprint is set to the current time in milliseconds.
     *
     * @param uRLName the URL to probe
     * @param metadata the metadata to fill; a new instance is created when {@code null}
     * @param parseContext source of the {@link CrawlerContext}; may be {@code null}
     * @return the filled metadata object
     * @throws IOException on I/O failures, on a response code other than 200/304/redirect/404, or
     *         when more than {@link #MAX_REDIRECTIONS} redirects occur
     * @throws LeechException on a 404 response, on a URL that redirects to itself, or wrapping any
     *         other non-I/O failure raised while talking to the server
     */
    @Override // de.dfki.km.leech.io.URLStreamProvider
    public Metadata addFirstMetadata(URLName uRLName, Metadata metadata, ParseContext parseContext) throws Exception {
        if (metadata == null) {
            metadata = new Metadata();
        }
        if (metadata.get("source") != null && metadata.get("resourceName") != null && metadata.get("Content-Encoding") != null && metadata.get("Content-Type") != null && metadata.get("Content-Location") != null && metadata.get(IncrementalCrawlingHistory.dataEntityContentFingerprint) != null && metadata.get(IncrementalCrawlingHistory.dataEntityId) != null) {
            return metadata;
        }
        if (parseContext == null) {
            parseContext = new ParseContext();
        }
        CrawlerContext crawlerContext = (CrawlerContext) parseContext.get(CrawlerContext.class, new CrawlerContext());
        IncrementalCrawlingHistory incrementalCrawlingHistory = crawlerContext.getIncrementalCrawlingHistory();
        if (incrementalCrawlingHistory != null) {
            incrementalCrawlingHistory.openDBStuff();
        }
        String originalUrl = uRLName.toString();
        metadata.set("source", originalUrl);
        CookieManager cookieManager = crawlerContext.getCookieManager();

        String currentUrl = originalUrl;
        int redirections = 0;
        while (redirections <= MAX_REDIRECTIONS) {
            URL url = new URL(UrlUtil.normalizeURL(new URLName(new URL(currentUrl))).toString());
            String externalForm = url.toExternalForm();
            Date lastCrawled = lookupLastCrawlDate(incrementalCrawlingHistory, externalForm);
            try {
                URLConnection connection = url.openConnection();
                if (connection instanceof HttpURLConnection) {
                    HttpURLConnection http = (HttpURLConnection) connection;
                    http.setRequestMethod("HEAD");
                    prepareRequest(connection, crawlerContext, cookieManager);
                    // Redirects are followed by this loop so the final URL is known.
                    http.setInstanceFollowRedirects(false);
                    if (lastCrawled != null) {
                        connection.setIfModifiedSince(lastCrawled.getTime());
                    }
                    connection.connect();
                    cookieManager.storeCookies(connection);

                    int responseCode = http.getResponseCode();
                    if (isRedirected(responseCode)) {
                        currentUrl = getRedirectedUrl(url, connection);
                        redirections++;
                        if (currentUrl.equals(externalForm)) {
                            throw new LeechException("url redirects to itself: " + currentUrl);
                        }
                        // Probe the redirect target instead of describing the redirecting URL.
                        continue;
                    }
                    if (responseCode == HttpURLConnection.HTTP_NOT_FOUND) {
                        throw new LeechException(externalForm + " not found");
                    }
                    if (responseCode == HttpURLConnection.HTTP_NOT_MODIFIED) {
                        if (incrementalCrawlingHistory != null && lastCrawled != null) {
                            // Unchanged since the last crawl: keep the stored fingerprint.
                            metadata.set(IncrementalCrawlingHistory.dataEntityContentFingerprint, String.valueOf(lastCrawled.getTime()));
                        }
                    } else if (responseCode != HttpURLConnection.HTTP_OK) {
                        throw new IOException("Http connection error, response code = " + responseCode + ", url = " + url);
                    }
                }
                fillMetadata(metadata, connection, externalForm, originalUrl);
                return metadata;
            } catch (IOException e) {
                throw e;
            } catch (Exception e) {
                throw new LeechException("connection to " + originalUrl + " resulted in an exception", e);
            }
        }
        throw new IOException("too many redirections, max = " + MAX_REDIRECTIONS + ", url = " + originalUrl);
    }

    /**
     * Returns a stream over the content of the given URL. The connection is not opened here but
     * inside the returned stream's {@code initBeforeFirstStreamDataAccess} callback. A response
     * whose {@code Content-Encoding} is {@code gzip} is transparently decompressed.
     *
     * @param uRLName the URL to read
     * @param metadata not used by this implementation
     * @param parseContext source of the {@link CrawlerContext}; may be {@code null}
     * @return a {@link TikaInputStream} wrapping the lazily initialised stream
     * @throws Exception if the URL is malformed
     */
    @Override // de.dfki.km.leech.io.URLStreamProvider
    public TikaInputStream getStream(URLName uRLName, Metadata metadata, ParseContext parseContext) throws Exception {
        final URL url = new URL(uRLName.toString());
        if (parseContext == null) {
            // Same leniency as addFirstMetadata: fall back to a default crawler configuration.
            parseContext = new ParseContext();
        }
        final CrawlerContext crawlerContext = (CrawlerContext) parseContext.get(CrawlerContext.class, new CrawlerContext());
        return TikaInputStream.get(new ShiftInitInputStream() { // from class: de.dfki.km.leech.io.HttpURLStreamProvider.1
            @Override // de.dfki.km.leech.io.ShiftInitInputStream
            protected InputStream initBeforeFirstStreamDataAccess() throws Exception {
                CookieManager cookieManager = crawlerContext.getCookieManager();
                URLConnection connection = url.openConnection();
                prepareRequest(connection, crawlerContext, cookieManager);
                connection.connect();
                cookieManager.storeCookies(connection);

                InputStream rawStream = connection.getInputStream();
                String contentEncoding = connection.getHeaderField("Content-Encoding");
                // equalsIgnoreCase is locale-independent, unlike toLowerCase() with the default locale.
                if (contentEncoding == null || !"gzip".equalsIgnoreCase(contentEncoding.trim())) {
                    return new BufferedInputStream(rawStream);
                }
                try {
                    return new BufferedInputStream(new GZIPInputStream(rawStream));
                } catch (IOException e) {
                    // The GZIP header could not be read: release the socket stream before failing.
                    try {
                        rawStream.close();
                    } catch (IOException ignored) {
                        // The original failure is the one worth reporting.
                    }
                    throw e;
                }
            }
        });
    }

    /**
     * @return a new mutable set containing {@code "http"} and {@code "https"}
     */
    @Override // de.dfki.km.leech.io.URLStreamProvider
    public Set<String> getSupportedProtocols() {
        Set<String> protocols = new HashSet<String>();
        protocols.add("http");
        protocols.add("https");
        return protocols;
    }

    /**
     * Applies the request settings shared by the HEAD probe and the content request: cookies,
     * timeouts, gzip acceptance, user-defined headers and the user agent.
     */
    private static void prepareRequest(URLConnection connection, CrawlerContext crawlerContext, CookieManager cookieManager) throws Exception {
        cookieManager.setCookies(connection);
        connection.setConnectTimeout(connectTimeout);
        connection.setReadTimeout(readTimeout);
        connection.setRequestProperty("Accept-Encoding", "gzip");
        Map<String, String> userHeaders = crawlerContext.getUserHeaders();
        if (userHeaders != null) {
            for (Map.Entry<String, String> header : userHeaders.entrySet()) {
                connection.setRequestProperty(header.getKey(), header.getValue());
            }
        }
        String userAgent = crawlerContext.getUserAgent();
        if (userAgent != null && !userAgent.isEmpty()) {
            connection.setRequestProperty("User-Agent", userAgent);
        }
    }

    /**
     * Reads the fingerprint stored for {@code url} and interprets an all-digit value as epoch
     * milliseconds. Returns {@code null} when there is no history, no fingerprint, or the
     * fingerprint is not a number that fits into a {@code long}.
     */
    private static Date lookupLastCrawlDate(IncrementalCrawlingHistory history, String url) throws Exception {
        if (history == null) {
            return null;
        }
        String fingerprint = history.getDataEntityContentFingerprint(url);
        if (fingerprint == null || !fingerprint.matches("\\d+")) {
            return null;
        }
        try {
            return new Date(Long.parseLong(fingerprint));
        } catch (NumberFormatException e) {
            // All digits but too large for a long: treat as if no date were stored.
            return null;
        }
    }

    /**
     * Writes the URL-related entries and the response headers of {@code connection} into
     * {@code metadata}. An already present content fingerprint is left as it is.
     */
    private static void fillMetadata(Metadata metadata, URLConnection connection, String resolvedUrl, String originalUrl) {
        if (metadata.get(IncrementalCrawlingHistory.dataEntityContentFingerprint) == null) {
            metadata.set(IncrementalCrawlingHistory.dataEntityContentFingerprint, String.valueOf(System.currentTimeMillis()));
        }
        metadata.set("resourceName", resolvedUrl);
        metadata.set("source", resolvedUrl);
        metadata.set(IncrementalCrawlingHistory.dataEntityId, resolvedUrl);
        if (!originalUrl.contains(resolvedUrl)) {
            metadata.set("originalsource", originalUrl);
        }
        String contentType = connection.getContentType();
        // NOTE(review): text/xml is deliberately not recorded, presumably so a later detection
        // step can pick a more specific type - confirm against the callers.
        if (contentType != null && !contentType.contains("text/xml")) {
            metadata.set("Content-Type", contentType);
        }
        String contentEncoding = connection.getContentEncoding();
        if (contentEncoding != null) {
            metadata.set("Content-Encoding", contentEncoding);
        }
        int contentLength = connection.getContentLength();
        if (contentLength >= 0) {
            metadata.set("Content-Length", Integer.toString(contentLength));
        }
        metadata.set("Content-Location", resolvedUrl);
    }
}
