package de.dfki.km.leech.parser.filter;

import de.dfki.inquisitor.collections.MultiValueHashMap;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.metadata.LeechMetadata;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Map;
import java.util.Set;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/* loaded from: input_file:de/dfki/km/leech/parser/filter/URLFilteringParser.class */
/**
 * Parser decorator that only forwards a data entity to the wrapped parser if the
 * values of the configured metadata keys pass the URL filter of the current
 * {@link CrawlerContext}.
 *
 * <p>For every configured key the metadata value is expanded into a set of candidate
 * strings (redirect-to-origin substitutions and http/https/protocol-less variants).
 * The key passes as soon as one candidate is accepted by the URL filter. If a key has
 * a value but none of its candidates is accepted, the entity is skipped and the
 * wrapped parser is not invoked.
 */
public class URLFilteringParser extends ParserDecorator {
    private static final long serialVersionUID = 7864760975795972594L;

    /**
     * If {@code true}, redirects (source differs from origin source) are recorded in the
     * {@link URLFilteringParserContext}, and values starting with a recorded redirect
     * target are additionally checked with that prefix mapped back to the origin source.
     */
    public boolean m_bAcceptSuceedingRedirects;

    /** Metadata keys whose values are checked against the URL filter. */
    Set<String> m_hsMetadataKeys;

    /**
     * State stored in the {@link ParseContext} by {@link URLFilteringParser#parse}:
     * maps a redirected source value to the origin source value(s) it was reached from.
     */
    public static class URLFilteringParserContext {
        public MultiValueHashMap<String, String> redirect2OriginSource = new MultiValueHashMap<>();
    }

    /**
     * Creates a filtering parser that checks the {@code "source"} metadata entry.
     *
     * @param parser the parser to delegate to for accepted entities
     */
    public URLFilteringParser(Parser parser) {
        this(parser, "source");
    }

    /**
     * Creates a filtering parser that checks the given metadata entries.
     *
     * @param parser the parser to delegate to for accepted entities
     * @param strArr the metadata keys whose values must pass the URL filter
     */
    public URLFilteringParser(Parser parser, String... strArr) {
        super(parser);
        this.m_bAcceptSuceedingRedirects = true;
        this.m_hsMetadataKeys = new HashSet<>(Arrays.asList(strArr));
    }

    /**
     * Delegates to the wrapped parser unless one of the configured metadata values is
     * rejected by the URL filter of the {@link CrawlerContext}.
     *
     * <p>A default {@link CrawlerContext} is used if the parse context carries none. A
     * {@link URLFilteringParserContext} is created and stored in the parse context on
     * first use. Configured keys that have no value in {@code metadata} are skipped,
     * because there is nothing to match against the filter.
     *
     * @param inputStream    the entity content, passed through to the wrapped parser
     * @param contentHandler the handler, passed through to the wrapped parser
     * @param metadata       the entity metadata holding the values to check
     * @param parseContext   the parse context holding crawler and filter state
     * @throws IOException   if thrown by the wrapped parser
     * @throws SAXException  if thrown by the wrapped parser
     * @throws TikaException if thrown by the wrapped parser
     */
    public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
        CrawlerContext crawlerContext = parseContext.get(CrawlerContext.class);
        if (crawlerContext == null) {
            crawlerContext = new CrawlerContext();
        }

        URLFilteringParserContext filterContext = parseContext.get(URLFilteringParserContext.class);
        if (filterContext == null) {
            filterContext = new URLFilteringParserContext();
            parseContext.set(URLFilteringParserContext.class, filterContext);
        }

        String source = metadata.get("source");
        String originSource = metadata.get(LeechMetadata.originSource);

        // Remember redirects. The explicit null check on source avoids a
        // NullPointerException for entities that carry an origin source but no source.
        if (this.m_bAcceptSuceedingRedirects
                && this.m_hsMetadataKeys.contains("source")
                && source != null
                && originSource != null
                && !source.equals(originSource)) {
            filterContext.redirect2OriginSource.add(source, originSource);
        }

        for (String key : this.m_hsMetadataKeys) {
            String value = metadata.get(key);
            if (value == null) {
                // No value for this key: nothing to filter on (previously a NullPointerException).
                continue;
            }
            if (!isAccepted(crawlerContext, expandCandidates(value, filterContext))) {
                if (crawlerContext.getVerbose().booleanValue()) {
                    // Only used for the log message; falls back to the resource name.
                    String entityName = source != null ? source : metadata.get(LeechMetadata.RESOURCE_NAME_KEY);
                    LoggerFactory.getLogger(URLFilteringParser.class.getName()).info("Data entity " + entityName + " is outside the URL constraints for this data source. Skipping.");
                }
                return;
            }
        }

        getWrappedParser().parse(inputStream, contentHandler, metadata, parseContext);
    }

    /** Builds all strings that are tried against the URL filter for one metadata value. */
    private Set<String> expandCandidates(String value, URLFilteringParserContext filterContext) {
        Set<String> candidates = new HashSet<>();
        candidates.add(value);

        if (this.m_bAcceptSuceedingRedirects) {
            for (Map.Entry<String, String> redirect : filterContext.redirect2OriginSource.entryList()) {
                String redirectedPrefix = redirect.getKey();
                if (value.startsWith(redirectedPrefix)) {
                    // Map only the leading redirect prefix back to its origin; a plain
                    // String.replace would also rewrite later occurrences inside the URL.
                    candidates.add(redirect.getValue() + value.substring(redirectedPrefix.length()));
                }
            }
        }

        // Collect the protocol variants separately so the set is not modified while iterating.
        Set<String> protocolVariants = new HashSet<>();
        for (String candidate : candidates) {
            if (candidate.startsWith("http://") || candidate.startsWith("https://")) {
                String withoutProtocol = candidate.replaceFirst("https?://", "");
                protocolVariants.add(withoutProtocol);
                protocolVariants.add("http://" + withoutProtocol);
                protocolVariants.add("https://" + withoutProtocol);
            }
        }
        candidates.addAll(protocolVariants);
        return candidates;
    }

    /** Returns {@code true} if the URL filter of the crawler context accepts at least one candidate. */
    private boolean isAccepted(CrawlerContext crawlerContext, Set<String> candidates) {
        for (String candidate : candidates) {
            if (crawlerContext.getURLFilter().accept(candidate)) {
                return true;
            }
        }
        return false;
    }
}
