package de.dfki.km.leech.parser;

import de.dfki.km.leech.Leech;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.config.HtmlCrawlerContext;
import de.dfki.km.leech.io.URLStreamProvider;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser;
import de.dfki.km.leech.util.MultiValueHashMap;
import de.dfki.km.leech.util.StringUtils;
import de.dfki.km.leech.util.UrlUtil;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
import java.util.logging.Logger;
import javax.mail.URLName;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.HtmlParser;
import org.apache.tika.sax.Link;
import org.apache.tika.sax.LinkContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/* loaded from: input_file:de/dfki/km/leech/parser/HtmlCrawlerParser.class */
/**
 * Crawler parser for HTML documents.
 *
 * <p>The current document is parsed with Tika's {@link HtmlParser}. The links found in it are
 * exposed as sub data entities (one per distinct, normalized URL that passes
 * {@link #checkIfInConstraints}), and each sub data entity is fetched through a
 * {@link URLStreamProvider} and handed to the parser of a lazily created {@link Leech} instance.
 */
public class HtmlCrawlerParser extends CrawlerParser {
    private static final long serialVersionUID = -8214006342702249257L;

    /** Media types this parser declares support for (returned by {@link #getSupportedTypes}). */
    protected static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
            MediaType.text("html"),
            MediaType.application("xhtml+xml"),
            MediaType.application("vnd.wap.xhtml+xml"),
            MediaType.application("x-asp"))));

    /** Created on first use in {@link #processSubDataEntity}; parses the linked documents. */
    protected Leech m_leech;

    /** Tika parser used for the HTML document itself and for link extraction. */
    protected HtmlParser m_tikaHtmlParser = new HtmlParser();

    /**
     * Decides whether a link found in a document should be followed.
     *
     * @param str the source URL of the document that contains the link; may be {@code null}, in
     *        which case the document is not treated as a local file
     * @param str2 the (normalized) URL of the link to check
     * @param crawlerContext the crawler configuration; if {@code null}, every link is accepted
     * @param htmlCrawlerContext the HTML specific configuration, consulted for the
     *        "follow remote links during a local file crawl" setting
     * @return {@code true} if the link should be followed, {@code false} if it must be skipped
     */
    protected boolean checkIfInConstraints(String str, String str2, CrawlerContext crawlerContext, HtmlCrawlerContext htmlCrawlerContext) {
        if (crawlerContext == null) {
            return true;
        }

        // Guard against a missing source: without it the document cannot be a local file crawl,
        // so only the URL filter below decides (previously this raised a NullPointerException).
        boolean sourceIsLocalFile = str != null && str.startsWith("file:");
        if (sourceIsLocalFile && !str2.startsWith("file:") && !htmlCrawlerContext.getFollowRemoteLinksIfLocalFileCrawl()) {
            if (crawlerContext.getVerbose().booleanValue()) {
                Logger.getLogger(CrawlerParser.class.getName()).info("URL " + str2 + " is a remote link and thus will not followed while crawling a local html file (as configured). Skipping.");
            }
            return false;
        }

        if (crawlerContext.getURLFilter().accept(str2)) {
            return true;
        }

        if (crawlerContext.getVerbose().booleanValue()) {
            Logger.getLogger(CrawlerParser.class.getName()).info("URL " + str2 + " is outside the URL constraints for this data source. Skipping.");
        }
        return false;
    }

    /**
     * Extracts the links of the given HTML stream and returns one entry per distinct link URL
     * that passes {@link #checkIfInConstraints}.
     *
     * <p>Each returned map carries the normalized URL as string under
     * {@link CrawlerParser#SOURCEID} and as {@link URLName} under the key {@code "url"}.
     *
     * @param inputStream the HTML content to scan for links
     * @param contentHandler not used by this implementation
     * @param metadata the document metadata; its {@code "source"} entry is taken as the URL of
     *        the containing document
     * @param parseContext supplies the optional {@link CrawlerContext} and
     *        {@link HtmlCrawlerContext} (a default instance is used if the latter is absent)
     * @return an iterator over the sub data entity descriptions
     * @throws Exception if parsing the HTML stream fails
     */
    @Override // de.dfki.km.leech.parser.CrawlerParser
    protected Iterator<MultiValueHashMap<String, Object>> getSubDataEntitiesInformation(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws Exception {
        CrawlerContext crawlerContext = (CrawlerContext) parseContext.get(CrawlerContext.class);
        HtmlCrawlerContext htmlCrawlerContext = (HtmlCrawlerContext) parseContext.get(HtmlCrawlerContext.class, new HtmlCrawlerContext());
        String sourceUrl = metadata.get("source");

        LinkContentHandler linkContentHandler = new LinkContentHandler();
        this.m_tikaHtmlParser.parse(inputStream, linkContentHandler, metadata, parseContext);

        // A set, so that a URL linked several times yields only one sub data entity.
        Set<URLName> acceptedUrls = new HashSet<URLName>();
        for (Link link : linkContentHandler.getLinks()) {
            String linkUri = link.getUri();
            if (StringUtils.nullOrWhitespace(linkUri)) {
                continue;
            }
            try {
                String normalizedUrl = UrlUtil.normalizeURL(new URLName(new URL(linkUri).toExternalForm())).toString();
                if (checkIfInConstraints(sourceUrl, normalizedUrl, crawlerContext, htmlCrawlerContext)) {
                    acceptedUrls.add(new URLName(normalizedUrl));
                }
            } catch (Exception ignored) {
                // Best effort: a link that cannot be turned into a valid, normalized URL
                // (e.g. java.net.URL rejects it) is skipped instead of aborting the crawl.
            }
        }

        LinkedList<MultiValueHashMap<String, Object>> subDataEntities = new LinkedList<MultiValueHashMap<String, Object>>();
        for (URLName acceptedUrl : acceptedUrls) {
            // Normalized once more after the round trip through the string form.
            URLName normalizedUrl = UrlUtil.normalizeURL(acceptedUrl);
            MultiValueHashMap<String, Object> entityInfo = new MultiValueHashMap<String, Object>();
            entityInfo.add(CrawlerParser.SOURCEID, normalizedUrl.toString());
            entityInfo.add("url", normalizedUrl);
            subDataEntities.add(entityInfo);
        }
        return subDataEntities.iterator();
    }

    /**
     * Returns the media types handled by this parser.
     *
     * @param parseContext not used by this implementation
     * @return the unmodifiable {@link #SUPPORTED_TYPES} set
     */
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses the given HTML stream.
     *
     * <p>If the metadata has no non-blank {@code "source"} entry, the stream is passed straight
     * to the Tika HTML parser; otherwise the call is delegated to {@link CrawlerParser}.
     *
     * @throws IOException if the stream cannot be read
     * @throws SAXException if the content handler reports an error
     * @throws TikaException if the document cannot be parsed
     */
    @Override // de.dfki.km.leech.parser.CrawlerParser
    public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
        if (StringUtils.nullOrWhitespace(metadata.get("source"))) {
            this.m_tikaHtmlParser.parse(inputStream, contentHandler, metadata, parseContext);
        } else {
            super.parse(inputStream, contentHandler, metadata, parseContext);
        }
    }

    /**
     * Parses the current HTML document with the Tika HTML parser, unless its metadata marks it
     * as {@link IncrementalCrawlingParser#UNMODIFIED}, in which case nothing is done.
     *
     * @throws Exception if the Tika HTML parser fails
     */
    @Override // de.dfki.km.leech.parser.CrawlerParser
    protected void processCurrentDataEntity(InputStream inputStream, Metadata metadata, ContentHandler contentHandler, ParseContext parseContext) throws Exception {
        if (IncrementalCrawlingParser.UNMODIFIED.equals(metadata.get(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE))) {
            return;
        }
        this.m_tikaHtmlParser.parse(inputStream, contentHandler, metadata, parseContext);
    }

    /**
     * Fetches the document behind one link and parses it with the Leech parser.
     *
     * @param multiValueHashMap a sub data entity description; its first {@code "url"} value must
     *        be the {@link URLName} to fetch
     * @param metadata the metadata passed on to the stream provider
     * @param contentHandler receives the parse events of the linked document
     * @param parseContext the current parse context
     * @throws Exception if fetching or parsing the linked document fails
     */
    @Override // de.dfki.km.leech.parser.CrawlerParser
    protected void processSubDataEntity(MultiValueHashMap<String, Object> multiValueHashMap, Metadata metadata, ContentHandler contentHandler, ParseContext parseContext) throws Exception {
        URLName linkUrl = (URLName) multiValueHashMap.getFirst("url");
        Metadata linkMetadata = URLStreamProvider.getURLStreamProvider(linkUrl).addFirstMetadata(linkUrl, metadata, parseContext);

        // try-with-resources closes the stream on every path (a null resource is skipped) and
        // keeps the parse failure as the primary exception if close() fails as well.
        try (TikaInputStream stream = URLStreamProvider.getURLStreamProvider(linkUrl).getStream(linkUrl, linkMetadata, parseContext)) {
            if (this.m_leech == null) {
                this.m_leech = new Leech();
            }
            this.m_leech.getParser().parse(stream, contentHandler, linkMetadata, parseContext);
        }
    }
}
