package de.dfki.km.leech.parser.rss;

import com.sun.syndication.feed.synd.SyndEntry;
import com.sun.syndication.feed.synd.SyndFeed;
import com.sun.syndication.io.SyndFeedInput;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser;
import de.dfki.km.leech.util.TikaUtils;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;

/* loaded from: input_file:de/dfki/km/leech/parser/rss/FeedParser2.class */
public class FeedParser2 extends AbstractParser {
    private static final long serialVersionUID = 1326997408920690592L;
    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet(Arrays.asList(MediaType.application("rss+xml"), MediaType.application("atom+xml"))));

    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return SUPPORTED_TYPES;
    }

    public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
        try {
            CrawlerContext crawlerContext = (CrawlerContext) parseContext.get(CrawlerContext.class);
            if (crawlerContext == null) {
                crawlerContext = new CrawlerContext();
            }
            IncrementalCrawlingHistory incrementalCrawlingHistory = crawlerContext.getIncrementalCrawlingHistory();
            String str = metadata.get(IncrementalCrawlingHistory.dataEntityExistsID);
            SyndFeed build = new SyndFeedInput().build(new InputSource((InputStream) new CloseShieldInputStream(inputStream)));
            String stripTags = stripTags(build.getTitleEx().getValue());
            String stripTags2 = stripTags(build.getDescriptionEx().getValue());
            metadata.set("title", stripTags);
            metadata.set("description", stripTags2);
            XHTMLContentHandler xHTMLContentHandler = new XHTMLContentHandler(contentHandler, metadata);
            xHTMLContentHandler.startDocument();
            xHTMLContentHandler.element("h1", stripTags);
            xHTMLContentHandler.element("p", stripTags2);
            xHTMLContentHandler.endDocument();
            String str2 = metadata.get("Content-Type");
            for (SyndEntry syndEntry : build.getEntries()) {
                String link = syndEntry.getLink();
                if (link != null) {
                    XHTMLContentHandler xHTMLContentHandler2 = new XHTMLContentHandler(contentHandler, metadata);
                    xHTMLContentHandler2.startDocument();
                    TikaUtils.clearMetadata(metadata);
                    metadata.add(IncrementalCrawlingHistory.dataEntityExistsID, link);
                    metadata.add(IncrementalCrawlingHistory.dataEntityContentFingerprint, syndEntry.getPublishedDate().toString());
                    metadata.add(IncrementalCrawlingHistory.masterDataEntityExistsID, str);
                    IncrementalCrawlingParser.performHistoryStuff(incrementalCrawlingHistory, metadata);
                    if (!IncrementalCrawlingParser.UNMODIFIED.equals(metadata.get(IncrementalCrawlingParser.DATA_ENTITY_MODIFICATION_STATE))) {
                        metadata.add("Content-Type", str2);
                        metadata.add("source", link);
                        metadata.add("title", stripTags(syndEntry.getTitle()));
                        metadata.add("creator", syndEntry.getAuthor());
                        metadata.add(IncrementalCrawlingParser.MODIFIED, syndEntry.getPublishedDate().toString());
                        xHTMLContentHandler2.startElement("p");
                        String stripTags3 = stripTags(syndEntry.getDescription().getValue());
                        xHTMLContentHandler2.characters(stripTags3.toCharArray(), 0, stripTags3.length());
                        xHTMLContentHandler2.endElement("p");
                        xHTMLContentHandler2.endDocument();
                    }
                }
            }
        } catch (Exception e) {
            throw new TikaException("RSS parse error", e);
        }
    }

    protected static String stripTags(String str) {
        if (str == null) {
            return "";
        }
        String[] split = str.split("<[^>]*>");
        StringBuffer stringBuffer = new StringBuffer();
        for (String str2 : split) {
            stringBuffer.append(str2);
        }
        return stringBuffer.toString().trim();
    }
}
