package de.dfki.leech.foresight;

import de.dfki.inquisitor.collections.ValueBox;
import de.dfki.inquisitor.file.FileUtilz;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.SubDataEntityContentHandler;
import de.dfki.km.leech.sax.DataSinkContentHandlerAdapter;
import de.dfki.km.leech.sax.PrintlnContentHandler;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/* loaded from: input_file:de/dfki/leech/foresight/SpringerLinkDocumentParser.class */
/**
 * Tika parser for entities of media type {@code application/vnd.fs.springerlinkDocument}.
 *
 * <p>The parser ignores the supplied input stream. Instead it downloads the page named by the
 * {@code "source"} metadata entry with Jsoup, scrapes cover date, title, authors and reference
 * authors from it, and then looks for a PDF download link (article, chapter or book). If such a
 * link exists, the PDF is crawled with {@link Leech} and its fulltext and metadata are merged in;
 * otherwise the text of the page's {@code .a-plus-plus} elements is used as the body. The result
 * is handed to a {@link SubDataEntityContentHandler}.
 */
public class SpringerLinkDocumentParser extends AbstractParser {
    private static final long serialVersionUID = -4358818721569908234L;

    /** Metadata entry holding the URL of the entity that is being parsed. */
    private static final String SOURCE_KEY = "source";

    /** Metadata entry collecting the reference texts scraped from the page. */
    private static final String REFERENCE_AUTHOR_KEY = "referenceAuthor";

    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
            new HashSet<MediaType>(Arrays.asList(MediaType.application("vnd.fs.springerlinkDocument"))));

    /**
     * Manual smoke test: crawls one fixed SpringerLink article and prints everything that is
     * reported to the content handler.
     *
     * @param strArr ignored
     * @throws Exception whatever the crawl throws
     */
    public static void main(String[] strArr) throws Exception {
        new Leech().parse("http://rd.springer.com/article/10.1007/s12273-014-0184-5", new PrintlnContentHandler(PrintlnContentHandler.Verbosity.all), new ParseContext());
    }

    /**
     * Returns the media types handled by this parser.
     *
     * @param parseContext not used
     * @return an unmodifiable set containing only the SpringerLink document type
     */
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return SUPPORTED_TYPES;
    }

    /**
     * Scrapes the SpringerLink page referenced by the {@code "source"} metadata entry and forwards
     * the collected metadata and body text to a {@link SubDataEntityContentHandler}.
     *
     * @param inputStream    closed immediately; its content is never read
     * @param contentHandler handler passed on to the {@link SubDataEntityContentHandler}
     * @param metadata       must contain a {@code "source"} URL; is enriched in place, and
     *                       {@code "source"} is replaced by the PDF URL when a PDF link is found
     * @param parseContext   not used
     * @throws IOException   if the page cannot be fetched, or closing the stream fails
     * @throws SAXException  propagated from the nested crawl or the content handler
     * @throws TikaException if {@code "source"} is missing or blank, or propagated from the nested crawl
     */
    public void parse(InputStream inputStream, ContentHandler contentHandler, final Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
        // The stream is not needed: the page is downloaded again through Jsoup below.
        if (inputStream != null) {
            inputStream.close();
        }

        String pageUrl = metadata.get(SOURCE_KEY);
        if (pageUrl == null || pageUrl.trim().isEmpty()) {
            // Jsoup would reject this with an unchecked IllegalArgumentException; report it
            // through the declared exception type instead.
            throw new TikaException("Missing '" + SOURCE_KEY + "' metadata entry; cannot fetch the SpringerLink page");
        }
        Document page = Jsoup.connect(pageUrl).get();

        replaceWithText(metadata, TikaCoreProperties.MODIFIED.getName(), page.select("#abstract-about-cover-date"));
        replaceWithText(metadata, TikaCoreProperties.TITLE.getName(), page.select("#title"));
        extractAuthors(page, metadata);
        extractReferenceAuthors(page, metadata);

        String body = resolveBody(page, metadata);
        new SubDataEntityContentHandler(contentHandler, metadata, body).triggerSubDataEntityHandling();
    }

    /** Overwrites the metadata entry {@code name} with the text of {@code matches}, if there are any matches. */
    private static void replaceWithText(Metadata metadata, String name, Elements matches) {
        if (matches.isEmpty()) {
            return;
        }
        metadata.remove(name);
        metadata.add(name, matches.text());
    }

    /** Replaces the creator entries with the link text of every author list item found on the page. */
    private static void extractAuthors(Document page, Metadata metadata) {
        Elements authorItems = page.select(".author-list > ul:nth-child(1) > li.author");
        if (authorItems.isEmpty()) {
            return;
        }
        String creatorKey = TikaCoreProperties.CREATOR.getName();
        // Existing creators are dropped as soon as the page has an author list, even if no
        // individual item yields a link.
        metadata.remove(creatorKey);
        for (Element authorItem : authorItems) {
            Elements authorLink = authorItem.select("a:nth-child(1)");
            if (!authorLink.isEmpty()) {
                metadata.add(creatorKey, authorLink.text());
            }
        }
    }

    /**
     * Replaces the {@code "referenceAuthor"} entries with the distinct reference texts found on the
     * page and appends each scraped text to the file {@code cites.txt}.
     */
    private static void extractReferenceAuthors(Document page, Metadata metadata) {
        Elements referenceItems = page.select("div.formatted:nth-child(1) > ol:nth-child(1) > li:nth-child(1)");
        if (referenceItems.isEmpty()) {
            return;
        }
        metadata.remove(REFERENCE_AUTHOR_KEY);

        // A set, so that a text occurring several times is stored only once in the metadata.
        Set<String> distinctReferences = new HashSet<String>();
        for (Element referenceItem : referenceItems) {
            Elements referenceSpan = referenceItem.select("span:nth-child(1)");
            if (referenceSpan.isEmpty()) {
                continue;
            }
            String referenceText = referenceSpan.text();
            try {
                FileUtilz.append2File(referenceText + "\n", "cites.txt", "UTF-8");
            } catch (Exception e) {
                // Best effort only: a failure to write the side file must not abort parsing.
                e.printStackTrace();
            }
            distinctReferences.add(referenceText);
        }
        for (String reference : distinctReferences) {
            metadata.add(REFERENCE_AUTHOR_KEY, reference);
        }
    }

    /**
     * Determines the body text: the fulltext of the linked PDF when the page offers a PDF download
     * link, otherwise the text of the page's {@code .a-plus-plus} elements.
     *
     * <p>When a PDF link exists, {@code "source"} in {@code metadata} is replaced by the PDF URL and
     * the PDF's metadata is merged into {@code metadata}. The result is {@code null} if the PDF
     * crawl never reports a non-null fulltext.
     */
    private static String resolveBody(Document page, final Metadata metadata) throws IOException, SAXException, TikaException {
        // Try the download links in order: article, then chapter, then book.
        Elements pdfLink = page.select("#abstract-actions-download-article-pdf-link[href]");
        if (pdfLink.isEmpty()) {
            pdfLink = page.select("#abstract-actions-download-chapter-pdf-link[href]");
        }
        if (pdfLink.isEmpty()) {
            pdfLink = page.select("#abstract-actions-download-book-pdf-link[href]");
        }
        if (pdfLink.isEmpty()) {
            return page.select(".a-plus-plus").text();
        }

        String pdfUrl = pdfLink.attr("abs:href");
        metadata.remove(SOURCE_KEY);
        metadata.add(SOURCE_KEY, pdfUrl);

        // Mutable holder so that the anonymous callback below can hand the fulltext back.
        final ValueBox valueBox = new ValueBox((Object) null);
        new Leech().parse(pdfUrl, new PrintlnContentHandler(new DataSinkContentHandlerAdapter() {
            /**
             * Keeps the reported fulltext and copies the reported metadata into the page metadata,
             * skipping the source, the data entity id and the content fingerprint.
             */
            public void processNewData(Metadata pdfMetadata, String fulltext) {
                if (fulltext != null) {
                    valueBox.setValue(fulltext);
                }
                pdfMetadata.remove(SOURCE_KEY);
                for (String name : pdfMetadata.names()) {
                    if ("dataEntityId".equals(name) || "dataEntityContentFingerprint".equals(name)) {
                        continue;
                    }
                    boolean isModified = TikaCoreProperties.MODIFIED.getName().equalsIgnoreCase(name);
                    for (String value : pdfMetadata.getValues(name)) {
                        // A modification date already present in the page metadata wins; the
                        // PDF's value is only added while none is set.
                        if (!isModified || metadata.get(TikaCoreProperties.MODIFIED) == null) {
                            metadata.add(name, value);
                        }
                    }
                }
            }
        }).setShowOnlyErrors(true).setVerbosity(PrintlnContentHandler.Verbosity.titlePlusMetadata));

        return (String) valueBox.getValue();
    }
}
