package de.dfki.leech.foresight;

import de.dfki.inquisitor.collections.MultiValueHashMap;
import de.dfki.inquisitor.collections.ValueBox;
import de.dfki.inquisitor.processes.StopWatch;
import de.dfki.inquisitor.text.StringUtils;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.SubDataEntityContentHandler;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser;
import de.dfki.km.leech.sax.DataSinkContentHandlerAdapter;
import de.dfki.km.leech.sax.PrintlnContentHandler;
import de.dfki.leech.AbstractCsvParser;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import javax.mail.URLName;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/* loaded from: input_file:de/dfki/leech/foresight/SpringerLinkResultListCsvParser.class */
public class SpringerLinkResultListCsvParser extends AbstractCsvParser {
    private static final long serialVersionUID = 2806053179884244671L;
    protected static int iCurrentItemIndex = 0;
    protected static int iSleepDuration = 120000;
    protected static int iSleepIntervall = 100;
    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet(Arrays.asList(MediaType.application("vnd.fs.springerlinkSearch"))));

    public static void main(String[] strArr) throws Throwable {
        new Leech().parse("http://link.springer.com/search?query=AND+multiscale+AND+modeling+AND+simulation&date-facet-mode=between&previous-end-year=2015&facet-start-year=2009&showAll=false&facet-end-year=2015&previous-start-year=1959", new PrintlnContentHandler(PrintlnContentHandler.Verbosity.all), new ParseContext());
    }

    @Override // de.dfki.leech.AbstractCsvParser
    public MultiValueHashMap<String, String> getAttributeMappings() {
        MultiValueHashMap<String, String> multiValueHashMap = new MultiValueHashMap<>();
        multiValueHashMap.add("Item Title", TikaCoreProperties.TITLE.getName());
        multiValueHashMap.add("Authors", TikaCoreProperties.CREATOR.getName());
        multiValueHashMap.add("Content Type", "Item Type");
        return multiValueHashMap;
    }

    @Override // de.dfki.leech.AbstractCsvParser
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return SUPPORTED_TYPES;
    }

    public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
        String replace = metadata.get("source").replace("//link.springer.com/search", "//link.springer.com/search/csv").replace("//rd.springer.com/search", "//rd.springer.com/search/csv");
        if (replace.equals(metadata.get("source"))) {
            LoggerFactory.getLogger(SpringerLinkResultListCsvParser.class.getName()).warn("Error: could not transform springerlink url to csv query: " + replace);
            return;
        }
        String parseToString = new Leech().parseToString(new URLName(replace));
        LoggerFactory.getLogger(SpringerLinkResultListCsvParser.class.getName()).info("Will crawl " + StringUtils.countMatches(parseToString, "\n") + " csv entries for " + replace);
        super.parse(TikaInputStream.get(parseToString.getBytes()), contentHandler, metadata, parseContext);
    }

    @Override // de.dfki.leech.AbstractCsvParser
    protected void processCurrentDataEntity(InputStream inputStream, Metadata metadata, ContentHandler contentHandler, ParseContext parseContext) throws Exception {
    }

    /* JADX INFO: Access modifiers changed from: protected */
    @Override // de.dfki.leech.AbstractCsvParser
    public void processSubDataEntity(MultiValueHashMap<String, Object> multiValueHashMap, final Metadata metadata, ContentHandler contentHandler, ParseContext parseContext) throws Exception {
        String str;
        for (Map.Entry entry : multiValueHashMap.entryList()) {
            metadata.add((String) entry.getKey(), entry.getValue().toString());
        }
        String str2 = metadata.get(TikaCoreProperties.CREATOR);
        metadata.remove(TikaCoreProperties.CREATOR.getName());
        if (!StringUtils.nullOrWhitespace(str2)) {
            for (String str3 : str2.split("(?<!(^|[A-Z]|\\s))(?=[A-Z])|(?<!^|\\s)(?=[A-Z][a-z])")) {
                metadata.add(TikaCoreProperties.CREATOR.getName(), str3.trim());
            }
        }
        String str4 = metadata.get("URL");
        metadata.remove("source");
        metadata.add("source", str4);
        Document document = Jsoup.connect(str4).get();
        Elements select = document.select("#abstract-about-cover-date");
        if (!select.isEmpty()) {
            String text = select.text();
            metadata.remove(TikaCoreProperties.MODIFIED.getName());
            metadata.add(TikaCoreProperties.MODIFIED.getName(), text);
        }
        CrawlerContext crawlerContext = (CrawlerContext) parseContext.get(CrawlerContext.class);
        if (crawlerContext == null) {
            crawlerContext = new CrawlerContext();
            parseContext.set(CrawlerContext.class, crawlerContext);
        }
        IncrementalCrawlingHistory incrementalCrawlingHistory = crawlerContext.getIncrementalCrawlingHistory();
        metadata.add("dataEntityId", metadata.get("source"));
        metadata.add("dataEntityContentFingerprint", metadata.get("source"));
        if (IncrementalCrawlingParser.performHistoryStuff(incrementalCrawlingHistory, metadata)) {
            metadata.remove("dataEntityId");
            metadata.remove("dataEntityContentFingerprint");
            final ValueBox valueBox = new ValueBox((Object) null);
            Elements select2 = document.select("#abstract-actions-download-article-pdf-link[href]");
            if (select2.isEmpty()) {
                select2 = document.select("#abstract-actions-download-chapter-pdf-link[href]");
            }
            if (select2.isEmpty()) {
                select2 = document.select("#abstract-actions-download-book-pdf-link[href]");
            }
            if (select2.isEmpty()) {
                valueBox.setValue(document.select(".a-plus-plus").text());
            } else {
                String attr = select2.attr("abs:href");
                metadata.remove("source");
                metadata.add("source", attr);
                new Leech().parse(attr, new PrintlnContentHandler(new DataSinkContentHandlerAdapter() { // from class: de.dfki.leech.foresight.SpringerLinkResultListCsvParser.1
                    public void processNewData(Metadata metadata2, String str5) {
                        if (str5 != null) {
                            valueBox.setValue(str5);
                        }
                        metadata2.remove("source");
                        for (String str6 : metadata2.names()) {
                            for (String str7 : metadata2.getValues(str6)) {
                                if (!TikaCoreProperties.MODIFIED.getName().equalsIgnoreCase(str6)) {
                                    metadata.add(str6, str7);
                                } else if (metadata.get(TikaCoreProperties.MODIFIED) == null) {
                                    metadata.add(str6, str7);
                                }
                            }
                        }
                    }
                }).setShowOnlyErrors(true).setVerbosity(PrintlnContentHandler.Verbosity.titlePlusMetadata));
            }
            if (metadata.get(TikaCoreProperties.MODIFIED) == null && (str = metadata.get("Publication Year")) != null) {
                metadata.add(TikaCoreProperties.MODIFIED.getName(), str);
            }
            metadata.remove("LeechAbstractCsvParserUniqueBodyMarker");
            new SubDataEntityContentHandler(contentHandler, metadata, (String) valueBox.getValue()).triggerSubDataEntityHandling();
        }
        int i = iCurrentItemIndex + 1;
        iCurrentItemIndex = i;
        if (i >= iSleepIntervall) {
            iCurrentItemIndex = 0;
            LoggerFactory.getLogger(SpringerLinkResultListCsvParser.class.getName()).info("will sleep " + StopWatch.formatTimeDistance(iSleepDuration) + " in order to be not blocked by springer for the next 10 minutes.");
            Thread.sleep(iSleepDuration);
        }
    }
}
