package de.dfki.leech.dhbw;

import com.google.common.base.Optional;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObjectFactory;
import de.dfki.inquisitor.collections.MultiValueHashMap;
import de.dfki.inquisitor.text.StringUtils;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.sax.CrawlReportContentHandler;
import de.dfki.km.leech.sax.PrintlnContentHandler;
import de.dfki.km.leech.util.LeechException;
import de.dfki.leech.AbstractCsvParser;
import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;

/* loaded from: input_file:de/dfki/leech/dhbw/DhbwOcrCsvThesisParser.class */
/**
 * CSV parser registered for the media type {@code application/vnd.dhbw.thesis_ocrcsv}.
 *
 * <p>The CSV files are read with {@code ';'} as separator and with the first line as column
 * names. In addition to the mapped CSV columns, every line gets attributes that are parsed out
 * of the name of the CSV file's parent directory (or, as a fallback, out of the CSV file name
 * itself) plus a detected language ({@code "en"} or {@code "de"}) derived from the title.
 */
public class DhbwOcrCsvThesisParser extends AbstractCsvParser {
    protected static final Set<MediaType> SUPPORTED_TYPES = Set.of(MediaType.application("vnd.dhbw.thesis_ocrcsv"));
    protected static TextObjectFactory langDetectorTextProcessor = CommonTextObjectFactories.forDetectingOnLargeText();
    protected static LanguageDetector languageDetector;

    /** Columns used both as data entity id and as content fingerprint; handed out as copies only. */
    private static final String[] ENTITY_KEY_COLUMNS = {
        "dhbw_jahrabgabe", "dhbw_standort", "dhbw_fachbereich", "dhbw_modul", TikaCoreProperties.TITLE.getName()
    };

    static {
        try {
            languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()).withProfiles(new LanguageProfileReader().readAllBuiltIn()).build();
        } catch (IOException e) {
            // Without the built-in profiles there is no detector; fail class initialization with the
            // real cause instead of leaving the field null and hitting an unrelated NPE per CSV line.
            throw new ExceptionInInitializerError(e);
        }
    }

    /**
     * Manual test entry point: empties the local incremental-crawling history directory and
     * crawls a hard-coded corpus directory, printing all crawled data to stdout.
     *
     * @param strArr ignored
     * @throws Exception if the crawl fails
     */
    public static void main(String[] strArr) throws Exception {
        final String historyPath = "./historyCheck/deleteMe";

        File historyDir = new File(historyPath);
        if (!historyDir.exists()) {
            historyDir.mkdirs();
        }
        // Start with an empty history so that every entity is crawled again.
        Arrays.stream(Objects.requireNonNull(historyDir.listFiles())).forEach(File::delete);

        CrawlReportContentHandler reportHandler =
                new CrawlReportContentHandler(new PrintlnContentHandler(PrintlnContentHandler.Verbosity.all).setShowOnlyErrors(false))
                        .setCyclicReportPrintln(3000L);
        ParseContext parseContext = new CrawlerContext().setIncrementalCrawlingHistoryPath(historyPath).createParseContext();

        new Leech().parse("/home/reuschling/downloads/muell/dhbw_thesisCorpus4dynaq", reportHandler, parseContext);
        System.out.println("finished");
    }

    /**
     * @return always {@code true}: the first CSV line holds the column names
     */
    @Override
    public boolean firstLineColumnNames() {
        return true;
    }

    /**
     * Returns the column-name to attribute-name mappings, filling the inherited map on first use:
     * {@code Thema}/{@code Titel} map to the Tika title property, {@code Abstract}/
     * {@code Beschreibung} map to {@code body}.
     *
     * @return the (shared, inherited) mapping instance
     */
    @Override
    public MultiValueHashMap<String, String> getAttributeMappings() {
        if (this.m_hsAttributeMappings.isEmpty()) {
            String titleAttribute = TikaCoreProperties.TITLE.getName();
            this.m_hsAttributeMappings.add("Thema", titleAttribute);
            this.m_hsAttributeMappings.add("Abstract", "body");
            this.m_hsAttributeMappings.add("Titel", titleAttribute);
            this.m_hsAttributeMappings.add("Beschreibung", "body");
        }
        return this.m_hsAttributeMappings;
    }

    /**
     * @return a fresh parser configuration using {@code ';'} as column separator
     */
    @Override
    public AbstractCsvParser.CsvParserConfig getCsvParserConfig() {
        AbstractCsvParser.CsvParserConfig config = new AbstractCsvParser.CsvParserConfig();
        config.separator = ';';
        return config;
    }

    /**
     * @return a new array with the attribute names that form the content fingerprint
     */
    @Override
    public String[] getDataEntityContentFingerprintColumNames() {
        return ENTITY_KEY_COLUMNS.clone();
    }

    /**
     * @return a new array with the attribute names that form the data entity id
     */
    @Override
    public String[] getDataEntityIdColumNames() {
        return ENTITY_KEY_COLUMNS.clone();
    }

    /**
     * Builds the additional attributes for the given attribute/value pairs (presumably those of
     * one CSV line; confirm against {@code AbstractCsvParser}).
     *
     * <p>Added attributes: Tika created/modified dates ({@code "1.1." + year}),
     * {@code dhbw_jahrabgabe}, {@code dhbw_standort}, {@code dhbw_fachbereich}, {@code dhbw_modul},
     * optionally {@code dhbw_kurs}, {@code language_detected} (only when a non-blank title exists)
     * and {@code source}, which is the {@code path2pdf} value with the existing {@code source}
     * value as fallback.
     *
     * <p>Side effect: the {@code source} entry is removed from {@code multiValueHashMap}.
     *
     * @param multiValueHashMap the already known attributes; must contain {@code csvSource}
     * @return a new map holding only the additional attributes
     * @throws LeechException if {@code csvSource} is missing or neither the parent directory name
     *         nor the file name matches one of the expected dash-separated patterns
     */
    @Override
    public MultiValueHashMap<String, Object> getFurtherAttValPairs4Lines(MultiValueHashMap<String, Object> multiValueHashMap) {
        Object csvSource = multiValueHashMap.getFirst("csvSource", new Object[0]);
        if (csvSource == null) {
            throw new LeechException("Can not parse metadata: attribute 'csvSource' is missing");
        }
        // Index 0 is skipped below; it is presumably the whole match, the fields start at index 1.
        List<String> groups = parseCorpusMetadata(csvSource.toString());

        MultiValueHashMap<String, Object> furtherAttributes = new MultiValueHashMap<>();
        String year = groups.get(1);
        furtherAttributes.add(TikaCoreProperties.CREATED.getName(), "1.1." + year);
        furtherAttributes.add(TikaCoreProperties.MODIFIED.getName(), "1.1." + year);
        furtherAttributes.add("dhbw_jahrabgabe", year);
        furtherAttributes.add("dhbw_standort", groups.get(2));
        furtherAttributes.add("dhbw_fachbereich", groups.get(3));
        furtherAttributes.add("dhbw_modul", groups.get(4));
        // Only the five-part patterns deliver a fifth field.
        if (groups.size() > 5) {
            furtherAttributes.add("dhbw_kurs", groups.get(5));
        }

        // Guard against a missing title before calling toString(); such lines get no language.
        Object title = multiValueHashMap.getFirst(TikaCoreProperties.TITLE.getName(), new Object[0]);
        String titleText = title == null ? null : title.toString();
        if (StringUtils.notNullOrWhitespace(titleText)) {
            furtherAttributes.add("language_detected", detectLanguage(titleText));
        }

        // Prefer the pdf path as source; the previous source value only serves as fallback.
        Object previousSource = multiValueHashMap.getFirst("source", new Object[0]);
        Object source = multiValueHashMap.getFirst("path2pdf", new Object[]{previousSource});
        multiValueHashMap.remove("source");
        furtherAttributes.add("source", source);
        return furtherAttributes;
    }

    /**
     * @param parseContext not used
     * @return {@link #SUPPORTED_TYPES}
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return SUPPORTED_TYPES;
    }

    /**
     * Extracts the dash-separated metadata fields for a CSV source. The parent directory name is
     * tried first (five fields, then four), afterwards the CSV file name (five fields, then four).
     *
     * @param csvSource the CSV location; a {@code "file:/"} occurrence is stripped before it is
     *        interpreted as a file system path
     * @return the regex groups of the first matching pattern, never {@code null}
     * @throws LeechException if no pattern matches
     */
    private static List<String> parseCorpusMetadata(String csvSource) {
        Path csvPath = Paths.get(csvSource.replace("file:/", ""));
        Path parentDir = csvPath.getParent();
        // getParent() and getFileName() may both return null (e.g. single-element or root paths).
        String dirName = nameOf(parentDir == null ? null : parentDir.getFileName());
        String fileName = nameOf(csvPath.getFileName());

        List<String> groups = firstMatch(dirName, "(.*)-(.*)-(.*)-(.*)-(.*)", "(.*)-(.*)-(.*)-(.*)");
        if (groups == null) {
            groups = firstMatch(fileName, "(.*)-(.*)-(.*)-(.*)-(.*).csv", "(.*)-(.*)-(.*)-(.*).csv");
        }
        if (groups == null) {
            throw new LeechException("Can not parse metadata out of the parent directory name of " + csvSource);
        }
        return groups;
    }

    /** Returns the string form of the given path element, or {@code null} if there is none. */
    private static String nameOf(Path pathElement) {
        return pathElement == null ? null : pathElement.toString();
    }

    /**
     * Applies the patterns in order and returns the groups of the first one that yields a result;
     * {@code null} if {@code text} is {@code null} or no pattern yields one.
     */
    private static List<String> firstMatch(String text, String... regexes) {
        if (text == null) {
            return null;
        }
        for (String regex : regexes) {
            List<String> groups = StringUtils.findGroups(regex, text);
            if (groups != null) {
                return groups;
            }
        }
        return null;
    }

    /** Returns {@code "en"} if the detector reports English for the text, otherwise {@code "de"}. */
    private static String detectLanguage(String text) {
        Optional<LdLocale> detected = languageDetector.detect(langDetectorTextProcessor.forText(text));
        boolean english = detected.isPresent() && "en".equals(detected.get().getLanguage());
        return english ? "en" : "de";
    }
}
