package de.dfki.leech.dhbw;

import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.i18n.LdLocale;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.CommonTextObjectFactories;
import com.optimaize.langdetect.text.TextObjectFactory;
import de.dfki.inquisitor.collections.MultiValueHashMap;
import de.dfki.inquisitor.collections.TwoValuesBox;
import de.dfki.leech.AbstractCsvParser;
import java.io.IOException;
import java.util.Optional;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;

/* loaded from: input_file:de/dfki/leech/dhbw/DhbwCsvThesisParser.class */
/**
 * CSV parser for DHBW thesis exports (media type {@code application/vnd.dhbw.thesis_csv}).
 *
 * <p>The CSV is semicolon-separated and its first line carries the column names. Selected columns
 * are mapped to metadata attributes, and each row's body text is run through a language detector
 * so that a {@code language_detected} attribute ({@code "en"} or {@code "de"}) can be attached.
 */
public class DhbwCsvThesisParser extends AbstractCsvParser {
    protected static final Set<MediaType> SUPPORTED_TYPES = Set.of(MediaType.application("vnd.dhbw.thesis_csv"));
    protected static TextObjectFactory langDetectorTextProcessor = CommonTextObjectFactories.forDetectingOnLargeText();

    /**
     * Shared language detector; stays {@code null} when the built-in language profiles could not
     * be loaded, in which case language detection is skipped.
     */
    protected static LanguageDetector languageDetector;

    private static final Logger LOG = Logger.getLogger(DhbwCsvThesisParser.class.getName());

    /** Key under which the row body is looked up in the sub data entity (presumably set by the base parser). */
    private static final String BODY_MARKER_KEY = "LeechAbstractCsvParserUniqueBodyMarker";

    /** Metadata attribute that receives the detected language code. */
    private static final String LANGUAGE_ATTRIBUTE = "language_detected";

    private static final String LANGUAGE_ENGLISH = "en";
    private static final String LANGUAGE_GERMAN = "de";

    static {
        try {
            languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard())
                    .withProfiles(new LanguageProfileReader().readAllBuiltIn())
                    .build();
        } catch (IOException e) {
            // Keep the parser usable without language detection instead of failing class loading;
            // processSubDataEntity checks for the missing detector.
            LOG.log(Level.SEVERE, "Could not load built-in language profiles; language detection is disabled", e);
        }
    }

    /**
     * @return {@code true}, the first CSV line holds the column names
     */
    @Override
    public boolean firstLineColumnNames() {
        return true;
    }

    /**
     * Returns the column-name to attribute-name mappings, filling them on first use.
     *
     * @return the mappings held by this parser instance
     */
    @Override
    public MultiValueHashMap<String, String> getAttributeMappings() {
        if (this.m_hsAttributeMappings.isEmpty()) {
            this.m_hsAttributeMappings.add("THEMA", TikaCoreProperties.TITLE.getName());
            this.m_hsAttributeMappings.add("ABSTRACT", "body");
            this.m_hsAttributeMappings.add("DATUM", TikaCoreProperties.MODIFIED.getName());
            this.m_hsAttributeMappings.add("TYP", "dhbw_modul");
            // Same column with a leading U+FEFF (byte order mark) glued to its name, as happens
            // when TYP is the first header of a file that starts with a BOM.
            this.m_hsAttributeMappings.add("\ufeffTYP", "dhbw_modul");
            this.m_hsAttributeMappings.add("Standort", "dhbw_standort");
            this.m_hsAttributeMappings.add("Studiengang", "dhbw_fachbereich");
        }
        return this.m_hsAttributeMappings;
    }

    /**
     * Returns the per-column extraction rules (target attribute name plus regular expression),
     * filling them on first use. How the expression is applied is defined by the base parser;
     * presumably the capture group yields the attribute value.
     *
     * @return the extraction rules held by this parser instance
     */
    @Override
    public MultiValueHashMap<String, TwoValuesBox<String, String>> getAttributeValueExtractionRegExs() {
        if (this.m_hsAttributeExtractionMappings.isEmpty()) {
            this.m_hsAttributeExtractionMappings.add("KURS", new TwoValuesBox<>("dhbw_kurs", "\\w+-\\D+(\\d+\\w+)"));
            this.m_hsAttributeExtractionMappings.add("DATUM", new TwoValuesBox<>("dhbw_jahrabgabe", "(\\d+)-\\d+-\\d+ \\d+:\\d+:\\d+"));
        }
        return this.m_hsAttributeExtractionMappings;
    }

    /**
     * @return a fresh parser configuration using {@code ';'} as the column separator
     */
    @Override
    public AbstractCsvParser.CsvParserConfig getCsvParserConfig() {
        AbstractCsvParser.CsvParserConfig config = new AbstractCsvParser.CsvParserConfig();
        config.separator = ';';
        return config;
    }

    /**
     * @return the column names used for the content fingerprint of a row
     */
    @Override
    public String[] getDataEntityContentFingerprintColumNames() {
        return new String[]{"DATUM", "KURS", "THEMA"};
    }

    /**
     * @return the column names used for the id of a row
     */
    @Override
    public String[] getDataEntityIdColumNames() {
        return new String[]{"DATUM", "KURS", "THEMA"};
    }

    /**
     * @return an empty map, no static attribute/value pairs are added to the rows
     */
    @Override
    public MultiValueHashMap<String, Object> getStaticAttValPairs4Lines() {
        return new MultiValueHashMap<>();
    }

    /**
     * @param parseContext unused
     * @return the single media type handled by this parser
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return SUPPORTED_TYPES;
    }

    /**
     * Adds a {@code language_detected} attribute to the metadata before delegating to the base
     * implementation.
     *
     * <p>The first non-null value stored under the body marker key is run through the language
     * detector. The attribute is set to {@code "en"} when English is detected and to {@code "de"}
     * in every other case, including an inconclusive detection. When there is no body value, or
     * the language detector could not be initialized, no attribute is added.
     *
     * @param multiValueHashMap the attribute/value pairs of the current row
     * @param metadata the metadata object that receives the language attribute
     * @param contentHandler passed through to the base implementation
     * @param parseContext passed through to the base implementation
     * @throws Exception whatever the base implementation throws
     */
    /* JADX INFO: Access modifiers changed from: protected */
    @Override
    public void processSubDataEntity(MultiValueHashMap<String, Object> multiValueHashMap, Metadata metadata, ContentHandler contentHandler, ParseContext parseContext) throws Exception {
        Optional<String> bodyText = multiValueHashMap.entryList().stream()
                .filter(entry -> BODY_MARKER_KEY.equals(entry.getKey()) && entry.getValue() != null)
                .map(entry -> entry.getValue().toString())
                .findFirst();

        // languageDetector is null when the language profiles failed to load at class init.
        if (bodyText.isPresent() && languageDetector != null) {
            com.google.common.base.Optional<LdLocale> detected =
                    languageDetector.detect(langDetectorTextProcessor.forText(bodyText.get()));
            boolean english = detected.isPresent() && LANGUAGE_ENGLISH.equals(detected.get().getLanguage());
            metadata.add(LANGUAGE_ATTRIBUTE, english ? LANGUAGE_ENGLISH : LANGUAGE_GERMAN);
        }
        super.processSubDataEntity(multiValueHashMap, metadata, contentHandler, parseContext);
    }
}
