package de.dfki.km.exact.xplain.sinnodium;

import de.dfki.km.exact.file.CSVWriter;
import de.dfki.km.exact.file.EUFileReader;
import de.dfki.km.exact.lucene.LURecycler;
import de.dfki.km.exact.lucene.LUSearcher;
import de.dfki.km.exact.lucene.LUTermInfo;
import de.dfki.km.exact.lucene.LUWriter;
import de.dfki.km.exact.lucene.file.LUDcoumentFactory;
import de.dfki.km.exact.lucene.meta.LUMetaSearcher;
import de.dfki.km.exact.lucene.util.LUContextVerifier;
import de.dfki.km.exact.lucene.util.LULabelAnalyser;
import de.dfki.km.exact.lucene.util.LULabelSimilarity;
import de.dfki.km.exact.lucene.util.LULocal;
import de.dfki.km.exact.misc.EULogger;
import de.dfki.km.exact.nlp.EUCharacter;
import de.dfki.km.exact.nlp.EUDigit;
import de.dfki.km.exact.nlp.FrequencyClass;
import de.dfki.km.exact.nlp.NLP;
import de.dfki.km.exact.nlp.StopWord;
import de.dfki.km.exact.web.lucene.TRIPLE;
import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import org.apache.lucene.store.RAMDirectory;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.util.PDFTextStripper;

/* loaded from: input_file:de/dfki/km/exact/xplain/sinnodium/UnderExtractor.class */
/**
 * Command-line tool that indexes the text of the PDF at {@link #BAUKASTEN} into an in-memory
 * Lucene directory, analyses its single-word terms and writes the result to the CSV file at
 * {@link #ANALYZED_BAUKASTEN}.
 *
 * <p>The CSV file consists of three sections, separated by blank rows:
 * <ol>
 *   <li>one row per accepted term: term, frequency reported by the in-memory index,
 *       frequency class reported by the Spiegel meta searcher;</li>
 *   <li>one row per frequency class (ascending): class, number of terms in that class,
 *       summed term frequency of that class;</li>
 *   <li>one row per term whose Wikipedia concept document passed the context check,
 *       containing the value of that document's {@code "uri"} field.</li>
 * </ol>
 */
public class UnderExtractor {
    public static RAMDirectory DIRECTORY;
    public static String RAM_DOC_NAME = "Technischer Baukasten";
    public static String CONTEXT_FILE = "resource/Sinnodium/BaukastenContext.txt";
    public static String BAUKASTEN = "resource/Sinnodium/TechnischerBaukasten.pdf";
    public static String ANALYZED_BAUKASTEN = "resource/Sinnodium/TechnischerBaukastenAnalyzed.csv";

    /**
     * Runs the complete extraction: builds the in-memory index, analyses every single-word
     * term and writes the three CSV sections described on the class.
     *
     * @param strArr command-line arguments; not used
     * @throws Exception if indexing, searching or writing the CSV file fails
     */
    public static void main(String[] strArr) throws Exception {
        writeDirectory();
        LUSearcher baukastenSearcher = new LUSearcher(DIRECTORY);
        LUSearcher searcherWikipediaDE = LULocal.getSearcherWikipediaDE();
        List<String> contextLines = EUFileReader.getLines(CONTEXT_FILE);
        LUMetaSearcher metaSearcherSpiegel = LULocal.getMetaSearcherSpiegel();
        LUContextVerifier contextVerifier =
                new LUContextVerifier(searcherWikipediaDE, LULocal.getMetaSearcherWikipediaDE(), contextLines);
        CSVWriter csv = new CSVWriter(ANALYZED_BAUKASTEN, TRIPLE.TYPE_DELIMiTER, "Cp1252");
        try {
            // NOTE(review): 0.7 is presumably a similarity threshold -- confirm against LULabelAnalyser.
            LULabelAnalyser labelAnalyser = new LULabelAnalyser(0.7d, new LULabelSimilarity(), searcherWikipediaDE);
            Set<String> singleWordTerms = baukastenSearcher.getSingleWordTerms();
            int maxFrequency = metaSearcherSpiegel.getMaxFrequency();

            // Per frequency class: how many terms fell into it, and their summed frequency.
            HashMap<Integer, Integer> termCountByClass = new HashMap<Integer, Integer>();
            HashMap<Integer, Integer> frequencySumByClass = new HashMap<Integer, Integer>();
            // Term -> "uri" field of the Wikipedia concept document that passed the context check.
            HashMap<String, String> conceptUriByTerm = new HashMap<String, String>();

            for (String term : singleWordTerms) {
                if (!isCandidateTerm(term)) {
                    continue;
                }
                // Extra trace line for one specific term, kept from the original implementation.
                if (term.equals("mutter")) {
                    EULogger.info(term);
                }

                LUTermInfo termInfo = baukastenSearcher.getSingleWordTermInfo(term, new String[]{"content"});
                int frequency = termInfo.getFrequency();
                csv.writeCell(term);
                csv.writeIntegerCell(frequency);

                int frequencyClass = metaSearcherSpiegel.getFrequencyClass(term);
                if (frequencyClass == -1) {
                    // -1 is replaced by one class beyond the class calculated for frequency 1.
                    frequencyClass = FrequencyClass.calculate(maxFrequency, 1) + 1;
                }
                csv.writeIntegerCell(frequencyClass);

                // Each accumulator is updated from its own previous value; mixing the two maps
                // up would corrupt the summary and leave frequencySumByClass without entries.
                Integer classKey = Integer.valueOf(frequencyClass);
                addTo(termCountByClass, classKey, 1);
                addTo(frequencySumByClass, classKey, frequency);

                csv.nextLine();
                EULogger.info(term + " " + frequency);

                int conceptDoc = labelAnalyser.getConceptDoc(term);
                if (conceptDoc != -1) {
                    String[] recycled = LURecycler.recycle(
                            conceptDoc, "content", searcherWikipediaDE.getIndexSearcher().getIndexReader());
                    if (recycled != null && contextVerifier.isContext(recycled)) {
                        conceptUriByTerm.put(term, searcherWikipediaDE.getValue(conceptDoc, "uri"));
                    }
                }
            }

            // Section 2: summary per frequency class, in ascending class order.
            LinkedList<Integer> sortedClasses = new LinkedList<Integer>(termCountByClass.keySet());
            Collections.sort(sortedClasses);
            csv.nextLine();
            csv.nextLine();
            for (Integer frequencyClass : sortedClasses) {
                csv.writeIntegerCell(frequencyClass.intValue());
                csv.writeIntegerCell(termCountByClass.get(frequencyClass).intValue());
                csv.writeIntegerCell(frequencySumByClass.get(frequencyClass).intValue());
                csv.nextLine();
            }

            // Section 3: URIs of the verified concept documents.
            csv.nextLine();
            csv.nextLine();
            for (String uri : conceptUriByTerm.values()) {
                csv.writeCell(uri);
                csv.nextLine();
            }
        } finally {
            // Close the writer even when the analysis fails half-way.
            csv.close();
        }
    }

    /**
     * Extracts the text of the PDF document located at {@link #BAUKASTEN}.
     *
     * @return the text returned by the PDF text stripper for the whole document
     * @throws Exception if the document cannot be loaded or its text cannot be extracted
     */
    public static String getText() throws Exception {
        PDFTextStripper stripper = new PDFTextStripper();
        PDDocument document = PDDocument.load(BAUKASTEN);
        try {
            return stripper.getText(document);
        } finally {
            // Release the document even if text extraction throws.
            document.close();
        }
    }

    /**
     * Replaces {@link #DIRECTORY} with a fresh in-memory directory containing a single document
     * named {@link #RAM_DOC_NAME} whose content is the text returned by {@link #getText()}.
     *
     * @throws Exception if the PDF cannot be read or the index cannot be written
     */
    public static void writeDirectory() throws Exception {
        DIRECTORY = new RAMDirectory();
        LUWriter writer = new LUWriter(DIRECTORY);
        writer.create();
        try {
            writer.add(LUDcoumentFactory.getDocument(
                    "http://www.dfki.de/km/ontology/forcher/fweb#DefaultNamedGraph", RAM_DOC_NAME, getText()));
        } finally {
            // Close the index writer even if reading the PDF or adding the document fails.
            writer.close();
        }
    }

    /**
     * Tells whether a term takes part in the analysis: it must not be a German stop word, must
     * be longer than two characters and must contain neither special characters nor digits.
     */
    private static boolean isCandidateTerm(String term) {
        return !StopWord.isStopWord(term, NLP.LANGUAGE.de)
                && term.length() > 2
                && !EUCharacter.hasSpecialCharacter(term)
                && !EUDigit.hasDigit(term);
    }

    /** Adds {@code delta} to the value stored under {@code key}, treating a missing entry as 0. */
    private static void addTo(HashMap<Integer, Integer> totals, Integer key, int delta) {
        Integer previous = totals.get(key);
        int base = previous == null ? 0 : previous.intValue();
        totals.put(key, Integer.valueOf(base + delta));
    }
}
