package de.dfki.km.exact.nlp.sfc;

import de.dfk.km.exact.nlp.wkt2j.api.Lexem;
import de.dfk.km.exact.nlp.wkt2j.api.Lexicon;
import de.dfk.km.exact.nlp.wkt2j.api.Wiktionary;
import de.dfk.km.exact.nlp.wkt2j.impl.WiktionaryFactory;
import de.dfki.km.exact.file.EUFileReader;
import de.dfki.km.exact.lucene.LUCooccurrence;
import de.dfki.km.exact.lucene.LUFieldFactory;
import de.dfki.km.exact.lucene.LUQueryFactory;
import de.dfki.km.exact.lucene.LURAMWriter;
import de.dfki.km.exact.lucene.LUSearcher;
import de.dfki.km.exact.lucene.LUTermInfo;
import de.dfki.km.exact.lucene.LUWindow;
import de.dfki.km.exact.lucene.LUWriter;
import de.dfki.km.exact.lucene.meta.LUMetaSearcher;
import de.dfki.km.exact.lucene.meta.LUTermSearcher;
import de.dfki.km.exact.lucene.wiki.LUWikiSearcher;
import de.dfki.km.exact.math.Average;
import de.dfki.km.exact.math.VMATH;
import de.dfki.km.exact.misc.EULocal;
import de.dfki.km.exact.misc.EULogger;
import de.dfki.km.exact.misc.EUString;
import de.dfki.km.exact.nlp.CoocurencyClass;
import de.dfki.km.exact.nlp.EUPhrase;
import de.dfki.km.exact.nlp.EUPhraser;
import de.dfki.km.exact.nlp.EUStopWord;
import de.dfki.km.exact.nlp.EUTerm;
import de.dfki.km.exact.nlp.FrequencyClass;
import de.dfki.km.exact.nlp.NLP;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.store.RAMDirectory;

/* loaded from: input_file:de/dfki/km/exact/nlp/sfc/SemFreqClass.class */
/**
 * Calculates frequency classes and co-occurrence coefficients for terms and phrases,
 * optionally restricted to text windows that match a context query.
 *
 * <p>Text windows around a term (and its word forms) are read from a reference index, written
 * into a temporary in-memory index and filtered with the query built by
 * {@link #setContext(Collection)}. The number of surviving windows is turned into a class via
 * {@code FrequencyClass.calculate}.
 *
 * <p>Instances are not thread-safe: they keep unsynchronized caches and share the temporary
 * index through instance fields.
 */
public class SemFreqClass {
    /** Delimiter characters handed to {@code EUPhraser.permute} when a string is turned into phrases. */
    private static final String PHRASE_DELIMITERS = " …•‚”“„‘«»<>’,|`.;:?!-_'/()[]{}@§$%&=^°*+~#´\"\\→„“";

    /** Index field that is searched and that window documents are written to. */
    private final String mField = "content";
    /** Not read anywhere in this class; kept from the original field layout. */
    private final int mTermSize = 3;
    /** Frequency classes below this value are dropped by the phrase-level methods. */
    private int mMinFC = 7;
    /** Minimum hit score a window needs when {@link #mUseMinScore} is enabled. */
    private final float mMinScore = 0.1f;
    /** Window size passed to the searcher's window iterator. */
    private int mWindowSize = 300;
    private int mMaxFrequency;
    /** Class returned for terms without any (matching) window. */
    private int mMaxFreqClass = 25;
    private int mSecondLargestFrequency = 24;
    /** Similarity threshold passed to the meta searcher when looking up similar terms. */
    private double mSimilarity = 0.85d;
    private final NLP.LANGUAGE mLanguage;
    private Query mContextQuery;
    private LUSearcher mSearcher;
    private VMATH.AVGTYPE mAvgType = VMATH.AVGTYPE.ARITH;
    private final LUWikiSearcher mWikiSearcher;
    private final LUTermSearcher mTermSearcher;
    private final LUMetaSearcher mMetaSearcher;
    private LUWriter mAdjacenceWriter;
    private RAMDirectory mAdjacenceDirectory;
    /** Cache: term -> word forms (exposed through {@link #getWordFormMap()}). */
    private final Map<String, Set<String>> mWordFormMap = new HashMap<String, Set<String>>();
    /** Cache: multi-word label -> whether a matching wiki article label was found. */
    private final Map<String, Boolean> mConceptMap = new HashMap<String, Boolean>();
    /** Cache: normalized term -> frequency class. Cleared whenever an input of the calculation changes. */
    private final Map<String, Integer> mSFCMap = new HashMap<String, Integer>();
    private final String[] mMaxTerms;
    private boolean mUseMinScore = false;
    private boolean mUseContext = true;
    private boolean mUseForms = true;
    private final Set<String> mToIgnor = new HashSet<String>();
    private Wiktionary mWiktionary;

    /**
     * Creates a calculator that derives the maximum frequencies from the index itself.
     *
     * @throws Exception if the maximum frequencies cannot be read from the index
     */
    public SemFreqClass(NLP.LANGUAGE language, LUTermSearcher lUTermSearcher, LUWikiSearcher lUWikiSearcher) throws Exception {
        this(language, lUTermSearcher, lUWikiSearcher, new String[0]);
    }

    /**
     * Creates a calculator.
     *
     * @param language       language used for stop-word checks
     * @param lUTermSearcher provides the index searcher and the meta searcher
     * @param lUWikiSearcher used to verify multi-word terms against article labels
     * @param strArr         the two terms whose frequencies serve as largest and second largest
     *                       frequency; with fewer than two entries (or {@code null}) the values
     *                       are taken from the index's most frequent single-word terms
     * @throws Exception if the maximum frequencies cannot be determined
     */
    public SemFreqClass(NLP.LANGUAGE language, LUTermSearcher lUTermSearcher, LUWikiSearcher lUWikiSearcher, String[] strArr) throws Exception {
        this.mLanguage = language;
        this.mTermSearcher = lUTermSearcher;
        this.mWikiSearcher = lUWikiSearcher;
        this.mSearcher = lUTermSearcher.getIndexSearcher();
        this.mMetaSearcher = lUTermSearcher.getMetaSearcher();
        // Defensive copy; a null array is treated like "no terms given".
        this.mMaxTerms = strArr == null ? new String[0] : strArr.clone();
        setMaxFrequency(this.mMaxTerms);
    }

    /**
     * Sets the Wiktionary used to add further word forms in {@link #getForms(String)}.
     * Word forms that are already cached are not recomputed.
     */
    public void setWiktionary(String str) {
        this.mWiktionary = WiktionaryFactory.getWiktionary(str);
    }

    /** Sets the window size used when windows are read from the index. */
    public void setWindowSize(int i) {
        this.mWindowSize = i;
        this.mSFCMap.clear();
    }

    /**
     * Enables or disables the similar-term lookup in {@link #getForms(String)}.
     * Word forms that are already cached are not recomputed.
     */
    public void setUseForms(boolean z) {
        this.mUseForms = z;
    }

    /** Sets the similarity threshold passed to the meta searcher's similar-term lookup. */
    public void setSimilarity(double d) {
        this.mSimilarity = d;
    }

    public int getSecondLargestFrequency() {
        return this.mSecondLargestFrequency;
    }

    /** Enables or disables filtering of windows by the context query. */
    public void setUseContext(boolean z) {
        this.mUseContext = z;
        this.mSFCMap.clear();
    }

    /** Enables or disables the minimum-score requirement for context hits. */
    public void setUseMinScore(boolean z) {
        this.mUseMinScore = z;
        this.mSFCMap.clear();
    }

    /**
     * Switches to another reference index and re-reads the maximum frequencies from it.
     * If that fails, the previous searcher and frequencies are restored and a warning is logged.
     *
     * @param str location of the reference index, as accepted by {@code LUSearcher}
     */
    public void setReferenceIndex(String str) {
        LUSearcher previousSearcher = this.mSearcher;
        int previousMax = this.mMaxFrequency;
        int previousSecond = this.mSecondLargestFrequency;
        int previousMaxClass = this.mMaxFreqClass;
        try {
            this.mSearcher = new LUSearcher(str);
            setMaxFrequency(this.mMaxTerms);
            // Cached classes were computed against the old index.
            this.mSFCMap.clear();
        } catch (Exception e) {
            // Roll back so the searcher and the maximum frequencies stay consistent.
            this.mSearcher = previousSearcher;
            this.mMaxFrequency = previousMax;
            this.mSecondLargestFrequency = previousSecond;
            this.mMaxFreqClass = previousMaxClass;
            EULogger.warn(getClass(), "Could not set reference index! Cause: " + e);
        }
    }

    /** Reads largest and second largest frequency from the index's two most frequent single-word terms. */
    private void setMaxFrequency() throws Exception {
        int rank = 0;
        for (LUTermInfo info : this.mSearcher.getMaxSingleWordTerms(2, new String[]{this.mField})) {
            if (rank == 0) {
                this.mMaxFrequency = info.getFrequency();
                EULogger.info("Largest frequency: " + info.getTerm());
            } else if (rank == 1) {
                this.mSecondLargestFrequency = info.getFrequency();
                EULogger.info("Second largest frequency: " + info.getTerm());
            } else {
                break;
            }
            rank++;
        }
        this.mMaxFreqClass = FrequencyClass.calculate(this.mMaxFrequency, 1) + 1;
    }

    /**
     * Reads largest and second largest frequency from the first two given terms; falls back to
     * {@link #setMaxFrequency()} when fewer than two terms are given.
     */
    private void setMaxFrequency(String[] strArr) throws Exception {
        if (strArr == null || strArr.length < 2) {
            setMaxFrequency();
            return;
        }
        String[] fields = new String[]{this.mField};
        LUTermInfo largest = this.mSearcher.getSingleWordTermInfo(strArr[0], fields);
        LUTermInfo second = this.mSearcher.getSingleWordTermInfo(strArr[1], fields);
        if (largest == null || second == null) {
            // Defensive: presumably a term missing from the index yields no info — TODO confirm.
            setMaxFrequency();
            return;
        }
        this.mMaxFrequency = largest.getFrequency();
        this.mSecondLargestFrequency = second.getFrequency();
        this.mMaxFreqClass = FrequencyClass.calculate(this.mMaxFrequency, 1) + 1;
    }

    /**
     * Returns the word forms of a term, always including the term itself. Results are cached.
     *
     * <p>Forms come from the meta searcher's similar-term lookup (unless disabled or the term
     * is a stop word) and, if a Wiktionary is set, from its lexeme forms (lower-cased).
     *
     * @param str the term
     * @return the cached, mutable set of forms
     * @throws Exception if one of the lookups fails
     */
    public final Set<String> getForms(String str) throws Exception {
        Set<String> cached = this.mWordFormMap.get(str);
        if (cached != null) {
            return cached;
        }
        // Always collect into an own set so the searcher's result is never mutated.
        Set<String> forms = new HashSet<String>();
        if (this.mUseForms && !EUStopWord.isStopWord(str, this.mLanguage)) {
            Set<String> similar = this.mMetaSearcher.getSimiliarTerms(str, 5, this.mSimilarity);
            if (similar != null) {
                forms.addAll(similar);
            }
        }
        forms.add(str);
        if (this.mWiktionary != null) {
            // NOTE(review): the lexicon language is fixed to German regardless of mLanguage.
            Set<?> entries = this.mWiktionary.getEntrySet(Lexicon.Language.de, str);
            if (entries != null) {
                for (Object entry : entries) {
                    Iterator<?> lexemeForms = ((Lexem) entry).getForms().iterator();
                    while (lexemeForms.hasNext()) {
                        forms.add(((String) lexemeForms.next()).toLowerCase(Locale.ROOT));
                    }
                }
            }
        }
        this.mWordFormMap.put(str, forms);
        return forms;
    }

    /** Returns the live word-form cache (term -> forms). */
    public Map<String, Set<String>> getWordFormMap() {
        return this.mWordFormMap;
    }

    /** Sets the minimum frequency class a term needs to be reported by the phrase-level methods. */
    public final void setMinFC(int i) {
        this.mMinFC = i;
    }

    /** Sets the averaging type used when several values are combined into one. */
    public void setAVGTYPE(VMATH.AVGTYPE avgtype) {
        this.mAvgType = avgtype;
    }

    /** Array convenience for {@link #setContext(Collection)}; duplicates are removed. */
    public final void setContext(String[] strArr) throws Exception {
        Set<String> terms = new HashSet<String>();
        Collections.addAll(terms, strArr);
        setContext(terms);
    }

    /**
     * Builds the context query from the relevant single-word terms of the given strings.
     *
     * @throws Exception if the relevant terms cannot be determined
     */
    public void setContext(Collection<String> collection) throws Exception {
        this.mContextQuery = LUQueryFactory.getBooleanQuery(this.mField, BooleanClause.Occur.SHOULD, false, this.mTermSearcher.getRelevantSingleWordTerms(EUString.append(collection)));
        this.mSFCMap.clear();
    }

    /**
     * Returns the frequency class of a term, based on the number of its windows (that match
     * the context, if context filtering is enabled). Results are cached per lower-cased,
     * trimmed term.
     *
     * @param str the term
     * @return the frequency class, or the maximum frequency class if no window counts
     * @throws Exception if reading or searching windows fails
     */
    public final int getFreqClass4Term(String str) throws Exception {
        String key = str.toLowerCase(Locale.ROOT).trim();
        Integer cached = this.mSFCMap.get(key);
        if (cached != null) {
            return cached.intValue();
        }
        int matches = countMatchingWindows(getWindows(key));
        int freqClass = matches == 0 ? this.mMaxFreqClass : FrequencyClass.calculate(this.mMaxFrequency, matches);
        this.mSFCMap.put(key, Integer.valueOf(freqClass));
        return freqClass;
    }

    /** Counts the windows that qualify under the current context settings; empties the list when context is used. */
    private int countMatchingWindows(List<LUWindow> windows) throws Exception {
        if (windows == null || windows.isEmpty()) {
            return 0;
        }
        if (!this.mUseContext) {
            return windows.size();
        }
        int total = windows.size();
        try {
            setAdjacenceDirectory(windows);
            // The windows now live in the temporary index; drop them from the list.
            windows.clear();
            LUSearcher adjacencySearcher = new LUSearcher(this.mAdjacenceDirectory);
            ScoreDoc[] hits = adjacencySearcher.search(total, this.mContextQuery);
            if (!this.mUseMinScore) {
                return hits.length;
            }
            int accepted = 0;
            for (ScoreDoc hit : hits) {
                if (hit.score >= this.mMinScore) {
                    accepted++;
                }
            }
            return accepted;
        } finally {
            releaseAdjacenceIndex();
        }
    }

    /**
     * Collects the windows of all word forms of a term from the reference index.
     *
     * <p>NOTE(review): the loop body was empty in the original, so the method always returned
     * an empty list. The body below uses the same window iterator call as
     * {@link #getSemanticWindows2(String)} — confirm this matches the intended behavior.
     *
     * @param str the term
     * @return a new, mutable list of windows
     * @throws Exception if the forms or windows cannot be read
     */
    public final List<LUWindow> getWindows(String str) throws Exception {
        List<LUWindow> windows = new ArrayList<LUWindow>();
        for (String form : getForms(str)) {
            Iterator<LUWindow> formWindows = this.mSearcher.getWindowIterator(this.mWindowSize, LUSearcher.WINDOW.both, this.mField, form);
            while (formWindows.hasNext()) {
                windows.add(formWindows.next());
            }
        }
        return windows;
    }

    /** Returns the windows of a term that match the context query. */
    public final List<LUWindow> getSemanticWindows(String str) throws Exception {
        return getSemanticWindows(getWindows(str));
    }

    /**
     * Like {@link #getSemanticWindows(String)}, but checks each window on its own while
     * iterating instead of collecting all windows first.
     */
    public final List<LUWindow> getSemanticWindows2(String str) throws Exception {
        List<LUWindow> matching = new ArrayList<LUWindow>();
        for (String form : getForms(str)) {
            matching.addAll(getSemanticWindows3(this.mSearcher.getWindowIterator(this.mWindowSize, LUSearcher.WINDOW.both, this.mField, form)));
        }
        return matching;
    }

    /**
     * Filters the iterated windows one at a time: each window is indexed alone and kept if the
     * context query hits it (with sufficient score, if the minimum score is enabled).
     */
    public final List<LUWindow> getSemanticWindows3(Iterator<LUWindow> it) throws Exception {
        List<LUWindow> matching = new ArrayList<LUWindow>();
        while (it.hasNext()) {
            LUWindow window = it.next();
            try {
                setAdjacenceDirectory(window);
                LUSearcher adjacencySearcher = new LUSearcher(this.mAdjacenceDirectory);
                ScoreDoc[] hits = adjacencySearcher.search(1, this.mContextQuery);
                if (hits.length != 0 && (!this.mUseMinScore || hits[0].score >= this.mMinScore)) {
                    matching.add(window);
                }
            } finally {
                releaseAdjacenceIndex();
            }
        }
        return matching;
    }

    /**
     * Filters the given windows by the context query.
     *
     * @param list windows to check; {@code null} is treated as empty
     * @return the windows hit by the context query, in hit order
     * @throws Exception if indexing or searching fails
     */
    public final List<LUWindow> getSemanticWindows(List<LUWindow> list) throws Exception {
        List<LUWindow> matching = new ArrayList<LUWindow>();
        if (list == null || list.isEmpty()) {
            return matching;
        }
        // Hits are mapped back by position, so use a random-access copy instead of walking a linked list per hit.
        List<LUWindow> indexed = new ArrayList<LUWindow>(list);
        try {
            setAdjacenceDirectory(indexed);
            LUSearcher adjacencySearcher = new LUSearcher(this.mAdjacenceDirectory);
            for (ScoreDoc hit : adjacencySearcher.search(indexed.size(), this.mContextQuery)) {
                if (!this.mUseMinScore || hit.score >= this.mMinScore) {
                    // "uri" holds the window's position in the indexed collection.
                    int position = Integer.valueOf(adjacencySearcher.getValue(hit.doc, "uri")).intValue();
                    matching.add(indexed.get(position));
                }
            }
        } finally {
            releaseAdjacenceIndex();
        }
        return matching;
    }

    /** Drops the references to the temporary index so it can be garbage collected. */
    private void releaseAdjacenceIndex() {
        this.mAdjacenceWriter = null;
        this.mAdjacenceDirectory = null;
    }

    /** String convenience for {@link #getSemanticCoherency(EUPhrase, EUPhrase)}. */
    public final double getSemanticCoherency(String str, String str2) throws Exception {
        return getSemanticCoherency(str2phrase(str), str2phrase(str2));
    }

    /** Averages the co-occurrence coefficients of all term pairs of the two phrases. */
    public final double getSemanticCoherency(EUPhrase eUPhrase, EUPhrase eUPhrase2) throws Exception {
        List<Double> coefficients = new ArrayList<Double>();
        for (EUTerm left : eUPhrase.getTerms()) {
            for (EUTerm right : eUPhrase2.getTerms()) {
                coefficients.add(Double.valueOf(getCoocurencyCoefficient(left, right)));
            }
        }
        return Average.getAverage(this.mAvgType, coefficients);
    }

    /** String convenience for {@link #getSemanticCoocurencyClasses(EUPhrase, EUPhrase)}. */
    public final List<Double> getSemanticCoocurencyClasses(String str, String str2) throws Exception {
        return getSemanticCoocurencyClasses(str2phrase(str), str2phrase(str2));
    }

    /**
     * Computes one co-occurrence value per usable, combinable term pair of the two phrases.
     * Failures are logged and end the calculation; values collected so far are kept.
     *
     * @return the collected values, or a single {@code 0.0} if none could be determined
     */
    public final List<Double> getSemanticCoocurencyClasses(EUPhrase eUPhrase, EUPhrase eUPhrase2) throws Exception {
        List<Double> classes = new ArrayList<Double>();
        try {
            for (EUTerm left : eUPhrase.getTerms()) {
                if (!isUseable(left)) {
                    continue;
                }
                String leftText = EUString.append(left.getWords());
                List<LUWindow> windows = getSemanticWindows(leftText);
                for (EUTerm right : eUPhrase2.getTerms()) {
                    if (isUseable(right) && isCombineable(left, right)) {
                        classes.add(Double.valueOf(getCoocurencyCoefficient(windows, leftText, EUString.append(right.getWords()))));
                    }
                }
            }
        } catch (Exception e) {
            EULogger.warn(getClass(), "Could not determine scc! Cause: " + e);
        }
        if (classes.isEmpty()) {
            classes.add(Double.valueOf(0.0d));
        }
        return classes;
    }

    /** A pair is combinable unless the second term is one of the first term's word forms. */
    private boolean isCombineable(EUTerm eUTerm, EUTerm eUTerm2) throws Exception {
        return !getForms(EUString.append(eUTerm.getWords())).contains(EUString.append(eUTerm2.getWords()));
    }

    /** Single-word terms are unusable if they are stop words or ignored; all other terms are usable. */
    private boolean isUseable(EUTerm eUTerm) throws Exception {
        String[] words = eUTerm.getWords();
        if (words.length != 1) {
            return true;
        }
        return !EUStopWord.isStopWord(words[0], this.mLanguage) && !this.mToIgnor.contains(words[0]);
    }

    /** Term convenience for {@link #getCoocurencyCoefficient(String, String)}. */
    public final double getCoocurencyCoefficient(EUTerm eUTerm, EUTerm eUTerm2) throws Exception {
        return getCoocurencyCoefficient(EUString.append(eUTerm.getWords()), EUString.append(eUTerm2.getWords()));
    }

    /**
     * Computes the co-occurrence class of {@code str2} within the given windows of {@code str}.
     *
     * @param list windows of the first term
     * @param str  first term (used for logging only)
     * @param str2 second term; a window counts if it contains any of its word forms
     * @return the value of {@code CoocurencyClass.calculate} for the maximum frequency, the
     *         number of windows and the number of windows containing the second term
     */
    public final double getCoocurencyCoefficient(List<LUWindow> list, String str, String str2) throws Exception {
        int size = list.size();
        int hits = countWindowsContaining(list, getForms(str2));
        EULogger.info("size('" + str + "') = " + size);
        EULogger.info("freq('" + str2 + "') = " + hits);
        double coefficient = CoocurencyClass.calculate(this.mMaxFrequency, size, hits);
        EULogger.info("cc = " + coefficient);
        return coefficient;
    }

    /**
     * Relates the number of context-matching windows of {@code str} that contain a form of
     * {@code str2} to the number of all such windows.
     *
     * @return {@code hits^0.1 / windows^0.1}, or {@code 0.0} if there are no windows
     */
    public final double getCoocurencyCoefficient(String str, String str2) throws Exception {
        List<LUWindow> windows = getSemanticWindows2(str);
        int hits = countWindowsContaining(windows, getForms(str2));
        int size = windows.size();
        EULogger.info("freq(" + str2 + ") = " + hits);
        EULogger.info("size(" + str + ") = " + size);
        return size == 0 ? 0.0d : Math.pow(hits, 0.1d) / Math.pow(size, 0.1d);
    }

    /** Counts the windows that contain at least one of the given forms. */
    private static int countWindowsContaining(List<LUWindow> windows, Set<String> forms) throws Exception {
        int count = 0;
        for (LUWindow window : windows) {
            for (String form : forms) {
                if (window.contains(form)) {
                    count++;
                    break;
                }
            }
        }
        return count;
    }

    /**
     * Not implemented: computes the semantic windows of the term and discards them.
     *
     * @return an empty set (the original returned {@code null})
     */
    public final Set<LUCooccurrence> getSemanticCooccurences(String str) throws Exception {
        getSemanticWindows(str);
        return new HashSet<LUCooccurrence>();
    }

    /**
     * Returns the averaged frequency class of the terms of a string.
     *
     * @return the average, or {@code -1.0} if it could not be determined
     */
    public final double getFreqClass4Str(String str) {
        try {
            EUPhrase phrase = str2phrase(str);
            return phrase == null ? -1.0d : getFreqClass4Phrase(phrase);
        } catch (Exception e) {
            EULogger.warn(getClass(), "Could not determine fc! Cause: " + e);
            return -1.0d;
        }
    }

    /**
     * Returns the frequency classes of the terms of a string.
     *
     * @return the classes, or a list holding only {@code -1.0} if they could not be determined
     */
    public final List<Double> getFreqClasses4Str(String str) throws Exception {
        try {
            EUPhrase phrase = str2phrase(str);
            if (phrase != null) {
                return getFreqClasses4Phrase(phrase);
            }
        } catch (Exception e) {
            EULogger.warn(getClass(), "Could not determine sfc! Cause: " + e);
        }
        List<Double> failure = new ArrayList<Double>();
        failure.add(Double.valueOf(-1.0d));
        return failure;
    }

    /**
     * Turns a string into a phrase: of the sorted candidates produced by
     * {@code EUPhraser.permute}, the first whose multi-word terms are all known concepts is
     * returned; if none qualifies, the last candidate is returned.
     *
     * @throws IllegalArgumentException if no candidate phrase can be derived from the string
     * @throws Exception                if a concept lookup fails
     */
    public final EUPhrase str2phrase(String str) throws Exception {
        List<EUPhrase> candidates = EUPhraser.permute(str.toLowerCase(Locale.ROOT), PHRASE_DELIMITERS);
        if (candidates == null || candidates.isEmpty()) {
            throw new IllegalArgumentException("No phrase could be derived from '" + str + "'");
        }
        Collections.sort(candidates);
        for (EUPhrase candidate : candidates) {
            if (checkPhrase(candidate)) {
                return candidate;
            }
        }
        return candidates.get(candidates.size() - 1);
    }

    private double getFreqClass4Phrase(EUPhrase eUPhrase) throws Exception {
        return Average.getAverage(this.mAvgType, getFreqClasses4Phrase(eUPhrase));
    }

    /** Collects the frequency classes (at least {@link #mMinFC}) of all non-stop-word terms of a phrase. */
    private List<Double> getFreqClasses4Phrase(EUPhrase eUPhrase) throws Exception {
        List<Double> classes = new ArrayList<Double>();
        for (EUTerm term : eUPhrase.getTerms()) {
            String text = EUString.append(term.getWords());
            if (EUStopWord.isStopWord(text, this.mLanguage)) {
                continue;
            }
            for (String form : getForms(text)) {
                // NOTE(review): the original compared this value in an empty `if`; the result
                // is unused. The lookup is kept because it may throw — confirm it can be removed.
                this.mMetaSearcher.getFrequencyClass(form, false);
            }
            double freqClass = getFreqClass4Term(text);
            if (freqClass >= this.mMinFC) {
                classes.add(Double.valueOf(freqClass));
            }
        }
        return classes;
    }

    /** A phrase is accepted if each of its multi-word terms is a known concept. */
    private boolean checkPhrase(EUPhrase eUPhrase) throws Exception {
        for (EUTerm term : eUPhrase.getTerms()) {
            if (term.getWords().length > 1 && !isConcept(term.getWords())) {
                return false;
            }
        }
        return true;
    }

    /** Cached check whether the joined words equal the label of a wiki article. */
    private boolean isConcept(String[] strArr) throws Exception {
        String label = EUString.append(strArr);
        Boolean known = this.mConceptMap.get(label);
        if (known != null) {
            return known.booleanValue();
        }
        boolean concept = matchesArticleLabel(strArr, label);
        this.mConceptMap.put(label, Boolean.valueOf(concept));
        return concept;
    }

    /**
     * Searches article labels for any non-stop-word form of the given words and checks whether
     * a found label (lower-cased, without a trailing parenthesized part) equals a form of the
     * joined label.
     */
    private boolean matchesArticleLabel(String[] words, String label) throws Exception {
        BooleanQuery query = new BooleanQuery();
        for (String word : words) {
            for (String form : getForms(word)) {
                if (!EUStopWord.isStopWord(form, this.mLanguage)) {
                    query.add(new TermQuery(new Term("label", form)), BooleanClause.Occur.SHOULD);
                }
            }
        }
        Set<String> labelForms = getForms(label);
        for (ScoreDoc hit : this.mWikiSearcher.getTermSearcher().getIndexSearcher().search(query)) {
            String articleLabel = this.mWikiSearcher.getAricleLabelByDocID(hit.doc);
            if (articleLabel == null) {
                continue;
            }
            String title = articleLabel.toLowerCase(Locale.ROOT).trim();
            int parenthesis = title.indexOf('(');
            if (parenthesis > -1) {
                title = title.substring(0, parenthesis).trim();
            }
            if (labelForms.contains(title)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Writes the given windows into a fresh in-memory index, one document per window. Each
     * document stores the window's position in the collection (starting at 0) in field "uri".
     *
     * @throws Exception if the index cannot be written
     */
    public void setAdjacenceDirectory(Collection<LUWindow> collection) throws Exception {
        openAdjacenceWriter();
        try {
            int position = 0;
            for (LUWindow window : collection) {
                addWindowDocument(String.valueOf(position), window);
                position++;
            }
        } finally {
            // Close the writer even if adding a document fails.
            this.mAdjacenceWriter.close();
        }
    }

    /**
     * Writes a single window into a fresh in-memory index; its "uri" field is "1".
     *
     * @throws Exception if the index cannot be written
     */
    public void setAdjacenceDirectory(LUWindow lUWindow) throws Exception {
        openAdjacenceWriter();
        try {
            addWindowDocument("1", lUWindow);
        } finally {
            this.mAdjacenceWriter.close();
        }
    }

    /** Creates a new in-memory directory and an open writer on it. */
    private void openAdjacenceWriter() throws Exception {
        this.mAdjacenceDirectory = new RAMDirectory();
        this.mAdjacenceWriter = new LURAMWriter(this.mAdjacenceDirectory);
        this.mAdjacenceWriter.setLogIndex(Integer.MAX_VALUE);
        this.mAdjacenceWriter.create();
    }

    /** Adds one window document with the given "uri" value to the open writer. */
    private void addWindowDocument(String uri, LUWindow window) throws Exception {
        Document document = new Document();
        document.add(LUFieldFactory.getStoredNotAnalyzedField("uri", uri));
        document.add(LUFieldFactory.getNotStoredAnalyzedField(this.mField, EUString.append(window.getTerms())));
        this.mAdjacenceWriter.add(document);
    }

    /** Marks a single-word term as unusable for co-occurrence calculations. */
    public void ignor(String str) {
        this.mToIgnor.add(str);
    }

    /** Example run against locally configured German Wikipedia indexes. */
    public static void main(String[] strArr) throws Exception {
        String metaLocation = EULocal.getValue("meta-wikipedia-de");
        String indexLocation = EULocal.getValue("index-wikipedia-de");
        EULogger.info("init calculator...");
        SemFreqClass calculator = new SemFreqClass(NLP.LANGUAGE.de, new LUTermSearcher(indexLocation, metaLocation, NLP.LANGUAGE.de), new LUWikiSearcher(indexLocation, metaLocation, NLP.LANGUAGE.de), new String[]{"die", "der"});
        calculator.setReferenceIndex(EULocal.getValue("index-reference"));
        EULogger.info("set context...");
        calculator.setContext(EUFileReader.getLines("resource/example/medicine-context.csv"));
        calculator.ignor("äußeren");
        EULogger.info("value...");
        calculator.setWiktionary(EULocal.getValue("database-de-wiktionary"));
        EULogger.info("result: " + calculator.getSemanticCoocurencyClasses("entwicklungsstörungen", "umschriebene entwicklungsstörung der motorischen funktionen"));
    }
}
