package de.dfki.km.exact.lucene.meta;

import de.dfki.km.exact.lucene.LUSearcher;
import de.dfki.km.exact.lucene.LUTermInfo;
import de.dfki.km.exact.lucene.LUWriter;
import de.dfki.km.exact.lucene.voc.DEFAULT;
import de.dfki.km.exact.lucene.voc.FIELD;
import de.dfki.km.exact.misc.EULogger;
import de.dfki.km.exact.misc.EUString;
import de.dfki.km.exact.nlp.FrequencyClass;
import de.dfki.km.exact.nlp.NGram;
import de.dfki.km.exact.nlp.NLP;
import java.util.Collection;
import java.util.HashMap;
import java.util.Locale;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.Version;

/* loaded from: input_file:de/dfki/km/exact/lucene/meta/LUMetaWriter.class */
public final class LUMetaWriter extends LUWriter {
    private int mTermNumber;
    private int mMaxFrequency;
    private String mLabelField;
    private int mTermNumberAffix;
    private LUSearcher mSearcher;
    private String mContentField;
    private String[] mMapperTerms;
    private IndexReader mIndexReader;
    private StringBuilder mTermBuilder;
    private LUMetaTermFilter mMetaTermFilter;
    private LUMetaMapper mMetaTermMapper;
    private HashMap<String, LUTermInfo> mTermInfoMap;
    private static final Logger sLogger = Logger.getLogger(LUMetaWriter.class.getName());

    public LUMetaWriter(Directory directory, Directory directory2, String str, String str2, NLP.LANGUAGE language) throws Exception {
        super(directory2);
        this.mContentField = str;
        this.mLabelField = str2;
        this.mTermBuilder = new StringBuilder();
        this.mSearcher = new LUSearcher(directory);
        this.mTermInfoMap = new HashMap<>();
        this.mMetaTermFilter = new LUMetaTermFilter(language);
        this.mIndexReader = this.mSearcher.getIndexSearcher().getIndexReader();
        this.mMaxFrequency = 1;
        setMulitWordNumber(3);
        setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_31));
    }

    public LUMetaWriter(String str, String str2, String str3, String str4, NLP.LANGUAGE language) throws Exception {
        super(str);
        this.mLabelField = str4;
        this.mContentField = str3;
        this.mSearcher = new LUSearcher(str2);
        this.mTermBuilder = new StringBuilder();
        this.mTermInfoMap = new HashMap<>();
        this.mMetaTermFilter = new LUMetaTermFilter(language);
        this.mIndexReader = this.mSearcher.getIndexSearcher().getIndexReader();
        this.mMaxFrequency = 1;
        setMulitWordNumber(3);
        setAnalyzer(new WhitespaceAnalyzer(Version.LUCENE_31));
    }

    public final void setMulitWordNumber(int i) {
        this.mTermNumber = i;
        this.mTermNumberAffix = this.mTermNumber - 1;
        this.mMetaTermMapper = new LUMetaMapper(this.mTermNumber);
        this.mMapperTerms = this.mMetaTermMapper.getTerms();
    }

    public final void writeCommonTerm(int i, int i2, String str) {
        LUTermInfo lUTermInfo = new LUTermInfo(i, i2, str);
        lUTermInfo.setTermNumber(EUString.split(str).length);
        add(getSingleWordDoc(lUTermInfo));
    }

    public final void setMaxFrequency() {
        LUTermInfo maxSingleWordTerm = this.mSearcher.getMaxSingleWordTerm(new String[]{this.mContentField});
        this.mMaxFrequency = maxSingleWordTerm.getFrequency();
        addMaxFrequency();
        sLogger.info("MaxFrequency: " + this.mMaxFrequency + " (" + maxSingleWordTerm.getTerm() + ")");
    }

    public final void setMaxFrequency(int i) {
        this.mMaxFrequency = i;
    }

    public LUMetaTermFilter getMetaTermFilter() {
        return this.mMetaTermFilter;
    }

    public final void writeSingleWordTerms() throws Exception {
        TermEnum termEnum = this.mSearcher.getTermEnum(this.mContentField);
        while (termEnum.next()) {
            Term term = termEnum.term();
            if (!this.mContentField.equals(term.field())) {
                break;
            } else {
                add(getSingleWordDoc(this.mSearcher.getSingleWordTermInfo(term.text(), new String[]{this.mContentField})));
            }
        }
        termEnum.close();
    }

    public final void writeContentMultiWordTerms() throws Exception {
        TermEnum termEnum = this.mSearcher.getTermEnum(this.mContentField);
        while (termEnum.next()) {
            Term term = termEnum.term();
            if (!this.mContentField.equals(term.field())) {
                return;
            }
            String text = term.text();
            if (!this.mMetaTermFilter.filterOutside(text)) {
                for (LUTermInfo lUTermInfo : getMultiWordTerms(text)) {
                    if (!this.mMetaTermFilter.filter(lUTermInfo)) {
                        add(getMultiWordDoc(lUTermInfo));
                    }
                }
            }
        }
    }

    private final Collection<LUTermInfo> getMultiWordTerms(String str) {
        try {
            this.mTermInfoMap.clear();
            this.mTermBuilder = new StringBuilder(str);
            int length = str.length();
            Spans spans = new SpanTermQuery(new Term(this.mContentField, str)).getSpans(this.mIndexReader);
            while (spans.next()) {
                int doc = spans.doc();
                this.mMetaTermMapper.init(spans.start(), spans.end() + this.mTermNumberAffix);
                this.mIndexReader.getTermFreqVector(doc, this.mContentField, this.mMetaTermMapper);
                this.mTermBuilder.setLength(length);
                for (int i = 2; i <= this.mMapperTerms.length; i++) {
                    String str2 = this.mMapperTerms[i - 1];
                    if (i < this.mTermNumber) {
                        if (this.mMetaTermFilter.filter(str2)) {
                            break;
                        }
                        this.mTermBuilder.append(" ");
                        this.mTermBuilder.append(str2);
                        if (!this.mMetaTermFilter.filterOutsideSimple(str2)) {
                            setMultiWordTermInfo(this.mTermBuilder.toString(), doc, i);
                        }
                    } else {
                        if (this.mMetaTermFilter.filterOutside(str2)) {
                            break;
                        }
                        this.mTermBuilder.append(" ");
                        this.mTermBuilder.append(str2);
                        setMultiWordTermInfo(this.mTermBuilder.toString(), doc, i);
                    }
                }
            }
        } catch (Exception e) {
            sLogger.warning(e.getMessage());
        }
        return this.mTermInfoMap.values();
    }

    private final void setMultiWordTermInfo(String str, int i, int i2) {
        LUTermInfo lUTermInfo = this.mTermInfoMap.get(str);
        if (lUTermInfo != null) {
            lUTermInfo.setValues(i);
        } else {
            this.mTermInfoMap.put(str, new LUTermInfo(i2, i, str));
        }
    }

    public final void writeLabelMultiWordTerms() throws Exception {
        int numDocs = this.mIndexReader.numDocs();
        for (int i = 0; i < numDocs; i++) {
            try {
                String lowerCase = this.mIndexReader.document(i).get(this.mLabelField).toLowerCase();
                if (lowerCase != null && lowerCase.length() > 0) {
                    String[] split = EUString.split(lowerCase);
                    if (split.length >= 2 && (this.mMetaTermFilter.filterOutside(split[0]) || this.mMetaTermFilter.filterOutside(split[split.length - 1]))) {
                        LUTermInfo multiWordTermInfo = this.mSearcher.getMultiWordTermInfo(lowerCase, this.mContentField);
                        multiWordTermInfo.setTermNumber(split.length);
                        if (multiWordTermInfo != null && multiWordTermInfo.getDocFrequency() > 0 && multiWordTermInfo.getFrequency() > 0) {
                            add(getMultiWordDoc(multiWordTermInfo));
                        }
                    }
                }
            } catch (Exception e) {
                sLogger.warning(e.getMessage());
            }
        }
    }

    private final Document getSingleWordDoc(LUTermInfo lUTermInfo) {
        Document document = new Document();
        String term = lUTermInfo.getTerm();
        document.add(new Field(FIELD.TERM, term, Field.Store.YES, Field.Index.ANALYZED));
        document.add(new Field(FIELD.TERM_NA, term, Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.add(new Field(FIELD.TERM_NUMBER, "1", Field.Store.NO, Field.Index.NOT_ANALYZED));
        document.add(new Field(FIELD.TRIGRAM_E, EUString.append(NGram.getNGrams(3, true, term)), Field.Store.NO, Field.Index.ANALYZED));
        document.add(new Field(FIELD.FREQUENCY, String.valueOf(lUTermInfo.getFrequency()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.add(new Field(FIELD.DOC_FREQUENCY, String.valueOf(lUTermInfo.getDocFrequency()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.add(new Field(FIELD.FREQUENCY_CLASS, String.valueOf(FrequencyClass.calculate(this.mMaxFrequency, lUTermInfo.getFrequency())), Field.Store.YES, Field.Index.NOT_ANALYZED));
        return document;
    }

    private final Document getMultiWordDoc(LUTermInfo lUTermInfo) {
        Document document = new Document();
        String term = lUTermInfo.getTerm();
        document.add(new Field(FIELD.TERM, term, Field.Store.YES, Field.Index.ANALYZED));
        document.add(new Field(FIELD.TERM_NA, term, Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.add(new Field(FIELD.TERM_NUMBER, String.valueOf(lUTermInfo.getTermNumber()), Field.Store.NO, Field.Index.NOT_ANALYZED));
        document.add(new Field(FIELD.FREQUENCY, String.valueOf(lUTermInfo.getFrequency()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.add(new Field(FIELD.DOC_FREQUENCY, String.valueOf(lUTermInfo.getDocFrequency()), Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.add(new Field(FIELD.FREQUENCY_CLASS, String.valueOf(FrequencyClass.calculate(this.mMaxFrequency, lUTermInfo.getFrequency())), Field.Store.YES, Field.Index.NOT_ANALYZED));
        return document;
    }

    protected void addMaxFrequency() {
        Document document = new Document();
        document.add(new Field(FIELD.MAX_FREQUENT_TERM, DEFAULT.MAX_FREQUENT_TERM, Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.add(new Field(FIELD.MAX_FREQUENCY, String.valueOf(this.mMaxFrequency), Field.Store.YES, Field.Index.NOT_ANALYZED));
        add(document);
    }

    public static void main(String[] strArr) throws Exception {
        LUMetaWriter lUMetaWriter = new LUMetaWriter(strArr[0], strArr[1], strArr[2], strArr[3], NLP.LANGUAGE.valueOf(strArr[4]));
        lUMetaWriter.create();
        lUMetaWriter.setLogIndex(100000);
        lUMetaWriter.setCommitIndex(10000);
        EULogger.info("set max frequency...");
        lUMetaWriter.setMaxFrequency();
        EULogger.info("write single word terms...");
        lUMetaWriter.writeSingleWordTerms();
        EULogger.info("write content multi word terms...");
        lUMetaWriter.writeContentMultiWordTerms();
        EULogger.info("write label multi word terms...");
        lUMetaWriter.writeLabelMultiWordTerms();
        lUMetaWriter.close();
    }
}
