package de.dfki.km.leech.util;

import de.dfki.km.leech.lucene.Buzzwords;
import de.dfki.km.leech.lucene.FieldConfig;
import de.dfki.km.leech.lucene.PageCountEstimator;
import de.dfki.km.leech.lucene.ToLuceneContentHandler;
import de.dfki.km.leech.metadata.LeechMetadata;
import java.io.File;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermDocs;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;

/* loaded from: input_file:de/dfki/km/leech/util/IndexPostprocessor.class */
/**
 * Rewrites an existing Lucene index into a fresh one, optionally enriching every document with
 * buzzwords and/or heuristic page counts on the way, and then replaces the original index
 * directory with the rewritten one.
 *
 * <p>Nothing is enabled by default; call {@link #enableBuzzwordGeneration(String, int, boolean)}
 * and/or {@link #enablePageCountEstimation()} before {@link #postprocessIndex(String, FieldConfig)}.
 */
public class IndexPostprocessor {
    /** Suffix of the temporary directory the rewritten index is created in. */
    private static final String TEMP_INDEX_SUFFIX = "_4PostProcessing";

    /** Suffix of the directory the original index is parked in while the new one is moved into place. */
    private static final String BACKUP_INDEX_SUFFIX = "_bak";

    /** Progress is logged whenever the number of already processed documents is a multiple of this. */
    private static final int PROGRESS_LOG_INTERVAL = 100000;

    /** Whether heuristic page counts are added to the documents. */
    protected boolean m_bEstimatePageCounts = false;

    /** Handed to {@code Buzzwords.addBuzzwords} unchanged. */
    protected boolean m_bSkipSimilarTerms;

    /** Handed to {@code Buzzwords.addBuzzwords} unchanged. */
    protected int m_iMaxNumberOfBuzzwords;

    /** Name of the field receiving the buzzwords; buzzword generation is off while this is null or blank. */
    protected String m_strNewField4Buzzwords;

    /**
     * Collects term texts of one field, starting at the position returned by
     * {@code indexReader.terms(new Term(str, str2))}.
     *
     * <p>Collection stops as soon as the enumeration is exhausted, leaves the field {@code str},
     * reaches a term that does not start with {@code str2}, or {@code i} terms were collected.
     *
     * @param str the field whose terms are listed
     * @param str2 the prefix every returned term text starts with (empty string for all terms)
     * @param i the maximum number of terms to return
     * @param indexReader the reader to enumerate the terms from
     * @return the matching term texts in enumeration order, never {@code null}
     * @throws IOException if reading the index fails
     * @throws URISyntaxException never thrown by this implementation; kept for signature compatibility
     */
    protected static List<String> terms(String str, String str2, int i, IndexReader indexReader) throws IOException, URISyntaxException {
        List<String> result = new LinkedList<String>();
        TermEnum termEnum = indexReader.terms(new Term(str, str2));
        try {
            for (int collected = 0; collected < i; collected++) {
                Term current = termEnum.term();
                // Leaving the field (or running out of terms) ends the listing. Without this
                // break the loop would keep spinning on the same term until the limit is hit.
                if (current == null || !current.field().equals(str)) {
                    break;
                }
                String text = current.text();
                if (!text.startsWith(str2)) {
                    break;
                }
                result.add(text);
                if (!termEnum.next()) {
                    break;
                }
            }
            return result;
        } finally {
            if (termEnum != null) {
                termEnum.close();
            }
        }
    }

    /**
     * Enables buzzword generation for subsequent {@link #postprocessIndex(String, FieldConfig)} calls.
     *
     * @param str name of the field the buzzwords are written to; null or blank leaves the feature off
     * @param i maximum number of buzzwords, passed on to {@code Buzzwords.addBuzzwords}
     * @param z skip-similar-terms flag, passed on to {@code Buzzwords.addBuzzwords}
     */
    public void enableBuzzwordGeneration(String str, int i, boolean z) {
        this.m_strNewField4Buzzwords = str;
        this.m_iMaxNumberOfBuzzwords = i;
        this.m_bSkipSimilarTerms = z;
    }

    /** Enables heuristic page count estimation for subsequent {@link #postprocessIndex(String, FieldConfig)} calls. */
    public void enablePageCountEstimation() {
        this.m_bEstimatePageCounts = true;
    }

    /**
     * Rewrites the index at {@code str} with the enabled enrichments applied to every document.
     *
     * <p>The new index is built in a sibling directory with the suffix {@code _4PostProcessing},
     * merged down to one segment, and then swapped with the original directory. If no enrichment
     * is enabled, a warning is logged and the index is left untouched.
     *
     * @param str path of the Lucene index directory to postprocess
     * @param fieldConfig supplies the analyzer for the new index and configures the document handler
     * @throws IOException if the index cannot be read or written, or the directories cannot be swapped
     * @throws Exception anything raised by the enrichment or document handling steps
     */
    public void postprocessIndex(String str, FieldConfig fieldConfig) throws Exception {
        boolean createBuzzwords = !StringUtils.nullOrWhitespace(this.m_strNewField4Buzzwords);
        if (!createBuzzwords && !this.m_bEstimatePageCounts) {
            Logger.getLogger(IndexPostprocessor.class.getName()).warning("Will do nothing - nothing is enabled.");
            // Honour the message: copying the whole index without changing anything is pointless.
            return;
        }

        // NOTE(review): progress messages go to the LuceneIndexCreator logger, exactly as before.
        // That looks like a copy/paste leftover - confirm before switching to this class' logger.
        Logger log = Logger.getLogger(LuceneIndexCreator.class.getName());
        if (createBuzzwords) {
            log.info("Index postprocessing: Will create buzzwords");
        }
        if (this.m_bEstimatePageCounts) {
            log.info("Index postprocessing: Will calculate heuristic page counts");
        }

        long startTime = System.currentTimeMillis();
        File indexDir = new File(str);
        File newIndexDir = new File(indexDir.getAbsolutePath() + TEMP_INDEX_SUFFIX);

        IndexReader reader = IndexReader.open(new SimpleFSDirectory(indexDir));
        try {
            IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_CURRENT, fieldConfig.createAnalyzer());
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
            IndexWriter writer = new IndexWriter(new SimpleFSDirectory(newIndexDir), writerConfig);
            try {
                copyDocuments(reader, new ToLuceneContentHandler(fieldConfig, writer), createBuzzwords, log);
                writer.forceMerge(1, true);
            } finally {
                // Always release the writer, also when processing failed half way.
                writer.close(true);
            }
        } finally {
            reader.close();
        }

        swapInNewIndex(indexDir, newIndexDir);
        log.info("...postprocessing finished. Needed " + StopWatch.formatTimeDistance(System.currentTimeMillis() - startTime));
    }

    /** Feeds every document reachable through an id term, enriched as configured, into the handler. */
    private void copyDocuments(IndexReader reader, ToLuceneContentHandler handler, boolean createBuzzwords, Logger log) throws Exception {
        log.info("Will get the doc ids...");
        List<String> docIds = terms(LeechMetadata.id, "", Integer.MAX_VALUE, reader);
        log.info("...finished");

        HashSet<String> buzzwordSourceFields = new HashSet<String>();
        buzzwordSourceFields.add(LeechMetadata.body);
        buzzwordSourceFields.add("title");

        int processed = 0;
        for (String docId : docIds) {
            int docNumber = firstDocNumber(reader, new Term(LeechMetadata.id, docId));
            if (docNumber < 0) {
                // The term exists but has no posting to read a document number from; skip it.
                continue;
            }
            Document document = reader.document(docNumber);
            if (createBuzzwords) {
                Buzzwords.addBuzzwords(docNumber, document, this.m_strNewField4Buzzwords, buzzwordSourceFields,
                        this.m_iMaxNumberOfBuzzwords, this.m_bSkipSimilarTerms, reader);
            }
            if (this.m_bEstimatePageCounts) {
                PageCountEstimator.addHeuristicDocPageCounts(docNumber, document, reader);
            }
            handler.processNewDocument(document);
            // The check uses the count before this document, so the first message appears right away.
            if (processed++ % PROGRESS_LOG_INTERVAL == 0) {
                log.info(processed + " docs postprocessed");
            }
        }
        handler.crawlFinished();
    }

    /** Returns the first document number listed for the term, or -1 if the term has no posting. */
    private static int firstDocNumber(IndexReader reader, Term term) throws IOException {
        TermDocs termDocs = reader.termDocs(term);
        try {
            return termDocs.next() ? termDocs.doc() : -1;
        } finally {
            termDocs.close();
        }
    }

    /** Moves the new index into the place of the old one; the old index is only deleted once that worked. */
    private static void swapInNewIndex(File indexDir, File newIndexDir) throws Exception {
        File backupDir = new File(indexDir.getAbsolutePath() + BACKUP_INDEX_SUFFIX);
        if (!indexDir.renameTo(backupDir)) {
            throw new IOException("Could not move index '" + indexDir + "' to '" + backupDir
                    + "'; the postprocessed index remains at '" + newIndexDir + "'");
        }
        if (!newIndexDir.renameTo(indexDir)) {
            // Try to put the original back so the caller is not left without an index.
            boolean restored = backupDir.renameTo(indexDir);
            throw new IOException("Could not move postprocessed index '" + newIndexDir + "' to '" + indexDir + "'; "
                    + (restored ? "the original index was restored" : "the original index remains at '" + backupDir + "'"));
        }
        FileUtils.deleteDirectory(backupDir);
    }
}
