package de.dfki.km.leech.util;

import de.dfki.inquisition.collections.MultiValueHashMap;
import de.dfki.inquisition.processes.StopWatch;
import de.dfki.inquisition.text.StringUtils;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.config.CrawlerContext;
import de.dfki.km.leech.lucene.LeechDefaultFieldConfig;
import de.dfki.km.leech.lucene.ToLuceneContentHandler;
import de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser;
import de.dfki.km.leech.sax.CrawlReportContentHandler;
import de.dfki.km.leech.sax.PrintlnContentHandler;
import java.io.File;
import java.io.IOException;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/* loaded from: input_file:de/dfki/km/leech/util/LuceneIndexCreator.class */
/**
 * Crawls a set of sources with {@link Leech} into a Lucene index and optionally runs an
 * {@link IndexPostprocessor} over the result. Usable both as a library entry point
 * ({@link #createIndex}) and as a command line tool ({@link #main}).
 */
public class LuceneIndexCreator {
    /**
     * When {@code true}, the crawl is wrapped in a {@link PrintlnContentHandler} configured with
     * {@code Verbosity.nothing} and {@code setShowOnlyErrors(true)}; otherwise no println handler is used.
     */
    public static boolean printErrors = true;

    /** Value handed to {@code CrawlReportContentHandler.setCyclicReportPrintln} (presumably milliseconds - confirm). */
    public static long cyclicReportTime = 60000;

    private static final Logger LOG = Logger.getLogger(LuceneIndexCreator.class.getName());

    /**
     * Same as the nine-argument overload with a {@code null} parse context, i.e. a fresh
     * {@link ParseContext} is used.
     */
    public static void createIndex(List<String> list, String str, LinkedList<String> linkedList, String str2, int i, boolean z, String str3, MultiValueHashMap<String, String> multiValueHashMap) throws IOException, Exception, SAXException, TikaException {
        createIndex(list, str, linkedList, str2, i, z, str3, multiValueHashMap, null);
    }

    /**
     * Crawls the given sources into the target index and afterwards runs the configured postprocessing steps.
     *
     * @param list the sources to crawl. If {@code str} is {@code null}, the first entry is instead taken as the
     *            path of an existing index that should only be postprocessed
     * @param str the target index path, or {@code null} to skip crawling and only postprocess
     * @param linkedList lookup index paths handed to the postprocessor; {@code null} is treated as empty
     * @param str2 the buzzword attribute name; buzzword generation is skipped if null or whitespace
     * @param i the buzzword count handed to the postprocessor
     * @param z whether page count estimation is enabled
     * @param str3 the frequency class attribute name; the step is skipped if null or whitespace
     * @param multiValueHashMap static attribute value pairs; {@code null} is treated as empty. When crawling they
     *            are given to the Lucene content handler, in postprocessing-only mode to the postprocessor
     * @param parseContext the parse context for the crawl; {@code null} creates a fresh one
     * @throws IllegalArgumentException if {@code str} is {@code null} and {@code list} is null or empty
     */
    public static void createIndex(List<String> list, String str, LinkedList<String> linkedList, String str2, int i, boolean z, String str3, MultiValueHashMap<String, String> multiValueHashMap, ParseContext parseContext) throws IOException, Exception, SAXException, TikaException {
        if (parseContext == null) {
            parseContext = new ParseContext();
        }
        if (linkedList == null) {
            linkedList = new LinkedList<>();
        }
        if (multiValueHashMap == null) {
            multiValueHashMap = new MultiValueHashMap<>();
        }

        boolean postprocessingOnly = str == null;
        if (postprocessingOnly) {
            // Fail with a clear message instead of a bare NoSuchElementException from the iterator.
            if (list == null || list.isEmpty()) {
                throw new IllegalArgumentException("Neither a target index path nor an index path for postprocessing was specified");
            }
            str = list.iterator().next();
            LOG.info("Will perform only postprocessing (buzzwords and/or calculated page counts, as configured) on " + str);
        } else {
            crawlIntoIndex(list, str, multiValueHashMap, parseContext);
        }

        IndexPostprocessor indexPostprocessor = new IndexPostprocessor();
        boolean anyStepEnabled = false;
        if (!StringUtils.nullOrWhitespace(str2)) {
            indexPostprocessor.enableBuzzwordGeneration(str2, i, true);
            anyStepEnabled = true;
        }
        if (z) {
            indexPostprocessor.enablePageCountEstimation();
            anyStepEnabled = true;
        }
        if (!StringUtils.nullOrWhitespace(str3)) {
            indexPostprocessor.enableFrequencyClassCalculation(str3);
            anyStepEnabled = true;
        }
        // When crawling, the static pairs were already handed to the content handler, so the postprocessor
        // only receives them in postprocessing-only mode.
        if (postprocessingOnly && multiValueHashMap.keySize() > 0) {
            Metadata metadata = new Metadata();
            for (Map.Entry entry : multiValueHashMap.entryList()) {
                metadata.add((String) entry.getKey(), (String) entry.getValue());
            }
            indexPostprocessor.enableStaticAttributeValuePairs(metadata);
            anyStepEnabled = true;
        }
        if (anyStepEnabled) {
            indexPostprocessor.postprocessIndex(str, new LeechDefaultFieldConfig(), linkedList.toArray(new String[0]));
        }
    }

    /**
     * Crawls {@code sources} into the index at {@code indexPath}, then commits and force-merges to one segment.
     * The writer and directory are closed even if the crawl fails, so the index write lock is released.
     */
    private static void crawlIntoIndex(List<String> sources, String indexPath, MultiValueHashMap<String, String> staticAttributes, ParseContext parseContext) throws Exception {
        LOG.info("Crawling " + sources);
        if (staticAttributes.keySize() > 0) {
            LOG.info("Will add static attribute value pairs to each document: " + staticAttributes);
        }
        Leech leech = new Leech();
        long startTime = StopWatch.startAndLogTime(Level.INFO);
        LeechDefaultFieldConfig fieldConfig = new LeechDefaultFieldConfig();
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_CURRENT, fieldConfig.createAnalyzer());
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);

        // try-with-resources closes the writer first, then the directory, also on the failure path.
        // NOTE(review): IndexWriter.close() is expected to behave like the former close(true) in this Lucene line - confirm.
        try (SimpleFSDirectory directory = new SimpleFSDirectory(new File(indexPath));
                IndexWriter indexWriter = new IndexWriter(directory, writerConfig)) {
            ToLuceneContentHandler toLuceneHandler = new ToLuceneContentHandler(fieldConfig, indexWriter).setIgnoreAllDocsWithout(new HashMap()).setStaticAttributeValuePairs(staticAttributes);
            CrawlReportContentHandler reportHandler = printErrors
                    ? new CrawlReportContentHandler(new PrintlnContentHandler(PrintlnContentHandler.Verbosity.nothing, toLuceneHandler).setShowOnlyErrors(true))
                    : new CrawlReportContentHandler(toLuceneHandler);
            leech.parse(sources.toArray(new String[0]), (ContentHandler) reportHandler.setCyclicReportPrintln(cyclicReportTime), parseContext);

            LOG.info("Will commit and merge");
            indexWriter.commit();
            indexWriter.forceMerge(1, true);
        }
        StopWatch.stopAndLogDistance(startTime, Level.INFO);
        LOG.info("..finished crawling " + sources);
    }

    /**
     * Command line entry point. Prints the usage text when called without arguments or with
     * {@code -?}, {@code -h} or {@code --help}; otherwise parses the options and calls {@link #createIndex}.
     *
     * @param strArr the command line arguments
     * @throws IllegalArgumentException if {@code -li} is given without a following path
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length == 0 || strArr[0].equals("-?") || strArr[0].equals("-h") || strArr[0].equals("--help")) {
            System.out.println("Usage: LuceneIndexCreator [-noPageRedirects] [-noParseGeoCoordinates] [-parseInfoBoxes] [-parseLinksAndCategories]\n [-<staticAttName>=<staticAttValue>] [-buzzwordAttName=<attName>] [-buzzwordCount=<count>] [-calculatePageCounts]\n[-frequencyClassAttName=<attName>] [-li <readonlyLookupIndexPath>] [-crawlingDepth=<depth>] <fileOrDir2CrawlPath1> .. <fileOrDir2CrawlPathN> <targetLuceneIndexPath>\n\nComments: - you can specify several static attribute value pairs.\n- if you leave <fileOrDir2CrawlPath>, only postprocessing will be performed.\n- you can add several lookup indices (-li).\n- if you leave the buzzword attName or the frequency class attName, these processing steps will be skiped.");
            System.out.println();
            return;
        }

        LinkedList<String> crawlPaths = new LinkedList<>();
        LinkedList<String> lookupIndexPaths = new LinkedList<>();
        MultiValueHashMap<String, String> staticAttributes = new MultiValueHashMap<>();
        String targetIndexPath = null;
        String buzzwordAttName = null;
        String frequencyClassAttName = null;
        int buzzwordCount = 7;
        boolean calculatePageCounts = false;
        // null means "option not given": the crawler then keeps its own default depth.
        Integer crawlingDepth = null;

        ParseContext parseContext = new ParseContext();
        WikipediaDumpParser.WikipediaDumpParserConfig wikipediaConfig = new WikipediaDumpParser.WikipediaDumpParserConfig().setDeterminePageRedirects(true).setParseGeoCoordinates(true).setParseInfoBoxes(false).setParseLinksAndCategories(false);
        parseContext.set(WikipediaDumpParser.WikipediaDumpParserConfig.class, wikipediaConfig);

        for (int idx = 0; idx < strArr.length; idx++) {
            String arg = strArr[idx];
            if (arg.equals("-noPageRedirects")) {
                wikipediaConfig.setDeterminePageRedirects(false);
            } else if (arg.equals("-noParseGeoCoordinates")) {
                wikipediaConfig.setParseGeoCoordinates(false);
            } else if (arg.equals("-parseInfoBoxes")) {
                wikipediaConfig.setParseInfoBoxes(true);
            } else if (arg.equals("-parseLinksAndCategories")) {
                // Previously this switched on info box parsing by mistake.
                wikipediaConfig.setParseLinksAndCategories(true);
            } else if (arg.startsWith("-buzzwordAttName=")) {
                buzzwordAttName = arg.substring("-buzzwordAttName=".length()).trim();
            } else if (arg.startsWith("-buzzwordCount=")) {
                buzzwordCount = Integer.parseInt(arg.substring("-buzzwordCount=".length()).trim());
            } else if (arg.startsWith("-crawlingDepth=")) {
                crawlingDepth = Integer.valueOf(arg.substring("-crawlingDepth=".length()).trim());
            } else if (arg.startsWith("-frequencyClassAttName=")) {
                frequencyClassAttName = arg.substring("-frequencyClassAttName=".length()).trim();
            } else if (arg.startsWith("-calculatePageCounts")) {
                calculatePageCounts = true;
            } else if (arg.equals("-li")) {
                // Exact match only, so static attributes such as -license=x do not swallow the next argument.
                if (idx + 1 >= strArr.length) {
                    throw new IllegalArgumentException("Option -li requires a <readonlyLookupIndexPath> argument");
                }
                idx++;
                lookupIndexPaths.add(strArr[idx]);
            } else if (arg.startsWith("-")) {
                String pair = arg.substring(1);
                if (pair.contains("=")) {
                    // Limit 2 keeps '=' inside the value and yields an empty value for "-name=".
                    String[] nameAndValue = pair.split("=", 2);
                    staticAttributes.add(nameAndValue[0], nameAndValue[1]);
                }
            } else if (crawlPaths.isEmpty() || idx != strArr.length - 1) {
                crawlPaths.add(arg);
            } else {
                // The very last argument is the target index, provided at least one crawl path precedes it.
                targetIndexPath = arg;
            }
        }

        if (crawlingDepth != null) {
            // Register the context in the ParseContext, like the Wikipedia parser config above; the former code
            // created the CrawlerContext and discarded it, so the option had no effect.
            CrawlerContext crawlerContext = new CrawlerContext();
            crawlerContext.setCrawlingDepth(crawlingDepth.intValue());
            parseContext.set(CrawlerContext.class, crawlerContext);
        }
        createIndex(crawlPaths, targetIndexPath, lookupIndexPaths, buzzwordAttName, buzzwordCount, calculatePageCounts, frequencyClassAttName, staticAttributes, parseContext);
    }
}
