package de.dfki.km.leech.util;

import de.dfki.km.leech.Leech;
import de.dfki.km.leech.lucene.FieldConfig;
import de.dfki.km.leech.lucene.ToLuceneContentHandler;
import de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser;
import de.dfki.km.leech.sax.CrawlReportContentHandler;
import de.dfki.km.leech.sax.PrintlnContentHandler;
import java.io.File;
import java.util.HashMap;
import java.util.LinkedList;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;

/* loaded from: input_file:de/dfki/km/leech/util/LuceneIndexCreator.class */
/**
 * Command line entry point that crawls a file or directory with {@link Leech} using the
 * Wikipedia dump parser configuration, writes the result into a Lucene index and optionally
 * runs an {@link IndexPostprocessor} (buzzword generation and/or page count estimation) on it.
 *
 * <p>If only one positional path is given, it is taken as the index path and only the
 * postprocessing step is performed.
 */
public class LuceneIndexCreator {
    private static final Logger LOG = Logger.getLogger(LuceneIndexCreator.class.getName());

    private static final String BUZZWORD_ATT_NAME_PREFIX = "-buzzwordAttName=";

    private static final String BUZZWORD_COUNT_PREFIX = "-buzzwordCount=";

    private static final String USAGE =
            "Usage: LuceneIndexCreator [-noPageRedirects] [-noParseGeoCoordinates] [-parseInfoBoxes] [-parseLinksAndCategories]\n"
            + " [-<staticAttName>=<staticAttValue>] [-buzzwordAttName=<attName>] [-buzzwordCount=<count>] [-calculatePageCounts]\n"
            + "[-li <readonlyLookupIndexPath>] <fileOrDir2CrawlPath> <targetLuceneIndexPath>\n\n"
            + "Comments: - you can specify several static attribute value pairs.\n"
            + "- if you leave <fileOrDir2CrawlPath>, only postprocessing will be performed.\n"
            + "- you can add several lookup indices (-li).";

    /**
     * Parses the command line, optionally crawls the given source into the target index and
     * optionally postprocesses the index.
     *
     * @param strArr the command line arguments; an empty array or a leading {@code -?},
     *               {@code -h} or {@code --help} prints the usage text and returns
     * @throws IllegalArgumentException if {@code -li} is not followed by a path, or if no
     *                                  positional path argument was given
     * @throws NumberFormatException    if the value of {@code -buzzwordCount=} is not an integer
     * @throws Exception                anything raised by crawling, indexing or postprocessing
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length == 0 || isHelpFlag(strArr[0])) {
            System.out.println(USAGE);
            System.out.println();
            return;
        }

        String crawlPath = null;
        String indexPath = null;
        String buzzwordAttName = null;
        int buzzwordCount = 7;
        boolean calculatePageCounts = false;
        LinkedList<String> lookupIndexPaths = new LinkedList<>();
        ParseContext parseContext = new ParseContext();
        WikipediaDumpParser.WikipediaDumpParserConfig parserConfig = new WikipediaDumpParser.WikipediaDumpParserConfig()
                .setDeterminePageRedirects(true)
                .setParseGeoCoordinates(true)
                .setParseInfoBoxes(false)
                .setParseLinksAndCategories(false);
        parseContext.set(WikipediaDumpParser.WikipediaDumpParserConfig.class, parserConfig);
        MultiValueHashMap<String, String> staticAttributes = new MultiValueHashMap<>();

        for (int i = 0; i < strArr.length; i++) {
            String arg = strArr[i];
            if (arg.equals("-noPageRedirects")) {
                parserConfig.setDeterminePageRedirects(false);
            } else if (arg.equals("-noParseGeoCoordinates")) {
                parserConfig.setParseGeoCoordinates(false);
            } else if (arg.equals("-parseInfoBoxes")) {
                parserConfig.setParseInfoBoxes(true);
            } else if (arg.equals("-parseLinksAndCategories")) {
                // Previously this switched on info box parsing instead of link/category parsing.
                parserConfig.setParseLinksAndCategories(true);
            } else if (arg.startsWith(BUZZWORD_ATT_NAME_PREFIX)) {
                buzzwordAttName = arg.substring(BUZZWORD_ATT_NAME_PREFIX.length()).trim();
            } else if (arg.startsWith(BUZZWORD_COUNT_PREFIX)) {
                buzzwordCount = Integer.parseInt(arg.substring(BUZZWORD_COUNT_PREFIX.length()).trim());
            } else if (arg.startsWith("-calculatePageCounts")) {
                calculatePageCounts = true;
            } else if (arg.equals("-li")) {
                // Exact match, so static attributes such as -license=x are not mistaken for -li.
                if (i + 1 >= strArr.length) {
                    throw new IllegalArgumentException("Option -li requires a lookup index path");
                }
                i++;
                lookupIndexPaths.add(strArr[i]);
            } else if (arg.startsWith("-")) {
                // Static attribute: split at the first '=' only, so values may contain '='
                // and an empty value ("-name=") no longer causes an index error.
                int separator = arg.indexOf('=');
                if (separator > 1) {
                    staticAttributes.add(arg.substring(1, separator), arg.substring(separator + 1));
                } else {
                    LOG.warning("Ignoring unrecognized option " + arg);
                }
            } else if (crawlPath == null) {
                crawlPath = arg;
            } else {
                indexPath = arg;
            }
        }

        if (crawlPath == null) {
            throw new IllegalArgumentException("No <targetLuceneIndexPath> given. Use --help for usage.");
        }

        if (indexPath == null) {
            // A single positional argument is the index itself: postprocessing only.
            indexPath = crawlPath;
            LOG.info("Will perform only postprocessing (buzzwords and/or calculated page counts, as configured) on " + indexPath);
        } else {
            crawl(crawlPath, indexPath, staticAttributes, parseContext);
        }

        boolean generateBuzzwords = !StringUtils.nullOrWhitespace(buzzwordAttName);
        if (generateBuzzwords || calculatePageCounts) {
            IndexPostprocessor postprocessor = new IndexPostprocessor();
            // Enable each step only when its own option was given.
            if (generateBuzzwords) {
                postprocessor.enableBuzzwordGeneration(buzzwordAttName, buzzwordCount, true);
            }
            if (calculatePageCounts) {
                postprocessor.enablePageCountEstimation();
            }
            postprocessor.postprocessIndex(indexPath, WikipediaDumpParser.getFieldConfig4ParserAttributes(), lookupIndexPaths.toArray(new String[0]));
        }
    }

    /** Returns whether the given argument is one of the supported help switches. */
    private static boolean isHelpFlag(String arg) {
        return arg.equals("-?") || arg.equals("-h") || arg.equals("--help");
    }

    /**
     * Crawls {@code crawlPath} into the Lucene index at {@code indexPath}, then commits and
     * merges the index down to a single segment.
     */
    private static void crawl(String crawlPath, String indexPath, MultiValueHashMap<String, String> staticAttributes, ParseContext parseContext) throws Exception {
        LOG.info("Crawling " + crawlPath);
        if (staticAttributes.keySize() > 0) {
            LOG.info("Will add static attribute value pairs to each document: " + staticAttributes);
        }
        Leech leech = new Leech();
        long startTime = StopWatch.startAndLogTime(Level.INFO);
        SimpleFSDirectory directory = new SimpleFSDirectory(new File(indexPath));
        FieldConfig fieldConfig = WikipediaDumpParser.getFieldConfig4ParserAttributes();
        IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_CURRENT, fieldConfig.createAnalyzer());
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        IndexWriter indexWriter = new IndexWriter(directory, writerConfig);
        try {
            leech.parse(
                    new File(crawlPath),
                    (ContentHandler) new CrawlReportContentHandler(
                            new PrintlnContentHandler(
                                    PrintlnContentHandler.Verbosity.all,
                                    new ToLuceneContentHandler(fieldConfig, indexWriter)
                                            .setIgnoreAllDocsWithout(new HashMap())
                                            .setStaticAttributeValuePairs(staticAttributes))
                                    .setShowOnlyErrors(true))
                            .setCyclicReportPrintln(7000L),
                    parseContext);
            LOG.info("Will commit and merge");
            indexWriter.commit();
            indexWriter.forceMerge(1, true);
        } finally {
            // Close the writer even when crawling, commit or merge fails.
            indexWriter.close(true);
        }
        StopWatch.stopAndLogDistance(startTime, Level.INFO);
        LOG.info("..finished crawling " + crawlPath);
    }
}
