package de.dfki.km.leech.util;

import de.dfki.km.leech.Leech;
import de.dfki.km.leech.lucene.FieldConfig;
import de.dfki.km.leech.lucene.ToLuceneContentHandler;
import de.dfki.km.leech.parser.wikipedia.WikipediaDumpParser;
import de.dfki.km.leech.sax.CrawlReportContentHandler;
import de.dfki.km.leech.sax.PrintlnContentHandler;
import java.io.File;
import java.util.HashMap;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;

/* loaded from: input_file:de/dfki/km/leech/util/LuceneIndexCreator.class */
/**
 * Command line tool that crawls a file or directory with {@link Leech} using the
 * {@link WikipediaDumpParser} configuration and, when a target path is supplied, writes the crawled
 * documents into a newly created Lucene index.
 *
 * <p>Supported options (all optional, may appear anywhere on the command line):
 * <ul>
 *   <li>{@code -noPageRedirects} - switches off page redirect determination (on by default)</li>
 *   <li>{@code -noParseGeoCoordinates} - switches off geo coordinate parsing (on by default)</li>
 *   <li>{@code -parseInfoBoxes} - switches on info box parsing (off by default)</li>
 *   <li>{@code -parseLinksAndCategories} - switches on link and category parsing (off by default)</li>
 * </ul>
 * The first non-option argument is the path to crawl, a later one the target index path. If no
 * target index path is given, the crawl runs without writing an index.
 */
public class LuceneIndexCreator {

    private static final String USAGE =
            "Usage: LuceneIndexCreator [-noPageRedirects] [-noParseGeoCoordinates] [-parseInfoBoxes] [-parseLinksAndCategories] <fileOrDir2CrawlPath> <targetLuceneIndexPath>";

    /**
     * Parses the command line, runs the crawl and finalizes the index.
     *
     * @param strArr the command line arguments, see the class documentation
     * @throws Exception if crawling or index writing fails
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length == 0 || isHelpFlag(strArr[0])) {
            printUsage();
            return;
        }

        String sourcePath = null;
        String indexPath = null;

        WikipediaDumpParser.WikipediaDumpParserConfig parserConfig = new WikipediaDumpParser.WikipediaDumpParserConfig()
                .setDeterminePageRedirects(true)
                .setParseGeoCoordinates(true)
                .setParseInfoBoxes(false)
                .setParseLinksAndCategories(false);
        ParseContext parseContext = new ParseContext();
        parseContext.set(WikipediaDumpParser.WikipediaDumpParserConfig.class, parserConfig);

        for (String arg : strArr) {
            if (arg.equals("-noPageRedirects")) {
                parserConfig.setDeterminePageRedirects(false);
            } else if (arg.equals("-noParseGeoCoordinates")) {
                parserConfig.setParseGeoCoordinates(false);
            } else if (arg.equals("-parseInfoBoxes")) {
                parserConfig.setParseInfoBoxes(true);
            } else if (arg.equals("-parseLinksAndCategories")) {
                // Previously this branch enabled info box parsing by mistake.
                parserConfig.setParseLinksAndCategories(true);
            } else if (sourcePath == null) {
                sourcePath = arg;
            } else {
                // Any further positional argument overrides the target index path.
                indexPath = arg;
            }
        }

        if (sourcePath == null) {
            // Only option flags were given: there is nothing to crawl.
            printUsage();
            return;
        }

        System.out.println("Crawling " + sourcePath);
        File sourceFile = new File(sourcePath);
        Leech leech = new Leech();
        long startTime = StopWatch.stopAndPrintTime();

        IndexWriter indexWriter = null;
        try {
            CrawlReportContentHandler reportHandler;
            if (indexPath != null) {
                FieldConfig fieldConfig = WikipediaDumpParser.getFieldConfig4ParserAttributes();
                IndexWriterConfig indexWriterConfig =
                        new IndexWriterConfig(Version.LUCENE_CURRENT, fieldConfig.createAnalyzer());
                // CREATE replaces any index that already exists at the target path.
                indexWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
                indexWriter = new IndexWriter(new SimpleFSDirectory(new File(indexPath)), indexWriterConfig);

                reportHandler = new CrawlReportContentHandler(
                        new PrintlnContentHandler(
                                PrintlnContentHandler.Verbosity.all,
                                new ToLuceneContentHandler(fieldConfig, indexWriter).setIgnoreAllDocsWithout(new HashMap()))
                                .setShowOnlyErrors(true));
            } else {
                // No target index: crawl and report only.
                reportHandler = new CrawlReportContentHandler(
                        new PrintlnContentHandler(PrintlnContentHandler.Verbosity.all).setShowOnlyErrors(true));
            }

            // NOTE(review): 7000L is presumably the report interval in milliseconds - confirm.
            leech.parse(sourceFile, (ContentHandler) reportHandler.setCyclicReportPrintln(7000L), parseContext);

            if (indexWriter != null) {
                indexWriter.forceMerge(1, true);
            }
        } finally {
            // Close the writer even when crawling fails, so it is not left open.
            if (indexWriter != null) {
                indexWriter.close(true);
            }
        }

        StopWatch.stopAndPrintDistance(startTime);
        System.out.println("..finished crawling " + sourcePath);
    }

    /** Returns whether the given argument is one of the recognized help switches. */
    private static boolean isHelpFlag(String arg) {
        return arg.equals("-?") || arg.equals("-h") || arg.equals("--help");
    }

    /** Prints the usage line followed by an empty line to standard output. */
    private static void printUsage() {
        System.out.println(USAGE);
        System.out.println();
    }
}
