package org.apache.nutch.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileFilter;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintStream;
import java.util.BitSet;
import java.util.StringTokenizer;
import java.util.Vector;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiReader;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.HitCollector;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.store.FSDirectory;
import org.apache.nutch.util.NutchConfiguration;

/* loaded from: input_file:org/apache/nutch/tools/PruneIndexTool.class */
public class PruneIndexTool implements Runnable {
    /** Logger shared by the tool and its nested checker classes. */
    public static final Log LOG = LogFactory.getLog(PruneIndexTool.class);
    /** Not referenced by the code visible in this class; presumably a progress-logging interval - TODO confirm. */
    public static int LOG_STEP = 50000;
    /** Queries whose matching documents are candidates for deletion; assigned in the constructor. */
    private Query[] queries;
    /** Reader over the single index, or a MultiReader over several; used for deletions and closed at the end of run(). */
    private IndexReader reader;
    /** Searcher created on top of the reader in the constructor. */
    private IndexSearcher searcher;
    /** Optional checkers consulted before each deletion; may be null. */
    private PruneChecker[] checkers;
    /** When true, matching documents are counted but not actually deleted. */
    private boolean dryrun;
    /** Prefix for log messages: "[DRY RUN] " in dry-run mode, otherwise empty. */
    private String dr;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/nutch/tools/PruneIndexTool$AllHitsCollector.class */
    /**
     * Hit collector that flags every collected document number in a caller-supplied
     * {@link BitSet}. Scores are ignored.
     */
    public static class AllHitsCollector extends HitCollector {
        /** Bit set owned by the caller; one bit is set per collected document number. */
        private final BitSet matches;

        /**
         * @param bitSet set that receives the collected document numbers
         */
        public AllHitsCollector(BitSet bitSet) {
            matches = bitSet;
        }

        /**
         * Records document {@code i} as a hit.
         *
         * @param i document number reported by the searcher
         * @param f score of the hit (unused)
         */
        public void collect(int i, float f) {
            matches.set(i);
        }
    }

    /* loaded from: input_file:org/apache/nutch/tools/PruneIndexTool$PrintFieldsChecker.class */
    /**
     * Checker that prints selected field values of every candidate document to a
     * {@link PrintStream} and always allows the document to be pruned.
     */
    public static class PrintFieldsChecker implements PruneChecker {
        /** Destination of the per-document lines. */
        private PrintStream ps;
        /** Names of the fields whose values are printed, in output order. */
        private String[] fields;

        /**
         * @param printStream stream the report lines are written to
         * @param strArr      names of the fields to print
         */
        public PrintFieldsChecker(PrintStream printStream, String[] strArr) {
            ps = printStream;
            fields = strArr;
        }

        /** Flushes the stream; the stream itself is left open. */
        @Override
        public void close() {
            ps.flush();
        }

        /**
         * Prints one line of the form {@code #<doc>: field=[v1][v2] ...} for document
         * {@code i}; a field for which the document returns no values is shown as {@code [null]}.
         *
         * @return always {@code true}
         */
        @Override
        public boolean isPrunable(Query query, IndexReader indexReader, int i) throws Exception {
            Document doc = indexReader.document(i);
            StringBuilder line = new StringBuilder();
            line.append('#').append(i).append(':');
            for (String fieldName : fields) {
                String[] fieldValues = doc.getValues(fieldName);
                line.append(' ').append(fieldName).append('=');
                if (fieldValues == null) {
                    line.append("[null]");
                    continue;
                }
                for (String value : fieldValues) {
                    line.append('[').append(value).append(']');
                }
            }
            ps.println(line.toString());
            return true;
        }
    }

    /* loaded from: input_file:org/apache/nutch/tools/PruneIndexTool$PruneChecker.class */
    /**
     * Callback consulted by {@code run()} for each document matched by a pruning query,
     * before that document is deleted.
     */
    public interface PruneChecker {
        /**
         * Decides whether a matched document may be pruned. {@code run()} invokes every
         * configured checker for each candidate and deletes the document only when all of
         * them return {@code true}; an exception thrown here is logged as a warning and the
         * document is left in place.
         *
         * @param query       the query that matched the document
         * @param indexReader reader the document can be loaded from
         * @param i           document number within {@code indexReader}
         * @return {@code true} if the document may be deleted
         * @throws Exception on any failure while inspecting the document
         */
        boolean isPrunable(Query query, IndexReader indexReader, int i) throws Exception;

        /** Invoked by {@code run()} once all queries have been processed, so the checker can flush or release its output. */
        void close();
    }

    /* loaded from: input_file:org/apache/nutch/tools/PruneIndexTool$StoreUrlsChecker.class */
    /**
     * Checker that appends the "url" field of every candidate document to a UTF-8 text
     * file, one URL per line, and always allows the document to be pruned.
     */
    public static class StoreUrlsChecker implements PruneChecker {
        /** Writer over the output file, opened in the constructor. */
        private BufferedWriter output;
        /** When true, the scheme-and-host prefix of each URL is written on an extra line. */
        private boolean storeHomeUrl;

        /**
         * @param file file that receives the URLs (opened for writing, not appending)
         * @param z    whether to also store the "home" URL (everything up to and including
         *             the first '/' after "://") of each document
         * @throws Exception if the file cannot be opened
         */
        public StoreUrlsChecker(File file, boolean z) throws Exception {
            storeHomeUrl = z;
            FileOutputStream rawOut = new FileOutputStream(file);
            output = new BufferedWriter(new OutputStreamWriter(rawOut, "UTF-8"));
        }

        /** Flushes and closes the output file; failures are only logged as warnings. */
        @Override
        public void close() {
            try {
                output.flush();
                output.close();
            } catch (Exception e) {
                if (PruneIndexTool.LOG.isWarnEnabled()) {
                    PruneIndexTool.LOG.warn("Error closing: " + e.getMessage());
                }
            }
        }

        /**
         * Writes the URL of document {@code i} followed by a newline and, if home URLs are
         * requested and the URL contains both "://" and a later '/', the home URL as well.
         *
         * @return always {@code true}
         */
        @Override
        public boolean isPrunable(Query query, IndexReader indexReader, int i) throws Exception {
            String url = indexReader.document(i).get("url");
            output.write(url);
            output.write('\n');
            if (storeHomeUrl) {
                int schemeSep = url.indexOf("://");
                if (schemeSep != -1) {
                    int firstSlash = url.indexOf('/', schemeSep + 3);
                    if (firstSlash != -1) {
                        output.write(url.substring(0, firstSlash + 1) + "\n");
                    }
                }
            }
            return true;
        }
    }

    public PruneIndexTool(File[] fileArr, Query[] queryArr, PruneChecker[] pruneCheckerArr, boolean z, boolean z2) throws Exception {
        FSDirectory directory;
        this.queries = null;
        this.reader = null;
        this.searcher = null;
        this.checkers = null;
        this.dryrun = false;
        this.dr = "";
        if (fileArr == null || queryArr == null) {
            throw new Exception("Invalid arguments.");
        }
        if (fileArr.length == 0 || queryArr.length == 0) {
            throw new Exception("Nothing to do.");
        }
        this.queries = queryArr;
        this.checkers = pruneCheckerArr;
        this.dryrun = z2;
        if (z2) {
            this.dr = "[DRY RUN] ";
        }
        int i = 0;
        if (fileArr.length == 1) {
            FSDirectory directory2 = FSDirectory.getDirectory(fileArr[0], false);
            if (IndexReader.isLocked(directory2)) {
                if (!z) {
                    throw new Exception("Index " + fileArr[0] + " is locked.");
                }
                if (!z2) {
                    IndexReader.unlock(directory2);
                    if (LOG.isDebugEnabled()) {
                        LOG.debug(" - had to unlock index in " + directory2);
                    }
                }
            }
            this.reader = IndexReader.open(directory2);
            i = 1;
        } else {
            Vector vector = new Vector(fileArr.length);
            for (int i2 = 0; i2 < fileArr.length; i2++) {
                try {
                    directory = FSDirectory.getDirectory(fileArr[i2], false);
                } catch (Exception e) {
                    if (LOG.isWarnEnabled()) {
                        LOG.warn(this.dr + "Invalid index in " + fileArr[i2] + " - skipping...");
                    }
                }
                if (IndexReader.isLocked(directory)) {
                    if (!z) {
                        if (LOG.isWarnEnabled()) {
                            LOG.warn(this.dr + "Index " + fileArr[i2] + " is locked. Skipping...");
                        }
                    } else if (!z2) {
                        IndexReader.unlock(directory);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug(" - had to unlock index in " + directory);
                        }
                    }
                }
                vector.add(IndexReader.open(directory));
                i++;
            }
            if (vector.size() == 0) {
                throw new Exception("No input indexes.");
            }
            this.reader = new MultiReader((IndexReader[]) vector.toArray(new IndexReader[0]));
        }
        if (LOG.isInfoEnabled()) {
            LOG.info(this.dr + "Opened " + i + " index(es) with total " + this.reader.numDocs() + " documents.");
        }
        this.searcher = new IndexSearcher(this.reader);
    }

    /**
     * Runs every pruning query, deletes (or, in dry-run mode, only counts) the matching
     * documents that all checkers approve, then closes the checkers and the reader.
     *
     * <p>A query whose search fails with an {@link IOException} is logged and skipped; a
     * failure while checking or deleting a single document is logged and that document is
     * left in place.
     */
    @Override // java.lang.Runnable
    public void run() {
        BitSet hits = new BitSet(this.reader.maxDoc());
        AllHitsCollector collector = new AllHitsCollector(hits);
        for (int q = 0; q < this.queries.length; q++) {
            Query query = this.queries[q];
            if (LOG.isInfoEnabled()) {
                LOG.info(this.dr + "Processing query: " + query.toString());
            }
            hits.clear();
            try {
                this.searcher.search(query, collector);
                if (hits.cardinality() == 0) {
                    if (LOG.isInfoEnabled()) {
                        LOG.info(this.dr + " - no matching documents.");
                    }
                    continue;
                }
                if (LOG.isInfoEnabled()) {
                    LOG.info(this.dr + " - found " + hits.cardinality() + " document(s).");
                }
                int deleted = 0;
                // The cursor advances on every iteration, including for hits that are already
                // deleted; otherwise such a hit would be returned by nextSetBit forever.
                for (int doc = hits.nextSetBit(0); doc != -1; doc = hits.nextSetBit(doc + 1)) {
                    if (this.reader.isDeleted(doc)) {
                        continue;
                    }
                    try {
                        if (passesAllCheckers(query, doc)) {
                            if (!this.dryrun) {
                                this.reader.deleteDocument(doc);
                            }
                            deleted++;
                        }
                    } catch (Exception e) {
                        if (LOG.isWarnEnabled()) {
                            LOG.warn(this.dr + " - failed to delete doc #" + doc, e);
                        }
                    }
                }
                if (LOG.isInfoEnabled()) {
                    LOG.info(this.dr + " - deleted " + deleted + " document(s).");
                }
            } catch (IOException e) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn(this.dr + " - failed: " + e.getMessage());
                }
            }
        }
        if (this.checkers != null) {
            for (int c = 0; c < this.checkers.length; c++) {
                this.checkers[c].close();
            }
        }
        try {
            this.reader.close();
        } catch (IOException e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn(this.dr + "Exception when closing reader(s): " + e.getMessage());
            }
        }
    }

    /**
     * Asks every configured checker about one document. All checkers are always invoked,
     * even after one has answered {@code false}, because checkers also produce output.
     *
     * @return {@code true} if there are no checkers or all of them approve the deletion
     */
    private boolean passesAllCheckers(Query query, int doc) throws Exception {
        if (this.checkers == null) {
            return true;
        }
        boolean prunable = true;
        for (int c = 0; c < this.checkers.length; c++) {
            // Deliberately non-short-circuit (&=): see method comment.
            prunable &= this.checkers[c].isPrunable(query, this.reader, doc);
        }
        return prunable;
    }

    /**
     * Command-line entry point.
     *
     * <p>The first argument is either an index directory or a directory whose
     * sub-directories each contain an {@code index} directory. The remaining arguments are
     * the options printed by {@code usage()}. Pruning queries are read from the file given
     * with {@code -queries} or, if absent, from the resource named by the
     * {@code prune.index.tool.queries} configuration property.
     *
     * @param strArr command-line arguments
     * @throws Exception if the query file or the output file cannot be opened
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length == 0) {
            usage();
            if (LOG.isFatalEnabled()) {
                LOG.fatal("Missing arguments");
            }
            return;
        }
        File root = new File(strArr[0]);
        if (!root.isDirectory()) {
            usage();
            if (LOG.isFatalEnabled()) {
                LOG.fatal("Not a directory: " + root);
            }
            return;
        }

        // Locate the index directories: either the root itself or <root>/*/index.
        Vector<File> indexDirs = new Vector<File>();
        if (IndexReader.indexExists(root)) {
            indexDirs.add(root);
        } else {
            File[] subDirs = root.listFiles(new FileFilter() {
                @Override // java.io.FileFilter
                public boolean accept(File candidate) {
                    return candidate.isDirectory();
                }
            });
            if (subDirs == null || subDirs.length == 0) {
                usage();
                if (LOG.isFatalEnabled()) {
                    LOG.fatal("No indexes in " + root);
                }
                return;
            }
            for (File subDir : subDirs) {
                File indexDir = new File(subDir, "index");
                if (indexDir.exists() && indexDir.isDirectory() && IndexReader.indexExists(indexDir)) {
                    indexDirs.add(indexDir);
                }
            }
            if (indexDirs.isEmpty()) {
                usage();
                if (LOG.isFatalEnabled()) {
                    LOG.fatal("No indexes in " + root + " or its subdirs.");
                }
                return;
            }
        }
        File[] fileArr = indexDirs.toArray(new File[0]);

        // Parse options.
        boolean force = false;
        boolean dryrun = false;
        String queryFile = null;
        String outputFile = null;
        String showFields = null;
        for (int i = 1; i < strArr.length; i++) {
            String option = strArr[i];
            if (option.equals("-force")) {
                force = true;
            } else if (option.equals("-dryrun")) {
                dryrun = true;
            } else if (option.equals("-queries") || option.equals("-output") || option.equals("-showfields")) {
                // These options take a value; report a missing one instead of failing
                // with an ArrayIndexOutOfBoundsException.
                if (i + 1 >= strArr.length) {
                    usage();
                    if (LOG.isFatalEnabled()) {
                        LOG.fatal("Missing value for option: " + option);
                    }
                    return;
                }
                String value = strArr[++i];
                if (option.equals("-queries")) {
                    queryFile = value;
                } else if (option.equals("-output")) {
                    outputFile = value;
                } else {
                    showFields = value;
                }
            } else {
                usage();
                if (LOG.isFatalEnabled()) {
                    LOG.fatal("Unrecognized option: " + option);
                }
                return;
            }
        }

        // Load the queries first, so that a bad query source does not leave a freshly
        // truncated output file behind.
        InputStream queryStream;
        if (queryFile != null) {
            queryStream = new FileInputStream(queryFile);
        } else {
            Configuration conf = NutchConfiguration.create();
            queryFile = conf.get("prune.index.tool.queries");
            // A missing property is reported below rather than passed on as a null name.
            queryStream = (queryFile != null) ? conf.getConfResourceAsInputStream(queryFile) : null;
        }
        if (queryStream == null) {
            if (LOG.isFatalEnabled()) {
                LOG.fatal("Can't load queries from " + queryFile);
            }
            return;
        }
        Query[] queries;
        try {
            queries = parseQueries(queryStream);
        } catch (Exception e) {
            if (LOG.isFatalEnabled()) {
                LOG.fatal("Error parsing queries: " + e.getMessage(), e);
            }
            return;
        } finally {
            try {
                queryStream.close();
            } catch (IOException closeError) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Error closing queries stream: " + closeError.getMessage());
                }
            }
        }

        // Build the optional checkers.
        Vector<PruneChecker> checkerList = new Vector<PruneChecker>();
        if (showFields != null) {
            StringTokenizer tokens = new StringTokenizer(showFields, ",");
            Vector<String> fieldNames = new Vector<String>();
            while (tokens.hasMoreTokens()) {
                fieldNames.add(tokens.nextToken());
            }
            checkerList.add(new PrintFieldsChecker(System.out, fieldNames.toArray(new String[0])));
        }
        if (outputFile != null) {
            checkerList.add(new StoreUrlsChecker(new File(outputFile), false));
        }
        PruneChecker[] pruneCheckerArr = checkerList.isEmpty() ? null : checkerList.toArray(new PruneChecker[0]);

        try {
            new PruneIndexTool(fileArr, queries, pruneCheckerArr, force, dryrun).run();
        } catch (Exception e) {
            if (LOG.isFatalEnabled()) {
                LOG.fatal("Error running PruneIndexTool: " + e.getMessage(), e);
            }
        }
    }

    /**
     * Reads pruning queries from a UTF-8 text stream, one query per line. Lines that are
     * empty after trimming, or whose first non-blank character is '#', are ignored. Each
     * remaining line is parsed with a {@link QueryParser} whose default field is "url" and
     * whose analyzer is a {@link WhitespaceAnalyzer}.
     *
     * <p>The stream is read to its end but is not closed by this method.
     *
     * @param inputStream source of the query lines
     * @return the parsed queries in file order (possibly empty)
     * @throws Exception if reading fails or a line cannot be parsed
     */
    public static Query[] parseQueries(InputStream inputStream) throws Exception {
        BufferedReader lines = new BufferedReader(new InputStreamReader(inputStream, "UTF-8"));
        QueryParser parser = new QueryParser("url", new WhitespaceAnalyzer());
        Vector<Query> parsed = new Vector<Query>();
        for (String raw = lines.readLine(); raw != null; raw = lines.readLine()) {
            String text = raw.trim();
            boolean blank = text.length() == 0;
            if (blank || text.charAt(0) == '#') {
                continue;
            }
            parsed.add(parser.parse(text));
        }
        return parsed.toArray(new Query[0]);
    }

    /** Prints the command-line synopsis and option descriptions to standard error. */
    private static void usage() {
        final String[] helpLines = {
            "PruneIndexTool <indexDir | segmentsDir> [-dryrun] [-force] [-queries filename] [-output filename] [-showfields field1,field2,field3...]",
            "\tNOTE: exactly one of <indexDir> or <segmentsDir> MUST be provided!\n",
            "\t-dryrun\t\t\tdon't do anything, just show what would be done.",
            "\t-force\t\t\tforce index unlock, if locked. Use with caution!",
            "\t-queries filename\tread pruning queries from this file, instead of the",
            "\t\t\t\tdefault defined in Nutch config files under 'prune.index.tool.queries' key.\n",
            "\t-output filename\tstore pruned URLs in a text file",
            "\t-showfields field1,field2...\tfor each deleted document show the values of the selected fields.",
            "\t\t\t\tNOTE 1: this will slow down processing by orders of magnitude.",
            "\t\t\t\tNOTE 2: only values of stored fields will be shown.",
        };
        for (String helpLine : helpLines) {
            System.err.println(helpLine);
        }
    }
}
