package org.apache.nutch.indexer;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.Iterator;
import java.util.Random;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.MD5Hash;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputFormatBase;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolBase;
import org.apache.lucene.index.IndexReader;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/*  JADX ERROR: NullPointerException in pass: ClassModifier
    java.lang.NullPointerException: Cannot invoke "java.util.List.forEach(java.util.function.Consumer)" because "blocks" is null
    	at jadx.core.utils.BlockUtils.collectAllInsns(BlockUtils.java:1017)
    	at jadx.core.dex.visitors.ClassModifier.removeBridgeMethod(ClassModifier.java:239)
    	at jadx.core.dex.visitors.ClassModifier.removeSyntheticMethods(ClassModifier.java:154)
    	at java.base/java.util.ArrayList.forEach(ArrayList.java:1596)
    	at jadx.core.dex.visitors.ClassModifier.visit(ClassModifier.java:64)
    	at jadx.core.dex.visitors.ClassModifier.visit(ClassModifier.java:57)
    */
/* loaded from: input_file:org/apache/nutch/indexer/DeleteDuplicates.class */
public class DeleteDuplicates extends ToolBase implements Mapper, Reducer, OutputFormat {
    private static final Log LOG = LogFactory.getLog(DeleteDuplicates.class);
    private FileSystem fs;

    /* loaded from: input_file:org/apache/nutch/indexer/DeleteDuplicates$HashPartitioner.class */
    /**
     * Partitioner that routes a record by the hash code of its key, which is
     * expected to be an {@link MD5Hash}.
     */
    public static class HashPartitioner implements Partitioner {
        /** Reads nothing from the job configuration. */
        public void configure(JobConf jobConf) {
        }

        /** Holds no resources, so there is nothing to release. */
        public void close() {
        }

        /**
         * Computes the partition for a record.
         *
         * @param writableComparable the key; cast to {@link MD5Hash}
         * @param writable the value (not read)
         * @param i the number of partitions
         * @return the key's hash code, with the sign bit cleared, modulo {@code i}
         */
        public int getPartition(WritableComparable writableComparable, Writable writable, int i) {
            MD5Hash contentHash = (MD5Hash) writableComparable;
            // Clearing the sign bit keeps the remainder non-negative.
            int nonNegativeHash = contentHash.hashCode() & Integer.MAX_VALUE;
            return nonNegativeHash % i;
        }
    }

    /* loaded from: input_file:org/apache/nutch/indexer/DeleteDuplicates$HashReducer.class */
    /**
     * Reducer that, within one group of {@link IndexDoc} values, keeps a single
     * document and emits every other one flagged for removal.
     */
    public static class HashReducer implements Reducer {
        /** When true the highest score wins; otherwise the shortest URL wins. */
        boolean byScore;

        /**
         * Reads the {@code dedup.keep.highest.score} setting (default {@code true}).
         *
         * @param jobConf the job configuration
         */
        public void configure(JobConf jobConf) {
            this.byScore = jobConf.getBoolean("dedup.keep.highest.score", true);
        }

        /** Holds no resources, so there is nothing to release. */
        public void close() {
        }

        /**
         * Emits, keyed by URL, every document of the group that must be dropped:
         * those that arrive already flagged ({@code keep == false}) and those that
         * lose against the best document seen so far. The surviving document is
         * not emitted.
         *
         * @param writableComparable the group key (not read)
         * @param it iterator over the group's {@link IndexDoc} values
         * @param outputCollector receives {@code (url, doc)} for each dropped document
         * @param reporter progress reporter (not used)
         * @throws IOException if collecting an output record fails
         */
        public void reduce(WritableComparable writableComparable, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            // Evaluate once so that no debug string is built when debug is off.
            final boolean debug = DeleteDuplicates.LOG.isDebugEnabled();
            IndexDoc best = null;
            while (it.hasNext()) {
                IndexDoc candidate = (IndexDoc) it.next();
                if (!candidate.keep) {
                    // Flagged upstream: pass the document straight through.
                    if (debug) {
                        DeleteDuplicates.LOG.debug("-discard " + candidate + " (already marked)");
                    }
                    outputCollector.collect(candidate.url, candidate);
                    continue;
                }
                if (best == null) {
                    best = candidate;
                    continue;
                }
                boolean candidateWins = this.byScore
                        ? candidate.score > best.score
                        : candidate.urlLen < best.urlLen;
                IndexDoc loser = candidateWins ? best : candidate;
                IndexDoc winner = candidateWins ? candidate : best;
                if (debug) {
                    DeleteDuplicates.LOG.debug("-discard " + loser + ", keep " + winner);
                }
                loser.keep = false;
                outputCollector.collect(loser.url, loser);
                best = winner;
            }
            if (debug) {
                DeleteDuplicates.LOG.debug("-keep " + best);
            }
        }
    }

    /* loaded from: input_file:org/apache/nutch/indexer/DeleteDuplicates$IndexDoc.class */
    /**
     * Writable record describing one document of one index: its URL, score,
     * time, content hash, the index it lives in, its document number and a
     * flag telling whether it should be kept.
     */
    public static class IndexDoc implements WritableComparable {
        // Not serialized; recomputed from the URL in readFields().
        private int urlLen;
        private float score;
        private long time;
        private int doc;
        private Text url = new Text();
        private MD5Hash hash = new MD5Hash();
        private Text index = new Text();
        private boolean keep = true;

        /** Creates a record with empty fields and {@code keep == true}. */
        public IndexDoc() {
        }

        /** Returns a bracketed, comma-separated dump of the fields (without the URL length). */
        public String toString() {
            return "[url=" + this.url + ",score=" + this.score + ",time=" + this.time + ",hash=" + this.hash + ",index=" + this.index + ",doc=" + this.doc + ",keep=" + this.keep + "]";
        }

        /**
         * Serializes url, score, time, hash, index, doc and keep, in that order.
         *
         * @param dataOutput the sink to write to
         * @throws IOException if writing fails
         */
        public void write(DataOutput dataOutput) throws IOException {
            this.url.write(dataOutput);
            dataOutput.writeFloat(this.score);
            dataOutput.writeLong(this.time);
            this.hash.write(dataOutput);
            this.index.write(dataOutput);
            dataOutput.writeInt(this.doc);
            dataOutput.writeBoolean(this.keep);
        }

        /**
         * Reads the fields in the order used by {@link #write(DataOutput)} and
         * derives the URL length from the URL just read.
         *
         * @param dataInput the source to read from
         * @throws IOException if reading fails
         */
        public void readFields(DataInput dataInput) throws IOException {
            this.url.readFields(dataInput);
            this.urlLen = this.url.getLength();
            this.score = dataInput.readFloat();
            this.time = dataInput.readLong();
            this.hash.readFields(dataInput);
            this.index.readFields(dataInput);
            this.doc = dataInput.readInt();
            this.keep = dataInput.readBoolean();
        }

        /**
         * Orders records by keep flag, then hash, then time, then URL length,
         * then score. Returns 0 when all of these are equal; {@code index} and
         * {@code doc} are not compared, so a result of 0 does not imply
         * {@link #equals(Object)}.
         *
         * @param obj the other {@link IndexDoc}
         * @return a negative, zero or positive value per the {@code Comparable} contract
         * @throws ClassCastException if {@code obj} is not an {@link IndexDoc}
         */
        public int compareTo(Object obj) {
            IndexDoc that = (IndexDoc) obj;
            if (this.keep != that.keep) {
                return this.keep ? 1 : -1;
            }
            if (!this.hash.equals(that.hash)) {
                return this.hash.compareTo(that.hash);
            }
            if (this.time != that.time) {
                return this.time > that.time ? 1 : -1;
            }
            // The previous code compared this.urlLen with itself, so this
            // tie-breaker could never fire.
            if (this.urlLen != that.urlLen) {
                return this.urlLen < that.urlLen ? -1 : 1;
            }
            // Yields 0 for equal scores; the previous code returned -1 there,
            // which made x.compareTo(x) negative.
            return Float.compare(this.score, that.score);
        }

        /**
         * Field-wise equality over keep, hash, time, score, URL length, index
         * and doc.
         *
         * @param obj the object to compare with
         * @return {@code false} for {@code null} or any non-{@link IndexDoc} argument
         */
        public boolean equals(Object obj) {
            if (!(obj instanceof IndexDoc)) {
                return false;
            }
            IndexDoc that = (IndexDoc) obj;
            return this.keep == that.keep
                    && this.hash.equals(that.hash)
                    && this.time == that.time
                    && this.score == that.score
                    && this.urlLen == that.urlLen
                    && this.index.equals(that.index)
                    && this.doc == that.doc;
        }

        /**
         * Hash code consistent with {@link #equals(Object)}. The score is left
         * out on purpose: {@code equals} compares it with {@code ==}, under
         * which 0.0f and -0.0f are equal although their bit patterns differ.
         */
        public int hashCode() {
            int result = this.hash.hashCode();
            result = 31 * result + (int) (this.time ^ (this.time >>> 32));
            result = 31 * result + this.urlLen;
            result = 31 * result + this.index.hashCode();
            result = 31 * result + this.doc;
            result = 31 * result + (this.keep ? 1 : 0);
            return result;
        }

        /*
         * Synthetic accessors. The decompiler could not decode the first two and
         * had emitted stubs that always threw; they are reconstructed here from
         * the register listing it left behind (assign, then return the value
         * that was stored).
         */

        /** Sets {@code time} and returns the stored value. */
        static /* synthetic */ long access$402(IndexDoc indexDoc, long j) {
            indexDoc.time = j;
            return j;
        }

        /** Adds {@code j} to {@code time} and returns the new value. */
        static /* synthetic */ long access$414(IndexDoc indexDoc, long j) {
            long updated = indexDoc.time + j;
            indexDoc.time = updated;
            return updated;
        }

        /** Sets {@code index} and returns the stored value. */
        static /* synthetic */ Text access$502(IndexDoc indexDoc, Text text) {
            indexDoc.index = text;
            return text;
        }

        /** Sets {@code doc} and returns the stored value. */
        static /* synthetic */ int access$602(IndexDoc indexDoc, int i) {
            indexDoc.doc = i;
            return i;
        }
    }

    /* loaded from: input_file:org/apache/nutch/indexer/DeleteDuplicates$InputFormat.class */
    /**
     * Input format that produces one split per input path and reads each split
     * through a {@link DDRecordReader} opened on that path.
     */
    public static class InputFormat extends InputFormatBase {
        // Nominal length given to every split; numerically Integer.MAX_VALUE.
        private static final long INDEX_LENGTH = 2147483647L;

        /** Record reader backed by a Lucene {@link IndexReader} opened on the split's path. */
        public class DDRecordReader implements RecordReader {
            private IndexReader indexReader;
            private int maxDoc;
            // Current document number. No visible code assigns it; presumably
            // next() advances it, but that method could not be decompiled.
            private int doc;
            private Text index;
            final /* synthetic */ InputFormat this$0;

            /**
             * Opens the index located at the split's path.
             *
             * @param inputFormat the enclosing input format
             * @param fileSplit the split whose path names the index
             * @param jobConf the job configuration, used to obtain the file system
             * @param text the index name stored in this reader
             * @throws IOException if the file system or the index cannot be opened
             */
            public DDRecordReader(InputFormat inputFormat, FileSplit fileSplit, JobConf jobConf, Text text) throws IOException {
                this.this$0 = inputFormat;
                this.indexReader = IndexReader.open(new FsDirectory(FileSystem.get(jobConf), fileSplit.getPath(), false, jobConf));
                this.maxDoc = this.indexReader.maxDoc();
                this.index = text;
            }

            /*
             * NOTE: JADX failed to decompile this method (InlineMethods pass, while
             * processing IndexDoc.access$402). The body below is the decompiler's
             * placeholder and always throws; the real logic (240 bytecode
             * instructions) is not available in this source.
             */
            public boolean next(org.apache.hadoop.io.Writable r6, org.apache.hadoop.io.Writable r7) throws java.io.IOException {
                /*
                    Method dump skipped, instructions count: 240
                    To view this dump add '--comments-level debug' option
                */
                throw new UnsupportedOperationException("Method not decompiled: org.apache.nutch.indexer.DeleteDuplicates.InputFormat.DDRecordReader.next(org.apache.hadoop.io.Writable, org.apache.hadoop.io.Writable):boolean");
            }

            /**
             * Returns the current document number scaled to the nominal split
             * length, or 0 for an empty index.
             */
            public long getPos() throws IOException {
                if (this.maxDoc == 0) {
                    return 0L;
                }
                return (this.doc * InputFormat.INDEX_LENGTH) / this.maxDoc;
            }

            /** Closes the underlying index reader. */
            public void close() throws IOException {
                this.indexReader.close();
            }

            /** Creates an empty {@link Text} key. */
            public WritableComparable createKey() {
                return new Text();
            }

            /** Creates an empty {@link IndexDoc} value. */
            public Writable createValue() {
                return new IndexDoc();
            }

            /**
             * Returns the fraction {@code doc / maxDoc}, or 0 for an empty index.
             */
            public float getProgress() throws IOException {
                if (this.maxDoc == 0) {
                    return 0.0f;
                }
                // Divide as floats: int / int truncated every intermediate
                // value to 0 before it was widened to float.
                return (float) this.doc / (float) this.maxDoc;
            }
        }

        public InputFormat() {
        }

        /**
         * Builds one {@link FileSplit} per path returned by {@code listPaths},
         * each starting at offset 0 with length {@code INDEX_LENGTH}.
         *
         * @param jobConf the job configuration
         * @param i the requested number of splits (ignored)
         * @return one split per input path
         * @throws IOException if listing the input paths fails
         */
        public InputSplit[] getSplits(JobConf jobConf, int i) throws IOException {
            Path[] listPaths = listPaths(jobConf);
            InputSplit[] inputSplitArr = new InputSplit[listPaths.length];
            for (int i2 = 0; i2 < listPaths.length; i2++) {
                inputSplitArr[i2] = new FileSplit(listPaths[i2], 0L, INDEX_LENGTH, jobConf);
            }
            return inputSplitArr;
        }

        /**
         * Reports the split's path as the task status and returns a reader for it.
         *
         * @param inputSplit the split to read; cast to {@link FileSplit}
         * @param jobConf the job configuration
         * @param reporter receives the split path as status text
         * @return a new {@link DDRecordReader} for the split
         * @throws IOException if the reader cannot be opened
         */
        public RecordReader getRecordReader(InputSplit inputSplit, JobConf jobConf, Reporter reporter) throws IOException {
            FileSplit fileSplit = (FileSplit) inputSplit;
            Text text = new Text(fileSplit.getPath().toString());
            reporter.setStatus(text.toString());
            return new DDRecordReader(this, fileSplit, jobConf, text);
        }
    }

    /* loaded from: input_file:org/apache/nutch/indexer/DeleteDuplicates$UrlsReducer.class */
    /**
     * Reducer that, within one group of {@link IndexDoc} values, keeps the
     * document with the greatest {@code time} and flags all others for removal.
     * Every document of the group is emitted, keyed by its content hash.
     */
    public static class UrlsReducer implements Reducer {
        public UrlsReducer() {
        }

        /** Reads nothing from the job configuration. */
        public void configure(JobConf jobConf) {
        }

        /** Holds no resources, so there is nothing to release. */
        public void close() {
        }

        /**
         * Emits each losing document with {@code keep == false} as soon as it
         * loses, then the winner with {@code keep == true}. On equal times the
         * document seen first stays the winner. An empty group emits nothing.
         *
         * @param writableComparable the group key (not read)
         * @param it iterator over the group's {@link IndexDoc} values
         * @param outputCollector receives {@code (hash, doc)} for every document
         * @param reporter progress reporter (not used)
         * @throws IOException if collecting an output record fails
         */
        public void reduce(WritableComparable writableComparable, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            // Evaluate once so that no debug string is built when debug is off.
            final boolean debug = DeleteDuplicates.LOG.isDebugEnabled();
            IndexDoc latest = null;
            while (it.hasNext()) {
                IndexDoc current = (IndexDoc) it.next();
                if (latest == null) {
                    latest = current;
                    continue;
                }
                IndexDoc discarded;
                if (current.time > latest.time) {
                    discarded = latest;
                    latest = current;
                } else {
                    discarded = current;
                }
                discarded.keep = false;
                if (debug) {
                    DeleteDuplicates.LOG.debug("-discard " + discarded + ", keep " + latest);
                }
                outputCollector.collect(discarded.hash, discarded);
            }
            if (latest == null) {
                // No values at all: previously this dereferenced null.
                return;
            }
            latest.keep = true;
            outputCollector.collect(latest.hash, latest);
        }
    }

    /**
     * Configures this instance from the job by delegating to
     * {@link #setConf(Configuration)}.
     *
     * @param jobConf the job configuration
     */
    public void configure(JobConf jobConf) {
        this.setConf(jobConf);
    }

    /**
     * Stores the configuration in the superclass and obtains the matching
     * file system.
     *
     * @param configuration the configuration to use
     * @throws RuntimeException wrapping the {@link IOException} raised when the
     *         file system cannot be obtained
     */
    public void setConf(Configuration configuration) {
        super.setConf(configuration);
        FileSystem resolved;
        try {
            resolved = FileSystem.get(configuration);
        } catch (IOException cause) {
            throw new RuntimeException(cause);
        }
        this.fs = resolved;
    }

    /** Does nothing. */
    public void close() {
        // Intentionally empty.
    }

    /**
     * Emits {@code (index, doc number)} for every document that is flagged for
     * removal; documents with {@code keep == true} produce no output.
     *
     * @param writableComparable the input key (not read)
     * @param writable the input value; cast to {@link IndexDoc}
     * @param outputCollector receives the index name and document number
     * @param reporter progress reporter (not used)
     * @throws IOException if collecting the output record fails
     */
    public void map(WritableComparable writableComparable, Writable writable, OutputCollector outputCollector, Reporter reporter) throws IOException {
        IndexDoc candidate = (IndexDoc) writable;
        if (!candidate.keep) {
            IntWritable docNumber = new IntWritable(candidate.doc);
            outputCollector.collect(candidate.index, docNumber);
        }
    }

    /**
     * Opens the index named by the key and deletes from it every document
     * number found in the values.
     *
     * @param writableComparable the key; its string form is used as the index path
     * @param it iterator over {@link IntWritable} document numbers to delete
     * @param outputCollector not used; this reducer emits nothing
     * @param reporter progress reporter (not used)
     * @throws IOException if the index cannot be opened, modified or closed
     */
    public void reduce(WritableComparable writableComparable, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
        Path path = new Path(writableComparable.toString());
        IndexReader open = IndexReader.open(new FsDirectory(this.fs, path, false, getConf()));
        // The try/finally must enclose the whole loop. Previously it sat inside
        // the loop body, so the reader was closed after the first deletion and
        // was never closed when there were no values.
        try {
            while (it.hasNext()) {
                IntWritable intWritable = (IntWritable) it.next();
                if (LOG.isDebugEnabled()) {
                    LOG.debug("-delete " + path + " doc=" + intWritable);
                }
                open.deleteDocument(intWritable.get());
            }
        } finally {
            open.close();
        }
    }

    /**
     * Returns a record writer that accepts no records: {@code write} always
     * throws {@link UnsupportedOperationException} and {@code close} does nothing.
     *
     * @param fileSystem not used
     * @param jobConf not used
     * @param str not used
     * @param progressable not used
     * @return the rejecting record writer
     * @throws IOException never thrown by this implementation
     */
    public RecordWriter getRecordWriter(FileSystem fileSystem, JobConf jobConf, String str, Progressable progressable) throws IOException {
        RecordWriter rejectingWriter = new RecordWriter() {
            /** Always fails; no output is expected to reach this writer. */
            public void write(WritableComparable writableComparable, Writable writable) throws IOException {
                throw new UnsupportedOperationException();
            }

            /** Nothing to flush or release. */
            public void close(Reporter reporter) throws IOException {
            }
        };
        return rejectingWriter;
    }

    /** Creates an instance without a configuration. */
    public DeleteDuplicates() {
        // Intentionally empty.
    }

    /**
     * Creates an instance and applies the given configuration through
     * {@link #setConf(Configuration)}.
     *
     * @param configuration the configuration to use
     */
    public DeleteDuplicates(Configuration configuration) {
        this.setConf(configuration);
    }

    /**
     * Performs no validation of the output specification.
     *
     * @param fileSystem not used
     * @param jobConf not used
     */
    public void checkOutputSpecs(FileSystem fileSystem, JobConf jobConf) {
        // Intentionally empty.
    }

    /**
     * Runs the three de-duplication jobs in sequence over the given indexes:
     * <ol>
     *   <li>"dedup 1: urls by time" reads the indexes through {@link InputFormat}
     *       and reduces with {@link UrlsReducer} into a temporary directory;</li>
     *   <li>"dedup 2: content by hash" partitions that output with
     *       {@link HashPartitioner} and reduces with {@link HashReducer} into a
     *       second temporary directory;</li>
     *   <li>"dedup 3: delete from index(es)" uses this class as mapper, reducer
     *       and output format.</li>
     * </ol>
     * Each temporary directory is deleted once the job that consumes it has
     * finished.
     *
     * @param pathArr the input paths to add to the first job
     * @throws IOException if a job or a file system operation fails
     */
    public void dedup(Path[] pathArr) throws IOException {
        if (LOG.isInfoEnabled()) {
            LOG.info("Dedup: starting");
        }

        // Job 1: urls by time.
        Path urlsDir = new Path("dedup-urls-" + new Random().nextInt(Integer.MAX_VALUE));
        NutchJob urlsJob = new NutchJob(getConf());
        for (int idx = 0; idx < pathArr.length; idx++) {
            Path indexes = pathArr[idx];
            if (LOG.isInfoEnabled()) {
                LOG.info("Dedup: adding indexes in: " + indexes);
            }
            urlsJob.addInputPath(indexes);
        }
        urlsJob.setJobName("dedup 1: urls by time");
        urlsJob.setInputFormat(InputFormat.class);
        urlsJob.setMapOutputKeyClass(Text.class);
        urlsJob.setMapOutputValueClass(IndexDoc.class);
        urlsJob.setReducerClass(UrlsReducer.class);
        urlsJob.setOutputPath(urlsDir);
        urlsJob.setOutputKeyClass(MD5Hash.class);
        urlsJob.setOutputValueClass(IndexDoc.class);
        urlsJob.setOutputFormat(SequenceFileOutputFormat.class);
        JobClient.runJob(urlsJob);

        // Job 2: content by hash, fed by the output of job 1.
        Path hashDir = new Path("dedup-hash-" + new Random().nextInt(Integer.MAX_VALUE));
        NutchJob hashJob = new NutchJob(getConf());
        hashJob.setJobName("dedup 2: content by hash");
        hashJob.addInputPath(urlsDir);
        hashJob.setInputFormat(SequenceFileInputFormat.class);
        hashJob.setMapOutputKeyClass(MD5Hash.class);
        hashJob.setMapOutputValueClass(IndexDoc.class);
        hashJob.setPartitionerClass(HashPartitioner.class);
        hashJob.setSpeculativeExecution(false);
        hashJob.setReducerClass(HashReducer.class);
        hashJob.setOutputPath(hashDir);
        hashJob.setOutputKeyClass(Text.class);
        hashJob.setOutputValueClass(IndexDoc.class);
        hashJob.setOutputFormat(SequenceFileOutputFormat.class);
        JobClient.runJob(hashJob);
        this.fs.delete(urlsDir);

        // Job 3: delete the flagged documents, fed by the output of job 2.
        NutchJob deleteJob = new NutchJob(getConf());
        deleteJob.setJobName("dedup 3: delete from index(es)");
        deleteJob.addInputPath(hashDir);
        deleteJob.setInputFormat(SequenceFileInputFormat.class);
        deleteJob.setInt("io.file.buffer.size", 4096);
        deleteJob.setMapperClass(DeleteDuplicates.class);
        deleteJob.setReducerClass(DeleteDuplicates.class);
        deleteJob.setOutputFormat(DeleteDuplicates.class);
        deleteJob.setOutputKeyClass(Text.class);
        deleteJob.setOutputValueClass(IntWritable.class);
        JobClient.runJob(deleteJob);
        this.fs.delete(hashDir);

        if (LOG.isInfoEnabled()) {
            LOG.info("Dedup: done");
        }
    }

    /**
     * Command-line entry point: runs the tool with a freshly created Nutch
     * configuration and exits the JVM with the status it returns.
     *
     * @param strArr the command-line arguments
     * @throws Exception if running the tool fails
     */
    public static void main(String[] strArr) throws Exception {
        DeleteDuplicates tool = new DeleteDuplicates();
        int exitStatus = tool.doMain(NutchConfiguration.create(), strArr);
        System.exit(exitStatus);
    }

    /**
     * Treats each argument as an index path and runs {@link #dedup(Path[])}
     * over all of them.
     *
     * @param strArr the index paths; at least one is required
     * @return 0 on success; -1 when no argument is given (a usage line is
     *         printed to standard error) or when de-duplication throws (the
     *         exception is logged at fatal level)
     * @throws Exception declared by the tool contract
     */
    public int run(String[] strArr) throws Exception {
        final int argCount = strArr.length;
        if (argCount < 1) {
            System.err.println("Usage: DeleteDuplicates <indexes> ...");
            return -1;
        }

        Path[] indexes = new Path[argCount];
        for (int pos = 0; pos < argCount; pos++) {
            indexes[pos] = new Path(strArr[pos]);
        }

        int status;
        try {
            dedup(indexes);
            status = 0;
        } catch (Exception failure) {
            LOG.fatal("DeleteDuplicates: " + StringUtils.stringifyException(failure));
            status = -1;
        }
        return status;
    }

    // Static initializer with an empty body; it executes no statements.
    static {
    }
}
