package org.apache.nutch.crawl;

import java.io.IOException;
import java.util.Iterator;
import java.util.Random;
import java.util.TreeMap;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Closeable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.HashPartitioner;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/* loaded from: input_file:org/apache/nutch/crawl/CrawlDbReader.class */
public class CrawlDbReader implements Closeable {
    public static final Log LOG = LogFactory.getLog(CrawlDbReader.class);
    private MapFile.Reader[] readers = null;

    /* loaded from: input_file:org/apache/nutch/crawl/CrawlDbReader$CrawlDbDumpReducer.class */
    /**
     * Pass-through reducer: every value received for a key is emitted again
     * under that same key, unchanged.
     */
    public static class CrawlDbDumpReducer implements Reducer {
        /** No configuration is needed. */
        public void configure(JobConf jobConf) {
        }

        /** No resources are held, so there is nothing to release. */
        public void close() {
        }

        /**
         * Forwards each value from {@code it} to {@code outputCollector},
         * paired with the incoming key.
         */
        public void reduce(WritableComparable writableComparable, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            for (Iterator values = it; values.hasNext(); ) {
                Writable value = (Writable) values.next();
                outputCollector.collect(writableComparable, value);
            }
        }
    }

    /* loaded from: input_file:org/apache/nutch/crawl/CrawlDbReader$CrawlDbStatCombiner.class */
    /**
     * Combiner for the statistics job.
     *
     * <ul>
     *   <li>Key {@code "s"} (raw scaled scores) is folded into three records:
     *       {@code "scn"} (minimum), {@code "scx"} (maximum) and {@code "sct"} (sum).</li>
     *   <li>Keys {@code "scn"} / {@code "scx"} are folded with min / max, so the
     *       combiner stays correct when it is applied to output that was already
     *       combined once.</li>
     *   <li>Every other key is summed.</li>
     * </ul>
     */
    public static class CrawlDbStatCombiner implements Reducer {
        /** Reused output value for the single-record branches. */
        LongWritable val = new LongWritable();

        public void configure(JobConf jobConf) {
        }

        public void close() {
        }

        /**
         * Folds all values of one key as described on the class.
         *
         * @throws IOException if the collector fails to accept a record
         */
        public void reduce(WritableComparable writableComparable, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            String name = ((Text) writableComparable).toString();

            if (name.equals("s")) {
                long total = 0;
                long min = Long.MAX_VALUE;
                long max = Long.MIN_VALUE;
                while (it.hasNext()) {
                    long score = ((LongWritable) it.next()).get();
                    if (score < min) {
                        min = score;
                    }
                    if (score > max) {
                        max = score;
                    }
                    total += score;
                }
                outputCollector.collect(new Text("scn"), new LongWritable(min));
                outputCollector.collect(new Text("scx"), new LongWritable(max));
                outputCollector.collect(new Text("sct"), new LongWritable(total));
                return;
            }

            if (name.equals("scn")) {
                // Already-combined minimum: keep the smallest, never add them up.
                this.val.set(Long.MAX_VALUE);
                while (it.hasNext()) {
                    this.val.set(Math.min(this.val.get(), ((LongWritable) it.next()).get()));
                }
            } else if (name.equals("scx")) {
                // Already-combined maximum: keep the largest, never add them up.
                this.val.set(Long.MIN_VALUE);
                while (it.hasNext()) {
                    this.val.set(Math.max(this.val.get(), ((LongWritable) it.next()).get()));
                }
            } else {
                this.val.set(0L);
                while (it.hasNext()) {
                    this.val.set(this.val.get() + ((LongWritable) it.next()).get());
                }
            }
            outputCollector.collect(writableComparable, this.val);
        }
    }

    /* loaded from: input_file:org/apache/nutch/crawl/CrawlDbReader$CrawlDbStatMapper.class */
    /**
     * Mapper for the statistics job. For every CrawlDb entry it emits:
     * a count of one under {@code "T"}, a count of one under
     * {@code "status <n>"}, a count of one under {@code "retry <n>"}, and the
     * entry's score multiplied by 1000 (truncated to a long) under {@code "s"}.
     */
    public static class CrawlDbStatMapper implements Mapper {
        /** Shared "one" value used for all counting records. */
        LongWritable COUNT_1 = new LongWritable(1);

        public void configure(JobConf jobConf) {
        }

        public void close() {
        }

        /**
         * Emits the four statistics records for a single entry; the input key
         * is not used.
         */
        public void map(WritableComparable writableComparable, Writable writable, OutputCollector outputCollector, Reporter reporter) throws IOException {
            final CrawlDatum datum = (CrawlDatum) writable;

            final Text totalKey = new Text("T");
            outputCollector.collect(totalKey, this.COUNT_1);

            final int status = (int) datum.getStatus();
            outputCollector.collect(new Text("status " + status), this.COUNT_1);

            final int retries = (int) datum.getRetriesSinceFetch();
            outputCollector.collect(new Text("retry " + retries), this.COUNT_1);

            // Scores travel as fixed-point longs (thousandths).
            final long scaledScore = (long) (datum.getScore() * 1000.0d);
            outputCollector.collect(new Text("s"), new LongWritable(scaledScore));
        }
    }

    /* loaded from: input_file:org/apache/nutch/crawl/CrawlDbReader$CrawlDbStatReducer.class */
    /**
     * Reducer for the statistics job.
     *
     * <ul>
     *   <li>{@code "T"}, {@code "status..."}, {@code "retry..."} and {@code "sct"}
     *       keys are summed.</li>
     *   <li>{@code "scx"} keeps the maximum, {@code "scn"} the minimum.</li>
     *   <li>Raw {@code "s"} records (scaled scores that reached the reducer
     *       without passing through the combiner) are folded into
     *       {@code "scn"}, {@code "scx"} and {@code "sct"} records instead of
     *       being discarded.</li>
     *   <li>Any other key is ignored.</li>
     * </ul>
     */
    public static class CrawlDbStatReducer implements Reducer {
        public void configure(JobConf jobConf) {
        }

        public void close() {
        }

        /**
         * Folds all values of one key as described on the class.
         *
         * @throws IOException if the collector fails to accept a record
         */
        public void reduce(WritableComparable writableComparable, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            String name = ((Text) writableComparable).toString();

            if (name.equals("s")) {
                // Uncombined scores: produce the same three records the combiner would.
                long total = 0;
                long min = Long.MAX_VALUE;
                long max = Long.MIN_VALUE;
                while (it.hasNext()) {
                    long score = ((LongWritable) it.next()).get();
                    if (score < min) {
                        min = score;
                    }
                    if (score > max) {
                        max = score;
                    }
                    total += score;
                }
                outputCollector.collect(new Text("scn"), new LongWritable(min));
                outputCollector.collect(new Text("scx"), new LongWritable(max));
                outputCollector.collect(new Text("sct"), new LongWritable(total));
                return;
            }

            if (name.equals("scx")) {
                LongWritable max = new LongWritable(Long.MIN_VALUE);
                while (it.hasNext()) {
                    long candidate = ((LongWritable) it.next()).get();
                    if (max.get() < candidate) {
                        max.set(candidate);
                    }
                }
                outputCollector.collect(writableComparable, max);
                return;
            }

            if (name.equals("scn")) {
                LongWritable min = new LongWritable(Long.MAX_VALUE);
                while (it.hasNext()) {
                    long candidate = ((LongWritable) it.next()).get();
                    if (min.get() > candidate) {
                        min.set(candidate);
                    }
                }
                outputCollector.collect(writableComparable, min);
                return;
            }

            boolean summed = name.equals("T")
                    || name.startsWith("status")
                    || name.startsWith("retry")
                    || name.equals("sct");
            if (summed) {
                long total = 0;
                while (it.hasNext()) {
                    total += ((LongWritable) it.next()).get();
                }
                outputCollector.collect(writableComparable, new LongWritable(total));
            }
            // Unknown keys produce no output.
        }
    }

    /* loaded from: input_file:org/apache/nutch/crawl/CrawlDbReader$CrawlDbTopNMapper.class */
    /**
     * Mapper for the first topN job: swaps key and value so that entries are
     * keyed by their negated score, and skips entries whose score is below
     * the configured minimum.
     */
    public static class CrawlDbTopNMapper implements Mapper {
        /** Reused output key holding the negated score. */
        private static final FloatWritable fw = new FloatWritable();
        /** Entries with a score below this value are skipped. */
        private float min = 0.0f;

        /**
         * Reads {@code CrawlDbReader.topN.min}, a long holding the minimum
         * score multiplied by 1,000,000; zero leaves the default of 0.
         */
        public void configure(JobConf jobConf) {
            final long scaledMin = jobConf.getLong("CrawlDbReader.topN.min", 0L);
            if (scaledMin == 0) {
                return;
            }
            this.min = ((float) scaledMin) / 1000000.0f;
        }

        public void close() {
        }

        /**
         * Emits (negated score, original key) for every entry that is not
         * below the minimum score.
         */
        public void map(WritableComparable writableComparable, Writable writable, OutputCollector outputCollector, Reporter reporter) throws IOException {
            final float score = ((CrawlDatum) writable).getScore();
            // Written as a negated "<" so that a NaN score is still emitted.
            if (!(score < this.min)) {
                fw.set(-score);
                outputCollector.collect(fw, writableComparable);
            }
        }
    }

    /* loaded from: input_file:org/apache/nutch/crawl/CrawlDbReader$CrawlDbTopNReducer.class */
    /**
     * Reducer for the second topN job. Keys arrive as negated scores; the
     * reducer restores the sign and emits (score, value) pairs until its
     * per-reducer quota is reached.
     */
    public static class CrawlDbTopNReducer implements Reducer {
        /** Maximum number of records this reducer may emit. */
        private long topN;
        /** Number of records emitted so far, across all keys. */
        private long count = 0;

        /**
         * Emits the values of one key with the score's sign restored.
         *
         * <p>The sign is flipped exactly once per key, into a separate
         * writable. Negating the incoming key inside the value loop would
         * flip the sign again for every further value that shares the score.
         *
         * @throws IOException if the collector fails to accept a record
         */
        public void reduce(WritableComparable writableComparable, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            if (this.count >= this.topN || !it.hasNext()) {
                return;
            }
            FloatWritable score = new FloatWritable(-((FloatWritable) writableComparable).get());
            while (it.hasNext() && this.count < this.topN) {
                outputCollector.collect(score, (Writable) it.next());
                this.count++;
            }
        }

        /**
         * Splits {@code CrawlDbReader.topN} (default 100) evenly between the
         * reduce tasks; the divisor is clamped to 1 to avoid dividing by zero.
         */
        public void configure(JobConf jobConf) {
            int reducers = Math.max(1, jobConf.getNumReduceTasks());
            this.topN = jobConf.getLong("CrawlDbReader.topN", 100L) / reducers;
        }

        public void close() {
        }
    }

    /**
     * Opens the MapFile readers for the {@code current} sub-directory of the
     * given CrawlDb, unless readers are already held by this instance.
     *
     * @param str           CrawlDb directory
     * @param configuration configuration used to obtain the file system
     * @throws IOException if the file system or the readers cannot be opened
     */
    private void openReaders(String str, Configuration configuration) throws IOException {
        if (this.readers == null) {
            FileSystem fs = FileSystem.get(configuration);
            Path current = new Path(str, "current");
            this.readers = MapFileOutputFormat.getReaders(fs, current, configuration);
        }
    }

    /**
     * Closes every open reader on a best-effort basis and forgets them.
     *
     * <p>A failure to close one reader is logged and does not stop the
     * remaining readers from being closed. The {@code readers} field is reset
     * to {@code null} afterwards so that a later lookup reopens the readers
     * instead of using closed ones.
     */
    private void closeReaders() {
        if (this.readers == null) {
            return;
        }
        for (MapFile.Reader reader : this.readers) {
            if (reader == null) {
                continue;
            }
            try {
                reader.close();
            } catch (Exception e) {
                LOG.warn("Failed to close CrawlDb reader", e);
            }
        }
        this.readers = null;
    }

    /**
     * Releases the resources held by this reader by closing any open
     * MapFile readers.
     */
    public void close() {
        this.closeReaders();
    }

    /**
     * Runs a MapReduce job that computes CrawlDb statistics and logs them at
     * INFO level: total URL count, per-status and per-retry counts, and the
     * minimum, maximum and average score.
     *
     * <p>The job writes to a temporary {@code stat_tmp<millis>} folder inside
     * the CrawlDb directory. That folder is removed afterwards, including
     * when the job or the read-back fails.
     *
     * @param str           CrawlDb directory
     * @param configuration configuration used for the job and the file system
     * @throws IOException if the job fails, the results cannot be read, or the
     *                     temporary folder cannot be removed after a successful run
     */
    public void processStatJob(String str, Configuration configuration) throws IOException {
        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb statistics start: " + str);
        }
        Path tmpFolder = new Path(str, "stat_tmp" + System.currentTimeMillis());

        NutchJob job = new NutchJob(configuration);
        job.setJobName("stats " + str);
        job.addInputPath(new Path(str, "current"));
        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(CrawlDbStatMapper.class);
        job.setCombinerClass(CrawlDbStatCombiner.class);
        job.setReducerClass(CrawlDbStatReducer.class);
        job.setOutputPath(tmpFolder);
        job.setOutputFormat(SequenceFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);

        FileSystem fileSystem = FileSystem.get(configuration);
        boolean succeeded = false;
        try {
            JobClient.runJob(job);

            // Merge the records of all output partitions into one sorted map.
            TreeMap<String, LongWritable> stats = new TreeMap<String, LongWritable>();
            SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(configuration, tmpFolder);
            try {
                Text key = new Text();
                LongWritable value = new LongWritable();
                for (SequenceFile.Reader reader : readers) {
                    while (reader.next(key, value)) {
                        String name = key.toString();
                        boolean isMax = name.equals("scx");
                        boolean isMin = name.equals("scn");
                        LongWritable merged = stats.get(name);
                        if (merged == null) {
                            merged = new LongWritable();
                            if (isMax) {
                                merged.set(Long.MIN_VALUE);
                            }
                            if (isMin) {
                                merged.set(Long.MAX_VALUE);
                            }
                            stats.put(name, merged);
                        }
                        if (isMax) {
                            if (merged.get() < value.get()) {
                                merged.set(value.get());
                            }
                        } else if (isMin) {
                            if (merged.get() > value.get()) {
                                merged.set(value.get());
                            }
                        } else {
                            merged.set(merged.get() + value.get());
                        }
                    }
                }
            } finally {
                // Close every reader, even when reading one of them failed.
                for (SequenceFile.Reader reader : readers) {
                    try {
                        reader.close();
                    } catch (IOException e) {
                        LOG.warn("Failed to close statistics reader", e);
                    }
                }
            }

            if (LOG.isInfoEnabled()) {
                LOG.info("Statistics for CrawlDb: " + str);
                // "T" is missing when the CrawlDb holds no entries at all.
                LongWritable totalEntry = stats.remove("T");
                long total = (totalEntry == null) ? 0L : totalEntry.get();
                LOG.info("TOTAL urls:\t" + total);
                for (String name : stats.keySet()) {
                    LongWritable value = stats.get(name);
                    if (name.equals("scn")) {
                        LOG.info("min score:\t" + (((float) value.get()) / 1000.0f));
                    } else if (name.equals("scx")) {
                        LOG.info("max score:\t" + (((float) value.get()) / 1000.0f));
                    } else if (name.equals("sct")) {
                        // Divide in floating point so the average is not truncated.
                        double average = (total == 0L) ? 0.0d : ((double) value.get()) / total / 1000.0d;
                        LOG.info("avg score:\t" + ((float) average));
                    } else if (name.startsWith("status")) {
                        int code = Integer.parseInt(name.substring(name.indexOf(' ') + 1));
                        LOG.info(name + " (" + CrawlDatum.getStatusName((byte) code) + "):\t" + value);
                    } else {
                        LOG.info(name + ":\t" + value);
                    }
                }
            }
            succeeded = true;
        } finally {
            if (succeeded) {
                fileSystem.delete(tmpFolder);
            } else {
                // Do not let a cleanup failure hide the original exception.
                try {
                    fileSystem.delete(tmpFolder);
                } catch (IOException e) {
                    LOG.warn("Failed to remove temporary folder " + tmpFolder, e);
                }
            }
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb statistics: done");
        }
    }

    /**
     * Looks up a single URL in the CrawlDb, opening the readers first if
     * they are not open yet.
     *
     * @param str           CrawlDb directory
     * @param str2          URL to look up
     * @param configuration configuration used to open the readers
     * @return whatever {@code MapFileOutputFormat.getEntry} returns for the URL
     * @throws IOException if the readers cannot be opened or read
     */
    public CrawlDatum get(String str, String str2, Configuration configuration) throws IOException {
        final Text urlKey = new Text(str2);
        final CrawlDatum holder = new CrawlDatum();
        this.openReaders(str, configuration);
        final HashPartitioner partitioner = new HashPartitioner();
        return (CrawlDatum) MapFileOutputFormat.getEntry(this.readers, partitioner, urlKey, holder);
    }

    /**
     * Prints the CrawlDb entry for a URL to {@code System.out}, or
     * {@code not found} when the lookup yields {@code null}.
     *
     * @param str           CrawlDb directory
     * @param str2          URL to look up
     * @param configuration configuration used for the lookup
     * @throws IOException if the lookup fails
     */
    public void readUrl(String str, String str2, Configuration configuration) throws IOException {
        final CrawlDatum entry = get(str, str2, configuration);
        System.out.println("URL: " + str2);
        if (entry == null) {
            System.out.println("not found");
            return;
        }
        System.out.println(entry);
    }

    /**
     * Runs a job that writes the {@code current} part of the CrawlDb as text
     * into the given output directory.
     *
     * @param str           CrawlDb directory
     * @param str2          output directory for the text dump
     * @param configuration configuration used for the job
     * @throws IOException if the job fails
     */
    public void processDumpJob(String str, String str2, Configuration configuration) throws IOException {
        final boolean info = LOG.isInfoEnabled();
        if (info) {
            LOG.info("CrawlDb dump: starting");
            LOG.info("CrawlDb db: " + str);
        }

        final Path outFolder = new Path(str2);
        final NutchJob dumpJob = new NutchJob(configuration);
        dumpJob.setJobName("dump " + str);

        // Input: the current CrawlDb as sequence files.
        final Path current = new Path(str, "current");
        dumpJob.addInputPath(current);
        dumpJob.setInputFormat(SequenceFileInputFormat.class);

        // Output: plain text records of (Text, CrawlDatum).
        dumpJob.setOutputPath(outFolder);
        dumpJob.setOutputFormat(TextOutputFormat.class);
        dumpJob.setOutputKeyClass(Text.class);
        dumpJob.setOutputValueClass(CrawlDatum.class);

        JobClient.runJob(dumpJob);

        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb dump: done");
        }
    }

    /**
     * Dumps the top-scoring URLs of the CrawlDb as text.
     *
     * <p>Two jobs are run. The first re-keys the entries by negated score
     * (dropping entries below {@code f}) into a temporary directory under
     * {@code mapred.temp.dir}. The second reads that directory with a single
     * reduce task and writes at most {@code j} records to {@code str2}.
     * The temporary directory is removed afterwards, including when one of
     * the jobs fails.
     *
     * @param str           CrawlDb directory
     * @param j             number of top entries to write
     * @param f             minimum score; entries below it are skipped
     * @param str2          output directory
     * @param configuration configuration used for the jobs and the file system
     * @throws IOException if a job fails, or the temporary directory cannot be
     *                     removed after a successful run
     */
    public void processTopNJob(String str, long j, float f, String str2, Configuration configuration) throws IOException {
        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb topN: starting (topN=" + j + ", min=" + f + ")");
            LOG.info("CrawlDb db: " + str);
        }
        Path outFolder = new Path(str2);
        Path tempDir = new Path(configuration.get("mapred.temp.dir", ".") + "/readdb-topN-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));
        FileSystem fileSystem = FileSystem.get(configuration);

        boolean succeeded = false;
        try {
            NutchJob prepareJob = new NutchJob(configuration);
            prepareJob.setJobName("topN prepare " + str);
            prepareJob.addInputPath(new Path(str, "current"));
            prepareJob.setInputFormat(SequenceFileInputFormat.class);
            prepareJob.setMapperClass(CrawlDbTopNMapper.class);
            prepareJob.setReducerClass(IdentityReducer.class);
            prepareJob.setOutputPath(tempDir);
            prepareJob.setOutputFormat(SequenceFileOutputFormat.class);
            prepareJob.setOutputKeyClass(FloatWritable.class);
            prepareJob.setOutputValueClass(Text.class);
            // The minimum score is passed to the mapper as a long in millionths.
            prepareJob.setLong("CrawlDbReader.topN.min", Math.round(1000000.0d * f));
            JobClient.runJob(prepareJob);

            if (LOG.isInfoEnabled()) {
                LOG.info("CrawlDb topN: collecting topN scores.");
            }
            NutchJob collectJob = new NutchJob(configuration);
            collectJob.setJobName("topN collect " + str);
            collectJob.setLong("CrawlDbReader.topN", j);
            collectJob.addInputPath(tempDir);
            collectJob.setInputFormat(SequenceFileInputFormat.class);
            collectJob.setMapperClass(IdentityMapper.class);
            collectJob.setReducerClass(CrawlDbTopNReducer.class);
            collectJob.setOutputPath(outFolder);
            collectJob.setOutputFormat(TextOutputFormat.class);
            collectJob.setOutputKeyClass(FloatWritable.class);
            collectJob.setOutputValueClass(Text.class);
            // A single reducer sees all keys in order, so it can cut off at topN.
            collectJob.setNumReduceTasks(1);
            JobClient.runJob(collectJob);
            succeeded = true;
        } finally {
            if (succeeded) {
                fileSystem.delete(tempDir);
            } else {
                // Do not let a cleanup failure hide the original exception.
                try {
                    fileSystem.delete(tempDir);
                } catch (IOException e) {
                    LOG.warn("Failed to remove temporary directory " + tempDir, e);
                }
            }
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("CrawlDb topN: done");
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>The first argument is the CrawlDb directory; it is followed by one or
     * more commands ({@code -stats}, {@code -dump <out_dir>},
     * {@code -topN <nnnn> <out_dir> [<min>]}, {@code -url <url>}) that are
     * executed in order. When a command lacks a required argument, an error
     * and the usage text are printed and processing stops. An unknown
     * argument is reported and skipped.
     *
     * @param strArr command-line arguments
     * @throws IOException if one of the commands fails
     */
    public static void main(String[] strArr) throws IOException {
        // A crawldb without any command would do nothing, so show the usage too.
        if (strArr.length < 2) {
            printUsage();
            return;
        }
        String crawlDb = strArr[0];
        Configuration conf = NutchConfiguration.create();
        CrawlDbReader dbReader = new CrawlDbReader();
        try {
            int i = 1;
            while (i < strArr.length) {
                String option = strArr[i];
                if (option.equals("-stats")) {
                    dbReader.processStatJob(crawlDb, conf);
                } else if (option.equals("-dump")) {
                    if (i + 1 >= strArr.length) {
                        System.err.println("\nError: missing <out_dir> for -dump");
                        printUsage();
                        return;
                    }
                    i++;
                    dbReader.processDumpJob(crawlDb, strArr[i], conf);
                } else if (option.equals("-url")) {
                    if (i + 1 >= strArr.length) {
                        System.err.println("\nError: missing <url> for -url");
                        printUsage();
                        return;
                    }
                    i++;
                    dbReader.readUrl(crawlDb, strArr[i], conf);
                } else if (option.equals("-topN")) {
                    if (i + 2 >= strArr.length) {
                        System.err.println("\nError: -topN requires <nnnn> and <out_dir>");
                        printUsage();
                        return;
                    }
                    long topN;
                    try {
                        topN = Long.parseLong(strArr[i + 1]);
                    } catch (NumberFormatException e) {
                        System.err.println("\nError: invalid <nnnn> for -topN: " + strArr[i + 1]);
                        printUsage();
                        return;
                    }
                    String outDir = strArr[i + 2];
                    i += 2;
                    float min = 0.0f;
                    if (i + 1 < strArr.length) {
                        try {
                            min = Float.parseFloat(strArr[i + 1]);
                            i++;
                        } catch (NumberFormatException notANumber) {
                            // The next argument is not <min>; leave it for the next iteration.
                        }
                    }
                    dbReader.processTopNJob(crawlDb, topN, min, outDir, conf);
                } else {
                    System.err.println("\nError: wrong argument " + option);
                }
                i++;
            }
        } finally {
            dbReader.close();
        }
    }

    /** Prints the command-line usage text to {@code System.err}. */
    private static void printUsage() {
        System.err.println("Usage: CrawlDbReader <crawldb> (-stats | -dump <out_dir> | -topN <nnnn> <out_dir> [<min>] | -url <url>)");
        System.err.println("\t<crawldb>\tdirectory name where crawldb is located");
        System.err.println("\t-stats\tprint overall statistics to System.out");
        System.err.println("\t-dump <out_dir>\tdump the whole db to a text file in <out_dir>");
        System.err.println("\t-url <url>\tprint information on <url> to System.out");
        System.err.println("\t-topN <nnnn> <out_dir> [<min>]\tdump top <nnnn> urls sorted by score to <out_dir>");
        System.err.println("\t\t[<min>]\tskip records with scores below this value.");
        System.err.println("\t\t\tThis can significantly improve performance.");
    }
}
