package org.apache.nutch.crawl;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.InetAddress;
import java.net.URL;
import java.net.UnknownHostException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Random;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.MapReduceBase;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Partitioner;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolBase;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilterException;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/* loaded from: input_file:org/apache/nutch/crawl/Generator.class */
public class Generator extends ToolBase {
    /** Job property read by {@link Selector}: whether URLs are run through the URL filters during selection (default {@code true}). */
    public static final String CRAWL_GENERATE_FILTER = "crawl.generate.filter";
    /** Job property read by {@link Selector}: when {@code true}, host names are resolved to IP addresses before per-host counting (default {@code false}). */
    public static final String GENERATE_MAX_PER_HOST_BY_IP = "generate.max.per.host.by.ip";
    /** Job property read by {@link Selector}: per-host URL cap; per-host counting only happens when the value is greater than zero (default {@code -1}). */
    public static final String GENERATE_MAX_PER_HOST = "generate.max.per.host";
    /** Property checked by {@code generate}: when {@code true}, an additional "updatedb" job is run after partitioning (default {@code false}). */
    public static final String GENERATE_UPDATE_CRAWLDB = "generate.update.crawldb";
    /** Job property holding the requested number of URLs; {@link Selector} divides it by the number of reduce tasks to get its own limit. */
    public static final String CRAWL_TOP_N = "crawl.topN";
    /** Job property holding the reference time (milliseconds) that {@link Selector} compares fetch times against. */
    public static final String CRAWL_GEN_CUR_TIME = "crawl.gen.curTime";
    /** Job property holding a delay in days (default 7); {@link Selector} converts it to milliseconds. */
    public static final String CRAWL_GEN_DELAY = "crawl.gen.delay";
    /** Logger shared by this class and its nested classes. */
    public static final Log LOG = LogFactory.getLog(Generator.class);
    /** Timestamp format for segment names; in this file it is only used from the synchronized generateSegmentName(). */
    private static SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMddHHmmss");

    /* loaded from: input_file:org/apache/nutch/crawl/Generator$CrawlDbUpdater.class */
    /**
     * Mapper and reducer of the "generate: updatedb" job.
     *
     * <p>The mapper re-keys records so that everything is keyed by URL; the
     * reducer picks one datum per URL and, when a value carries the generate
     * time of the current run, copies that time stamp onto the emitted datum.
     */
    public static class CrawlDbUpdater extends MapReduceBase implements Mapper, Reducer {
        /** Generate time of the current run, read from the job configuration (0 when absent). */
        long generateTime;

        /** Reads the generate time of the current run from the job. */
        public void configure(JobConf jobConf) {
            final long configured = jobConf.getLong(Nutch.GENERATE_TIME_KEY, 0L);
            this.generateTime = configured;
        }

        /**
         * Emits (url, datum). Records keyed by a {@code FloatWritable} carry a
         * {@link SelectorEntry} value and are unpacked; all other records are
         * passed through unchanged.
         */
        public void map(WritableComparable writableComparable, Writable writable, OutputCollector outputCollector, Reporter reporter) throws IOException {
            if (writableComparable instanceof FloatWritable) {
                final SelectorEntry selected = (SelectorEntry) writable;
                outputCollector.collect(selected.url, selected.datum);
                return;
            }
            outputCollector.collect(writableComparable, writable);
        }

        /**
         * Emits a single datum for the key. A value without a generate-time
         * stamp, or with a stamp different from this run's, becomes the datum
         * to emit; a stamp equal to this run's is remembered and written into
         * the emitted datum's metadata.
         */
        public void reduce(WritableComparable writableComparable, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            CrawlDatum kept = null;
            LongWritable stamp = null;
            while (it.hasNext()) {
                final CrawlDatum candidate = (CrawlDatum) it.next();
                if (!candidate.getMetaData().containsKey(Nutch.WRITABLE_GENERATE_TIME_KEY)) {
                    // No stamp at all: this is the datum to carry forward.
                    kept = candidate;
                    continue;
                }
                stamp = (LongWritable) candidate.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY);
                if (stamp.get() != this.generateTime) {
                    // Stamp from a different run: keep the datum, forget the stamp.
                    kept = candidate;
                    stamp = null;
                }
            }
            if (stamp != null) {
                kept.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, stamp);
            }
            outputCollector.collect(writableComparable, kept);
        }
    }

    /* loaded from: input_file:org/apache/nutch/crawl/Generator$DecreasingFloatComparator.class */
    /**
     * Raw comparator that reverses the ordering of {@code FloatWritable.Comparator}
     * by handing the two serialized operands to the parent in swapped order.
     */
    public static class DecreasingFloatComparator extends FloatWritable.Comparator {
        /** Compares the second byte range against the first, inverting the parent's order. */
        public int compare(byte[] bArr, int i, int i2, byte[] bArr2, int i3, int i4) {
            final int reversed = super.compare(bArr2, i3, i4, bArr, i, i2);
            return reversed;
        }
    }

    /* loaded from: input_file:org/apache/nutch/crawl/Generator$HashComparator.class */
    /**
     * Comparator over {@code Text} keys that orders them by a hash of their
     * bytes instead of by their content.
     */
    public static class HashComparator extends WritableComparator {
        public HashComparator() {
            super(Text.class);
        }

        /** Orders two {@code Text} objects by the hash of their content bytes. */
        public int compare(WritableComparable writableComparable, WritableComparable writableComparable2) {
            final Text left = (Text) writableComparable;
            final Text right = (Text) writableComparable2;
            return order(hash(left.getBytes(), 0, left.getLength()),
                         hash(right.getBytes(), 0, right.getLength()));
        }

        /** Orders two byte ranges by the hash of the given ranges. */
        public int compare(byte[] bArr, int i, int i2, byte[] bArr2, int i3, int i4) {
            return order(hash(bArr, i, i2), hash(bArr2, i3, i4));
        }

        /** Three-way comparison of two hash values: -1, 0 or 1. */
        private static int order(int first, int second) {
            if (first == second) {
                return 0;
            }
            return first < second ? -1 : 1;
        }

        /**
         * Polynomial (base 31) hash seeded with 1 over {@code i2} bytes starting
         * at offset {@code i}, consuming the bytes from the last one to the first.
         */
        private static int hash(byte[] bArr, int i, int i2) {
            int result = 1;
            int pos = i + i2;
            while (pos > i) {
                pos--;
                result = result * 31 + bArr[pos];
            }
            return result;
        }
    }

    /* loaded from: input_file:org/apache/nutch/crawl/Generator$Selector.class */
    /**
     * Mapper, partitioner and reducer of the "generate: select" job.
     *
     * <p>The mapper emits (sort value, {@link SelectorEntry}) for every crawl
     * db entry that is eligible for fetching; the partitioner delegates to
     * {@code PartitionUrlByHost} using the entry's URL; the reducer passes
     * entries through until its share of topN is reached, optionally
     * enforcing a per-host (or per-IP) cap.
     */
    public static class Selector implements Mapper, Partitioner, Reducer {
        private long curTime;
        private long limit;
        private long count;
        private int maxPerHost;
        private URLFilters filters;
        private URLNormalizers normalizers;
        private ScoringFilters scfilters;
        private boolean byIP;
        private boolean filter;
        private long genDelay;
        private boolean runUpdatedb;
        private LongWritable genTime = new LongWritable(System.currentTimeMillis());
        /** Number of URLs seen so far per (normalized) host, for the per-host cap. */
        private HashMap<String, IntWritable> hostCounts = new HashMap<String, IntWritable>();
        private Partitioner hostPartitioner = new PartitionUrlByHost();
        private SelectorEntry entry = new SelectorEntry();
        private FloatWritable sortValue = new FloatWritable();
        private long dnsFailure = 0;

        /** Reads all selection parameters from the job configuration. */
        public void configure(JobConf jobConf) {
            this.curTime = jobConf.getLong(Generator.CRAWL_GEN_CUR_TIME, System.currentTimeMillis());
            // Each reducer only takes its share of the requested topN.
            this.limit = jobConf.getLong(Generator.CRAWL_TOP_N, Long.MAX_VALUE) / jobConf.getNumReduceTasks();
            this.maxPerHost = jobConf.getInt(Generator.GENERATE_MAX_PER_HOST, -1);
            this.byIP = jobConf.getBoolean(Generator.GENERATE_MAX_PER_HOST_BY_IP, false);
            this.filters = new URLFilters(jobConf);
            this.normalizers = new URLNormalizers(jobConf, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
            this.scfilters = new ScoringFilters(jobConf);
            this.hostPartitioner.configure(jobConf);
            this.filter = jobConf.getBoolean(Generator.CRAWL_GENERATE_FILTER, true);
            // Configured in days, used in milliseconds.
            this.genDelay = jobConf.getLong(Generator.CRAWL_GEN_DELAY, 7L) * 3600 * 24 * 1000;
            long configuredGenTime = jobConf.getLong(Nutch.GENERATE_TIME_KEY, 0L);
            if (configuredGenTime > 0) {
                this.genTime.set(configuredGenTime);
            }
            this.runUpdatedb = jobConf.getBoolean(Generator.GENERATE_UPDATE_CRAWLDB, false);
        }

        public void close() {
        }

        /**
         * Emits (sort value, entry) for a crawl db record unless it is rejected
         * by the URL filters, has status 3 or 5, is not yet due, or was stamped
         * by a previous generate run less than {@code genDelay} ago.
         */
        public void map(WritableComparable writableComparable, Writable writable, OutputCollector outputCollector, Reporter reporter) throws IOException {
            Text url = (Text) writableComparable;
            if (this.filter) {
                try {
                    if (this.filters.filter(url.toString()) == null) {
                        return;
                    }
                } catch (URLFilterException e) {
                    // A filter failure is logged and the URL is still considered.
                    if (Generator.LOG.isWarnEnabled()) {
                        Generator.LOG.warn("Couldn't filter url: " + url + " (" + e.getMessage() + ")");
                    }
                }
            }
            CrawlDatum crawlDatum = (CrawlDatum) writable;
            // NOTE(review): 3 and 5 are presumably CrawlDatum's "gone" and
            // "permanent redirect" db statuses - confirm against CrawlDatum.
            if (crawlDatum.getStatus() == 3 || crawlDatum.getStatus() == 5) {
                return;
            }
            if (crawlDatum.getFetchTime() > this.curTime) {
                return;
            }
            LongWritable oldGenTime = (LongWritable) crawlDatum.getMetaData().get(Nutch.WRITABLE_GENERATE_TIME_KEY);
            if (oldGenTime != null && oldGenTime.get() + this.genDelay > this.curTime) {
                return;
            }
            float sort = 1.0f;
            try {
                sort = this.scfilters.generatorSortValue(url, crawlDatum, 1.0f);
            } catch (ScoringFilterException e) {
                if (Generator.LOG.isWarnEnabled()) {
                    Generator.LOG.warn("Couldn't filter generatorSortValue for " + writableComparable + ": " + e);
                }
            }
            this.sortValue.set(sort);
            // Stamp the datum with this run's generate time.
            crawlDatum.getMetaData().put(Nutch.WRITABLE_GENERATE_TIME_KEY, this.genTime);
            this.entry.datum = crawlDatum;
            this.entry.url = url;
            outputCollector.collect(this.sortValue, this.entry);
        }

        /** Partitions by the URL carried in the value, delegating to the host partitioner. */
        public int getPartition(WritableComparable writableComparable, Writable writable, int i) {
            return this.hostPartitioner.getPartition(((SelectorEntry) writable).url, writableComparable, i);
        }

        /**
         * Passes entries through until this reducer's limit is reached. When
         * {@code maxPerHost} is positive, entries that fail DNS resolution (in
         * by-IP mode), cannot be normalized, or exceed the per-host cap are
         * skipped and do not count towards the limit.
         */
        public void reduce(WritableComparable writableComparable, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
            while (it.hasNext() && this.count < this.limit) {
                SelectorEntry selectorEntry = (SelectorEntry) it.next();
                if (this.maxPerHost > 0 && !acceptForHost(selectorEntry.url)) {
                    // Previously the "skipping" cases were only logged and the
                    // entry was emitted anyway, so the cap was never enforced.
                    continue;
                }
                outputCollector.collect(writableComparable, selectorEntry);
                // Only URLs that are actually kept count towards the limit.
                this.count++;
            }
        }

        /**
         * Updates the per-host counter for the URL and decides whether the URL
         * may be emitted.
         *
         * @return {@code false} when the URL must be skipped (DNS failure,
         *         normalization failure, or host over the cap); {@code true}
         *         otherwise, including when the URL has a null host, which is
         *         not counted
         * @throws IOException if the URL text itself cannot be parsed
         */
        private boolean acceptForHost(Text urlText) throws IOException {
            URL url = new URL(urlText.toString());
            String host = url.getHost();
            if (host == null) {
                return true;
            }
            host = host.toLowerCase();
            if (this.byIP) {
                try {
                    host = InetAddress.getByName(host).getHostAddress();
                } catch (UnknownHostException e) {
                    if (Generator.LOG.isDebugEnabled()) {
                        Generator.LOG.debug("DNS lookup failed: " + host + ", skipping.");
                    }
                    this.dnsFailure++;
                    if (this.dnsFailure % 1000 == 0 && Generator.LOG.isWarnEnabled()) {
                        Generator.LOG.warn("DNS failures: " + this.dnsFailure);
                    }
                    return false;
                }
            }
            String urlString = new URL(url.getProtocol(), host, url.getPort(), url.getFile()).toString();
            try {
                urlString = this.normalizers.normalize(urlString, URLNormalizers.SCOPE_GENERATE_HOST_COUNT);
                host = new URL(urlString).getHost();
            } catch (Exception e) {
                Generator.LOG.warn("Malformed URL: '" + urlString + "', skipping (" + StringUtils.stringifyException(e) + ")");
                return false;
            }
            IntWritable hostCount = this.hostCounts.get(host);
            if (hostCount == null) {
                hostCount = new IntWritable();
                this.hostCounts.put(host, hostCount);
            }
            hostCount.set(hostCount.get() + 1);
            if (hostCount.get() > this.maxPerHost) {
                // Log only once per host, the first time the cap is exceeded.
                if (hostCount.get() == this.maxPerHost + 1 && Generator.LOG.isInfoEnabled()) {
                    Generator.LOG.info("Host " + host + " has more than " + this.maxPerHost + " URLs. Skipping additional.");
                }
                return false;
            }
            return true;
        }
    }

    /* loaded from: input_file:org/apache/nutch/crawl/Generator$SelectorEntry.class */
    /**
     * Value type of the selection job: a URL together with its crawl datum.
     * The serialized form is the URL followed by the datum.
     */
    public static class SelectorEntry implements Writable {
        public Text url = new Text();
        public CrawlDatum datum = new CrawlDatum();

        /** Reads the URL, then the datum, into the existing field objects. */
        public void readFields(DataInput dataInput) throws IOException {
            final Text target = this.url;
            target.readFields(dataInput);
            final CrawlDatum payload = this.datum;
            payload.readFields(dataInput);
        }

        /** Writes the URL, then the datum. */
        public void write(DataOutput dataOutput) throws IOException {
            final Text source = this.url;
            source.write(dataOutput);
            final CrawlDatum payload = this.datum;
            payload.write(dataOutput);
        }

        /** Returns {@code "url=<url>, datum=<datum>"}. */
        public String toString() {
            final StringBuilder text = new StringBuilder();
            text.append("url=").append(this.url.toString());
            text.append(", datum=").append(this.datum.toString());
            return text.toString();
        }
    }

    /* loaded from: input_file:org/apache/nutch/crawl/Generator$SelectorInverseMapper.class */
    /**
     * Mapper of the partitioning job: unpacks each {@link SelectorEntry}
     * value and emits it as (url, datum), discarding the incoming key.
     */
    public static class SelectorInverseMapper extends MapReduceBase implements Mapper {
        public void map(WritableComparable writableComparable, Writable writable, OutputCollector outputCollector, Reporter reporter) throws IOException {
            final SelectorEntry selected = (SelectorEntry) writable;
            final Text url = selected.url;
            final CrawlDatum datum = selected.datum;
            outputCollector.collect(url, datum);
        }
    }

    /** Creates a generator without setting a configuration. */
    public Generator() {
        super();
    }

    /** Creates a generator and installs the given configuration via {@code setConf}. */
    public Generator(Configuration configuration) {
        super();
        this.setConf(configuration);
    }

    /**
     * Convenience overload: generates a segment with default settings (number
     * of partitions left at -1, no topN limit, the current time as reference
     * time, URL filtering on, no forced locking).
     *
     * @param path  crawl db directory
     * @param path2 directory under which the new segment is created
     * @return the value returned by the full {@code generate} overload
     * @throws IOException if the full {@code generate} overload fails
     */
    public Path generate(Path path, Path path2) throws IOException {
        final int defaultPartitions = -1;
        final long noTopNLimit = Long.MAX_VALUE;
        final long now = System.currentTimeMillis();
        return generate(path, path2, defaultPartitions, noTopNLimit, now, true, false);
    }

    /**
     * Generates a new fetch list segment from the crawl db.
     *
     * <p>Runs a selection job into a temporary directory, then a partitioning
     * job that writes the segment's generate directory, and, when
     * {@code generate.update.crawldb} is set, a third job whose output is
     * installed as the new crawl db. The crawl db is locked for the duration
     * and the lock is released on every exit path.
     *
     * @param path  crawl db directory
     * @param path2 directory under which the new segment is created
     * @param i     number of partitions to produce; -1 means use the job's
     *              number of map tasks (forced to 1 with a local job tracker)
     * @param j     topN limit, or {@code Long.MAX_VALUE} for no limit
     * @param j2    reference time in milliseconds passed to the selector
     * @param z     whether the selector applies URL filters
     * @param z2    passed to {@code LockUtil.createLockFile} when locking the crawl db
     * @return the path of the generated segment, or {@code null} when no
     *         record was selected
     * @throws IOException if locking, any of the jobs, or cleanup fails
     */
    public Path generate(Path path, Path path2, int i, long j, long j2, boolean z, boolean z2) throws IOException {
        Path tempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());
        Path segment = new Path(path2, generateSegmentName());
        Path output = new Path(segment, CrawlDatum.GENERATE_DIR_NAME);
        Path lock = new Path(path, ".locked");
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, z2);
        LOG.info("Generator: Selecting best-scoring urls due for fetch.");
        LOG.info("Generator: starting");
        LOG.info("Generator: segment: " + segment);
        LOG.info("Generator: filtering: " + z);
        if (j != Long.MAX_VALUE) {
            LOG.info("Generator: topN: " + j);
        }

        // Job 1: select the entries to fetch into tempDir.
        NutchJob selectJob = new NutchJob(getConf());
        selectJob.setJobName("generate: select " + segment);
        int numLists = i;
        if (numLists == -1) {
            numLists = selectJob.getNumMapTasks();
        }
        if ("local".equals(selectJob.get("mapred.job.tracker")) && numLists != 1) {
            LOG.info("Generator: jobtracker is 'local', generating exactly one partition.");
            numLists = 1;
        }
        selectJob.setLong(CRAWL_GEN_CUR_TIME, j2);
        // The generate time is the real wall-clock time of this run, independent of j2.
        long generateTime = System.currentTimeMillis();
        selectJob.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
        selectJob.setLong(CRAWL_TOP_N, j);
        selectJob.setBoolean(CRAWL_GENERATE_FILTER, z);
        selectJob.setInputPath(new Path(path, "current"));
        selectJob.setInputFormat(SequenceFileInputFormat.class);
        selectJob.setMapperClass(Selector.class);
        selectJob.setPartitionerClass(Selector.class);
        selectJob.setReducerClass(Selector.class);
        selectJob.setOutputPath(tempDir);
        selectJob.setOutputFormat(SequenceFileOutputFormat.class);
        selectJob.setOutputKeyClass(FloatWritable.class);
        selectJob.setOutputKeyComparatorClass(DecreasingFloatComparator.class);
        selectJob.setOutputValueClass(SelectorEntry.class);
        try {
            JobClient.runJob(selectJob);

            // The selection is empty only if EVERY output partition is empty;
            // looking at the first reader alone can wrongly report "0 records".
            SequenceFile.Reader[] readers = SequenceFileOutputFormat.getReaders(selectJob, tempDir);
            boolean empty = true;
            if (readers != null) {
                try {
                    for (int r = 0; empty && r < readers.length; r++) {
                        empty = !readers[r].next(new FloatWritable());
                    }
                } finally {
                    // Close the readers on every path, including the empty one.
                    for (SequenceFile.Reader reader : readers) {
                        reader.close();
                    }
                }
            }
            if (empty) {
                LOG.warn("Generator: 0 records selected for fetching, exiting ...");
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir);
                return null;
            }

            try {
                if (LOG.isInfoEnabled()) {
                    LOG.info("Generator: Partitioning selected urls by host, for politeness.");
                }
                // Job 2: invert the selection and partition it by host into the segment.
                NutchJob partitionJob = new NutchJob(getConf());
                partitionJob.setJobName("generate: partition " + segment);
                partitionJob.setInt("partition.url.by.host.seed", new Random().nextInt());
                partitionJob.setInputPath(tempDir);
                partitionJob.setInputFormat(SequenceFileInputFormat.class);
                partitionJob.setMapperClass(SelectorInverseMapper.class);
                partitionJob.setPartitionerClass(PartitionUrlByHost.class);
                partitionJob.setNumReduceTasks(numLists);
                partitionJob.setOutputPath(output);
                partitionJob.setOutputFormat(SequenceFileOutputFormat.class);
                partitionJob.setOutputKeyClass(Text.class);
                partitionJob.setOutputValueClass(CrawlDatum.class);
                partitionJob.setOutputKeyComparatorClass(HashComparator.class);
                JobClient.runJob(partitionJob);

                if (getConf().getBoolean(GENERATE_UPDATE_CRAWLDB, false)) {
                    // Job 3 (optional): merge the selection back into the crawl db.
                    Path updateTempDir = new Path(getConf().get("mapred.temp.dir", ".") + "/generate-temp-" + System.currentTimeMillis());
                    NutchJob updateJob = new NutchJob(getConf());
                    updateJob.setJobName("generate: updatedb " + path);
                    updateJob.setLong(Nutch.GENERATE_TIME_KEY, generateTime);
                    updateJob.addInputPath(tempDir);
                    updateJob.addInputPath(new Path(path, "current"));
                    updateJob.setInputFormat(SequenceFileInputFormat.class);
                    updateJob.setMapperClass(CrawlDbUpdater.class);
                    updateJob.setReducerClass(CrawlDbUpdater.class);
                    updateJob.setOutputFormat(MapFileOutputFormat.class);
                    updateJob.setOutputKeyClass(Text.class);
                    updateJob.setOutputValueClass(CrawlDatum.class);
                    updateJob.setOutputPath(updateTempDir);
                    try {
                        JobClient.runJob(updateJob);
                        CrawlDb.install(updateJob, path);
                        fs.delete(updateTempDir);
                    } catch (IOException e) {
                        // tempDir and the lock are cleaned up by the enclosing handlers.
                        fs.delete(updateTempDir);
                        throw e;
                    }
                }
                LockUtil.removeLockFile(fs, lock);
                fs.delete(tempDir);
                if (LOG.isInfoEnabled()) {
                    LOG.info("Generator: done.");
                }
                return segment;
            } catch (IOException e) {
                // The lock is released by the outer handler.
                fs.delete(tempDir);
                throw e;
            }
        } catch (IOException e) {
            LockUtil.removeLockFile(fs, lock);
            throw e;
        }
    }

    /**
     * Returns a segment name built from the current time formatted as
     * {@code yyyyMMddHHmmss}.
     *
     * <p>The method sleeps for one second before reading the clock and is
     * synchronized, so two calls that both complete their sleep cannot obtain
     * the same second-resolution name. If the sleep is interrupted, the
     * thread's interrupt status is restored and a name is still returned.
     *
     * @return the generated segment name
     */
    public static synchronized String generateSegmentName() {
        try {
            Thread.sleep(1000L);
        } catch (InterruptedException e) {
            // Do not swallow the interruption: let callers observe it.
            Thread.currentThread().interrupt();
        }
        return sdf.format(new Date(System.currentTimeMillis()));
    }

    /**
     * Command-line entry point: runs the generator through {@code doMain}
     * with a fresh Nutch configuration and exits with the returned status.
     */
    public static void main(String[] strArr) throws Exception {
        Generator tool = new Generator();
        Configuration conf = NutchConfiguration.create();
        int exitCode = tool.doMain(conf, strArr);
        System.exit(exitCode);
    }

    /**
     * Parses the command line and runs {@code generate}.
     *
     * <p>Expected arguments: {@code <crawldb> <segments_dir>} followed by any
     * of {@code -force}, {@code -topN N}, {@code -numFetchers N},
     * {@code -adddays N} and {@code -noFilter}. Unrecognized arguments are
     * ignored.
     *
     * @param strArr command-line arguments
     * @return 0 on success, -2 when no records were selected, -1 on a usage
     *         error or when generation fails
     * @throws Exception if an option value is not a valid number
     */
    public int run(String[] strArr) throws Exception {
        final String usage = "Usage: Generator <crawldb> <segments_dir> [-force] [-topN N] [-numFetchers numFetchers] [-adddays numDays] [-noFilter]";
        if (strArr.length < 2) {
            System.out.println(usage);
            return -1;
        }
        Path dbDir = new Path(strArr[0]);
        Path segmentsDir = new Path(strArr[1]);
        long curTime = System.currentTimeMillis();
        long topN = Long.MAX_VALUE;
        int numFetchers = -1;
        boolean filter = true;
        boolean force = false;
        for (int idx = 2; idx < strArr.length; idx++) {
            String option = strArr[idx];
            boolean takesValue = "-topN".equals(option)
                    || "-numFetchers".equals(option)
                    || "-adddays".equals(option);
            if (takesValue && idx + 1 >= strArr.length) {
                // Option given without its value: report usage instead of
                // failing with an ArrayIndexOutOfBoundsException.
                System.out.println(usage);
                return -1;
            }
            if ("-topN".equals(option)) {
                topN = Long.parseLong(strArr[++idx]);
            } else if ("-numFetchers".equals(option)) {
                numFetchers = Integer.parseInt(strArr[++idx]);
            } else if ("-adddays".equals(option)) {
                // Widen to long BEFORE multiplying: one day is 86,400,000 ms,
                // so int arithmetic overflows from 25 days upwards.
                long numDays = Integer.parseInt(strArr[++idx]);
                curTime += numDays * 24L * 60L * 60L * 1000L;
            } else if ("-noFilter".equals(option)) {
                filter = false;
            } else if ("-force".equals(option)) {
                force = true;
            }
        }
        try {
            Path segment = generate(dbDir, segmentsDir, numFetchers, topN, curTime, filter, force);
            return segment == null ? -2 : 0;
        } catch (Exception e) {
            LOG.fatal("Generator: " + StringUtils.stringifyException(e));
            return -1;
        }
    }
}
