package org.apache.nutch.crawl;

import java.text.SimpleDateFormat;
import java.util.Date;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.nutch.fetcher.Fetcher;
import org.apache.nutch.indexer.DeleteDuplicates;
import org.apache.nutch.indexer.IndexMerger;
import org.apache.nutch.indexer.Indexer;
import org.apache.nutch.parse.ParseSegment;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/* loaded from: input_file:org/apache/nutch/crawl/Crawl.class */
/**
 * Command-line driver that chains the individual crawl tools into one run.
 *
 * <p>In order, {@link #main(String[])} invokes {@code Injector.inject}, then up to
 * {@code depth} rounds of {@code Generator.generate} / {@code Fetcher.fetch} /
 * (optionally) {@code ParseSegment.parse} / {@code CrawlDb.update}, and finally
 * {@code LinkDb.invert}, {@code Indexer.index}, {@code DeleteDuplicates.dedup} and
 * {@code IndexMerger.merge}.
 *
 * <p>All output is written below a single crawl directory, in the sub-paths
 * {@code crawldb}, {@code linkdb}, {@code segments}, {@code indexes} and {@code index}.
 */
public class Crawl {
    public static final Log LOG = LogFactory.getLog(Crawl.class);

    /** Usage line shown when the command line is incomplete. */
    private static final String USAGE = "Usage: Crawl <urlDir> [-dir d] [-threads n] [-depth i] [-topN N]";

    /**
     * Formats the current time as {@code yyyyMMddHHmmss}.
     *
     * @return the timestamp string used to build default directory names
     */
    private static String getDate() {
        return new SimpleDateFormat("yyyyMMddHHmmss").format(new Date());
    }

    /**
     * Returns the value that follows the option at {@code optionIndex}.
     *
     * @param args        the full command line
     * @param optionIndex index of the option flag (for example {@code -dir})
     * @return the argument immediately after the flag
     * @throws IllegalArgumentException if the flag is the last argument and so has no value
     */
    private static String optionValue(String[] args, int optionIndex) {
        if (optionIndex + 1 >= args.length) {
            throw new IllegalArgumentException(
                    "Missing value for option " + args[optionIndex] + ". " + USAGE);
        }
        return args[optionIndex + 1];
    }

    /**
     * Parses the command line and runs the crawl.
     *
     * <p>Recognised arguments: a root URL directory (any argument that is not an option or an
     * option value; if several are given the last one wins), {@code -dir} (crawl output
     * directory, default {@code crawl-<timestamp>}), {@code -threads} (default read from the
     * {@code fetcher.threads.fetch} property, falling back to 10), {@code -depth} (default 5)
     * and {@code -topN} (default {@code Integer.MAX_VALUE}).
     *
     * <p>If no root URL directory is supplied the usage line is printed and the method returns
     * without touching the file system.
     *
     * @param args command-line arguments
     * @throws IllegalArgumentException if an option is given without a value, or a numeric
     *                                  option is not a valid integer ({@link NumberFormatException})
     * @throws RuntimeException         if the crawl output directory already exists
     * @throws Exception                anything propagated by the underlying crawl tools
     */
    public static void main(String[] args) throws Exception {
        if (args.length < 1) {
            System.out.println(USAGE);
            return;
        }

        Configuration conf = NutchConfiguration.create();
        conf.addDefaultResource("crawl-tool.xml");
        NutchJob job = new NutchJob(conf);

        Path rootUrlDir = null;
        Path dir = new Path("crawl-" + getDate());
        int threads = job.getInt("fetcher.threads.fetch", 10);
        int depth = 5;
        int topN = Integer.MAX_VALUE;

        for (int pos = 0; pos < args.length; pos++) {
            String arg = args[pos];
            if ("-dir".equals(arg)) {
                dir = new Path(optionValue(args, pos));
                pos++; // skip the consumed option value
            } else if ("-threads".equals(arg)) {
                threads = Integer.parseInt(optionValue(args, pos));
                pos++;
            } else if ("-depth".equals(arg)) {
                depth = Integer.parseInt(optionValue(args, pos));
                pos++;
            } else if ("-topN".equals(arg)) {
                topN = Integer.parseInt(optionValue(args, pos));
                pos++;
            } else if (arg != null) {
                rootUrlDir = new Path(arg);
            }
        }

        // Only options were supplied: nothing to inject, so treat it like an empty command line
        // instead of handing a null path to the injector later on.
        if (rootUrlDir == null) {
            System.out.println(USAGE);
            return;
        }

        FileSystem fs = FileSystem.get(job);
        if (fs.exists(dir)) {
            throw new RuntimeException(dir + " already exists.");
        }

        if (LOG.isInfoEnabled()) {
            LOG.info("crawl started in: " + dir);
            LOG.info("rootUrlDir = " + rootUrlDir);
            LOG.info("threads = " + threads);
            LOG.info("depth = " + depth);
            if (topN != Integer.MAX_VALUE) {
                LOG.info("topN = " + topN);
            }
        }

        Path crawlDbPath = new Path(dir + "/crawldb");
        Path linkDbPath = new Path(dir + "/linkdb");
        Path segmentsPath = new Path(dir + "/segments");
        Path indexesPath = new Path(dir + "/indexes");
        Path indexPath = new Path(dir + "/index");
        // A fresh timestamp is taken here, so it may differ from the one used for the default
        // crawl directory name above.
        Path tmpDir = job.getLocalPath("crawl/" + getDate());

        Injector injector = new Injector(conf);
        Generator generator = new Generator(conf);
        Fetcher fetcher = new Fetcher(conf);
        ParseSegment parseSegment = new ParseSegment(conf);
        CrawlDb crawlDbTool = new CrawlDb(conf);
        LinkDb linkDbTool = new LinkDb(conf);
        Indexer indexer = new Indexer(conf);
        DeleteDuplicates dedup = new DeleteDuplicates(conf);
        IndexMerger merger = new IndexMerger(conf);

        injector.inject(crawlDbPath, rootUrlDir);

        // "round" is declared outside the loop because the number of completed rounds decides
        // below whether there is anything to index.
        int round;
        for (round = 0; round < depth; round++) {
            Path segment = generator.generate(
                    crawlDbPath, segmentsPath, -1, topN, System.currentTimeMillis(), false, false);
            if (segment == null) {
                if (LOG.isInfoEnabled()) {
                    LOG.info("Stopping at depth=" + round + " - no more URLs to fetch.");
                }
                break;
            }
            fetcher.fetch(segment, threads);
            // Parse as a separate step only when Fetcher.isParsing reports false for this job.
            if (!Fetcher.isParsing(job)) {
                parseSegment.parse(segment);
            }
            crawlDbTool.update(crawlDbPath, new Path[]{segment}, true, true);
        }

        if (round > 0) {
            linkDbTool.invert(linkDbPath, segmentsPath, true, true, false);
            indexer.index(indexesPath, crawlDbPath, linkDbPath, fs.listPaths(segmentsPath));
            dedup.dedup(new Path[]{indexesPath});
            merger.merge(fs.listPaths(indexesPath), indexPath, tmpDir);
        } else {
            LOG.warn("No URLs to fetch - check your seed list and URL filters.");
        }

        if (LOG.isInfoEnabled()) {
            LOG.info("crawl finished: " + dir);
        }
    }
}
