package org.apache.nutch.fetcher;

import java.io.IOException;
import java.net.InetAddress;
import java.net.URL;
import java.net.UnknownHostException;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapRunnable;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.nutch.analysis.NutchAnalysisConstants;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseStatus;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatus;
import org.apache.nutch.protocol.RobotRules;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;

/* loaded from: input_file:org/apache/nutch/fetcher/Fetcher2.class */
public class Fetcher2 extends Configured implements MapRunnable {
    /** Shared logger; the nested classes reach it as {@code Fetcher2.LOG}. */
    public static final Log LOG = LogFactory.getLog(Fetcher2.class);
    /** Sink that {@code FetcherThread.output} writes {@code FetcherOutput} records to. */
    private OutputCollector output;
    // NOTE(review): not referenced in the visible part of the file; presumably used for progress reporting.
    private Reporter reporter;
    /** Segment name stamped into content and parse metadata by {@code FetcherThread.output}. */
    private String segmentName;
    /** Number of fetcher threads currently inside {@code run()}. */
    private AtomicInteger activeThreads;
    /** Number of fetcher threads currently sleeping because no item was available. */
    private AtomicInteger spinWaiting;
    /** Wall-clock time (ms) captured when this object was constructed. */
    private long start;
    /** Wall-clock time (ms) at which a fetcher thread most recently picked up an item. */
    private AtomicLong lastRequestStart;
    // NOTE(review): bytes/pages are not touched in the visible code; presumably maintained by updateStatus().
    private AtomicLong bytes;
    private AtomicInteger pages;
    /** Count of failed fetches; incremented by {@code FetcherThread.logError}. */
    private AtomicInteger errors;
    /** When true, fetched content is included in the collected {@code FetcherOutput}. */
    private boolean storingContent;
    /** When true, successfully fetched content is parsed inside {@code FetcherThread.output}. */
    private boolean parsing;
    /** Per-host (or per-IP) queues the fetcher threads pull items from. */
    FetchItemQueues fetchQueues;
    /** Thread that fills {@link #fetchQueues}; fetcher threads check whether it is still alive before exiting. */
    QueueFeeder feeder;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/nutch/fetcher/Fetcher2$FetchItem.class */
    /**
     * One URL waiting to be fetched, together with its crawl datum and the
     * identifier of the queue it belongs to.
     */
    public static class FetchItem {
        /** Queue identifier of the form {@code protocol://host-or-ip}. */
        String queueID;
        /** The URL in its original textual form. */
        Text url;
        /** The same URL, parsed. */
        URL u;
        /** Crawl state that travels with the URL. */
        CrawlDatum datum;

        public FetchItem(Text text, URL url, CrawlDatum crawlDatum, String str) {
            this.queueID = str;
            this.datum = crawlDatum;
            this.u = url;
            this.url = text;
        }

        /**
         * Builds a fetch item for the given URL and derives its queue ID.
         *
         * @param text       URL to fetch
         * @param crawlDatum crawl state for the URL
         * @param z          when true the queue is keyed by the resolved IP
         *                   address, otherwise by the lower-cased host name
         * @return the new item, or {@code null} (after logging a warning) when
         *         the URL cannot be parsed, its host cannot be resolved, or it
         *         has no host
         */
        public static FetchItem create(Text text, CrawlDatum crawlDatum, boolean z) {
            try {
                URL parsed = new URL(text.toString());
                String scheme = parsed.getProtocol().toLowerCase();
                String hostKey;
                if (!z) {
                    String name = parsed.getHost();
                    if (name == null) {
                        Fetcher2.LOG.warn("Unknown host for url: " + text + ", skipping.");
                        return null;
                    }
                    hostKey = name.toLowerCase();
                } else {
                    try {
                        InetAddress resolved = InetAddress.getByName(parsed.getHost());
                        hostKey = resolved.getHostAddress();
                    } catch (UnknownHostException unresolved) {
                        // No fallback to the host name: unresolved hosts are skipped.
                        Fetcher2.LOG.warn("Unable to resolve: " + parsed.getHost() + ", skipping.");
                        return null;
                    }
                }
                String id = scheme + "://" + hostKey;
                return new FetchItem(text, parsed, crawlDatum, id);
            } catch (Exception failure) {
                Fetcher2.LOG.warn("Cannot parse url: " + text, failure);
                return null;
            }
        }

        public CrawlDatum getDatum() {
            return datum;
        }

        public String getQueueID() {
            return queueID;
        }

        public Text getUrl() {
            return url;
        }

        public URL getURL2() {
            return u;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/nutch/fetcher/Fetcher2$FetchItemQueue.class */
    /**
     * Pending and in-progress fetch items for a single queue ID, plus the
     * bookkeeping needed to throttle requests: a cap on concurrent fetches
     * ({@code maxThreads}) and a minimum pause after the last finished fetch.
     */
    public static class FetchItemQueue {
        /** Items waiting to be fetched, oldest first. */
        List<FetchItem> queue = Collections.synchronizedList(new LinkedList<FetchItem>());
        /** Items handed out to a fetcher thread and not yet finished. */
        Set<FetchItem> inProgress = Collections.synchronizedSet(new HashSet<FetchItem>());
        /** Wall-clock time (ms) at which the most recent fetch finished. */
        AtomicLong endTime = new AtomicLong();
        /** Pause (ms) applied when only one thread may use this queue; FetcherThread overwrites it from robots rules. */
        long crawlDelay;
        /** Pause (ms) applied when several threads may use this queue concurrently. */
        long minCrawlDelay;
        /** Maximum number of items that may be in progress at once. */
        int maxThreads;
        Configuration conf;

        /**
         * @param configuration job configuration (stored, not otherwise used here)
         * @param i             maximum number of concurrent fetches for this queue
         * @param j             crawl delay in milliseconds
         * @param j2            minimum crawl delay in milliseconds
         */
        public FetchItemQueue(Configuration configuration, int i, long j, long j2) {
            this.conf = configuration;
            this.maxThreads = i;
            this.crawlDelay = j;
            this.minCrawlDelay = j2;
            // Back-date the "last finished" time by one crawl delay so the
            // first item of a fresh queue can be handed out immediately.
            this.endTime.set(System.currentTimeMillis() - j);
        }

        public int getQueueSize() {
            return this.queue.size();
        }

        public int getInProgressSize() {
            return this.inProgress.size();
        }

        /** Marks an item as no longer in progress and restarts the delay clock; {@code null} is ignored. */
        public void finishFetchItem(FetchItem fetchItem) {
            if (fetchItem != null) {
                this.inProgress.remove(fetchItem);
                this.endTime.set(System.currentTimeMillis());
            }
        }

        /** Appends an item to the pending list; {@code null} is ignored. */
        public void addFetchItem(FetchItem fetchItem) {
            if (fetchItem == null) {
                return;
            }
            this.queue.add(fetchItem);
        }

        /** Registers an item directly as in progress (it never enters the pending list); {@code null} is ignored. */
        public void addInProgressFetchItem(FetchItem fetchItem) {
            if (fetchItem == null) {
                return;
            }
            this.inProgress.add(fetchItem);
        }

        /**
         * Hands out the next pending item if this queue may be used right now.
         *
         * @return the next item, now tracked as in progress, or {@code null}
         *         when the concurrency cap is reached, the delay since the last
         *         finished fetch has not elapsed, or nothing is pending
         */
        public FetchItem getFetchItem() {
            if (this.inProgress.size() >= this.maxThreads) {
                return null;
            }
            // With a single thread per queue the full crawl delay applies; the
            // (shorter) minimum delay is only meant for the multi-threaded
            // case. The original expression had the two operands swapped,
            // which disabled the delay in the default single-thread setup.
            long delay = this.maxThreads > 1 ? this.minCrawlDelay : this.crawlDelay;
            if (this.endTime.get() + delay > System.currentTimeMillis()) {
                return null;
            }
            FetchItem next;
            // Check-and-remove under the list's own lock so a concurrent
            // removal cannot make remove(0) fail; this replaces a catch block
            // that silently swallowed every exception.
            synchronized (this.queue) {
                if (this.queue.isEmpty()) {
                    return null;
                }
                next = this.queue.remove(0);
            }
            this.inProgress.add(next);
            return next;
        }

        /** Logs this queue's settings and every pending URL at INFO level. */
        public synchronized void dump() {
            Fetcher2.LOG.info("  maxThreads    = " + this.maxThreads);
            Fetcher2.LOG.info("  inProgress    = " + this.inProgress.size());
            Fetcher2.LOG.info("  crawlDelay    = " + this.crawlDelay);
            Fetcher2.LOG.info("  minCrawlDelay = " + this.minCrawlDelay);
            Fetcher2.LOG.info("  endTime       = " + this.endTime.get());
            Fetcher2.LOG.info("  now           = " + System.currentTimeMillis());
            // Iterating a synchronized list requires holding its lock; it also
            // avoids the quadratic get(i) walk over a linked list.
            synchronized (this.queue) {
                int position = 0;
                for (FetchItem pending : this.queue) {
                    Fetcher2.LOG.info("  " + position + ". " + pending.url);
                    position++;
                }
            }
        }
    }

    /* loaded from: input_file:org/apache/nutch/fetcher/Fetcher2$FetchItemQueues.class */
    /**
     * Registry of {@link FetchItemQueue}s keyed by queue ID. Queues are created
     * on demand and dropped again once they hold neither pending nor
     * in-progress items. All access to the {@code queues} map that can race
     * with a structural change goes through this object's monitor.
     */
    private static class FetchItemQueues {
        public static final String DEFAULT_ID = "default";
        /** Queue ID to queue; guarded by {@code this}. */
        Map<String, FetchItemQueue> queues = new HashMap<String, FetchItemQueue>();
        /** Number of items added and not yet handed out to a fetcher thread. */
        AtomicInteger totalSize = new AtomicInteger(0);
        int maxThreads;
        boolean byIP;
        /** Delay in milliseconds, from {@code fetcher.server.delay} (configured in seconds). */
        long crawlDelay;
        /** Delay in milliseconds, from {@code fetcher.server.min.delay} (configured in seconds). */
        long minCrawlDelay;
        Configuration conf;

        public FetchItemQueues(Configuration configuration) {
            this.conf = configuration;
            this.maxThreads = configuration.getInt("fetcher.threads.per.host", 1);
            this.byIP = configuration.getBoolean("fetcher.threads.per.host.by.ip", false);
            // float -> long is a narrowing conversion and needs an explicit cast.
            this.crawlDelay = (long) (configuration.getFloat("fetcher.server.delay", 1.0f) * 1000.0f);
            this.minCrawlDelay = (long) (configuration.getFloat("fetcher.server.min.delay", 0.0f) * 1000.0f);
        }

        public int getTotalSize() {
            return this.totalSize.get();
        }

        public int getQueueCount() {
            return this.queues.size();
        }

        /**
         * Creates a fetch item for the URL and queues it; URLs for which no
         * item can be created are dropped.
         */
        public void addFetchItem(Text text, CrawlDatum crawlDatum) {
            // Item creation may resolve the host name, so it is deliberately
            // done before taking this object's lock.
            FetchItem create = FetchItem.create(text, crawlDatum, this.byIP);
            if (create != null) {
                addFetchItem(create);
            }
        }

        /**
         * Appends the item to its queue, creating the queue if needed;
         * {@code null} is ignored. Lookup and append happen under one lock so
         * that {@link #getFetchItem()} cannot prune the (still empty) queue in
         * between and orphan the item.
         */
        public synchronized void addFetchItem(FetchItem fetchItem) {
            if (fetchItem == null) {
                return;
            }
            getFetchItemQueue(fetchItem.queueID).addFetchItem(fetchItem);
            this.totalSize.incrementAndGet();
        }

        /**
         * Reports the item as finished to its queue; logs a warning if that
         * queue no longer exists. {@code null} is ignored.
         */
        public synchronized void finishFetchItem(FetchItem fetchItem) {
            if (fetchItem == null) {
                return;
            }
            FetchItemQueue fetchItemQueue = this.queues.get(fetchItem.queueID);
            if (fetchItemQueue == null) {
                Fetcher2.LOG.warn("Attempting to finish item from unknown queue: " + fetchItem);
            } else {
                fetchItemQueue.finishFetchItem(fetchItem);
            }
        }

        /** Returns the queue for the given ID, creating and registering it on first use. */
        public synchronized FetchItemQueue getFetchItemQueue(String str) {
            FetchItemQueue fetchItemQueue = this.queues.get(str);
            if (fetchItemQueue == null) {
                fetchItemQueue = new FetchItemQueue(this.conf, this.maxThreads, this.crawlDelay, this.minCrawlDelay);
                this.queues.put(str, fetchItemQueue);
            }
            return fetchItemQueue;
        }

        /**
         * Scans the queues for one that is willing to hand out an item,
         * removing fully drained queues along the way.
         *
         * @return the next item to fetch, or {@code null} if no queue can
         *         supply one right now
         */
        public synchronized FetchItem getFetchItem() {
            Iterator<Map.Entry<String, FetchItemQueue>> it = this.queues.entrySet().iterator();
            while (it.hasNext()) {
                FetchItemQueue candidate = it.next().getValue();
                if (candidate.getQueueSize() == 0 && candidate.getInProgressSize() == 0) {
                    it.remove();
                    continue;
                }
                FetchItem fetchItem = candidate.getFetchItem();
                if (fetchItem != null) {
                    this.totalSize.decrementAndGet();
                    return fetchItem;
                }
            }
            return null;
        }

        /** Logs every queue that still has pending items. */
        public synchronized void dump() {
            for (Map.Entry<String, FetchItemQueue> entry : this.queues.entrySet()) {
                FetchItemQueue fetchItemQueue = entry.getValue();
                if (fetchItemQueue.getQueueSize() != 0) {
                    Fetcher2.LOG.info("* queue: " + entry.getKey());
                    fetchItemQueue.dump();
                }
            }
        }
    }

    /* loaded from: input_file:org/apache/nutch/fetcher/Fetcher2$FetcherThread.class */
    private class FetcherThread extends Thread {
        /** Job configuration; also handed to placeholder {@code Content} objects built in {@code output}. */
        private Configuration conf;
        /** Filters applied to redirect targets after normalization. */
        private URLFilters urlFilters;
        private ScoringFilters scfilters;
        private ParseUtil parseUtil;
        /** Normalizers for redirect targets, created for the fetcher scope. */
        private URLNormalizers normalizers;
        private ProtocolFactory protocolFactory;
        /** Upper bound a robots crawl delay is compared against; configured value times 1000. */
        private long maxCrawlDelay;
        /** Whether fetch items created for redirect targets are keyed by IP address. */
        private boolean byIP;
        /** Value of {@code http.redirect.max}; when not positive, redirects are recorded instead of followed. */
        private int maxRedirect;

        /**
         * Creates a daemon fetcher thread and the per-thread helpers it needs,
         * all built from the given configuration.
         */
        public FetcherThread(Configuration configuration) {
            setDaemon(true);
            setName("FetcherThread");
            conf = configuration;
            urlFilters = new URLFilters(configuration);
            scfilters = new ScoringFilters(configuration);
            parseUtil = new ParseUtil(configuration);
            protocolFactory = new ProtocolFactory(configuration);
            normalizers = new URLNormalizers(configuration, URLNormalizers.SCOPE_FETCHER);
            final int maxDelaySetting = configuration.getInt("fetcher.max.crawl.delay", 30);
            maxCrawlDelay = maxDelaySetting * 1000;
            // NOTE(review): the fallback here is true, while FetchItemQueues reads
            // the same key with a fallback of false; the two only agree when the
            // property is actually set.
            byIP = configuration.getBoolean("fetcher.threads.per.host.by.ip", true);
            maxRedirect = configuration.getInt("http.redirect.max", 3);
        }

        /**
         * Main loop of a fetcher thread: repeatedly takes an item from the
         * shared queues, checks robots rules, fetches it, records the outcome
         * via {@code output}, and follows redirects up to {@code maxRedirect}
         * times. When no item is available the thread sleeps and retries; it
         * exits once the feeder has stopped and the queues are empty.
         *
         * <p>Fixes relative to the decompiled form:
         * <ul>
         * <li>the redirect loop repeats only while a redirect is actually being
         *     followed (previously any non-redirecting URL was refetched in an
         *     endless loop);</li>
         * <li>a robots crawl delay above the configured maximum now skips the
         *     URL instead of recording it as skipped and fetching it anyway;</li>
         * <li>thread shutdown bookkeeping lives in a single {@code finally}, so
         *     {@code activeThreads} is decremented exactly once on every exit
         *     path;</li>
         * <li>a redirect target for which no fetch item can be built no longer
         *     kills the thread with a {@code NullPointerException}.</li>
         * </ul>
         *
         * <p>NOTE(review): the numeric CrawlDatum/ProtocolStatus codes are kept
         * as literals because their constant names are not visible here;
         * presumably 33/34/35/36/37/67 are fetch success / retry / temporary
         * redirect / permanent redirect / gone / linked - confirm against
         * CrawlDatum.
         */
        @Override // java.lang.Thread, java.lang.Runnable
        public void run() {
            Fetcher2.this.activeThreads.incrementAndGet();
            FetchItem fetchItem = null;
            try {
                while (true) {
                    fetchItem = Fetcher2.this.fetchQueues.getFetchItem();
                    if (fetchItem == null) {
                        if (!Fetcher2.this.feeder.isAlive() && Fetcher2.this.fetchQueues.getTotalSize() <= 0) {
                            // Nothing left and nothing more coming: done.
                            return;
                        }
                        Fetcher2.LOG.debug(getName() + " spin-waiting ...");
                        Fetcher2.this.spinWaiting.incrementAndGet();
                        try {
                            Thread.sleep(500L);
                        } catch (InterruptedException ignored) {
                            // Deliberately ignored: the loop simply polls the queues again.
                        }
                        Fetcher2.this.spinWaiting.decrementAndGet();
                        continue;
                    }
                    Fetcher2.this.lastRequestStart.set(System.currentTimeMillis());
                    try {
                        if (Fetcher2.LOG.isInfoEnabled()) {
                            Fetcher2.LOG.info("fetching " + fetchItem.url);
                        }
                        int redirectCount = 0;
                        boolean redirecting;
                        do {
                            if (Fetcher2.LOG.isDebugEnabled()) {
                                Fetcher2.LOG.debug("redirectCount=" + redirectCount);
                            }
                            redirecting = false;
                            Protocol protocol = this.protocolFactory.getProtocol(fetchItem.url.toString());
                            RobotRules robotRules = protocol.getRobotRules(fetchItem.url, fetchItem.datum);
                            if (!robotRules.isAllowed(fetchItem.u)) {
                                Fetcher2.this.fetchQueues.finishFetchItem(fetchItem);
                                if (Fetcher2.LOG.isDebugEnabled()) {
                                    Fetcher2.LOG.debug("Denied by robots.txt: " + fetchItem.url);
                                }
                                output(fetchItem.url, fetchItem.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, 37);
                                break;
                            }
                            if (robotRules.getCrawlDelay() > 0) {
                                if (robotRules.getCrawlDelay() > this.maxCrawlDelay) {
                                    Fetcher2.this.fetchQueues.finishFetchItem(fetchItem);
                                    Fetcher2.LOG.debug("Crawl-Delay for " + fetchItem.url + " too long (" + robotRules.getCrawlDelay() + "), skipping");
                                    output(fetchItem.url, fetchItem.datum, null, ProtocolStatus.STATUS_ROBOTS_DENIED, 37);
                                    // Skip means skip: do not go on to fetch the URL.
                                    break;
                                }
                                Fetcher2.this.fetchQueues.getFetchItemQueue(fetchItem.queueID).crawlDelay = robotRules.getCrawlDelay();
                            }
                            ProtocolOutput protocolOutput = protocol.getProtocolOutput(fetchItem.url, fetchItem.datum);
                            ProtocolStatus status = protocolOutput.getStatus();
                            Content content = protocolOutput.getContent();
                            // The request is over; release the queue slot before processing the result.
                            Fetcher2.this.fetchQueues.finishFetchItem(fetchItem);
                            switch (status.getCode()) {
                                case ProtocolStatus.WOULDBLOCK /* 22 */:
                                    // Put the item back so it is tried again later.
                                    Fetcher2.this.fetchQueues.finishFetchItem(fetchItem);
                                    Fetcher2.this.fetchQueues.addFetchItem(fetchItem);
                                    break;
                                case 1: {
                                    // Page fetched; the parse result may still ask for a redirect.
                                    ParseStatus parseStatus = output(fetchItem.url, fetchItem.datum, content, status, 33);
                                    Fetcher2.this.updateStatus(content.getContent().length);
                                    if (parseStatus != null && parseStatus.isSuccess() && parseStatus.getMinorCode() == 100) {
                                        String target = this.urlFilters.filter(this.normalizers.normalize(parseStatus.getMessage(), URLNormalizers.SCOPE_FETCHER));
                                        if (target != null && !target.equals(fetchItem.url.toString())) {
                                            output(fetchItem.url, fetchItem.datum, null, status, 36);
                                            FetchItem next = followRedirect(new Text(target), "content");
                                            if (next != null) {
                                                redirecting = true;
                                                redirectCount++;
                                                fetchItem = next;
                                            }
                                        } else if (Fetcher2.LOG.isDebugEnabled()) {
                                            Fetcher2.LOG.debug(" - content redirect skipped: " + (target != null ? "to same url" : "filtered"));
                                        }
                                    }
                                    break;
                                }
                                case 12:
                                case 13: {
                                    // Protocol-level redirect; the target is carried in the status message.
                                    output(fetchItem.url, fetchItem.datum, content, status, status.getCode() == 12 ? 36 : 35);
                                    String target = this.urlFilters.filter(this.normalizers.normalize(status.getMessage(), URLNormalizers.SCOPE_FETCHER));
                                    if (target != null && !target.equals(fetchItem.url.toString())) {
                                        FetchItem next = followRedirect(new Text(target), "protocol");
                                        if (next != null) {
                                            redirecting = true;
                                            redirectCount++;
                                            fetchItem = next;
                                        }
                                    } else if (Fetcher2.LOG.isDebugEnabled()) {
                                        Fetcher2.LOG.debug(" - protocol redirect skipped: " + (target != null ? "to same url" : "filtered"));
                                    }
                                    break;
                                }
                                case 16:
                                    logError(fetchItem.url, status.getMessage());
                                    // fall through: counted as a retry as well
                                case 15:
                                    fetchItem.datum.setRetriesSinceFetch(fetchItem.datum.getRetriesSinceFetch() + 1);
                                    // fall through: recorded with the retry status
                                case ProtocolStatus.BLOCKED /* 23 */:
                                    output(fetchItem.url, fetchItem.datum, null, status, 34);
                                    break;
                                case 11:
                                case 14:
                                case 17:
                                case 18:
                                case ProtocolStatus.NOTMODIFIED /* 21 */:
                                    output(fetchItem.url, fetchItem.datum, null, status, 37);
                                    break;
                                default:
                                    if (Fetcher2.LOG.isWarnEnabled()) {
                                        Fetcher2.LOG.warn("Unknown ProtocolStatus: " + status.getCode());
                                    }
                                    output(fetchItem.url, fetchItem.datum, null, status, 37);
                                    break;
                            }
                            if (redirecting && redirectCount >= this.maxRedirect) {
                                // fetchItem is the redirect target here; give it up.
                                Fetcher2.this.fetchQueues.finishFetchItem(fetchItem);
                                if (Fetcher2.LOG.isInfoEnabled()) {
                                    Fetcher2.LOG.info(" - redirect count exceeded " + fetchItem.url);
                                }
                                output(fetchItem.url, fetchItem.datum, null, ProtocolStatus.STATUS_REDIR_EXCEEDED, 37);
                            }
                        } while (redirecting && redirectCount < this.maxRedirect);
                    } catch (Throwable th) {
                        // Any failure while handling one item is recorded as a retry; the thread keeps going.
                        Fetcher2.this.fetchQueues.finishFetchItem(fetchItem);
                        logError(fetchItem.url, th.toString());
                        output(fetchItem.url, fetchItem.datum, null, ProtocolStatus.STATUS_FAILED, 34);
                    }
                }
            } catch (Throwable th3) {
                if (Fetcher2.LOG.isFatalEnabled()) {
                    th3.printStackTrace(LogUtil.getFatalStream(Fetcher2.LOG));
                    Fetcher2.LOG.fatal("fetcher caught:" + th3.toString());
                }
            } finally {
                if (fetchItem != null) {
                    Fetcher2.this.fetchQueues.finishFetchItem(fetchItem);
                }
                Fetcher2.this.activeThreads.decrementAndGet();
                Fetcher2.LOG.info("-finishing thread " + getName() + ", activeThreads=" + Fetcher2.this.activeThreads);
            }
        }

        /**
         * Decides what to do with an accepted redirect target. When redirects
         * are followed ({@code maxRedirect > 0}) a fetch item for the target is
         * created and registered as in progress in its queue; otherwise the
         * target is only recorded through {@code output} for a later fetch.
         *
         * @param target the normalized, filtered redirect target
         * @param kind   {@code "content"} or {@code "protocol"}; used in log messages only
         * @return the item to fetch next, or {@code null} when the redirect is
         *         not to be followed now (either by configuration or because no
         *         fetch item could be created for the target)
         */
        private FetchItem followRedirect(Text target, String kind) {
            if (this.maxRedirect <= 0) {
                output(target, new CrawlDatum(), null, null, 67);
                if (Fetcher2.LOG.isDebugEnabled()) {
                    Fetcher2.LOG.debug(" - " + kind + " redirect to " + target + " (fetching later)");
                }
                return null;
            }
            FetchItem next = FetchItem.create(target, new CrawlDatum(), this.byIP);
            if (next == null) {
                // create() has already logged why; just stop following this redirect.
                if (Fetcher2.LOG.isDebugEnabled()) {
                    Fetcher2.LOG.debug(" - " + kind + " redirect to " + target + " skipped: no fetch item could be created");
                }
                return null;
            }
            Fetcher2.this.fetchQueues.getFetchItemQueue(next.queueID).addInProgressFetchItem(next);
            if (Fetcher2.LOG.isDebugEnabled()) {
                Fetcher2.LOG.debug(" - " + kind + " redirect to " + target + " (fetching now)");
            }
            return next;
        }

        /**
         * Records a failed fetch: bumps the shared error counter and, when INFO
         * logging is on, logs the URL together with the failure message.
         */
        private void logError(Text text, String str) {
            final boolean infoOn = Fetcher2.LOG.isInfoEnabled();
            if (infoOn) {
                final String line = "fetch of " + text + " failed with: " + str;
                Fetcher2.LOG.info(line);
            }
            Fetcher2.this.errors.incrementAndGet();
        }

        /**
         * Stamps the crawl datum with the given status and the current time,
         * optionally parses the content, and emits one {@code FetcherOutput}
         * record for the URL.
         *
         * @param text           URL the record is written under
         * @param crawlDatum     datum to update and emit
         * @param content        fetched content, or {@code null} to use an empty placeholder
         * @param protocolStatus protocol status to attach to the datum's metadata, may be {@code null}
         * @param i              status code to set on the datum; parsing only happens for 33
         * @return the parse status when the content was parsed, otherwise {@code null}
         */
        private ParseStatus output(Text text, CrawlDatum crawlDatum, Content content, ProtocolStatus protocolStatus, int i) {
            crawlDatum.setStatus(i);
            crawlDatum.setFetchTime(System.currentTimeMillis());
            if (protocolStatus != null) {
                crawlDatum.getMetaData().put(Nutch.WRITABLE_PROTO_STATUS_KEY, protocolStatus);
            }

            // No content supplied: substitute an empty one so the rest of the pipeline has something to work on.
            if (content == null) {
                final String address = text.toString();
                content = new Content(address, address, new byte[0], "", new Metadata(), this.conf);
            }
            final Metadata contentMeta = content.getMetadata();
            contentMeta.set(Nutch.SEGMENT_NAME_KEY, Fetcher2.this.segmentName);

            try {
                this.scfilters.passScoreBeforeParsing(text, crawlDatum, content);
            } catch (Exception scoreFailure) {
                if (Fetcher2.LOG.isWarnEnabled()) {
                    scoreFailure.printStackTrace(LogUtil.getWarnStream(Fetcher2.LOG));
                    Fetcher2.LOG.warn("Couldn't pass score, url " + text + " (" + scoreFailure + ")");
                }
            }

            Parse parsed = null;
            final boolean shouldParse = Fetcher2.this.parsing && i == 33;
            if (shouldParse) {
                ParseStatus outcome;
                try {
                    parsed = this.parseUtil.parse(content);
                    outcome = parsed.getData().getStatus();
                } catch (Exception parseFailure) {
                    outcome = new ParseStatus(parseFailure);
                }
                if (!outcome.isSuccess()) {
                    if (Fetcher2.LOG.isWarnEnabled()) {
                        Fetcher2.LOG.warn("Error parsing: " + text + ": " + outcome);
                    }
                    // Fall back to an empty parse so a record is still produced.
                    parsed = outcome.getEmptyParse(Fetcher2.this.getConf());
                }

                final byte[] signature = SignatureFactory.getSignature(Fetcher2.this.getConf()).calculate(content, parsed);
                final String signatureHex = StringUtil.toHexString(signature);
                contentMeta.set(Nutch.SIGNATURE_KEY, signatureHex);
                crawlDatum.setSignature(signature);

                final Metadata parseMeta = parsed.getData().getContentMeta();
                parseMeta.set(Nutch.SEGMENT_NAME_KEY, Fetcher2.this.segmentName);
                parseMeta.set(Nutch.SIGNATURE_KEY, signatureHex);

                try {
                    this.scfilters.passScoreAfterParsing(text, content, parsed);
                } catch (Exception scoreFailure) {
                    if (Fetcher2.LOG.isWarnEnabled()) {
                        scoreFailure.printStackTrace(LogUtil.getWarnStream(Fetcher2.LOG));
                        Fetcher2.LOG.warn("Couldn't pass score, url " + text + " (" + scoreFailure + ")");
                    }
                }
            }

            final Content storedContent = Fetcher2.this.storingContent ? content : null;
            final ParseImpl storedParse = parsed != null ? new ParseImpl(parsed) : null;
            try {
                Fetcher2.this.output.collect(text, new FetcherOutput(crawlDatum, storedContent, storedParse));
            } catch (IOException collectFailure) {
                if (Fetcher2.LOG.isFatalEnabled()) {
                    collectFailure.printStackTrace(LogUtil.getFatalStream(Fetcher2.LOG));
                    Fetcher2.LOG.fatal("fetcher caught:" + collectFailure.toString());
                }
            }

            return parsed != null ? parsed.getData().getStatus() : null;
        }
    }

    /* loaded from: input_file:org/apache/nutch/fetcher/Fetcher2$InputFormat.class */
    /**
     * Input format that never divides an input file: each listed path becomes
     * exactly one split covering the file from offset 0 to its full length.
     */
    public static class InputFormat extends SequenceFileInputFormat {
        /**
         * @param jobConf job configuration used to list inputs and reach the file system
         * @param i       requested split count; not used
         * @return one split per input path
         */
        public InputSplit[] getSplits(JobConf jobConf, int i) throws IOException {
            final Path[] inputs = listPaths(jobConf);
            final FileSplit[] splits = new FileSplit[inputs.length];
            final FileSystem fs = FileSystem.get(jobConf);
            int slot = 0;
            for (Path input : inputs) {
                splits[slot] = new FileSplit(input, 0L, fs.getLength(input), jobConf);
                slot++;
            }
            return splits;
        }
    }

    /* loaded from: input_file:org/apache/nutch/fetcher/Fetcher2$QueueFeeder.class */
    private static class QueueFeeder extends Thread {
        private RecordReader reader;
        private FetchItemQueues queues;
        private int size;

        public QueueFeeder(RecordReader recordReader, FetchItemQueues fetchItemQueues, int i) {
            this.reader = recordReader;
            this.queues = fetchItemQueues;
            this.size = i;
            setDaemon(true);
            setName("QueueFeeder");
        }

        @Override // java.lang.Thread, java.lang.Runnable
        public void run() {
            boolean z = true;
            int i = 0;
            while (z) {
                int totalSize = this.size - this.queues.getTotalSize();
                if (totalSize <= 0) {
                    try {
                        Thread.sleep(1000L);
                    } catch (Exception e) {
                    }
                } else {
                    Fetcher2.LOG.debug("-feeding " + totalSize + " input urls ...");
                    while (totalSize > 0 && z) {
                        try {
                            Text text = new Text();
                            CrawlDatum crawlDatum = new CrawlDatum();
                            z = this.reader.next(text, crawlDatum);
                            if (z) {
                                this.queues.addFetchItem(text, crawlDatum);
                                i++;
                                totalSize--;
                            }
                        } catch (IOException e2) {
                            Fetcher2.LOG.fatal("QueueFeeder error reading input, record " + i, e2);
                            return;
                        }
                    }
                }
            }
            Fetcher2.LOG.info("QueueFeeder finished: total " + i + " records.");
        }
    }

    public Fetcher2() {
        super((Configuration) null);
        this.activeThreads = new AtomicInteger(0);
        this.spinWaiting = new AtomicInteger(0);
        this.start = System.currentTimeMillis();
        this.lastRequestStart = new AtomicLong(this.start);
        this.bytes = new AtomicLong(0L);
        this.pages = new AtomicInteger(0);
        this.errors = new AtomicInteger(0);
    }

    public Fetcher2(Configuration configuration) {
        super(configuration);
        this.activeThreads = new AtomicInteger(0);
        this.spinWaiting = new AtomicInteger(0);
        this.start = System.currentTimeMillis();
        this.lastRequestStart = new AtomicLong(this.start);
        this.bytes = new AtomicLong(0L);
        this.pages = new AtomicInteger(0);
        this.errors = new AtomicInteger(0);
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Counts one more page and adds {@code i} to the running byte total.
     *
     * @param i amount to add to the byte counter
     * @throws IOException declared by the signature; nothing in this body throws it
     */
    public void updateStatus(int i) throws IOException {
        final long addedBytes = i;
        this.pages.addAndGet(1);
        this.bytes.addAndGet(addedBytes);
    }

    /**
     * Sends a one-line progress summary to the reporter: active threads, pages, errors,
     * pages per second (one decimal place) and kilobits per second, both averaged over
     * the time elapsed since {@code start}.
     *
     * @throws IOException declared by the signature
     */
    private void reportStatus() throws IOException {
        // Whole seconds since start, clamped to 1 so neither rate divides by zero
        // during the first second.
        final long elapsedSeconds = Math.max(1L, (System.currentTimeMillis() - this.start) / 1000);
        // Scale by 10 before rounding and divide afterwards to keep one decimal place.
        final double pagesPerSecond = Math.round((this.pages.get() * 10.0f) / ((float) elapsedSeconds)) / 10.0d;
        // bytes * 8 / 1024 converts the byte total to kilobits.
        final int kilobitsPerSecond = Math.round(((((float) this.bytes.get()) * 8.0f) / 1024.0f) / ((float) elapsedSeconds));
        this.reporter.setStatus(this.activeThreads + " threads, " + this.pages + " pages, " + this.errors + " errors, " + pagesPerSecond + " pages/s, " + kilobitsPerSecond + " kb/s, ");
    }

    /**
     * Adopts the job configuration and caches the values read from it: the segment name
     * and the store-content and parsing flags.
     *
     * @param jobConf job configuration to adopt
     */
    public void configure(JobConf jobConf) {
        setConf(jobConf);
        final String segment = jobConf.get(Nutch.SEGMENT_NAME_KEY);
        final boolean keepContent = isStoringContent(jobConf);
        final boolean parseWhileFetching = isParsing(jobConf);
        this.segmentName = segment;
        this.storingContent = keepContent;
        this.parsing = parseWhileFetching;
    }

    /** Does nothing; this method has an empty body. */
    public void close() {
    }

    /**
     * Reads the {@code fetcher.parse} flag.
     *
     * @param configuration configuration to query
     * @return the configured value, or {@code true} when the key is not set
     */
    public static boolean isParsing(Configuration configuration) {
        final boolean parseByDefault = true;
        return configuration.getBoolean("fetcher.parse", parseByDefault);
    }

    /**
     * Reads the {@code fetcher.store.content} flag.
     *
     * @param configuration configuration to query
     * @return the configured value, or {@code true} when the key is not set
     */
    public static boolean isStoringContent(Configuration configuration) {
        final boolean storeByDefault = true;
        return configuration.getBoolean("fetcher.store.content", storeByDefault);
    }

    /**
     * Runs the fetch: starts the queue feeder and the fetcher threads, then polls once a
     * second, reporting status, until no fetcher thread is active or no request has been
     * started for half of {@code mapred.task.timeout}.
     *
     * @param recordReader input records handed to the queue feeder
     * @param outputCollector collector stored in {@code output}
     * @param reporter reporter stored in {@code reporter}
     * @throws IOException if reporting status fails
     */
    public void run(RecordReader recordReader, OutputCollector outputCollector, Reporter reporter) throws IOException {
        this.output = outputCollector;
        this.reporter = reporter;
        this.fetchQueues = new FetchItemQueues(getConf());

        final int threadCount = getConf().getInt("fetcher.threads.fetch", 10);
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: threads: " + threadCount);
        }

        // The feeder keeps up to 50 queued items per fetcher thread.
        this.feeder = new QueueFeeder(recordReader, this.fetchQueues, threadCount * 50);
        this.feeder.start();

        // Both plugin check flags are switched off before the fetcher threads are created.
        getConf().setBoolean("http.plugin.check.blocking", false);
        getConf().setBoolean("http.plugin.check.robots", false);

        int started = 0;
        while (started < threadCount) {
            new FetcherThread(getConf()).start();
            started++;
        }

        // Allowed time without a new request start: half the task timeout, in milliseconds.
        final long hangLimitMillis = getConf().getInt("mapred.task.timeout", 600000) / 2;
        while (true) {
            try {
                Thread.sleep(1000L);
            } catch (InterruptedException ignored) {
                // Interruption is ignored here; the polling loop simply carries on.
            }
            reportStatus();
            LOG.info("-activeThreads=" + this.activeThreads + ", spinWaiting=" + this.spinWaiting.get() + ", fetchQueues.totalSize=" + this.fetchQueues.getTotalSize());

            // With the feeder finished and only a few items left, dump the queues.
            final boolean feederDone = !this.feeder.isAlive();
            if (feederDone && this.fetchQueues.getTotalSize() < 5) {
                this.fetchQueues.dump();
            }

            final long sinceLastRequest = System.currentTimeMillis() - this.lastRequestStart.get();
            if (sinceLastRequest > hangLimitMillis) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Aborting with " + this.activeThreads + " hung threads.");
                }
                return;
            }

            if (this.activeThreads.get() <= 0) {
                break;
            }
        }
        LOG.info("-activeThreads=" + this.activeThreads);
    }

    /**
     * Configures and runs the fetch job for one segment, blocking until
     * {@code JobClient.runJob} returns.
     *
     * @param path segment directory; its generate subdirectory is the job input and the
     *             segment itself is the job output path
     * @param i value stored under {@code fetcher.threads.fetch}
     * @param z value stored under {@code fetcher.parse}
     * @throws IOException if the job fails
     */
    public void fetch(Path path, int i, boolean z) throws IOException {
        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: starting");
            LOG.info("Fetcher: segment: " + path);
        }

        final NutchJob job = new NutchJob(getConf());
        job.setJobName("fetch " + path);

        // Fetch settings carried in the job configuration.
        job.setInt("fetcher.threads.fetch", i);
        job.set(Nutch.SEGMENT_NAME_KEY, path.getName());
        job.setBoolean("fetcher.parse", z);
        job.setSpeculativeExecution(false);

        // Input: the segment's generate directory, run through this class as the map runner.
        final Path generated = new Path(path, CrawlDatum.GENERATE_DIR_NAME);
        job.setInputPath(generated);
        job.setInputFormat(InputFormat.class);
        job.setMapRunnerClass(Fetcher2.class);

        // Output: (Text, FetcherOutput) pairs written under the segment.
        job.setOutputPath(path);
        job.setOutputFormat(FetcherOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FetcherOutput.class);

        JobClient.runJob(job);

        if (LOG.isInfoEnabled()) {
            LOG.info("Fetcher: done");
        }
    }

    /**
     * Command-line entry point: {@code Fetcher <segment> [-threads n] [-noParsing]}.
     *
     * <p>Prints the usage message and exits with status -1 when the segment argument is
     * missing or when {@code -threads} is given without a value. Unrecognised arguments
     * are ignored.
     *
     * @param strArr command-line arguments
     * @throws Exception if the thread count is not an integer or the fetch job fails
     */
    public static void main(String[] strArr) throws Exception {
        final String usage = "Usage: Fetcher <segment> [-threads n] [-noParsing]";
        if (strArr.length < 1) {
            System.err.println(usage);
            System.exit(-1);
            return;
        }
        final Path segment = new Path(strArr[0]);
        final Configuration conf = NutchConfiguration.create();
        int threads = conf.getInt("fetcher.threads.fetch", 10);
        boolean parsing = true;
        for (int idx = 1; idx < strArr.length; idx++) {
            final String arg = strArr[idx];
            if (arg.equals("-threads")) {
                // The option consumes the next argument; without this check a trailing
                // "-threads" would index past the end of the array.
                if (idx + 1 >= strArr.length) {
                    System.err.println(usage);
                    System.exit(-1);
                    return;
                }
                idx++;
                threads = Integer.parseInt(strArr[idx]);
            } else if (arg.equals("-noParsing")) {
                parsing = false;
            }
        }
        conf.setInt("fetcher.threads.fetch", threads);
        if (!parsing) {
            conf.setBoolean("fetcher.parse", parsing);
        }
        new Fetcher2(conf).fetch(segment, threads, parsing);
    }
}
