package org.apache.nutch.crawl;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;

/* loaded from: input_file:org/apache/nutch/crawl/CrawlDbReducer.class */
public class CrawlDbReducer implements Reducer {
    public static final Log LOG = LogFactory.getLog(CrawlDbReducer.class);
    private int retryMax;
    private CrawlDatum result = new CrawlDatum();
    private ArrayList linked = new ArrayList();
    private ScoringFilters scfilters = null;
    private boolean additionsAllowed;

    public void configure(JobConf jobConf) {
        this.retryMax = jobConf.getInt("db.fetch.retry.max", 3);
        this.scfilters = new ScoringFilters(jobConf);
        this.additionsAllowed = jobConf.getBoolean(CrawlDb.CRAWLDB_ADDITIONS_ALLOWED, true);
    }

    public void close() {
    }

    public void reduce(WritableComparable writableComparable, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
        CrawlDatum crawlDatum = null;
        CrawlDatum crawlDatum2 = null;
        byte[] bArr = null;
        this.linked.clear();
        while (it.hasNext()) {
            CrawlDatum crawlDatum3 = (CrawlDatum) it.next();
            if (!CrawlDatum.hasDbStatus(crawlDatum3)) {
                if (!CrawlDatum.hasFetchStatus(crawlDatum3)) {
                    switch (crawlDatum3.getStatus()) {
                        case CrawlDatum.STATUS_SIGNATURE /* 65 */:
                            bArr = crawlDatum3.getSignature();
                            break;
                        case CrawlDatum.STATUS_LINKED /* 67 */:
                            this.linked.add(crawlDatum3);
                            break;
                        default:
                            LOG.warn("Unknown status, key: " + writableComparable + ", datum: " + crawlDatum3);
                            break;
                    }
                } else if (crawlDatum == null) {
                    crawlDatum = crawlDatum3;
                } else if (crawlDatum.getFetchTime() < crawlDatum3.getFetchTime()) {
                    crawlDatum = crawlDatum3;
                }
            } else if (crawlDatum2 == null) {
                crawlDatum2 = crawlDatum3;
            } else if (crawlDatum2.getFetchTime() < crawlDatum3.getFetchTime()) {
                crawlDatum2 = crawlDatum3;
            }
        }
        if (crawlDatum2 != null || this.additionsAllowed) {
            if (crawlDatum == null && this.linked.size() > 0) {
                crawlDatum = (CrawlDatum) this.linked.get(0);
            }
            if (crawlDatum == null) {
                if (crawlDatum2 != null) {
                    outputCollector.collect(writableComparable, crawlDatum2);
                    return;
                } else {
                    LOG.warn("Missing fetch and old value, signature=" + bArr);
                    return;
                }
            }
            this.result.set(crawlDatum);
            if (crawlDatum2 != null) {
                if (crawlDatum2.getMetaData().size() > 0) {
                    this.result.getMetaData().putAll(crawlDatum2.getMetaData());
                    if (crawlDatum.getMetaData().size() > 0) {
                        this.result.getMetaData().putAll(crawlDatum.getMetaData());
                    }
                }
                if (crawlDatum2.getModifiedTime() > 0 && crawlDatum.getModifiedTime() == 0) {
                    this.result.setModifiedTime(crawlDatum2.getModifiedTime());
                }
            }
            switch (crawlDatum.getStatus()) {
                case CrawlDatum.STATUS_FETCH_SUCCESS /* 33 */:
                    if (crawlDatum.getSignature() == null) {
                        this.result.setSignature(bArr);
                    }
                    this.result.setStatus(2);
                    this.result.setNextFetchTime();
                    break;
                case CrawlDatum.STATUS_FETCH_RETRY /* 34 */:
                    if (crawlDatum2 != null) {
                        this.result.setSignature(crawlDatum2.getSignature());
                    }
                    if (crawlDatum.getRetriesSinceFetch() >= this.retryMax) {
                        this.result.setStatus(3);
                        break;
                    } else {
                        this.result.setStatus(1);
                        break;
                    }
                case CrawlDatum.STATUS_FETCH_REDIR_TEMP /* 35 */:
                    if (crawlDatum.getSignature() == null) {
                        this.result.setSignature(bArr);
                    }
                    this.result.setStatus(4);
                    this.result.setNextFetchTime();
                    break;
                case CrawlDatum.STATUS_FETCH_REDIR_PERM /* 36 */:
                    if (crawlDatum.getSignature() == null) {
                        this.result.setSignature(bArr);
                    }
                    this.result.setStatus(5);
                    this.result.setNextFetchTime();
                    break;
                case CrawlDatum.STATUS_FETCH_GONE /* 37 */:
                    if (crawlDatum2 != null) {
                        this.result.setSignature(crawlDatum2.getSignature());
                    }
                    this.result.setStatus(3);
                    break;
                case CrawlDatum.STATUS_SIGNATURE /* 65 */:
                    if (LOG.isWarnEnabled()) {
                        LOG.warn("Lone CrawlDatum.STATUS_SIGNATURE: " + writableComparable);
                        return;
                    }
                    return;
                case CrawlDatum.STATUS_LINKED /* 67 */:
                    if (crawlDatum2 == null) {
                        this.result.setStatus(1);
                        try {
                            this.scfilters.initialScore((Text) writableComparable, this.result);
                            break;
                        } catch (ScoringFilterException e) {
                            if (LOG.isWarnEnabled()) {
                                LOG.warn("Cannot filter init score for url " + writableComparable + ", using default: " + e.getMessage());
                            }
                            this.result.setScore(0.0f);
                            break;
                        }
                    } else {
                        this.result.set(crawlDatum2);
                        break;
                    }
                default:
                    throw new RuntimeException("Unknown status: " + ((int) crawlDatum.getStatus()) + " " + writableComparable);
            }
            try {
                this.scfilters.updateDbScore((Text) writableComparable, crawlDatum2, this.result, this.linked);
            } catch (Exception e2) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Couldn't update score, key=" + writableComparable + ": " + e2);
                }
            }
            this.result.getMetaData().remove(Nutch.WRITABLE_GENERATE_TIME_KEY);
            outputCollector.collect(writableComparable, this.result);
        }
    }
}
