package org.apache.nutch.indexer;

import java.io.IOException;
import java.util.Iterator;
import java.util.Random;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.ObjectWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.OutputFormatBase;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.Progressable;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolBase;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexWriter;
import org.apache.nutch.analysis.AnalyzerFactory;
import org.apache.nutch.analysis.NutchAnalyzer;
import org.apache.nutch.analysis.NutchDocumentAnalyzer;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;

/* loaded from: input_file:org/apache/nutch/indexer/Indexer.class */
public class Indexer extends ToolBase implements Reducer, Mapper {
    public static final String DONE_NAME = "index.done";
    public static final Log LOG = LogFactory.getLog(Indexer.class);
    private IndexingFilters filters;
    private ScoringFilters scfilters;

    /* loaded from: input_file:org/apache/nutch/indexer/Indexer$OutputFormat.class */
    public static class OutputFormat extends OutputFormatBase {

        /* renamed from: org.apache.nutch.indexer.Indexer$OutputFormat$1, reason: invalid class name */
        /* loaded from: input_file:org/apache/nutch/indexer/Indexer$OutputFormat$1.class */
        /**
         * Record writer that adds every incoming Lucene {@link Document} to a locally built index
         * and, when closed, optimizes that index and publishes it to the permanent output path.
         *
         * <p>While {@link #close(Reporter)} is running, a helper thread reports the status
         * {@code "closing"} once per second until the close has finished.
         */
        final class AnonymousClass1 implements RecordWriter {
            /**
             * Becomes {@code true} when {@link #close(Reporter)} has finished, successfully or not.
             * It is written by the thread running {@code close} and polled by the heartbeat thread,
             * so it must be {@code volatile} for the update to be guaranteed visible.
             */
            volatile boolean closed;
            /** Supplies the analyzer used for a document, keyed by the document's {@code lang} value. */
            private final AnalyzerFactory factory;
            /** Index writer that receives the documents. */
            private final IndexWriter writer;
            /** File system used to publish the finished index. */
            private final FileSystem fs;
            /** Permanent output location of the index. */
            private final Path perm;
            /** Local working location passed to {@code completeLocalOutput} together with {@link #perm}. */
            private final Path temp;

            AnonymousClass1(AnalyzerFactory analyzerFactory, IndexWriter indexWriter, FileSystem fileSystem, Path path, Path path2) {
                this.factory = analyzerFactory;
                this.writer = indexWriter;
                this.fs = fileSystem;
                this.perm = path;
                this.temp = path2;
            }

            /**
             * Unwraps the document carried by {@code writable} and adds it to the index using the
             * analyzer selected by the document's {@code lang} value.
             *
             * @param writableComparable the record key (not used)
             * @param writable an {@link ObjectWritable} wrapping a {@link Document}
             * @throws IOException if the index writer fails to add the document
             */
            public void write(WritableComparable writableComparable, Writable writable) throws IOException {
                Document document = (Document) ((ObjectWritable) writable).get();
                String lang = document.get("lang");
                NutchAnalyzer nutchAnalyzer = this.factory.get(lang);
                if (Indexer.LOG.isInfoEnabled()) {
                    // Document.get() yields null for a missing field, so a document without a
                    // "url" field no longer makes this log statement throw NullPointerException.
                    Indexer.LOG.info(" Indexing [" + document.get("url") + "] with analyzer " + nutchAnalyzer + " (" + lang + ")");
                }
                this.writer.addDocument(document, nutchAnalyzer);
            }

            /**
             * Optimizes and closes the index, publishes it to the permanent location and creates
             * the {@link Indexer#DONE_NAME} marker file there.
             *
             * @param reporter receives a {@code "closing"} status once per second while this runs
             * @throws IOException if optimizing, closing or publishing the index fails
             */
            public void close(final Reporter reporter) throws IOException {
                Thread heartbeat = new Thread() {
                    @Override // java.lang.Thread, java.lang.Runnable
                    public void run() {
                        while (!AnonymousClass1.this.closed) {
                            try {
                                reporter.setStatus("closing");
                                Thread.sleep(1000L);
                            } catch (InterruptedException ignored) {
                                // Deliberately not re-interrupting: with the flag set, sleep()
                                // would throw immediately and this loop would spin. The loop
                                // ends as soon as 'closed' is set.
                            } catch (Throwable th) {
                                // Status reporting is best effort; stop quietly on any failure.
                                return;
                            }
                        }
                    }
                };
                try {
                    heartbeat.start();
                    if (Indexer.LOG.isInfoEnabled()) {
                        Indexer.LOG.info("Optimizing index.");
                    }
                    this.writer.optimize();
                    this.writer.close();
                    this.fs.completeLocalOutput(this.perm, this.temp);
                    this.fs.createNewFile(new Path(this.perm, Indexer.DONE_NAME));
                } finally {
                    // Always release the heartbeat thread, whether or not the close succeeded.
                    this.closed = true;
                }
            }
        }

        /**
         * Builds the record writer for one output partition.
         *
         * <p>Any existing output at the target path is deleted, a Lucene index writer is opened
         * on a local working directory, and the writer is tuned from the {@code indexer.*}
         * settings of the job configuration.
         *
         * @param fileSystem file system holding the job output
         * @param jobConf job configuration supplying paths and index settings
         * @param str name of this output, resolved against the job output path
         * @param progressable not used
         * @return a writer that indexes documents locally and publishes the index on close
         * @throws IOException if the output cannot be prepared or the index cannot be opened
         */
        public RecordWriter getRecordWriter(FileSystem fileSystem, JobConf jobConf, String str, Progressable progressable) throws IOException {
            final Path perm = new Path(jobConf.getOutputPath(), str);
            // Random suffix gives each writer its own local working directory name.
            final String localName = "index/_" + Integer.toString(new Random().nextInt());
            final Path temp = jobConf.getLocalPath(localName);

            fileSystem.delete(perm);

            final AnalyzerFactory factory = new AnalyzerFactory(jobConf);
            final String localIndexDir = fileSystem.startLocalOutput(perm, temp).toString();
            final IndexWriter writer = new IndexWriter(localIndexDir, new NutchDocumentAnalyzer(jobConf), true);

            // Tunables read from the job configuration, each with its built-in default.
            writer.setMergeFactor(jobConf.getInt("indexer.mergeFactor", 10));
            writer.setMaxBufferedDocs(jobConf.getInt("indexer.minMergeDocs", 100));
            writer.setMaxMergeDocs(jobConf.getInt("indexer.maxMergeDocs", Integer.MAX_VALUE));
            writer.setTermIndexInterval(jobConf.getInt("indexer.termIndexInterval", 128));
            writer.setMaxFieldLength(jobConf.getInt("indexer.max.tokens", 10000));

            // Fixed settings.
            writer.setInfoStream(LogUtil.getInfoStream(Indexer.LOG));
            writer.setUseCompoundFile(false);
            writer.setSimilarity(new NutchSimilarity());

            return new AnonymousClass1(factory, writer, fileSystem, perm, temp);
        }
    }

    /** Creates an indexer with no configuration set. */
    public Indexer() {
        super();
    }

    /**
     * Creates an indexer and hands it the given configuration.
     *
     * @param configuration configuration passed to {@code setConf}
     */
    public Indexer(Configuration configuration) {
        super();
        setConf(configuration);
    }

    /**
     * Stores the job configuration and builds the indexing and scoring filters from it.
     *
     * @param jobConf job configuration to adopt
     */
    public void configure(JobConf jobConf) {
        setConf(jobConf);
        // Both filter sets are created from the configuration that was just stored.
        filters = new IndexingFilters(getConf());
        scfilters = new ScoringFilters(getConf());
    }

    /** Does nothing; this class holds nothing that needs releasing here. */
    public void close() {
        // intentionally empty
    }

    /**
     * Collects all values recorded for one key and emits at most one Lucene document for it.
     *
     * <p>Each value is unwrapped from its {@link ObjectWritable} and sorted by type: inlinks,
     * crawl datums (split into a db-status datum, a fetch-status datum, or a datum with raw
     * status 67), parse data and parse text. Values of any other type are logged and skipped.
     *
     * <p>Nothing is emitted when a status-67 datum was seen, or when the db datum, fetch datum,
     * parse text or parse data is missing. Otherwise a document carrying the stored
     * {@code segment} and {@code digest} fields is passed through the indexing filters, scored
     * by the scoring filters, given the score as its boost and as a stored {@code boost} field,
     * and collected under the original key. A failure in either filter stage is logged as a
     * warning and the document is dropped.
     *
     * @param writableComparable the key; cast to {@link Text} when passed to the filters
     * @param it iterator over {@link ObjectWritable}-wrapped values for the key
     * @param outputCollector receives the key and the wrapped document
     * @param reporter not used
     * @throws IOException declared by the signature; nothing in this method raises it directly
     * @throws RuntimeException if a crawl datum has neither db nor fetch status and its
     *         status is not 67
     */
    public void reduce(WritableComparable writableComparable, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
        Inlinks inlinks = null;
        CrawlDatum dbDatum = null;
        CrawlDatum fetchDatum = null;
        CrawlDatum extraDatum = null;
        ParseData parseData = null;
        ParseText parseText = null;

        while (it.hasNext()) {
            final Object value = ((ObjectWritable) it.next()).get();

            if (value instanceof Inlinks) {
                inlinks = (Inlinks) value;
                continue;
            }
            if (value instanceof CrawlDatum) {
                final CrawlDatum datum = (CrawlDatum) value;
                if (CrawlDatum.hasDbStatus(datum)) {
                    dbDatum = datum;
                } else if (CrawlDatum.hasFetchStatus(datum)) {
                    fetchDatum = datum;
                } else if (datum.getStatus() == 67) {
                    // NOTE(review): 67 looks like an inlined CrawlDatum status constant;
                    // confirm which one against CrawlDatum before replacing the literal.
                    extraDatum = datum;
                } else {
                    throw new RuntimeException("Unexpected status: " + ((int) datum.getStatus()));
                }
                continue;
            }
            if (value instanceof ParseData) {
                parseData = (ParseData) value;
                continue;
            }
            if (value instanceof ParseText) {
                parseText = (ParseText) value;
                continue;
            }
            if (LOG.isWarnEnabled()) {
                LOG.warn("Unrecognized type: " + value.getClass());
            }
        }

        // A document is only built when there is no status-67 datum and every required part exists.
        final boolean indexable = extraDatum == null
                && fetchDatum != null
                && dbDatum != null
                && parseText != null
                && parseData != null;
        if (!indexable) {
            return;
        }

        final Document doc = new Document();
        final Metadata contentMeta = parseData.getContentMeta();
        // Both fields are stored but not indexed.
        doc.add(new Field("segment", contentMeta.get(Nutch.SEGMENT_NAME_KEY), Field.Store.YES, Field.Index.NO));
        doc.add(new Field("digest", contentMeta.get(Nutch.SIGNATURE_KEY), Field.Store.YES, Field.Index.NO));

        final ParseImpl parse = new ParseImpl(parseText, parseData);
        final Text url = (Text) writableComparable;
        try {
            final Document filtered = this.filters.filter(doc, parse, url, fetchDatum, inlinks);
            try {
                final float boost = this.scfilters.indexerScore(url, filtered, dbDatum, fetchDatum, parse, inlinks, 1.0f);
                filtered.setBoost(boost);
                // Keep the boost as a stored field as well.
                filtered.add(new Field("boost", Float.toString(boost), Field.Store.YES, Field.Index.NO));
                outputCollector.collect(writableComparable, new ObjectWritable(filtered));
            } catch (ScoringFilterException scoringFailure) {
                if (LOG.isWarnEnabled()) {
                    LOG.warn("Error calculating score " + writableComparable + ": " + scoringFailure);
                }
            }
        } catch (IndexingException indexingFailure) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Error indexing " + writableComparable + ": " + indexingFailure);
            }
        }
    }

    /**
     * Configures and runs the indexing job.
     *
     * <p>For every segment its crawl-fetch, parse-data and parse-text directories are added as
     * input, followed by the {@code current} directories of {@code path2} and {@code path3}.
     * This class serves as both mapper and reducer, and the output is written through
     * {@link OutputFormat}.
     *
     * @param path output location of the index; also used in the job name
     * @param path2 directory whose {@code current} subdirectory is an input (the crawldb
     *        argument of the command line)
     * @param path3 directory whose {@code current} subdirectory is an input; logged as the linkdb
     * @param pathArr segment directories to index
     * @throws IOException if running the job fails
     */
    public void index(Path path, Path path2, Path path3, Path[] pathArr) throws IOException {
        if (LOG.isInfoEnabled()) {
            LOG.info("Indexer: starting");
            LOG.info("Indexer: linkdb: " + path3);
        }

        final NutchJob job = new NutchJob(getConf());
        job.setJobName("index " + path);

        // Three inputs per segment.
        for (Path segment : pathArr) {
            if (LOG.isInfoEnabled()) {
                LOG.info("Indexer: adding segment: " + segment);
            }
            job.addInputPath(new Path(segment, CrawlDatum.FETCH_DIR_NAME));
            job.addInputPath(new Path(segment, ParseData.DIR_NAME));
            job.addInputPath(new Path(segment, ParseText.DIR_NAME));
        }

        // Then the current state of the two databases.
        job.addInputPath(new Path(path2, "current"));
        job.addInputPath(new Path(path3, "current"));

        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(Indexer.class);
        job.setReducerClass(Indexer.class);

        job.setOutputPath(path);
        job.setOutputFormat(OutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(ObjectWritable.class);

        JobClient.runJob(job);

        if (LOG.isInfoEnabled()) {
            LOG.info("Indexer: done");
        }
    }

    /**
     * Command-line entry point: runs the indexer and exits the JVM with its return code.
     *
     * @param strArr command-line arguments, forwarded to {@code doMain}
     * @throws Exception if {@code doMain} throws
     */
    public static void main(String[] strArr) throws Exception {
        final Indexer indexer = new Indexer();
        final int exitCode = indexer.doMain(NutchConfiguration.create(), strArr);
        System.exit(exitCode);
    }

    /**
     * Parses the command line and runs the indexing job.
     *
     * <p>The first three arguments are the index, crawldb and linkdb locations; every further
     * argument is a segment. At least one segment is required.
     *
     * @param strArr command-line arguments
     * @return {@code 0} on success, {@code -1} on bad usage or when indexing fails
     * @throws Exception declared by the signature; failures of the indexing call itself are
     *         caught, logged as fatal and turned into {@code -1}
     */
    public int run(String[] strArr) throws Exception {
        final int firstSegment = 3;
        if (strArr.length <= firstSegment) {
            System.err.println("Usage: <index> <crawldb> <linkdb> <segment> ...");
            return -1;
        }

        final Path[] segments = new Path[strArr.length - firstSegment];
        for (int s = 0; s < segments.length; s++) {
            segments[s] = new Path(strArr[firstSegment + s]);
        }

        try {
            // The three leading paths are built inside the try so that a failure creating
            // them is reported the same way as a failure of the job.
            index(new Path(strArr[0]), new Path(strArr[1]), new Path(strArr[2]), segments);
        } catch (Exception failure) {
            LOG.fatal("Indexer: " + StringUtils.stringifyException(failure));
            return -1;
        }
        return 0;
    }

    /**
     * Passes each record through unchanged except that the value is wrapped in an
     * {@link ObjectWritable}, which is the form {@code reduce} unwraps.
     *
     * @param writableComparable the record key, emitted as is
     * @param writable the record value to wrap
     * @param outputCollector receives the key and the wrapped value
     * @param reporter not used
     * @throws IOException if collecting the record fails
     */
    public void map(WritableComparable writableComparable, Writable writable, OutputCollector outputCollector, Reporter reporter) throws IOException {
        final ObjectWritable wrapped = new ObjectWritable(writable);
        outputCollector.collect(writableComparable, wrapped);
    }
}
