package org.apache.nutch.parse;

import java.io.IOException;
import java.util.Iterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.UTF8;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.nutch.crawl.SignatureFactory;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.StringUtil;

/* loaded from: input_file:org/apache/nutch/parse/ParseSegment.class */
/**
 * Map/reduce job that parses the fetched content of a segment.
 *
 * <p>The mapper parses each {@link Content} record and emits a {@link ParseImpl}
 * for every record that parsed successfully; the reducer forwards one value per
 * key. {@link #parse(Path)} configures and runs the job for a segment directory.
 */
public class ParseSegment extends Configured implements Mapper, Reducer {
    // NOTE(review): the logger is registered under Parser.class rather than
    // ParseSegment.class. Left unchanged because existing logging configuration
    // may rely on that category -- confirm before renaming.
    public static final Log LOG = LogFactory.getLog(Parser.class);

    /** Scoring filters, created in {@link #configure(JobConf)}. */
    private ScoringFilters scfilters;

    /** Reusable key used when an incoming UTF8 key is converted to Text. */
    private final Text newKey = new Text();

    /** Creates an instance with no configuration. */
    public ParseSegment() {
        this(null);
    }

    /**
     * Creates an instance backed by the given configuration.
     *
     * @param configuration configuration handed to {@link Configured}; may be null
     */
    public ParseSegment(Configuration configuration) {
        super(configuration);
    }

    /**
     * Stores the job configuration and builds the scoring filters from it.
     *
     * @param jobConf configuration of the running job
     */
    public void configure(JobConf jobConf) {
        setConf(jobConf);
        this.scfilters = new ScoringFilters(jobConf);
    }

    /** No resources to release. */
    public void close() {
    }

    /**
     * Parses one content record and emits the parse result when parsing succeeded.
     *
     * <p>Records whose parse fails (either by exception or by a non-successful
     * {@link ParseStatus}) are logged at warn level and skipped. A
     * {@link ScoringFilterException} raised while passing the score is logged at
     * warn level and the record is not emitted.
     *
     * @param writableComparable record key; a UTF8 key is converted to Text
     * @param writable the {@link Content} to parse
     * @param outputCollector receives the key and a {@link ParseImpl}
     * @param reporter unused
     * @throws IOException if collecting the output fails
     */
    public void map(WritableComparable writableComparable, Writable writable, OutputCollector outputCollector, Reporter reporter) throws IOException {
        // The key is cast to Text further down, so UTF8 keys are copied into the reusable Text.
        if (writableComparable instanceof UTF8) {
            this.newKey.set(writableComparable.toString());
            writableComparable = this.newKey;
        }
        Content content = (Content) writable;
        content.forceInflate();

        Parse parse = null;
        ParseStatus parseStatus;
        try {
            parse = new ParseUtil(getConf()).parse(content);
            parseStatus = parse.getData().getStatus();
        } catch (Exception e) {
            parseStatus = new ParseStatus(e);
        }

        // Handle failure before touching the parse: on this path the parse (or its
        // data) may be null, and nothing is emitted for the record anyway.
        if (!parseStatus.isSuccess()) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Error parsing: " + writableComparable + ": " + parseStatus.toString());
            }
            return;
        }

        // Only a successful parse is annotated with its signature and segment name.
        byte[] signature = SignatureFactory.getSignature(getConf()).calculate(content, parse);
        parse.getData().getContentMeta().set(Nutch.SIGNATURE_KEY, StringUtil.toHexString(signature));
        parse.getData().getContentMeta().set(Nutch.SEGMENT_NAME_KEY, getConf().get(Nutch.SEGMENT_NAME_KEY));

        try {
            this.scfilters.passScoreAfterParsing((Text) writableComparable, content, parse);
            outputCollector.collect(writableComparable, new ParseImpl(parse.getText(), parse.getData()));
        } catch (ScoringFilterException e2) {
            if (LOG.isWarnEnabled()) {
                e2.printStackTrace(LogUtil.getWarnStream(LOG));
                LOG.warn("Error passing score: " + writableComparable + ": " + e2.getMessage());
            }
        }
    }

    /**
     * Emits the first value seen for the key; any further values are ignored.
     *
     * @param writableComparable record key
     * @param it values collected for the key
     * @param outputCollector receives the key and its first value
     * @param reporter unused
     * @throws IOException if collecting the output fails
     */
    public void reduce(WritableComparable writableComparable, Iterator it, OutputCollector outputCollector, Reporter reporter) throws IOException {
        outputCollector.collect(writableComparable, (Writable) it.next());
    }

    /**
     * Configures and runs the parse job for a segment.
     *
     * <p>Input is read from the {@code content} sub-directory of the segment; the
     * segment directory itself is the job output path, and the segment name is
     * stored in the job configuration under {@link Nutch#SEGMENT_NAME_KEY}.
     *
     * @param path segment directory
     * @throws IOException if running the job fails
     */
    public void parse(Path path) throws IOException {
        if (LOG.isInfoEnabled()) {
            LOG.info("Parse: starting");
            LOG.info("Parse: segment: " + path);
        }
        NutchJob nutchJob = new NutchJob(getConf());
        nutchJob.setJobName("parse " + path);
        nutchJob.setInputPath(new Path(path, "content"));
        nutchJob.set(Nutch.SEGMENT_NAME_KEY, path.getName());
        nutchJob.setInputFormat(SequenceFileInputFormat.class);
        // This class serves as both mapper and reducer.
        nutchJob.setMapperClass(ParseSegment.class);
        nutchJob.setReducerClass(ParseSegment.class);
        nutchJob.setOutputPath(path);
        nutchJob.setOutputFormat(ParseOutputFormat.class);
        nutchJob.setOutputKeyClass(Text.class);
        nutchJob.setOutputValueClass(ParseImpl.class);
        JobClient.runJob(nutchJob);
        if (LOG.isInfoEnabled()) {
            LOG.info("Parse: done");
        }
    }

    /**
     * Command-line entry point: parses the segment named by the first argument.
     *
     * @param strArr command-line arguments; {@code strArr[0]} is the segment path
     * @throws Exception if the parse job fails
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length == 0) {
            System.err.println("Usage: ParseSegment segment");
            System.exit(-1);
        }
        new ParseSegment(NutchConfiguration.create()).parse(new Path(strArr[0]));
    }
}
