package org.apache.nutch.parse;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputFormat;
import org.apache.hadoop.mapred.RecordWriter;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.Progressable;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.StringUtil;

/* loaded from: input_file:org/apache/nutch/parse/ParseOutputFormat.class */
/**
 * Output format that writes the result of parsing a segment.
 *
 * <p>For every (url, {@link Parse}) record three outputs are produced under the
 * job output path:
 * <ul>
 *   <li>a {@link ParseText} entry in the {@code ParseText.DIR_NAME} map file,</li>
 *   <li>a {@link ParseData} entry in the {@code ParseData.DIR_NAME} map file,</li>
 *   <li>{@link CrawlDatum} entries (page signature, one entry per accepted
 *       outlink, and an optional score adjustment) in the
 *       {@code CrawlDatum.PARSE_DIR_NAME} sequence file.</li>
 * </ul>
 */
public class ParseOutputFormat implements OutputFormat {
    private static final Log LOG = LogFactory.getLog(ParseOutputFormat.class);

    // NOTE(review): these literals were inlined by the compiler; they presumably
    // correspond to CrawlDatum.STATUS_SIGNATURE and CrawlDatum.STATUS_LINKED.
    // Confirm against CrawlDatum and reference those constants directly if so.
    private static final int STATUS_SIGNATURE = 65;
    private static final int STATUS_LINKED = 67;

    private URLNormalizers urlNormalizers;
    private URLFilters filters;
    private ScoringFilters scfilters;

    /**
     * Refuses to run when the segment already contains parse output.
     *
     * @throws IOException if the {@code CrawlDatum.PARSE_DIR_NAME} directory
     *         already exists under the job output path
     */
    public void checkOutputSpecs(FileSystem fileSystem, JobConf jobConf) throws IOException {
        if (fileSystem.exists(new Path(jobConf.getOutputPath(), CrawlDatum.PARSE_DIR_NAME))) {
            throw new IOException("Segment already parsed!");
        }
    }

    /**
     * Creates the writer that fans a parse result out to the text, data and
     * crawl outputs.
     *
     * @param fileSystem   file system the outputs are created on
     * @param jobConf      job configuration; {@code db.default.fetch.interval}
     *                     and {@code db.ignore.external.links} are read here
     * @param str          name of the output part, appended to each output directory
     * @param progressable unused
     * @return a writer expecting a {@link Text} URL key and a {@link Parse} value
     * @throws IOException if one of the underlying writers cannot be created
     */
    public RecordWriter getRecordWriter(FileSystem fileSystem, JobConf jobConf, String str, Progressable progressable) throws IOException {
        this.urlNormalizers = new URLNormalizers(jobConf, URLNormalizers.SCOPE_OUTLINK);
        this.filters = new URLFilters(jobConf);
        this.scfilters = new ScoringFilters(jobConf);

        final float interval = jobConf.getFloat("db.default.fetch.interval", 30.0f);
        final boolean ignoreExternalLinks = jobConf.getBoolean("db.ignore.external.links", false);

        Path textPath = new Path(new Path(jobConf.getOutputPath(), ParseText.DIR_NAME), str);
        Path dataPath = new Path(new Path(jobConf.getOutputPath(), ParseData.DIR_NAME), str);
        Path crawlPath = new Path(new Path(jobConf.getOutputPath(), CrawlDatum.PARSE_DIR_NAME), str);

        final MapFile.Writer textOut = new MapFile.Writer(jobConf, fileSystem, textPath.toString(), Text.class, ParseText.class, SequenceFile.CompressionType.RECORD);
        final MapFile.Writer dataOut = new MapFile.Writer(jobConf, fileSystem, dataPath.toString(), Text.class, ParseData.class);
        final SequenceFile.Writer crawlOut = SequenceFile.createWriter(fileSystem, jobConf, crawlPath, Text.class, CrawlDatum.class);

        return new RecordWriter() {
            /**
             * Writes one parsed page.
             *
             * @param writableComparable the page URL (cast to {@link Text} when scoring)
             * @param writable           the {@link Parse} for that page
             */
            public void write(WritableComparable writableComparable, Writable writable) throws IOException {
                Parse parse = (Parse) writable;
                String fromUrl = writableComparable.toString();

                textOut.append(writableComparable, new ParseText(parse.getText()));

                ParseData data = parse.getData();

                // Record the page signature, when the content metadata carries a valid one.
                String signatureHex = data.getContentMeta().get(Nutch.SIGNATURE_KEY);
                if (signatureHex != null) {
                    byte[] signature = StringUtil.fromHexString(signatureHex);
                    if (signature != null) {
                        CrawlDatum signatureDatum = new CrawlDatum(STATUS_SIGNATURE, 0.0f);
                        signatureDatum.setSignature(signature);
                        crawlOut.append(writableComparable, signatureDatum);
                    }
                }

                Outlink[] outlinks = data.getOutlinks();
                // The source host is only needed when external links are being dropped.
                String fromHost = ignoreExternalLinks ? hostOf(fromUrl) : null;

                // First pass: normalize and filter every outlink. The number of
                // surviving links must be known before any score is distributed.
                String[] toUrls = new String[outlinks.length];
                int validCount = 0;
                for (int i = 0; i < outlinks.length; i++) {
                    String toUrl;
                    try {
                        toUrl = filters.filter(urlNormalizers.normalize(outlinks[i].getToUrl(), URLNormalizers.SCOPE_OUTLINK));
                    } catch (Exception e) {
                        // Best effort: a link that cannot be normalized or filtered is dropped.
                        toUrl = null;
                    }
                    // Ignore links that point back to the page itself.
                    if (fromUrl.equals(toUrl)) {
                        toUrl = null;
                    }
                    if (toUrl != null) {
                        validCount++;
                    }
                    toUrls[i] = toUrl;
                }

                // Second pass: emit one datum per accepted outlink.
                for (int i = 0; i < toUrls.length; i++) {
                    if (toUrls[i] == null) {
                        continue;
                    }
                    if (ignoreExternalLinks) {
                        String toHost = hostOf(toUrls[i]);
                        // Skip links whose host is unknown or differs from the source host.
                        if (toHost == null || !toHost.equals(fromHost)) {
                            continue;
                        }
                    }
                    CrawlDatum target = new CrawlDatum(STATUS_LINKED, interval);
                    Text targetUrl = new Text(toUrls[i]);
                    try {
                        CrawlDatum adjust = scfilters.distributeScoreToOutlink((Text) writableComparable, targetUrl, data, target, null, outlinks.length, validCount);
                        crawlOut.append(targetUrl, target);
                        // A non-null result is written back under the source URL.
                        if (adjust != null) {
                            crawlOut.append(writableComparable, adjust);
                        }
                    } catch (ScoringFilterException e) {
                        if (LOG.isWarnEnabled()) {
                            LOG.warn("Cannot distribute score from " + writableComparable + " to " + targetUrl + " - skipped (" + e.getMessage() + ")");
                        }
                    }
                }

                dataOut.append(writableComparable, data);
            }

            /**
             * Closes all three writers; every writer is closed even when an
             * earlier close fails.
             */
            public void close(Reporter reporter) throws IOException {
                try {
                    textOut.close();
                } finally {
                    try {
                        dataOut.close();
                    } finally {
                        crawlOut.close();
                    }
                }
            }
        };
    }

    /**
     * Returns the lower-cased host of {@code url}, or {@code null} when the
     * URL is malformed.
     */
    private static String hostOf(String url) {
        try {
            return new URL(url).getHost().toLowerCase();
        } catch (MalformedURLException e) {
            return null;
        }
    }
}
