package org.apache.nutch.analysis.lang;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.IndexingException;
import org.apache.nutch.indexer.IndexingFilter;
import org.apache.nutch.indexer.NutchDocument;
import org.apache.nutch.indexer.lucene.LuceneWriter;
import org.apache.nutch.parse.Parse;

/* loaded from: input_file:org/apache/nutch/analysis/lang/LanguageIndexingFilter.class */
/**
 * Indexing filter that stores the language of a page in the {@code lang}
 * field of the indexed document.
 *
 * <p>The language is taken from the first source that yields a usable value:
 * <ol>
 *   <li>the {@code language} entry of the parse metadata,</li>
 *   <li>the {@code Content-Language} entry of the content metadata,</li>
 *   <li>the {@link LanguageIdentifier} run over the page title and text,</li>
 *   <li>the literal {@code unknown}.</li>
 * </ol>
 *
 * <p>{@link #setConf(Configuration)} must be called before
 * {@link #filter(NutchDocument, Parse, Text, CrawlDatum, Inlinks)}, because
 * the language identifier is only created there.
 */
public class LanguageIndexingFilter implements IndexingFilter {
    /** Name of the index field that receives the language. */
    private static final String LANG_FIELD = "lang";
    /** Parse-metadata key consulted first. */
    private static final String PARSE_META_LANGUAGE = "language";
    /** Content-metadata key consulted second. */
    private static final String CONTENT_LANGUAGE = "Content-Language";
    /** Value indexed when no source yields a language. */
    private static final String UNKNOWN_LANGUAGE = "unknown";

    private Configuration conf;
    private LanguageIdentifier languageIdentifier;

    /**
     * Adds the {@code lang} field to the given document.
     *
     * @param nutchDocument document being indexed; receives the {@code lang} field
     * @param parse parse result supplying the metadata, title and text
     * @param text not used by this filter
     * @param crawlDatum not used by this filter
     * @param inlinks not used by this filter
     * @return the same {@code nutchDocument} instance that was passed in
     * @throws IndexingException declared for the filter contract; no statement
     *         in this method throws it directly
     */
    public NutchDocument filter(NutchDocument nutchDocument, Parse parse, Text text, CrawlDatum crawlDatum, Inlinks inlinks) throws IndexingException {
        String lang = parse.getData().getParseMeta().get(PARSE_META_LANGUAGE);
        if (isMissing(lang)) {
            lang = parse.getData().getContentMeta().get(CONTENT_LANGUAGE);
        }
        if (isMissing(lang)) {
            // Only fall back to statistical identification when the metadata is silent.
            lang = this.languageIdentifier.identify(buildSample(parse));
        }
        if (isMissing(lang)) {
            lang = UNKNOWN_LANGUAGE;
        }
        nutchDocument.add(LANG_FIELD, lang);
        return nutchDocument;
    }

    /**
     * Registers the Lucene options of the {@code lang} field: stored and
     * indexed untokenized.
     *
     * @param configuration configuration that receives the field options
     */
    public void addIndexBackendOptions(Configuration configuration) {
        LuceneWriter.addFieldOptions(LANG_FIELD, LuceneWriter.STORE.YES, LuceneWriter.INDEX.UNTOKENIZED, configuration);
    }

    /**
     * Stores the configuration and creates a new {@link LanguageIdentifier}
     * from it.
     *
     * @param configuration configuration to keep and to build the identifier with
     */
    public void setConf(Configuration configuration) {
        this.conf = configuration;
        this.languageIdentifier = new LanguageIdentifier(configuration);
    }

    /**
     * Returns the configuration last passed to {@link #setConf(Configuration)}.
     *
     * @return the stored configuration, or {@code null} if none was set
     */
    public Configuration getConf() {
        return this.conf;
    }

    /** Tells whether a candidate language is absent, i.e. null, empty or only whitespace. */
    private static boolean isMissing(String candidate) {
        return candidate == null || candidate.trim().length() == 0;
    }

    /**
     * Builds the "title text" sample handed to the language identifier.
     * Null parts are skipped, because appending a null reference to a
     * StringBuilder would insert the literal characters "null".
     */
    private static StringBuilder buildSample(Parse parse) {
        StringBuilder sample = new StringBuilder();
        String title = parse.getData().getTitle();
        if (title != null) {
            sample.append(title);
        }
        sample.append(" ");
        String body = parse.getText();
        if (body != null) {
            sample.append(body);
        }
        return sample;
    }
}
