package org.apache.nutch.analysis.lang;

import java.io.IOException;
import java.io.InputStream;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.parse.HTMLMetaTags;
import org.apache.nutch.parse.HtmlParseFilter;
import org.apache.nutch.parse.Parse;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.util.NodeWalker;
import org.w3c.dom.DocumentFragment;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;

/* loaded from: input_file:org/apache/nutch/analysis/lang/HTMLLanguageParser.class */
/**
 * HTML parse filter that records a document's language in its parse metadata
 * under the {@code "language"} key.
 *
 * <p>The language is taken from already-extracted parse metadata
 * ({@code dc.language}, {@code content-language}, {@code lang}) when one of
 * those is present; otherwise the DOM is inspected by {@link LanguageParser}.
 *
 * <p>Language codes found in the DOM are normalised through a lookup table
 * loaded once, at class-initialisation time, from the
 * {@code langmappings.properties} classpath resource located next to this class.
 */
public class HTMLLanguageParser implements HtmlParseFilter {
    public static final Log LOG = LogFactory.getLog(HTMLLanguageParser.class);

    /** Classpath resource (resolved relative to this class) holding the mappings. */
    private static final String MAPPINGS_RESOURCE = "langmappings.properties";

    /** Parse-metadata key under which the detected language is stored. */
    private static final String LANGUAGE_KEY = "language";

    /**
     * Lookup table from a lower-cased alias (or the property key itself) to the
     * property key it belongs to. Populated only by the static initializer.
     */
    private static final Map<String, String> LANGUAGES_MAP = new HashMap<String, String>();

    private Configuration conf;

    static {
        // A failure here must not prevent the class from loading: the filter
        // then simply runs with an empty table and never resolves a DOM language.
        try {
            loadLanguageMappings();
        } catch (Exception e) {
            if (LOG.isFatalEnabled()) {
                LOG.fatal("Unable to load language mappings from " + MAPPINGS_RESOURCE, e);
            }
        }
    }

    /**
     * Reads {@link #MAPPINGS_RESOURCE} and fills {@link #LANGUAGES_MAP}. Each
     * property key is registered as-is, and each comma-separated alias in its
     * value is registered trimmed and lower-cased.
     *
     * @throws IOException if the resource is missing or cannot be read
     */
    private static void loadLanguageMappings() throws IOException {
        InputStream in = HTMLLanguageParser.class.getResourceAsStream(MAPPINGS_RESOURCE);
        if (in == null) {
            throw new IOException("Resource not found on classpath: " + MAPPINGS_RESOURCE);
        }
        Properties properties = new Properties();
        try {
            properties.load(in);
        } finally {
            in.close();
        }

        Enumeration<?> keys = properties.keys();
        while (keys.hasMoreElements()) {
            String language = (String) keys.nextElement();
            LANGUAGES_MAP.put(language, language);
            for (String alias : properties.getProperty(language).split(",", -1)) {
                // Locale.ROOT keeps the table independent of the JVM default
                // locale (e.g. Turkish dotless-i lower-casing).
                String normalized = alias.trim().toLowerCase(Locale.ROOT);
                // A trailing comma yields an empty alias; registering it would
                // make an absent/empty lang attribute resolve to a language.
                if (normalized.length() > 0) {
                    LANGUAGES_MAP.put(normalized, language);
                }
            }
        }
    }

    /**
     * Extracts a language from a DOM tree. Three sources are consulted, in
     * decreasing order of precedence:
     * <ol>
     *   <li>the first element {@code lang} attribute that maps to a known language,</li>
     *   <li>a {@code <meta name="dc.language" content="...">} tag,</li>
     *   <li>a {@code <meta http-equiv="content-language" content="...">} tag.</li>
     * </ol>
     */
    static class LanguageParser {
        /** Characters that separate candidate language tokens in an attribute value. */
        private static final Pattern TOKEN_SEPARATORS = Pattern.compile(",| |;|\\.|\\(|\\)|=");

        private String dublinCore = null;
        private String htmlAttribute = null;
        private String httpEquiv = null;
        private String language;

        /**
         * Walks {@code node} and resolves the language according to the
         * precedence described on the class.
         *
         * @param node root of the DOM (sub)tree to inspect
         */
        LanguageParser(Node node) {
            this.language = null;
            parse(node);
            if (this.htmlAttribute != null) {
                this.language = this.htmlAttribute;
            } else if (this.dublinCore != null) {
                this.language = this.dublinCore;
            } else {
                this.language = this.httpEquiv;
            }
        }

        /**
         * @return the resolved language, or {@code null} if none of the three
         *         sources produced a known language
         */
        String getLanguage() {
            return this.language;
        }

        /**
         * Traverses the tree rooted at {@code node}, filling in whichever of the
         * three language sources are still unset.
         *
         * @param node root of the DOM (sub)tree to inspect
         */
        void parse(Node node) {
            NodeWalker walker = new NodeWalker(node);
            while (walker.hasNext()) {
                Node current = walker.nextNode();
                if (current.getNodeType() != Node.ELEMENT_NODE) {
                    continue;
                }

                if (this.htmlAttribute == null) {
                    this.htmlAttribute = parseLanguage(((Element) current).getAttribute("lang"));
                }

                if ("meta".equalsIgnoreCase(current.getNodeName())) {
                    NamedNodeMap attributes = current.getAttributes();
                    if (this.dublinCore == null) {
                        this.dublinCore = parseLanguage(metaContent(attributes, "name", "dc.language"));
                    }
                    if (this.httpEquiv == null) {
                        this.httpEquiv = parseLanguage(metaContent(attributes, "http-equiv", "content-language"));
                    }
                }

                // The lang attribute outranks both meta sources, so once it is
                // known nothing further in the tree can change the result.
                if (this.htmlAttribute != null) {
                    return;
                }
            }
        }

        /**
         * Returns the {@code content} attribute value of a meta tag if the tag
         * carries an attribute whose name equals {@code keyName} and whose value
         * equals {@code keyValue} (both compared case-insensitively).
         *
         * @return the raw {@code content} value, or {@code null} when the tag
         *         does not match or has no {@code content} attribute
         */
        private static String metaContent(NamedNodeMap attributes, String keyName, String keyValue) {
            for (int i = 0; i < attributes.getLength(); i++) {
                Node attribute = attributes.item(i);
                if (keyName.equalsIgnoreCase(attribute.getNodeName())
                        && keyValue.equalsIgnoreCase(attribute.getNodeValue())) {
                    Node content = attributes.getNamedItem("content");
                    return content == null ? null : content.getNodeValue();
                }
            }
            return null;
        }

        /**
         * Maps a free-form attribute value (e.g. {@code "en-US, fr"}) to a known
         * language. The value is split on {@code , ; . ( ) =} and spaces; for
         * each token the part before the first {@code '-'} or {@code '_'} is
         * lower-cased and looked up, and the first hit wins.
         *
         * @param str raw attribute value, may be {@code null}
         * @return the mapped language, or {@code null} if {@code str} is
         *         {@code null} or no token is a known language code
         */
        static final String parseLanguage(String str) {
            if (str == null) {
                return null;
            }
            for (String token : TOKEN_SEPARATORS.split(str, -1)) {
                String code = primarySubtag(token).toLowerCase(Locale.ROOT);
                if (code.length() == 0) {
                    continue;
                }
                String language = HTMLLanguageParser.LANGUAGES_MAP.get(code);
                if (language != null) {
                    return language;
                }
            }
            return null;
        }

        /**
         * Returns the part of {@code token} preceding the first {@code '-'} or
         * {@code '_'}, or the whole token if it contains neither. Implemented
         * with an index scan rather than {@code split(...)[0]} because
         * {@code "-".split("-")} is an empty array and indexing it would throw.
         */
        private static String primarySubtag(String token) {
            for (int i = 0; i < token.length(); i++) {
                char c = token.charAt(i);
                if (c == '-' || c == '_') {
                    return token.substring(0, i);
                }
            }
            return token;
        }
    }

    /**
     * Sets the {@code "language"} entry of the parse metadata for the parse
     * registered under {@code content.getUrl()}.
     *
     * <p>A language already present in the parse metadata is copied verbatim
     * (it is not normalised through the mapping table); otherwise the DOM is
     * inspected. If no language can be determined, or no parse exists for the
     * content URL, the metadata is left untouched.
     *
     * @param content          fetched content; only its URL is used
     * @param parseResult      parse results to annotate
     * @param hTMLMetaTags     unused
     * @param documentFragment DOM of the parsed document
     * @return the {@code parseResult} instance that was passed in
     */
    public ParseResult filter(Content content, ParseResult parseResult, HTMLMetaTags hTMLMetaTags, DocumentFragment documentFragment) {
        Parse parse = parseResult.get(content.getUrl());
        if (parse == null) {
            // Nothing to annotate for this URL.
            return parseResult;
        }

        Metadata parseMeta = parse.getData().getParseMeta();
        String language = getLanguageFromMetadata(parseMeta);
        if (language == null) {
            language = new LanguageParser(documentFragment).getLanguage();
        }
        if (language != null) {
            parseMeta.set(LANGUAGE_KEY, language);
        }
        return parseResult;
    }

    /**
     * Returns the first non-null value among the {@code dc.language},
     * {@code content-language} and {@code lang} metadata entries, in that order.
     *
     * @return the raw metadata value, or {@code null} if none is set
     */
    private static String getLanguageFromMetadata(Metadata metadata) {
        String language = metadata.get("dc.language");
        if (language == null) {
            language = metadata.get("content-language");
        }
        if (language == null) {
            language = metadata.get("lang");
        }
        return language;
    }

    /** Stores the configuration supplied by the plugin framework. */
    public void setConf(Configuration configuration) {
        this.conf = configuration;
    }

    /** @return the configuration previously passed to {@link #setConf}, or {@code null} */
    public Configuration getConf() {
        return this.conf;
    }
}
