package de.dfki.km.leech.parser.wikipedia;

import de.dfki.inquisition.collections.MultiValueBalancedTreeMap;
import de.dfki.inquisition.collections.MultiValueHashMap;
import de.dfki.inquisition.text.StringUtils;
import de.dfki.km.leech.metadata.LeechMetadata;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingParser;
import de.dfki.km.leech.util.TikaUtils;
import info.bliki.wiki.filter.PlainTextConverter;
import info.bliki.wiki.model.WikiModel;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.rmi.server.UID;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.MatchResult;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import org.apache.commons.lang3.StringEscapeUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/* loaded from: input_file:de/dfki/km/leech/parser/wikipedia/WikipediaDumpParser.class */
public class WikipediaDumpParser implements Parser {
    /** Metadata key under which external link targets are stored by {@code parseLinksAndCategories}. */
    public static final String externalLink = "externalLink";
    /** Metadata key under which the name of a parsed infobox is stored by {@code parseInfoBox}. */
    public static final String infobox = "infobox";
    /** Metadata key under which internal (wiki) link targets are stored by {@code parseLinksAndCategories}. */
    public static final String internalLink = "internalLink";
    /**
     * Shared wiki model used together with a {@code PlainTextConverter} to render wiki markup.
     * It is constructed with placeholder image/title URL templates.
     */
    protected static final WikiModel m_wikiModel = new WikiModel("http://www.mywiki.com/wiki/${image}", "http://www.mywiki.com/wiki/${title}");
    private static final long serialVersionUID = -7801896202662990477L;
    /**
     * Matches a coordinate written as {@code degrees/minutes/seconds/direction}, where the direction is one of
     * N, E, S, W. Minutes and seconds may be empty. Groups 1-3 hold the numeric parts, group 4 the direction.
     */
    protected Pattern dmsCoordinatePattern = Pattern.compile("(\\d+\\.?\\d*)/(\\d*+\\.?\\d*)/(\\d*\\.?\\d*)/([NESW])");

    /* loaded from: input_file:de/dfki/km/leech/parser/wikipedia/WikipediaDumpParser$WikipediaDumpParserConfig.class */
    /**
     * Switches that control which optional extraction steps {@code WikipediaDumpParser.parse} performs.
     * All setters return the config itself so that calls can be chained.
     */
    public static class WikipediaDumpParserConfig {
        /** Whether a first pass over the dump collects redirect pages. Enabled by default. */
        protected boolean determinePageRedirects = true;
        /** Whether coordinate templates in the page text are turned into metadata. Enabled by default. */
        protected boolean parseGeoCoordinates = true;
        /** Whether infoboxes are parsed into metadata. Disabled by default. */
        protected boolean parseInfoBoxes = false;
        /** Whether links and categories are parsed into metadata. Disabled by default. */
        protected boolean parseLinksAndCategories = false;

        /** @return the current value of the redirect-detection switch */
        public boolean getDeterminePageRedirects() {
            return determinePageRedirects;
        }

        /** @return the current value of the geo-coordinate switch */
        public boolean getParseGeoCoordinates() {
            return parseGeoCoordinates;
        }

        /** @return the current value of the infobox switch */
        public boolean getParseInfoBoxes() {
            return parseInfoBoxes;
        }

        /** @return the current value of the links-and-categories switch */
        public boolean getParseLinksAndCategories() {
            return parseLinksAndCategories;
        }

        /**
         * @param enabled new value of the redirect-detection switch
         * @return this config, for chaining
         */
        public WikipediaDumpParserConfig setDeterminePageRedirects(boolean enabled) {
            determinePageRedirects = enabled;
            return this;
        }

        /**
         * @param enabled new value of the geo-coordinate switch
         * @return this config, for chaining
         */
        public WikipediaDumpParserConfig setParseGeoCoordinates(boolean enabled) {
            parseGeoCoordinates = enabled;
            return this;
        }

        /**
         * @param enabled new value of the infobox switch
         * @return this config, for chaining
         */
        public WikipediaDumpParserConfig setParseInfoBoxes(boolean enabled) {
            parseInfoBoxes = enabled;
            return this;
        }

        /**
         * @param enabled new value of the links-and-categories switch
         * @return this config, for chaining
         */
        public WikipediaDumpParserConfig setParseLinksAndCategories(boolean enabled) {
            parseLinksAndCategories = enabled;
            return this;
        }
    }

    /**
     * Consumes all directly following character events from the reader and returns their concatenated text.
     * The first non-character event is left in the reader (it is only peeked at).
     *
     * @param xMLEventReader the reader, positioned right before the expected character events
     * @return the concatenated character data, or an empty string if the next event is not a character event
     * @throws XMLStreamException if the reader fails while peeking or reading
     */
    protected static String readNextCharEventsText(XMLEventReader xMLEventReader) throws XMLStreamException {
        StringBuilder text = new StringBuilder();
        while (true) {
            if (!xMLEventReader.hasNext()) {
                break;
            }
            XMLEvent upcoming = xMLEventReader.peek();
            if (!upcoming.isCharacters()) {
                break;
            }
            text.append(xMLEventReader.nextEvent().asCharacters().getData());
        }
        return text.toString();
    }

    /**
     * Normalizes an attribute value: parenthesized parts and {@code {{...}}} template remnants are removed and
     * the result is trimmed. For the attributes {@code longitude} and {@code latitude} the value is additionally
     * converted from degrees/minutes/seconds notation to a decimal string (negative for W and S), or validated
     * as a number.
     *
     * @param str  the attribute name
     * @param str2 the raw attribute value; {@code null} is treated as an empty string
     * @return the cleaned value, or {@code null} if a longitude/latitude value is neither in
     *         degrees/minutes/seconds notation nor parseable as a double
     */
    protected String cleanAttValue(String str, String str2) {
        String raw = (str2 != null) ? str2 : "";
        String cleaned = raw.replaceAll("\\(.*?\\)", "").replaceAll("\\{\\{.*?\\}\\}", "").trim();

        boolean isCoordinateAttribute = "longitude".equals(str) || "latitude".equals(str);
        if (!isCoordinateAttribute) {
            return cleaned;
        }

        Matcher dms = this.dmsCoordinatePattern.matcher(cleaned);
        if (!dms.find()) {
            // Not in d/m/s notation: accept the value only if it is a plain number.
            try {
                Double.valueOf(cleaned);
            } catch (NumberFormatException e) {
                return null;
            }
            return cleaned;
        }

        double decimal = dmsToDecCoordinate(dms.group(1), dms.group(2), dms.group(3));
        String direction = dms.group(4);
        if ("E".equals(direction) || "N".equals(direction)) {
            return String.valueOf(decimal);
        }
        if ("W".equals(direction) || "S".equals(direction)) {
            return String.valueOf(decimal * (-1.0d));
        }
        return cleaned;
    }

    /**
     * Converts a coordinate given as degrees, minutes and seconds into decimal degrees.
     * Each part that is {@code null} or whitespace-only counts as zero.
     *
     * @param str  the degrees part
     * @param str2 the minutes part
     * @param str3 the seconds part
     * @return {@code degrees + (minutes * 60 + seconds) / 3600}
     * @throws NumberFormatException if a non-blank part is not a parseable number
     */
    public double dmsToDecCoordinate(String str, String str2, String str3) {
        double degrees = StringUtils.nullOrWhitespace(str) ? 0.0d : Double.parseDouble(str);
        double minutes = StringUtils.nullOrWhitespace(str2) ? 0.0d : Double.parseDouble(str2);
        double seconds = StringUtils.nullOrWhitespace(str3) ? 0.0d : Double.parseDouble(str3);
        double arcSeconds = (minutes * 60.0d) + seconds;
        return degrees + (arcSeconds / 3600.0d);
    }

    /**
     * Scans a wikipedia dump for redirect pages and collects, per redirect target, the titles of the pages
     * that redirect to it.
     * <p>
     * A page counts as a redirect when its text starts (case-insensitively) with {@code #redirect} or
     * {@code #weiterleitung}, or with {@code redirect} / {@code weiterleitung} and contains no line break.
     * The target is the content of the first {@code [[...]]} link in that text.
     * <p>
     * The given stream is not closed by this method; only the XML reader created on top of it is released.
     *
     * @param inputStream the dump content, read as UTF-8 XML
     * @return a multi-value map from redirect target title to the titles of the redirecting pages
     * @throws FileNotFoundException declared for backward compatibility with existing callers
     * @throws XMLStreamException    if the dump cannot be read as XML
     */
    public MultiValueHashMap<String, String> getPageTitle2Redirects(InputStream inputStream) throws FileNotFoundException, XMLStreamException {
        Logger logger = Logger.getLogger(WikipediaDumpParser.class.getName());
        logger.info("will collect redirects from wikipedia dump...");

        // NOTE(review): kept as a raw type because the generic signature of MultiValueBalancedTreeMap
        // is not visible from this file.
        MultiValueBalancedTreeMap target2Redirects = new MultiValueBalancedTreeMap();

        // The dump is external data: do not process DTDs or resolve external entities.
        XMLInputFactory factory = XMLInputFactory.newInstance();
        factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
        factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

        XMLEventReader reader = factory.createXMLEventReader(inputStream, "Utf-8");
        try {
            String currentTitle = "";
            int titleCount = 0;
            while (reader.hasNext()) {
                XMLEvent event = reader.nextEvent();
                if (!event.isStartElement()) {
                    continue;
                }
                String element = event.asStartElement().getName().getLocalPart();
                if (element.equals("title")) {
                    currentTitle = readNextCharEventsText(reader);
                    titleCount++;
                    if (titleCount % 200000 == 0) {
                        logger.info("read doc #" + StringUtils.beautifyNumber(Integer.valueOf(titleCount)));
                    }
                } else if (element.equals("text") && reader.peek().isCharacters()) {
                    String wikiText = readNextCharEventsText(reader).trim();
                    if (!isRedirectMarkup(wikiText)) {
                        continue;
                    }
                    int linkStart = wikiText.indexOf("[[");
                    if (linkStart < 0) {
                        continue;
                    }
                    // Search the closing brackets behind the opening ones, so that a stray "]]" in front of
                    // the link does not make the redirect get lost.
                    int linkEnd = wikiText.indexOf("]]", linkStart + 2);
                    if (linkEnd >= 0) {
                        target2Redirects.add(wikiText.substring(linkStart + 2, linkEnd).trim(), currentTitle);
                    }
                }
            }
        } finally {
            // Releases the reader's resources; the caller-owned stream stays open.
            reader.close();
        }

        logger.info("Redirects found: " + StringUtils.beautifyNumber(Integer.valueOf(target2Redirects.valueSize())));
        return target2Redirects;
    }

    /** Tells whether the (trimmed) page text starts with one of the supported redirect keywords. */
    private static boolean isRedirectMarkup(String text) {
        boolean singleLine = !text.contains("\n");
        return startsWithIgnoreCase(text, "#redirect")
                || (singleLine && startsWithIgnoreCase(text, "redirect"))
                || startsWithIgnoreCase(text, "#weiterleitung")
                || (singleLine && startsWithIgnoreCase(text, "weiterleitung"));
    }

    /** Case-insensitive {@code startsWith}; returns false when the text is shorter than the prefix. */
    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /**
     * Returns the media types this parser handles.
     *
     * @param parseContext the parse context (not used)
     * @return a single-element set containing {@code application/wikipedia+xml}
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return Collections.singleton(MediaType.application("wikipedia+xml"));
    }

    /**
     * Parses a wikipedia XML dump and emits one document per page to the given content handler.
     * <p>
     * The dump is read from a file obtained via {@code TikaInputStream}, in up to two passes: an optional first
     * pass collects redirect pages (see {@code getPageTitle2Redirects}), the second pass extracts title, source,
     * modification timestamp, creator and the plain text of every page. Redirect pages and pages whose title
     * starts with a namespace prefix such as {@code category:} or {@code template:} are skipped.
     * Titles of redirecting pages are added as further {@code title} values of their target page.
     * <p>
     * Any exception raised during parsing is logged and not propagated.
     *
     * @param inputStream    the dump; the stream itself is not closed here
     * @param contentHandler receives one XHTML document per page
     * @param metadata       reused for every page: cleared at each page start and filled with the page's entries
     * @param parseContext   may carry a {@code WikipediaDumpParserConfig}; a default config is used otherwise
     */
    @Override
    public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
        try {
            WikipediaDumpParserConfig config = (WikipediaDumpParserConfig) parseContext.get(WikipediaDumpParserConfig.class);
            if (config == null) {
                Logger.getLogger(WikipediaDumpParser.class.getName()).info("No wikipedia parser config found. Will take the default one.");
                config = new WikipediaDumpParserConfig();
            }

            // The dump is read more than once, so we work on the file behind the stream.
            File dumpFile = TikaInputStream.get(inputStream).getFile();

            MultiValueHashMap<String, String> title2Redirects = new MultiValueHashMap<>();
            if (config.determinePageRedirects) {
                try (InputStream redirectPass = new FileInputStream(dumpFile)) {
                    title2Redirects = getPageTitle2Redirects(redirectPass);
                }
            }
            // Titles of all pages that are redirects themselves; such pages are skipped below.
            Set<Object> redirectTitles = new HashSet<Object>(title2Redirects.values());

            // The dump is external data: do not process DTDs or resolve external entities.
            XMLInputFactory factory = XMLInputFactory.newInstance();
            factory.setProperty(XMLInputFactory.SUPPORT_DTD, Boolean.FALSE);
            factory.setProperty(XMLInputFactory.IS_SUPPORTING_EXTERNAL_ENTITIES, Boolean.FALSE);

            String pageText = "";
            String baseUrl = null;
            try (InputStream contentPass = new FileInputStream(dumpFile)) {
                XMLEventReader reader = factory.createXMLEventReader(contentPass, "Utf-8");
                try {
                    while (reader.hasNext()) {
                        XMLEvent event = reader.nextEvent();

                        // End of a page: flush what was collected for it as one document.
                        if (event.isEndElement() && event.asEndElement().getName().getLocalPart().equals("page")) {
                            if (metadata.size() != 0) {
                                metadata.add("Content-Type", "application/wikipedia+xml");
                                XHTMLContentHandler xhtml = new XHTMLContentHandler(contentHandler, metadata);
                                xhtml.startDocument();
                                xhtml.startElement("p");
                                xhtml.characters(pageText.toCharArray(), 0, pageText.length());
                                xhtml.endElement("p");
                                xhtml.endDocument();
                            }
                        }
                        if (!event.isStartElement()) {
                            continue;
                        }

                        String element = event.asStartElement().getName().getLocalPart();
                        if (baseUrl == null && element.equals("base")) {
                            String base = readNextCharEventsText(reader);
                            baseUrl = base.substring(0, base.lastIndexOf("/") + 1);
                        }
                        if (element.equals("page")) {
                            // A new page starts: drop everything left over from the previous one.
                            for (String name : metadata.names()) {
                                metadata.remove(name);
                            }
                        }

                        if (element.equals("title")) {
                            String title = readNextCharEventsText(reader);
                            String normalizedTitle = title.trim().toLowerCase();
                            boolean skipPage = redirectTitles.contains(title) || redirectTitles.contains(normalizedTitle)
                                    || redirectTitles.contains(title.trim()) || normalizedTitle.startsWith("category:")
                                    || normalizedTitle.startsWith("kategorie:") || normalizedTitle.startsWith("vorlage:")
                                    || normalizedTitle.startsWith("template:") || normalizedTitle.startsWith("hilfe:")
                                    || normalizedTitle.startsWith("help:") || normalizedTitle.startsWith("wikipedia:")
                                    || normalizedTitle.startsWith("portal:") || normalizedTitle.startsWith("mediawiki:");
                            if (skipPage) {
                                // Consume the rest of this page, including its end tag, so that nothing is
                                // emitted for it, then carry on with the next page.
                                while (reader.hasNext()) {
                                    XMLEvent skipped = reader.nextEvent();
                                    if (skipped.isEndElement() && skipped.asEndElement().getName().getLocalPart().equals("page")) {
                                        break;
                                    }
                                }
                            } else {
                                metadata.add("title", title);
                                // NOTE(review): baseUrl is still null here if no <base> element preceded the
                                // first page, which yields a source value starting with "null".
                                metadata.add("source", baseUrl + title);
                                for (String redirectTitle : title2Redirects.get(title)) {
                                    if (!StringUtils.containsIgnoreCase(redirectTitle, metadata.getValues("title"))) {
                                        metadata.add("title", redirectTitle);
                                    }
                                }
                            }
                        } else if (element.equals("text")) {
                            String wikiText = readNextCharEventsText(reader);
                            if (config.parseLinksAndCategories) {
                                parseLinksAndCategories(wikiText, baseUrl, metadata, contentHandler);
                            }
                            if (config.parseInfoBoxes) {
                                parseInfoBox(wikiText, metadata, contentHandler);
                            }
                            if (config.parseGeoCoordinates) {
                                parseGeoCoordinates(wikiText, metadata);
                            }
                            // Put blank lines around headings before rendering the markup as plain text.
                            String spacedWikiText = wikiText.replaceAll("==\n", "==\n\n").replaceAll("\n==", "\n\n==");
                            String plainText = m_wikiModel.render(new PlainTextConverter(), spacedWikiText);
                            pageText = StringEscapeUtils.unescapeHtml4(plainText.replaceAll("\\{\\{", " ").replaceAll("\\}\\}", " "));
                        } else if (element.equals("timestamp")) {
                            metadata.add(IncrementalCrawlingParser.MODIFIED, readNextCharEventsText(reader));
                        } else if (element.equals("username")) {
                            metadata.add("creator", readNextCharEventsText(reader));
                        }
                    }
                } finally {
                    reader.close();
                }
            }
        } catch (Exception e) {
            // Best effort: a failure is reported in the log instead of being passed on to the caller.
            Logger.getLogger(WikipediaDumpParser.class.getName()).log(Level.SEVERE, "Error", e);
        }
    }

    /** Matches a {@code {{Coordinate ...}}} template, possibly spanning lines; group 1 is its content. */
    private static final Pattern COORDINATE_TEMPLATE_PATTERN = Pattern.compile("(?s)\\{\\{Coordinate (.*?)\\}\\}");

    /**
     * Extracts geo coordinates from all {@code {{Coordinate ...}}} templates in the given wiki text and adds
     * them to the metadata under the keys {@code longitude} and {@code latitude}.
     * <p>
     * Each template is split at {@code |}. A part in degrees/minutes/seconds notation is converted to decimal
     * degrees (negative for W and S). Otherwise a part containing {@code EW=} or {@code NS=} is taken as a
     * decimal longitude resp. latitude if its value is a number. Processing of a template stops at the first
     * part containing {@code text=}.
     *
     * @param str      the wiki markup of a page
     * @param metadata receives the coordinate entries
     */
    protected void parseGeoCoordinates(String str, Metadata metadata) {
        Matcher templateMatcher = COORDINATE_TEMPLATE_PATTERN.matcher(str);
        while (templateMatcher.find()) {
            for (String part : templateMatcher.group(1).split("\\|")) {
                if (part.contains("text=")) {
                    break;
                }
                Matcher dms = this.dmsCoordinatePattern.matcher(part);
                if (dms.find()) {
                    double decimal = dmsToDecCoordinate(dms.group(1), dms.group(2), dms.group(3));
                    String direction = dms.group(4);
                    if ("E".equals(direction)) {
                        metadata.add("longitude", String.valueOf(decimal));
                    } else if ("W".equals(direction)) {
                        metadata.add("longitude", String.valueOf(decimal * (-1.0d)));
                    } else if ("N".equals(direction)) {
                        metadata.add("latitude", String.valueOf(decimal));
                    } else if ("S".equals(direction)) {
                        metadata.add("latitude", String.valueOf(decimal * (-1.0d)));
                    }
                    continue;
                }
                addDecimalCoordinate(part, "EW=", "longitude", metadata);
                addDecimalCoordinate(part, "NS=", "latitude", metadata);
            }
        }
    }

    /**
     * Adds the number following {@code key} in {@code part} to the metadata under {@code metadataName}.
     * The value is located relative to the key, so leading whitespace in front of the key does not matter.
     */
    private static void addDecimalCoordinate(String part, String key, String metadataName, Metadata metadata) {
        int keyIndex = part.indexOf(key);
        if (keyIndex < 0) {
            return;
        }
        String value = part.substring(keyIndex + key.length()).trim();
        try {
            Double.valueOf(value);
        } catch (NumberFormatException ignored) {
            // Not a decimal number: this template part carries no usable coordinate.
            return;
        }
        metadata.add(metadataName, value);
    }

    /**
     * Parses the first infobox of a page into metadata.
     * <p>
     * The infobox name is stored under {@link #infobox}. Every {@code name = value} attribute is added to the
     * metadata; values are split at line break markers and cleaned with {@code cleanAttValue}. Attributes whose
     * name contains a number are grouped by the name part up to and including that number. For every such
     * group a separate document with content type {@code application/wikipedia-meta+xml} is emitted to the
     * content handler, linked to the page via parent/child ids. Afterwards the metadata holds the page's own
     * entries again, extended by the infobox attributes, an id and the child ids.
     * <p>
     * The method returns without changes if no infobox is found or its boundaries cannot be determined.
     *
     * @param str            the wiki markup of a page
     * @param metadata       the page metadata; temporarily reused for the emitted sub-documents
     * @param contentHandler receives the sub-documents
     * @throws SAXException if the content handler fails
     */
    protected void parseInfoBox(String str, Metadata metadata, ContentHandler contentHandler) throws SAXException {
        MatchResult infoboxStart = StringUtils.findFirst("\\{\\{\\s*Infobox", str);
        if (infoboxStart == null) {
            return;
        }
        int start = infoboxStart.start();
        int end = StringUtils.findMatchingBracket(start, str) + 1;
        if (str.length() < 3 || str.length() < end || end <= 0 || start + 2 > end) {
            return;
        }
        String infoboxMarkup = str.substring(start + 2, end);
        if (infoboxMarkup.length() < 5) {
            return;
        }

        // Line breaks are escaped before rendering so that multi-valued attributes can be split afterwards.
        String rendered = m_wikiModel.render(new PlainTextConverter(), infoboxMarkup.replaceAll("<br />", "&lt;br /&gt;"));
        int nameEnd = rendered.indexOf("|");
        if (nameEnd == -1) {
            nameEnd = rendered.indexOf("\n");
        }
        // The name is expected behind the 7 characters of "Infobox". Without a delimiter behind that
        // position there is nothing to extract, and substring(7, nameEnd) would throw.
        if (nameEnd < 7) {
            return;
        }
        String infoboxName = rendered.substring(7, nameEnd).trim();
        metadata.add(infobox, infoboxName);

        Pattern numberInName = Pattern.compile("([\\D]*)(\\d+)([\\D]*)");
        String lineBreakMarker = Pattern.quote("&lt;br /&gt;");
        Map<String, MultiValueHashMap<String, String>> subDocAttributes = new HashMap<>();
        for (String attribute : rendered.split("\\s*\\|\\s*")) {
            // Split at the first '=' only, so that values containing '=' are kept complete.
            String[] nameAndValue = attribute.split("=", 2);
            if (nameAndValue.length < 2) {
                continue;
            }
            String attName = nameAndValue[0].trim();
            String rawValue = nameAndValue[1];
            if (StringUtils.nullOrWhitespace(rawValue)) {
                continue;
            }
            String[] values = rawValue.split(lineBreakMarker);

            Matcher numbered = numberInName.matcher(attName);
            if (!numbered.find()) {
                for (String value : values) {
                    String cleaned = cleanAttValue(attName, value);
                    if (cleaned != null) {
                        metadata.add(attName, cleaned);
                    }
                }
                continue;
            }

            String prefix = numbered.group(1);
            String groupKey = prefix + numbered.group(2);
            String nameWithoutNumber = prefix + numbered.group(3);
            // If the pattern matches a second time, the values are also added to the page metadata itself.
            if (numbered.find()) {
                for (String value : values) {
                    String cleaned = cleanAttValue(nameWithoutNumber, value);
                    if (cleaned != null) {
                        metadata.add(nameWithoutNumber, cleaned);
                    }
                }
            }
            MultiValueHashMap<String, String> groupAttributes = subDocAttributes.get(groupKey);
            if (groupAttributes == null) {
                groupAttributes = new MultiValueHashMap<>();
                subDocAttributes.put(groupKey, groupAttributes);
            }
            for (String value : values) {
                groupAttributes.add(nameWithoutNumber, value.replaceAll("\\(.*?\\)", "").trim());
            }
        }

        String pageId = new UID().toString();
        metadata.add(LeechMetadata.id, pageId);
        // Keep the page's entries aside while the metadata object is reused for the sub-documents.
        Metadata pageMetadata = TikaUtils.copyMetadata(metadata);
        for (MultiValueHashMap<String, String> groupAttributes : subDocAttributes.values()) {
            TikaUtils.clearMetadata(metadata);
            metadata.add(LeechMetadata.parentId, pageId);
            metadata.add(infobox, infoboxName);
            String subDocId = new UID().toString();
            metadata.add(LeechMetadata.id, subDocId);
            pageMetadata.add(LeechMetadata.childId, subDocId);
            for (Map.Entry<String, String> entry : groupAttributes.entryList()) {
                String cleaned = cleanAttValue(entry.getKey(), entry.getValue());
                if (cleaned != null) {
                    metadata.add(entry.getKey(), cleaned);
                }
            }
            metadata.add("Content-Type", "application/wikipedia-meta+xml");
            XHTMLContentHandler xhtml = new XHTMLContentHandler(contentHandler, metadata);
            xhtml.startDocument();
            xhtml.endDocument();
        }
        TikaUtils.clearMetadata(metadata);
        TikaUtils.copyMetadataFromTo(pageMetadata, metadata);
    }

    /**
     * Collects the links of a page and adds them to the metadata.
     * <p>
     * Every bracketed expression in the text is inspected. Double-bracketed ones are wiki links: the part
     * behind a {@code |} is dropped; a target containing {@code :} is added as a metadata entry named after
     * the part in front of the colon (value cleaned with {@code cleanAttValue}); any other target, cut at a
     * {@code #}, is collected as internal link prefixed with the base URL, unless it starts with {@code #}.
     * Single-bracketed expressions are collected as external links, using the text up to the first space.
     * Each distinct link is added once, under {@link #internalLink} resp. {@link #externalLink}.
     *
     * @param str            the wiki markup of a page
     * @param str2           the base URL that is put in front of internal link targets
     * @param metadata       receives the link and attribute entries
     * @param contentHandler not used
     * @throws SAXException declared by the signature; not raised by the code in this method
     */
    protected void parseLinksAndCategories(String str, String str2, Metadata metadata, ContentHandler contentHandler) throws SAXException {
        Set<String> internalTargets = new HashSet<>();
        Set<String> externalTargets = new HashSet<>();

        Matcher bracketMatcher = Pattern.compile("\\[(.*?)\\]").matcher(str);
        while (bracketMatcher.find()) {
            String inner = bracketMatcher.group(1);

            if (!inner.startsWith("[")) {
                // Single brackets: external link, optionally followed by a label behind a space.
                int labelStart = inner.indexOf(" ");
                externalTargets.add(labelStart == -1 ? inner : inner.substring(0, labelStart));
                continue;
            }

            String target = inner.substring(1);
            int pipeIndex = target.indexOf("|");
            if (pipeIndex != -1) {
                target = target.substring(0, pipeIndex);
            }

            int colonIndex = target.indexOf(":");
            if (colonIndex != -1) {
                String attName = target.substring(0, colonIndex);
                String cleaned = cleanAttValue(attName, target.substring(colonIndex + 1));
                if (cleaned != null) {
                    metadata.add(attName, cleaned);
                }
                continue;
            }

            int anchorIndex = target.indexOf("#");
            if (anchorIndex == 0) {
                // Pure anchor within the same page: no link target to record.
                continue;
            }
            String page = (anchorIndex == -1) ? target : target.substring(0, anchorIndex);
            internalTargets.add(str2 + page);
        }

        for (String link : internalTargets) {
            metadata.add(internalLink, link);
        }
        for (String link : externalTargets) {
            metadata.add(externalLink, link);
        }
    }
}
