package org.apache.nutch.tools;

import java.io.BufferedInputStream;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FilterReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.util.Random;
import java.util.Vector;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.MD5Hash;
import org.apache.nutch.util.LogUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.xerces.util.XMLChar;
import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.Locator;
import org.xml.sax.SAXException;
import org.xml.sax.SAXParseException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;

/* loaded from: input_file:org/apache/nutch/tools/DmozParser.class */
public class DmozParser {
    /** Shared commons-logging logger used by this tool and by its nested RDFProcessor handler. */
    public static final Log LOG = LogFactory.getLog(DmozParser.class);
    /**
     * Number of ExternalPage entries emitted so far. Incremented by
     * RDFProcessor.endElement for every page that passed the filters and
     * reported by RDFProcessor.endDocument.
     */
    long pages = 0;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/nutch/tools/DmozParser$RDFProcessor.class */
    public class RDFProcessor extends DefaultHandler {
        Pattern topicPattern;
        XMLReader reader;
        int subsetDenom;
        int hashSkew;
        boolean includeAdult;
        Locator location;
        String curURL = null;
        String curSection = null;
        boolean titlePending = false;
        boolean descPending = false;
        boolean insideAdultSection = false;
        StringBuffer title = new StringBuffer();
        StringBuffer desc = new StringBuffer();

        public RDFProcessor(XMLReader xMLReader, int i, boolean z, int i2, Pattern pattern) throws IOException {
            this.topicPattern = null;
            this.reader = xMLReader;
            this.subsetDenom = i;
            this.includeAdult = z;
            this.topicPattern = pattern;
            this.hashSkew = i2 != 0 ? i2 : new Random().nextInt();
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void startElement(String str, String str2, String str3, Attributes attributes) throws SAXException {
            if ("Topic".equals(str3)) {
                this.curSection = attributes.getValue("r:id");
                return;
            }
            if (!"ExternalPage".equals(str3)) {
                if (this.curURL != null && "d:Title".equals(str3)) {
                    this.titlePending = true;
                    return;
                } else {
                    if (this.curURL == null || !"d:Description".equals(str3)) {
                        return;
                    }
                    this.descPending = true;
                    return;
                }
            }
            if (this.includeAdult || !this.curSection.startsWith("Top/Adult")) {
                if (this.topicPattern == null || this.topicPattern.matcher(this.curSection).matches()) {
                    String value = attributes.getValue("about");
                    if (Math.abs(MD5Hash.digest(value).hashCode() ^ this.hashSkew) % this.subsetDenom != 0) {
                        return;
                    }
                    this.curURL = value;
                }
            }
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void characters(char[] cArr, int i, int i2) {
            if (this.titlePending) {
                this.title.append(cArr, i, i2);
            } else if (this.descPending) {
                this.desc.append(cArr, i, i2);
            }
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void endElement(String str, String str2, String str3) throws SAXException {
            if (this.curURL != null) {
                if (!"ExternalPage".equals(str3)) {
                    if ("d:Title".equals(str3)) {
                        this.titlePending = false;
                        return;
                    } else {
                        if ("d:Description".equals(str3)) {
                            this.descPending = false;
                            return;
                        }
                        return;
                    }
                }
                System.out.println(this.curURL);
                DmozParser.this.pages++;
                if (this.title.length() > 0) {
                    this.title.delete(0, this.title.length());
                }
                if (this.desc.length() > 0) {
                    this.desc.delete(0, this.desc.length());
                }
                this.curURL = null;
            }
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void startDocument() {
            if (DmozParser.LOG.isInfoEnabled()) {
                DmozParser.LOG.info("Begin parse");
            }
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void endDocument() {
            if (DmozParser.LOG.isInfoEnabled()) {
                DmozParser.LOG.info("Completed parse.  Found " + DmozParser.this.pages + " pages.");
            }
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ContentHandler
        public void setDocumentLocator(Locator locator) {
            this.location = locator;
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ErrorHandler
        public void error(SAXParseException sAXParseException) {
            if (DmozParser.LOG.isFatalEnabled()) {
                DmozParser.LOG.fatal("Error: " + sAXParseException.toString() + ": " + sAXParseException.getMessage());
                sAXParseException.printStackTrace(LogUtil.getFatalStream(DmozParser.LOG));
            }
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ErrorHandler
        public void fatalError(SAXParseException sAXParseException) {
            if (DmozParser.LOG.isFatalEnabled()) {
                DmozParser.LOG.fatal("Fatal err: " + sAXParseException.toString() + ": " + sAXParseException.getMessage());
                DmozParser.LOG.fatal("Last known line is " + this.location.getLineNumber() + ", column " + this.location.getColumnNumber());
                sAXParseException.printStackTrace(LogUtil.getFatalStream(DmozParser.LOG));
            }
        }

        @Override // org.xml.sax.helpers.DefaultHandler, org.xml.sax.ErrorHandler
        public void warning(SAXParseException sAXParseException) {
            if (DmozParser.LOG.isWarnEnabled()) {
                DmozParser.LOG.warn("Warning: " + sAXParseException.toString() + ": " + sAXParseException.getMessage());
                sAXParseException.printStackTrace(LogUtil.getWarnStream(DmozParser.LOG));
            }
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/nutch/tools/DmozParser$XMLCharFilter.class */
    /**
     * Reader filter that makes a damaged character stream acceptable to an XML
     * parser:
     * <ul>
     *   <li>every character rejected by {@code XMLChar.isValid} is replaced with
     *       {@code 'X'};</li>
     *   <li>a {@code '<'} that directly follows U+FFFD and is not followed by
     *       {@code '/'} is also replaced with {@code 'X'}.</li>
     * </ul>
     * The single-character {@link #read()} needs a wrapped reader that supports
     * {@code mark}/{@code reset} for the second rule.
     */
    public static class XMLCharFilter extends FilterReader {
        /** True when the most recently read raw character was U+FFFD. */
        private boolean lastBad;

        public XMLCharFilter(Reader reader) {
            super(reader);
            this.lastBad = false;
        }

        /**
         * Reads one character, applying both repair rules.
         *
         * @return the (possibly replaced) character, or {@code -1} at end of stream
         */
        @Override // java.io.FilterReader, java.io.Reader
        public int read() throws IOException {
            int raw = this.in.read();
            int value = raw;
            if (raw != -1 && !XMLChar.isValid(raw)) {
                value = 'X';
            } else if (this.lastBad && raw == '<' && !nextIsSlash()) {
                value = 'X';
            }
            this.lastBad = raw == 0xFFFD;
            return value;
        }

        /**
         * Bulk variant of {@link #read()}; filters the characters in place in
         * {@code cbuf}.
         *
         * <p>When a suspicious {@code '<'} is the last character of the chunk, the
         * following character is peeked from the wrapped reader (if it supports
         * marking) so that the outcome does not depend on where chunk boundaries
         * happen to fall. Without mark support the {@code '<'} is left unchanged.
         *
         * @return the number of characters read, or {@code -1} at end of stream
         */
        @Override // java.io.FilterReader, java.io.Reader
        public int read(char[] cbuf, int off, int len) throws IOException {
            int count = this.in.read(cbuf, off, len);
            if (count == -1) {
                return count;
            }
            for (int k = 0; k < count; k++) {
                int pos = off + k;
                char raw = cbuf[pos];
                char value = raw;
                if (!XMLChar.isValid(raw)) {
                    value = 'X';
                } else if (this.lastBad && raw == '<') {
                    boolean followedBySlash;
                    if (k != count - 1) {
                        // Next raw character is already in the buffer (not yet filtered).
                        followedBySlash = cbuf[pos + 1] == '/';
                    } else if (this.in.markSupported()) {
                        followedBySlash = nextIsSlash();
                    } else {
                        // Cannot look ahead; keep the character untouched.
                        followedBySlash = true;
                    }
                    if (!followedBySlash) {
                        value = 'X';
                    }
                }
                this.lastBad = raw == 0xFFFD;
                cbuf[pos] = value;
            }
            return count;
        }

        /** Peeks at the next character of the wrapped reader without consuming it. */
        private boolean nextIsSlash() throws IOException {
            this.in.mark(1);
            int next = this.in.read();
            this.in.reset();
            return next == '/';
        }
    }

    /**
     * Parses a DMOZ RDF dump and prints the URL of every selected page to
     * standard output (the printing is done by {@link RDFProcessor}).
     *
     * <p>The file is decoded as UTF-8 and passed through {@link XMLCharFilter}
     * before it reaches the SAX parser. If parsing fails, the error is logged at
     * fatal level and the JVM is terminated with a non-zero exit status so that
     * calling scripts can detect the failure.
     *
     * @param file    the RDF dump to read
     * @param i       subset denominator handed to {@link RDFProcessor}
     * @param z       whether pages in adult sections are included
     * @param i2      hash skew; {@code 0} lets {@link RDFProcessor} choose a random one
     * @param pattern optional topic filter, or {@code null}
     * @throws IOException                  if the file cannot be opened or closed
     * @throws SAXException                 if the SAX parser cannot be created
     * @throws ParserConfigurationException if the SAX parser cannot be configured
     */
    public void parseDmozFile(File file, int i, boolean z, int i2, Pattern pattern) throws IOException, SAXException, ParserConfigurationException {
        XMLReader xmlReader = SAXParserFactory.newInstance().newSAXParser().getXMLReader();
        RDFProcessor processor = new RDFProcessor(xmlReader, i, z, i2, pattern);
        xmlReader.setContentHandler(processor);
        xmlReader.setErrorHandler(processor);
        if (LOG.isInfoEnabled()) {
            LOG.info("skew = " + processor.hashSkew);
        }
        // Opening the file stays outside the catch below so that a missing file is
        // still reported to the caller as an IOException.
        try (XMLCharFilter in = new XMLCharFilter(new BufferedReader(new InputStreamReader(new BufferedInputStream(new FileInputStream(file)), "UTF-8")))) {
            try {
                xmlReader.parse(new InputSource(in));
            } catch (Exception e) {
                if (LOG.isFatalEnabled()) {
                    LOG.fatal(e.toString());
                    e.printStackTrace(LogUtil.getFatalStream(LOG));
                }
                // A failed parse must not report success to the shell.
                System.exit(1);
            }
        }
    }

    /**
     * Appends every non-blank line of a UTF-8 text file to the topic list.
     *
     * <p>Blank lines are skipped: an empty topic would become an empty regex
     * alternative in the selection pattern built by {@code main}, which matches
     * every section and silently disables topic filtering.
     *
     * <p>If the file cannot be read, the error is logged at fatal level and the
     * JVM is terminated with a non-zero exit status.
     *
     * @param str    path of the topic list file
     * @param vector list that receives the topics, one per line
     * @throws IOException declared for compatibility with existing callers
     */
    @SuppressWarnings({"rawtypes", "unchecked"})
    private static void addTopicsFromFile(String str, Vector vector) throws IOException {
        try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(str), "UTF-8"))) {
            String line;
            while ((line = in.readLine()) != null) {
                if (line.trim().length() == 0) {
                    continue;
                }
                vector.addElement(line);
            }
        } catch (Exception e) {
            if (LOG.isFatalEnabled()) {
                LOG.fatal(e.toString());
                e.printStackTrace(LogUtil.getFatalStream(LOG));
            }
            // An unreadable topic file must not report success to the shell.
            System.exit(1);
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>The first argument is the DMOZ RDF file; the remaining arguments are the
     * options listed in the usage message. Unrecognised options are ignored.
     * Topics given with {@code -topic} and {@code -topicFile} are combined into a
     * single pattern of the form {@code ^(t1|t2|...).*}; they are inserted
     * verbatim, so regex metacharacters in a topic keep their regex meaning.
     *
     * @param strArr command-line arguments
     * @throws IllegalArgumentException if an option is missing its value or the
     *                                  subset denominator is zero
     * @throws Exception                on any other failure
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length < 1) {
            System.err.println("Usage: DmozParser <dmoz_file> [-subset <subsetDenominator>] [-includeAdultMaterial] [-skew skew] [-topicFile <topic list file>] [-topic <topic> [-topic <topic> [...]]]");
            return;
        }
        int subsetDenom = 1;
        int skew = 0;
        String dmozFile = strArr[0];
        boolean includeAdult = false;
        Pattern topicPattern = null;
        Vector<String> topics = new Vector<String>();
        FileSystem fileSystem = FileSystem.get(NutchConfiguration.create());
        // The finally block covers the whole run, not just argument parsing, so the
        // file system is also closed when pattern compilation or parsing throws.
        try {
            for (int argIndex = 1; argIndex < strArr.length; argIndex++) {
                String option = strArr[argIndex];
                if ("-includeAdultMaterial".equals(option)) {
                    includeAdult = true;
                } else if ("-subset".equals(option)) {
                    subsetDenom = Integer.parseInt(optionValue(strArr, ++argIndex));
                } else if ("-topic".equals(option)) {
                    topics.addElement(optionValue(strArr, ++argIndex));
                } else if ("-topicFile".equals(option)) {
                    addTopicsFromFile(optionValue(strArr, ++argIndex), topics);
                } else if ("-skew".equals(option)) {
                    skew = Integer.parseInt(optionValue(strArr, ++argIndex));
                }
            }
            if (subsetDenom == 0) {
                // The sampler computes "hash % subsetDenom"; zero would fail on every page.
                throw new IllegalArgumentException("-subset denominator must not be 0");
            }
            DmozParser parser = new DmozParser();
            if (!topics.isEmpty()) {
                StringBuilder regex = new StringBuilder("^(");
                for (int t = 0; t < topics.size(); t++) {
                    if (t > 0) {
                        regex.append('|');
                    }
                    regex.append(topics.get(t));
                }
                regex.append(").*");
                String selection = regex.toString();
                if (LOG.isInfoEnabled()) {
                    LOG.info("Topic selection pattern = " + selection);
                }
                topicPattern = Pattern.compile(selection);
            }
            parser.parseDmozFile(new File(dmozFile), subsetDenom, includeAdult, skew, topicPattern);
        } finally {
            fileSystem.close();
        }
    }

    /** Returns the value following an option, or fails with a clear message when it is missing. */
    private static String optionValue(String[] args, int valueIndex) {
        if (valueIndex >= args.length) {
            throw new IllegalArgumentException("Missing value for option " + args[valueIndex - 1]);
        }
        return args[valueIndex];
    }
}
