package edu.stanford.nlp.international.arabic.process;

import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.AbstractTokenizer;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.LexedTokenFactory;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;
import edu.stanford.nlp.util.StringUtils;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.io.Serializable;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.Properties;

/* loaded from: input_file:WEB-INF/lib/stanford-corenlp-3.4.1.jar:edu/stanford/nlp/international/arabic/process/ArabicTokenizer.class */
/**
 * Tokenizer that pulls tokens from an {@link ArabicLexer} and skips every token whose word is
 * the empty string.
 *
 * <p>Lexer behaviour is configured through a {@link Properties} object that is passed to the
 * lexer unchanged. The option names that appear in this file are the ones listed in
 * {@code ATB_OPTION_NAMES} plus {@code tokenizeNLs}; their exact meaning is defined by
 * {@link ArabicLexer}, not here.
 *
 * @param <T> type of token produced by this tokenizer
 */
public class ArabicTokenizer<T extends HasWord> extends AbstractTokenizer<T> {
    /** Comma-separated option names enabled by {@link #atbFactory()}. */
    private static final String ATB_OPTION_NAMES =
            "normArDigits,normArPunc,normAlif,removeDiacritics,removeTatweel,removeQuranChars";

    /** Word that {@link #main(String[])} treats as a line break in the token stream. */
    private static final String NEWLINE_TOKEN = "*NL*";

    private final ArabicLexer lexer;

    /** Option set applied by {@link #atbFactory()}; every name in it maps to {@code "true"}. */
    private static final Properties atbOptions = new Properties();

    static {
        for (String optionName : ATB_OPTION_NAMES.split(",")) {
            atbOptions.put(optionName, "true");
        }
    }

    /**
     * Factory that creates {@link ArabicTokenizer} instances sharing one token factory and one
     * set of lexer properties.
     *
     * <p>NOTE(review): every tokenizer created by this factory receives the same
     * {@code lexerProperties} instance, so a later {@link #setOptions(String)} call mutates the
     * object already handed to earlier tokenizers.
     *
     * @param <T> type of token produced by the created tokenizers
     */
    public static class ArabicTokenizerFactory<T extends HasWord> implements TokenizerFactory<T>, Serializable {
        private static final long serialVersionUID = 946818805507187330L;
        protected final LexedTokenFactory<T> factory;
        protected Properties lexerProperties = new Properties();

        /**
         * Creates a factory whose tokenizers build tokens with a {@link CoreLabelTokenFactory}.
         *
         * @return a new factory with no lexer options set
         */
        public static TokenizerFactory<CoreLabel> newTokenizerFactory() {
            return new ArabicTokenizerFactory<>(new CoreLabelTokenFactory());
        }

        private ArabicTokenizerFactory(LexedTokenFactory<T> lexedTokenFactory) {
            this.factory = lexedTokenFactory;
        }

        /**
         * Returns a tokenizer over {@code reader}, exposed as a plain iterator.
         *
         * @param reader source of the text to tokenize
         * @return the same object {@link #getTokenizer(Reader)} would return
         */
        @Override // edu.stanford.nlp.objectbank.IteratorFromReaderFactory
        public Iterator<T> getIterator(Reader reader) {
            return getTokenizer(reader);
        }

        /**
         * Creates a tokenizer over {@code reader} using this factory's token factory and the
         * options set so far.
         *
         * @param reader source of the text to tokenize
         * @return a new tokenizer
         */
        @Override // edu.stanford.nlp.process.TokenizerFactory
        public Tokenizer<T> getTokenizer(Reader reader) {
            return new ArabicTokenizer<>(reader, this.factory, this.lexerProperties);
        }

        /**
         * Enables each option named in a comma-separated list by storing it in the lexer
         * properties with the value {@code "true"}. Options are only ever added, never cleared.
         *
         * <p>Surrounding whitespace is trimmed from each name and empty names are skipped, so
         * inputs such as {@code ""} or {@code "a, b"} do not create blank or padded keys.
         *
         * @param str comma-separated option names; {@code null} is treated as "no options"
         */
        @Override // edu.stanford.nlp.process.TokenizerFactory
        public void setOptions(String str) {
            if (str == null) {
                return;
            }
            for (String rawName : str.split(",")) {
                String optionName = rawName.trim();
                if (!optionName.isEmpty()) {
                    this.lexerProperties.put(optionName, "true");
                }
            }
        }

        /**
         * Applies {@code str} via {@link #setOptions(String)} and then creates a tokenizer. The
         * options stay set on this factory afterwards.
         *
         * @param reader source of the text to tokenize
         * @param str comma-separated option names to enable first
         * @return a new tokenizer
         */
        @Override // edu.stanford.nlp.process.TokenizerFactory
        public Tokenizer<T> getTokenizer(Reader reader, String str) {
            setOptions(str);
            return getTokenizer(reader);
        }
    }

    /**
     * Creates a tokenizer that produces {@link CoreLabel} tokens.
     *
     * @param reader source of the text to tokenize
     * @param properties lexer options, passed to the lexer as-is
     * @return a new tokenizer
     */
    public static ArabicTokenizer<CoreLabel> newArabicTokenizer(Reader reader, Properties properties) {
        return new ArabicTokenizer<>(reader, new CoreLabelTokenFactory(), properties);
    }

    /**
     * Creates a tokenizer backed by a new {@link ArabicLexer}.
     *
     * @param reader source of the text to tokenize
     * @param lexedTokenFactory factory the lexer uses to build tokens
     * @param properties lexer options, passed to the lexer as-is
     */
    public ArabicTokenizer(Reader reader, LexedTokenFactory<T> lexedTokenFactory, Properties properties) {
        this.lexer = new ArabicLexer(reader, lexedTokenFactory, properties);
    }

    /**
     * Returns the next token whose word is non-empty.
     *
     * <p>Empty-word tokens are skipped; presumably normalization options can strip a token down
     * to nothing (TODO confirm against {@link ArabicLexer}).
     *
     * @return the next non-empty token, or {@code null} once the lexer returns {@code null}
     * @throws RuntimeIOException if the lexer throws an {@link IOException}
     */
    @Override // edu.stanford.nlp.process.AbstractTokenizer
    @SuppressWarnings("unchecked") // the lexer builds its tokens with the LexedTokenFactory<T> given to the constructor
    public T getNext() {
        try {
            T token;
            do {
                token = (T) this.lexer.next();
            } while (token != null && token.word().length() == 0);
            return token;
        } catch (IOException e) {
            throw new RuntimeIOException(e);
        }
    }

    /**
     * Creates a {@link CoreLabel} tokenizer factory with no options set.
     *
     * @return a new factory
     */
    public static TokenizerFactory<CoreLabel> factory() {
        return ArabicTokenizerFactory.newTokenizerFactory();
    }

    /**
     * Creates a {@link CoreLabel} tokenizer factory with every option in {@code atbOptions}
     * enabled.
     *
     * @return a new factory configured with the ATB option set
     */
    public static TokenizerFactory<CoreLabel> atbFactory() {
        TokenizerFactory<CoreLabel> tokenizerFactory = ArabicTokenizerFactory.newTokenizerFactory();
        for (String optionName : atbOptions.stringPropertyNames()) {
            tokenizerFactory.setOptions(optionName);
        }
        return tokenizerFactory;
    }

    /**
     * Command-line entry point: tokenizes UTF-8 text from standard input and writes the tokens
     * to standard output, separated by single spaces, starting a new output line for each
     * newline token. A summary line is written to standard error at the end.
     *
     * <p>If the first argument contains {@code help}, a usage message is printed and the JVM
     * exits with status -1. Otherwise every property name parsed from the arguments is passed
     * to the factory as an option, and {@code tokenizeNLs} is always enabled.
     *
     * @param strArr command-line arguments
     */
    public static void main(String[] strArr) {
        if (strArr.length > 0 && strArr[0].contains("help")) {
            System.err.printf("Usage: java %s [OPTIONS] < file%n", ArabicTokenizer.class.getName());
            System.err.printf("%nOptions:%n");
            System.err.println("   -help : Print this message. See javadocs for all normalization options.");
            System.err.println("   -atb  : Tokenization for the parsing experiments in Green and Manning (2010)");
            System.exit(-1);
        }

        Properties cliOptions = StringUtils.argsToProperties(strArr);
        TokenizerFactory<CoreLabel> tokenizerFactory = cliOptions.containsKey("atb") ? atbFactory() : factory();
        for (String optionName : cliOptions.stringPropertyNames()) {
            tokenizerFactory.setOptions(optionName);
        }
        // Line breaks must come through as tokens so the output can mirror the input's lines.
        tokenizerFactory.setOptions("tokenizeNLs");

        int lineCount = 0;
        int tokenCount = 0;
        // The Charset constant cannot fail to resolve, so no UnsupportedEncodingException handling is needed.
        Tokenizer<CoreLabel> tokenizer = tokenizerFactory.getTokenizer(
                new InputStreamReader(System.in, java.nio.charset.StandardCharsets.UTF_8));
        boolean needsSeparator = false;
        while (tokenizer.hasNext()) {
            // Newline tokens are included in the token count.
            tokenCount++;
            String word = tokenizer.next().word();
            if (word.equals(NEWLINE_TOKEN)) {
                lineCount++;
                needsSeparator = false;
                System.out.println();
            } else {
                if (needsSeparator) {
                    System.out.print(" ");
                }
                System.out.print(word);
                needsSeparator = true;
            }
        }
        System.err.printf("Done! Tokenized %d lines (%d tokens)%n", lineCount, tokenCount);
    }
}
