package org.cleartk.token.tokenizer;

import java.util.regex.Pattern;

/* loaded from: input_file:org/cleartk/token/tokenizer/PennTreebankTokenizer.class */
/**
 * Regex-driven tokenizer: surrounds punctuation, quotes and contractions with spaces through a
 * fixed sequence of {@code replaceAll} passes, then splits the result on single spaces.
 *
 * <p>Presumably implements Penn Treebank tokenization conventions (inferred from the class name;
 * confirm against the project documentation).
 *
 * <p>Every regex string and compiled pattern is a public, non-final static field. They are kept
 * non-final so that existing code which reads or reassigns them keeps compiling. Note that
 * {@link #patterns} captures the pattern objects when an instance is constructed, whereas
 * {@link #getTokenTexts(String)} reads the remaining static patterns on each call.
 */
public class PennTreebankTokenizer extends Tokenizer_ImplBase {
    /** Character-class body (no surrounding brackets) listing the opening braces {@code [ ( { <}. */
    public static String openBracesRegex = "\\[\\(\\{\\<";
    /** Character-class body (no surrounding brackets) listing the closing braces {@code ] ) } >}. */
    public static String closedBracesRegex = "\\]\\)\\}\\>";
    /** Captures any single opening or closing brace. */
    public static String bracesRegex = "([" + openBracesRegex + closedBracesRegex + "])";
    public static Pattern bracesPattern = Pattern.compile(bracesRegex);

    /** Captures a literal three-dot ellipsis. */
    public static String ellipsisRegex = "(" + Pattern.quote("...") + ")";
    public static Pattern ellipsisPattern = Pattern.compile(ellipsisRegex);

    /** Captures a comma unless it sits directly between two digits. */
    public static String commaRegex = "((?<!\\d),|,(?!\\d))";
    public static Pattern commaPattern = Pattern.compile(commaRegex);

    /** Captures a dollar sign together with any run of upper-case ASCII letters directly before it. */
    public static String dollarSignRegex = "([A-Z]*\\$)";
    public static Pattern dollarSignPattern = Pattern.compile(dollarSignRegex);

    /** Captures an ampersand unless it sits directly between two upper-case ASCII letters. */
    public static String ampersandRegex = "((?<![A-Z])&|&(?![A-Z]))";
    public static Pattern ampersandPattern = Pattern.compile(ampersandRegex);

    /** Captures a run of two or more hyphens, or a single hyphen that is followed by whitespace. */
    public static String dashRegex = "(--+|-(?=\\s))";
    public static Pattern dashPattern = Pattern.compile(dashRegex);

    /** Captures {@code digits:digits} as one unit, otherwise a lone colon. */
    public static String colonRegex = "(\\d+:\\d+|:)";
    public static Pattern colonPattern = Pattern.compile(colonRegex);

    /** Captures two backticks, or one of {@code | ; @ # ` %}, when not followed by a hyphen. */
    public static String nonFinalPunctRegex = "(``|[|;@#`%])(?!-)";
    public static Pattern nonFinalPunctPattern = Pattern.compile(nonFinalPunctRegex);

    /**
     * Captures a period in one of three situations: (1) preceded by a digit and followed by a
     * whitespace character other than a newline; (2) the second of two consecutive periods that
     * are not part of a longer run; (3) not preceded by two periods and followed only by closing
     * braces, quotes, backticks, slash, underscore, hash, asterisk or whitespace up to the end of
     * the line (those trailing characters are part of the captured group).
     */
    public static String periodRegex = "((?<=\\d)\\.(?=[^\\n\\S])|(?<=[^.]\\.)\\.(?![.])|(?<!\\.\\.)\\.[" + closedBracesRegex + "\"'`/_#*\\s]*$)";
    /** Compiled with MULTILINE so that {@code $} matches at the end of every line. */
    public static Pattern periodPattern = Pattern.compile(periodRegex, Pattern.MULTILINE);

    /** Captures a question mark or an exclamation mark. */
    public static String nonPeriodPunctRegex = "([?!])";
    public static Pattern nonPeriodPunctPattern = Pattern.compile(nonPeriodPunctRegex);

    /**
     * Captures an apostrophe followed by digits and an optional {@code s}, an apostrophe preceded
     * by whitespace and not followed by another apostrophe, or an apostrophe not preceded by
     * another apostrophe and followed by whitespace.
     */
    public static String singleQuoteRegex = "('\\d+s?|(?<=\\s)'(?!')|(?<!')'(?=\\s))";
    public static Pattern singleQuotePattern = Pattern.compile(singleQuoteRegex);

    /** Matches three consecutive apostrophes. */
    public static String tripleQuoteRegex = "'''";
    public static Pattern tripleQuotePattern = Pattern.compile(tripleQuoteRegex);

    /** Matches two consecutive apostrophes. */
    public static String doubleQuoteRegex = "''";
    public static Pattern doubleQuotePattern = Pattern.compile(doubleQuoteRegex);

    /** Matches a literal double-quote character. */
    public static String quoteRegex = Pattern.quote("\"");
    public static Pattern quotePattern = Pattern.compile(quoteRegex);

    /** Captures the contraction suffixes {@code 'll 're 've n't 's 'm 'd} at a word boundary. */
    public static String oneWordAbbreviationRegex = "('ll|'re|'ve|n't|'[smd])\\b";
    /** Case-insensitive. */
    public static Pattern oneWordAbbreviationPattern =
            Pattern.compile(oneWordAbbreviationRegex, Pattern.CASE_INSENSITIVE);

    /** Whole-word forms that are split into two captured parts (e.g. {@code can|not}). */
    public static String[] twoWordAbbreviationRegexes = {
        "\\b(can)(not)\\b",
        "\\b(d')(ye)\\b",
        "\\b(gim)(me)\\b",
        "\\b(gon)(na)\\b",
        "\\b(got)(ta)\\b",
        "\\b(lem)(me)\\b",
        "\\b(more)('n)\\b",
        "\\b(wan)(na)\\b"
    };
    /** Case-insensitive compilations of {@link #twoWordAbbreviationRegexes}, in the same order. */
    public static Pattern[] twoWordAbbreviationPatterns =
            compileAll(twoWordAbbreviationRegexes, Pattern.CASE_INSENSITIVE);

    /** Whole-word forms that are split into three captured parts (e.g. {@code wha|dd|ya}). */
    public static String[] threeWordAbbreviationRegexes = {
        "\\b(wha)(dd)(ya)\\b",
        "\\b(wha)(t)(cha)\\b"
    };
    /** Case-insensitive compilations of {@link #threeWordAbbreviationRegexes}, in the same order. */
    public static Pattern[] threeWordAbbreviationPatterns =
            compileAll(threeWordAbbreviationRegexes, Pattern.CASE_INSENSITIVE);

    /** Captures {@code 't} and a following {@code is} or {@code was} as two groups. */
    public static String tAbbreviationRegex = "('t)(is|was)\\b";
    /** Compiled without flags, so unlike the other abbreviation patterns it is case-sensitive. */
    public static Pattern tAbbreviationPattern = Pattern.compile(tAbbreviationRegex);

    /** Matches the empty position at the start or end of a line. */
    public static String beginOrEndRegex = "^|$";
    public static Pattern beginOrEndPattern = Pattern.compile(beginOrEndRegex, Pattern.MULTILINE);

    /**
     * Matches whitespace at the start of a line, whitespace at the end of a line, or spaces/tabs
     * that directly follow another space or tab.
     */
    public static String extraSpaceRegex = "^(\\s+)|(\\s+)$|(?<=[ \\t])[ \\t]+";
    public static Pattern extraSpacePattern = Pattern.compile(extraSpaceRegex, Pattern.MULTILINE);

    /** Captures any run of one or more whitespace characters. */
    public static String multipleWhitespaceRegex = "(\\s+)";
    public static Pattern multipleWhitespacePattern =
            Pattern.compile(multipleWhitespaceRegex, Pattern.MULTILINE);

    /**
     * Patterns whose first capture group is padded with a space on both sides, applied in array
     * order as the first stage of {@link #getTokenTexts(String)}.
     */
    protected Pattern[] patterns = {
        ellipsisPattern,
        commaPattern,
        dollarSignPattern,
        ampersandPattern,
        dashPattern,
        colonPattern,
        nonFinalPunctPattern,
        periodPattern,
        nonPeriodPunctPattern,
        bracesPattern
    };

    /**
     * Splits {@code str} into token strings.
     *
     * <p>The text is rewritten by a fixed sequence of regex substitutions that insert spaces
     * around the pieces to be separated; the order of these passes matters because later passes
     * see the output of earlier ones. Finally redundant whitespace is removed and the text is
     * split on single spaces.
     *
     * @param str the text to tokenize; passing {@code null} results in a
     *     {@link NullPointerException} from the first matcher
     * @return the token strings in order of appearance; a zero-length array when nothing but
     *     whitespace remains
     */
    @Override
    public String[] getTokenTexts(String str) {
        String text = str;

        // Stage 1: pad punctuation captured by the instance-level patterns.
        for (Pattern pattern : this.patterns) {
            text = pattern.matcher(text).replaceAll(" $1 ");
        }

        // Stage 2: put a space at every line boundary so the whitespace look-arounds in the
        // quote patterns also apply at the very start and end of the text.
        text = beginOrEndPattern.matcher(text).replaceAll(" ");

        // Stage 3: quotes, longest run first so ''' is not consumed by the '' rule.
        text = tripleQuotePattern.matcher(text).replaceAll(" ' '' ");
        text = doubleQuotePattern.matcher(text).replaceAll(" '' ");
        text = singleQuotePattern.matcher(text).replaceAll(" $1 ");
        text = quotePattern.matcher(text).replaceAll(" \" ");

        // Stage 4: contractions and multi-part colloquial forms.
        text = oneWordAbbreviationPattern.matcher(text).replaceAll(" $1");
        for (Pattern pattern : twoWordAbbreviationPatterns) {
            text = pattern.matcher(text).replaceAll(" $1 $2");
        }
        text = tAbbreviationPattern.matcher(text).replaceAll(" $1 $2");
        for (Pattern pattern : threeWordAbbreviationPatterns) {
            text = pattern.matcher(text).replaceAll(" $1 $2 $3");
        }

        // Stage 5: drop leading/trailing/duplicate whitespace, then collapse what is left
        // (including newlines) to single spaces so a plain split(" ") yields the tokens.
        text = extraSpacePattern.matcher(text).replaceAll("");
        text = multipleWhitespacePattern.matcher(text).replaceAll(" ");

        String[] tokens = text.split(" ");
        // "".split(" ") yields one empty string; report that case as "no tokens".
        if (tokens.length == 1 && tokens[0].isEmpty()) {
            return new String[0];
        }
        return tokens;
    }

    /** Compiles each regex with the given flags, preserving array order. */
    private static Pattern[] compileAll(String[] regexes, int flags) {
        Pattern[] compiled = new Pattern[regexes.length];
        for (int i = 0; i < regexes.length; i++) {
            compiled[i] = Pattern.compile(regexes[i], flags);
        }
        return compiled;
    }
}
