package org.apache.nutch.analysis;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.ListIterator;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.nutch.searcher.Query;
import org.apache.nutch.util.NutchConfiguration;

/* loaded from: input_file:org/apache/nutch/analysis/CommonGrams.class */
public class CommonGrams {
    private static final char SEPARATOR = '-';
    private HashMap commonTerms = new HashMap();
    private static final Log LOG = LogFactory.getLog(CommonGrams.class);
    private static final String KEY = CommonGrams.class.getName();

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/nutch/analysis/CommonGrams$ArrayTokens.class */
    /**
     * A token stream that replays the terms of a query phrase, one token per
     * term, using each term's position in the phrase as its offsets.
     */
    public static class ArrayTokens extends TokenStream {
        /** Terms of the phrase, captured once at construction time. */
        private Query.Term[] terms;
        /** Position of the next term to hand out; starts at zero. */
        private int index;

        /**
         * Creates a stream over the terms of the given phrase.
         *
         * @param phrase the phrase whose terms are replayed
         */
        public ArrayTokens(Query.Phrase phrase) {
            this.terms = phrase.getTerms();
        }

        /**
         * Returns the next term as a token, or {@code null} once every term
         * has been returned. The token is built from the term's string form,
         * its position, and position + 1.
         *
         * @return the next token, or {@code null} when the terms are exhausted
         */
        public org.apache.lucene.analysis.Token next() {
            final int position = this.index;
            if (position == this.terms.length) {
                return null;
            }
            final String text = this.terms[position].toString();
            // Advance only after the term text has been obtained.
            this.index = position + 1;
            return new org.apache.lucene.analysis.Token(text, position, position + 1);
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/nutch/analysis/CommonGrams$Filter.class */
    /**
     * Token filter that, whenever it meets a common term, additionally emits
     * "gram" tokens joining that term with its neighbours using '-'.
     * Gram tokens carry a position increment of zero and the type "gram".
     */
    public static class Filter extends TokenFilter {
        /** Common terms for the field being filtered; may be null (then nothing is common). */
        private HashSet common;
        /** The last token taken from the input by {@link #next()}. */
        private org.apache.lucene.analysis.Token previous;
        /** Tokens already produced and waiting to be returned. */
        private LinkedList<org.apache.lucene.analysis.Token> gramQueue =
                new LinkedList<org.apache.lucene.analysis.Token>();
        /** Tokens read ahead from the input while building grams. */
        private LinkedList<org.apache.lucene.analysis.Token> nextQueue =
                new LinkedList<org.apache.lucene.analysis.Token>();

        /**
         * @param tokenStream the stream to filter
         * @param hashSet     the set of common term texts, or null for none
         */
        public Filter(TokenStream tokenStream, HashSet hashSet) {
            super(tokenStream);
            this.common = hashSet;
        }

        /**
         * Returns the next token: a queued token if any are pending, otherwise
         * the next input token, preceded/followed by any grams it takes part in.
         *
         * @return the next token, or {@code null} at end of input
         * @throws IOException if the underlying stream fails
         */
        public org.apache.lucene.analysis.Token next() throws IOException {
            // Hand out anything queued by an earlier call first.
            if (!this.gramQueue.isEmpty()) {
                return this.gramQueue.removeFirst();
            }

            final org.apache.lucene.analysis.Token current = popNext();
            if (current == null) {
                return null;
            }

            // Simple case: an uncommon term passes straight through.
            if (!isCommon(current)) {
                this.previous = current;
                return current;
            }

            this.gramQueue.add(current);

            final ListIterator<org.apache.lucene.analysis.Token> lookahead = this.nextQueue.listIterator();
            org.apache.lucene.analysis.Token gram = current;
            while (isCommon(gram)) {
                // A gram with the preceding uncommon term goes to the front of the queue.
                if (this.previous != null && !isCommon(this.previous)) {
                    this.gramQueue.addFirst(gramToken(this.previous, gram));
                }

                final org.apache.lucene.analysis.Token following = peekNext(lookahead);
                if (following == null) {
                    break;
                }

                // Extend the gram with the following token and queue it at the back.
                gram = gramToken(gram, following);
                this.gramQueue.addLast(gram);
            }

            this.previous = current;
            return this.gramQueue.removeFirst();
        }

        /** Tells whether the token's text is in the common-term set. */
        private boolean isCommon(org.apache.lucene.analysis.Token token) {
            if (this.common == null) {
                return false;
            }
            return this.common.contains(token.termText());
        }

        /** Takes the next token, preferring read-ahead tokens over the input stream. */
        private org.apache.lucene.analysis.Token popNext() throws IOException {
            if (this.nextQueue.isEmpty()) {
                return this.input.next();
            }
            return this.nextQueue.removeFirst();
        }

        /**
         * Returns the token after the iterator's cursor without consuming it from
         * the filter: if the read-ahead queue has no further element, one is read
         * from the input and appended to the queue through the iterator.
         */
        private org.apache.lucene.analysis.Token peekNext(ListIterator<org.apache.lucene.analysis.Token> listIterator)
                throws IOException {
            if (listIterator.hasNext()) {
                return listIterator.next();
            }
            final org.apache.lucene.analysis.Token fetched = this.input.next();
            if (fetched == null) {
                return null;
            }
            // add() leaves the cursor just past the new element, which is where
            // stepping back and forward over it would leave it as well.
            listIterator.add(fetched);
            return fetched;
        }

        /**
         * Builds the gram token "first-second", spanning from the first token's
         * start offset to the second token's end offset.
         */
        private org.apache.lucene.analysis.Token gramToken(org.apache.lucene.analysis.Token token, org.apache.lucene.analysis.Token token2) {
            final String text = token.termText() + '-' + token2.termText();
            final org.apache.lucene.analysis.Token gram =
                    new org.apache.lucene.analysis.Token(text, token.startOffset(), token2.endOffset(), "gram");
            gram.setPositionIncrement(0);
            return gram;
        }
    }

    /**
     * Creates an instance whose common terms are loaded from (or found cached
     * on) the given configuration.
     *
     * @param configuration the configuration to read the common terms from
     */
    public CommonGrams(Configuration configuration) {
        this.init(configuration);
    }

    /**
     * Populates {@code commonTerms}, a map from a key (the first token of a
     * line, called the "field name" in the warnings below) to the set of
     * words listed for it.
     *
     * <p>A map already stored on the configuration under {@code KEY} is reused
     * as is. Otherwise the resource named by the
     * {@code analysis.common.terms.file} property is read line by line, and the
     * resulting map is stored back on the configuration once the whole file
     * has been read. Blank lines and lines starting with '#' are skipped.
     *
     * @param configuration source of the cached map and of the terms file
     * @throws RuntimeException wrapping any {@link IOException} raised while
     *         reading the file
     */
    private void init(Configuration configuration) {
        this.commonTerms = (HashMap) configuration.getObject(KEY);
        if (this.commonTerms != null) {
            return;
        }
        this.commonTerms = new HashMap();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(
                    configuration.getConfResourceAsReader(configuration.get("analysis.common.terms.file")));
            String line;
            while ((line = reader.readLine()) != null) {
                String trimmed = line.trim();
                if ("".equals(trimmed) || trimmed.startsWith("#")) {
                    continue;
                }
                addCommonTermLine(trimmed);
            }
            // Cache only a completely loaded map.
            configuration.setObject(KEY, this.commonTerms);
        } catch (IOException e) {
            // Same message as before, but keep the original exception as the cause.
            throw new RuntimeException(e.toString(), e);
        } finally {
            // The reader used to be left open on every path.
            if (reader != null) {
                try {
                    reader.close();
                } catch (IOException closeFailure) {
                    // The content has already been read (or a read failure is
                    // already propagating); a failed close must not mask that.
                    if (LOG.isWarnEnabled()) {
                        LOG.warn("Could not close common terms file: " + closeFailure);
                    }
                }
            }
        }
    }

    /**
     * Parses one non-blank, non-comment line of the common terms file and
     * records it in {@code commonTerms}. The first token is the key; all
     * remaining tokens are joined with '-' to form the word. Lines with fewer
     * than two tokens are reported at warn level and ignored.
     */
    private void addCommonTermLine(String line) throws IOException {
        NutchDocumentTokenizer tokenizer = new NutchDocumentTokenizer(new StringReader(line));

        org.apache.lucene.analysis.Token fieldToken = tokenizer.next();
        if (fieldToken == null) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Line does not contain a field name: " + line);
            }
            return;
        }
        String field = fieldToken.termText();

        org.apache.lucene.analysis.Token wordToken = tokenizer.next();
        if (wordToken == null) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Line contains only a field name, no word: " + line);
            }
            return;
        }

        // Multi-token words are joined with '-', the same character as the
        // class's SEPARATOR constant that arity() counts.
        String word = wordToken.termText();
        for (org.apache.lucene.analysis.Token extra = tokenizer.next(); extra != null; extra = tokenizer.next()) {
            word = word + '-' + extra.termText();
        }

        HashSet words = (HashSet) this.commonTerms.get(field);
        if (words == null) {
            words = new HashSet();
            this.commonTerms.put(field, words);
        }
        words.add(word);
    }

    /**
     * Wraps the given stream in a {@link Filter} that uses the common terms
     * registered under the given key.
     *
     * @param tokenStream the stream to wrap
     * @param str         key used to look up the common-term set; when nothing
     *                    is registered for it the filter receives null
     * @return the wrapping filter
     */
    public TokenFilter getFilter(TokenStream tokenStream, String str) {
        final HashSet termsForKey = (HashSet) this.commonTerms.get(str);
        return new Filter(tokenStream, termsForKey);
    }

    /**
     * Rewrites a phrase using the grams produced by {@link #getFilter}.
     *
     * <p>The phrase's terms are run through the filter. For every position,
     * the last token emitted at that position is kept (tokens with a position
     * increment of zero replace the one before them). Processing stops early
     * once the current position plus the number of separators in the current
     * token equals the number of terms in the phrase.
     *
     * @param phrase the phrase to optimize
     * @param str    key selecting the common-term set, as for {@link #getFilter}
     * @return the kept token texts, in order
     * @throws RuntimeException wrapping any {@link IOException} raised by the filter
     */
    public String[] optimizePhrase(Query.Phrase phrase, String str) {
        if (LOG.isTraceEnabled()) {
            LOG.trace("Optimizing " + phrase + " for " + str);
        }
        ArrayList<String> result = new ArrayList<String>();
        TokenFilter filter = getFilter(new ArrayTokens(phrase), str);
        org.apache.lucene.analysis.Token previous = null;
        int position = 0;
        try {
            org.apache.lucene.analysis.Token token;
            while ((token = filter.next()) != null) {
                // A token that moves to a new position finalises the one before it.
                if (token.getPositionIncrement() != 0 && previous != null) {
                    result.add(previous.termText());
                }
                previous = token;
                position += token.getPositionIncrement();
                // Stop once this token reaches the end of the phrase.
                if (position + arity(token.termText()) == phrase.getTerms().length) {
                    break;
                }
            }
        } catch (IOException e) {
            // Same message as before, but keep the original exception as the cause.
            throw new RuntimeException(e.toString(), e);
        }
        if (previous != null) {
            result.add(previous.termText());
        }
        return result.toArray(new String[result.size()]);
    }

    /**
     * Counts the separator characters in the given text, looking only from
     * index 1 onward (a separator in the very first position is not counted).
     *
     * @param str the token text to inspect
     * @return the number of separators found
     */
    private int arity(String str) {
        int separators = 0;
        for (int at = str.indexOf(SEPARATOR, 1); at != -1; at = str.indexOf(SEPARATOR, at + 1)) {
            separators++;
        }
        return separators;
    }

    /**
     * Command-line demo: joins the arguments with spaces, prints every token
     * the "url" filter produces for that text, then prints the optimized form
     * of the arguments treated as a phrase.
     *
     * @param strArr the words to analyse
     * @throws Exception if tokenizing or configuration loading fails
     */
    public static void main(String[] strArr) throws Exception {
        StringBuffer text = new StringBuffer();
        for (int k = 0; k < strArr.length; k++) {
            text.append(strArr[k]).append(' ');
        }

        TokenStream tokens = new NutchDocumentTokenizer(new StringReader(text.toString()));
        CommonGrams grams = new CommonGrams(NutchConfiguration.create());
        TokenFilter filter = grams.getFilter(tokens, "url");

        for (org.apache.lucene.analysis.Token t = filter.next(); t != null; t = filter.next()) {
            System.out.println("Token: " + t);
        }

        String[] optimized = grams.optimizePhrase(new Query.Phrase(strArr), "url");
        System.out.print("Optimized: ");
        for (int k = 0; k < optimized.length; k++) {
            System.out.print(optimized[k] + " ");
        }
        System.out.println();
    }
}
