package edu.stanford.nlp.pipeline;

import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.ling.MultiTokenTag;
import edu.stanford.nlp.ling.tokensregex.EnvLookup;
import edu.stanford.nlp.pipeline.Annotator;
import edu.stanford.nlp.util.ArrayCoreMap;
import edu.stanford.nlp.util.ArrayMap;
import edu.stanford.nlp.util.CollectionValuedMap;
import edu.stanford.nlp.util.CoreMap;
import edu.stanford.nlp.util.Pair;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.util.XMLUtils;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/pipeline/CleanXmlAnnotator.class */
/**
 * Annotator that removes XML tag tokens from an already tokenized document and
 * turns selected tags into annotations on the remaining tokens or on the
 * document itself.
 *
 * <p>Every token whose word parses as an XML tag (via {@link XMLUtils#parseTag})
 * is dropped from the token list. Tokens that are not tags are kept only while
 * at least one enclosing open tag matches the "tags to keep" pattern (or when
 * that pattern is absent or matches the empty string). The text of removed
 * tokens is folded into the before/after annotations of neighbouring kept
 * tokens when those annotations are present.
 *
 * <p>Instances are configured through the constructor and the setters; the
 * setters mutate plain fields without synchronization.
 */
public class CleanXmlAnnotator implements Annotator {
    /** Tags whose enclosed text is kept; {@code null} disables tag tracking altogether. */
    private final Pattern xmlTagMatcher;
    /** Default "tags to keep" regex used by the no-argument constructor. */
    public static final String DEFAULT_XML_TAGS = ".*";
    /** Tags that force a sentence end on the last token kept before them; may be {@code null}. */
    private final Pattern sentenceEndingTagMatcher;
    /** Default sentence-ending tags regex (empty, i.e. none) used by the no-argument constructor. */
    public static final String DEFAULT_SENTENCE_ENDERS = "";
    /** Tags whose content is flagged via {@code ForcedSentenceUntilEndAnnotation}; may be {@code null}. */
    private Pattern singleSentenceTagMatcher;
    /** Tags whose directly enclosed tokens form the document date; may be {@code null}. */
    private final Pattern dateTagMatcher;
    /** Default date tags regex used by the no-argument constructor. */
    public static final String DEFAULT_DATE_TAGS = "datetime|date";
    /** Tags whose directly enclosed tokens form the document id; may be {@code null}. */
    private Pattern docIdTagMatcher;
    // NOTE(review): the following DEFAULT_* constants are not referenced inside this
    // class; presumably they are read by the code that configures the annotator.
    public static final String DEFAULT_DOCID_TAGS = "docid";
    /** Tags whose directly enclosed tokens form the document type; may be {@code null}. */
    private Pattern docTypeTagMatcher;
    public static final String DEFAULT_DOCTYPE_TAGS = "doctype";
    /** Tags that open and close an utterance turn; may be {@code null}. */
    private Pattern utteranceTurnTagMatcher;
    public static final String DEFAULT_UTTERANCE_TURN_TAGS = "turn";
    /** Tags whose enclosed tokens name the current speaker; may be {@code null}. */
    private Pattern speakerTagMatcher;
    public static final String DEFAULT_SPEAKER_TAGS = "speaker";
    /** Annotation key to (tag pattern, attribute pattern) pairs applied to the document. */
    private CollectionValuedMap<Class, Pair<Pattern, Pattern>> docAnnotationPatterns;
    public static final String DEFAULT_DOC_ANNOTATIONS_PATTERNS = "docID=doc[id],doctype=doc[type],docsourcetype=doctype[source]";
    /** Annotation key to (tag pattern, attribute pattern) pairs applied to enclosed tokens. */
    private CollectionValuedMap<Class, Pair<Pattern, Pattern>> tokenAnnotationPatterns;
    /** Tags that delimit a section; may be {@code null}. */
    private Pattern sectionTagMatcher;
    /** Words that are ignored when collecting section tokens; may be {@code null}. */
    private Pattern ssplitDiscardTokensMatcher;
    /** Annotation key to (tag pattern, optional attribute pattern) pairs applied to sections. */
    private CollectionValuedMap<Class, Pair<Pattern, Pattern>> sectionAnnotationPatterns;
    /** When {@code false}, mismatched or unclosed tags raise {@link IllegalArgumentException}. */
    private final boolean allowFlawedXml;
    /** Default value of the "allow flawed XML" switch used by the no-argument constructor. */
    public static final boolean DEFAULT_ALLOW_FLAWS = true;
    public static final String DEFAULT_SINGLE_SENTENCE_TAGS = null;
    public static final String DEFAULT_TOKEN_ANNOTATIONS_PATTERNS = null;
    public static final String DEFAULT_SECTION_TAGS = null;
    public static final String DEFAULT_SECTION_ANNOTATIONS_PATTERNS = null;
    /** Splits a {@code tag[attribute]} specification into its tag and attribute parts. */
    private static final Pattern TAG_ATTR_PATTERN = Pattern.compile("(.*)\\[(.*)\\]");

    /** Creates an annotator with the default tag, sentence-ender, date and flaw settings. */
    public CleanXmlAnnotator() {
        this(DEFAULT_XML_TAGS, DEFAULT_SENTENCE_ENDERS, DEFAULT_DATE_TAGS, DEFAULT_ALLOW_FLAWS);
    }

    /**
     * Creates an annotator.
     *
     * @param xmlTagsToRemove    regex of tag names whose enclosed text is kept; {@code null}
     *                           disables tag tracking (and the sentence-ending tags with it)
     * @param sentenceEndingTags regex of tag names that force a sentence end; ignored when
     *                           {@code null}, empty, or when {@code xmlTagsToRemove} is {@code null}
     * @param dateTags           regex of tag names enclosing the document date; may be {@code null}
     * @param allowFlawedXml     whether mismatched or unclosed tags are tolerated
     */
    public CleanXmlAnnotator(String xmlTagsToRemove, String sentenceEndingTags, String dateTags, boolean allowFlawedXml) {
        this.singleSentenceTagMatcher = null;
        this.utteranceTurnTagMatcher = null;
        this.speakerTagMatcher = null;
        this.docAnnotationPatterns = new CollectionValuedMap<>();
        this.tokenAnnotationPatterns = new CollectionValuedMap<>();
        this.sectionTagMatcher = null;
        this.ssplitDiscardTokensMatcher = null;
        this.sectionAnnotationPatterns = new CollectionValuedMap<>();
        this.allowFlawedXml = allowFlawedXml;
        if (xmlTagsToRemove != null) {
            this.xmlTagMatcher = toCaseInsensitivePattern(xmlTagsToRemove);
            if (sentenceEndingTags != null && sentenceEndingTags.length() > 0) {
                this.sentenceEndingTagMatcher = toCaseInsensitivePattern(sentenceEndingTags);
            } else {
                this.sentenceEndingTagMatcher = null;
            }
        } else {
            this.xmlTagMatcher = null;
            this.sentenceEndingTagMatcher = null;
        }
        this.dateTagMatcher = toCaseInsensitivePattern(dateTags);
    }

    /** Compiles {@code regex} case-insensitively; returns {@code null} for a {@code null} regex. */
    private static Pattern toCaseInsensitivePattern(String regex) {
        return regex == null ? null : Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
    }

    /** Sets the regex of words ignored when collecting section tokens ({@code null} clears it). */
    public void setSsplitDiscardTokensMatcher(String tags) {
        this.ssplitDiscardTokensMatcher = toCaseInsensitivePattern(tags);
    }

    /** Sets the regex of tags whose content is flagged as a single sentence ({@code null} clears it). */
    public void setSingleSentenceTagMatcher(String tags) {
        this.singleSentenceTagMatcher = toCaseInsensitivePattern(tags);
    }

    /** Sets the regex of tags enclosing the document id ({@code null} clears it). */
    public void setDocIdTagMatcher(String tags) {
        this.docIdTagMatcher = toCaseInsensitivePattern(tags);
    }

    /** Sets the regex of tags enclosing the document type ({@code null} clears it). */
    public void setDocTypeTagMatcher(String tags) {
        this.docTypeTagMatcher = toCaseInsensitivePattern(tags);
    }

    /** Sets the regex of tags delimiting sections ({@code null} clears it). */
    public void setSectionTagMatcher(String tags) {
        this.sectionTagMatcher = toCaseInsensitivePattern(tags);
    }

    /**
     * Sets the discourse tag patterns.
     *
     * @param utteranceTurnTags regex of tags delimiting an utterance turn; may be {@code null}
     * @param speakerTags       regex of tags enclosing the speaker name; may be {@code null}
     */
    public void setDiscourseTags(String utteranceTurnTags, String speakerTags) {
        this.utteranceTurnTagMatcher = toCaseInsensitivePattern(utteranceTurnTags);
        this.speakerTagMatcher = toCaseInsensitivePattern(speakerTags);
    }

    /**
     * Replaces the document annotation patterns. Every entry must have the form
     * {@code key=tag[attribute]}.
     *
     * @throws IllegalArgumentException if an entry is malformed or its key cannot be resolved
     */
    public void setDocAnnotationPatterns(String conf) {
        this.docAnnotationPatterns.clear();
        addAnnotationPatterns(this.docAnnotationPatterns, conf, true);
    }

    /**
     * Replaces the token annotation patterns. Every entry must have the form
     * {@code key=tag[attribute]}.
     *
     * @throws IllegalArgumentException if an entry is malformed or its key cannot be resolved
     */
    public void setTokenAnnotationPatterns(String conf) {
        this.tokenAnnotationPatterns.clear();
        addAnnotationPatterns(this.tokenAnnotationPatterns, conf, true);
    }

    /**
     * Replaces the section annotation patterns. Entries have the form
     * {@code key=tag[attribute]} or {@code key=tag}; the latter takes the value from
     * the text enclosed by the tag.
     *
     * @throws IllegalArgumentException if an entry is malformed or its key cannot be resolved
     */
    public void setSectionAnnotationPatterns(String conf) {
        this.sectionAnnotationPatterns.clear();
        addAnnotationPatterns(this.sectionAnnotationPatterns, conf, false);
    }

    /**
     * Parses a comma separated list of {@code key=tagSpec} entries and adds the
     * resulting patterns to {@code annotationPatterns}.
     *
     * @param annotationPatterns map receiving the parsed patterns
     * @param conf               the configuration string; {@code null} or blank adds nothing
     * @param attrOnly           when {@code true}, a tag spec without {@code [attribute]} is rejected
     * @throws IllegalArgumentException if an entry has no {@code =}, its key cannot be
     *                                  resolved, or it lacks an attribute while {@code attrOnly} is set
     */
    private void addAnnotationPatterns(CollectionValuedMap<Class, Pair<Pattern, Pattern>> annotationPatterns, String conf, boolean attrOnly) {
        if (conf == null) {
            return;
        }
        String trimmed = conf.trim();
        if (trimmed.isEmpty()) {
            // A blank configuration means "no patterns" rather than one malformed entry.
            return;
        }
        for (String entry : trimmed.split("\\s*,\\s*")) {
            String[] keyAndSpec = entry.split("\\s*=\\s*", 2);
            if (keyAndSpec.length != 2) {
                throw new IllegalArgumentException("Invalid annotation to tag pattern: " + entry);
            }
            String annotationName = keyAndSpec[0];
            String tagSpec = keyAndSpec[1];
            Class annotationKey = EnvLookup.lookupAnnotationKey(null, annotationName);
            if (annotationKey == null) {
                throw new IllegalArgumentException("Cannot resolve annotation key " + annotationName);
            }
            Matcher tagAttr = TAG_ATTR_PATTERN.matcher(tagSpec);
            if (tagAttr.matches()) {
                annotationPatterns.add(annotationKey, Pair.makePair(toCaseInsensitivePattern(tagAttr.group(1)), toCaseInsensitivePattern(tagAttr.group(2))));
            } else {
                if (attrOnly) {
                    throw new IllegalArgumentException("Invalid tag pattern: " + tagSpec + " for annotation key " + annotationName);
                }
                annotationPatterns.add(annotationKey, Pair.makePair(toCaseInsensitivePattern(tagSpec), (Pattern) null));
            }
        }
    }

    /**
     * Replaces the annotation's token list with the cleaned list produced by
     * {@link #process(Annotation, List)}. Does nothing when the annotation has no tokens.
     */
    @Override // edu.stanford.nlp.pipeline.Annotator
    public void annotate(Annotation annotation) {
        if (annotation.has(CoreAnnotations.TokensAnnotation.class)) {
            List<CoreLabel> tokens = annotation.get(CoreAnnotations.TokensAnnotation.class);
            annotation.set(CoreAnnotations.TokensAnnotation.class, process(annotation, tokens));
        }
    }

    /** Cleans {@code tokens} without a document annotation; see {@link #process(Annotation, List)}. */
    public List<CoreLabel> process(List<CoreLabel> tokens) {
        return process(null, tokens);
    }

    /**
     * Returns the text covered by {@code tokens}: the slice of the document text between
     * the first token's begin offset and the last token's end offset when the document
     * text and both offsets are available, otherwise the token words joined by single spaces.
     */
    private String tokensToString(Annotation annotation, List<CoreLabel> tokens) {
        if (tokens.isEmpty()) {
            return "";
        }
        String text = annotation != null ? annotation.get(CoreAnnotations.TextAnnotation.class) : null;
        if (text != null) {
            Integer begin = tokens.get(0).get(CoreAnnotations.CharacterOffsetBeginAnnotation.class);
            Integer end = tokens.get(tokens.size() - 1).get(CoreAnnotations.CharacterOffsetEndAnnotation.class);
            // Fall through to joining words when the tokens carry no character offsets.
            if (begin != null && end != null) {
                return text.substring(begin, end);
            }
        }
        return StringUtils.joinWords(tokens, " ");
    }

    /**
     * Applies the annotation patterns that match {@code tag} to {@code coreMap}.
     *
     * <p>For a pattern with an attribute part, the value of the first matching attribute of
     * the tag is set on {@code coreMap}. When {@code savedAttributes} is given, values are
     * additionally kept on a per-key stack so that a matching end tag restores the value of
     * the enclosing tag (or removes the key when none is left). For a pattern without an
     * attribute part, a start (or single) tag begins collecting tokens in
     * {@code savedTokens} and the end tag sets the collected text as the value.
     *
     * @param annotation      document used to recover the original text; may be {@code null}
     * @param coreMap         the map receiving the annotations
     * @param tag             the tag being processed
     * @param annotationPatterns key to (tag pattern, attribute pattern) pairs; {@code null} means none
     * @param savedTokens     per-key token collectors for text-valued patterns; may be {@code null}
     * @param toAnnotate      keys to consider; {@code null} means all keys of {@code annotationPatterns}
     * @param savedAttributes per-key stacks of (tag name, value); may be {@code null}
     * @return the keys that were set on {@code coreMap} by this call
     */
    private Set<Class> annotateWithTag(Annotation annotation, CoreMap coreMap, XMLUtils.XMLTag tag, CollectionValuedMap<Class, Pair<Pattern, Pattern>> annotationPatterns, Map<Class, List<CoreLabel>> savedTokens, Collection<Class> toAnnotate, Map<Class, Stack<Pair<String, String>>> savedAttributes) {
        Set<Class> foundAnnotations = new HashSet<>();
        if (annotationPatterns == null) {
            return foundAnnotations;
        }
        if (toAnnotate == null) {
            toAnnotate = annotationPatterns.keySet();
        }
        for (Class annotationClass : toAnnotate) {
            for (Pair<Pattern, Pattern> pattern : annotationPatterns.get(annotationClass)) {
                Pattern tagPattern = pattern.first;
                Pattern attrPattern = pattern.second;
                if (!tagPattern.matcher(tag.name).matches()) {
                    continue;
                }
                boolean matched = false;
                if (attrPattern != null) {
                    if (tag.attributes != null) {
                        for (Map.Entry<String, String> attribute : tag.attributes.entrySet()) {
                            if (!attrPattern.matcher(attribute.getKey()).matches()) {
                                continue;
                            }
                            if (savedAttributes != null) {
                                Stack<Pair<String, String>> saved = savedAttributes.get(annotationClass);
                                if (saved == null) {
                                    saved = new Stack<>();
                                    savedAttributes.put(annotationClass, saved);
                                }
                                saved.push(Pair.makePair(tag.name, attribute.getValue()));
                            }
                            coreMap.set(annotationClass, attribute.getValue());
                            foundAnnotations.add(annotationClass);
                            matched = true;
                            // Only the first matching attribute counts: the end tag pops a
                            // single stack entry, so pushing more than one would leave stale values.
                            break;
                        }
                    }
                    if (savedAttributes != null && tag.isEndTag) {
                        Stack<Pair<String, String>> saved = savedAttributes.get(annotationClass);
                        if (saved != null && !saved.isEmpty() && saved.peek().first.equalsIgnoreCase(tag.name)) {
                            saved.pop();
                            if (saved.isEmpty()) {
                                coreMap.remove(annotationClass);
                            } else {
                                // Restore the value contributed by the enclosing tag.
                                coreMap.set(annotationClass, saved.peek().second);
                            }
                        }
                    }
                } else if (savedTokens != null) {
                    if (tag.isEndTag && !tag.isSingleTag) {
                        List<CoreLabel> collected = savedTokens.remove(annotationClass);
                        if (collected != null && !collected.isEmpty()) {
                            coreMap.set(annotationClass, tokensToString(annotation, collected));
                            foundAnnotations.add(annotationClass);
                            matched = true;
                        }
                    } else {
                        // Start (or single) tag: begin collecting the enclosed tokens.
                        savedTokens.put(annotationClass, new ArrayList<CoreLabel>());
                    }
                }
                if (matched) {
                    break;
                }
            }
        }
        return foundAnnotations;
    }

    /** Sets {@code ForcedSentenceEndAnnotation} on the last element of {@code tokens}, if any. */
    private static void markForcedSentenceEnd(List<CoreLabel> tokens) {
        if (!tokens.isEmpty()) {
            tokens.get(tokens.size() - 1).set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
        }
    }

    /** Returns whether {@code matcher} is set and matches the innermost tag of {@code context}. */
    private static boolean innermostTagMatches(Pattern matcher, List<String> context) {
        return matcher != null && !context.isEmpty() && matcher.matcher(context.get(context.size() - 1)).matches();
    }

    /**
     * Removes the XML tag tokens from {@code tokens} and annotates the remaining tokens
     * (and {@code annotation}, when given) according to the configured tag patterns.
     *
     * <p>Non-tag tokens have their word XML-unescaped and receive the list of enclosing
     * tag names as {@code XmlContextAnnotation}, whether or not they are kept.
     *
     * @param annotation the document being processed; may be {@code null}, in which case
     *                   no document-level annotations are produced
     * @param tokens     the tokens of the document, including tag tokens
     * @return a new list holding the kept tokens (the same {@link CoreLabel} instances)
     * @throws IllegalArgumentException if a close tag matches no open tag, or if tags are
     *                                  mismatched or left unclosed while flawed XML is not allowed
     */
    public List<CoreLabel> process(Annotation annotation, List<CoreLabel> tokens) {
        // Names of the currently open tags, outermost first.
        Stack<String> enclosingTags = new Stack<>();
        // Immutable snapshot of enclosingTags, rebuilt lazily after every change.
        List<String> currentTagSet = null;
        // Number of open tags that match xmlTagMatcher.
        int matchDepth = 0;
        List<CoreLabel> newTokens = new ArrayList<>();
        // Text of the tag tokens dropped since the last non-tag token.
        StringBuilder removedText = new StringBuilder();
        // Document annotation keys that have not been found yet.
        Set<Class> toAnnotate = new HashSet<>(this.docAnnotationPatterns.keySet());
        int utteranceIndex = 0;
        boolean inUtterance = false;
        boolean inSpeakerTag = false;
        String currentSpeaker = null;
        List<CoreLabel> speakerTokens = new ArrayList<>();
        List<CoreLabel> docDateTokens = new ArrayList<>();
        List<CoreLabel> docTypeTokens = new ArrayList<>();
        List<CoreLabel> docIdTokens = new ArrayList<>();
        CoreMap tokenAnnotations = (this.tokenAnnotationPatterns == null || this.tokenAnnotationPatterns.isEmpty()) ? null : new ArrayCoreMap();
        Map<Class, Stack<Pair<String, String>>> savedTokenAnnotations = new ArrayMap<>();
        XMLUtils.XMLTag sectionStartTag = null;
        CoreLabel sectionStartToken = null;
        CoreMap sectionAnnotations = null;
        Map<Class, List<CoreLabel>> savedTokensForSection = new HashMap<>();
        boolean markSingleSentence = false;
        // Loop invariant: text outside any matching tag is kept when there is no tag
        // pattern or when the pattern accepts the empty tag name.
        boolean keepUntaggedText = this.xmlTagMatcher == null || this.xmlTagMatcher.matcher("").matches();
        CoreLabel lastInputToken = tokens.isEmpty() ? null : tokens.get(tokens.size() - 1);

        for (CoreLabel token : tokens) {
            XMLUtils.XMLTag tag = XMLUtils.parseTag(token.word().trim());
            if (tag == null) {
                // ---- ordinary token ----
                token.setWord(XMLUtils.unescapeStringForXML(token.word()));
                if (matchDepth > 0 || keepUntaggedText) {
                    newTokens.add(token);
                    if (inUtterance) {
                        token.set(CoreAnnotations.UtteranceAnnotation.class, utteranceIndex);
                        if (currentSpeaker != null) {
                            token.set(CoreAnnotations.SpeakerAnnotation.class, currentSpeaker);
                        }
                    }
                    if (markSingleSentence) {
                        // Only the first token after the opening tag carries the marker.
                        token.set(CoreAnnotations.ForcedSentenceUntilEndAnnotation.class, true);
                        markSingleSentence = false;
                    }
                    if (tokenAnnotations != null) {
                        ChunkAnnotationUtils.copyUnsetAnnotations(tokenAnnotations, token);
                    }
                }
                if (removedText.length() > 0) {
                    // Fold the dropped tag text into the surrounding whitespace annotations.
                    String before = token.get(CoreAnnotations.BeforeAnnotation.class);
                    if (before != null) {
                        token.set(CoreAnnotations.BeforeAnnotation.class, removedText + before);
                        if (newTokens.size() > 1) {
                            CoreLabel previous = newTokens.get(newTokens.size() - 2);
                            String after = previous.get(CoreAnnotations.AfterAnnotation.class);
                            if (after != null) {
                                previous.set(CoreAnnotations.AfterAnnotation.class, after + removedText);
                            } else {
                                previous.set(CoreAnnotations.AfterAnnotation.class, removedText.toString());
                            }
                        }
                    }
                    removedText.setLength(0);
                }
                if (currentTagSet == null) {
                    currentTagSet = Collections.unmodifiableList(new ArrayList<String>(enclosingTags));
                }
                token.set(CoreAnnotations.XmlContextAnnotation.class, currentTagSet);
                if (innermostTagMatches(this.dateTagMatcher, currentTagSet)) {
                    docDateTokens.add(token);
                }
                if (innermostTagMatches(this.docIdTagMatcher, currentTagSet)) {
                    docIdTokens.add(token);
                }
                if (innermostTagMatches(this.docTypeTagMatcher, currentTagSet)) {
                    docTypeTokens.add(token);
                }
                if (inSpeakerTag) {
                    speakerTokens.add(token);
                }
                if (sectionStartTag != null) {
                    boolean discard = this.ssplitDiscardTokensMatcher != null && this.ssplitDiscardTokensMatcher.matcher(token.word()).matches();
                    if (!discard) {
                        if (sectionStartToken == null) {
                            sectionStartToken = token;
                        }
                        for (List<CoreLabel> collector : savedTokensForSection.values()) {
                            collector.add(token);
                        }
                    }
                }
                continue;
            }

            // ---- tag token: it is dropped, but its text is remembered ----
            String before = token.get(CoreAnnotations.BeforeAnnotation.class);
            if (before != null) {
                removedText.append(before);
            }
            String originalText = token.get(CoreAnnotations.OriginalTextAnnotation.class);
            if (originalText != null) {
                removedText.append(originalText);
            }
            if (token == lastInputToken) {
                String after = token.get(CoreAnnotations.AfterAnnotation.class);
                if (after != null) {
                    removedText.append(after);
                }
            }

            // Document-level annotations need a document to be set on.
            if (annotation != null && !toAnnotate.isEmpty() && tag.attributes != null) {
                toAnnotate.removeAll(annotateWithTag(annotation, annotation, tag, this.docAnnotationPatterns, null, toAnnotate, null));
            }

            if (this.sectionTagMatcher != null && this.sectionTagMatcher.matcher(tag.name).matches()) {
                if (tag.isEndTag) {
                    // A stray section end tag (no open section) has nothing to annotate.
                    if (sectionStartTag != null) {
                        annotateWithTag(annotation, sectionAnnotations, tag, this.sectionAnnotationPatterns, savedTokensForSection, null, null);
                    }
                    if (sectionStartToken != null) {
                        sectionStartToken.set(CoreAnnotations.SectionStartAnnotation.class, sectionAnnotations);
                    }
                    if (!newTokens.isEmpty()) {
                        CoreLabel previous = newTokens.get(newTokens.size() - 1);
                        previous.set(CoreAnnotations.ForcedSentenceEndAnnotation.class, true);
                        if (sectionStartTag != null) {
                            previous.set(CoreAnnotations.SectionEndAnnotation.class, sectionStartTag.name);
                        }
                    }
                    savedTokensForSection.clear();
                    sectionStartTag = null;
                    sectionStartToken = null;
                    sectionAnnotations = null;
                } else if (!tag.isSingleTag) {
                    sectionStartTag = tag;
                    sectionAnnotations = new ArrayCoreMap();
                    sectionAnnotations.set(CoreAnnotations.SectionAnnotation.class, sectionStartTag.name);
                }
            }
            if (sectionStartTag != null) {
                annotateWithTag(annotation, sectionAnnotations, tag, this.sectionAnnotationPatterns, savedTokensForSection, null, null);
            }
            if (tokenAnnotations != null) {
                annotateWithTag(annotation, tokenAnnotations, tag, this.tokenAnnotationPatterns, null, null, savedTokenAnnotations);
            }

            if (this.sentenceEndingTagMatcher != null && this.sentenceEndingTagMatcher.matcher(tag.name).matches()) {
                markForcedSentenceEnd(newTokens);
            }

            if (this.utteranceTurnTagMatcher != null && this.utteranceTurnTagMatcher.matcher(tag.name).matches()) {
                markForcedSentenceEnd(newTokens);
                inUtterance = !(tag.isEndTag || tag.isSingleTag);
                if (inUtterance) {
                    utteranceIndex++;
                } else {
                    currentSpeaker = null;
                }
            }

            if (this.speakerTagMatcher != null && this.speakerTagMatcher.matcher(tag.name).matches()) {
                markForcedSentenceEnd(newTokens);
                inSpeakerTag = !(tag.isEndTag || tag.isSingleTag);
                if (tag.isEndTag) {
                    currentSpeaker = tokensToString(annotation, speakerTokens);
                    MultiTokenTag.Tag mentionTag = new MultiTokenTag.Tag(currentSpeaker, "Speaker", speakerTokens.size());
                    int index = 0;
                    for (CoreLabel speakerToken : speakerTokens) {
                        speakerToken.set(CoreAnnotations.SpeakerAnnotation.class, currentSpeaker);
                        speakerToken.set(CoreAnnotations.MentionTokenAnnotation.class, new MultiTokenTag(mentionTag, index));
                        index++;
                    }
                } else {
                    currentSpeaker = null;
                }
                speakerTokens.clear();
            }

            if (this.singleSentenceTagMatcher != null && this.singleSentenceTagMatcher.matcher(tag.name).matches()) {
                if (tag.isEndTag) {
                    markForcedSentenceEnd(newTokens);
                    markSingleSentence = false;
                } else if (!tag.isSingleTag) {
                    markSingleSentence = true;
                }
            }

            if (this.xmlTagMatcher != null && !tag.isSingleTag) {
                // The set of enclosing tags changes, so the cached snapshot is stale.
                currentTagSet = null;
                if (tag.isEndTag) {
                    // Unwind to the matching open tag; with flawed XML allowed, unclosed
                    // inner tags are closed implicitly on the way.
                    while (true) {
                        if (enclosingTags.isEmpty()) {
                            throw new IllegalArgumentException("Got a close tag " + tag.name + " which does not match any open tag");
                        }
                        String lastTag = enclosingTags.pop();
                        if (this.xmlTagMatcher.matcher(lastTag).matches()) {
                            matchDepth--;
                        }
                        if (lastTag.equals(tag.name)) {
                            break;
                        }
                        if (!this.allowFlawedXml) {
                            throw new IllegalArgumentException("Mismatched tags... " + tag.name + " closed a " + lastTag + " tag.");
                        }
                    }
                    if (matchDepth < 0) {
                        throw new AssertionError("Programming error?  We think there have been more close tags than open tags");
                    }
                } else {
                    enclosingTags.push(tag.name);
                    if (this.xmlTagMatcher.matcher(tag.name).matches()) {
                        matchDepth++;
                    }
                }
            }
        }

        if (!enclosingTags.isEmpty() && !this.allowFlawedXml) {
            throw new IllegalArgumentException("Unclosed tags, starting with " + enclosingTags.pop());
        }

        // Text dropped after the last kept token becomes that token's "after" text, but
        // only for tokens that carry their original text.
        if (!newTokens.isEmpty() && removedText.length() > 0) {
            CoreLabel last = newTokens.get(newTokens.size() - 1);
            if (last.get(CoreAnnotations.OriginalTextAnnotation.class) != null) {
                last.set(CoreAnnotations.AfterAnnotation.class, removedText.toString());
            }
        }

        if (annotation != null) {
            if (!docIdTokens.isEmpty()) {
                annotation.set(CoreAnnotations.DocIDAnnotation.class, tokensToString(annotation, docIdTokens).trim());
            }
            if (!docDateTokens.isEmpty()) {
                annotation.set(CoreAnnotations.DocDateAnnotation.class, tokensToString(annotation, docDateTokens).trim());
            }
            if (!docTypeTokens.isEmpty()) {
                annotation.set(CoreAnnotations.DocTypeAnnotation.class, tokensToString(annotation, docTypeTokens).trim());
            }
        }
        return newTokens;
    }

    /** Returns the single requirement {@code TOKENIZE_REQUIREMENT}. */
    @Override // edu.stanford.nlp.pipeline.Annotator
    public Set<Annotator.Requirement> requires() {
        return Collections.singleton(TOKENIZE_REQUIREMENT);
    }

    /** Returns the single requirement {@code CLEAN_XML_REQUIREMENT}. */
    @Override // edu.stanford.nlp.pipeline.Annotator
    public Set<Annotator.Requirement> requirementsSatisfied() {
        return Collections.singleton(CLEAN_XML_REQUIREMENT);
    }
}
