package de.dfki.km.perspecting.obie.model;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.Reader;
import java.io.StringReader;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashSet;
import java.util.List;
import java.util.Random;
import java.util.Set;
import java.util.regex.Pattern;

/* loaded from: input_file:de/dfki/km/perspecting/obie/model/LabeledTextCorpus.class */
/**
 * A {@link TextCorpus} whose documents are token-per-line files carrying a label column,
 * plus the logic to turn labelled mentions into feature vectors.
 *
 * <p>Input format, as read by this class: every non-blank line is split on a single space
 * into columns (index {@link #WORD}, {@link #POS}, {@link #PHRASE}, {@link #LABEL});
 * blank lines separate sentences. A row whose label column differs from
 * {@link #OUTSIDE_ANY_LABEL} belongs to a mention; consecutive such rows form one mention.
 *
 * <p>Output format of {@link #toFeatureFormat}: one line per mention,
 * {@code <documentId>_<sentenceIndex> <label> <feature> <feature> ...}.
 */
public class LabeledTextCorpus extends TextCorpus {
    /** Column index of the surface word. */
    static final int WORD = 0;
    /** Column index of the part-of-speech tag. */
    static final int POS = 1;
    /** Column index of the phrase/chunk tag (not read by this class). */
    static final int PHRASE = 2;
    /** Column index of the label. */
    static final int LABEL = 3;
    /** Label value of rows that belong to no mention. */
    static final String OUTSIDE_ANY_LABEL = "O";

    /** Exact number of columns a row must have to contribute context features. */
    private static final int COLUMN_COUNT = 4;
    /** Number of leading label characters stripped before a label is written (e.g. "B-"). */
    private static final int LABEL_PREFIX_LENGTH = 2;
    /** Number of leading POS-tag characters compared against the configured POS prefixes. */
    private static final int POS_PREFIX_LENGTH = 2;
    /** Number of trailing word characters used for the suffix feature. */
    private static final int SUFFIX_LENGTH = 3;

    // NOTE(review): this constant was named PREPOSITION in the decompiled source, but "PRP" is the
    // personal-pronoun tag in the Penn Treebank tag set - confirm which one was intended.
    private static final String PRP_TAG_PREFIX = "PRP";
    private static final String SUFFIX = "s:";
    /** Separator between the items of one n-gram feature. */
    private static final String NGRAM_SEPARATOR = ",";
    private static final String IN = "i:";
    private static final String POST = "p:";
    private static final String PRE = "a:";
    private static final String NEWLINE = "\n";
    private static final String SPACE = " ";

    private static final String CAPS = "[\\p{Lu}]";
    private static final String ALPHA = "[\\p{Lu}\\p{Ll}]";
    private static final String PUNT = "[,\\.;:?!()]";
    private static final String QUOTE = "[\"`']";
    private static final Pattern MIXEDCAPS = Pattern.compile("[A-Z][a-z]+[A-Z][A-Za-z]*");
    private static final Pattern ALLDIGITS = Pattern.compile("[0-9]+");
    private static final Pattern NUMERICAL = Pattern.compile("[-0-9]+[\\.,]+[0-9\\.,]+");
    private static final Pattern ROMAN = Pattern.compile("[ivxdlcm]+|[IVXDLCM]+");
    private static final Pattern MULTIDOTS = Pattern.compile("\\.\\.+");
    private static final Pattern ABBR = Pattern.compile(ALPHA + ALPHA + "+\\.");
    private static final Pattern LONELYINITIAL = Pattern.compile(CAPS + "\\.");
    private static final Pattern SINGLECHAR = Pattern.compile(ALPHA);
    private static final Pattern CAPLETTER = Pattern.compile("[A-Z]");
    private static final Pattern PUNC = Pattern.compile(PUNT);
    private static final Pattern QUOTES = Pattern.compile(QUOTE + ALPHA + "?");
    private static final Pattern ENUM = Pattern.compile("[0-9]+[" + PUNT + "a-z]+");
    private static final Pattern NUMRANGE = Pattern.compile("[0-9]+-[0-9]+");
    private static final Pattern DATE = Pattern.compile("[0-9]+[\\p{Punct}[0-9]+]+");

    /**
     * Word-shape classes in precedence order: the first entry whose pattern matches the whole
     * word wins. This is the reverse of the order in which the patterns used to be tested one
     * after another with "last match overwrites", so the resulting class is the same.
     */
    private static final WordShape[] WORD_SHAPES = {
        new WordShape("NUMRANGE", NUMRANGE),
        new WordShape("ENUM", ENUM),
        new WordShape("QUOTES", QUOTES),
        new WordShape("PUNC", PUNC),
        new WordShape("CAPLETTER", CAPLETTER),
        new WordShape("SINGLECHARALLCAPS", SINGLECHAR),
        new WordShape("ABBR", ABBR),
        new WordShape("LONELYINITIAL", LONELYINITIAL),
        new WordShape("MULTIDOTS", MULTIDOTS),
        new WordShape("ROMAN", ROMAN),
        new WordShape("DATE", DATE),
        new WordShape("NUMERICAL", NUMERICAL),
        new WordShape("ALLDIGITS", ALLDIGITS),
        new WordShape("MIXEDCAPS", MIXEDCAPS),
    };

    /** Shared generator for the label-vs-word sampling in {@link #scanWordContent}. */
    private static final Random RANDOM = new Random();

    /** Corpus handed to the two-argument constructor; {@code null} otherwise. */
    private TextCorpus textCorpus;

    /**
     * @param file       passed unchanged to the {@link TextCorpus} constructor
     * @param textCorpus corpus to associate with this labelled corpus; only stored and
     *                   returned by {@link #getTextCorpus()}
     */
    public LabeledTextCorpus(File file, TextCorpus textCorpus) {
        super(file);
        this.textCorpus = textCorpus;
    }

    /**
     * Creates a labelled corpus without an associated text corpus.
     *
     * @param file passed unchanged to the {@link TextCorpus} constructor
     */
    public LabeledTextCorpus(File file) {
        super(file);
    }

    /**
     * @return the corpus given to the two-argument constructor, or {@code null} if the
     *         single-argument constructor was used
     */
    public TextCorpus getTextCorpus() {
        return this.textCorpus;
    }

    /**
     * Returns the raw labelled text of the document(s) whose identifier equals
     * {@code documentId}.
     *
     * <p>Every procedure invocation made by {@code forEach} is inspected; the lines of each
     * matching document are appended, each followed by {@code "\n"}. If nothing matches, the
     * returned reader is empty.
     *
     * @param documentId identifier to look for; must not be {@code null}
     * @return an in-memory reader over the collected text
     * @throws Exception whatever {@code forEach} or reading a document throws
     */
    public Reader getGroundTruth(final String documentId) throws Exception {
        final StringBuilder text = new StringBuilder();
        forEach(new DocumentProcedure<String>() {
            @Override
            public String process(Reader reader, String candidateId) throws Exception {
                if (documentId.equals(candidateId)) {
                    BufferedReader lines = new BufferedReader(reader);
                    for (String line = lines.readLine(); line != null; line = lines.readLine()) {
                        text.append(line).append(LabeledTextCorpus.NEWLINE);
                    }
                }
                return null;
            }
        });
        return new StringReader(text.toString());
    }

    /**
     * Writes one feature line per labelled mention of the corpus to {@code file} and returns a
     * reader over the written file.
     *
     * <p>A mention is a maximal run of rows (within one sentence) whose label is not
     * {@link #OUTSIDE_ANY_LABEL}; rows with fewer than four columns are ignored. The written
     * label is the label of the mention's first row without its first two characters
     * (assumes a two-character scheme prefix such as "B-" - TODO confirm against the corpus).
     *
     * <p>Differences to the previous behaviour: the writer is now closed even when processing
     * fails; a document that does not end with a blank line no longer loses its last sentence;
     * surplus blank lines no longer create an empty pseudo token; a mention that ends a
     * sentence is written instead of being dropped; malformed short rows are skipped instead
     * of raising {@link ArrayIndexOutOfBoundsException}.
     *
     * @param file             destination file, overwritten; written and re-read with the
     *                         platform default charset ({@link FileWriter}/{@link FileReader})
     * @param ngramSizes       n-gram sizes applied to the context features; only read when
     *                         {@code contextFeatures} is {@code true}
     * @param contextFeatures  whether to emit n-grams over the rows before ("a:") and after
     *                         ("p:") the mention
     * @param mentionFeatures  whether to emit unigram features ("i:") for the mention rows
     * @param wordShapes       whether words matching one of the shape patterns are replaced
     *                         by the shape name
     * @param labelProbability for a labelled context row, the probability with which its label
     *                         is used as feature instead of its word
     * @param window           context rows at a distance smaller than this value from the
     *                         mention are considered
     * @param posPrefixes      two-character POS-tag prefixes; unlabelled context rows are only
     *                         kept if their POS tag starts with one of them
     * @return a reader over {@code file}; the caller is responsible for closing it
     * @throws Exception whatever {@code forEach}, reading a document, or writing throws
     */
    public Reader toFeatureFormat(File file, final int[] ngramSizes, final boolean contextFeatures, final boolean mentionFeatures, final boolean wordShapes, final double labelProbability, final int window, final String... posPrefixes) throws Exception {
        final BufferedWriter out = new BufferedWriter(new FileWriter(file));
        try {
            forEach(new DocumentProcedure<String>() {
                @Override
                public String process(Reader reader, String documentId) throws Exception {
                    List<List<String[]>> sentences = LabeledTextCorpus.readSentences(reader);
                    for (int s = 0; s < sentences.size(); s++) {
                        List<String[]> sentence = sentences.get(s);
                        // Row indices of the mention currently being collected.
                        List<Integer> mention = new ArrayList<Integer>();
                        for (int t = 0; t < sentence.size(); t++) {
                            String[] row = sentence.get(t);
                            if (row.length <= LabeledTextCorpus.LABEL) {
                                // No label column: neither part of a mention nor a boundary.
                                continue;
                            }
                            if (!LabeledTextCorpus.OUTSIDE_ANY_LABEL.equals(row[LabeledTextCorpus.LABEL])) {
                                mention.add(Integer.valueOf(t));
                            } else if (!mention.isEmpty()) {
                                writeMention(sentences, s, mention, documentId);
                                mention.clear();
                            }
                        }
                        // A mention may run up to the end of the sentence.
                        if (!mention.isEmpty()) {
                            writeMention(sentences, s, mention, documentId);
                        }
                    }
                    return null;
                }

                /** Extracts the features of one mention and appends its line to the output. */
                private void writeMention(List<List<String[]>> sentences, int sentenceIndex, List<Integer> mention, String documentId) throws IOException {
                    List<String[]> sentence = sentences.get(sentenceIndex);
                    List<String> features = LabeledTextCorpus.this.extractFeatures(mention, sentence, ngramSizes, contextFeatures, mentionFeatures, wordShapes, labelProbability, window, posPrefixes);
                    // NOTE(review): for every directly following sentence that starts with a
                    // "PRP..." tag the features of the SAME sentence are extracted again (this
                    // is what the original did). It may have been meant to use the following
                    // sentence instead - confirm before changing.
                    for (int next = sentenceIndex + 1; next < sentences.size() && LabeledTextCorpus.startsWithPrpTag(sentences.get(next)); next++) {
                        features.addAll(LabeledTextCorpus.this.extractFeatures(mention, sentence, ngramSizes, contextFeatures, mentionFeatures, wordShapes, labelProbability, window, posPrefixes));
                    }
                    String firstLabel = sentence.get(mention.get(0).intValue())[LabeledTextCorpus.LABEL];
                    LabeledTextCorpus.this.serializeExample(out, firstLabel.substring(LabeledTextCorpus.LABEL_PREFIX_LENGTH), documentId + "_" + sentenceIndex, features);
                }
            });
        } finally {
            out.close();
        }
        return new FileReader(file);
    }

    /**
     * Splits a document into sentences of rows. Non-blank lines are split on a single space;
     * a blank line closes the current sentence; blank lines while no sentence is open are
     * ignored; a sentence still open at end of input is kept.
     */
    private static List<List<String[]>> readSentences(Reader reader) throws IOException {
        BufferedReader lines = new BufferedReader(reader);
        List<List<String[]>> sentences = new ArrayList<List<String[]>>();
        List<String[]> current = new ArrayList<String[]>();
        for (String line = lines.readLine(); line != null; line = lines.readLine()) {
            if (line.length() != 0) {
                current.add(line.split(SPACE));
            } else if (!current.isEmpty()) {
                sentences.add(current);
                current = new ArrayList<String[]>();
            }
        }
        if (!current.isEmpty()) {
            sentences.add(current);
        }
        return sentences;
    }

    /** Tells whether the POS tag of the sentence's first row starts with "PRP". */
    private static boolean startsWithPrpTag(List<String[]> sentence) {
        if (sentence.isEmpty()) {
            return false;
        }
        String[] first = sentence.get(0);
        return first.length > POS && first[POS].startsWith(PRP_TAG_PREFIX);
    }

    /**
     * Appends one example line {@code <str2> <str> <feature>...} followed by a newline.
     * Nothing is written when {@code list} is empty.
     *
     * <p>NOTE(review): the decompiler reports this method as originally private; it is kept
     * public so that existing callers, if any, keep compiling.
     *
     * @param writer destination
     * @param str    label of the example
     * @param str2   identifier of the example
     * @param list   features of the example
     * @throws IOException if writing fails
     */
    public void serializeExample(Writer writer, String str, String str2, List<String> list) throws IOException {
        if (list.isEmpty()) {
            return;
        }
        writer.append(str2);
        writer.append(SPACE);
        writer.append(str);
        for (String feature : list) {
            writer.append(SPACE);
            writer.append(feature);
        }
        writer.append(NEWLINE);
    }

    /**
     * Builds the feature list of one mention.
     *
     * <p>Rows before the mention feed the "a:" n-grams, rows after it the "p:" n-grams (both
     * only for rows closer than {@code i} rows to the mention, and only if {@code z}); the rows
     * from the first to the last mention index feed the "i:" unigrams (only if {@code z2}).
     * The features of a single row come from a {@link HashSet}, so their relative order is
     * that set's iteration order.
     *
     * <p>NOTE(review): the decompiler reports this method as originally private; it is kept
     * public so that existing callers, if any, keep compiling.
     *
     * @param list   ascending row indices of the mention within {@code list2}; an empty list
     *               yields an empty result
     * @param list2  rows of the sentence
     * @param iArr   n-gram sizes for the context features; only read when {@code z} is true
     * @param z      whether to emit context n-grams
     * @param z2     whether to emit mention unigrams
     * @param z3     whether words are replaced by their word-shape class
     * @param d      probability of using the label of a labelled context row as its feature
     * @param i      context window (exclusive distance bound)
     * @param strArr two-character POS-tag prefixes selecting unlabelled context rows
     * @return a new, mutable list of features
     */
    public List<String> extractFeatures(List<Integer> list, List<String[]> list2, int[] iArr, boolean z, boolean z2, boolean z3, double d, int i, String... strArr) {
        List<String> features = new ArrayList<String>();
        if (list.isEmpty()) {
            return features;
        }
        List<String> before = new ArrayList<String>();
        List<String> inside = new ArrayList<String>();
        List<String> after = new ArrayList<String>();
        Set<String> posPrefixes = new HashSet<String>(Arrays.asList(strArr));
        int first = list.get(0).intValue();
        int last = list.get(list.size() - 1).intValue();
        for (int idx = 0; idx < list2.size(); idx++) {
            String[] row = list2.get(idx);
            if (idx < first) {
                if (idx - first > -i) {
                    before.addAll(scanWordContent(d, posPrefixes, row, z3));
                }
            } else if (idx <= last) {
                inside.addAll(scanWordSyntax(row, z3, true));
            } else if (idx - last < i) {
                after.addAll(scanWordContent(d, posPrefixes, row, z3));
            }
        }
        if (z) {
            for (int n : iArr) {
                features.addAll(calculateNgrams(n, before, PRE));
                features.addAll(calculateNgrams(n, after, POST));
            }
        }
        if (z2) {
            features.addAll(calculateNgrams(1, inside, IN));
        }
        return features;
    }

    /**
     * Not implemented.
     *
     * @param labeledTextCorpus ignored
     * @return always {@code null}
     */
    public Reader compare(LabeledTextCorpus labeledTextCorpus) {
        return null;
    }

    /**
     * Features of one context row. Rows that do not have exactly four columns yield nothing.
     * An unlabelled row yields its word features only if the first two characters of its POS
     * tag are in {@code posPrefixes}. A labelled row yields its label with probability
     * {@code labelProbability}; otherwise its word features, provided the label is longer than
     * one character.
     *
     * <p>The former second branch for unlabelled rows ({@code label.length() > 1} while the
     * label equals "O") could never be taken and has been removed.
     */
    private static Collection<String> scanWordContent(double labelProbability, Set<String> posPrefixes, String[] row, boolean wordShapes) {
        Set<String> features = new HashSet<String>();
        if (row.length != COLUMN_COUNT) {
            return features;
        }
        String label = row[LABEL];
        if (OUTSIDE_ANY_LABEL.equals(label)) {
            String pos = row[POS];
            if (pos.length() >= POS_PREFIX_LENGTH && posPrefixes.contains(pos.substring(0, POS_PREFIX_LENGTH))) {
                features.addAll(scanWordSyntax(row, wordShapes, false));
            }
        } else if (RANDOM.nextDouble() <= labelProbability) {
            features.add(label);
        } else if (label.length() > 1) {
            features.addAll(scanWordSyntax(row, wordShapes, false));
        }
        return features;
    }

    /**
     * Slides a window of {@code min(n, items.size())} items over {@code items} and returns, for
     * every position, {@code prefix} followed by the window's items joined with ",".
     * Returns an empty list if that window size is not positive.
     */
    private static List<String> calculateNgrams(int n, List<String> items, String prefix) {
        List<String> ngrams = new ArrayList<String>();
        int width = Math.min(items.size(), n);
        if (width <= 0) {
            return ngrams;
        }
        for (int end = width; end <= items.size(); end++) {
            StringBuilder gram = new StringBuilder();
            for (int k = end - width; k < end; k++) {
                if (k > end - width) {
                    gram.append(NGRAM_SEPARATOR);
                }
                gram.append(items.get(k));
            }
            ngrams.add(prefix + gram);
        }
        return ngrams;
    }

    /**
     * Features derived from the word column of a row: the word-shape name if
     * {@code wordShapes} is set and a shape matches; otherwise the word itself, plus - if
     * {@code withSuffix} is set and the word is longer than three characters - "s:" followed
     * by its last three characters. A row without any column yields nothing.
     */
    private static Collection<String> scanWordSyntax(String[] row, boolean wordShapes, boolean withSuffix) {
        Set<String> features = new HashSet<String>();
        if (row.length == 0) {
            // e.g. a line consisting of a single space splits into an empty array
            return features;
        }
        String word = row[WORD];
        String shape = wordShapes ? shapeOf(word) : null;
        if (shape != null) {
            features.add(shape);
            return features;
        }
        if (withSuffix && word.length() > SUFFIX_LENGTH) {
            features.add(SUFFIX + word.substring(word.length() - SUFFIX_LENGTH));
        }
        features.add(word);
        return features;
    }

    /** Returns the name of the highest-precedence shape matching the whole word, or {@code null}. */
    private static String shapeOf(String word) {
        for (WordShape shape : WORD_SHAPES) {
            if (shape.pattern.matcher(word).matches()) {
                return shape.name;
            }
        }
        return null;
    }

    /** A named word-shape class and the pattern a whole word must match to belong to it. */
    private static final class WordShape {
        final String name;
        final Pattern pattern;

        WordShape(String name, Pattern pattern) {
            this.name = name;
            this.pattern = pattern;
        }
    }
}
