package de.dfki.km.email2pimo.area51.topicextraction;

import com.google.common.base.Joiner;
import com.google.common.collect.Iterables;
import com.google.common.collect.Lists;
import de.dfki.km.email2pimo.accessor.Accessor;
import de.dfki.km.email2pimo.accessor.Email;
import de.dfki.km.email2pimo.util.E2PUtilities;
import de.dfki.km.email2pimo.util.ScoredObject;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.commons.io.FileUtils;

/* loaded from: input_file:de/dfki/km/email2pimo/area51/topicextraction/TermClusteringTest.class */
public class TermClusteringTest {
    /** Directory whose entries are listed and loaded by {@code loadFiles()}. */
    private static final String INPUT_PATH = "resources/minicorpus-poker";
    /** Path of the file that the constructor opens for writing. */
    private static final String OUTPUT_FILENAME = "resources/clustering_output_mcpoker_pr0.1_ohneComposita.txt";
    /** {@code loadFiles()} stops early once its running file counter reaches this value. */
    private static final int MAX_EMAILS = 99000;
    /** Writer on {@code OUTPUT_FILENAME}; remains {@code null} when the constructor failed to open the file. */
    private BufferedWriter output;
    /** Set from {@code dictDocs.size()} in {@code createIntMatrix()}. */
    int nrDocs;
    /** Set from {@code dictTerms.size()} in {@code createIntMatrix()}. */
    int nrTerms;
    /** Dictionary queried with document names (file names or email keys) to obtain document indices. */
    private TermDictionary dictDocs = new TermDictionary();
    /** Dictionary queried with lower-cased tokens to obtain term indices. */
    private TermDictionary dictTerms = new TermDictionary();
    /** One vector per document index; {@code loadString()} includes the index of every accepted term. */
    private BigBooleanVector[] docs2termsBool = null;
    /** One vector per term index; {@code loadString()} includes the index of every document containing the term. */
    private BigBooleanVector[] terms2docsBool = null;
    /** Per document index, the number of accepted tokens counted by {@code loadString()}. */
    private int[] docSizes = null;
    /** Per term index, the sum computed by {@code calcDF()}. */
    private int[] df = null;
    /** Topics built by {@code calcTopics()}. */
    private List<Topic> topics = null;
    /** Merged topics built by {@code mergeTopics()} or {@code mergeTopics_cosineApproach()}. */
    private List<MergedTopic> mergedTopics = null;

    /**
     * Command-line entry point: creates an instance and runs the file-based pipeline.
     *
     * @param strArr command-line arguments; not read
     */
    public static void main(String[] strArr) {
        TermClusteringTest clustering = new TermClusteringTest();
        clustering.go();
    }

    /**
     * Opens a buffered writer on {@code OUTPUT_FILENAME}.
     * If the file cannot be opened the stack trace is printed and
     * {@code output} is left {@code null}.
     */
    public TermClusteringTest() {
        BufferedWriter writer = null;
        try {
            File target = new File(OUTPUT_FILENAME);
            writer = new BufferedWriter(new FileWriter(target));
        } catch (IOException openFailure) {
            openFailure.printStackTrace();
        }
        this.output = writer;
    }

    /**
     * Logging hook called by the print and merge methods of this class.
     * The body is empty, so every message passed in is discarded.
     * NOTE(review): no code in this class writes to {@code output}; presumably
     * logging was switched off on purpose - confirm before re-enabling.
     *
     * @param str the message; ignored
     */
    private void log(String str) {
    }

    /**
     * Flushes the output writer, printing the stack trace if the flush fails.
     * Does nothing when the writer was never opened (the constructor leaves
     * {@code output} as {@code null} on failure), so callers such as
     * {@code mergeTopics()} no longer die with a NullPointerException.
     */
    private void flushlog() {
        if (this.output == null) {
            // The constructor could not open the output file; there is nothing to flush.
            return;
        }
        try {
            this.output.flush();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Runs the file-based pipeline: load the input files, allocate the
     * matrices, load the files a second time (now that the matrices exist,
     * {@code loadString()} fills them), compute and print the topics, then
     * merge them and print the merged topics.
     * The output writer is closed afterwards, also when a step throws, and
     * is skipped when it was never opened.
     */
    private void go() {
        try {
            loadFiles();
            createIntMatrix();
            // Second pass over the same files: the matrices are allocated now.
            loadFiles();
            calcTopics();
            printTopics();
            flushlog();
            mergeTopics();
            printMergedTopics();
        } finally {
            // output is null when the constructor failed to open the file.
            if (this.output != null) {
                try {
                    this.output.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Runs the clustering pipeline on email subjects instead of files.
     * The subjects are loaded twice, once before and once after
     * {@code createIntMatrix()}, so {@code iterable} must support being
     * iterated more than once. Results are available afterwards through
     * {@code topMergedTopics()} and {@code rankedTopics()}.
     *
     * @param accessor source used to look up each email
     * @param iterable keys passed to {@code accessor.getEmail(..)}
     */
    public void goForEmailSet(Accessor accessor, Iterable<String> iterable) {
        // Pass 1: runs before the matrices are (re)allocated.
        this.loadEmailSubjects(accessor, iterable);

        this.createIntMatrix();

        // Pass 2: same input again, now with freshly allocated matrices.
        this.loadEmailSubjects(accessor, iterable);

        this.calcTopics();
        this.mergeTopics();
    }

    /**
     * Looks for terms that can be written as a concatenation of other
     * dictionary terms. For every such term the covering is printed to
     * standard out and the term's document vector is added to the document
     * vector of each covering part.
     * Only the first occurrence of a candidate inside a term is considered,
     * because {@code String.indexOf} is used to locate it.
     */
    private void calcComposita() {
        List<String> allTerms = new ArrayList<String>(this.dictTerms.terms());
        for (String term : allTerms) {
            // Candidate parts of this term, grouped by the offset at which they occur.
            Map<Integer, Set<String>> partsByOffset = new HashMap<Integer, Set<String>>();
            for (String candidate : allTerms) {
                // Compare by value: a term is never a part of itself.
                if (term.equals(candidate)) {
                    continue;
                }
                int offset = term.indexOf(candidate);
                if (offset < 0) {
                    continue;
                }
                Set<String> partsHere = partsByOffset.get(Integer.valueOf(offset));
                if (partsHere == null) {
                    partsHere = new HashSet<String>();
                    partsByOffset.put(Integer.valueOf(offset), partsHere);
                }
                partsHere.add(candidate);
            }
            List<String> covering = compositaCovering(term, partsByOffset, 0, new LinkedList<String>());
            if (covering != null) {
                System.out.println(term + "\t\t--> " + covering);
                BigBooleanVector termDocs = this.terms2docsBool[this.dictTerms.indexOf(term)];
                for (String part : covering) {
                    this.terms2docsBool[this.dictTerms.indexOf(part)].add(termDocs);
                }
            }
        }
    }

    /**
     * Depth-first search for a sequence of parts that covers {@code str}
     * from offset {@code i} to its end.
     *
     * @param str  the term to cover
     * @param map  candidate parts keyed by the offset at which they start
     * @param i    offset from which the term still has to be covered
     * @param list parts chosen so far; it is copied, never modified
     * @return the first complete covering found ({@code list} itself when
     *         {@code i} is already at or past the end), or {@code null}
     *         when no covering exists
     */
    private List<String> compositaCovering(String str, Map<Integer, Set<String>> map, int i, List<String> list) {
        if (i >= str.length()) {
            return list;
        }
        Set<String> candidates = map.get(Integer.valueOf(i));
        if (candidates == null || candidates.isEmpty()) {
            return null;
        }
        for (String part : candidates) {
            if (part.isEmpty()) {
                // A zero-length part would not advance the offset and would recurse forever.
                continue;
            }
            List<String> extended = new LinkedList<String>(list);
            extended.add(part);
            List<String> covering = compositaCovering(str, map, i + part.length(), extended);
            if (covering != null) {
                return covering;
            }
        }
        return null;
    }

    /**
     * Sizes the working structures from the current dictionaries:
     * records the document and term counts and allocates one empty vector
     * per document, one empty vector per term, and the document-size array.
     */
    private void createIntMatrix() {
        this.nrDocs = this.dictDocs.size();
        this.nrTerms = this.dictTerms.size();

        this.docs2termsBool = new BigBooleanVector[this.nrDocs];
        this.terms2docsBool = new BigBooleanVector[this.nrTerms];

        int doc = 0;
        while (doc < this.nrDocs) {
            this.docs2termsBool[doc] = new BigBooleanVector();
            doc++;
        }

        int term = 0;
        while (term < this.nrTerms) {
            this.terms2docsBool[term] = new BigBooleanVector();
            term++;
        }

        this.docSizes = new int[this.nrDocs];
    }

    /**
     * Fills {@code df}: for each term index, the sum of
     * {@code terms2docsBool[term].getInt(doc)} over all document indices.
     * A term whose sum is zero is reported on standard out.
     */
    private void calcDF() {
        this.df = new int[this.nrTerms];
        for (int term = 0; term < this.nrTerms; term++) {
            int sum = 0;
            for (int doc = 0; doc < this.nrDocs; doc++) {
                sum += this.terms2docsBool[term].getInt(doc);
            }
            if (sum == 0) {
                System.out.println("sum == 0.0  <--  t=#" + term + this.dictTerms.get(term));
            }
            this.df[term] = sum;
        }
    }

    /**
     * Renders the positions of the non-zero entries of an array.
     * For example {@code {0, 7, 0, 2}} becomes {@code "[1, 3]"}.
     *
     * @param iArr the array to inspect
     * @return the indices of all non-zero entries, comma separated, in brackets
     */
    private String arrayToString(int[] iArr) {
        StringBuilder text = new StringBuilder("[");
        String separator = "";
        for (int index = 0; index < iArr.length; index++) {
            if (iArr[index] == 0) {
                continue;
            }
            text.append(separator).append(index);
            separator = ", ";
        }
        return text.append("]").toString();
    }

    /**
     * Formats the document-by-term matrix as text and hands it to {@code log}:
     * a header row with the term indices, then one row per document in
     * which zero cells are shown as a dot.
     */
    private void printIntMat() {
        StringBuilder table = new StringBuilder("\t");
        for (int term = 0; term < this.nrTerms; term++) {
            table.append(String.format("%6d", term));
        }
        table.append("\n\n");

        for (int doc = 0; doc < this.nrDocs; doc++) {
            table.append(doc).append("\t");
            for (int term = 0; term < this.nrTerms; term++) {
                int cell = this.docs2termsBool[doc].getInt(term);
                table.append(cell == 0 ? "     ." : String.format("%6d", cell));
            }
            table.append("\n");
        }
        log("\nintmatDocs2Terms:\n" + table);
    }

    /**
     * Formats the {@code df} values of all terms in one row of
     * six-character columns and hands the row to {@code log}.
     */
    private void printDF() {
        StringBuilder row = new StringBuilder("\t");
        int term = 0;
        while (term < this.nrTerms) {
            row.append(String.format("%6d", this.df[term]));
            term++;
        }
        log("\ndf:\n" + row);
    }

    /**
     * Loads the cleaned subject of every given email as one document.
     * Only the subject tokens accepted by the noun predicate for the
     * email's language are kept; they are joined with blanks and passed to
     * {@code loadString}. A document for which {@code loadString} returns
     * false is removed from the document dictionary again. Timing
     * statistics are printed to standard out.
     * NOTE(review): a key whose email, content or cleaned subject is null
     * has already been looked up in the document dictionary and is not
     * removed afterwards - confirm this is intended.
     *
     * @param accessor source used to look up each email
     * @param iterable keys passed to {@code accessor.getEmail(..)}
     */
    private void loadEmailSubjects(Accessor accessor, Iterable<String> iterable) {
        long start = System.currentTimeMillis();
        int loaded = 0;
        for (String emailKey : iterable) {
            int docIndex = this.dictDocs.indexOf(emailKey);
            Email email = accessor.getEmail(emailKey);
            if (email == null || email.getContent() == null || email.getContent().getCleanedSubject() == null) {
                continue;
            }
            String nouns = Joiner.on(" ").join(Iterables.filter(email.getContent().getCleanedSubjectTokens(), E2PUtilities.isNounTagPredicate(email.getContent().getLanguage())));
            if (!loadString(docIndex, nouns)) {
                this.dictDocs.remove(emailKey);
            }
            loaded++;
        }
        long elapsed = System.currentTimeMillis() - start;
        // Guard the average: with no usable email the division would throw ArithmeticException.
        long average = loaded > 0 ? elapsed / loaded : 0L;
        System.out.println("Loaded " + loaded + " email subjects in " + elapsed + " ms (avg " + average + " ms)");
    }

    /**
     * Loads every entry of the {@code INPUT_PATH} directory through
     * {@code loadFile}, printing progress every 100 files. Loading stops
     * early once the running counter reaches {@code MAX_EMAILS}; the entry
     * that reaches the limit is not loaded.
     * When the directory cannot be listed a message is written to standard
     * error and nothing is loaded.
     */
    private void loadFiles() {
        File[] files = new File(INPUT_PATH).listFiles();
        if (files == null) {
            // listFiles() returns null when the path is not a directory or cannot be read.
            System.err.println("Cannot list input directory: " + INPUT_PATH);
            return;
        }
        int i = 0;
        for (File file : files) {
            i++;
            if (i % 100 == 0) {
                System.out.println("" + i + " files loaded");
                System.out.flush();
            }
            if (i >= MAX_EMAILS) {
                System.out.println("**** vorzeitiger abbruch ****");
                return;
            }
            loadFile(file);
        }
    }

    /**
     * Loads one file as a document named after the file.
     * The name is looked up in the document dictionary first, then the
     * file content is read and passed to {@code loadString}; if that
     * returns false the name is removed from the dictionary again.
     * A read failure only prints the stack trace.
     *
     * @param file the file to read
     */
    private void loadFile(File file) {
        final String docName = file.getName();
        try {
            // Look the name up before reading, so the lookup happens even if the read fails.
            int docIndex = this.dictDocs.indexOf(docName);
            String content = FileUtils.readFileToString(file);
            boolean accepted = loadString(docIndex, content);
            if (!accepted) {
                this.dictDocs.remove(docName);
            }
        } catch (IOException readFailure) {
            readFailure.printStackTrace();
        }
    }

    /**
     * Tokenizes a document text and records its terms.
     * Tokens are split on whitespace and punctuation, lower-cased, and
     * ignored when shorter than three characters. Each remaining token is
     * looked up in the term dictionary; when the matrices are allocated the
     * term/document pair is marked in both of them. When {@code docSizes}
     * is allocated the number of accepted tokens is stored for the document.
     *
     * @param i   index of the document
     * @param str text of the document, may be {@code null}
     * @return {@code true} if at least one token was accepted; {@code false}
     *         for a {@code null} text, a text without accepted tokens, or
     *         when any exception occurred (its stack trace is printed)
     */
    private boolean loadString(int i, String str) {
        if (str == null) {
            return false;
        }
        try {
            StringTokenizer tokens = new StringTokenizer(str, " \t\r\n,.-;:_+*#'^°!\"§$%&/()=?´`{[]}\\<>|");
            int accepted = 0;
            while (tokens.hasMoreTokens()) {
                String term = tokens.nextToken().toLowerCase();
                if (term.length() < 3) {
                    continue;
                }
                accepted++;
                int termIndex = this.dictTerms.indexOf(term);
                if (this.terms2docsBool != null) {
                    this.terms2docsBool[termIndex].include(i);
                }
                if (this.docs2termsBool != null) {
                    this.docs2termsBool[i].include(termIndex);
                }
            }
            if (accepted == 0) {
                return false;
            }
            if (this.docSizes != null) {
                this.docSizes[i] = accepted;
            }
            return true;
        } catch (Exception failure) {
            failure.printStackTrace();
            return false;
        }
    }

    /**
     * Builds {@code topics} by greedy clustering of the terms.
     * For each term index a single-term topic is created from the term's
     * document vector; terms whose topic has at most one document are
     * skipped. The topic is added to the existing topic with the smallest
     * {@code calcDistance} if that distance is below 1.0 (the first topic
     * wins on ties); otherwise it starts a new topic.
     */
    private void calcTopics() {
        this.topics = new LinkedList<Topic>();
        for (int term = 0; term < this.nrTerms; term++) {
            if (term % 100 == 0) {
                log("" + term + " / " + this.nrTerms + " terms");
            }
            Topic single = new Topic(this.dictDocs, this.dictTerms, this.terms2docsBool[term], term);
            if (single.getNumberOfDocs() <= 1) {
                continue;
            }

            // Find the closest topic created so far.
            Topic nearest = null;
            double nearestDist = Double.MAX_VALUE;
            for (Topic existing : this.topics) {
                double dist = existing.calcDistance(single);
                if (dist < nearestDist) {
                    nearestDist = dist;
                    nearest = existing;
                }
            }

            if (nearest != null && nearestDist < 1.0d) {
                nearest.addTopic(single);
            } else {
                Topic fresh = new Topic(this.dictDocs, this.dictTerms);
                fresh.addTopic(single);
                this.topics.add(fresh);
            }
        }
    }

    /**
     * Hands a listing of all topics to {@code log}, one topic per line,
     * ordered by descending number of documents. The {@code topics} list
     * itself is not reordered; a copy is sorted.
     */
    private void printTopics() {
        List<Topic> byDocCount = new ArrayList<Topic>(this.topics);
        Collections.sort(byDocCount, new Comparator<Topic>() {
            @Override
            public int compare(Topic left, Topic right) {
                return right.getNumberOfDocs() - left.getNumberOfDocs();
            }
        });
        StringBuilder listing = new StringBuilder();
        for (Topic topic : byDocCount) {
            listing.append(topic.toString()).append("\n");
        }
        log("\ntopics:\n" + listing);
    }

    /**
     * Builds {@code mergedTopics}: every topic is first wrapped in its own
     * merged topic; then, for each maximum distance 1.0, 2.0, ... below the
     * number of documents, {@code mergeTopicsIteration} is repeated until
     * it reports that nothing was merged.
     */
    private void mergeTopics() {
        this.mergedTopics = new ArrayList<MergedTopic>();
        for (Topic topic : this.topics) {
            MergedTopic wrapped = new MergedTopic(this.dictDocs, this.dictTerms);
            wrapped.addTopic(topic);
            this.mergedTopics.add(wrapped);
        }

        for (double maxDist = 1.0d; maxDist < this.nrDocs; maxDist += 1.0d) {
            log("\n\n\n\nmerge run with maxDist = " + maxDist + "\n\n");
            flushlog();
            // Repeat at this distance until an iteration merges nothing.
            while (mergeTopicsIteration(maxDist)) {
            }
        }
    }

    /**
     * Performs one merge pass over all unordered pairs of merged topics.
     * A pair is merged when the first topic has more than {@code d}
     * documents, both together have more than {@code d} documents, their
     * distance lies in (0, d], and the combined document count minus twice
     * the distance is at least 2. The merged topic is kept only if its
     * terms differ from both inputs and it is not already known. After the
     * pass all inputs of accepted merges are removed from
     * {@code mergedTopics} and the new topics are added.
     *
     * @param d maximum distance allowed for a merge in this pass
     * @return {@code true} if at least one merged topic was added
     */
    private boolean mergeTopicsIteration(double d) {
        log("\n\nmergeTopicsIteration()\tcurrently #" + this.mergedTopics.size() + " (merged) topics; maxDist = " + d);
        List<MergedTopic> created = new LinkedList<MergedTopic>();
        List<MergedTopic> consumed = new LinkedList<MergedTopic>();
        // Closest qualifying pair; only used for the log message when nothing was merged.
        double bestDist = Double.MAX_VALUE;
        MergedTopic bestFirst = null;
        MergedTopic bestSecond = null;
        for (MergedTopic first : this.mergedTopics) {
            // Document count of the first topic (the value the undeclared "r0" stood for).
            int firstDocs = first.getNumberOfDocs();
            if (firstDocs > d) {
                // Skip everything up to and including 'first' so each pair is visited once.
                boolean beforeFirst = true;
                for (MergedTopic second : this.mergedTopics) {
                    if (beforeFirst) {
                        if (first == second) {
                            beforeFirst = false;
                        }
                        continue;
                    }
                    int combinedDocs = firstDocs + second.getNumberOfDocs();
                    if (combinedDocs > d) {
                        double dist = first.calcDistance(second);
                        if (dist <= d && dist > 0.0d && combinedDocs - (2.0d * dist) >= 2.0d) {
                            MergedTopic merged = new MergedTopic(this.dictDocs, this.dictTerms);
                            merged.addTopic(first);
                            merged.addTopic(second);
                            if (!merged.terms.equals(first.terms) && !merged.terms.equals(second.terms) && !created.contains(merged) && !this.mergedTopics.contains(merged)) {
                                log("match with  " + dist + "  -> merging the following topics:\n" + first + "\n" + second + "\n");
                                created.add(merged);
                                consumed.add(first);
                                consumed.add(second);
                            }
                            if (dist < bestDist) {
                                bestDist = dist;
                                bestFirst = first;
                                bestSecond = second;
                            }
                        }
                    }
                }
            }
        }
        this.mergedTopics.removeAll(consumed);
        this.mergedTopics.addAll(created);
        if (created.size() > 0) {
            printMergedTopics();
            flushlog();
            return true;
        }
        log("\nmerging stopped - best dist was " + bestDist + " between the following topics:\n" + bestFirst + "\n" + bestSecond + "\n");
        flushlog();
        return false;
    }

    /**
     * Alternative to {@code mergeTopics()}: wraps every topic in its own
     * merged topic and then repeats
     * {@code mergeTopicsIteration_cosineApproach()} until it returns false.
     */
    private void mergeTopics_cosineApproach() {
        List<MergedTopic> wrappedTopics = new ArrayList<MergedTopic>();
        this.mergedTopics = wrappedTopics;
        for (Topic topic : this.topics) {
            MergedTopic wrapped = new MergedTopic(this.dictDocs, this.dictTerms);
            wrapped.addTopic(topic);
            wrappedTopics.add(wrapped);
        }
        while (mergeTopicsIteration_cosineApproach()) {
            // keep iterating until an iteration reports false
        }
    }

    /**
     * Performs one merge step of the cosine-based approach: finds the pair
     * of merged topics with the smallest {@code calcDistance_cosineMeasure}
     * below 1.0 and, if that distance is at most 0.9, replaces the pair by
     * their merge.
     *
     * @return {@code true} if a pair was merged and another iteration may
     *         find more; {@code false} if the best distance exceeds 0.9 or
     *         the merge would equal one of its inputs
     */
    private boolean mergeTopicsIteration_cosineApproach() {
        log("\n\nmergeTopicsIteration_cosineApproach()\tcurrently #" + this.mergedTopics.size() + " (merged) topics");
        double bestDist = 1.0d;
        MergedTopic bestFirst = null;
        MergedTopic bestSecond = null;
        for (MergedTopic first : this.mergedTopics) {
            // Skip everything up to and including 'first' so each pair is visited once.
            boolean beforeFirst = true;
            for (MergedTopic second : this.mergedTopics) {
                if (beforeFirst) {
                    if (first == second) {
                        beforeFirst = false;
                    }
                    continue;
                }
                double dist = first.calcDistance_cosineMeasure(second);
                if (dist < bestDist) {
                    bestDist = dist;
                    bestFirst = first;
                    bestSecond = second;
                }
            }
        }
        if (bestDist > 0.9d) {
            log("\nmerging stopped - best dist was " + bestDist + " between the following topics:\n" + bestFirst + "\n" + bestSecond + "\n");
            flushlog();
            return false;
        }
        log("match with  " + bestDist + "  -> merging the following topics:\n" + bestFirst + "\n" + bestSecond + "\n");
        MergedTopic merged = new MergedTopic(this.dictDocs, this.dictTerms);
        merged.addTopic(bestFirst);
        merged.addTopic(bestSecond);
        if (merged.terms.equals(bestFirst.terms) || merged.terms.equals(bestSecond.terms)) {
            // Nothing is replaced in this case, so the next iteration would pick the
            // same pair again; reporting progress here made the caller loop forever.
            // NOTE(review): stopping is the minimal fix - confirm whether the subsumed
            // topic should be dropped instead.
            return false;
        }
        this.mergedTopics.add(merged);
        this.mergedTopics.remove(bestFirst);
        this.mergedTopics.remove(bestSecond);
        printMergedTopics();
        flushlog();
        return true;
    }

    /**
     * Hands a listing of all merged topics to {@code log}, one per line,
     * ordered by descending number of documents, then flushes the log.
     * A copy is sorted; {@code mergedTopics} keeps its order.
     */
    private void printMergedTopics() {
        List<MergedTopic> byDocCount = new ArrayList<MergedTopic>(this.mergedTopics);
        Collections.sort(byDocCount, new Comparator<MergedTopic>() {
            @Override
            public int compare(MergedTopic left, MergedTopic right) {
                return right.getNumberOfDocs() - left.getNumberOfDocs();
            }
        });
        StringBuilder listing = new StringBuilder();
        for (MergedTopic topic : byDocCount) {
            listing.append(topic.toString()).append("\n");
        }
        log("\n\nmerged topics (" + this.mergedTopics.size() + "):\n" + listing);
        flushlog();
    }

    /**
     * Returns the labels of the merged topics with the most documents.
     *
     * @return the {@code getString()} values of at most five merged
     *         topics, ordered by descending number of documents
     */
    public List<String> topMergedTopics() {
        List<MergedTopic> byDocCount = new ArrayList<MergedTopic>(this.mergedTopics);
        Collections.sort(byDocCount, new Comparator<MergedTopic>() {
            @Override
            public int compare(MergedTopic left, MergedTopic right) {
                return right.getNumberOfDocs() - left.getNumberOfDocs();
            }
        });
        List<String> labels = Lists.newArrayList();
        int limit = Math.min(5, byDocCount.size());
        for (MergedTopic topic : byDocCount.subList(0, limit)) {
            labels.add(topic.getString());
        }
        return labels;
    }

    /**
     * Returns all merged topics as scored objects, ordered by descending
     * number of documents.
     *
     * @return one {@code ScoredObject} per merged topic, constructed with
     *         the topic's number of documents and carrying the payloads
     *         {@code "label"} (the topic's {@code getString()}) and
     *         {@code "rank"} (the 1-based position as a decimal string)
     */
    public List<ScoredObject> rankedTopics() {
        List<MergedTopic> byDocCount = new ArrayList<MergedTopic>(this.mergedTopics);
        Collections.sort(byDocCount, new Comparator<MergedTopic>() {
            @Override
            public int compare(MergedTopic left, MergedTopic right) {
                return right.getNumberOfDocs() - left.getNumberOfDocs();
            }
        });
        List<ScoredObject> ranked = Lists.newArrayList();
        for (int i = 0; i < byDocCount.size(); i++) {
            MergedTopic topic = byDocCount.get(i);
            ScoredObject scored = new ScoredObject(topic.getNumberOfDocs());
            scored.setPayload("label", topic.getString());
            // Add before converting: "" + i + 1 concatenates and yields "01", "11", ...
            scored.setPayload("rank", String.valueOf(i + 1));
            ranked.add(scored);
        }
        return ranked;
    }
}
