package dfki.km.tweekreco.ner.util;

import de.dfki.inquisition.collections.ValueBox;
import de.dfki.inquisition.lucene.DynamicFieldType;
import de.dfki.inquisition.lucene.FieldConfig;
import de.dfki.inquisition.lucene.LuceneUtilz;
import de.dfki.inquisition.text.StringUtils;
import de.dfki.km.leech.lucene.LeechDefaultFieldConfig;
import dfki.km.tweekreco.ner.MultiNamedEntityRecognizer;
import dfki.km.tweekreco.ner.NamedEntityRecognizer;
import dfki.km.tweekreco.ner.NerEntity;
import java.io.File;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Set;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiFields;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.Version;

/* loaded from: input_file:dfki/km/tweekreco/ner/util/LuceneIndexNerPostprocessor.class */
/**
 * Command line tool that runs named entity recognition over one attribute of every live
 * document of a Lucene index and writes each document, together with additional fields
 * holding the recognized entity ids, labels (including synonyms) and types, to a target index.
 *
 * <p>Run without arguments (or with {@code -?}, {@code -h}, {@code --help}) to print the usage text.
 *
 * <p>NOTE(review): when no {@code -o} option is given, the target index is the source index
 * itself. The writer is opened in CREATE_OR_APPEND mode and no delete call is made here, so the
 * processed documents appear to be added next to the existing ones - confirm that this is the
 * intended in-place behavior.
 */
public class LuceneIndexNerPostprocessor {

    private static final Logger LOG = Logger.getLogger(LuceneIndexNerPostprocessor.class.getName());

    /** Number of values that have to follow the {@code -a} option. */
    private static final int ATTRIBUTE_OPTION_VALUE_COUNT = 8;

    /** Target attribute name that, per the usage text, switches the generation of that attribute off. */
    private static final String DISABLED_ATTRIBUTE = "false";

    /** Documents between two progress log entries. */
    private static final int LOG_INTERVAL = 10000;

    /** Last argument handed to the recognizer; its meaning is not visible here (presumably a result limit - TODO confirm). */
    private static final int RECOGNIZER_LIMIT = 1000;

    private static final String USAGE =
            "Adds additional attributes with recognized entities to an lucene index\n"
            + " -i <index>  the source index\n"
            + " [-o <outputIndex>] optional. If specified, a new index will be created, with all data from the source index plus the additional entity data. The source index will be not modified in this case.\n"
            + " [-b <numberOfDocs>] optional. If specified, the process will stop after the specified number of documents. This is for testing purposes.\n"
            + " -a <sourceAttributeName> <targetAttributeName4Ids> <targetAttributeName4Labels> <targetAttributeName4Types> <vocabularyId> <true|false (fuzzy)> <true|false (stopwordRemoval)> <true|false (ignoreAllHighFrqTerms)>\n"
            + " \nExample: LuceneIndexNerPostprocessor -i ~/sourceIndex -o ~/targetIndex -a body conceptIds concepts conceptTypes ger true false false\n"
            + " - If you have spaces in attribute names, vocabularyId or index paths, use %20 instead. (the Strings are not URL encoded, just %20 will be replaced by a space).\n"
            + " - If you set an target attributes name to 'false', it won't be generated, nor the values will be loaded from the underlying NER index. This is for performance reasons.";

    /**
     * Parses the command line, then processes the source index document by document.
     *
     * <p>Invalid or incomplete command lines are reported on {@code System.err} and end the
     * program without touching any index.
     *
     * @param strArr the command line arguments, see the usage text
     * @throws Exception if reading the source index, recognizing entities or writing the target index fails
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length == 0 || isHelpFlag(strArr[0])) {
            System.out.println(USAGE);
            System.out.println();
            return;
        }

        String sourceIndexPath = null;
        String targetIndexPath = null;
        String sourceAttribute = null;
        String idsAttribute = null;
        String labelsAttribute = null;
        String typesAttribute = null;
        String vocabularyId = null;
        boolean fuzzy = false;
        boolean stopwordRemoval = false;
        boolean ignoreAllHighFrqTerms = false;
        int maxDocs = Integer.MAX_VALUE;

        int i = 0;
        while (i < strArr.length) {
            String option = strArr[i];
            if (option.equals("-i")) {
                if (!hasValues(strArr, i, 1)) {
                    return;
                }
                sourceIndexPath = decodeSpaces(strArr[++i]);
            } else if (option.equals("-o")) {
                if (!hasValues(strArr, i, 1)) {
                    return;
                }
                targetIndexPath = decodeSpaces(strArr[++i]);
            } else if (option.equals("-b")) {
                if (!hasValues(strArr, i, 1)) {
                    return;
                }
                try {
                    maxDocs = Integer.parseInt(strArr[++i]);
                } catch (NumberFormatException e) {
                    System.err.println("The value of -b has to be an integer, but was '" + strArr[i] + "'");
                    return;
                }
            } else if (option.equals("-a")) {
                if (!hasValues(strArr, i, ATTRIBUTE_OPTION_VALUE_COUNT)) {
                    return;
                }
                sourceAttribute = decodeSpaces(strArr[i + 1]);
                idsAttribute = decodeSpaces(strArr[i + 2]);
                labelsAttribute = decodeSpaces(strArr[i + 3]);
                typesAttribute = decodeSpaces(strArr[i + 4]);
                vocabularyId = decodeSpaces(strArr[i + 5]);
                fuzzy = Boolean.parseBoolean(strArr[i + 6]);
                stopwordRemoval = Boolean.parseBoolean(strArr[i + 7]);
                ignoreAllHighFrqTerms = Boolean.parseBoolean(strArr[i + 8]);
                i += ATTRIBUTE_OPTION_VALUE_COUNT;
            }
            // Unknown arguments are skipped silently.
            i++;
        }

        if (sourceIndexPath == null) {
            System.err.println("You have to specify the source index");
            return;
        }
        // -a sets all of its values or none, so checking the first one covers the others.
        if (sourceAttribute == null) {
            System.err.println("You have to specify the source attribute");
            return;
        }
        if (targetIndexPath == null) {
            targetIndexPath = sourceIndexPath;
        }

        DirectoryReader reader = DirectoryReader.open(new NIOFSDirectory(new File(sourceIndexPath)));
        try {
            NIOFSDirectory targetDirectory = new NIOFSDirectory(new File(targetIndexPath));
            LeechDefaultFieldConfig fieldConfig = new LeechDefaultFieldConfig();
            ((FieldConfig) fieldConfig).fieldName2FieldType.put(typesAttribute, DynamicFieldType.keywordFieldType);
            IndexWriterConfig writerConfig = new IndexWriterConfig(Version.LUCENE_CURRENT, fieldConfig.createAnalyzer());
            writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
            IndexWriter indexWriter = new IndexWriter(targetDirectory, writerConfig);

            boolean committed = false;
            try {
                // A null Bits instance means the index has no deleted documents.
                Bits liveDocs = MultiFields.getLiveDocs(reader);
                int maxDoc = reader.maxDoc();
                LOG.info("will process ~" + StringUtils.beautifyNumber(Integer.valueOf(maxDoc)) + " documents");

                MultiNamedEntityRecognizer recognizer = new MultiNamedEntityRecognizer();
                recognizer.init();

                // Only request the entity attributes for which a target attribute is generated.
                Set<String> fields2Load = new HashSet<String>();
                if (isEnabled(idsAttribute)) {
                    fields2Load.add(NamedEntityRecognizer.IndexAtts.id);
                }
                if (isEnabled(labelsAttribute)) {
                    fields2Load.add(NamedEntityRecognizer.IndexAtts.labelAsKeyword);
                }
                if (isEnabled(typesAttribute)) {
                    fields2Load.add(NamedEntityRecognizer.IndexAtts.type);
                }

                ValueBox<Long> entityCount = new ValueBox<Long>(0L);
                for (int docIndex = 0; docIndex < maxDoc && docIndex < maxDocs; docIndex++) {
                    if (liveDocs == null || liveDocs.get(docIndex)) {
                        performExctraction(reader.document(docIndex), sourceAttribute, fieldConfig, recognizer, vocabularyId,
                                fuzzy, stopwordRemoval, ignoreAllHighFrqTerms, fields2Load, entityCount,
                                idsAttribute, labelsAttribute, typesAttribute, indexWriter, docIndex);
                    }
                }

                LOG.info("will commit and close");
                indexWriter.commit();
                committed = true;
            } finally {
                if (committed) {
                    indexWriter.close(true);
                } else {
                    // Discard the partial result and release the write lock instead of committing it.
                    indexWriter.rollback();
                }
            }
        } finally {
            reader.close();
        }
        LOG.info("...finished");
    }

    /**
     * Recognizes the entities inside one document, adds them as additional fields and writes the
     * document to the given index writer.
     *
     * <p>All values of the source attribute are joined with spaces and handed to the recognizer.
     * Ids, labels plus synonyms, and types of the recognized entities are added without
     * duplicates, in the order of their first occurrence. A target attribute named
     * {@code "false"} (case-insensitive) is not generated, as stated in the usage text.
     *
     * @param document the document to enrich; it is modified and then added to {@code indexWriter}
     * @param sourceAttribute name of the document attribute holding the text to analyze
     * @param fieldConfig field configuration used to restore the stored field types and to create the new fields
     * @param recognizer the initialized recognizer
     * @param vocabularyId first argument of the recognizer call (the vocabulary id of the command line)
     * @param fuzzy passed through to the recognizer (named after the usage text - TODO confirm against the recognizer API)
     * @param stopwordRemoval passed through to the recognizer (named after the usage text - TODO confirm)
     * @param ignoreAllHighFrqTerms passed through to the recognizer (named after the usage text - TODO confirm)
     * @param fields2Load passed through to the recognizer; {@code main} fills it with the index attributes of the enabled targets
     * @param entityCount running total of recognized entities, increased by this call
     * @param idsAttribute name of the target attribute for the entity ids
     * @param labelsAttribute name of the target attribute for the entity labels and synonyms
     * @param typesAttribute name of the target attribute for the entity types
     * @param indexWriter the writer the enriched document is added to
     * @param docIndex index of the document inside the source index, used for progress logging only
     * @throws Exception if the recognition or the index access fails
     */
    protected static void performExctraction(Document document, String sourceAttribute, FieldConfig fieldConfig, MultiNamedEntityRecognizer recognizer, String vocabularyId, boolean fuzzy, boolean stopwordRemoval, boolean ignoreAllHighFrqTerms, Set<String> fields2Load, ValueBox<Long> entityCount, String idsAttribute, String labelsAttribute, String typesAttribute, IndexWriter indexWriter, int docIndex) throws Exception {
        LuceneUtilz.reInsertStoredFieldTypes(document, fieldConfig);

        StringBuilder text = new StringBuilder();
        for (String value : document.getValues(sourceAttribute)) {
            text.append(value).append(' ');
        }

        NerEntity[] entities = recognizer.recognizeNamedEntities(vocabularyId, text.toString(), fuzzy, stopwordRemoval, ignoreAllHighFrqTerms, null, fields2Load, RECOGNIZER_LIMIT);
        long totalEntities = entityCount.getValue().longValue() + entities.length;
        entityCount.setValue(Long.valueOf(totalEntities));

        boolean wantIds = isEnabled(idsAttribute);
        boolean wantLabels = isEnabled(labelsAttribute);
        boolean wantTypes = isEnabled(typesAttribute);

        // Linked sets drop duplicates but keep the order of first occurrence.
        Set<String> ids = new LinkedHashSet<String>();
        Set<String> labels = new LinkedHashSet<String>();
        Set<String> types = new LinkedHashSet<String>();
        for (NerEntity entity : entities) {
            // Attributes that were not requested from the recognizer may be unset, hence the null checks.
            if (wantIds && entity.id != null) {
                ids.add(entity.id);
            }
            if (wantLabels) {
                if (entity.label != null) {
                    labels.add(entity.label);
                }
                if (entity.synonyms != null) {
                    for (String synonym : entity.synonyms) {
                        if (synonym != null) {
                            labels.add(synonym);
                        }
                    }
                }
            }
            if (wantTypes && entity.types != null) {
                for (String type : entity.types) {
                    if (type != null) {
                        types.add(type);
                    }
                }
            }
        }

        addFields(document, fieldConfig, idsAttribute, ids);
        addFields(document, fieldConfig, labelsAttribute, labels);
        addFields(document, fieldConfig, typesAttribute, types);
        indexWriter.addDocument(document);

        if (docIndex % LOG_INTERVAL == 0) {
            LOG.info("processed " + StringUtils.beautifyNumber(Integer.valueOf(docIndex + 1)) + " documents, extracted in average " + (totalEntities / (docIndex + 1)) + " entities/doc");
        }
    }

    /** Adds one field named {@code attributeName} per value to the document. */
    private static void addFields(Document document, FieldConfig fieldConfig, String attributeName, Set<String> values) throws Exception {
        for (String value : values) {
            document.add(fieldConfig.createField(attributeName, value));
        }
    }

    /** Tells whether the argument is one of the flags that request the usage text. */
    private static boolean isHelpFlag(String argument) {
        return argument.equals("-?") || argument.equals("-h") || argument.equals("--help");
    }

    /** Tells whether a target attribute is to be generated, i.e. is not named {@code "false"}. */
    private static boolean isEnabled(String targetAttribute) {
        return !DISABLED_ATTRIBUTE.equalsIgnoreCase(targetAttribute);
    }

    /** Replaces every {@code %20} by a space; no other URL decoding takes place. */
    private static String decodeSpaces(String argument) {
        return argument.replace("%20", " ");
    }

    /**
     * Checks that {@code count} arguments follow the option at {@code optionIndex} and reports
     * the problem on {@code System.err} otherwise.
     */
    private static boolean hasValues(String[] arguments, int optionIndex, int count) {
        if (optionIndex + count < arguments.length) {
            return true;
        }
        System.err.println("Option " + arguments[optionIndex] + " needs " + count + " value(s)");
        return false;
    }
}
