package de.dfki.sds.lodex.util;

import de.dfki.inquisitor.collections.ValueBox;
import de.dfki.inquisitor.text.StringUtils;
import de.dfki.sds.lodex.MultiNamedEntityLinker;
import de.dfki.sds.lodex.NamedEntityLinker;
import de.dfki.sds.lodex.lucene.TextWithTermVectorOffsetsField;
import java.io.File;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.Set;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StringField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.MultiBits;
import org.apache.lucene.store.NIOFSDirectory;
import org.apache.lucene.util.Bits;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:WEB-INF/lib/lodex-1.4-SNAPSHOT.jar:de/dfki/sds/lodex/util/LuceneIndexNerPostprocessor.class */
/**
 * Command line tool that reads every live document of a Lucene index, runs named entity
 * linking over the values of one source attribute and writes the document, enriched with the
 * ids, labels/synonyms and types of the linked entities, to a target index.
 *
 * <p>Run without arguments (or with {@code -h}) to print the usage text.
 */
public class LuceneIndexNerPostprocessor {

    /** Shared logger; created once instead of being looked up for every log statement. */
    private static final Logger LOG = LoggerFactory.getLogger(LuceneIndexNerPostprocessor.class);

    /** Number of values that have to follow the {@code -a} option. */
    private static final int ATTRIBUTE_OPTION_VALUE_COUNT = 9;

    /** Target attribute name that switches the generation of that attribute off. */
    private static final String DISABLED_ATTRIBUTE = "false";

    /** A progress line is logged whenever the document number is a multiple of this value. */
    private static final int PROGRESS_LOG_INTERVAL = 10000;

    /** Usage text printed for {@code -?}, {@code -h}, {@code --help} or an empty command line. */
    private static final String USAGE =
            "Adds additional attributes with recognized entities to an lucene index\n"
            + " -i <index>  the source index\n"
            + " [-o <outputIndex>] optional. If specified, a new index will be created, with all data from the source index plus the additional entity data.\n"
            + "                              The source index will be not modified in this case.\n"
            + " [-b <numberOfDocs>] optional. If specified, the process will stop after the specified number of documents. This is for testing purposes.\n"
            + " -a <sourceAttributeName> <targetAttributeName4Ids> <targetAttributeName4Labels> <targetAttributeName4Types> <vocabularyId> <true|false (fuzzy)> <none|fallback|only (embeddings)> <true|false (stopwordRemoval)> <true|false (ignoreAllHighFrqTerms)>\n"
            + "\n"
            // The example lists all nine values that -a consumes.
            + " Example: LuceneIndexNerPostprocessor -i ~/sourceIndex -o ~/targetIndex -a body conceptIds conceptLabels conceptTypes ger true none false false\n"
            + " - If you have spaces in attribute names, vocabularyId or index paths, use %20 instead. (the Strings are not URL encoded, just %20 will be replaced by a space).\n"
            + " - If you set a target attributes name to 'false', it won't be generated, nor the values will be loaded from the underlying NER index. This is for performance reasons.";

    /**
     * Parses the command line, then copies the live documents of the source index into the
     * target index, adding the entity attributes to each of them.
     *
     * <p>Invalid or incomplete command lines are reported on {@code System.err} and end the
     * run without touching any index. The target index is only committed after all documents
     * were processed; if processing fails, the writer is closed without committing.
     *
     * <p>NOTE(review): without {@code -o} the writer appends to the source index itself. As
     * documents are added (not updated), this looks like it leaves the original documents in
     * place next to their enriched copies - confirm that this is intended.
     *
     * @param strArr the command line arguments, see the usage text
     * @throws Exception if an index cannot be opened, read or written, or entity linking fails
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length == 0 || strArr[0].equals("-?") || strArr[0].equals("-h") || strArr[0].equals("--help")) {
            System.out.println(USAGE);
            System.out.println();
            return;
        }

        String sourceIndexPath = null;
        String targetIndexPath = null;
        String sourceAttribute = null;
        String idsAttribute = null;
        String labelsAttribute = null;
        String typesAttribute = null;
        String vocabularyId = null;
        boolean fuzzy = false;
        NamedEntityLinker.EmbeddingsMode embeddingsMode = NamedEntityLinker.EmbeddingsMode.none;
        boolean stopwordRemoval = false;
        boolean ignoreAllHighFrqTerms = false;
        int maxDocsToProcess = Integer.MAX_VALUE;

        int argIndex = 0;
        while (argIndex < strArr.length) {
            String option = strArr[argIndex];
            int valueCount = requiredValueCount(option);
            // Guard against a truncated command line instead of running past the array end.
            if (argIndex + valueCount >= strArr.length) {
                System.err.println("Option " + option + " requires " + valueCount + " value(s)");
                return;
            }
            if (option.equals("-i")) {
                sourceIndexPath = decodeSpaces(strArr[argIndex + 1]);
            } else if (option.equals("-o")) {
                targetIndexPath = decodeSpaces(strArr[argIndex + 1]);
            } else if (option.equals("-b")) {
                try {
                    maxDocsToProcess = Integer.parseInt(strArr[argIndex + 1]);
                } catch (NumberFormatException e) {
                    System.err.println("The value of -b has to be a number, but was '" + strArr[argIndex + 1] + "'");
                    return;
                }
            } else if (option.equals("-a")) {
                sourceAttribute = decodeSpaces(strArr[argIndex + 1]);
                idsAttribute = decodeSpaces(strArr[argIndex + 2]);
                labelsAttribute = decodeSpaces(strArr[argIndex + 3]);
                typesAttribute = decodeSpaces(strArr[argIndex + 4]);
                vocabularyId = decodeSpaces(strArr[argIndex + 5]);
                fuzzy = Boolean.parseBoolean(strArr[argIndex + 6]);
                try {
                    embeddingsMode = NamedEntityLinker.EmbeddingsMode.valueOf(strArr[argIndex + 7]);
                } catch (IllegalArgumentException e) {
                    System.err.println("Unknown embeddings mode '" + strArr[argIndex + 7] + "', expected none, fallback or only");
                    return;
                }
                stopwordRemoval = Boolean.parseBoolean(strArr[argIndex + 8]);
                ignoreAllHighFrqTerms = Boolean.parseBoolean(strArr[argIndex + 9]);
            }
            // Unknown tokens have a value count of 0 and are skipped one at a time.
            argIndex += valueCount + 1;
        }

        if (sourceIndexPath == null) {
            System.err.println("You have to specify the source index");
            return;
        }
        // -a sets the source attribute, the three target attributes and the vocabulary id
        // together, so a single check covers all of them.
        if (sourceAttribute == null) {
            System.err.println("You have to specify the source attribute");
            return;
        }
        if (targetIndexPath == null) {
            targetIndexPath = sourceIndexPath;
        }

        IndexWriterConfig writerConfig = new IndexWriterConfig(NamedEntityLinker.indexAnalyzer);
        writerConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE_OR_APPEND);
        // Only the explicit commit below makes changes durable; closing the writer after a
        // failure must not publish a half-processed index.
        writerConfig.setCommitOnClose(false);

        try (NIOFSDirectory sourceDirectory = new NIOFSDirectory(new File(sourceIndexPath).toPath());
                DirectoryReader reader = DirectoryReader.open(sourceDirectory);
                NIOFSDirectory targetDirectory = new NIOFSDirectory(new File(targetIndexPath).toPath());
                IndexWriter writer = new IndexWriter(targetDirectory, writerConfig)) {

            // liveDocs is null when the index has no deletions.
            Bits liveDocs = MultiBits.getLiveDocs(reader);
            int maxDoc = reader.maxDoc();
            LOG.info("will process ~{} documents", StringUtils.beautifyNumber(Integer.valueOf(maxDoc)));

            MultiNamedEntityLinker linker = new MultiNamedEntityLinker();
            linker.init();

            // Names of the NER index attributes that are handed to the linker; attributes
            // whose target name is 'false' are left out.
            Set<String> attributesToLoad = new HashSet<>();
            if (isEnabled(idsAttribute)) {
                attributesToLoad.add(NamedEntityLinker.IndexAtts.id);
            }
            if (isEnabled(labelsAttribute)) {
                attributesToLoad.add(NamedEntityLinker.IndexAtts.labelAsKeyword);
            }
            if (isEnabled(typesAttribute)) {
                attributesToLoad.add("type");
            }

            ValueBox<Long> entityCounter = new ValueBox<>(0L);
            int limit = Math.min(maxDoc, maxDocsToProcess);
            for (int docId = 0; docId < limit; docId++) {
                if (liveDocs != null && !liveDocs.get(docId)) {
                    continue; // deleted document
                }
                performExctraction(reader.document(docId), sourceAttribute, linker, vocabularyId, fuzzy, embeddingsMode,
                        stopwordRemoval, ignoreAllHighFrqTerms, attributesToLoad, entityCounter, idsAttribute,
                        labelsAttribute, typesAttribute, writer, docId);
            }

            LOG.info("will commit and close");
            writer.commit();
        }
        LOG.info("...finished");
    }

    /**
     * Links the named entities found in the source attribute of one document, adds them as
     * additional fields and appends the document to the target index.
     *
     * <p>All values of the source attribute are joined with blanks into one text that is
     * passed to the linker. Ids, labels plus synonyms, and types of the returned entities are
     * de-duplicated (keeping first-seen order) and added as stored fields. A target attribute
     * named {@code false} (case-insensitive) is not generated at all.
     *
     * <p>NOTE(review): the document passed in is normally one loaded from an index and thus
     * presumably carries stored fields only - verify that re-adding it keeps everything the
     * target index needs.
     *
     * @param document               the document to enrich; it is modified and then written
     * @param str                    name of the source attribute whose values are analyzed
     * @param multiNamedEntityLinker the linker used to find the entities
     * @param str2                   the vocabulary id handed to the linker
     * @param z                      fuzzy flag handed to the linker
     * @param embeddingsMode         embeddings mode, handed to the linker as string
     * @param z2                     stopword removal flag handed to the linker
     * @param z3                     ignore-all-high-frequency-terms flag handed to the linker
     * @param set                    names of the NER index attributes handed to the linker
     * @param valueBox               running total of linked entities, increased by this call
     * @param str3                   target attribute for the entity ids, or {@code false}
     * @param str4                   target attribute for the entity labels and synonyms, or {@code false}
     * @param str5                   target attribute for the entity types, or {@code false}
     * @param indexWriter            writer of the target index
     * @param i                      number of the document in the source index, used for progress logging
     * @throws Exception if entity linking or writing the document fails
     */
    protected static void performExctraction(Document document, String str, MultiNamedEntityLinker multiNamedEntityLinker, String str2, boolean z, NamedEntityLinker.EmbeddingsMode embeddingsMode, boolean z2, boolean z3, Set<String> set, ValueBox<Long> valueBox, String str3, String str4, String str5, IndexWriter indexWriter, int i) throws Exception {
        StringBuilder text = new StringBuilder();
        for (String value : document.getValues(str)) {
            text.append(value).append(' ');
        }

        MultiNamedEntityLinker.EntityExplained[] entities = multiNamedEntityLinker.linkNamedEntities(str2, text.toString(), z, embeddingsMode.toString(), null, z2, z3, true, 1, 7, 50, null, set, 1000);
        valueBox.setValue(Long.valueOf(valueBox.getValue().longValue() + entities.length));

        boolean wantIds = isEnabled(str3);
        boolean wantLabels = isEnabled(str4);
        boolean wantTypes = isEnabled(str5);

        // Linked sets drop duplicates while keeping the order in which values were found.
        Set<String> ids = new LinkedHashSet<>();
        Set<String> labels = new LinkedHashSet<>();
        Set<String> types = new LinkedHashSet<>();
        for (MultiNamedEntityLinker.EntityExplained entity : entities) {
            // Values of attributes that were not requested from the NER index are presumably
            // absent (null), hence the null checks here and in addAllNonNull.
            if (wantIds && entity.id != null) {
                ids.add(entity.id);
            }
            if (wantLabels) {
                if (entity.label != null) {
                    labels.add(entity.label);
                }
                addAllNonNull(labels, entity.synonyms);
            }
            if (wantTypes) {
                addAllNonNull(types, entity.types);
            }
        }

        for (String id : ids) {
            document.add(new StringField(str3, id, Field.Store.YES));
        }
        for (String label : labels) {
            document.add(new TextWithTermVectorOffsetsField(str4, label, Field.Store.YES));
        }
        for (String type : types) {
            document.add(new StringField(str5, type, Field.Store.YES));
        }
        indexWriter.addDocument(document);

        if (i % PROGRESS_LOG_INTERVAL == 0) {
            LOG.info("processed {} documents, extracted in average {} entities/doc",
                    StringUtils.beautifyNumber(Integer.valueOf(i + 1)),
                    Long.valueOf(valueBox.getValue().longValue() / (i + 1)));
        }
    }

    /** Returns how many values have to follow the given command line option (0 if unknown). */
    private static int requiredValueCount(String option) {
        if (option.equals("-i") || option.equals("-o") || option.equals("-b")) {
            return 1;
        }
        if (option.equals("-a")) {
            return ATTRIBUTE_OPTION_VALUE_COUNT;
        }
        return 0;
    }

    /** Replaces every {@code %20} in a command line value by a blank. */
    private static String decodeSpaces(String value) {
        return value.replace("%20", " ");
    }

    /** Tells whether a target attribute should be generated, i.e. is not named 'false'. */
    private static boolean isEnabled(String targetAttribute) {
        return !DISABLED_ATTRIBUTE.equalsIgnoreCase(targetAttribute);
    }

    /** Adds all non-null elements of a possibly null array to the target set. */
    private static void addAllNonNull(Set<String> target, String[] values) {
        if (values == null) {
            return;
        }
        for (String value : values) {
            if (value != null) {
                target.add(value);
            }
        }
    }
}
