package de.dfki.km.leech.lucene;

import de.dfki.inquisition.collections.MultiValueHashMap;
import de.dfki.inquisition.file.FileUtils;
import de.dfki.inquisition.lucene.FieldConfig;
import de.dfki.km.leech.metadata.LeechMetadata;
import de.dfki.km.leech.parser.incremental.IncrementalCrawlingHistory;
import de.dfki.km.leech.sax.DataSinkContentHandler;
import java.io.File;
import java.io.IOException;
import java.rmi.server.UID;
import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.CyclicBarrier;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.Term;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.LockObtainFailedException;
import org.apache.lucene.store.SimpleFSDirectory;
import org.apache.lucene.util.Version;
import org.apache.tika.metadata.Metadata;

/* loaded from: input_file:de/dfki/km/leech/lucene/ToLuceneContentHandler.class */
public class ToLuceneContentHandler extends DataSinkContentHandler {
    /** Bounded hand-over queue: the process* methods put document batches here, the DocConsumer threads take them. */
    protected final BlockingQueue<List<Document>> m_addDocsQueue;
    /** When true, child documents are buffered and queued together with their parent as one batch (see processNewData). */
    protected boolean m_bBlockIndexing;
    /** Barrier created in ensureConsumerThreadsRunning for all consumer threads plus one extra party (the caller of crawlFinished). */
    protected CyclicBarrier m_cyclicBarrier4DocConsumerThreads;
    /** Factory used to create the Lucene fields of every document. */
    protected FieldConfig m_fieldConfig;
    /** Names of fields that are skipped when a document is filled (exposed via getFields2Ignore). */
    protected HashSet<String> m_hsAttNamesNot2Store;
    /** Field name to regular expression; not initialised by the constructors, so it is null until setIgnoreAllDocsWithout is called. */
    protected Map<String, String> m_hsFieldName2FieldValueConstraint;
    /** Copy map: source field name to the target field names that receive the same values. */
    protected MultiValueHashMap<String, String> m_hsSource2TargetFieldnames;
    /** Static attribute/value pairs that are added to every created document. */
    protected MultiValueHashMap<String, String> m_hsStaticAttValuePairs;
    /** Aggregation map: target field name to the source field names it can be filled from. */
    protected MultiValueHashMap<String, String> m_hsTarget2SourcesFieldnames;
    /** Absolute paths of the temporary indices created by getCurrentWriter; merged and deleted in crawlFinished. */
    protected HashSet<String> m_hsTmpLuceneWriterPaths2Merge;
    /** The writer handed to the constructor; target of updates, deletes and the final merge. */
    protected IndexWriter m_initialLuceneWriter;
    /** Document count at which getCurrentWriter switches to a new temporary index; values <= 0 disable splitting. */
    protected int m_iSplitIndexDocumentCount;
    /** Consumer threads started by ensureConsumerThreadsRunning; cleared in crawlFinished. */
    protected LinkedList<Thread> m_llConsumerThreads;
    /** Temporary writers that were replaced by a newer one and still have to be closed in crawlFinished. */
    protected LinkedList<IndexWriter> m_llIndexWriter2Close;
    /** Child documents buffered until their parent document arrives (block indexing only). */
    protected LinkedList<Document> m_llLastChildDocuments;
    /** The writer currently in use: the initial writer or the latest temporary one. */
    protected IndexWriter m_luceneWriter;

    /* JADX INFO: Access modifiers changed from: protected */
    /* loaded from: input_file:de/dfki/km/leech/lucene/ToLuceneContentHandler$DocConsumer.class */
    /**
     * Worker that takes document batches from {@code m_addDocsQueue} and writes them to the
     * writer returned by {@code getCurrentWriter()}.
     *
     * <p>The worker runs until it takes an {@link InterruptThreadList} from the queue, is
     * interrupted, or hits an unexpected exception. On exit - and only on exit - it arrives once
     * at {@code m_cyclicBarrier4DocConsumerThreads}, which is how {@code crawlFinished()} learns
     * that all workers are done. Awaiting the barrier after every batch would park each worker
     * after its first batch until the crawl ends.
     */
    public class DocConsumer implements Runnable {
        protected DocConsumer() {
        }

        /**
         * Drains the queue until the stop marker arrives, then signals the barrier exactly once.
         */
        @Override // java.lang.Runnable
        public void run() {
            boolean interrupted = false;
            try {
                while (true) {
                    List<Document> batch = ToLuceneContentHandler.this.m_addDocsQueue.take();
                    if (batch instanceof InterruptThreadList) {
                        // Stop marker queued by crawlFinished().
                        return;
                    }
                    writeBatch(batch);
                }
            } catch (InterruptedException e) {
                // Interruption is treated as a request to stop; the flag is restored below,
                // after the barrier, so that await() itself is not aborted by it.
                interrupted = true;
            } catch (Exception e) {
                Logger.getLogger(DocConsumer.class.getName()).log(Level.SEVERE, "Error", (Throwable) e);
            } finally {
                try {
                    ToLuceneContentHandler.this.m_cyclicBarrier4DocConsumerThreads.await();
                } catch (Exception e) {
                    Logger.getLogger(DocConsumer.class.getName()).log(Level.SEVERE, "Error", (Throwable) e);
                }
                if (interrupted) {
                    Thread.currentThread().interrupt();
                }
            }
        }

        /**
         * Writes one batch: a single document via addDocument, several via addDocuments, an empty
         * batch not at all. Failures are logged (with their cause) and otherwise ignored so that
         * one bad document does not stop the worker.
         */
        private void writeBatch(List<Document> batch) {
            try {
                if (batch.size() == 1) {
                    ToLuceneContentHandler.this.getCurrentWriter().addDocument(batch.get(0));
                } else if (batch.size() > 1) {
                    ToLuceneContentHandler.this.getCurrentWriter().addDocuments(batch);
                }
            } catch (Exception e) {
                Logger.getLogger(DocConsumer.class.getName()).log(Level.WARNING, "Error during writing a document to the index (lucene exception while addDocument) - will ignore it. This is a hint to a lucene bug." + batch, (Throwable) e);
            }
        }
    }

    /* loaded from: input_file:de/dfki/km/leech/lucene/ToLuceneContentHandler$InterruptThreadList.class */
    /**
     * Marker batch: a {@link DocConsumer} that takes an instance of this type from the queue
     * stops working. It carries no documents of its own.
     */
    protected class InterruptThreadList extends LinkedList<Document> {
        private static final long serialVersionUID = 196832081918659203L;

        /** Creates an empty marker list. */
        protected InterruptThreadList() {
            super();
        }
    }

    /**
     * Creates a handler that indexes into the given writer, using the default superclass
     * constructor.
     *
     * @param fieldConfig factory used to create the Lucene fields
     * @param indexWriter writer that receives the documents; also the target of the final merge
     * @throws Exception declared for callers; nothing in this constructor throws it directly
     */
    public ToLuceneContentHandler(FieldConfig fieldConfig, IndexWriter indexWriter) throws Exception {
        // Typed (diamond) instead of a raw LinkedBlockingQueue; capacity stays at 23 batches.
        m_addDocsQueue = new LinkedBlockingQueue<>(23);
        m_bBlockIndexing = true;
        m_hsAttNamesNot2Store = new HashSet<>();
        m_hsSource2TargetFieldnames = new MultiValueHashMap<>();
        m_hsStaticAttValuePairs = new MultiValueHashMap<>();
        m_hsTarget2SourcesFieldnames = new MultiValueHashMap<>();
        m_hsTmpLuceneWriterPaths2Merge = new HashSet<>();
        m_iSplitIndexDocumentCount = -1;
        m_llConsumerThreads = new LinkedList<>();
        m_llIndexWriter2Close = new LinkedList<>();
        m_llLastChildDocuments = new LinkedList<>();
        // Assigned once; no throw-away default FieldConfig is created first.
        m_fieldConfig = fieldConfig;
        m_luceneWriter = indexWriter;
        m_initialLuceneWriter = indexWriter;
        init();
    }

    /**
     * Creates a handler that indexes into the given writer.
     *
     * @param i           forwarded unchanged to the {@code DataSinkContentHandler(int)} constructor
     * @param fieldConfig factory used to create the Lucene fields
     * @param indexWriter writer that receives the documents; also the target of the final merge
     * @throws Exception declared for callers; nothing in this constructor throws it directly
     */
    public ToLuceneContentHandler(int i, FieldConfig fieldConfig, IndexWriter indexWriter) throws Exception {
        super(i);
        // Typed (diamond) instead of a raw LinkedBlockingQueue; capacity stays at 23 batches.
        m_addDocsQueue = new LinkedBlockingQueue<>(23);
        m_bBlockIndexing = true;
        m_hsAttNamesNot2Store = new HashSet<>();
        m_hsSource2TargetFieldnames = new MultiValueHashMap<>();
        m_hsStaticAttValuePairs = new MultiValueHashMap<>();
        m_hsTarget2SourcesFieldnames = new MultiValueHashMap<>();
        m_hsTmpLuceneWriterPaths2Merge = new HashSet<>();
        m_iSplitIndexDocumentCount = -1;
        m_llConsumerThreads = new LinkedList<>();
        m_llIndexWriter2Close = new LinkedList<>();
        m_llLastChildDocuments = new LinkedList<>();
        // Assigned once; no throw-away default FieldConfig is created first.
        m_fieldConfig = fieldConfig;
        m_luceneWriter = indexWriter;
        m_initialLuceneWriter = indexWriter;
        init();
    }

    /**
     * Creates a handler that indexes into the given writer.
     *
     * @param metadata    forwarded unchanged to the {@code DataSinkContentHandler(Metadata)} constructor
     * @param fieldConfig factory used to create the Lucene fields
     * @param indexWriter writer that receives the documents; also the target of the final merge
     * @throws Exception declared for callers; nothing in this constructor throws it directly
     */
    public ToLuceneContentHandler(Metadata metadata, FieldConfig fieldConfig, IndexWriter indexWriter) throws Exception {
        super(metadata);
        // Typed (diamond) instead of a raw LinkedBlockingQueue; capacity stays at 23 batches.
        m_addDocsQueue = new LinkedBlockingQueue<>(23);
        m_bBlockIndexing = true;
        m_hsAttNamesNot2Store = new HashSet<>();
        m_hsSource2TargetFieldnames = new MultiValueHashMap<>();
        m_hsStaticAttValuePairs = new MultiValueHashMap<>();
        m_hsTarget2SourcesFieldnames = new MultiValueHashMap<>();
        m_hsTmpLuceneWriterPaths2Merge = new HashSet<>();
        m_iSplitIndexDocumentCount = -1;
        m_llConsumerThreads = new LinkedList<>();
        m_llIndexWriter2Close = new LinkedList<>();
        m_llLastChildDocuments = new LinkedList<>();
        // Assigned once; no throw-away default FieldConfig is created first.
        m_fieldConfig = fieldConfig;
        m_luceneWriter = indexWriter;
        m_initialLuceneWriter = indexWriter;
        init();
    }

    /**
     * Creates a handler that indexes into the given writer.
     *
     * @param metadata    forwarded unchanged to the {@code DataSinkContentHandler(Metadata, int)} constructor
     * @param i           forwarded unchanged to the {@code DataSinkContentHandler(Metadata, int)} constructor
     * @param fieldConfig factory used to create the Lucene fields
     * @param indexWriter writer that receives the documents; also the target of the final merge
     * @throws Exception declared for callers; nothing in this constructor throws it directly
     */
    public ToLuceneContentHandler(Metadata metadata, int i, FieldConfig fieldConfig, IndexWriter indexWriter) throws Exception {
        super(metadata, i);
        // Typed (diamond) instead of a raw LinkedBlockingQueue; capacity stays at 23 batches.
        m_addDocsQueue = new LinkedBlockingQueue<>(23);
        m_bBlockIndexing = true;
        m_hsAttNamesNot2Store = new HashSet<>();
        m_hsSource2TargetFieldnames = new MultiValueHashMap<>();
        m_hsStaticAttValuePairs = new MultiValueHashMap<>();
        m_hsTarget2SourcesFieldnames = new MultiValueHashMap<>();
        m_hsTmpLuceneWriterPaths2Merge = new HashSet<>();
        m_iSplitIndexDocumentCount = -1;
        m_llConsumerThreads = new LinkedList<>();
        m_llIndexWriter2Close = new LinkedList<>();
        m_llLastChildDocuments = new LinkedList<>();
        // Assigned once; no throw-away default FieldConfig is created first.
        m_fieldConfig = fieldConfig;
        m_luceneWriter = indexWriter;
        m_initialLuceneWriter = indexWriter;
        init();
    }

    /**
     * Finishes indexing: stops the consumer threads, waits for them, and - if index splitting is
     * enabled - merges all temporary indices into the initial writer and deletes them.
     *
     * <p>Every failure is logged and ends the method; nothing is thrown to the caller. If the
     * calling thread is interrupted while waiting, its interrupt status is restored.
     */
    @Override // de.dfki.km.leech.sax.DataSinkContentHandler
    public void crawlFinished() {
        try {
            // Without running consumers nobody would ever arrive at the barrier, so waiting on it
            // (e.g. when this method is called a second time) would block forever.
            if (!m_llConsumerThreads.isEmpty()) {
                // One stop marker per consumer thread.
                for (int i = 0; i < m_llConsumerThreads.size(); i++) {
                    m_addDocsQueue.put(new InterruptThreadList());
                }
                m_cyclicBarrier4DocConsumerThreads.await();
                m_llConsumerThreads.clear();
            }
            if (getSplitAndMergeIndex() <= 0) {
                return;
            }
            // Close all temporary writers; the newest one is not in the close list yet.
            if (m_luceneWriter != m_initialLuceneWriter) {
                for (IndexWriter replacedWriter : m_llIndexWriter2Close) {
                    replacedWriter.close();
                }
                m_luceneWriter.close();
            }
            List<Directory> tmpDirectories = new LinkedList<>();
            try {
                for (String tmpPath : m_hsTmpLuceneWriterPaths2Merge) {
                    tmpDirectories.add(new SimpleFSDirectory(new File(tmpPath)));
                }
                if (tmpDirectories.isEmpty()) {
                    return;
                }
                Logger.getLogger(ToLuceneContentHandler.class.getName()).info("Will merge " + tmpDirectories.size() + " temporary indices to the final one.");
                m_initialLuceneWriter.addIndexes(tmpDirectories.toArray(new Directory[0]));
                m_initialLuceneWriter.commit();
            } finally {
                // Release the directories before their files are deleted below.
                for (Directory tmpDirectory : tmpDirectories) {
                    try {
                        tmpDirectory.close();
                    } catch (IOException e) {
                        Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.WARNING, "Error", (Throwable) e);
                    }
                }
            }
            for (String tmpPath : m_hsTmpLuceneWriterPaths2Merge) {
                FileUtils.deleteDirectory(new File(tmpPath));
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", (Throwable) e);
        } catch (Exception e) {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", (Throwable) e);
        }
    }

    /**
     * @return {@code true} if child documents are buffered and queued together with their parent
     *         as one batch, {@code false} if every document is queued on its own
     */
    public boolean getBlockIndexing() {
        return m_bBlockIndexing;
    }

    /**
     * @return the aggregation map (target field name to source field names); the internal
     *         instance is returned, not a copy
     */
    public MultiValueHashMap<String, String> getFieldAggregationMap() {
        return m_hsTarget2SourcesFieldnames;
    }

    /**
     * @return the field configuration used to create the Lucene fields
     */
    public FieldConfig getFieldConfig() {
        return m_fieldConfig;
    }

    /**
     * @return the copy map (source field name to target field names that receive the same
     *         values); the internal instance is returned, not a copy
     */
    public MultiValueHashMap<String, String> getFieldCopyMap() {
        return m_hsSource2TargetFieldnames;
    }

    /**
     * @return the names of the fields that are skipped when a document is filled; the internal
     *         instance is returned, not a copy
     */
    public HashSet<String> getFields2Ignore() {
        return m_hsAttNamesNot2Store;
    }

    /**
     * @return the field-name-to-regular-expression constraints, or {@code null} if none were set
     */
    public Map<String, String> getIgnoreAllDocsWithout() {
        return m_hsFieldName2FieldValueConstraint;
    }

    /**
     * @return the document count at which a new temporary index is started; values &lt;= 0 mean
     *         that the index is not split
     */
    public int getSplitAndMergeIndex() {
        return m_iSplitIndexDocumentCount;
    }

    /**
     * @return the static attribute/value pairs that are added to every created document; the
     *         internal instance is returned, not a copy
     */
    public MultiValueHashMap<String, String> getStaticAttributeValuePairs() {
        return m_hsStaticAttValuePairs;
    }

    /**
     * No-op implementation of this DataSinkContentHandler hook: the index is not touched.
     *
     * @param metadata ignored
     */
    @Override // de.dfki.km.leech.sax.DataSinkContentHandler
    public void processErrorData(Metadata metadata) {
        // Intentionally empty.
    }

    /**
     * Replaces the indexed document of a modified data entity. The new document is built from
     * the given metadata and body text and written directly with the initial writer, keyed by the
     * entity's {@code IncrementalCrawlingHistory.dataEntityExistsID} value. Nothing is written if
     * no document is produced. Errors are logged, not thrown.
     *
     * @param metadata metadata of the modified entity
     * @param str      body text of the modified entity
     */
    @Override // de.dfki.km.leech.sax.DataSinkContentHandler
    public void processModifiedData(Metadata metadata, String str) {
        try {
            Document updatedDoc = createAndFillLuceneDocument(metadata, str);
            if (updatedDoc != null) {
                String entityId = metadata.get(IncrementalCrawlingHistory.dataEntityExistsID);
                Term entityTerm = new Term(IncrementalCrawlingHistory.dataEntityExistsID, entityId);
                m_initialLuceneWriter.updateDocument(entityTerm, updatedDoc);
            }
        } catch (Exception e) {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error during writing into the index", (Throwable) e);
        }
    }

    /**
     * Builds a Lucene document from the given metadata and body text and queues it for the
     * consumer threads.
     *
     * <p>Without block indexing every document is queued on its own. With block indexing:
     * <ul>
     * <li>a document with a {@code LeechMetadata.parentId} value is a child and is only buffered;</li>
     * <li>a document with a {@code LeechMetadata.childId} value is a parent and is queued together
     * with the buffered children as one batch;</li>
     * <li>any other document is queued on its own, after the buffered children (which then have
     * no parent) were queued one by one.</li>
     * </ul>
     * The child buffer is emptied whenever its content has been queued. Errors are logged, not
     * thrown; an interrupt restores the thread's interrupt status.
     *
     * @param metadata metadata of the new entity
     * @param str      body text of the new entity
     */
    @Override // de.dfki.km.leech.sax.DataSinkContentHandler
    public void processNewData(Metadata metadata, String str) {
        try {
            if (m_initialLuceneWriter == null) {
                throw new IllegalStateException("Lucene writer was not specified");
            }
            m_luceneWriter = getCurrentWriter();
            ensureConsumerThreadsRunning();
            Document newDoc = createAndFillLuceneDocument(metadata, str);
            if (newDoc == null) {
                return;
            }
            if (!getBlockIndexing()) {
                m_addDocsQueue.put(Collections.singletonList(newDoc));
            } else if (metadata.get(LeechMetadata.parentId) != null) {
                m_llLastChildDocuments.add(newDoc);
            } else if (metadata.get(LeechMetadata.childId) != null) {
                m_llLastChildDocuments.add(newDoc);
                m_addDocsQueue.put(new LinkedList<Document>(m_llLastChildDocuments));
                m_llLastChildDocuments.clear();
            } else {
                for (Document orphanedChild : m_llLastChildDocuments) {
                    m_addDocsQueue.put(Collections.singletonList(orphanedChild));
                }
                // Without this the same children would be queued again with every following
                // stand-alone document and would also end up in the next parent's batch.
                m_llLastChildDocuments.clear();
                m_addDocsQueue.put(Collections.singletonList(newDoc));
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", (Throwable) e);
        } catch (Exception e) {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", (Throwable) e);
        }
    }

    /**
     * Queues an already built Lucene document for the consumer threads. A {@code null} document
     * is ignored.
     *
     * <p>Without block indexing the document is queued on its own. With block indexing:
     * <ul>
     * <li>a document with a {@code LeechMetadata.parentId} field is a child and is only buffered;</li>
     * <li>a document with a {@code LeechMetadata.childId} field is a parent and is queued together
     * with the buffered children as one batch;</li>
     * <li>any other document is queued on its own, after the buffered children (which then have
     * no parent) were queued one by one.</li>
     * </ul>
     * The child buffer is emptied whenever its content has been queued. Errors are logged, not
     * thrown; an interrupt restores the thread's interrupt status.
     *
     * @param document the document to index, may be {@code null}
     */
    public void processNewDocument(Document document) {
        try {
            if (m_initialLuceneWriter == null) {
                throw new IllegalStateException("Lucene writer was not specified");
            }
            m_luceneWriter = getCurrentWriter();
            ensureConsumerThreadsRunning();
            if (document == null) {
                return;
            }
            if (!getBlockIndexing()) {
                m_addDocsQueue.put(Collections.singletonList(document));
            } else if (document.get(LeechMetadata.parentId) != null) {
                m_llLastChildDocuments.add(document);
            } else if (document.get(LeechMetadata.childId) != null) {
                m_llLastChildDocuments.add(document);
                m_addDocsQueue.put(new LinkedList<Document>(m_llLastChildDocuments));
                m_llLastChildDocuments.clear();
            } else {
                for (Document orphanedChild : m_llLastChildDocuments) {
                    m_addDocsQueue.put(Collections.singletonList(orphanedChild));
                }
                // Without this the same children would be queued again with every following
                // stand-alone document and would also end up in the next parent's batch.
                m_llLastChildDocuments.clear();
                m_addDocsQueue.put(Collections.singletonList(document));
            }
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", (Throwable) e);
        } catch (Exception e) {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error", (Throwable) e);
        }
    }

    /**
     * No-op implementation of this DataSinkContentHandler hook: the index is not touched.
     *
     * @param metadata ignored
     */
    @Override // de.dfki.km.leech.sax.DataSinkContentHandler
    public void processProcessedData(Metadata metadata) {
        // Intentionally empty.
    }

    /**
     * Deletes the indexed document(s) of a removed data entity from the initial writer, matched
     * by the entity's {@code IncrementalCrawlingHistory.dataEntityExistsID} value. Errors are
     * logged, not thrown.
     *
     * @param metadata metadata of the removed entity
     */
    @Override // de.dfki.km.leech.sax.DataSinkContentHandler
    public void processRemovedData(Metadata metadata) {
        try {
            String entityId = metadata.get(IncrementalCrawlingHistory.dataEntityExistsID);
            Term[] entityTerms = {new Term(IncrementalCrawlingHistory.dataEntityExistsID, entityId)};
            m_initialLuceneWriter.deleteDocuments(entityTerms);
        } catch (Exception e) {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).log(Level.SEVERE, "Error during writing into the index", (Throwable) e);
        }
    }

    /**
     * No-op implementation of this DataSinkContentHandler hook: the index is not touched.
     *
     * @param metadata ignored
     */
    @Override // de.dfki.km.leech.sax.DataSinkContentHandler
    public void processUnmodifiedData(Metadata metadata) {
        // Intentionally empty.
    }

    /**
     * Switches block indexing on or off.
     *
     * @param z {@code true} to queue child documents together with their parent as one batch,
     *          {@code false} to queue every document on its own
     */
    public void setBlockIndexing(boolean z) {
        m_bBlockIndexing = z;
    }

    /**
     * Replaces the aggregation map (target field name to source field names).
     *
     * @param multiValueHashMap the new map; stored as is, not copied
     */
    public void setFieldAggregationMap(MultiValueHashMap<String, String> multiValueHashMap) {
        m_hsTarget2SourcesFieldnames = multiValueHashMap;
    }

    /**
     * Replaces the copy map (source field name to target field names).
     *
     * @param multiValueHashMap the new map; stored as is, not copied
     */
    public void setFieldCopyMap(MultiValueHashMap<String, String> multiValueHashMap) {
        m_hsSource2TargetFieldnames = multiValueHashMap;
    }

    /**
     * Replaces the set of field names that are skipped when a document is filled.
     *
     * @param hashSet the new set; stored as is, not copied
     */
    public void setFieldNames2Ignore(HashSet<String> hashSet) {
        m_hsAttNamesNot2Store = hashSet;
    }

    /**
     * Sets the field-name-to-regular-expression constraints a document has to satisfy to be
     * indexed.
     *
     * @param map the constraints; {@code null} or an empty map disables the check
     * @return this handler, for call chaining
     */
    public ToLuceneContentHandler setIgnoreAllDocsWithout(Map<String, String> map) {
        m_hsFieldName2FieldValueConstraint = map;
        return this;
    }

    /**
     * Sets the document count at which a new temporary index is started.
     *
     * @param i the threshold; values &lt;= 0 disable splitting
     * @return this handler, for call chaining
     */
    public ToLuceneContentHandler setSplitAndMergeIndex(int i) {
        m_iSplitIndexDocumentCount = i;
        return this;
    }

    /**
     * Replaces the static attribute/value pairs that are added to every created document.
     *
     * @param multiValueHashMap the new pairs; stored as is, not copied
     * @return this handler, for call chaining
     */
    public ToLuceneContentHandler setStaticAttributeValuePairs(MultiValueHashMap<String, String> multiValueHashMap) {
        m_hsStaticAttValuePairs = multiValueHashMap;
        return this;
    }

    /**
     * Adds one Lucene field per static attribute/value pair to the given document. A pair for
     * which the field configuration creates no field is logged as a warning and skipped.
     *
     * @param document the document to extend
     * @throws Exception declared for the field creation
     */
    protected void addStaticAttValuePairs(Document document) throws Exception {
        for (Map.Entry pair : getStaticAttributeValuePairs().entryList()) {
            String attName = (String) pair.getKey();
            Object attValue = pair.getValue();
            Field staticField = m_fieldConfig.createField(attName, attValue);
            if (staticField == null) {
                Logger.getLogger(ToLuceneContentHandler.class.getName()).warning("Could not create lucene field for " + attName + ":" + ((String) attValue) + ". Will ignore it.");
                continue;
            }
            document.add(staticField);
        }
    }

    /**
     * Builds the Lucene document for one data entity.
     *
     * <p>Steps, in this order:
     * <ol>
     * <li>if the metadata has no {@code LeechMetadata.id} value, a generated id field is added;</li>
     * <li>the body text is added as {@code LeechMetadata.body} and to all of its copy targets,
     * unless the respective field name is ignored;</li>
     * <li>every metadata value is added under its own name and under all of its copy targets,
     * unless the respective field name is ignored;</li>
     * <li>the static attribute/value pairs are added;</li>
     * <li>every aggregation target that is still missing in the document is filled from the
     * first of its source fields that has a value (metadata first, static pairs as fallback);</li>
     * <li>if value constraints are set, the document is only returned if at least one of its
     * field values matches the regular expression configured for that field name.</li>
     * </ol>
     * A field the field configuration cannot create is logged as a warning and skipped.
     *
     * @param metadata metadata of the entity
     * @param str      body text of the entity
     * @return the filled document, or {@code null} if constraints are set and none is satisfied
     * @throws Exception declared for the field creation
     */
    protected Document createAndFillLuceneDocument(Metadata metadata, String str) throws Exception {
        Document document = new Document();
        if (metadata.getValues(LeechMetadata.id).length == 0) {
            addFieldOrWarn(document, LeechMetadata.id, new UID().toString());
        }
        if (!getFields2Ignore().contains(LeechMetadata.body)) {
            addFieldOrWarn(document, LeechMetadata.body, str);
        }
        for (String bodyCopyTarget : getFieldCopyMap().get(LeechMetadata.body)) {
            if (!getFields2Ignore().contains(bodyCopyTarget)) {
                addFieldOrWarn(document, bodyCopyTarget, str);
            }
        }
        for (String name : metadata.names()) {
            if (!getFields2Ignore().contains(name)) {
                for (String value : metadata.getValues(name)) {
                    addFieldOrWarn(document, name, value);
                }
            }
            for (String copyTarget : getFieldCopyMap().get(name)) {
                if (!getFields2Ignore().contains(copyTarget)) {
                    for (String value : metadata.getValues(name)) {
                        addFieldOrWarn(document, copyTarget, value);
                    }
                }
            }
        }
        addStaticAttValuePairs(document);
        for (String target : getFieldAggregationMap().keySet()) {
            if (document.get(target) != null) {
                // The target already has a value - nothing to aggregate.
                continue;
            }
            // A plain for-each ends when the sources are exhausted (the former
            // "while (true) { if (it.hasNext()) ... }" never did); the first source with a
            // value fills the target.
            for (String source : getFieldAggregationMap().get(target)) {
                String value = metadata.get(source);
                if (value == null) {
                    value = (String) getStaticAttributeValuePairs().getFirst(source);
                }
                if (value != null) {
                    addFieldOrWarn(document, target, value);
                    break;
                }
            }
        }
        if (m_hsFieldName2FieldValueConstraint == null || m_hsFieldName2FieldValueConstraint.isEmpty()) {
            return document;
        }
        for (Map.Entry<String, String> constraint : m_hsFieldName2FieldValueConstraint.entrySet()) {
            for (IndexableField candidate : document.getFields(constraint.getKey())) {
                String candidateValue = candidate.stringValue();
                // Fields without a string value cannot match a regular expression.
                if (candidateValue != null && candidateValue.matches(constraint.getValue())) {
                    return document;
                }
            }
        }
        return null;
    }

    /** Creates the field via the field configuration and adds it; logs a warning if no field could be created. */
    private void addFieldOrWarn(Document document, String name, String value) throws Exception {
        Field field = m_fieldConfig.createField(name, value);
        if (field != null) {
            document.add(field);
        } else {
            Logger.getLogger(ToLuceneContentHandler.class.getName()).warning("Could not create lucene field for " + name + ":" + value + ". Will ignore it.");
        }
    }

    /**
     * Starts the consumer threads if none are registered. The number of threads is half the
     * number of available processors (rounded), but at least one. The barrier is created for
     * that many threads plus one additional party. All threads are daemon threads.
     */
    protected void ensureConsumerThreadsRunning() {
        if (!m_llConsumerThreads.isEmpty()) {
            return;
        }
        long halfTheProcessors = Math.round(Runtime.getRuntime().availableProcessors() / 2.0d);
        int consumerCount = Math.max((int) halfTheProcessors, 1);
        m_cyclicBarrier4DocConsumerThreads = new CyclicBarrier(consumerCount + 1);
        int threadIndex = 0;
        while (threadIndex < consumerCount) {
            Thread consumer = new Thread(new DocConsumer(), "ToLuceneContentHandlerDocConsumer " + threadIndex);
            consumer.setDaemon(true);
            m_llConsumerThreads.add(consumer);
            consumer.start();
            threadIndex++;
        }
    }

    /**
     * Returns the writer new documents have to be written to.
     *
     * <p>Without index splitting this is always the initial writer. With splitting, the current
     * writer is kept until its {@code maxDoc()} reaches the configured threshold; then a new
     * temporary index is created and becomes the current writer. The temporary index is placed
     * next to the initial index ({@code <index path>_<n>}) if that index lives in an
     * {@link FSDirectory}, otherwise below {@code <java.io.tmpdir>/leechTmp/}. Its path is
     * remembered for the merge in {@code crawlFinished()}.
     *
     * @return the writer to use
     * @throws CorruptIndexException     declared for the creation of the new writer
     * @throws LockObtainFailedException declared for the creation of the new writer
     * @throws IOException               if the temporary index cannot be created
     */
    protected synchronized IndexWriter getCurrentWriter() throws CorruptIndexException, LockObtainFailedException, IOException {
        if (getSplitAndMergeIndex() <= 0) {
            return m_initialLuceneWriter;
        }
        if (m_luceneWriter.maxDoc() < getSplitAndMergeIndex()) {
            return m_luceneWriter;
        }
        Directory initialDirectory = m_initialLuceneWriter.getDirectory();
        File tmpIndexDir;
        if (initialDirectory instanceof FSDirectory) {
            String initialIndexPath = ((FSDirectory) initialDirectory).getDirectory().getAbsolutePath();
            tmpIndexDir = new File(initialIndexPath + "_" + (m_hsTmpLuceneWriterPaths2Merge.size() + 1));
        } else {
            tmpIndexDir = new File(new File(System.getProperty("java.io.tmpdir")).getAbsolutePath() + "/leechTmp/" + UUID.randomUUID().toString().replaceAll("\\W", "_"));
        }
        Logger.getLogger(ToLuceneContentHandler.class.getName()).info("Current index exceeds " + m_iSplitIndexDocumentCount + " documents. Will create another temporary one under " + tmpIndexDir);
        IndexWriterConfig tmpWriterConfig = new IndexWriterConfig(Version.LUCENE_CURRENT, m_initialLuceneWriter.getConfig().getAnalyzer());
        tmpWriterConfig.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
        IndexWriter replacedWriter = m_luceneWriter;
        m_luceneWriter = new IndexWriter(new SimpleFSDirectory(tmpIndexDir), tmpWriterConfig);
        // Remember the replaced temporary writer for closing - for every kind of initial
        // directory, and only once its successor exists, so a failed switch does not register
        // the still-active writer.
        if (replacedWriter != m_initialLuceneWriter) {
            m_llIndexWriter2Close.add(replacedWriter);
        }
        m_hsTmpLuceneWriterPaths2Merge.add(tmpIndexDir.getAbsolutePath());
        return m_luceneWriter;
    }

    /**
     * Logs the directory of the current writer and makes sure the consumer threads are running.
     * Called at the end of every constructor of this class.
     */
    @Override // de.dfki.km.leech.sax.DataSinkContentHandler
    protected void init() {
        Logger logger = Logger.getLogger(ToLuceneContentHandler.class.getName());
        String indexLocation = m_luceneWriter.getDirectory().toString();
        logger.info("Will write crawled data into " + indexLocation);
        ensureConsumerThreadsRunning();
    }
}
