package org.dynaq.util.wikipedia;

import de.dfki.inquisition.collections.ConfigurationException;
import de.dfki.inquisition.collections.MultiValueHashMap;
import de.dfki.inquisition.lucene.IndexAccessor;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Set;
import java.util.logging.Logger;
import javax.xml.stream.XMLEventFactory;
import javax.xml.stream.XMLEventReader;
import javax.xml.stream.XMLEventWriter;
import javax.xml.stream.XMLInputFactory;
import javax.xml.stream.XMLOutputFactory;
import javax.xml.stream.XMLStreamException;
import javax.xml.stream.events.XMLEvent;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.Term;
import org.dynaq.config.AttributeConfig;
import org.dynaq.index.LuceneIndexSet;
import org.dynaq.util.lucene.FieldFactory;

/* loaded from: input_file:org/dynaq/util/wikipedia/WikipediaUtils.class */
/**
 * Command-line style helpers for preparing Wikipedia XML dumps and for bulk-editing the
 * category field of an existing Lucene index.
 */
public class WikipediaUtils {

    private static final Logger LOGGER = Logger.getLogger(WikipediaUtils.class.getName());

    /** Dump processed by {@link #main(String[])} when no path is passed on the command line. */
    private static final String DEFAULT_DUMP_PATH =
            "/home/reuschling/Projectz/leech/resource/testData_wikipedia/de_wikidump_mit_header_short_test4allocation.xml";

    /** Encoding name handed to the StAX reader and writer factories. */
    private static final String XML_ENCODING = "Utf-8";

    /** A progress line is emitted every time this many items have been handled. */
    private static final int PROGRESS_INTERVAL = 10000;

    /** Buffered events are written out at a page end once the buffer holds more than this many. */
    private static final int FLUSH_THRESHOLD = 1000;

    /** Keywords (lower case) that mark a page text as a redirect. */
    private static final String[] REDIRECT_KEYWORDS = {"#redirect", "#weiterleitung"};

    /** Title prefixes (lower case) of pages that are removed from the rewritten dump. */
    private static final String[] SKIPPED_TITLE_PREFIXES = {
        "category:", "kategorie:", "vorlage:", "template:", "hilfe:", "help:", "wikipedia:", "portal:", "mediawiki:"
    };

    /**
     * Runs {@link #wikipediaRedirects2TitleTags(File)} on a dump file.
     *
     * @param strArr optional; when at least one argument is present the first one is used as the
     *               dump path, otherwise the built-in default path is processed
     * @throws Exception whatever the conversion throws
     */
    public static void main(String[] strArr) throws Exception {
        String dumpPath = (strArr != null && strArr.length > 0) ? strArr[0] : DEFAULT_DUMP_PATH;
        wikipediaRedirects2TitleTags(new File(dumpPath));
    }

    /**
     * Consumes the run of consecutive character events at the current reader position.
     *
     * @param xMLEventReader reader to consume from; it is left positioned on the first
     *                       non-character event (or at the end of input)
     * @param linkedList     receives every consumed event, in reading order
     * @return the concatenated character data, or an empty string if the next event is not a
     *         character event
     * @throws XMLStreamException if the reader fails
     */
    static String readNextCharEventsText(XMLEventReader xMLEventReader, LinkedList<XMLEvent> linkedList) throws XMLStreamException {
        StringBuilder text = new StringBuilder();
        while (xMLEventReader.hasNext() && xMLEventReader.peek().isCharacters()) {
            XMLEvent characters = xMLEventReader.nextEvent();
            text.append(characters.asCharacters().getData());
            linkedList.add(characters);
        }
        return text.toString();
    }

    /**
     * Replaces the {@code DYNAQ_CATEGORY} field of every non-deleted document in an index with a
     * single value, then commits and optimizes the index.
     *
     * <p>Commit and optimize are attempted even when the update loop fails, so documents updated
     * before the failure are persisted. The writer and the reader are released in every case.
     *
     * <p>NOTE(review): each document is loaded through {@code IndexReader.document(int)} and
     * written back with {@code updateDocument}; presumably only stored fields survive that round
     * trip — confirm this is acceptable for the indexes this is run against.
     *
     * @param str  index location as accepted by {@code IndexAccessor}
     * @param str2 category value written into every document
     * @throws ConfigurationException propagated from the index accessor
     * @throws Exception              on any index read/write failure
     */
    public static void setCategoryDocEntries(String str, String str2) throws ConfigurationException, Exception {
        IndexReader reader = IndexAccessor.getLuceneIndexReader(str, false);
        try {
            IndexWriter writer = IndexAccessor.getIndexWriter(str, LuceneIndexSet.getDynaQAnalyzer());
            try {
                try {
                    rewriteCategories(reader, writer, str2);
                } finally {
                    // Runs on failure too, so that partial progress is not lost.
                    LOGGER.info("will commit");
                    writer.commit();
                    LOGGER.info("will optimize");
                    writer.optimize();
                }
            } finally {
                IndexAccessor.releaseIndexWriter(writer);
            }
        } finally {
            IndexAccessor.releaseLuceneIndexReader(reader);
            LOGGER.info("finished");
        }
    }

    /** Rewrites the category field of every live document visible to {@code reader}. */
    private static void rewriteCategories(IndexReader reader, IndexWriter writer, String category) throws Exception {
        int docCount = reader.maxDoc();
        int updated = 0;
        for (int docId = 0; docId < docCount; docId++) {
            if (reader.isDeleted(docId)) {
                continue;
            }
            Document document = reader.document(docId);
            document.removeFields(AttributeConfig.IndexAttributes.DYNAQ_CATEGORY);
            document.add(FieldFactory.createField(AttributeConfig.IndexAttributes.DYNAQ_CATEGORY, category));
            Term idTerm = new Term(AttributeConfig.IndexAttributes.ID, document.get(AttributeConfig.IndexAttributes.ID));
            writer.updateDocument(idTerm, document);
            updated++;
            if (updated % PROGRESS_INTERVAL == 0) {
                LOGGER.info(updated + " documents updated");
            }
        }
    }

    /**
     * Rewrites a Wikipedia XML dump so that redirect pages become additional {@code <title>}
     * elements on the page they point to.
     *
     * <p>The dump is read twice. The first pass records, for every page whose text starts with
     * {@code #redirect} or {@code #weiterleitung} (case-insensitive), the link target between the
     * first {@code [[} and the following {@code ]]}. The second pass copies the dump to
     * {@code redirectTitles_<name>} in the same directory, dropping redirect pages and pages whose
     * title starts with one of the skipped namespace prefixes, and appending one extra
     * {@code <title>} per recorded redirect after the {@code </title>} of each target page.
     *
     * @param file the dump to read
     * @throws FileNotFoundException if the dump cannot be opened or the output file cannot be created
     * @throws XMLStreamException    on XML read/write failures; an I/O failure while closing a
     *                               stream is wrapped in this type as well
     */
    @SuppressWarnings("rawtypes") // generic signature of MultiValueHashMap is not visible from this file
    public static void wikipediaRedirects2TitleTags(File file) throws FileNotFoundException, XMLStreamException {
        MultiValueHashMap redirectsByTarget = new MultiValueHashMap();
        Set<String> redirectPageTitles = new HashSet<String>();
        XMLInputFactory inputFactory = XMLInputFactory.newInstance();
        try {
            collectRedirects(file, inputFactory, redirectsByTarget, redirectPageTitles);

            System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~");
            System.out.println("Redirects found: " + redirectsByTarget.valueSize());
            System.out.println("will now inject corresponding title tags");

            // getAbsoluteFile() first: getParentFile() is null for a bare relative file name.
            File output = new File(file.getAbsoluteFile().getParentFile(), "redirectTitles_" + file.getName());
            int injected = writeWithRedirectTitles(file, output, inputFactory, redirectsByTarget, redirectPageTitles);

            System.out.println("...finished. Inserted " + injected + " redirect titles");
        } catch (FileNotFoundException e) {
            throw e;
        } catch (IOException e) {
            // Only reachable from closing a stream; keep the declared exception types unchanged.
            throw new XMLStreamException("I/O error while processing " + file, e);
        }
    }

    /** First pass: fills the target-to-redirect-titles map and the set of redirect page titles. */
    @SuppressWarnings({"rawtypes", "unchecked"}) // see wikipediaRedirects2TitleTags
    private static void collectRedirects(File file, XMLInputFactory inputFactory, MultiValueHashMap redirectsByTarget,
            Set<String> redirectPageTitles) throws IOException, XMLStreamException {
        try (FileInputStream in = new FileInputStream(file)) {
            XMLEventReader reader = inputFactory.createXMLEventReader(in, XML_ENCODING);
            try {
                String currentTitle = "";
                int pagesRead = 0;
                while (reader.hasNext()) {
                    XMLEvent event = reader.nextEvent();
                    if (!event.isStartElement()) {
                        continue;
                    }
                    String elementName = event.asStartElement().getName().getLocalPart();
                    if (elementName.equals("title")) {
                        currentTitle = readNextCharEventsText(reader, new LinkedList<XMLEvent>());
                        pagesRead++;
                        if (pagesRead % PROGRESS_INTERVAL == 0) {
                            System.out.println("read doc #" + pagesRead);
                        }
                    } else if (elementName.equals("text") && reader.hasNext() && reader.peek().isCharacters()) {
                        String wikiText = readNextCharEventsText(reader, new LinkedList<XMLEvent>());
                        String target = extractRedirectTarget(wikiText);
                        if (target != null) {
                            redirectsByTarget.add(target, currentTitle);
                            redirectPageTitles.add(currentTitle);
                            System.out.println("redirect found: (" + redirectPageTitles.size() + ") " + currentTitle + " => '" + target + "'");
                        }
                    }
                }
            } finally {
                // XMLEventReader.close() does not close the stream; try-with-resources does that.
                reader.close();
            }
        }
    }

    /**
     * Second pass: copies {@code source} to {@code output}, dropping unwanted pages and injecting
     * the redirect titles.
     *
     * @return the number of {@code <title>} elements actually injected
     */
    @SuppressWarnings("rawtypes") // see wikipediaRedirects2TitleTags
    private static int writeWithRedirectTitles(File source, File output, XMLInputFactory inputFactory,
            MultiValueHashMap redirectsByTarget, Set<String> redirectPageTitles) throws IOException, XMLStreamException {
        XMLEventFactory eventFactory = XMLEventFactory.newInstance();
        int injected = 0;
        try (FileOutputStream out = new FileOutputStream(output); FileInputStream in = new FileInputStream(source)) {
            XMLEventWriter writer = XMLOutputFactory.newInstance().createXMLEventWriter(out, XML_ENCODING);
            try {
                XMLEventReader reader = inputFactory.createXMLEventReader(in, XML_ENCODING);
                try {
                    // Events of the page(s) not yet written; a page can still be discarded while buffered.
                    LinkedList<XMLEvent> pending = new LinkedList<XMLEvent>();
                    String currentTitle = "";
                    int pagesRead = 0;
                    while (reader.hasNext()) {
                        XMLEvent event = reader.nextEvent();
                        pending.add(event);
                        if (isStartElementNamed(event, "title")) {
                            currentTitle = readNextCharEventsText(reader, pending);
                            if (redirectPageTitles.contains(currentTitle) || hasSkippedPrefix(currentTitle)) {
                                dropCurrentPage(reader, pending);
                            }
                            pagesRead++;
                            if (pagesRead % PROGRESS_INTERVAL == 0) {
                                System.out.println("read doc #" + pagesRead);
                            }
                        } else if (isEndElementNamed(event, "title")) {
                            if (redirectsByTarget.containsKey(currentTitle)) {
                                for (Object value : redirectsByTarget.get(currentTitle)) {
                                    String redirectTitle = (String) value;
                                    pending.add(eventFactory.createStartElement("", "", "title"));
                                    pending.add(eventFactory.createCharacters(redirectTitle));
                                    pending.add(eventFactory.createEndElement("", "", "title"));
                                    injected++;
                                    System.out.println("inject redirect title " + redirectTitle + " at page " + currentTitle);
                                }
                            }
                        } else if (isEndElementNamed(event, "page") && pending.size() > FLUSH_THRESHOLD) {
                            flushPending(pending, writer);
                        }
                    }
                    flushPending(pending, writer);
                } finally {
                    reader.close();
                }
            } finally {
                writer.close();
            }
        }
        return injected;
    }

    /**
     * Returns the trimmed link target of a redirect text, or {@code null} when the text is not a
     * redirect or carries no non-empty {@code [[...]]} link.
     */
    private static String extractRedirectTarget(String wikiText) {
        String trimmed = wikiText.trim();
        boolean redirect = false;
        for (String keyword : REDIRECT_KEYWORDS) {
            if (startsWithIgnoreCase(trimmed, keyword)) {
                redirect = true;
                break;
            }
        }
        if (!redirect) {
            return null;
        }
        int open = wikiText.indexOf("[[");
        if (open < 0) {
            return null;
        }
        // Search for the closing brackets after the opening ones so a stray "]]" in front is ignored.
        int close = wikiText.indexOf("]]", open + 2);
        if (close < 0) {
            return null;
        }
        String target = wikiText.substring(open + 2, close).trim();
        return target.isEmpty() ? null : target;
    }

    /** Tells whether the trimmed title starts with one of the skipped namespace prefixes. */
    private static boolean hasSkippedPrefix(String title) {
        String trimmed = title.trim();
        for (String prefix : SKIPPED_TITLE_PREFIXES) {
            if (startsWithIgnoreCase(trimmed, prefix)) {
                return true;
            }
        }
        return false;
    }

    /** Case-insensitive prefix test that does not depend on the default locale. */
    private static boolean startsWithIgnoreCase(String text, String prefix) {
        return text.regionMatches(true, 0, prefix, 0, prefix.length());
    }

    /**
     * Removes the buffered events of the page currently being read (back to and including its
     * {@code <page>} start) and skips the reader past the matching {@code </page>}. Does nothing
     * when no {@code <page>} start is buffered.
     */
    private static void dropCurrentPage(XMLEventReader reader, LinkedList<XMLEvent> pending) throws XMLStreamException {
        int eventsToRemove = 0;
        boolean pageStartFound = false;
        Iterator<XMLEvent> newestFirst = pending.descendingIterator();
        while (newestFirst.hasNext()) {
            eventsToRemove++;
            if (isStartElementNamed(newestFirst.next(), "page")) {
                pageStartFound = true;
                break;
            }
        }
        if (!pageStartFound) {
            return;
        }
        for (int i = 0; i < eventsToRemove; i++) {
            pending.removeLast();
        }
        while (reader.hasNext()) {
            if (isEndElementNamed(reader.nextEvent(), "page")) {
                break;
            }
        }
    }

    /** Writes all buffered events, empties the buffer and flushes the writer. */
    private static void flushPending(LinkedList<XMLEvent> pending, XMLEventWriter writer) throws XMLStreamException {
        for (XMLEvent event : pending) {
            writer.add(event);
        }
        pending.clear();
        writer.flush();
    }

    /** True if {@code event} is a start element with the given local name. */
    private static boolean isStartElementNamed(XMLEvent event, String localName) {
        return event.isStartElement() && localName.equals(event.asStartElement().getName().getLocalPart());
    }

    /** True if {@code event} is an end element with the given local name. */
    private static boolean isEndElementNamed(XMLEvent event, String localName) {
        return event.isEndElement() && localName.equals(event.asEndElement().getName().getLocalPart());
    }
}
