package de.dfki.km.exact.lucene.wiki;

import de.dfki.km.exact.lucene.analyzer.LUAnalyzer;
import de.dfki.km.exact.lucene.file.LUFileWriter;
import de.dfki.km.exact.lucene.util.LULocal;
import de.dfki.km.exact.lucene.voc.FIELD;
import de.dfki.km.exact.lucene.voc.WIKI;
import de.dfki.km.exact.misc.EULogger;
import de.dfki.km.exact.misc.EUString;
import de.dfki.km.exact.nlp.EUCharacter;
import de.dfki.km.exact.nlp.NGram;
import de.dfki.km.exact.nlp.NLP;
import edu.jhu.nlp.wikipedia.PageCallbackHandler;
import edu.jhu.nlp.wikipedia.WikiPage;
import edu.jhu.nlp.wikipedia.WikiXMLParser;
import edu.jhu.nlp.wikipedia.WikiXMLParserFactory;
import java.io.File;
import java.io.StringReader;
import java.util.Vector;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Fieldable;

/* loaded from: input_file:WEB-INF/lib/lucene-util-17-20140430.114905-2.jar:de/dfki/km/exact/lucene/wiki/LUWikiWriter.class */
/**
 * Index writer that turns the pages of a Wikipedia XML dump into Lucene documents.
 *
 * <p>The instance registers itself as the page callback of a {@link WikiXMLParser}
 * (see {@link #analyze(File)}); each page handed to {@link #process(WikiPage)} that is
 * neither a special page nor a stub and has a non-blank title becomes one document.
 */
public class LUWikiWriter extends LUFileWriter implements PageCallbackHandler {
    /** Prefix prepended to the (underscore-separated) page title to build the page URI. */
    private final String mUriPrefix;

    /** Title suffix that marks a page as a disambiguation page for the chosen language. */
    private final String mExplanationSuffix;

    /**
     * Creates a writer and selects the language-specific URI prefix and disambiguation suffix.
     *
     * @param str      passed unchanged to the {@link LUFileWriter} constructor
     *                 (presumably the index location; {@code main} passes an index path here)
     * @param language Wikipedia language; {@code de} and {@code en} have dedicated settings,
     *                 every other value falls back to the German suffix and an "any" URI prefix
     * @throws Exception if the super constructor or the analyzer setup fails
     */
    public LUWikiWriter(String str, NLP.LANGUAGE language) throws Exception {
        super(str);
        if (language == NLP.LANGUAGE.de) {
            this.mExplanationSuffix = "(Begriffserklärung)";
            this.mUriPrefix = "http://de.wikipedia.org/wiki/";
        } else if (language == NLP.LANGUAGE.en) {
            this.mExplanationSuffix = "(disambiguation)";
            this.mUriPrefix = WIKI.URI_PREFIX_WIKIPEDIA_EN;
        } else {
            // Fallback for all other languages: German suffix, generic URI prefix.
            this.mExplanationSuffix = "(Begriffserklärung)";
            this.mUriPrefix = "http://any.wikipedia.org/wiki/";
        }
        setAnalyzer(new LUAnalyzer(EUCharacter.cloneCommonSigns()));
        setLogIndex(10000);
    }

    /**
     * Builds the page URI for a title by replacing blanks with underscores and
     * prepending the language-specific prefix.
     *
     * @param str page title
     * @return URI string for the page
     */
    private String getURI(String str) {
        return this.mUriPrefix + str.replace(' ', '_');
    }

    /**
     * Parses the given dump file with a SAX-based {@link WikiXMLParser}, using this
     * instance as the page callback.
     *
     * <p>Best effort: any exception raised while parsing is logged as a warning and
     * not propagated.
     *
     * @param file Wikipedia XML dump to parse
     */
    @Override // de.dfki.km.exact.lucene.file.LUFileWriter
    protected void analyze(File file) {
        WikiXMLParser parser = WikiXMLParserFactory.getSAXParser(file.toString());
        try {
            parser.setPageCallback(this);
            parser.parse();
        } catch (Exception e) {
            EULogger.warn(e);
        }
    }

    /**
     * Converts one wiki page into a Lucene document and adds it to the index.
     *
     * <p>Pages without a title, with a blank title, special pages and stubs are skipped.
     * A page counts as a disambiguation page if the parser says so or if its title ends
     * with the language-specific suffix.
     *
     * @param wikiPage page delivered by the parser
     */
    public void process(WikiPage wikiPage) {
        // Check the raw title before anything else: String.trim() never returns null,
        // so the null check has to happen on the untrimmed value.
        String rawTitle = wikiPage.getTitle();
        if (rawTitle == null) {
            return;
        }
        String title = rawTitle.trim();
        if (title.isEmpty() || wikiPage.isSpecialPage() || wikiPage.isStub()) {
            return;
        }

        boolean isDisambiguationPage =
                wikiPage.isDisambiguationPage() || title.endsWith(this.mExplanationSuffix);

        // Label is the title cut at the first '(' — e.g. without a trailing qualifier.
        String label = EUString.substring(title, 0, '(').trim();
        String trigrams = EUString.append(NGram.getNGrams(3, false, label, NLP.CommonDelimeter)).toLowerCase();

        Document document = new Document();
        document.add(new Field(FIELD.URI, getURI(title), Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.add(new Field("label", label, Field.Store.YES, Field.Index.ANALYZED));
        document.add(new Field(FIELD.LABEL_NA, title, Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.add(new Field(FIELD.EXPLANATION, String.valueOf(isDisambiguationPage), Field.Store.YES, Field.Index.NOT_ANALYZED));
        document.add(new Field(FIELD.TRIGRAM, trigrams, Field.Store.YES, Field.Index.ANALYZED));

        Vector categories = wikiPage.getCategories();
        if (categories != null && categories.size() > 0) {
            document.add(new Field("category", EUString.append(categories), Field.Store.YES, Field.Index.ANALYZED));
        }

        String text = wikiPage.getText();
        if (text != null) {
            // Content is supplied through a Reader and indexed with positional term vectors.
            document.add(new Field(FIELD.CONTENT, new StringReader(text), Field.TermVector.WITH_POSITIONS));
        }
        add(document);
    }

    /**
     * Command-line entry point.
     *
     * <p>With exactly three arguments: {@code <index> <dump file> <language>}, where the
     * language must be a constant name of {@link NLP.LANGUAGE}. With any other argument
     * count a German index is built from a hard-coded local dump path.
     *
     * @param strArr command-line arguments
     * @throws Exception if creating, filling or closing the index fails
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length == 3) {
            LUWikiWriter writer = new LUWikiWriter(strArr[0], NLP.LANGUAGE.valueOf(strArr[2]));
            writer.create();
            try {
                writer.add(new File(strArr[1]));
            } finally {
                // Always release the index, even when indexing fails.
                writer.close();
            }
            return;
        }
        // Developer default: German index from a dump on the original author's machine.
        LUWikiWriter defaultWriter = new LUWikiWriter(LULocal.getIndexWikipediaDE(), NLP.LANGUAGE.de);
        defaultWriter.create();
        try {
            defaultWriter.analyze(new File("C:/Users/Zodac78/Daten/Work/Programming/Dataset/Wikipedia/dewiki-20140114-pages-articles.xml.bz2"));
        } finally {
            defaultWriter.close();
        }
    }
}
