package de.dfki.catwiesel.synchronizer.importer.xml;

import de.dfki.catwiesel.categorymanager.CategoryManager;
import de.dfki.catwiesel.index.AllTypesMultiValueMap;
import de.dfki.catwiesel.index.IndexManagerQueue;
import de.dfki.catwiesel.synchronizer.importer.EnhancedRawData;
import de.dfki.catwiesel.synchronizer.importer.ImportConfiguration;
import de.dfki.catwiesel.synchronizer.importer.ImportStoppedException;
import de.dfki.catwiesel.synchronizer.importer.Importer;
import de.dfki.catwiesel.synchronizer.importer.ImporterException;
import de.dfki.catwiesel.synchronizer.importer.ImporterHelper;
import de.dfki.catwiesel.util.Catwiesel;
import de.dfki.catwiesel.util.DateParser;
import de.dfki.catwiesel.util.FileHandling;
import de.dfki.catwiesel.util.Pair;
import de.dfki.catwiesel.util.ProfilingHelper;
import de.dfki.catwiesel.util.SaxContext;
import de.dfki.catwiesel.util.SimpleMultiValueMap;
import de.dfki.catwiesel.vocabulary.AttributeURIs;
import de.dfki.catwiesel.vocabulary.StringConstants;
import de.dfki.inquisition.collections.ConfigurationException;
import de.dfki.inquisition.collections.MultiValueConfiguration;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.ontoware.rdf2go.model.node.URI;
import org.ontoware.rdf2go.model.node.impl.URIImpl;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;

/* loaded from: input_file:de/dfki/catwiesel/synchronizer/importer/xml/WikipediaImporter.class */
public class WikipediaImporter implements Importer {
    /** Message passed to the {@code ImportStoppedException} that both SAX handlers throw when a stop request is seen. */
    public static final String STOP_REQUEST = "Explicite stop request";
    /** Only created by the constructor when profiling is wanted; otherwise stays {@code null}. */
    private ProfilingHelper m_profilingHelper;
    /** Import type -> description of the supported configuration keys; filled by {@code createImportCapabilities()}. */
    private Map<String, ImportConfiguration> m_importCapabilities;
    private ImporterHelper m_importerHelper;
    private CategoryManager m_categoryManager;
    // SAX contexts the handlers test the current element stack against via isRepresenting(...).
    private static final SaxContext PAGE_TAG = new SaxContext("page");
    private static final SaxContext WIKIPEDIA_TAG = new SaxContext("mediawiki");
    private static final SaxContext DATE_TAG = new SaxContext("timestamp");
    private static final SaxContext CREATOR_TAG = new SaxContext("username");
    // NOTE(review): tag name comes from StringConstants.INDEX_TYPE_TEXT - presumably "text"; confirm.
    private static final SaxContext CONTENT_TAG = new SaxContext(StringConstants.INDEX_TYPE_TEXT);
    private static final SaxContext COMMENT_TAG = new SaxContext("comment");
    private static final SaxContext BASE_SOURCE_TAG = new SaxContext("base");
    private static final SaxContext SOURCE_PAGE_TITLE = new SaxContext("title");
    // NOTE(review): presumably an <id> element directly below <page>; depends on SaxContext semantics - confirm.
    private static final SaxContext PAGE_ID = new SaxContext("page", "id");
    /** SAX context -> attribute URIs stored for the text of a matching element; built once by {@code createAttributeMappingTable()}. */
    private static SimpleMultiValueMap<SaxContext, URI> m_attributeMappingTable = createAttributeMappingTable();
    private static Logger m_logger = Logger.getLogger(WikipediaImporter.class.getName());
    /** Lower-cased names of categories that may legitimately have no parent; filled in the static initializer. */
    private static final LinkedList<String> ROOT_CATEGORY_LIST = new LinkedList<>();
    /** Category name -> category URI; extended by the preprocessing pass and read by the main pass. */
    private HashMap<String, URI> m_wikipediaCategories = new HashMap<>();
    /** Patterns matched against category names; matching categories are deleted when the main pass ends. */
    private LinkedList<Pattern> m_excludePatterns = new LinkedList<>();
    /** A thread that is a key of this map gets its running parse aborted at the next start tag. */
    private HashMap<Thread, Boolean> m_stopRequests = new HashMap<>();
    /** Redirect destination -> titles of the pages redirecting to it; collected by the preprocessing pass. */
    private SimpleMultiValueMap<String, String> m_redirectLinks = new SimpleMultiValueMap<>();
    // Parent used for documents/categories without a resolvable parent category.
    // NOTE(review): still null here; presumably assigned during startImport - confirm.
    private URI m_lostAndFoundCategoryUri = null;
    private IndexManagerQueue m_indexManagerQueue = IndexManagerQueue.getInstance();
    /** Import types accepted by {@code process(...)}; the constructor adds the wikipedia type. */
    private List<String> m_typeList = new LinkedList();

    /* loaded from: input_file:de/dfki/catwiesel/synchronizer/importer/xml/WikipediaImporter$SaxHandler.class */
    public class SaxHandler extends DefaultHandler {
        /** Base URL of the wiki, derived from the {@code <base>} element by {@code getBaseSource}. */
        private String m_baseSource;
        /** Import request this handler works for (parent URI, no-index-read flag, extra meta attributes). */
        private EnhancedRawData m_enhancedRawData;
        /** Character data collected since the last {@code <base>} or {@code <page>} start tag (comments excluded). */
        private StringBuilder m_textTagContent = new StringBuilder();
        /** Character data collected since the most recent start tag (comments excluded). */
        private StringBuilder m_innerTagContent = new StringBuilder();
        /** Attribute/value pairs gathered for the page currently being parsed; replaced at every {@code <page>} start. */
        private LinkedList<Pair<URI, Object>> m_metaAttributes = new LinkedList<>();
        /** Stack of currently open elements. */
        private SaxContext m_saxContext = new SaxContext();
        /** Category name when the current page is a category page, otherwise {@code null}. */
        private String m_categoryName = null;
        /** Number of documents handed to the index manager queue so far. */
        private int m_numberOfInsertedDocuments = 0;

        /**
         * Creates the handler for the main import pass.
         *
         * @param enhancedRawData the import request the parsed pages belong to
         */
        public SaxHandler(EnhancedRawData enhancedRawData) {
            this.m_enhancedRawData = enhancedRawData;
        }

        /**
         * Registers the opened element in the SAX context and resets the buffers tied to it.
         * A {@code <base>} or {@code <page>} start clears the accumulated text; a {@code <page>}
         * start additionally discards the previous page's attributes and category name.
         *
         * @throws ImportStoppedException if a stop request exists for the current thread
         */
        @Override
        public void startElement(String str, String str2, String str3, Attributes attributes) throws ImportStoppedException {
            final boolean stopRequested = WikipediaImporter.this.m_stopRequests.containsKey(Thread.currentThread());
            if (stopRequested) {
                WikipediaImporter.getLogger().info("Got stop request");
                throw new ImportStoppedException(WikipediaImporter.STOP_REQUEST);
            }

            final SaxContext context = this.m_saxContext;
            context.addToContext(str3);

            if (context.isRepresenting(WikipediaImporter.BASE_SOURCE_TAG)) {
                this.m_textTagContent = new StringBuilder();
            }
            if (context.isRepresenting(WikipediaImporter.PAGE_TAG)) {
                // A new page starts: forget everything collected for the previous one.
                this.m_textTagContent = new StringBuilder();
                this.m_metaAttributes = new LinkedList<>();
                this.m_categoryName = null;
            }

            // The per-element buffer always starts empty.
            this.m_innerTagContent = new StringBuilder();
        }

        /**
         * Appends character data to both text buffers, except while inside a {@code <comment>} element.
         */
        @Override
        public void characters(char[] cArr, int i, int i2) {
            if (this.m_saxContext.isRepresenting(WikipediaImporter.COMMENT_TAG)) {
                WikipediaImporter.getLogger().finest("Will skip comment");
                return;
            }
            this.m_textTagContent.append(cArr, i, i2);
            this.m_innerTagContent.append(cArr, i, i2);
        }

        /**
         * Handles a closing tag: stores the base source, records translatable attributes,
         * finishes a page on {@code </page>} and finishes the whole import on {@code </mediawiki>}.
         * The element is removed from the SAX context last.
         *
         * @throws SAXException wrapping any {@code ImporterException} raised while handling the element
         */
        @Override
        public void endElement(String str, String str2, String str3) throws SAXException {
            final SaxContext context = this.m_saxContext;
            try {
                if (context.isRepresenting(WikipediaImporter.BASE_SOURCE_TAG)) {
                    this.m_baseSource = getBaseSource(this.m_textTagContent.toString());
                } else {
                    final String elementText = this.m_innerTagContent.toString();
                    // A title in the category namespace marks the whole page as a category page.
                    if (context.isRepresenting(WikipediaImporter.SOURCE_PAGE_TITLE)
                            && WikipediaImporter.isTitleRepresentingAWikipediaCategory(elementText)) {
                        this.m_categoryName = WikipediaImporter.extractNameFromWikipediaCategoryEntry(elementText);
                    }
                    WikipediaImporter.this.addAttributeIfTranslatable(this.m_metaAttributes, context, elementText);
                }

                if (context.isRepresenting(WikipediaImporter.PAGE_TAG)) {
                    wikipediaPageFinished();
                }

                if (context.isRepresenting(WikipediaImporter.WIKIPEDIA_TAG)) {
                    deleteCategoriesToBeExcluded();
                    WikipediaImporter.getLogger().fine("Processing of wikipedia entries is now finished");
                    if (WikipediaImporter.this.isProfilingWanted()) {
                        WikipediaImporter.this.m_profilingHelper.dispose();
                    }
                }

                context.removeFromContext(str3);
            } catch (ImporterException importerFailure) {
                throw new SAXException("Importer error while parsing document", importerFailure);
            }
        }

        /**
         * Deletes every known wikipedia category whose name matches one of the exclusion patterns.
         * Only URIs the category manager reports as categories are passed to the index manager
         * queue for deletion.
         */
        private void deleteCategoriesToBeExcluded() {
            for (Map.Entry<String, URI> category : WikipediaImporter.this.m_wikipediaCategories.entrySet()) {
                final String categoryName = category.getKey();
                for (Pattern excludePattern : WikipediaImporter.this.m_excludePatterns) {
                    if (!excludePattern.matcher(categoryName).matches()) {
                        continue;
                    }
                    final URI categoryUri = category.getValue();
                    if (WikipediaImporter.this.m_categoryManager.isCategory(categoryUri)) {
                        WikipediaImporter.this.m_indexManagerQueue.delete(categoryUri);
                    }
                    // One matching pattern is enough; the previous loop had no exit at all and
                    // spun forever once the pattern iterator was exhausted.
                    break;
                }
            }
        }

        /**
         * Called on {@code </page>}: category pages are moved below their parent categories,
         * other non-empty pages are queued for indexing unless they are redirects.
         * Afterwards the profiling helper may pause the import.
         */
        private void wikipediaPageFinished() {
            if (isCategoryEntry()) {
                moveCategoryToDestinations(this.m_categoryName);
            } else if (this.m_textTagContent.length() != 0) {
                final URI documentUri = addSourceAttributeAndCreateUriIfNotRedirectLink();
                if (documentUri != null) {
                    insertNewEntryIntoIndexManagerQueue(documentUri, this.m_enhancedRawData.getParentURI(), this.m_metaAttributes);
                }
            }

            if (!WikipediaImporter.this.isProfilingWanted()) {
                return;
            }
            if (WikipediaImporter.this.m_profilingHelper.isPointOfInterestReached(this.m_numberOfInsertedDocuments)) {
                WikipediaImporter.this.m_profilingHelper.waitForUserOkay();
            }
        }

        /**
         * Places the category named {@code str} below the parent categories referenced in the page text.
         * The first resolvable parent receives the category via a move out of the import root;
         * every further resolvable parent gets it added. Without any resolvable parent the category
         * goes to the lost-and-found category, unless it is a known root category or lost-and-found
         * is the import root itself.
         *
         * @param str name of the category described by the current page
         */
        private void moveCategoryToDestinations(String str) {
            final URI categoryUri = WikipediaImporter.this.getUriForWikipediaCategory(str);
            if (categoryUri == null) {
                WikipediaImporter.getLogger().warning("Could not find URI for category '" + str + "'. Ignored");
                return;
            }

            final List<String> parentNames = WikipediaImporter.getWikipediaCategories(this.m_textTagContent.toString());
            if (parentNames.isEmpty()) {
                // Root categories are expected to have no parent and stay where they are.
                if (!WikipediaImporter.ROOT_CATEGORY_LIST.contains(str.toLowerCase())) {
                    WikipediaImporter.getLogger().warning("Category '" + str + "' has no parent category. Will be put into lost-and-found category.");
                    WikipediaImporter.this.m_categoryManager.move(this.m_enhancedRawData.getParentURI(), WikipediaImporter.this.m_lostAndFoundCategoryUri, categoryUri);
                }
                return;
            }
            if (parentNames.size() > 1) {
                WikipediaImporter.getLogger().finer("Category '" + str + "' has more than one parent");
            }

            boolean alreadyMoved = false;
            for (String parentName : parentNames) {
                final URI parentUri = WikipediaImporter.this.getUriForWikipediaCategory(parentName);
                if (parentUri == null) {
                    WikipediaImporter.getLogger().warning("Could not find URI for parent category '" + parentName + "'. Ignored");
                    continue;
                }
                if (alreadyMoved) {
                    WikipediaImporter.getLogger().finer("Will add '" + str + "' to '" + parentName + "' (" + parentUri + ")");
                    WikipediaImporter.this.m_categoryManager.add(parentUri, categoryUri);
                } else {
                    alreadyMoved = true;
                    WikipediaImporter.getLogger().finer("Will move '" + str + "' to '" + parentName + "' (" + parentUri + ")");
                    WikipediaImporter.this.m_categoryManager.move(this.m_enhancedRawData.getParentURI(), parentUri, categoryUri);
                }
            }

            if (alreadyMoved) {
                return;
            }

            // None of the referenced parents could be resolved.
            final URI importRoot = this.m_enhancedRawData.getParentURI();
            if (WikipediaImporter.this.m_lostAndFoundCategoryUri.equals(importRoot)) {
                WikipediaImporter.getLogger().warning("Category '" + str + "' has no parent category. Will remain in root category of import.");
                return;
            }
            WikipediaImporter.getLogger().warning("Category '" + str + "' has no parent category. Will be put into lost-and-found category.");
            WikipediaImporter.this.m_categoryManager.move(this.m_enhancedRawData.getParentURI(), WikipediaImporter.this.m_lostAndFoundCategoryUri, categoryUri);
        }

        /** @return {@code true} if a category name was recorded for the page currently being parsed */
        private boolean isCategoryEntry() {
            final String pendingCategory = this.m_categoryName;
            return pendingCategory != null;
        }

        /**
         * Builds an extended title made of {@code str} followed by all titles redirecting to it,
         * separated by {@code ';'}.
         *
         * @param str the page title
         * @return the extended title, or {@code null} if no redirect points to the page
         */
        private String addRedirectsToTitle(String str) {
            final Set<String> redirectSources = getAllRedirectLinksTo(str);
            if (redirectSources.isEmpty()) {
                return null;
            }

            final StringBuilder extended = new StringBuilder(String.valueOf(str));
            for (String redirectSource : redirectSources) {
                extended.append(";").append(redirectSource);
            }
            final String extendedTitle = extended.toString();

            WikipediaImporter.getLogger().finest("Title '" + str + "' was replaced by '" + extendedTitle + "'");
            return extendedTitle;
        }

        /**
         * Collects the titles of all pages redirecting to {@code str}. Besides the title itself,
         * the lookup also tries the title without its {@code #anchor} part and the variant with
         * underscores and blanks exchanged.
         *
         * @param str the destination title
         * @return the redirecting titles; empty when the lookups yield nothing
         */
        private Set<String> getAllRedirectLinksTo(String str) {
            final Set<String> redirectSources = new HashSet<>();
            String lookupKey = str;
            redirectSources.addAll(WikipediaImporter.this.m_redirectLinks.get(lookupKey));

            final int anchorStart = lookupKey.indexOf('#');
            if (anchorStart >= 0) {
                lookupKey = lookupKey.substring(0, anchorStart);
                redirectSources.addAll(WikipediaImporter.this.m_redirectLinks.get(lookupKey));
            }

            // Only one spelling variant is tried: underscores win over blanks.
            if (lookupKey.contains("_")) {
                redirectSources.addAll(WikipediaImporter.this.m_redirectLinks.get(lookupKey.replace('_', ' ')));
            } else if (lookupKey.contains(" ")) {
                redirectSources.addAll(WikipediaImporter.this.m_redirectLinks.get(lookupKey.replace(' ', '_')));
            }
            return redirectSources;
        }

        /**
         * Removes the last path segment ({@code "/segment"} at the end of the input) from {@code str}.
         *
         * @param str content of the {@code <base>} element
         * @return {@code str} without its trailing path segment
         * @throws RuntimeException if {@code str} does not end with a path segment
         */
        private String getBaseSource(String str) {
            final Matcher lastSegment = Pattern.compile("(/[^/]+)$").matcher(str);
            if (!lastSegment.find()) {
                throw new RuntimeException("Could not find base URL in wikipedia data");
            }
            // Keep everything before and after the first match.
            final String head = str.substring(0, lastSegment.start());
            final String tail = str.substring(lastSegment.end());
            return head + tail;
        }

        /**
         * Looks up the title attribute of the current page and, unless the page is a redirect,
         * appends the source, human readable source and MIME type attributes, extends the title
         * with the titles redirecting to it and resolves the document URI for the source.
         *
         * @return the document URI, or {@code null} for redirect pages and pages without a title attribute
         */
        private URI addSourceAttributeAndCreateUriIfNotRedirectLink() {
            for (Pair<URI, Object> attribute : this.m_metaAttributes) {
                if (!attribute.getKey().equals(AttributeURIs.TITLE)) {
                    continue;
                }

                final String title = (String) attribute.getValue();
                if (handlePossibleRedirection(title)) {
                    return null;
                }

                final URI sourceUri = Catwiesel.createUriFromBaseUriString(this.m_baseSource, title);
                final String readableSource = Catwiesel.getHumanReadableSourceString(sourceUri);
                // The list is modified while being iterated; this is safe only because the
                // method returns before the iterator is advanced again.
                this.m_metaAttributes.addLast(new Pair<>(AttributeURIs.SOURCE, sourceUri.toString()));
                this.m_metaAttributes.addLast(new Pair<>(AttributeURIs.SOURCE_HUMAN_READABLE, readableSource));
                this.m_metaAttributes.addLast(new Pair<>(AttributeURIs.MIME_TYPE, StringConstants.IMPORT_TYPE_HTML));

                final String titleWithRedirects = addRedirectsToTitle(title);
                if (titleWithRedirects != null) {
                    attribute.setValue(titleWithRedirects);
                }
                return WikipediaImporter.this.m_importerHelper.getUriBySource(sourceUri.toString(), this.m_enhancedRawData.isNoIndexRead());
            }

            WikipediaImporter.getLogger().warning("Could not create URI for wikipedia page element");
            return null;
        }

        /**
         * Checks whether the first content attribute of the current page describes a redirect.
         *
         * @param str the page title, used for logging only
         * @return {@code true} if the page is a redirect and has to be skipped
         */
        private boolean handlePossibleRedirection(String str) {
            for (Pair<URI, Object> attribute : this.m_metaAttributes) {
                if (!attribute.getKey().equals(AttributeURIs.CONTENT)) {
                    continue;
                }
                // Only the first content attribute decides.
                final String destination = WikipediaImporter.getRedirectDestination((String) attribute.getValue());
                final boolean isRedirect = destination != null;
                if (isRedirect) {
                    WikipediaImporter.getLogger().fine("'" + str + "' is redirected to '" + destination + "' and will be skipped");
                } else {
                    WikipediaImporter.getLogger().finest("'" + str + "' is not a redirect link");
                }
                return isRedirect;
            }
            return false;
        }

        /**
         * Builds the attribute map of a document, links it to every resolvable wikipedia category
         * referenced in the page text (or to the lost-and-found category if none resolves) and
         * puts it into the index manager queue.
         *
         * @param uri the document URI
         * @param uri2 the parent URI of the import (not used by this method)
         * @param collection the attributes collected for the page
         */
        private void insertNewEntryIntoIndexManagerQueue(URI uri, URI uri2, Collection<Pair<URI, Object>> collection) {
            final boolean noIndexRead = this.m_enhancedRawData.isNoIndexRead();
            final AllTypesMultiValueMap documentMap = AllTypesMultiValueMap.createForDocument(uri, collection.iterator(), noIndexRead);
            AllTypesMultiValueMap.addAdditionalMetaAttributes(documentMap, this.m_enhancedRawData.getMetaAttributeIterator());

            boolean hasParent = false;
            final List<String> categoryNames = WikipediaImporter.getWikipediaCategories(this.m_textTagContent.toString());
            for (String categoryName : categoryNames) {
                final URI categoryUri = WikipediaImporter.this.getUriForWikipediaCategory(categoryName);
                if (categoryUri == null) {
                    WikipediaImporter.getLogger().warning("Could not find URI for parent category '" + categoryName + "'. Ignored");
                    continue;
                }
                documentMap.add(StringConstants.INDEX_TYPE_STRUCTURE, AttributeURIs.PARENT_URI, categoryUri);
                hasParent = true;
            }
            if (!hasParent) {
                WikipediaImporter.getLogger().info("Document '" + uri + "' has no parent category. Will be put into lost-and-found category.");
                documentMap.add(StringConstants.INDEX_TYPE_STRUCTURE, AttributeURIs.PARENT_URI, WikipediaImporter.this.m_lostAndFoundCategoryUri);
            }

            this.m_numberOfInsertedDocuments++;
            WikipediaImporter.getLogger().finer(String.valueOf(getClass().getName()) + " is putting map \"" + documentMap + "\" of wikipedia page into the IndexManagerQueue");
            WikipediaImporter.this.m_indexManagerQueue.insert(documentMap);
        }
    }

    /* loaded from: input_file:de/dfki/catwiesel/synchronizer/importer/xml/WikipediaImporter$SaxHandlerForPreprocessing.class */
    public class SaxHandlerForPreprocessing extends DefaultHandler {
        /** Collects the title text; non-null only between the start and the end of a {@code <title>} element. */
        private StringBuilder m_titleTagContent;
        /** Title of the most recently closed, non-empty {@code <title>} element; cleared when a title element opens. */
        private String m_title;
        /** Import request this handler works for (parent URI, no-index-read flag). */
        private EnhancedRawData m_enhancedRawData;
        /** All character data seen since the last start of a content element (appended for every element, not only the content one). */
        private StringBuilder m_textTagContent = new StringBuilder();
        /** Stack of currently open elements. */
        private SaxContext saxContext = new SaxContext();
        /** Category name -> lower-cased name of the category it redirects to; resolved when the document ends. */
        private HashMap<String, String> m_categoryRedirects = new HashMap<>();
        /** Number of pages that are not redirects. */
        private int m_numberOfNonRedirectedPages = 0;

        /**
         * Creates the handler for the preprocessing pass.
         *
         * @param enhancedRawData the import request the parsed pages belong to
         */
        public SaxHandlerForPreprocessing(EnhancedRawData enhancedRawData) {
            this.m_enhancedRawData = enhancedRawData;
        }

        /**
         * Registers the opened element in the SAX context and resets the buffers tied to it.
         * <p>
         * A {@code <page>} start now clears the per-page state (text buffer and title), as the
         * main-pass handler does. Previously a page without a content or title element was
         * evaluated with the text or title left over from the preceding page, so it could be
         * recorded as a redirect or category by mistake.
         *
         * @throws ImportStoppedException if a stop request exists for the current thread
         */
        @Override
        public void startElement(String str, String str2, String str3, Attributes attributes) throws ImportStoppedException {
            if (WikipediaImporter.this.m_stopRequests.containsKey(Thread.currentThread())) {
                WikipediaImporter.getLogger().info("Got stop request");
                throw new ImportStoppedException(WikipediaImporter.STOP_REQUEST);
            }
            this.saxContext.addToContext(str3);
            if (this.saxContext.isRepresenting(WikipediaImporter.PAGE_TAG)) {
                // Start every page with clean state so nothing leaks over from the previous page.
                this.m_textTagContent = new StringBuilder();
                this.m_title = null;
            }
            if (this.saxContext.isRepresenting(WikipediaImporter.CONTENT_TAG)) {
                this.m_textTagContent = new StringBuilder();
            }
            if (this.saxContext.isRepresenting(WikipediaImporter.SOURCE_PAGE_TITLE)) {
                this.m_titleTagContent = new StringBuilder();
                this.m_title = null;
            }
        }

        /**
         * Appends character data to the text buffer and, while a title element is open,
         * to the title buffer as well.
         */
        @Override
        public void characters(char[] cArr, int i, int i2) {
            this.m_textTagContent.append(cArr, i, i2);

            final StringBuilder openTitle = this.m_titleTagContent;
            if (openTitle == null) {
                return;
            }
            openTitle.append(cArr, i, i2);
        }

        /**
         * Handles a closing tag: stores a finished title, evaluates a finished page and, on
         * {@code </mediawiki>}, resolves category redirects and logs the preprocessing summary.
         * The element is removed from the SAX context last.
         */
        @Override
        public void endElement(String str, String str2, String str3) throws SAXException {
            final SaxContext context = this.saxContext;

            if (context.isRepresenting(WikipediaImporter.SOURCE_PAGE_TITLE)) {
                // An empty title element leaves m_title untouched (null since the element started).
                if (this.m_titleTagContent.length() != 0) {
                    this.m_title = this.m_titleTagContent.toString();
                }
                this.m_titleTagContent = null;
            }

            if (context.isRepresenting(WikipediaImporter.PAGE_TAG)) {
                wikipediaPageFinished();
            }

            if (context.isRepresenting(WikipediaImporter.WIKIPEDIA_TAG)) {
                handleCategoryRedirects();
                final Logger log = WikipediaImporter.getLogger();
                log.info("Preprocessing is now finished. Found " + WikipediaImporter.this.m_redirectLinks.size() + " different redirect destinations and " + WikipediaImporter.this.m_wikipediaCategories.size() + " wikipedia categories.");
                log.info("Wikipedia file contains " + this.m_numberOfNonRedirectedPages + " pages (redirects excluded).");
                if (WikipediaImporter.this.isProfilingWanted()) {
                    WikipediaImporter.this.m_profilingHelper.waitForUserOkay();
                }
            }

            context.removeFromContext(str3);
        }

        /**
         * Resolves the collected category redirects. Each round maps every redirecting category
         * whose destination is already known onto the destination's URI; rounds repeat so that
         * chains of redirects resolve step by step. A round without progress means the remaining
         * destinations do not exist: they are reported and dropped.
         */
        private void handleCategoryRedirects() {
            while (!this.m_categoryRedirects.isEmpty()) {
                final int pendingBeforeRound = this.m_categoryRedirects.size();

                final Iterator<Map.Entry<String, String>> pending = this.m_categoryRedirects.entrySet().iterator();
                while (pending.hasNext()) {
                    final Map.Entry<String, String> redirect = pending.next();
                    final String redirectingCategory = redirect.getKey();
                    final String destination = redirect.getValue().toLowerCase();
                    final URI destinationUri = WikipediaImporter.this.getUriForWikipediaCategory(destination);
                    if (destinationUri == null) {
                        continue;
                    }
                    WikipediaImporter.getLogger().info("Category '" + redirectingCategory + "' was redirected to " + destination + " (" + destinationUri + ")");
                    WikipediaImporter.this.m_wikipediaCategories.put(redirectingCategory, destinationUri);
                    pending.remove();
                }

                if (this.m_categoryRedirects.size() >= pendingBeforeRound) {
                    // No progress in this round: give up on whatever is left.
                    for (Map.Entry<String, String> unresolved : this.m_categoryRedirects.entrySet()) {
                        WikipediaImporter.getLogger().warning("Category '" + unresolved.getKey() + "' was redirected to a non existing destination: " + unresolved.getValue());
                    }
                    this.m_categoryRedirects.clear();
                }
            }
        }

        /**
         * Called on {@code </page>}. Pages without text or title are ignored. Regular pages are
         * counted and, if they are category pages, registered as top-level categories. Redirect
         * pages are recorded under their destination (anchor removed); redirecting category pages
         * are remembered for later resolution.
         */
        private void wikipediaPageFinished() {
            final String title = this.m_title;
            if (this.m_textTagContent.length() == 0 || title == null) {
                return;
            }

            String destination = WikipediaImporter.getRedirectDestination(this.m_textTagContent.toString());
            if (destination != null) {
                final int anchorStart = destination.indexOf('#');
                if (anchorStart >= 0) {
                    destination = destination.substring(0, anchorStart);
                }
                WikipediaImporter.this.m_redirectLinks.add(destination, title);
                if (WikipediaImporter.this.m_redirectLinks.size() % 10000 == 0) {
                    WikipediaImporter.getLogger().info("Processed: " + WikipediaImporter.this.m_redirectLinks.size() + " redirects in total");
                }
                if (WikipediaImporter.isTitleRepresentingAWikipediaCategory(title)) {
                    addCategoryRedirect(WikipediaImporter.extractNameFromWikipediaCategoryEntry(title), destination);
                }
            } else {
                if (WikipediaImporter.isTitleRepresentingAWikipediaCategory(title)) {
                    WikipediaImporter.this.addWikipediaToplevelCategory(WikipediaImporter.extractNameFromWikipediaCategoryEntry(title), this.m_enhancedRawData.getParentURI(), this.m_enhancedRawData.isNoIndexRead());
                }
                this.m_numberOfNonRedirectedPages++;
            }
        }

        /**
         * Remembers that category {@code str} redirects to {@code str2}. The destination is
         * reduced to its category name when one can be extracted, and stored lower-cased.
         * A category redirecting to itself is ignored.
         *
         * @param str name of the redirecting category
         * @param str2 redirect destination as found in the page text
         */
        private void addCategoryRedirect(String str, String str2) {
            final String extracted = WikipediaImporter.extractNameFromWikipediaCategoryEntry(str2, false);
            final String destination = extracted != null ? extracted : str2;
            if (!destination.equals(str)) {
                this.m_categoryRedirects.put(str, destination.toLowerCase());
            }
        }
    }

    // Registers the lower-cased names of the categories that are allowed to have no parent.
    static {
        for (String rootCategoryName : new String[] {"!Hauptkategorie", "Categories"}) {
            ROOT_CATEGORY_LIST.add(rootCategoryName.toLowerCase());
        }
    }

    /**
     * Creates the importer, registers the wikipedia import type and its capabilities and,
     * when profiling is wanted, opens the profiling helper.
     *
     * @param multiValueConfiguration configuration of the importer (not evaluated here)
     * @param importerHelper helper used to create raw data objects and resolve document URIs
     * @param categoryManager manager used to move, add and query categories
     * @throws ImporterException declared for callers; not raised by the visible code
     */
    public WikipediaImporter(MultiValueConfiguration multiValueConfiguration, ImporterHelper importerHelper, CategoryManager categoryManager) throws ImporterException {
        m_importerHelper = importerHelper;
        m_categoryManager = categoryManager;

        m_typeList.add(StringConstants.IMPORT_TYPE_WIKIPEDIA);
        m_redirectLinks.setInitialCapacityAndLoadFactor(4, 0.75f);
        createImportCapabilities();

        // m_profilingHelper is still null here, so only the system property decides.
        if (!isProfilingWanted()) {
            return;
        }
        m_profilingHelper = new ProfilingHelper("Profiling Widget");
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * @return {@code true} if a profiling helper already exists or the profiling system property is set
     */
    public boolean isProfilingWanted() {
        final boolean helperPresent = this.m_profilingHelper != null;
        return helperPresent || System.getProperty(ProfilingHelper.PROFILING_PROPERTY) != null;
    }

    /**
     * Builds the table that tells which attribute URIs receive the text of which XML element.
     *
     * @return the mapping from SAX contexts to attribute URIs
     */
    private static SimpleMultiValueMap<SaxContext, URI> createAttributeMappingTable() {
        final SimpleMultiValueMap<SaxContext, URI> mapping = new SimpleMultiValueMap<>();

        // revision meta data
        mapping.add(DATE_TAG, AttributeURIs.MODIFICATION_DATE);
        mapping.add(CREATOR_TAG, AttributeURIs.CREATOR);
        // page body and origin
        mapping.add(CONTENT_TAG, AttributeURIs.CONTENT);
        mapping.add(BASE_SOURCE_TAG, AttributeURIs.SOURCE);
        // page identification
        mapping.add(SOURCE_PAGE_TITLE, AttributeURIs.TITLE);
        mapping.add(PAGE_ID, AttributeURIs.WIKIPEDIA_PAGE_ID);

        return mapping;
    }

    /**
     * Creates the capability description of this importer: one configuration for the wikipedia
     * import type that documents every supported configuration key.
     */
    private void createImportCapabilities() {
        this.m_importCapabilities = new HashMap<>();

        final ImportConfiguration wikipediaCapabilities = new ImportConfiguration(StringConstants.IMPORT_TYPE_WIKIPEDIA);
        wikipediaCapabilities.add(
                ImportConfiguration.SOURCE_KEY,
                "The source of the wikipedia entries (currently only a standard dump from Wikipedia is possible, see http://download.wikimedia.org/backup-index.html, choose 'pages-articles.xml.bz2' of the wiki of your choice)");
        wikipediaCapabilities.add(
                ImportConfiguration.PARENT_URI_KEY,
                "The URI of the parent category.");
        wikipediaCapabilities.add(
                StringConstants.LOST_AND_FOUND_CATEGORY_KEY,
                "Set to the name of a category to be used for orphaned wikipedia documents (that is: documents with no wikipedia parent category).The category will be created automatically below the root category of the import.");
        wikipediaCapabilities.add(
                StringConstants.WIKIPEDIA_EXCLUSION_PATTERN_FILE,
                "Set to a file containing exclusion patterns for wikipedia pages (that is: pages with titles matching one of the patterns will be excluded).");
        wikipediaCapabilities.add(
                StringConstants.NO_INDEX_READ_KEY,
                "Set to 'true' to fasten up the import if none of the sources to be imported are already present in the index. MUST be set to 'false' if one of the sources already exists in the index. Note that inserting a new category might be split into two independent steps inside the index manager (depending on the used category manager) and therefore might include a reading access.");
        wikipediaCapabilities.addDefaultImportAttributes();

        this.m_importCapabilities.put(StringConstants.IMPORT_TYPE_WIKIPEDIA, wikipediaCapabilities);
    }

    /**
     * Processes raw data of a supported import type. Only {@link File} payloads are accepted.
     *
     * @param enhancedRawData the data to import
     * @return {@code false} if the import type is not handled by this importer, otherwise the
     *         result of processing the file
     * @throws ImporterException if the payload is not a file or the file cannot be processed
     */
    @Override
    public boolean process(EnhancedRawData enhancedRawData) throws ImporterException {
        final boolean typeSupported = this.m_typeList.contains(enhancedRawData.getImportType());
        if (!typeSupported) {
            return false;
        }

        final Object payload = enhancedRawData.getData();
        getLogger().fine("processing data " + enhancedRawData);

        if (!(payload instanceof File)) {
            getLogger().log(Level.WARNING, "Data is not an instance of a known class");
            throw new ImporterException("Data is not an instance of a known class");
        }
        return processWikipediaFile(enhancedRawData);
    }

    /**
     * Parses the wikipedia dump file carried by {@code enhancedRawData}.
     * <p>
     * Any failure is reported as an {@link ImporterException} whose cause is always the original
     * parse failure. If the canonical path cannot be determined for the message, the absolute
     * path is used instead and the path failure is attached as a suppressed exception.
     * Previously the path failure replaced the real cause.
     *
     * @param enhancedRawData raw data whose payload is the dump {@link File}
     * @return {@code true} once parsing finished (or was stopped on request)
     * @throws ImporterException if the file could not be parsed
     */
    private boolean processWikipediaFile(EnhancedRawData enhancedRawData) throws ImporterException {
        final File file = (File) enhancedRawData.getData();
        try {
            parseFile(file, enhancedRawData);
            return true;
        } catch (Exception parseFailure) {
            String location;
            try {
                location = file.getCanonicalPath();
            } catch (IOException pathFailure) {
                // Keep the root cause; the path problem is secondary information only.
                parseFailure.addSuppressed(pathFailure);
                location = file.getAbsolutePath();
            }
            throw new ImporterException("Could not parse given file '" + location + "'", parseFailure);
        }
    }

    /**
     * Parses the dump file twice: the first pass collects redirects and categories, the second
     * pass imports the pages. Each pass reads from its own stream, which is closed when the pass
     * ends, including when parsing is aborted by a stop request or an error. Previously the
     * streams were never closed explicitly.
     *
     * @param file the wikipedia dump file
     * @param enhancedRawData the import request the file belongs to
     * @throws IOException if the file cannot be opened or read
     * @throws ParserConfigurationException if no SAX parser can be created
     * @throws SAXException if the file is not well-formed or a handler fails
     */
    private void parseFile(File file, EnhancedRawData enhancedRawData) throws IOException, ParserConfigurationException, SAXException {
        try {
            final SAXParser parser = createParser();
            try (FileInputStream preprocessingInput = new FileInputStream(file)) {
                parser.parse(preprocessingInput, new SaxHandlerForPreprocessing(enhancedRawData));
            }
            try (FileInputStream importInput = new FileInputStream(file)) {
                parser.parse(importInput, new SaxHandler(enhancedRawData));
            }
        } catch (ImportStoppedException e) {
            // A stop request is a regular way to end the import, not an error.
            getLogger().info("Parsing was stopped by explicite user request");
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Adds the text of an XML element to {@code list} under every attribute URI mapped to the
     * current SAX context. Date attributes are stored as parsed {@link Date}s and skipped when
     * the text cannot be parsed; a modification date additionally yields the formatted
     * significant-time attribute.
     *
     * @param list receives the attribute/value pairs
     * @param saxContext the context of the element that was just closed
     * @param str the text content of the element
     * @throws ImporterException declared for callers; not raised by the visible code
     */
    public void addAttributeIfTranslatable(List<Pair<URI, Object>> list, SaxContext saxContext, String str) throws ImporterException {
        final Set<URI> targetAttributes = getSetOfMatchingContextElements(saxContext);
        if (targetAttributes == null) {
            return;
        }

        for (URI attributeUri : targetAttributes) {
            final boolean isModificationDate = attributeUri.equals(AttributeURIs.MODIFICATION_DATE);
            final boolean isDate = isModificationDate || attributeUri.equals(AttributeURIs.CREATION_DATE);
            if (!isDate) {
                list.add(new Pair<>(attributeUri, str));
                continue;
            }

            final Date parsedDate = DateParser.parseDateString(str);
            if (parsedDate == null) {
                continue;
            }
            list.add(new Pair<>(attributeUri, parsedDate));
            if (isModificationDate) {
                final SimpleDateFormat significantTimeFormat = new SimpleDateFormat(StringConstants.MODIFICATION_TIME_FORMAT);
                list.add(new Pair<>(AttributeURIs.DYNAQ_SIGNIFICANT_TIME, significantTimeFormat.format(parsedDate)));
            }
        }
    }

    /**
     * Collects the attribute URIs of all mapping-table entries whose key context is represented
     * by the given context (as decided by {@code saxContext.isRepresenting(key)}).
     *
     * @param saxContext the context to test against every key of the mapping table
     * @return a new set with the URIs of all matching entries; empty when nothing matches,
     *         never {@code null}
     */
    private Set<URI> getSetOfMatchingContextElements(SaxContext saxContext) {
        SimpleMultiValueMap<SaxContext, URI> mappings = getAttributeMappingTable();
        Set<URI> matches = new HashSet<>();
        for (SaxContext candidate : mappings.getKeys()) {
            if (!saxContext.isRepresenting(candidate)) {
                continue;
            }
            matches.addAll(mappings.get(candidate));
        }
        return matches;
    }

    /**
     * Returns the table held in {@code m_attributeMappingTable}, which associates
     * {@code SaxContext} keys with attribute {@code URI} values.
     *
     * @return the mapping table exactly as stored in the field (no copy is made)
     */
    private SimpleMultiValueMap<SaxContext, URI> getAttributeMappingTable() {
        return m_attributeMappingTable;
    }

    /**
     * Creates a SAX parser from a freshly obtained, default-configured {@link SAXParserFactory}.
     *
     * @return a new parser instance
     * @throws ParserConfigurationException if the factory cannot create a parser
     * @throws SAXException                 if the underlying SAX implementation reports an error
     */
    private SAXParser createParser() throws ParserConfigurationException, SAXException {
        SAXParserFactory factory = SAXParserFactory.newInstance();
        return factory.newSAXParser();
    }

    /**
     * Starts an import for the given configuration.
     * <p>
     * Configurations whose import type is not {@code StringConstants.IMPORT_TYPE_WIKIPEDIA} are
     * ignored and yield {@code null}. Otherwise the source file ({@code SOURCE_KEY}) and parent
     * URI ({@code PARENT_URI_KEY}) are read from the configuration, the lost-and-found category
     * is set up, the exclusion patterns are loaded and the file is handed to {@code process}.
     * If {@code LOST_AND_FOUND_CATEGORY_KEY} is configured, a top-level category of that name is
     * registered below the parent; otherwise the parent URI itself is used.
     *
     * @param importConfiguration configuration describing what to import
     * @return the URI of the raw data object created for the source file, or {@code null} when
     *         the import type is not handled here
     * @throws ImporterException      if anything other than a configuration problem goes wrong;
     *                                the failure is logged at WARNING level and attached as cause
     * @throws ConfigurationException if the configuration is reported as invalid; passed on as is
     */
    @Override // de.dfki.catwiesel.synchronizer.importer.Importer
    public URI startImport(ImportConfiguration importConfiguration) throws ImporterException, ConfigurationException {
        getLogger().finer("starting import");
        String importType = importConfiguration.getImportType();
        if (!importType.equals(StringConstants.IMPORT_TYPE_WIKIPEDIA)) {
            return null;
        }
        try {
            File sourceFile = new File(FileHandling.getNormalizedPath(importConfiguration.getUniqueAsString(ImportConfiguration.SOURCE_KEY)));
            String parentUriString = importConfiguration.getUniqueAsString(ImportConfiguration.PARENT_URI_KEY);
            URI parentUri = Catwiesel.VIRTUAL_ROOT_STRING.equals(parentUriString) ? Catwiesel.VIRTUAL_ROOT_URI : new URIImpl(parentUriString);
            // Boolean.parseBoolean(null) is false, so a missing entry needs no separate check.
            boolean noIndexRead = Boolean.parseBoolean(importConfiguration.getFirstAsString(StringConstants.NO_INDEX_READ_KEY));
            EnhancedRawData enhancedRawData = this.m_importerHelper.getEnhancedRawData(sourceFile, sourceFile.getCanonicalPath(), parentUri, importType, noIndexRead);
            String lostAndFoundName = importConfiguration.getFirstAsString(StringConstants.LOST_AND_FOUND_CATEGORY_KEY);
            if (lostAndFoundName != null) {
                this.m_lostAndFoundCategoryUri = addWikipediaToplevelCategory(lostAndFoundName, parentUri, noIndexRead);
            } else {
                this.m_lostAndFoundCategoryUri = parentUri;
            }
            establishExclusionPatterns(importConfiguration);
            enhancedRawData.addFixedAttributesIfAny(importConfiguration);
            process(enhancedRawData);
            return enhancedRawData.getURI();
        } catch (ConfigurationException e) {
            // Must come before the generic handler: otherwise this clause is unreachable and
            // configuration errors would be wrapped instead of being passed on unchanged.
            throw e;
        } catch (Exception e) {
            getLogger().log(Level.WARNING, "Cannot process given file", e);
            throw new ImporterException("Importing of given data source failed", e);
        }
    }

    /**
     * Loads exclusion patterns from the file named by
     * {@code StringConstants.WIKIPEDIA_EXCLUSION_PATTERN_FILE}, if that entry is configured.
     * <p>
     * Every line of the file (read as UTF-8) is compiled as a case-insensitive regular expression
     * and appended to {@code m_excludePatterns}. Nothing happens when the entry is absent.
     * NOTE(review): blank lines are compiled too and yield an empty pattern; confirm how the
     * patterns are applied before relying on files that contain empty lines.
     *
     * @param importConfiguration configuration that may name the pattern file
     * @throws ImporterException      if reading the file raises an {@link IOException}; the
     *                                problem is also logged at WARNING level
     * @throws ConfigurationException declared by the signature; not thrown directly here
     */
    private void establishExclusionPatterns(ImportConfiguration importConfiguration) throws ImporterException, ConfigurationException {
        String patternFile = importConfiguration.getFirstAsString(StringConstants.WIKIPEDIA_EXCLUSION_PATTERN_FILE);
        if (patternFile == null) {
            return;
        }
        try {
            for (String regex : FileHandling.getLinesOfFileAsList(patternFile, FileHandling.ENCODING_TYPE_UTF8)) {
                this.m_excludePatterns.add(Pattern.compile(regex, Pattern.CASE_INSENSITIVE));
            }
        } catch (IOException e) {
            getLogger().log(Level.WARNING, "Error while trying to establish exclude patterns", e);
            throw new ImporterException("Error while trying to establish exclude patterns", e);
        }
    }

    /**
     * Matches text that consists of a redirect directive: optional leading whitespace, the
     * keyword {@code #Redirect} (any case), up to five arbitrary characters, and a
     * {@code [[target]]} link, followed by anything. Group 1 is the link target.
     * Compiled once because the expression never changes.
     */
    private static final Pattern REDIRECT_DIRECTIVE_PATTERN = Pattern.compile("^\\s*#Redirect.{0,5}\\[\\[(.*?)\\]\\].*\\s*", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Extracts the target of a redirect directive from the given text.
     *
     * @param str the text to inspect; may be {@code null}
     * @return the content of the first {@code [[...]]} link following the {@code #Redirect}
     *         keyword, or {@code null} when the text is {@code null} or is not a redirect
     */
    public static String getRedirectDestination(String str) {
        if (str == null) {
            return null;
        }
        Matcher matcher = REDIRECT_DIRECTIVE_PATTERN.matcher(str);
        return matcher.matches() ? matcher.group(1) : null;
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Extracts the category names referenced in the given text.
     * <p>
     * Two notations are recognised, case-insensitively, for every prefix returned by
     * {@code getCategoryPrefixes()}: the link form {@code [[Prefix:Name]]} and the template form
     * {@code {{Prefix Name}}}. Anything from the first {@code |} onwards is cut off the name.
     * Names that end up empty are skipped with a warning.
     *
     * @param str the text to scan
     * @return the category names in order of appearance (duplicates are kept); never
     *         {@code null}
     */
    public static List<String> getWikipediaCategories(String str) {
        String prefixAlternatives = getOrPattern(getCategoryPrefixes());
        String linkForm = "\\[\\[(?:" + prefixAlternatives + "):(.*?)\\]\\]";
        String templateForm = "\\{\\{(?:" + prefixAlternatives + ") (.*?)\\}\\}";
        Pattern categoryPattern = Pattern.compile("(?:" + linkForm + "|" + templateForm + ")", Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

        List<String> categories = new LinkedList<>();
        Matcher matcher = categoryPattern.matcher(str);
        while (matcher.find()) {
            // Group 1 is filled by the link form, group 2 by the template form.
            String name = matcher.group(1) != null ? matcher.group(1) : matcher.group(2);
            int pipe = name.indexOf('|');
            if (pipe >= 0) {
                name = name.substring(0, pipe);
            }
            if (name.isEmpty()) {
                getLogger().warning("Illegal Category name of length zero. Ignored");
                continue;
            }
            categories.add(name);
        }
        return categories;
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Extracts the category name from an entry of the form {@code Prefix:Name}, requiring the
     * entry to start directly with the category prefix (no leading colon accepted).
     *
     * @param str the entry to inspect
     * @return the trimmed category name, or {@code null} when the entry is not a category entry
     *         or the name is empty
     */
    public static String extractNameFromWikipediaCategoryEntry(String str) {
        final boolean prefixMustLeadEntry = true;
        return extractNameFromWikipediaCategoryEntry(str, prefixMustLeadEntry);
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Extracts the category name from an entry of the form {@code Prefix:Name}, where
     * {@code Prefix} is one of the values of {@code getCategoryPrefixes()} (matched
     * case-insensitively) and the whole entry must match.
     *
     * @param str the entry to inspect
     * @param z   {@code true} to require the entry to start with the prefix; {@code false} to
     *            additionally accept one optional leading colon ({@code :Prefix:Name})
     * @return the trimmed name, or {@code null} when the entry does not match or the trimmed name
     *         is empty (the latter is logged as a warning)
     */
    public static String extractNameFromWikipediaCategoryEntry(String str, boolean z) {
        String prefixedName = "(" + getOrPattern(getCategoryPrefixes()) + "):(.+?)";
        String regex = z ? prefixedName : ":?" + prefixedName;
        Matcher matcher = Pattern.compile(regex, Pattern.CASE_INSENSITIVE | Pattern.DOTALL).matcher(str);
        if (!matcher.matches()) {
            return null;
        }
        String name = matcher.group(2).trim();
        if (name.isEmpty()) {
            getLogger().warning("Found empty category name. Ignored");
            return null;
        }
        return name;
    }

    /**
     * Joins the given strings with {@code |} so the result can be used as a regular-expression
     * alternation. The elements are inserted verbatim (no regex quoting).
     * <p>
     * A separator is only written once the result is non-empty, so empty elements at the start
     * do not produce a leading {@code |}.
     *
     * @param set the alternatives, joined in the set's iteration order
     * @return the joined alternatives; the empty string for an empty set
     */
    private static String getOrPattern(Set<String> set) {
        StringBuilder alternation = new StringBuilder();
        for (String alternative : set) {
            if (alternation.length() > 0) {
                alternation.append('|');
            }
            alternation.append(alternative);
        }
        return alternation.toString();
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Tells whether the given title is a category entry, i.e. whether
     * {@code extractNameFromWikipediaCategoryEntry(str)} yields a name for it.
     *
     * @param str the title to inspect
     * @return {@code true} if a non-empty category name can be extracted from the title
     */
    public static boolean isTitleRepresentingAWikipediaCategory(String str) {
        String categoryName = extractNameFromWikipediaCategoryEntry(str);
        return categoryName != null;
    }

    /**
     * Returns the namespace prefixes that mark a category entry: {@code "Kategorie"} and
     * {@code "Category"}.
     *
     * @return a new, mutable set containing the prefixes; a fresh instance on every call
     */
    private static Set<String> getCategoryPrefixes() {
        Set<String> prefixes = new HashSet<>();
        for (String prefix : new String[] {"Kategorie", "Category"}) {
            prefixes.add(prefix);
        }
        return prefixes;
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Registers a category under the given parent and returns its URI.
     * <p>
     * Categories are tracked in {@code m_wikipediaCategories} by their lower-cased name. If the
     * name is already registered, a warning is logged and the existing URI is returned without
     * touching the index. Otherwise a new URI is created, a category entry carrying the name and
     * the parent URI is queued for insertion via {@code m_indexManagerQueue}, and the new URI is
     * remembered.
     *
     * @param str the category name as it should be stored (original case)
     * @param uri the URI of the parent the category is attached to
     * @param z   flag handed through unchanged to {@code AllTypesMultiValueMap.createForCategory}
     * @return the URI of the existing or newly created category
     */
    public URI addWikipediaToplevelCategory(String str, URI uri, boolean z) {
        String key = str.toLowerCase();
        if (this.m_wikipediaCategories.containsKey(key)) {
            getLogger().warning("Key '" + key + "' exists more than once: first occurency wins!");
            return this.m_wikipediaCategories.get(key);
        }

        URI categoryUri = this.m_importerHelper.createURI();
        AllTypesMultiValueMap categoryEntry = AllTypesMultiValueMap.createForCategory(categoryUri, AllTypesMultiValueMap.EMPTY_COLLECTION_OF_PAIRS.iterator(), z);
        categoryEntry.add(AttributeURIs.CATEGORY_NAME, str);
        categoryEntry.add(StringConstants.INDEX_TYPE_STRUCTURE, AttributeURIs.PARENT_URI, uri);
        getLogger().finer("Will create new Wikipedia category '" + str + "' in index");
        this.m_indexManagerQueue.insert(categoryEntry);
        this.m_wikipediaCategories.put(key, categoryUri);
        return categoryUri;
    }

    /* JADX INFO: Access modifiers changed from: private */
    /**
     * Looks up the URI registered for a category name. The name is trimmed and lower-cased
     * before the lookup in {@code m_wikipediaCategories}.
     *
     * @param str the category name
     * @return whatever the map holds for the normalised name
     */
    public URI getUriForWikipediaCategory(String str) {
        String key = str.trim().toLowerCase();
        return this.m_wikipediaCategories.get(key);
    }

    /**
     * Returns the import capabilities held in {@code m_importCapabilities}.
     *
     * @return the map exactly as stored in the field (no copy is made)
     */
    @Override // de.dfki.catwiesel.synchronizer.importer.Importer
    public Map<String, ImportConfiguration> getImportCapabilities() {
        return m_importCapabilities;
    }

    /**
     * Returns the entry of {@code m_importCapabilities} stored under the given key.
     *
     * @param str the key to look up
     * @return the configuration stored under the key, as returned by the map lookup
     */
    @Override // de.dfki.catwiesel.synchronizer.importer.Importer
    public ImportConfiguration getImportCapabilities(String str) {
        Map<String, ImportConfiguration> capabilities = this.m_importCapabilities;
        return capabilities.get(str);
    }

    /**
     * Returns the logger held in {@code m_logger}.
     *
     * @return the logger used by the methods of this class
     */
    public static Logger getLogger() {
        return m_logger;
    }

    /* JADX WARN: Multi-variable type inference failed */
    /* JADX WARN: Type inference failed for: r0v1, types: [java.util.HashMap<java.lang.Thread, java.lang.Boolean>] */
    /* JADX WARN: Type inference failed for: r0v2, types: [java.lang.Throwable] */
    /* JADX WARN: Type inference failed for: r0v6 */
    @Override // de.dfki.catwiesel.synchronizer.importer.Importer
    public void stopImport(Thread thread) {
        ?? r0 = this.m_stopRequests;
        synchronized (r0) {
            this.m_stopRequests.put(thread, Boolean.TRUE);
            r0 = r0;
        }
    }

    /* JADX WARN: Multi-variable type inference failed */
    /* JADX WARN: Type inference failed for: r0v1, types: [java.util.HashMap<java.lang.Thread, java.lang.Boolean>] */
    /* JADX WARN: Type inference failed for: r0v2, types: [java.lang.Throwable] */
    /* JADX WARN: Type inference failed for: r0v6 */
    @Override // de.dfki.catwiesel.synchronizer.importer.Importer
    public void reset(Thread thread) {
        ?? r0 = this.m_stopRequests;
        synchronized (r0) {
            this.m_stopRequests.remove(thread);
            r0 = r0;
        }
    }
}
