package de.dfki.sds.lodex.util;

import au.com.bytecode.opencsv.CSVReader;
import de.dfki.inquisitor.file.FileUtilz;
import de.dfki.inquisitor.processes.StopWatch;
import de.dfki.inquisitor.text.StringUtils;
import de.dfki.sds.lodex.EntityExplained;
import de.dfki.sds.lodex.GlobalConstants;
import de.dfki.sds.lodex.NamedEntityLinker;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URLDecoder;
import java.nio.charset.StandardCharsets;
import java.util.zip.GZIPInputStream;
import org.mapdb.BTreeMap;
import org.mapdb.DB;
import org.mapdb.DBMaker;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:de/dfki/sds/lodex/util/NamedEntityDataCreator4DbPediaCsv.class */
/**
 * Command line tool and helper API that builds the named entity data directory from
 * semicolon-separated CSV exports (entity types, labels, abstracts and synonyms).
 *
 * <p>Each CSV file is expected to start with a header row, which is skipped, followed by rows
 * whose first column is the percent-encoded entity id and whose second column is the
 * percent-encoded value. Rows with fewer than two columns are ignored.
 *
 * <p>The entities are first collected in a temporary MapDB tree map on disk and afterwards
 * handed to a {@link NamedEntityLinker} for indexing.
 */
public class NamedEntityDataCreator4DbPediaCsv {

    /** Command line help text printed by {@link #main(String[])}. */
    private static final String USAGE =
            "NamedEntityDataCreator4DbPediaCsv -o <outputDirPath> -l <labelsFileName> -a <abstractsFileName> -t <typesFileName> -sy <synonymsFileName>\n"
            + " -et <entityTypes4HighFrqTermsFileName> -em <embeddingsConfFileName>\n"
            + "Options:\n"
            + "  -h/--help: this text\n"
            + "  -o <outputDirPath> the directory where to write the output data. In the case you specify a relative path, the final path will be 'GlobalConstants.strAppBasePath'\n"
            + "        plus the given, relative parameter path\n"
            + "  -l <labelsFileName> the file with the entity labels\n"
            + "  -a <abstractsFileName> the file with the entity abstracts\n"
            + "  -t <typesFileName> the file with the entity types\n"
            + "  -sy <synonymsFileName> the file with the synonyms\n"
            + "  -et <entityTypes4HighFrqTermsFileName> the file with the entity types that indicates an entity in the case of an high frequent term. Types are defined with -t\n"
            + "  -em <embeddingsConfFileName> the path to the text embeddings configuration\n";

    /** A progress message is logged every time this many rows/entities have been processed. */
    private static final int PROGRESS_INTERVAL = 10000;

    /** The per-entity values that are merged into entities already known from the types file. */
    private enum Attribute {
        LABEL("entity labels"),
        ABSTRACT("entity abstracts"),
        SYNONYM("synonyms");

        /** Noun used in the progress log messages of this attribute. */
        private final String logName;

        Attribute(String logName) {
            this.logName = logName;
        }
    }

    /**
     * Builds the named entity data below {@code outputDirPath} from already opened CSV readers.
     *
     * <p>The types file defines which entities exist; labels, abstracts and synonyms are only
     * attached to entities that appeared in the types file. Entities whose label or id is blank
     * after loading are not written to the index.
     *
     * <p>This method takes ownership of the four readers: each one is closed once it has been
     * consumed, and all of them are closed when the method exits, also on failure. The temporary
     * MapDB store and the {@link NamedEntityLinker} are likewise closed on every exit path.
     *
     * @param outputDirPath directory that receives the generated data; created if missing
     * @param labelsReader CSV with entity id and label
     * @param abstractsReader CSV with entity id and abstract (stored as {@code textTrigger})
     * @param typesReader CSV with entity id and entity type
     * @param synonymsReader CSV with entity id and synonym
     * @param entityTypes4HighFrqTerms content written to {@code entityTypes4HighFrqTerms.txt}
     * @param embeddingsConf content written to {@code embeddings.conf}
     * @throws Exception if reading the input, writing the output or indexing fails
     */
    public static void createNamedEntityDB(String outputDirPath, Reader labelsReader, Reader abstractsReader, Reader typesReader, Reader synonymsReader, String entityTypes4HighFrqTerms, String embeddingsConf) throws Exception {
        long startTime = System.currentTimeMillis();
        // Safety net: whatever happens below, no reader stays open. Closing an already closed
        // reader is a no-op, so the earlier per-phase close calls do not conflict with this.
        try (labelsReader; abstractsReader; typesReader; synonymsReader) {
            info("Named entity data path: " + outputDirPath);
            info("copy file contents to disk");
            new File(outputDirPath).mkdirs();
            FileUtilz.string2File(entityTypes4HighFrqTerms, outputDirPath + "/entityTypes4HighFrqTerms.txt");
            FileUtilz.string2File(embeddingsConf, outputDirPath + "/embeddings.conf");

            NamedEntityLinker linker = new NamedEntityLinker(outputDirPath);
            try {
                DB tempDb = DBMaker.newFileDB(new File(outputDirPath + "/tempMapDbRemoveMeIfYouSeeMe"))
                        .closeOnJvmShutdown()
                        .asyncWriteEnable()
                        .deleteFilesAfterClose()
                        .transactionDisable()
                        .mmapFileEnableIfSupported()
                        .make();
                try {
                    BTreeMap<String, EntityExplained> entities = tempDb.getTreeMap("id2entity");

                    info("\nstart loading named entity types");
                    loadTypes(entities, typesReader);
                    info("\nstart loading named entity labels");
                    loadAttribute(entities, labelsReader, Attribute.LABEL);
                    info("\nstart loading named entity abstracts");
                    loadAttribute(entities, abstractsReader, Attribute.ABSTRACT);
                    info("\nstart loading synonyms");
                    loadAttribute(entities, synonymsReader, Attribute.SYNONYM);

                    info("start writing");
                    writeIndex(linker, entities);
                } finally {
                    // The store is configured with deleteFilesAfterClose, so closing it here
                    // also removes the temporary files when the build fails half-way.
                    tempDb.close();
                }
            } finally {
                linker.close();
            }
        }
        StopWatch.stopAndLogDistance(startTime, NamedEntityDataCreator4DbPediaCsv.class);
        info("finished");
    }

    /**
     * Builds the named entity data below {@code outputDirPath} from CSV files on disk.
     *
     * <p>Files whose name ends with {@code .gz} are read through a {@link GZIPInputStream}; all
     * files are decoded as UTF-8. If any argument is {@code null}, an error is printed to
     * {@code System.err} and the method returns without doing anything.
     *
     * @param outputDirPath directory that receives the generated data
     * @param labelsFile path of the labels CSV
     * @param abstractsFile path of the abstracts CSV
     * @param typesFile path of the types CSV
     * @param synonymsFile path of the synonyms CSV
     * @param entityTypesFile path of the file copied to {@code entityTypes4HighFrqTerms.txt}
     * @param embeddingsConfFile path of the file copied to {@code embeddings.conf}
     * @throws Exception if a file cannot be read or the build fails
     */
    public static void createNamedEntityDB(String outputDirPath, String labelsFile, String abstractsFile, String typesFile, String synonymsFile, String entityTypesFile, String embeddingsConfFile) throws Exception {
        if (labelsFile == null || abstractsFile == null || typesFile == null || synonymsFile == null || entityTypesFile == null || embeddingsConfFile == null) {
            System.err.println("Error: You have to specify all input files");
            return;
        }
        if (outputDirPath == null) {
            System.err.println("Error: You have to specify the output directory");
            return;
        }
        info("Output file path: \n" + outputDirPath);
        info("Input file paths:\n" + labelsFile + "\n" + abstractsFile + "\n" + typesFile + "\n" + synonymsFile + "\n" + entityTypesFile + "\n" + embeddingsConfFile + "\n");
        info("Current working path: " + new File(".").getAbsolutePath());

        // Read the small configuration files before any stream is opened, so that a failure
        // here cannot leave CSV streams behind.
        String entityTypesConfig = FileUtilz.file2String(entityTypesFile);
        String embeddingsConfig = FileUtilz.file2String(embeddingsConfFile);

        try (Reader labels = openReader(labelsFile);
                Reader abstracts = openReader(abstractsFile);
                Reader types = openReader(typesFile);
                Reader synonyms = openReader(synonymsFile)) {
            createNamedEntityDB(outputDirPath, labels, abstracts, types, synonyms, entityTypesConfig, embeddingsConfig);
        }
    }

    /**
     * Decodes a percent-encoded CSV value.
     *
     * <p>Every occurrence of {@code %FF} is first replaced by a plain {@code %}; the result is
     * then URL-decoded as UTF-8 (which also turns {@code +} into a space). {@code null} and
     * whitespace-only input is returned unchanged.
     *
     * @param str the encoded value, may be {@code null}
     * @return the decoded value, or {@code str} itself if it is {@code null} or blank
     * @throws IllegalArgumentException if the value contains a malformed percent escape
     */
    public static String decode(String str) {
        if (StringUtils.nullOrWhitespace(str)) {
            return str;
        }
        return URLDecoder.decode(str.replace("%FF", "%"), StandardCharsets.UTF_8);
    }

    /**
     * Decides whether a synonym should be kept for an entity with the given label.
     *
     * <p>A synonym is only scrutinised when the label contains a space, the synonym contains
     * none, and the synonym is at most 40% as long as the label. Such a short single-token
     * synonym is accepted only if it has at least three characters and one of the characters
     * at index 1 to 4 is upper case or a dot. Every other synonym is accepted.
     *
     * @param label the entity label; {@code null} is treated like an empty label (accept)
     * @param synonym the synonym candidate; {@code null} is rejected
     * @return {@code true} if the synonym should be added to the entity
     */
    protected static boolean isNiceSynonym(String label, String synonym) {
        if (synonym == null) {
            return false;
        }
        if (label == null) {
            // Entities may have types but no label; there is nothing to compare against then.
            return true;
        }
        boolean labelHasSpace = label.contains(" ");
        boolean synonymHasSpace = synonym.contains(" ");
        boolean muchShorter = ((float) synonym.length()) / ((float) label.length()) <= 0.4f;
        if (!labelHasSpace || synonymHasSpace || !muchShorter) {
            return true;
        }
        if (synonym.length() < 3) {
            return false;
        }
        // Deliberately starts at index 1: the first character is not taken into account.
        for (int i = 1; i < 5 && i < synonym.length(); i++) {
            char c = synonym.charAt(i);
            if (Character.isUpperCase(c) || c == '.') {
                return true;
            }
        }
        return false;
    }

    /**
     * Command line entry point; see the usage text for the supported options.
     *
     * <p>Arguments that are not a known option are ignored. A relative {@code -o} path is
     * resolved against {@code GlobalConstants.strAppBasePath}.
     *
     * @param strArr the command line arguments
     * @throws Exception if building the named entity data fails
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length < 1 || strArr.length > 14) {
            System.out.println("invalid count of parameters: " + strArr.length);
            System.out.println(USAGE);
            return;
        }
        String outputDirPath = null;
        String labelsFile = null;
        String abstractsFile = null;
        String typesFile = null;
        String synonymsFile = null;
        String entityTypesFile = null;
        String embeddingsConfFile = null;

        for (int i = 0; i < strArr.length; i++) {
            String option = strArr[i];
            if ("-h".equals(option) || "--help".equals(option)) {
                System.out.println(USAGE);
                return;
            }
            if (!isValueOption(option)) {
                // Unknown argument: skip it. The loop header still advances the index, so this
                // can no longer spin forever on the same element.
                continue;
            }
            if (i + 1 >= strArr.length) {
                String expected = ("-l".equals(option) || "-a".equals(option)) ? "file name" : "path";
                System.out.println("you must specify a " + expected + " for parameter '" + option + "'\n" + USAGE);
                return;
            }
            // Consume the option's value so it is not parsed as an option itself.
            String value = strArr[++i];
            switch (option) {
                case "-o":
                    outputDirPath = new File(value).isAbsolute() ? value : GlobalConstants.strAppBasePath + "/" + value;
                    break;
                case "-l":
                    labelsFile = value;
                    break;
                case "-a":
                    abstractsFile = value;
                    break;
                case "-t":
                    typesFile = value;
                    break;
                case "-sy":
                    synonymsFile = value;
                    break;
                case "-et":
                    entityTypesFile = value;
                    break;
                case "-em":
                    embeddingsConfFile = value;
                    break;
                default:
                    throw new IllegalStateException("unhandled option " + option);
            }
        }
        createNamedEntityDB(outputDirPath, labelsFile, abstractsFile, typesFile, synonymsFile, entityTypesFile, embeddingsConfFile);
    }

    /** Returns whether the argument is one of the options that must be followed by a value. */
    private static boolean isValueOption(String argument) {
        switch (argument) {
            case "-o":
            case "-l":
            case "-a":
            case "-t":
            case "-sy":
            case "-et":
            case "-em":
                return true;
            default:
                return false;
        }
    }

    /** Logs an info message with this class' logger. */
    private static void info(String message) {
        LoggerFactory.getLogger(NamedEntityDataCreator4DbPediaCsv.class).info(message);
    }

    /** Opens a UTF-8 reader on the file, transparently unzipping files named {@code *.gz}. */
    private static Reader openReader(String path) throws Exception {
        FileInputStream fileStream = new FileInputStream(path);
        try {
            if (path.endsWith(".gz")) {
                return new InputStreamReader(new GZIPInputStream(fileStream), StandardCharsets.UTF_8);
            }
            return new InputStreamReader(fileStream, StandardCharsets.UTF_8);
        } catch (Exception e) {
            // E.g. a corrupt gzip header: do not leave the raw file stream open.
            try {
                fileStream.close();
            } catch (Exception closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }

    /** Reads the types CSV, creating an entity for every id and collecting its types. */
    private static void loadTypes(BTreeMap<String, EntityExplained> entities, Reader typesReader) throws Exception {
        CSVReader csv = new CSVReader(typesReader, ';');
        try {
            csv.readNext(); // skip the header row
            int loaded = 0;
            String[] row;
            while ((row = csv.readNext()) != null) {
                if (row.length < 2) {
                    continue;
                }
                String id = decode(row[0]);
                String type = decode(row[1]);
                EntityExplained known = entities.get(id);
                // NOTE(review): unlike the other phases, an already stored entity is copied
                // here before it is modified; kept as found, reason not visible in this file.
                EntityExplained entity = known == null ? new EntityExplained(id) : new EntityExplained(known);
                entity.types.add(type);
                entities.put(id, entity);
                loaded++;
                if (loaded % PROGRESS_INTERVAL == 0) {
                    info(typesProgress(loaded, entities.size()));
                }
            }
            info(typesProgress(loaded, entities.size()));
        } finally {
            csv.close();
        }
    }

    /** Formats the progress message of the types phase. */
    private static String typesProgress(int typeCount, int entityCount) {
        return "loaded " + StringUtils.beautifyNumber(Integer.valueOf(typeCount)) + " entity types for " + StringUtils.beautifyNumber(Integer.valueOf(entityCount)) + " entities";
    }

    /** Reads an id/value CSV and merges the value into the entities that already exist. */
    private static void loadAttribute(BTreeMap<String, EntityExplained> entities, Reader reader, Attribute attribute) throws Exception {
        CSVReader csv = new CSVReader(reader, ';');
        try {
            csv.readNext(); // skip the header row
            int loaded = 0;
            String[] row;
            while ((row = csv.readNext()) != null) {
                if (row.length < 2) {
                    continue;
                }
                String id = decode(row[0]);
                String value = decode(row[1]);
                EntityExplained entity = entities.get(id);
                if (entity == null) {
                    // Only entities introduced by the types file are enriched.
                    continue;
                }
                switch (attribute) {
                    case LABEL:
                        entity.label = value;
                        break;
                    case ABSTRACT:
                        entity.textTrigger = value;
                        break;
                    case SYNONYM:
                        if (isNiceSynonym(entity.label, value)) {
                            entity.synonyms.add(value);
                        }
                        break;
                    default:
                        throw new IllegalStateException("unhandled attribute " + attribute);
                }
                // Write the modified entity back so the change reaches the store.
                entities.put(id, entity);
                loaded++;
                if (loaded % PROGRESS_INTERVAL == 0) {
                    info("loaded " + StringUtils.beautifyNumber(Integer.valueOf(loaded)) + " " + attribute.logName);
                }
            }
            info("loaded " + StringUtils.beautifyNumber(Integer.valueOf(loaded)) + " " + attribute.logName);
        } finally {
            csv.close();
        }
    }

    /** Adds every entity that has a non-blank label and id to the linker's index. */
    private static void writeIndex(NamedEntityLinker linker, BTreeMap<String, EntityExplained> entities) throws Exception {
        linker.startWriting(true);
        int added = 0;
        for (EntityExplained entity : entities.values()) {
            if (StringUtils.nullOrWhitespace(entity.label) || StringUtils.nullOrWhitespace(entity.id)) {
                continue;
            }
            linker.addEntity2Index(entity);
            added++;
            if (added % PROGRESS_INTERVAL == 0) {
                info("added " + StringUtils.beautifyNumber(Integer.valueOf(added)) + " entities");
            }
        }
        info("added " + StringUtils.beautifyNumber(Integer.valueOf(added)) + " entities");
        linker.stopWriting();
    }
}
