package de.dfki.leech.earlyTrendRadar;

import au.com.bytecode.opencsv.CSVReader;
import de.dfki.inquisitor.collections.CollectionUtilz;
import de.dfki.inquisitor.collections.MultiValueHashMap;
import de.dfki.inquisitor.text.DateUtils;
import de.dfki.inquisitor.text.StringUtils;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.sax.PrintlnContentHandler;
import de.dfki.leech.AbstractJsonParser;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.Paths;
import java.text.ParsePosition;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;

/* loaded from: input_file:de/dfki/leech/earlyTrendRadar/BaseJsonParser.class */
/**
 * JSON parser for documents found under {@code $.response.docs} of media types
 * {@code application/vnd.etr.base} and {@code application/vnd.etr.base_gzip}.
 *
 * <p>On top of the generic path-to-attribute mapping supplied to {@link AbstractJsonParser},
 * {@link #handleMetadata} post-processes every extracted entity:
 * <ul>
 *   <li>the free-text {@code modified} value is parsed against a list of known date layouts and
 *       replaced by its numeric representation,</li>
 *   <li>old continent codes are replaced by their new counterparts,</li>
 *   <li>over-long {@code snippet} values are truncated,</li>
 *   <li>{@code globalSource} values are normalised,</li>
 *   <li>{@code steep}, {@code topic1} and {@code queryName} attributes are derived from the
 *       Dewey code attributes using the mappings loaded from {@value #MAPPING_CSV}.</li>
 * </ul>
 *
 * <p>The counters are plain {@code int} fields without synchronisation.
 */
public class BaseJsonParser extends AbstractJsonParser {
    private static final long serialVersionUID = 2872726190507335529L;
    private static final Logger LOG = Logger.getLogger(BaseJsonParser.class.getName());
    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(MediaType.application("vnd.etr.base_gzip"), MediaType.application("vnd.etr.base"))));

    /** Semicolon-separated mapping file, resolved against the working directory. */
    private static final String MAPPING_CSV = "./BaseMappings_DFKI_conv.csv";

    /** Source parsed by {@link #main} when no command line argument is given. */
    private static final String DEFAULT_MAIN_SOURCE = "/home/reuschling/mnt/serv-4101/serv-4101a/EarlyTrendRadar/base/2015/all-2015.990000-999999.base.json.gz";

    /** Number of columns read from every row of {@link #MAPPING_CSV}. */
    private static final int MAPPING_COLUMNS = 6;

    /** Snippets longer than this are cut to this length and suffixed with {@code "..."}. */
    private static final int SNIPPET_MAX_LENGTH = 347;

    /** Dates at least this many days away (as reported by {@code DateUtils.daysBetween}) count as parse errors. */
    private static final int MAX_DAYS_FROM_NOW = 182500;

    /**
     * Polish month names (nominative and genitive) and their English replacement, applied in
     * this order to the lower-cased date string. Longer forms precede their prefixes
     * ({@code "maja"} before {@code "maj"}), so the order must be kept.
     */
    private static final String[][] POLISH_MONTHS = {
        {"styczeń", "January"}, {"stycznia", "January"},
        {"luty", "February"}, {"lutego", "February"},
        {"marzec", "March"}, {"marca", "March"},
        {"kwiecień", "April"}, {"kwietnia", "April"},
        {"maja", "May"}, {"maj", "May"},
        {"czerwiec", "June"}, {"czerwca", "June"},
        {"lipiec", "July"}, {"lipca", "July"},
        {"sierpień", "August"}, {"sierpnia", "August"},
        {"wrzesień", "September"}, {"września", "September"},
        {"października", "October"}, {"październik", "October"},
        {"listopada", "November"}, {"listopad", "November"},
        {"grudzień", "December"}, {"grudnia", "December"}
    };

    /** Layouts carrying more than a year; tried first, in this order. */
    private static final String[] FULL_DATE_PATTERNS = {"d MMM yyyy", "dd MMM. yyyy", "dd-MMM-yyyy", "dd-MM-yyyy", "dd/MM/yyyy", "dd MMM', 'yyyy", "E, MMM dd, yyyy", "MMM d. yyyy", "MMM, dd, yyyy", "MMM d, yyyy", "MMM d,yyyy", "MMM d yyyy", "d'th' MMM, yyyy", "d'th' MMM yyyy", "MM-dd-yyyy", "MM/dd/yyyy", "MMM dd, yyyy", "dd.MM.yyyy", "d.MM.yyyy", "MMM. dd, yyyy", "'info:eu-repo/date/embargoend/'yyyy-MM-dd", "MM.yyyy", "MM-yyyy", "yyyy -MM-dd", "yyyy-MM-dd", "yyyy--MM-dd", "yyyy-MM", "yyyy/MM/dd", "yyyy-MMM-dd", "yyyy -MM- dd", "yyyy MMM dd", "yyyy-dd-MM", "yyyy/dd/MM", "'dateofgranted:'yyyy-MM-dd", "EEE, dd MMM yyyy HH:mm:ss", "MMM, yyyy", "'['yyyy-MM']'", "MM/yyyy", "MMM yyyy", "yyyy.MM", "yyyyMMdd", "yyyyMMddHHmm", "yyyyMM", "yyyy/MM", "yyyy-MMM"};

    /** Layouts carrying only a year; tried when no full layout matched, in this order. */
    private static final String[] YEAR_ONLY_PATTERNS = {"yyyy,' nr '", "yyyy '(wyd. oryg. i cyfrowe)'", "yyyy '(wyd. cyfrowe)'", "yyyy '(wyd. oryg.)'", "yyyy'-00-00'", "'['yyyy']'", "yyyy']'", "'['yyyy", "yyyy' (in press)'", "yyyy '(modified)'", "yyyy '(created)'", "yyyy';'", "'©'yyyy", "'cop. 'yyyy", "'dl 'yyyy", "'c' yyyy", "yyyy'].'", "yyyy'-'", "'spg' yyyy", "'fall' yyyy", "'summer' yyyy", "'winter' yyyy", "'spring' yyyy", "yyyy 'summer'", "yyyy 'fall'", "yyyy 'spring'", "yyyy 'winter'", "yyyy':summer'", "yyyy':fall'", "yyyy':spring'", "yyyy':winter'", "yyyy': summer'", "yyyy': fall'", "yyyy': spring'", "yyyy': winter'", "yyyy'/summer'", "yyyy'/fall'", "yyyy'/spring'", "yyyy'/winter'", "'summer, 'yyyy", "'fall, 'yyyy", "'spring, 'yyyy", "'winter, 'yyyy", "'c'yyyy", ".yyyy", "yyyy.", "yyyy"};

    protected static HashMap<String, String> hsContinentCodeOld2New = new HashMap<>();
    protected static MultiValueHashMap<String, String> hsDcdeweyhuns2steep = new MultiValueHashMap<>(HashSet.class);
    protected static MultiValueHashMap<String, String> hsDcdeweyhuns2topic1 = new MultiValueHashMap<>(HashSet.class);
    protected static MultiValueHashMap<String, String> hsDcdeweyhuns2queryName = new MultiValueHashMap<>(HashSet.class);
    protected static MultiValueHashMap<String, String> hsDcdeweytens2steep = new MultiValueHashMap<>(HashSet.class);
    protected static MultiValueHashMap<String, String> hsDcdeweytens2topic1 = new MultiValueHashMap<>(HashSet.class);
    protected static MultiValueHashMap<String, String> hsDcdeweytens2queryName = new MultiValueHashMap<>(HashSet.class);
    protected static MultiValueHashMap<String, String> hsDcdeweyones2steep = new MultiValueHashMap<>(HashSet.class);
    protected static MultiValueHashMap<String, String> hsDcdeweyones2topic1 = new MultiValueHashMap<>(HashSet.class);
    protected static MultiValueHashMap<String, String> hsDcdeweyones2queryName = new MultiValueHashMap<>(HashSet.class);
    protected static MultiValueHashMap<String, String> hsPath2AttName = new MultiValueHashMap<>();

    /** Number of entities passed to {@link #handleMetadata} so far. */
    protected int m_iEntities = 0;
    /** Entities whose date could not be parsed, or was rejected as implausible. */
    protected int m_iErrorDateEntities = 0;
    /** Entities that carried no {@code modified} value at all. */
    protected int m_iNoDateEntities = 0;
    /** Entities whose date matched only a year-only layout. */
    protected int m_iOnlyYearDateEntities = 0;

    static {
        hsPath2AttName.add("$.response.docs[*].dccreator[*]", "creator");
        hsPath2AttName.add("$.response.docs[*].dccreator[*]", "dccreator");
        hsPath2AttName.add("$.response.docs[*].dccontributor[*]", "dccontributor");
        hsPath2AttName.add("$.response.docs[*].dccontributor[*]", "creator");
        hsPath2AttName.add("$.response.docs[*].dccontenttype[*]", "Content-Type");
        hsPath2AttName.add("$.response.docs[*].dclanguage[*]", "language");
        hsPath2AttName.add("$.response.docs[*].dcdate", "modified");
        hsPath2AttName.add("$.response.docs[*].dcdate", "dataEntityContentFingerprint");
        hsPath2AttName.add("$.response.docs[*].dcpublisher[*]", "publisher");
        hsPath2AttName.add("$.response.docs[*].dctitle", "title");
        hsPath2AttName.add("$.response.docs[*].dcdocid", "id");
        hsPath2AttName.add("$.response.docs[*].dcdocid", "dataEntityId");
        hsPath2AttName.add("$.response.docs[*].dcdocid", "dcdocid");
        hsPath2AttName.add("$.response.docs[*].dccountry", "origin");
        hsPath2AttName.add("$.response.docs[*].dccountry", "dccountry");
        hsPath2AttName.add("$.response.docs[*].dccontinent", "continentCode");
        hsPath2AttName.add("$.response.docs[*].dccontinent", "dccontinent");
        hsPath2AttName.add("$.response.docs[*].dclink", "sourceUrl");
        hsPath2AttName.add("$.response.docs[*].dclink", "dclink");
        hsPath2AttName.add("$.response.docs[*].dctype[*]", "globalSource");
        hsPath2AttName.add("$.response.docs[*].dctype[*]", "dctype");
        hsPath2AttName.add("$.response.docs[*].dcdescription", "body");
        hsPath2AttName.add("$.response.docs[*].dcdescription", "snippet");
        hsContinentCodeOld2New.put("caf", "af");
        hsContinentCodeOld2New.put("can", "an");
        hsContinentCodeOld2New.put("cas", "as");
        hsContinentCodeOld2New.put("cau", "au");
        hsContinentCodeOld2New.put("ceu", "eu");
        hsContinentCodeOld2New.put("cna", "na");
        hsContinentCodeOld2New.put("csa", "sa");
        // Kept in a separate method: a static initializer must not contain a return statement.
        loadDeweyMappings();
    }

    public BaseJsonParser() {
        this.ignoreHistory = true;
    }

    /**
     * Debug entry point: parses one source and prints the {@code dctype} and
     * {@code globalSource} values of every entity that has more than two {@code dctype} values.
     *
     * @param strArr optional; the first element is the source to parse, otherwise
     *               {@link #DEFAULT_MAIN_SOURCE} is used
     * @throws Exception whatever {@code Leech.parse} throws
     */
    public static void main(String[] strArr) throws Exception {
        String source = (strArr != null && strArr.length > 0) ? strArr[0] : DEFAULT_MAIN_SOURCE;
        new Leech().parse(source, new PrintlnContentHandler(PrintlnContentHandler.Verbosity.all) {
            // NOTE(review): never set to true anywhere, so it currently has no effect on the filter below.
            boolean m_bFirst = false;

            public void processNewData(Metadata metadata, String str) {
                if (this.m_bFirst || metadata.getValues("dctype").length <= 2) {
                    return;
                }
                System.err.println(CollectionUtilz.createLinkedList(metadata.getValues("dctype")));
                System.err.println(CollectionUtilz.createLinkedList(metadata.getValues("globalSource")));
                System.err.println("################");
            }
        }, new ParseContext());
    }

    @Override // de.dfki.leech.AbstractJsonParser
    public MultiValueHashMap<String, String> getPath2AttNameMappings() {
        return hsPath2AttName;
    }

    @Override // de.dfki.leech.AbstractJsonParser
    public String getPath2JsonObjectOrArray2Extract() {
        return "$.response.docs";
    }

    @Override // de.dfki.leech.AbstractJsonParser
    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return SUPPORTED_TYPES;
    }

    /**
     * Post-processes the attributes of one extracted entity in place and then delegates to the
     * superclass implementation.
     *
     * @param multiValueHashMap attribute name to values of the current entity; modified in place
     * @param contentHandler    passed through to the superclass
     * @param metadata          passed through to the superclass
     * @param parseContext      passed through to the superclass
     * @throws Exception whatever the superclass implementation throws
     */
    @Override // de.dfki.leech.AbstractJsonParser
    public void handleMetadata(MultiValueHashMap<String, String> multiValueHashMap, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws Exception {
        this.m_iEntities++;
        if (this.m_iEntities % 1000000 == 0) {
            LOG.info("processed entities: " + StringUtils.beautifyNumber(Integer.valueOf(this.m_iEntities)) + ", no date entities: " + this.m_iNoDateEntities + ", error date entities: " + this.m_iErrorDateEntities + ", only year entities: " + StringUtils.beautifyNumber(Integer.valueOf(this.m_iOnlyYearDateEntities)));
        }

        normalizeModifiedDate(multiValueHashMap);
        remapContinentCode(multiValueHashMap);
        truncateSnippet(multiValueHashMap);
        normalizeGlobalSources(multiValueHashMap);

        addDeweyDerivedAttributes(multiValueHashMap, "dcdeweyones", hsDcdeweyones2steep, hsDcdeweyones2topic1, hsDcdeweyones2queryName);
        addDeweyDerivedAttributes(multiValueHashMap, "dcdeweytens", hsDcdeweytens2steep, hsDcdeweytens2topic1, hsDcdeweytens2queryName);
        addDeweyDerivedAttributes(multiValueHashMap, "dcdeweyhuns", hsDcdeweyhuns2steep, hsDcdeweyhuns2topic1, hsDcdeweyhuns2queryName);

        super.handleMetadata(multiValueHashMap, contentHandler, metadata, parseContext);
    }

    /**
     * Replaces the first {@code modified} value by the numeric form of the parsed date and
     * updates the date counters. The attribute is removed without replacement when the value
     * cannot be parsed, is rejected as implausible, or matched only a year-only layout.
     */
    private void normalizeModifiedDate(MultiValueHashMap<String, String> attributes) {
        Collection<String> modifiedValues = attributes.get("modified");
        if (modifiedValues.isEmpty()) {
            this.m_iNoDateEntities++;
            return;
        }
        String rawDate = modifiedValues.iterator().next();
        if (rawDate == null) {
            return;
        }

        // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
        String dateText = translatePolishMonths(rawDate.toLowerCase(Locale.ROOT));

        Date parsed = null;
        for (String pattern : FULL_DATE_PATTERNS) {
            parsed = parseStrict(dateText, pattern);
            if (parsed != null) {
                break;
            }
        }

        boolean yearOnly = false;
        if (parsed == null) {
            for (String pattern : YEAR_ONLY_PATTERNS) {
                // parse() accepts a matching prefix, so the bare "yyyy" layout is only tried
                // when the text is exactly four characters long.
                if (pattern.equals("yyyy") && dateText.length() != 4) {
                    continue;
                }
                parsed = parseStrict(dateText, pattern);
                if (parsed != null) {
                    this.m_iOnlyYearDateEntities++;
                    yearOnly = true;
                    break;
                }
            }
        }

        attributes.remove("modified");
        if (parsed == null) {
            this.m_iErrorDateEntities++;
            return;
        }
        try {
            // NOTE(review): 182500 days is roughly 500 years; confirm the sign convention of
            // DateUtils.daysBetween to know which direction this actually bounds.
            if (DateUtils.daysBetween(new Date(), parsed) >= MAX_DAYS_FROM_NOW) {
                LOG.warning("date parse error:" + dateText + " => " + parsed);
                this.m_iErrorDateEntities++;
            } else if (!yearOnly) {
                // Year-only matches are counted above but deliberately not written back.
                attributes.add("modified", String.valueOf(DateUtils.date2Number(parsed)));
            }
        } catch (Exception e) {
            LOG.log(Level.WARNING, "date parse error:" + dateText + " => " + parsed, e);
            this.m_iErrorDateEntities++;
        }
    }

    /** Replaces Polish month names in an already lower-cased string by English ones. */
    private static String translatePolishMonths(String lowerCasedDate) {
        String translated = lowerCasedDate;
        for (String[] month : POLISH_MONTHS) {
            translated = translated.replace(month[0], month[1]);
        }
        return translated;
    }

    /**
     * Parses the beginning of {@code text} with a non-lenient English {@link SimpleDateFormat}.
     * A fresh formatter is created per call because {@code SimpleDateFormat} is not thread-safe.
     *
     * @return the parsed date, or {@code null} if {@code text} does not start with the pattern
     */
    private static Date parseStrict(String text, String pattern) {
        SimpleDateFormat format = new SimpleDateFormat(pattern, Locale.ENGLISH);
        format.setLenient(false);
        return format.parse(text, new ParsePosition(0));
    }

    /** Replaces the first {@code continentCode} value by its new code, if one is mapped. */
    private static void remapContinentCode(MultiValueHashMap<String, String> attributes) {
        Collection<String> codes = attributes.get("continentCode");
        if (codes.isEmpty()) {
            return;
        }
        String oldCode = codes.iterator().next();
        if (oldCode == null) {
            return;
        }
        String newCode = hsContinentCodeOld2New.get(oldCode);
        if (newCode != null) {
            attributes.remove("continentCode", oldCode);
            attributes.add("continentCode", newCode);
        }
    }

    /** Cuts the first {@code snippet} value to {@link #SNIPPET_MAX_LENGTH} characters plus {@code "..."}. */
    private static void truncateSnippet(MultiValueHashMap<String, String> attributes) {
        Collection<String> snippets = attributes.get("snippet");
        if (snippets.isEmpty()) {
            return;
        }
        String snippet = snippets.iterator().next();
        if (snippet != null && snippet.length() > SNIPPET_MAX_LENGTH) {
            attributes.remove("snippet", snippet);
            attributes.add("snippet", snippet.substring(0, SNIPPET_MAX_LENGTH) + "...");
        }
    }

    /**
     * Rewrites all {@code globalSource} values: keeps only the last path segment, strips
     * leading/trailing punctuation, splits camelCase with spaces, removes a leading
     * {@code doc-type:} prefix and lower-cases the result. Values that end up blank or that
     * consist only of whitespace and digits are dropped.
     */
    private static void normalizeGlobalSources(MultiValueHashMap<String, String> attributes) {
        // Copy first: the attribute is removed before the cleaned values are added back.
        LinkedList<String> rawSources = new LinkedList<String>(attributes.get("globalSource"));
        attributes.remove("globalSource");
        for (String rawSource : rawSources) {
            if (rawSource == null) {
                continue;
            }
            String cleaned = rawSource.replaceAll("/$", "").replaceAll(".*/", "").replaceAll("^[^\\w\\(\\[]+", "").replaceAll("[^\\w\\)\\]]+$", "").replaceAll("(?<=[a-z])([A-Z])", " $1").replaceAll("^doc-type:", "").toLowerCase(Locale.ROOT);
            // The class must be [\s\d]; a doubly escaped literal would instead match the
            // characters '\', 's' and 'd'.
            if (!StringUtils.nullOrWhitespace(cleaned) && !cleaned.matches("[\\s\\d]*")) {
                attributes.add("globalSource", cleaned);
            }
        }
    }

    /**
     * Adds the {@code steep}, {@code topic1} and {@code queryName} values mapped to every value
     * of {@code deweyAttribute}, without introducing duplicates.
     */
    private static void addDeweyDerivedAttributes(MultiValueHashMap<String, String> attributes, String deweyAttribute, MultiValueHashMap<String, String> code2steep, MultiValueHashMap<String, String> code2topic1, MultiValueHashMap<String, String> code2queryName) {
        for (String deweyCode : attributes.get(deweyAttribute)) {
            attributes.addAllNoDoubles("steep", code2steep.get(deweyCode));
            attributes.addAllNoDoubles("topic1", code2topic1.get(deweyCode));
            attributes.addAllNoDoubles("queryName", code2queryName.get(deweyCode));
        }
    }

    /**
     * Fills the Dewey lookup tables from {@link #MAPPING_CSV}. Columns 0-2 hold comma-separated
     * hundreds/tens/ones codes, columns 3-5 the steep, topic1 and queryName value they map to.
     * The first row is discarded (presumably a header line). A missing file only logs a
     * warning; any other failure is logged as severe and leaves the tables as filled so far.
     */
    private static void loadDeweyMappings() {
        if (!Files.exists(Paths.get(MAPPING_CSV))) {
            LOG.log(Level.WARNING, "No config file for Base json parser found");
            return;
        }
        // try-with-resources closes the reader on every exit path, including exceptions.
        try (CSVReader csvReader = new CSVReader(new InputStreamReader(new FileInputStream(MAPPING_CSV), StandardCharsets.UTF_8), ';')) {
            csvReader.readNext();
            String[] row;
            while ((row = csvReader.readNext()) != null) {
                if (row.length < MAPPING_COLUMNS) {
                    // Skip instead of aborting the whole load on one short or blank line.
                    LOG.warning("Skipping malformed row in " + MAPPING_CSV + " (expected " + MAPPING_COLUMNS + " columns, found " + row.length + ")");
                    continue;
                }
                String steep = nullToEmpty(row[3]);
                String topic1 = nullToEmpty(row[4]);
                String queryName = nullToEmpty(row[5]);
                registerDeweyCodes(nullToEmpty(row[0]), steep, topic1, queryName, hsDcdeweyhuns2steep, hsDcdeweyhuns2topic1, hsDcdeweyhuns2queryName);
                registerDeweyCodes(nullToEmpty(row[1]), steep, topic1, queryName, hsDcdeweytens2steep, hsDcdeweytens2topic1, hsDcdeweytens2queryName);
                registerDeweyCodes(nullToEmpty(row[2]), steep, topic1, queryName, hsDcdeweyones2steep, hsDcdeweyones2topic1, hsDcdeweyones2queryName);
            }
        } catch (Exception e) {
            LOG.log(Level.SEVERE, "Error", e);
        }
    }

    /**
     * Registers the three target values under every code of a comma-separated code list.
     * An empty list yields one empty-string code, because {@code "".split(...)} returns
     * a single empty element.
     */
    private static void registerDeweyCodes(String codeList, String steep, String topic1, String queryName, MultiValueHashMap<String, String> code2steep, MultiValueHashMap<String, String> code2topic1, MultiValueHashMap<String, String> code2queryName) {
        for (String code : codeList.split(",\\s*")) {
            code2steep.add(code, steep);
            code2topic1.add(code, topic1);
            code2queryName.add(code, queryName);
        }
    }

    /** Returns {@code value}, or the empty string when it is {@code null}. */
    private static String nullToEmpty(String value) {
        return value != null ? value : "";
    }
}
