package de.dfki.leech.earlyTrendRadar;

import au.com.bytecode.opencsv.CSVReader;
import de.dfki.inquisitor.collections.CollectionUtilz;
import de.dfki.km.leech.Leech;
import de.dfki.km.leech.SubDataEntityContentHandler;
import de.dfki.km.leech.sax.DataSinkContentHandlerAdapter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashSet;
import java.util.Locale;
import java.util.Set;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.zip.GZIPInputStream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/* loaded from: input_file:de/dfki/leech/earlyTrendRadar/BrandwatchCsvParser.class */
public class BrandwatchCsvParser extends AbstractParser {
    private static final long serialVersionUID = -1380648858247346911L;
    protected static Set<String> sBodyAttNames = CollectionUtilz.createHashSet(new String[]{"fulltext", "body", "text", "plaintext", "dcdescription"});
    protected static Set<String> sCopy2ModifiedAttNames = CollectionUtilz.createHashSet(new String[]{"date", "dcdate"});
    protected static Set<String> sCopy2AuthorAttNames = CollectionUtilz.createHashSet(new String[]{"dcperson", "dccreator", "dccontributor"});
    protected static Set<String> sCopy2TitleAttNames = CollectionUtilz.createHashSet(new String[]{"dctitle"});
    protected static Set<String> sCopy2ContentTypeAttNames = CollectionUtilz.createHashSet(new String[]{"dccontenttype"});
    protected static Set<String> sCopy2LanguageAttNames = CollectionUtilz.createHashSet(new String[]{"dclanguage"});
    private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(new HashSet(Arrays.asList(MediaType.application("vnd.etr.brandwatch_csv"), MediaType.application("vnd.etr.brandwatch_csv_zip"), MediaType.application("vnd.etr.brandwatch_csv_gzip"))));

    public static void main(String[] strArr) throws Exception {
        new Leech().parse("/home/reuschling/mnt/serv-4101/EarlyTrendRadar/brandwatch/current", new DataSinkContentHandlerAdapter() { // from class: de.dfki.leech.earlyTrendRadar.BrandwatchCsvParser.1
            public void processErrorData(Metadata metadata) {
            }

            public void processModifiedData(Metadata metadata, String str) {
            }

            public void processNewData(Metadata metadata, String str) {
                System.out.println(metadata);
            }

            public void processRemovedData(Metadata metadata) {
            }
        }, new ParseContext());
    }

    public Set<MediaType> getSupportedTypes(ParseContext parseContext) {
        return SUPPORTED_TYPES;
    }

    public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
        if (metadata.get("source").endsWith(".gz")) {
            inputStream = new GZIPInputStream(inputStream);
        }
        if (metadata.get("source").endsWith(".zip")) {
            inputStream = new ZipInputStream(inputStream);
        }
        String str = metadata.get("source");
        String str2 = metadata.get("Content-Type");
        try {
            CSVReader cSVReader = new CSVReader(new InputStreamReader(inputStream, Charset.forName("UTF-8")));
            try {
                String[] readNext = cSVReader.readNext();
                if (readNext == null) {
                    cSVReader.close();
                    return;
                }
                String[] strArr = (String[]) readNext.clone();
                int i = -1;
                while (true) {
                    String[] readNext2 = cSVReader.readNext();
                    if (readNext2 == null) {
                        cSVReader.close();
                        return;
                    }
                    i++;
                    if (readNext2.length >= 1) {
                        for (String str3 : metadata.names()) {
                            metadata.remove(str3);
                        }
                        String str4 = "";
                        for (int i2 = 0; i2 < readNext2.length && i2 < strArr.length; i2++) {
                            String str5 = strArr[i2];
                            String str6 = readNext2[i2];
                            if (sBodyAttNames.contains(str5.toLowerCase())) {
                                str4 = str6;
                            } else {
                                if (sCopy2ModifiedAttNames.contains(str5)) {
                                    metadata.add("modified", str6);
                                }
                                if ("domain".equals(str5)) {
                                    metadata.add("title", str6 + "#" + i);
                                }
                                metadata.add(str5, str6);
                            }
                        }
                        metadata.add("Content-Type", str2);
                        metadata.add("source", str + " #" + i);
                        new SubDataEntityContentHandler(contentHandler, metadata, str4).triggerSubDataEntityHandling();
                    }
                }
            } finally {
            }
        } catch (Exception e) {
            Logger.getLogger(BrandwatchCsvParser.class.getName()).log(Level.SEVERE, "Error", (Throwable) e);
        }
    }
}
