package edu.washington.cs.knowitall.nlp;

import com.google.common.collect.Iterables;
import edu.washington.cs.knowitall.commonlib.FileUtils;
import edu.washington.cs.knowitall.extractor.ExtractorException;
import edu.washington.cs.knowitall.extractor.SentenceExtractor;
import edu.washington.cs.knowitall.util.DefaultObjects;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

/* loaded from: input_file:WEB-INF/lib/reverb-core-1.4.1.jar:edu/washington/cs/knowitall/nlp/ChunkedDocumentReader.class */
/**
 * Reads a document (from a string, stream, or file), splits it into sentences
 * with a {@link SentenceExtractor}, chunks every sentence with a
 * {@link SentenceChunker}, and bundles the results into a {@link ChunkedDocument}.
 */
public class ChunkedDocumentReader {
    /** Splits raw document text into sentence strings. */
    private final SentenceExtractor sentExtractor;

    /** Turns one sentence string into its chunked representation. */
    private final SentenceChunker sentChunker;

    /**
     * Creates a reader that uses the given extractor and chunker.
     *
     * @param sentenceExtractor splits document text into sentences
     * @param sentenceChunker chunks each extracted sentence
     * @throws IOException declared for compatibility with the other constructors;
     *         this constructor only stores its arguments
     */
    public ChunkedDocumentReader(SentenceExtractor sentenceExtractor, SentenceChunker sentenceChunker) throws IOException {
        this.sentExtractor = sentenceExtractor;
        this.sentChunker = sentenceChunker;
    }

    /**
     * Creates a reader that uses the given extractor and a new
     * {@link OpenNlpSentenceChunker}.
     *
     * @param sentenceExtractor splits document text into sentences
     * @throws IOException if constructing the default chunker fails
     */
    public ChunkedDocumentReader(SentenceExtractor sentenceExtractor) throws IOException {
        this(sentenceExtractor, new OpenNlpSentenceChunker());
    }

    /**
     * Creates a reader that uses the given chunker and the extractor returned by
     * {@link DefaultObjects#getDefaultHtmlSentenceExtractor()}.
     *
     * @param sentenceChunker chunks each extracted sentence
     * @throws IOException if obtaining the default extractor fails
     */
    public ChunkedDocumentReader(SentenceChunker sentenceChunker) throws IOException {
        this(DefaultObjects.getDefaultHtmlSentenceExtractor(), sentenceChunker);
    }

    /**
     * Creates a reader that uses the extractor returned by
     * {@link DefaultObjects#getDefaultHtmlSentenceExtractor()} and a new
     * {@link OpenNlpSentenceChunker}.
     *
     * @throws IOException if obtaining either default object fails
     */
    public ChunkedDocumentReader() throws IOException {
        this(DefaultObjects.getDefaultHtmlSentenceExtractor(), new OpenNlpSentenceChunker());
    }

    /** @return the sentence extractor this reader was constructed with */
    public SentenceExtractor getSentenceExtractor() {
        return this.sentExtractor;
    }

    /** @return the sentence chunker this reader was constructed with */
    public SentenceChunker getSentenceChunker() {
        return this.sentChunker;
    }

    /**
     * Reads the whole stream into memory as text and processes it as a document.
     * The stream is not closed here; it remains the caller's responsibility.
     *
     * @param inputStream source of the document's bytes
     * @param str identifier of the document, used in error messages and passed
     *        to the resulting {@link ChunkedDocument}
     * @return the chunked document
     * @throws ExtractorException if the stream cannot be read or a sentence
     *         cannot be chunked
     */
    public ChunkedDocument readDocument(InputStream inputStream, String str) throws ExtractorException {
        StringWriter buffer = new StringWriter();
        try {
            // NOTE(review): InputStreamReader without an explicit charset decodes with the
            // platform default; left unchanged to preserve behaviour for existing callers.
            FileUtils.pipe(new InputStreamReader(inputStream), buffer);
        } catch (IOException e) {
            throw new ExtractorException(String.format("Could not read document %s", str), e);
        }
        return readDocument(buffer.toString(), str);
    }

    /**
     * Reads and processes the given file, using its absolute path as the
     * document identifier.
     *
     * @param file the file to read
     * @return the chunked document
     * @throws ExtractorException if the file cannot be opened, read, or closed,
     *         or a sentence cannot be chunked
     */
    public ChunkedDocument readDocument(File file) throws ExtractorException {
        // try-with-resources guarantees the file handle is released on every path;
        // previously the FileInputStream was never closed.
        try (InputStream in = new FileInputStream(file)) {
            return readDocument(in, file.getAbsolutePath());
        } catch (IOException e) {
            throw new ExtractorException(String.format("Could not extract from %s", file), e);
        }
    }

    /**
     * Splits the given text into sentences and chunks each of them.
     *
     * @param str the full text of the document
     * @param str2 identifier of the document, used in error messages and passed
     *        to the resulting {@link ChunkedDocument}
     * @return a document holding the chunked sentences in extraction order
     * @throws ExtractorException if sentence extraction fails or any sentence
     *         cannot be chunked; a chunking failure carries the underlying
     *         {@link ChunkerException} as its cause
     */
    // The chunked-sentence element type is declared outside this file, so that list stays raw.
    @SuppressWarnings({"rawtypes", "unchecked"})
    public ChunkedDocument readDocument(String str, String str2) throws ExtractorException {
        // Materialise all sentences first so the result list can be pre-sized.
        List<String> sentences = new ArrayList<String>();
        Iterables.addAll(sentences, this.sentExtractor.extract(str));

        List chunkedSentences = new ArrayList(sentences.size());
        int sentenceNumber = 1; // 1-based position, reported when chunking fails
        for (String sentence : sentences) {
            try {
                chunkedSentences.add(this.sentChunker.chunkSentence(sentence));
            } catch (ChunkerException e) {
                // Keep the original exception as the cause so its stack trace is not lost.
                throw new ExtractorException(
                        String.format("Could not chunk sentence %s in document %s", Integer.valueOf(sentenceNumber), str2), e);
            }
            sentenceNumber++;
        }
        return new ChunkedDocument(str2, chunkedSentences);
    }
}
