package org.apache.nutch.searcher;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Vector;
import javax.servlet.ServletContext;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Closeable;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.indexer.Indexer;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.searcher.DistributedSearch;
import org.apache.nutch.util.NutchConfiguration;

/* loaded from: input_file:org/apache/nutch/searcher/NutchBean.class */
/**
 * Single entry point for searching: implements every search-side interface
 * ({@link Searcher}, {@link HitDetailer}, {@link HitSummarizer},
 * {@link HitContent}, {@link HitInlinks}) by forwarding each call to a
 * delegate chosen at construction time.
 *
 * <p>Two set-ups exist. If a file named {@code search-servers.txt} is present
 * in the search directory, every call is forwarded to a
 * {@link DistributedSearch.Client}. Otherwise the bean opens the local index
 * (or indexes), the segments and the link database found below that
 * directory.
 */
public class NutchBean implements Searcher, HitDetailer, HitSummarizer, HitContent, HitInlinks, DistributedSearch.Protocol, Closeable {
    public static final Log LOG = LogFactory.getLog(NutchBean.class);

    /** Upper bound on the prohibited terms added to a query when re-searching. */
    private static final int MAX_PROHIBITED_TERMS = 20;

    private String[] segmentNames;
    private Searcher searcher;
    private HitDetailer detailer;
    private HitSummarizer summarizer;
    private HitContent content;
    private HitInlinks linkDb;
    private final Configuration conf;
    private final FileSystem fs;

    /**
     * Hits collected so far for one de-duplication value, plus a flag that
     * records whether further hits for that value have already been dropped.
     */
    public class DupHits extends ArrayList<Hit> {
        /** Set once the first hit beyond the per-value limit has been rejected. */
        private boolean maxSizeExceeded;

        private DupHits() {
        }
    }

    /**
     * Returns the bean stored in the servlet context under the attribute
     * {@code "nutchBean"}, creating and storing one first if none is present.
     *
     * <p>This method does no locking, so two threads that both find the
     * attribute missing may each create a bean.
     *
     * @param servletContext context that caches the bean
     * @param configuration  configuration used when a new bean must be created
     * @return the cached or newly created bean
     * @throws IOException if creating a new bean fails
     */
    public static NutchBean get(ServletContext servletContext, Configuration configuration) throws IOException {
        NutchBean cached = (NutchBean) servletContext.getAttribute("nutchBean");
        if (cached != null) {
            return cached;
        }
        if (LOG.isInfoEnabled()) {
            LOG.info("creating new bean");
        }
        NutchBean created = new NutchBean(configuration);
        servletContext.setAttribute("nutchBean", created);
        return created;
    }

    /**
     * Creates a bean rooted at the directory named by the {@code searcher.dir}
     * property (default {@code "crawl"}).
     *
     * @param configuration configuration to read settings from
     * @throws IOException if the underlying resources cannot be opened
     */
    public NutchBean(Configuration configuration) throws IOException {
        this(configuration, null);
    }

    /**
     * Creates a bean rooted at the given directory.
     *
     * @param configuration configuration to read settings from
     * @param path          search directory; when {@code null} the value of
     *                      {@code searcher.dir} (default {@code "crawl"}) is used
     * @throws IOException if the underlying resources cannot be opened
     */
    public NutchBean(Configuration configuration, Path path) throws IOException {
        this.conf = configuration;
        this.fs = FileSystem.get(this.conf);
        Path baseDir = (path != null) ? path : new Path(this.conf.get("searcher.dir", "crawl"));
        Path serversFile = new Path(baseDir, "search-servers.txt");
        if (this.fs.exists(serversFile)) {
            // A server list is present: delegate everything to remote servers.
            if (LOG.isInfoEnabled()) {
                LOG.info("searching servers in " + serversFile);
            }
            init(new DistributedSearch.Client(serversFile, configuration));
        } else {
            init(new Path(baseDir, "index"),
                 new Path(baseDir, "indexes"),
                 new Path(baseDir, "segments"),
                 new Path(baseDir, URLNormalizers.SCOPE_LINKDB));
        }
    }

    /**
     * Wires the delegates for local operation.
     *
     * @param path  merged index directory; used when it exists
     * @param path2 directory of part indexes; consulted only when {@code path}
     *              does not exist, and only entries containing a file named
     *              {@link Indexer#DONE_NAME} are opened
     * @param path3 segments directory
     * @param path4 link database directory
     * @throws IOException if any of the resources cannot be opened
     */
    private void init(Path path, Path path2, Path path3, Path path4) throws IOException {
        IndexSearcher indexSearcher;
        if (this.fs.exists(path)) {
            if (LOG.isInfoEnabled()) {
                LOG.info("opening merged index in " + path);
            }
            indexSearcher = new IndexSearcher(path, this.conf);
        } else {
            if (LOG.isInfoEnabled()) {
                LOG.info("opening indexes in " + path2);
            }
            // List the directory once; every listing is a filesystem call.
            Path[] candidates = this.fs.listPaths(path2);
            ArrayList<Path> finished = new ArrayList<Path>();
            if (candidates == null) {
                // NOTE(review): presumably some FileSystem implementations return
                // null for a missing directory - treated as "no indexes" here
                // instead of failing with a NullPointerException.
                if (LOG.isWarnEnabled()) {
                    LOG.warn("no index directories could be listed in " + path2);
                }
            } else {
                for (Path candidate : candidates) {
                    if (this.fs.isFile(new Path(candidate, Indexer.DONE_NAME))) {
                        finished.add(candidate);
                    }
                }
            }
            indexSearcher = new IndexSearcher(finished.toArray(new Path[finished.size()]), this.conf);
        }

        if (LOG.isInfoEnabled()) {
            LOG.info("opening segments in " + path3);
        }
        FetchedSegments fetchedSegments = new FetchedSegments(this.fs, path3.toString(), this.conf);
        this.segmentNames = fetchedSegments.getSegmentNames();
        this.searcher = indexSearcher;
        this.detailer = indexSearcher;
        this.summarizer = fetchedSegments;
        this.content = fetchedSegments;

        if (LOG.isInfoEnabled()) {
            LOG.info("opening linkdb in " + path4);
        }
        this.linkDb = new LinkDbInlinks(this.fs, path4, this.conf);
    }

    /**
     * Wires the delegates for distributed operation: the client serves every role.
     *
     * @param client client that talks to the remote search servers
     */
    private void init(DistributedSearch.Client client) {
        this.segmentNames = client.getSegmentNames();
        this.searcher = client;
        this.detailer = client;
        this.summarizer = client;
        this.content = client;
        this.linkDb = client;
    }

    /**
     * @return the segment names obtained from the delegate at construction time
     */
    @Override
    public String[] getSegmentNames() {
        return this.segmentNames;
    }

    /**
     * Searches without a de-duplication field, sort field or reversed order.
     *
     * @param query query to run
     * @param i     number of hits requested
     * @return the hits reported by the underlying searcher
     * @throws IOException if the underlying searcher fails
     */
    public Hits search(Query query, int i) throws IOException {
        return search(query, i, null, null, false);
    }

    /**
     * Forwards the search unchanged to the underlying searcher.
     *
     * @param query query to run
     * @param i     number of hits requested
     * @param str   de-duplication field, may be {@code null}
     * @param str2  sort field, may be {@code null}
     * @param z     passed through as the searcher's reverse flag
     * @return the hits reported by the underlying searcher
     * @throws IOException if the underlying searcher fails
     */
    @Override
    public Hits search(Query query, int i, String str, String str2, boolean z) throws IOException {
        return this.searcher.search(query, i, str, str2, z);
    }

    /**
     * De-duplicating search grouped on the {@code "site"} field.
     *
     * @param query query to run
     * @param i     number of hits requested
     * @param i2    maximum hits kept per de-duplication value
     * @return the filtered hits
     * @throws IOException if the underlying searcher fails
     */
    public Hits search(Query query, int i, int i2) throws IOException {
        return search(query, i, i2, "site", null, false);
    }

    /**
     * De-duplicating search grouped on the given field.
     *
     * @param query query to run
     * @param i     number of hits requested
     * @param i2    maximum hits kept per de-duplication value
     * @param str   de-duplication field
     * @return the filtered hits
     * @throws IOException if the underlying searcher fails
     */
    public Hits search(Query query, int i, int i2, String str) throws IOException {
        return search(query, i, i2, str, null, false);
    }

    /**
     * Searches and keeps at most {@code i2} hits for each distinct
     * de-duplication value.
     *
     * <p>Raw hits are fetched in a window of {@code i} times the factor
     * configured under {@code searcher.hostgrouping.rawhits.factor} (default
     * 2.0). When the window is used up before enough hits have been
     * collected, the query is re-run with a window enlarged by the same
     * factor and with up to {@value #MAX_PROHIBITED_TERMS} of the
     * already-saturated values added as prohibited terms.
     *
     * <p>The result can hold one hit more than requested, because collection
     * stops only once the list size exceeds {@code i} (presumably so that
     * callers can tell whether further hits exist).
     *
     * @param query query to run
     * @param i     number of hits requested
     * @param i2    maximum hits kept per de-duplication value; when not
     *              positive no filtering is done
     * @param str   de-duplication field
     * @param str2  sort field, may be {@code null}
     * @param z     passed through as the searcher's reverse flag
     * @return the filtered hits; the total is that of the first raw search and
     *         is flagged as inexact as soon as any hit has been dropped
     * @throws IOException if the underlying searcher fails
     */
    public Hits search(Query query, int i, int i2, String str, String str2, boolean z) throws IOException {
        if (i2 <= 0) {
            return search(query, i, str, str2, z);
        }
        final int numHits = i;
        final int maxHitsPerDup = i2;
        final String dedupField = str;

        float rawHitsFactor = this.conf.getFloat("searcher.hostgrouping.rawhits.factor", 2.0f);
        // NOTE(review): the window only grows when numHits > 0 and the factor
        // is > 1 - confirm that callers and configuration guarantee this.
        int numHitsRaw = (int) (numHits * rawHitsFactor);
        if (LOG.isInfoEnabled()) {
            LOG.info("searching for " + numHitsRaw + " raw hits");
        }
        Hits rawHits = this.searcher.search(query, numHitsRaw, dedupField, str2, z);
        long total = rawHits.getTotal();

        HashMap<String, DupHits> hitsByValue = new HashMap<String, DupHits>();
        ArrayList<Hit> collected = new ArrayList<Hit>();
        HashSet<Hit> seen = new HashSet<Hit>();
        ArrayList<String> excludedValues = new ArrayList<String>();
        boolean totalIsExact = true;

        // A for loop is required here: both "continue" statements below must
        // still advance rawHitNum. With the increment at the bottom of a while
        // loop an already-seen hit would be examined forever.
        for (int rawHitNum = 0; rawHitNum < rawHits.getTotal(); rawHitNum++) {
            if (rawHitNum >= rawHits.getLength()) {
                // Window exhausted: widen it and prohibit saturated values.
                Query optimized = (Query) query.clone();
                int prohibited = Math.min(excludedValues.size(), MAX_PROHIBITED_TERMS);
                for (int k = 0; k < prohibited; k++) {
                    optimized.addProhibitedTerm(excludedValues.get(k), dedupField);
                }
                numHitsRaw = (int) (numHitsRaw * rawHitsFactor);
                if (LOG.isInfoEnabled()) {
                    LOG.info("re-searching for " + numHitsRaw + " raw hits, query: " + optimized);
                }
                rawHits = this.searcher.search(optimized, numHitsRaw, dedupField, str2, z);
                if (LOG.isInfoEnabled()) {
                    LOG.info("found " + rawHits.getTotal() + " raw hits");
                }
                rawHitNum = -1; // the loop increment restarts the scan at 0
                continue;
            }

            Hit hit = rawHits.getHit(rawHitNum);
            if (!seen.add(hit)) {
                continue; // already handled in an earlier pass
            }

            String value = hit.getDedupValue();
            DupHits dupHits = hitsByValue.get(value);
            if (dupHits == null) {
                dupHits = new DupHits();
                hitsByValue.put(value, dupHits);
            }

            if (dupHits.size() == maxHitsPerDup) {
                // Limit reached for this value: drop the hit.
                if (!dupHits.maxSizeExceeded) {
                    // First rejection: flag the hits that were kept.
                    for (Hit kept : dupHits) {
                        kept.setMoreFromDupExcluded(true);
                    }
                    dupHits.maxSizeExceeded = true;
                    excludedValues.add(value);
                }
                totalIsExact = false;
            } else {
                collected.add(hit);
                dupHits.add(hit);
                if (collected.size() > numHits) {
                    break;
                }
            }
        }

        Hits result = new Hits(total, collected.toArray(new Hit[collected.size()]));
        result.setTotalIsExact(totalIsExact);
        return result;
    }

    /** Forwards to the underlying searcher. */
    @Override
    public String getExplanation(Query query, Hit hit) throws IOException {
        return this.searcher.getExplanation(query, hit);
    }

    /** Forwards to the underlying detailer. */
    @Override
    public HitDetails getDetails(Hit hit) throws IOException {
        return this.detailer.getDetails(hit);
    }

    /** Forwards to the underlying detailer. */
    @Override
    public HitDetails[] getDetails(Hit[] hitArr) throws IOException {
        return this.detailer.getDetails(hitArr);
    }

    /** Forwards to the underlying summarizer. */
    @Override
    public Summary getSummary(HitDetails hitDetails, Query query) throws IOException {
        return this.summarizer.getSummary(hitDetails, query);
    }

    /** Forwards to the underlying summarizer. */
    @Override
    public Summary[] getSummary(HitDetails[] hitDetailsArr, Query query) throws IOException {
        return this.summarizer.getSummary(hitDetailsArr, query);
    }

    /** Forwards to the underlying content source. */
    @Override
    public byte[] getContent(HitDetails hitDetails) throws IOException {
        return this.content.getContent(hitDetails);
    }

    /** Forwards to the underlying content source. */
    @Override
    public ParseData getParseData(HitDetails hitDetails) throws IOException {
        return this.content.getParseData(hitDetails);
    }

    /** Forwards to the underlying content source. */
    @Override
    public ParseText getParseText(HitDetails hitDetails) throws IOException {
        return this.content.getParseText(hitDetails);
    }

    /** Forwards to the underlying link database. */
    @Override
    public String[] getAnchors(HitDetails hitDetails) throws IOException {
        return this.linkDb.getAnchors(hitDetails);
    }

    /** Forwards to the underlying link database. */
    @Override
    public Inlinks getInlinks(HitDetails hitDetails) throws IOException {
        return this.linkDb.getInlinks(hitDetails);
    }

    /** Forwards to the underlying content source. */
    @Override
    public long getFetchDate(HitDetails hitDetails) throws IOException {
        return this.content.getFetchDate(hitDetails);
    }

    /**
     * Closes the content source, the searcher, the link database and the file
     * system, in that order.
     *
     * <p>Each step runs even when an earlier one throws, so one failing
     * delegate cannot leave the others open. If several steps throw, the
     * exception of the last failing step is the one that propagates.
     *
     * @throws IOException if closing any of the resources fails
     */
    public void close() throws IOException {
        try {
            if (this.content != null) {
                this.content.close();
            }
        } finally {
            try {
                if (this.searcher != null) {
                    this.searcher.close();
                }
            } finally {
                try {
                    if (this.linkDb != null) {
                        this.linkDb.close();
                    }
                } finally {
                    if (this.fs != null) {
                        this.fs.close();
                    }
                }
            }
        }
    }

    /**
     * Command-line check: runs the query given as first argument and prints
     * the total plus details and summary of up to ten hits.
     *
     * @param strArr command-line arguments; the first one is the query string
     * @throws Exception if searching or printing fails
     */
    public static void main(String[] strArr) throws Exception {
        if (strArr.length == 0) {
            System.err.println("NutchBean query");
            System.exit(-1);
        }
        Configuration conf = NutchConfiguration.create();
        NutchBean bean = new NutchBean(conf);
        try {
            Query query = Query.parse(strArr[0], conf);
            Hits hits = bean.search(query, 10);
            System.out.println("Total hits: " + hits.getTotal());
            int shown = (int) Math.min(hits.getTotal(), 10L);
            HitDetails[] details = bean.getDetails(hits.getHits(0, shown));
            Summary[] summaries = bean.getSummary(details, query);
            // Bounded by the array that is actually indexed.
            for (int k = 0; k < details.length; k++) {
                System.out.println(" " + k + " " + details[k] + "\n" + summaries[k]);
            }
        } finally {
            bean.close(); // release index, segment and file system handles
        }
    }

    /**
     * Reports the protocol version for {@link DistributedSearch.Protocol}.
     *
     * @param str fully qualified protocol class name requested by the caller
     * @param j   client version; not consulted
     * @return {@code 1} when {@code str} names {@link DistributedSearch.Protocol}
     * @throws IOException if {@code str} names any other protocol
     */
    public long getProtocolVersion(String str, long j) throws IOException {
        if (DistributedSearch.Protocol.class.getName().equals(str)) {
            return 1L;
        }
        throw new IOException("Unknown Protocol classname:" + str);
    }
}
