/*
 * Decompiled with CFR 0.152.
 */
package smile.nlp.collocation;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.Iterator;
import smile.nlp.Bigram;
import smile.nlp.Corpus;
import smile.nlp.collocation.BigramCollocation;
import smile.sort.HeapSelect;
import smile.stat.distribution.ChiSquareDistribution;

/**
 * Finds bigram collocations in a corpus by scoring every bigram with a
 * likelihood-ratio statistic ({@code -2 log(lambda)}) computed from the
 * bigram frequency, the two term frequencies and the corpus size.
 *
 * <p>Only bigrams whose frequency is strictly greater than the configured
 * minimum frequency are scored.
 */
public class BigramCollocationFinder {
    /**
     * Distribution used to turn a significance level into a score cutoff.
     * Constructed with parameter 1 (presumably one degree of freedom).
     */
    private final ChiSquareDistribution chisq = new ChiSquareDistribution(1);

    /** Bigrams occurring this many times or fewer are ignored. */
    private final int minFreq;

    /**
     * Creates a finder.
     *
     * @param minFreq frequency threshold; a bigram is considered only when
     *                its frequency is strictly greater than this value.
     */
    public BigramCollocationFinder(int minFreq) {
        this.minFreq = minFreq;
    }

    /**
     * Finds the top-scoring bigram collocations of a corpus.
     *
     * @param corpus the corpus supplying bigrams and frequencies.
     * @param k the maximum number of collocations to return.
     * @return at most {@code k} collocations, in descending order of score.
     *         The array is shorter than {@code k} when fewer bigrams pass
     *         the frequency filter, and empty when {@code k} is 0.
     * @throws IllegalArgumentException if {@code k} is negative.
     */
    public BigramCollocation[] find(Corpus corpus, int k) {
        if (k < 0) {
            throw new IllegalArgumentException("Invalid k = " + k);
        }
        if (k == 0) {
            // A zero-length selection buffer cannot accept any element.
            return new BigramCollocation[0];
        }

        BigramCollocation[] bigrams = new BigramCollocation[k];
        HeapSelect<BigramCollocation> heap = new HeapSelect<BigramCollocation>(bigrams);
        int qualified = 0;

        Iterator<Bigram> iterator = corpus.getBigrams();
        while (iterator.hasNext()) {
            Bigram bigram = iterator.next();
            int c12 = corpus.getBigramFrequency(bigram);
            if (c12 <= this.minFreq) {
                continue;
            }
            double score = this.scoreOf(corpus, bigram, c12);
            // The score is negated while selecting and restored below, so
            // that the selection keeps the highest-scoring bigrams.
            heap.add(new BigramCollocation(bigram.w1, bigram.w2, c12, -score));
            ++qualified;
        }
        heap.sort();

        // When fewer than k bigrams qualified, the remaining buffer slots
        // are still null; size the result by what was actually collected.
        BigramCollocation[] collocations = new BigramCollocation[Math.min(k, qualified)];
        int filled = 0;
        // Read the buffer back to front, as the full-buffer case always did.
        for (int i = k - 1; i >= 0 && filled < collocations.length; --i) {
            BigramCollocation selected = bigrams[i];
            if (selected == null) {
                continue;
            }
            collocations[filled++] = new BigramCollocation(
                    selected.w1(), selected.w2(), selected.frequency(), -selected.score());
        }
        return collocations;
    }

    /**
     * Finds all bigram collocations whose score exceeds the quantile of the
     * chi-square distribution at {@code p}.
     *
     * @param corpus the corpus supplying bigrams and frequencies.
     * @param p the quantile level, strictly between 0 and 1.
     * @return the qualifying collocations, sorted by their natural ordering
     *         and then reversed.
     * @throws IllegalArgumentException if {@code p} is not strictly between
     *         0 and 1 (including NaN).
     */
    public BigramCollocation[] find(Corpus corpus, double p) {
        // Written as a negated range check so that NaN is rejected as well.
        if (!(p > 0.0 && p < 1.0)) {
            throw new IllegalArgumentException("Invalid p = " + p);
        }
        double cutoff = this.chisq.quantile(p);

        ArrayList<BigramCollocation> bigrams = new ArrayList<BigramCollocation>();
        Iterator<Bigram> iterator = corpus.getBigrams();
        while (iterator.hasNext()) {
            Bigram bigram = iterator.next();
            int c12 = corpus.getBigramFrequency(bigram);
            if (c12 <= this.minFreq) {
                continue;
            }
            double score = this.scoreOf(corpus, bigram, c12);
            // A NaN score fails this comparison and is therefore skipped.
            if (score > cutoff) {
                bigrams.add(new BigramCollocation(bigram.w1, bigram.w2, c12, score));
            }
        }

        BigramCollocation[] collocations = bigrams.toArray(new BigramCollocation[0]);
        Arrays.sort(collocations);

        // Reverse the sorted array in place.
        int n = collocations.length;
        for (int i = 0; i < n / 2; ++i) {
            BigramCollocation swap = collocations[i];
            collocations[i] = collocations[n - i - 1];
            collocations[n - i - 1] = swap;
        }
        return collocations;
    }

    /**
     * Looks up the term frequencies of a bigram's two words and computes its
     * likelihood-ratio score.
     *
     * @param corpus the corpus supplying term frequencies and size.
     * @param bigram the bigram to score.
     * @param c12 the frequency of the bigram in the corpus.
     * @return the likelihood-ratio score of the bigram.
     */
    private double scoreOf(Corpus corpus, Bigram bigram, int c12) {
        int c1 = corpus.getTermFrequency(bigram.w1);
        int c2 = corpus.getTermFrequency(bigram.w2);
        return this.likelihoodRatio(c1, c2, c12, corpus.size());
    }

    /**
     * Computes {@code -2 log(lambda)}, where lambda compares a model with a
     * single rate {@code p = c2 / N} against a model with separate rates
     * {@code p1 = c12 / c1} and {@code p2 = (c2 - c12) / (N - c1)}.
     *
     * @param c1 frequency of the first word.
     * @param c2 frequency of the second word.
     * @param c12 frequency of the bigram.
     * @param N size of the corpus.
     * @return the likelihood-ratio statistic.
     */
    private double likelihoodRatio(int c1, int c2, int c12, long N) {
        long rest = N - (long) c1;
        int c2Only = c2 - c12;

        double p = (double) c2 / (double) N;
        double p1 = (double) c12 / (double) c1;
        double p2 = (double) c2Only / (double) rest;

        double logLambda = this.logL(c12, c1, p) + this.logL(c2Only, rest, p)
                - this.logL(c12, c1, p1) - this.logL(c2Only, rest, p2);
        return -2.0 * logLambda;
    }

    /**
     * Computes {@code k * log(x) + (n - k) * log(1 - x)}.
     *
     * @param k the first count.
     * @param n the total count.
     * @param x the probability; exactly 0 is replaced by 0.01 and exactly 1
     *          by 0.99 so that neither logarithm is taken at zero.
     * @return the value of the expression above.
     */
    private double logL(int k, long n, double x) {
        if (x == 0.0) {
            x = 0.01;
        }
        if (x == 1.0) {
            x = 0.99;
        }
        return (double) k * Math.log(x) + (double) (n - (long) k) * Math.log(1.0 - x);
    }
}

