/*
 * Decompiled with CFR 0.152.
 */
package com.datumbox.framework.core.common.text.analyzers;

import com.datumbox.framework.core.common.text.StringCleaner;
import com.datumbox.framework.core.common.text.analyzers.PHPSimilarText;
import com.datumbox.framework.core.common.text.extractors.NgramsExtractor;
import com.datumbox.framework.core.common.text.parsers.HTMLParser;
import com.datumbox.framework.core.common.utilities.PHPMethods;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;

/**
 * Static helpers that score how similar two text documents are.
 *
 * <p>Both public metrics first normalise their inputs with
 * {@link #preprocessDocument(String)} and then compare the normalised text.
 */
public class TextSimilarity {
    /**
     * Scores two documents using {@link PHPSimilarText#similarityPercentage}.
     *
     * <p>The shorter preprocessed document is always passed as the first argument
     * and the longer one as the second.
     *
     * @param text1 the first document
     * @param text2 the second document
     * @return the value reported by {@code PHPSimilarText.similarityPercentage}
     *         divided by 100 (presumably a fraction in {@code [0, 1]} - confirm
     *         against {@code PHPSimilarText})
     */
    public static double oliverSimilarity(String text1, String text2) {
        // Strings are immutable: the cleaned text only exists in the return value,
        // so it has to be captured instead of calling the helper for side effects.
        String doc1 = TextSimilarity.preprocessDocument(text1);
        String doc2 = TextSimilarity.preprocessDocument(text2);

        String smallerDoc = doc1;
        String biggerDoc = doc2;
        if (doc1.length() > doc2.length()) {
            smallerDoc = doc2;
            biggerDoc = doc1;
        }

        return PHPSimilarText.similarityPercentage(smallerDoc, biggerDoc) / 100.0;
    }

    /**
     * Scores two documents by the overlap of their keyword combinations
     * ("shingles"): the number of combinations present in both documents divided
     * by the number of distinct combinations present in either one.
     *
     * @param text1 the first document
     * @param text2 the second document
     * @param w     the shingle width; only extracted combinations whose key
     *              contains {@code w - 1} spaces are compared
     * @return the intersection-over-union ratio of the two shingle sets, or
     *         {@code 0.0} when neither document yields any shingle of width
     *         {@code w}
     */
    public static double shinglerSimilarity(String text1, String text2, int w) {
        // Capture the cleaned text; ignoring the return value would compare raw input.
        String doc1 = TextSimilarity.preprocessDocument(text1);
        String doc2 = TextSimilarity.preprocessDocument(text2);

        NgramsExtractor.Parameters parameters = new NgramsExtractor.Parameters();
        parameters.setMaxCombinations(w);
        parameters.setMaxDistanceBetweenKwds(0);
        parameters.setExaminationWindowLength(w);
        NgramsExtractor ngrams = new NgramsExtractor(parameters);

        Map<String, Double> keywords1 = ngrams.extract(doc1);
        Map<String, Double> keywords2 = ngrams.extract(doc2);
        TextSimilarity.filterKeywordCombinations(keywords1, w);
        TextSimilarity.filterKeywordCombinations(keywords2, w);

        HashSet<String> intersect = new HashSet<String>(keywords1.keySet());
        intersect.retainAll(keywords2.keySet());

        int commonKeywords = intersect.size();
        // |A union B| = |A| + |B| - |A intersect B|; avoids building a second set.
        int totalKeywords = keywords1.size() + keywords2.size() - commonKeywords;
        if (totalKeywords == 0) {
            // No shingles on either side: report "no similarity" rather than
            // letting 0.0 / 0.0 leak a NaN to the caller.
            return 0.0;
        }
        return (double) commonKeywords / totalKeywords;
    }

    /**
     * Normalises a document before comparison by passing it, in order, through
     * {@code StringCleaner.tokenizeURLs}, {@code HTMLParser.extractText},
     * {@code StringCleaner.removeAccents} and
     * {@code StringCleaner.removeExtraSpaces}.
     *
     * @param text the raw document
     * @return the normalised document; callers must use this return value
     *         because the argument itself is not modified
     */
    private static String preprocessDocument(String text) {
        text = StringCleaner.tokenizeURLs(text);
        text = HTMLParser.extractText(text);
        text = StringCleaner.removeAccents(text);
        text = StringCleaner.removeExtraSpaces(text);
        return text;
    }

    /**
     * Removes, in place, every entry whose key does not contain exactly
     * {@code w - 1} spaces according to {@code PHPMethods.substr_count}
     * (presumably leaving only combinations of exactly {@code w} words).
     *
     * @param keywords the extracted keyword combinations; modified by this call
     * @param w        the required shingle width
     */
    private static void filterKeywordCombinations(Map<String, Double> keywords, int w) {
        // Remove through the iterator so the map can be mutated while it is traversed.
        Iterator<Map.Entry<String, Double>> it = keywords.entrySet().iterator();
        while (it.hasNext()) {
            Map.Entry<String, Double> entry = it.next();
            if (PHPMethods.substr_count(entry.getKey(), ' ') != w - 1) {
                it.remove();
            }
        }
    }
}

