/*
 * Decompiled with CFR 0.152.
 */
package com.datumbox.framework.applications.nlp;

import com.datumbox.framework.common.Configuration;
import com.datumbox.framework.common.dataobjects.AssociativeArray;
import com.datumbox.framework.common.dataobjects.FlatDataCollection;
import com.datumbox.framework.core.common.dataobjects.Dataframe;
import com.datumbox.framework.core.common.dataobjects.Record;
import com.datumbox.framework.core.common.interfaces.Parameterizable;
import com.datumbox.framework.core.common.text.StringCleaner;
import com.datumbox.framework.core.common.text.parsers.HTMLParser;
import com.datumbox.framework.core.common.utilities.MapMethods;
import com.datumbox.framework.core.common.utilities.PHPMethods;
import com.datumbox.framework.core.machinelearning.MLBuilder;
import com.datumbox.framework.core.machinelearning.clustering.Kmeans;
import com.datumbox.framework.core.statistics.descriptivestatistics.Descriptives;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Extracts the main textual content of an HTML document by scoring each row
 * of markup with a text-to-tag ratio (TTR), clustering the rows with k-means
 * and discarding the cluster whose average TTR is the lowest.
 *
 * <p>NOTE(review): the class name and the processing steps match the
 * "Content Extraction via Tag Ratios" approach; confirm against the project
 * documentation.
 *
 * <p>Processing steps performed by {@link #extract(String, Parameters)}:
 * <ol>
 *   <li>strip non-text tags/attributes and normalise line breaks;</li>
 *   <li>split the markup into rows (one per line);</li>
 *   <li>compute the TTR of every row and smooth the series;</li>
 *   <li>optionally add a smoothed "derivative" feature (2D model);</li>
 *   <li>cluster the rows and keep every row that is not in the cluster with
 *       the lowest average TTR.</li>
 * </ol>
 */
public class CETR {
    /**
     * Matches a single markup tag. The DOTALL flag is kept from the original
     * definition; the expression contains no '.' so it does not alter matching.
     */
    private static final Pattern NUMBER_OF_TAGS_PATTERN = Pattern.compile("<[^>]+?>", Pattern.DOTALL);

    /** Matches any run of CR/LF characters; used to collapse line breaks. */
    private static final Pattern LINE_BREAKS_PATTERN = Pattern.compile("[\\n\\r]+");

    /** Matches lines that contain only spaces or tabs, including their terminator. */
    private static final Pattern BLANK_LINE_PATTERN = Pattern.compile("(?m)^[ \t]*\r?\n");

    private final Configuration configuration;

    /**
     * Creates an extractor.
     *
     * @param configuration the framework configuration handed to the
     *                      {@link Dataframe} and the k-means model
     */
    public CETR(Configuration configuration) {
        this.configuration = configuration;
    }

    /**
     * Extracts the content text of the provided HTML.
     *
     * @param html       the HTML markup to process
     * @param parameters the tuning parameters of the algorithm
     * @return the text of all selected rows joined by single spaces; an empty
     *         string when the cleaned markup is empty or no row is selected
     */
    public String extract(String html, Parameters parameters) {
        html = this.clearText(html);
        if (html.isEmpty()) {
            // Nothing to score or cluster.
            return "";
        }

        List<String> rows = this.extractRows(html);
        List<Integer> selectedRowIds = this.selectRows(rows, parameters);

        StringBuilder sb = new StringBuilder(html.length());
        for (Integer rowId : selectedRowIds) {
            String text = StringCleaner.removeExtraSpaces(HTMLParser.extractText(rows.get(rowId)));
            if (text.isEmpty()) {
                continue;
            }
            sb.append(text).append(' ');
        }
        return sb.toString().trim();
    }

    /**
     * Scores, clusters and filters the rows.
     *
     * @param rows       the markup rows
     * @param parameters the tuning parameters; a positive alpha window size
     *                   enables the second (derivative) feature
     * @return the ids of the rows that do not belong to the cluster with the
     *         lowest average (smoothed) TTR
     */
    private List<Integer> selectRows(List<String> rows, Parameters parameters) {
        // gaussianSmoothing returns a new list and leaves its argument untouched,
        // so its result must be captured for the smoothing to take effect.
        List<Double> ttrList = this.gaussianSmoothing(this.calculateTTRlist(rows));

        int alphaWindowSize = parameters.getAlphaWindowSizeFor2DModel();
        List<Double> derivatives = null;
        if (alphaWindowSize > 0) {
            derivatives = this.gaussianSmoothing(this.computeDerivatives(ttrList, alphaWindowSize));
        }

        List<Integer> selectedRows = new ArrayList<>();
        Dataframe dataset = new Dataframe(this.configuration);
        try {
            int n = ttrList.size();
            for (int i = 0; i < n; ++i) {
                AssociativeArray xData = new AssociativeArray();
                xData.put(0, ttrList.get(i));
                if (derivatives != null) {
                    xData.put(1, derivatives.get(i));
                }
                dataset.add(new Record(xData, null));
            }

            this.performClustering(dataset, parameters.getNumberOfClusters());

            // Accumulate the TTR sum and the size of every cluster.
            Map<Object, Double> avgTtrPerCluster = new HashMap<>();
            Map<Object, Integer> clusterSizes = new HashMap<>();
            for (Record record : dataset) {
                Integer clusterId = (Integer) record.getYPredicted();
                Double ttr = record.getX().getDouble(0);
                avgTtrPerCluster.put(clusterId, avgTtrPerCluster.getOrDefault(clusterId, 0.0) + ttr);
                clusterSizes.put(clusterId, clusterSizes.getOrDefault(clusterId, 0) + 1);
            }

            // Turn the sums into averages in place.
            for (Map.Entry<Object, Double> entry : avgTtrPerCluster.entrySet()) {
                entry.setValue(entry.getValue() / clusterSizes.get(entry.getKey()));
            }

            // The cluster with the lowest average TTR is treated as non-content.
            Object nonContentClusterId = MapMethods.selectMinKeyValue(avgTtrPerCluster).getKey();
            for (Map.Entry<Integer, Record> e : dataset.entries()) {
                if (!Objects.equals(e.getValue().getYPredicted(), nonContentClusterId)) {
                    selectedRows.add(e.getKey());
                }
            }
        } finally {
            // Release the dataframe even when clustering or aggregation fails.
            dataset.close();
        }
        return selectedRows;
    }

    /**
     * Fits a k-means model on the dataset and stores the predicted cluster of
     * every record in the dataset itself.
     *
     * @param dataset          the records to cluster; updated with predictions
     * @param numberOfClusters the k of k-means
     */
    private void performClustering(Dataframe dataset, int numberOfClusters) {
        Kmeans.TrainingParameters param = new Kmeans.TrainingParameters();
        param.setK(numberOfClusters);
        param.setMaxIterations(200);
        param.setInitializationMethod(Kmeans.TrainingParameters.Initialization.SET_FIRST_K);
        param.setDistanceMethod(Kmeans.TrainingParameters.Distance.EUCLIDIAN);
        param.setWeighted(false);
        param.setCategoricalGamaMultiplier(1.0);

        Kmeans instance = (Kmeans) MLBuilder.create(param, this.configuration);
        try {
            instance.fit(dataset);
            instance.predict(dataset);
        } finally {
            // Always release the model, even if training or prediction throws.
            instance.close();
        }
    }

    /**
     * Computes the text-to-tag ratio of every row: the number of content
     * characters divided by the number of tags (a row without tags is treated
     * as having one tag to avoid a division by zero).
     *
     * @param rows the markup rows
     * @return one ratio per row, in row order
     */
    private List<Double> calculateTTRlist(List<String> rows) {
        List<Double> ttrList = new ArrayList<>(rows.size());
        for (String row : rows) {
            int contentChars = this.countContentChars(row);
            int tags = Math.max(this.countNumberOfTags(row), 1);
            ttrList.add((double) contentChars / (double) tags);
        }
        return ttrList;
    }

    /**
     * Smooths the values with a normalised Gaussian kernel whose variance is
     * the population variance of the values and whose radius is
     * {@code min(ceil(std), (n - 1) / 2)}.
     *
     * <p>The input list is never modified. When the list has fewer than two
     * elements, or its variance is zero or not a number, there is nothing to
     * smooth and an unmodified copy is returned. Window positions that fall
     * outside the list are skipped without re-normalising the kernel.
     *
     * @param list the values to smooth
     * @return a new list of the same size holding the smoothed values
     */
    private List<Double> gaussianSmoothing(List<Double> list) {
        int n = list.size();
        if (n < 2) {
            return new ArrayList<>(list);
        }

        // Copy into a Collection<Object>: a List<Double> cannot be cast to it.
        Collection<Object> values = new ArrayList<Object>(list);
        double std = Descriptives.std(new FlatDataCollection(values), false);
        double variance = std * std;
        if (!(variance > 0.0)) {
            // Zero variance would make the kernel exponent 0/0 = NaN.
            return new ArrayList<>(list);
        }

        int radius = (int) Math.min(Math.ceil(std), ((double) n - 1.0) / 2.0);

        // kernel[j + radius] holds the weight of offset j in [-radius, radius].
        double[] kernel = new double[2 * radius + 1];
        double normalizer = 0.0;
        for (int j = -radius; j <= radius; ++j) {
            double weight = Math.exp(-((double) j * (double) j) / (2.0 * variance));
            kernel[j + radius] = weight;
            normalizer += weight;
        }
        for (int k = 0; k < kernel.length; ++k) {
            kernel[k] /= normalizer;
        }

        List<Double> smoothedList = new ArrayList<>(n);
        for (int i = 0; i < n; ++i) {
            double smoothedValue = 0.0;
            for (int j = -radius; j <= radius; ++j) {
                int index = i - j;
                if (index < 0 || index >= n) {
                    continue;
                }
                smoothedValue += kernel[j + radius] * list.get(index);
            }
            smoothedList.add(smoothedValue);
        }
        return smoothedList;
    }

    /**
     * Computes, for every position {@code i}, the absolute difference between
     * the value at {@code i} and the average of the values in the forward
     * window {@code [i, i + alphaWindowSizeFor2DModel)} clipped to the list.
     *
     * @param list                      the (smoothed) ratios
     * @param alphaWindowSizeFor2DModel the size of the forward window
     * @return one derivative estimate per position
     */
    private List<Double> computeDerivatives(List<Double> list, int alphaWindowSizeFor2DModel) {
        int n = list.size();
        List<Double> derivatives = new ArrayList<>(n);
        for (int i = 0; i < n; ++i) {
            double sum = 0.0;
            int counter = 0;
            for (int j = 0; j < alphaWindowSizeFor2DModel; ++j) {
                int index = i + j;
                if (index >= n) {
                    break;
                }
                sum += list.get(index);
                ++counter;
            }
            // An empty window (non-positive size) yields an average of zero.
            double avgInWindow = sum / (double) Math.max(counter, 1);
            derivatives.add(Math.abs(avgInWindow - list.get(i)));
        }
        return derivatives;
    }

    /**
     * Counts the markup tags found in the text.
     *
     * @param text the markup row
     * @return the number of matches of the tag pattern
     */
    private int countNumberOfTags(String text) {
        Matcher matcher = NUMBER_OF_TAGS_PATTERN.matcher(text);
        int count = 0;
        while (matcher.find()) {
            ++count;
        }
        return count;
    }

    /**
     * Counts the characters of the text that remains after tag removal and
     * whitespace normalisation.
     *
     * @param text the markup row
     * @return the length of the extracted text
     */
    private int countContentChars(String text) {
        return StringCleaner.removeExtraSpaces(HTMLParser.extractText(text)).length();
    }

    /**
     * Splits the markup into rows, one per line.
     *
     * @param text the cleaned markup
     * @return a fixed-size list of rows
     */
    private List<String> extractRows(String text) {
        return Arrays.asList(text.split("\n"));
    }

    /**
     * Prepares the markup for row based processing: removes non-text tags and
     * attributes, forces a line break after every tag when the document has
     * at most one line break, collapses consecutive line breaks and drops
     * blank lines.
     *
     * @param text the raw HTML
     * @return the cleaned, trimmed markup
     */
    private String clearText(String text) {
        text = HTMLParser.removeNonTextTagsAndAttributes(text);
        if (PHPMethods.substr_count(text, '\n') <= 1) {
            // Single-line markup: create rows by breaking after each tag.
            text = text.replace(">", ">\n");
        }
        text = LINE_BREAKS_PATTERN.matcher(text).replaceAll("\n");
        text = BLANK_LINE_PATTERN.matcher(text).replaceAll("");
        return text.trim();
    }

    /**
     * Tuning parameters of the extractor.
     */
    public static class Parameters
    implements Parameterizable {
        private static final long serialVersionUID = 1L;

        /** Number of k-means clusters; defaults to 2. */
        private int numberOfClusters = 2;

        /**
         * Size of the forward window used for the derivative feature; a value
         * that is not positive disables the 2D model. Defaults to 3.
         */
        private int alphaWindowSizeFor2DModel = 3;

        /**
         * Radius of an averaging smoother; defaults to 2. This value is
         * stored but not read by the enclosing class.
         */
        private int smoothingAverageRadius = 2;

        /**
         * @return the number of clusters used by k-means
         */
        public int getNumberOfClusters() {
            return this.numberOfClusters;
        }

        /**
         * @param numberOfClusters the number of clusters used by k-means
         */
        public void setNumberOfClusters(int numberOfClusters) {
            this.numberOfClusters = numberOfClusters;
        }

        /**
         * @return the window size of the derivative feature
         */
        public int getAlphaWindowSizeFor2DModel() {
            return this.alphaWindowSizeFor2DModel;
        }

        /**
         * @param alphaWindowSizeFor2DModel the window size of the derivative
         *                                  feature; non-positive disables it
         */
        public void setAlphaWindowSizeFor2DModel(int alphaWindowSizeFor2DModel) {
            this.alphaWindowSizeFor2DModel = alphaWindowSizeFor2DModel;
        }

        /**
         * @return the stored smoothing radius
         */
        public int getSmoothingAverageRadius() {
            return this.smoothingAverageRadius;
        }

        /**
         * @param smoothingAverageRadius the smoothing radius to store
         */
        public void setSmoothingAverageRadius(int smoothingAverageRadius) {
            this.smoothingAverageRadius = smoothingAverageRadius;
        }
    }
}

