/*
 * Decompiled with CFR 0.152.
 */
package jsat.text;

import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import jsat.DataSet;
import jsat.SimpleDataSet;
import jsat.classifiers.CategoricalData;
import jsat.classifiers.DataPoint;
import jsat.datatransform.RemoveAttributeTransform;
import jsat.linear.SparseVector;
import jsat.linear.Vec;
import jsat.text.BasicTextVectorCreator;
import jsat.text.TextVectorCreator;
import jsat.text.tokenizer.Tokenizer;
import jsat.text.wordweighting.WordWeighting;
import jsat.utils.IntList;
import jsat.utils.IntSet;

/**
 * Base class for loaders that convert a collection of text documents into
 * sparse word-count vectors and then re-weight them with a
 * {@link WordWeighting}.
 * <p>
 * Subclasses implement {@link #initialLoad()} and call
 * {@link #addOriginalDocument(String)} once per document. The vocabulary
 * (word to column index) and the per-word document frequencies are kept in
 * concurrent maps so that documents may be added from several threads. Once
 * {@link #finishAdding()} has run, no more documents may be added and the
 * loader can be used as a {@link TextVectorCreator} for new text.
 */
public abstract class TextDataLoader
implements TextVectorCreator {
    private static final long serialVersionUID = -657253682338792871L;
    /** One vector per loaded document, in the order the documents were stored. */
    protected final List<SparseVector> vectors;
    /** Splits raw text into word tokens. */
    protected Tokenizer tokenizer;
    /**
     * Maps each word to its column index. A negative value is a temporary
     * placeholder meaning another thread is still assigning the real index.
     */
    protected ConcurrentHashMap<String, Integer> wordIndex;
    /** Lazily built reverse lookup from column index to word. */
    protected List<String> allWords;
    /** Maps a word's column index to the number of documents containing it. */
    protected ConcurrentHashMap<Integer, AtomicInteger> termDocumentFrequencys;
    private WordWeighting weighting;
    /** Per-thread scratch buffer handed to the tokenizer. */
    protected ThreadLocal<StringBuilder> workSpace;
    /** Per-thread list that receives the tokens of the current document. */
    protected ThreadLocal<List<String>> storageSpace;
    /** Per-thread word-to-occurrence-count table for the current document. */
    protected ThreadLocal<Map<String, Integer>> wordCounts;
    private TextVectorCreator tvc;
    /** Set once the initial corpus is finalized; further additions are rejected. */
    protected boolean noMoreAdding;
    /** Number of distinct column indices handed out so far. */
    private final AtomicInteger currentLength = new AtomicInteger(0);
    /** Number of documents stored so far; only modified while holding the lock on {@link #vectors}. */
    private volatile int documents;

    /**
     * Creates a loader with an empty vocabulary and no documents.
     *
     * @param tokenizer the tokenizer used to split documents into words
     * @param weighting the weighting applied to the word counts once loading is finished
     */
    public TextDataLoader(Tokenizer tokenizer, WordWeighting weighting) {
        this.vectors = new ArrayList<SparseVector>();
        this.tokenizer = tokenizer;
        this.wordIndex = new ConcurrentHashMap<String, Integer>();
        this.termDocumentFrequencys = new ConcurrentHashMap<Integer, AtomicInteger>();
        this.weighting = weighting;
        this.allWords = new ArrayList<String>();
        this.noMoreAdding = false;
        this.workSpace = new ThreadLocal<StringBuilder>();
        this.storageSpace = new ThreadLocal<List<String>>();
        this.wordCounts = new ThreadLocal<Map<String, Integer>>();
    }

    /**
     * Loads the initial documents. Invoked by {@link #getDataSet()} before
     * {@link #finishAdding()} when the corpus has not been finalized yet.
     */
    public abstract void initialLoad();

    /**
     * Tokenizes one document, counts its words, registers any new words in
     * the vocabulary and stores the resulting count vector.
     *
     * @param text the raw text of the document
     * @return the number of documents that had been stored before this one
     * @throws RuntimeException if the initial data set has already been finalized
     */
    protected int addOriginalDocument(String text) {
        if (this.noMoreAdding) {
            throw new RuntimeException("Initial data set has been finalized");
        }
        StringBuilder localWorkSpace = this.workSpace.get();
        List<String> localStorageSpace = this.storageSpace.get();
        Map<String, Integer> localWordCounts = this.wordCounts.get();
        if (localWorkSpace == null) {
            // First document handled by this thread: create its scratch structures.
            localWorkSpace = new StringBuilder();
            localStorageSpace = new ArrayList<String>();
            localWordCounts = new LinkedHashMap<String, Integer>();
            this.workSpace.set(localWorkSpace);
            this.storageSpace.set(localStorageSpace);
            this.wordCounts.set(localWordCounts);
        }
        localWorkSpace.setLength(0);
        localStorageSpace.clear();
        localWordCounts.clear();
        this.tokenizer.tokenize(text, localWorkSpace, localStorageSpace);
        for (String word : localStorageSpace) {
            Integer count = localWordCounts.get(word);
            if (count == null) {
                localWordCounts.put(word, 1);
                continue;
            }
            localWordCounts.put(word, count + 1);
        }
        SparseVector vec = new SparseVector(this.currentLength.get() + 1, localWordCounts.size());
        boolean interrupted = false;
        try {
            for (Map.Entry<String, Integer> entry : localWordCounts.entrySet()) {
                String word = entry.getKey();
                int msToSleep = 1;
                // addWord fails only while another thread is still assigning the
                // word's index; back off (capped at 100 ms) and retry.
                while (!this.addWord(word, vec, entry.getValue())) {
                    try {
                        Thread.sleep(msToSleep);
                        msToSleep = Math.min(100, msToSleep * 2);
                    }
                    catch (InterruptedException ex) {
                        // The document must still be completed, so keep retrying
                        // and restore the interrupt status once we are done.
                        interrupted = true;
                    }
                }
            }
        }
        finally {
            if (interrupted) {
                Thread.currentThread().interrupt();
            }
        }
        localWordCounts.clear();
        synchronized (this.vectors) {
            this.vectors.add(vec);
            return this.documents++;
        }
    }

    /**
     * Records one word of a document: resolves (or assigns) the word's column
     * index, bumps that word's document frequency by exactly one and writes
     * the occurrence count into the document vector.
     *
     * @param word the word to record
     * @param vec the vector of the document being built
     * @param value the number of times the word occurs in the document
     * @return {@code false} if the word's index is still being assigned by
     * another thread and the caller should retry, {@code true} otherwise
     */
    private boolean addWord(String word, SparseVector vec, Integer value) {
        Integer index = this.wordIndex.get(word);
        if (index == null) {
            // Claim the word with a placeholder; only the thread that wins the
            // race draws a fresh index and overwrites the placeholder.
            index = this.wordIndex.putIfAbsent(word, -1);
            if (index == null) {
                index = this.currentLength.getAndIncrement();
                this.wordIndex.put(word, index);
            }
        }
        if (index < 0) {
            return false;
        }
        // wordIndex and termDocumentFrequencys are not updated together, so the
        // counter may be missing even for a known index. Always start it at
        // zero so the single increment below counts this document once.
        AtomicInteger docFrequency = this.termDocumentFrequencys.get(index);
        if (docFrequency == null) {
            AtomicInteger fresh = new AtomicInteger(0);
            docFrequency = this.termDocumentFrequencys.putIfAbsent(index, fresh);
            if (docFrequency == null) {
                docFrequency = fresh;
            }
        }
        docFrequency.incrementAndGet();
        // The vector may have been sized before this index was handed out.
        if (vec.length() <= index) {
            vec.setLength(index + 1);
        }
        vec.set(index, value.intValue());
        return true;
    }

    /**
     * Finalizes the initial corpus: forbids further additions, releases the
     * per-thread scratch space, resizes every document vector to the final
     * vocabulary size and applies the word weighting to each vector.
     */
    protected void finishAdding() {
        this.noMoreAdding = true;
        this.workSpace = null;
        this.storageSpace = null;
        this.wordCounts = null;
        int finalLength = this.currentLength.get();
        int[] frqs = new int[finalLength];
        for (Map.Entry<Integer, AtomicInteger> entry : this.termDocumentFrequencys.entrySet()) {
            frqs[entry.getKey().intValue()] = entry.getValue().get();
        }
        for (SparseVector vec : this.vectors) {
            vec.setLength(finalLength);
        }
        this.weighting.setWeight(this.vectors, IntList.view(frqs, finalLength));
        Logger.getLogger(TextDataLoader.class.getName()).log(Level.FINE, "Final Length: " + finalLength);
        for (SparseVector vec : this.vectors) {
            this.weighting.applyTo(vec);
        }
    }

    /**
     * Returns a data set with one data point per loaded document. If the
     * corpus has not been finalized yet, the documents are loaded and
     * finalized first.
     *
     * @return a new data set backed by the loaded document vectors
     */
    public DataSet getDataSet() {
        if (!this.noMoreAdding) {
            this.initialLoad();
            this.finishAdding();
        }
        ArrayList<DataPoint> dataPoints = new ArrayList<DataPoint>(this.vectors.size());
        for (SparseVector vec : this.vectors) {
            dataPoints.add(new DataPoint(vec, new int[0], new CategoricalData[0]));
        }
        return new SimpleDataSet(dataPoints);
    }

    /**
     * Converts new text into a vector using the finalized vocabulary.
     *
     * @param text the text to convert
     * @return the vector produced by {@link #getTextVectorCreator()}
     * @throws RuntimeException if the initial documents have not been loaded yet
     */
    @Override
    public Vec newText(String text) {
        if (!this.noMoreAdding) {
            throw new RuntimeException("Initial documents have not yet loaded");
        }
        return this.getTextVectorCreator().newText(text);
    }

    /**
     * Converts new text into a vector using the finalized vocabulary and
     * caller-supplied scratch space.
     *
     * @param input the text to convert
     * @param workSpace scratch buffer passed on to the underlying creator
     * @param storageSpace scratch token list passed on to the underlying creator
     * @return the vector produced by {@link #getTextVectorCreator()}
     * @throws RuntimeException if the initial documents have not been loaded yet
     */
    @Override
    public Vec newText(String input, StringBuilder workSpace, List<String> storageSpace) {
        if (!this.noMoreAdding) {
            throw new RuntimeException("Initial documents have not yet loaded");
        }
        return this.getTextVectorCreator().newText(input, workSpace, storageSpace);
    }

    /**
     * Returns the vector creator built from this loader's tokenizer,
     * vocabulary and weighting. The creator is constructed on first use and
     * reused afterwards.
     *
     * @return the text vector creator for the finalized vocabulary
     * @throws RuntimeException if the initial documents have not been loaded yet
     */
    public TextVectorCreator getTextVectorCreator() {
        if (!this.noMoreAdding) {
            throw new RuntimeException("Initial documents have not yet loaded");
        }
        if (this.tvc == null) {
            this.tvc = new BasicTextVectorCreator(this.tokenizer, this.wordIndex, this.weighting);
        }
        return this.tvc;
    }

    /**
     * Looks up the word stored at a column index. The reverse lookup table is
     * rebuilt whenever its size differs from the size of the vocabulary.
     *
     * @param index the column index to look up
     * @return the word at that index, or {@code null} if the index is out of range
     */
    public String getWordForIndex(int index) {
        if (this.allWords.size() != this.wordIndex.size()) {
            // Pad first so every index can be assigned with set().
            while (this.allWords.size() < this.wordIndex.size()) {
                this.allWords.add("");
            }
            for (Map.Entry<String, Integer> entry : this.wordIndex.entrySet()) {
                this.allWords.set(entry.getValue(), entry.getKey());
            }
        }
        if (index >= 0 && index < this.allWords.size()) {
            return this.allWords.get(index);
        }
        return null;
    }

    /**
     * Returns the number of loaded documents that contained the word with
     * the given column index.
     *
     * @param index the column index of the word
     * @return the recorded document frequency, or 0 if nothing has been
     * recorded for that index
     */
    public int getTermFrequency(int index) {
        AtomicInteger frequency = this.termDocumentFrequencys.get(index);
        return frequency == null ? 0 : frequency.get();
    }

    /**
     * Builds a transform that removes every numeric attribute whose word
     * appeared in fewer than {@code minCount} documents.
     *
     * @param minCount the minimum document frequency a word needs to be kept
     * @return a transform removing the too-rare word columns and no categorical attributes
     */
    public RemoveAttributeTransform getMinimumOccurrenceDTF(int minCount) {
        IntSet numericToRemove = new IntSet();
        // Walk the recorded entries instead of assuming the keys are exactly 0..size-1.
        for (Map.Entry<Integer, AtomicInteger> entry : this.termDocumentFrequencys.entrySet()) {
            if (entry.getValue().get() < minCount) {
                numericToRemove.add(entry.getKey());
            }
        }
        return new RemoveAttributeTransform(Collections.<Integer>emptySet(), numericToRemove);
    }
}

