package jsat.text;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicIntegerArray;
import jsat.DataSet;
import jsat.SimpleDataSet;
import jsat.classifiers.CategoricalData;
import jsat.classifiers.DataPoint;
import jsat.linear.SparseVector;
import jsat.linear.Vec;
import jsat.text.tokenizer.Tokenizer;
import jsat.text.wordweighting.WordWeighting;
import jsat.utils.IntList;

/* loaded from: input_file:jsat/text/HashedTextDataLoader.class */
/**
 * Base class for loaders that turn raw text documents into sparse vectors by
 * hashing every token into one of a fixed number of buckets.
 *
 * <p>Subclasses implement {@link #initialLoad()} and feed each document to
 * {@link #addOriginalDocument(String)}. Once {@link #finishAdding()} has run,
 * the configured {@link WordWeighting} has been applied to every stored vector
 * and the loader can hand out a {@link DataSet} or vectorize new text through
 * its {@link TextVectorCreator}.
 */
public abstract class HashedTextDataLoader implements TextVectorCreator {
    private static final long serialVersionUID = 8513621180409278670L;

    /** Number of hash buckets used when the caller does not specify one (2^22). */
    private static final int DEFAULT_DIMENSION_SIZE = 4194304;

    /** Number of hash buckets, i.e. the length of every produced vector. */
    private final int dimensionSize;
    private Tokenizer tokenizer;
    private WordWeighting weighting;
    /** Vectors of the documents added so far, in the order their ids were assigned. */
    protected List<SparseVector> vectors;
    /** Per-bucket running totals; set to {@code null} once adding has finished. */
    private AtomicIntegerArray termDocumentFrequencys;
    /** {@code true} once {@link #finishAdding()} has run; no documents may be added afterwards. */
    protected boolean noMoreAdding;
    /** Number of documents added so far; only modified while holding the lock on {@link #vectors}. */
    private volatile int documents;
    /** Per-thread scratch buffer handed to the tokenizer. */
    protected ThreadLocal<StringBuilder> workSpace;
    /** Per-thread list that receives the tokens of the current document. */
    protected ThreadLocal<List<String>> storageSpace;
    /** Per-thread token-to-count map for the current document. */
    protected ThreadLocal<Map<String, Integer>> wordCounts;
    /** Creator used for new text after the initial data set has been finalized. */
    private TextVectorCreator tvc;

    /**
     * Creates a loader with the default number of hash buckets (4194304).
     *
     * @param tokenizer     splits documents into tokens
     * @param wordWeighting weighting applied to the vectors once loading finishes
     */
    public HashedTextDataLoader(Tokenizer tokenizer, WordWeighting wordWeighting) {
        this(DEFAULT_DIMENSION_SIZE, tokenizer, wordWeighting);
    }

    /**
     * Creates a loader with an explicit number of hash buckets.
     *
     * @param i             number of hash buckets / length of the produced vectors
     * @param tokenizer     splits documents into tokens
     * @param wordWeighting weighting applied to the vectors once loading finishes
     */
    public HashedTextDataLoader(int i, Tokenizer tokenizer, WordWeighting wordWeighting) {
        this.dimensionSize = i;
        this.tokenizer = tokenizer;
        this.weighting = wordWeighting;
        this.termDocumentFrequencys = new AtomicIntegerArray(i);
        this.vectors = new ArrayList<>();
        this.tvc = new HashedTextVectorCreator(i, tokenizer, wordWeighting);
        this.noMoreAdding = false;
        this.workSpace = new ThreadLocal<>();
        this.storageSpace = new ThreadLocal<>();
        this.wordCounts = new ThreadLocal<>();
    }

    /**
     * Loads the initial documents. {@link #getDataSet()} invokes this once,
     * immediately before {@link #finishAdding()}, if the loader has not been
     * finalized yet.
     */
    /* JADX INFO: Access modifiers changed from: protected */
    public abstract void initialLoad();

    /**
     * Tokenizes one document, hashes its tokens into a sparse count vector and
     * stores that vector.
     *
     * @param str the raw text of the document
     * @return the zero-based id of the document, i.e. the number of documents
     *         that had been added before this one
     * @throws RuntimeException if the loader has already been finalized
     */
    /* JADX INFO: Access modifiers changed from: protected */
    public int addOriginalDocument(String str) {
        if (this.noMoreAdding) {
            throw new RuntimeException("Initial data set has been finalized");
        }

        // Scratch objects are cached per thread; all three are created together,
        // so checking the first one is enough.
        StringBuilder buffer = this.workSpace.get();
        List<String> tokens = this.storageSpace.get();
        Map<String, Integer> counts = this.wordCounts.get();
        if (buffer == null) {
            buffer = new StringBuilder();
            tokens = new ArrayList<>();
            counts = new LinkedHashMap<>();
            this.workSpace.set(buffer);
            this.storageSpace.set(tokens);
            this.wordCounts.set(counts);
        }
        buffer.setLength(0);
        tokens.clear();
        // Start from an empty map even if a previous call on this thread failed
        // part-way through and left entries behind.
        counts.clear();

        this.tokenizer.tokenize(str, buffer, tokens);
        for (String token : tokens) {
            Integer seen = counts.get(token);
            counts.put(token, seen == null ? 1 : seen + 1);
        }

        SparseVector vector = new SparseVector(this.dimensionSize, counts.size());
        for (Map.Entry<String, Integer> entry : counts.entrySet()) {
            int bucket = bucketFor(entry.getKey());
            int count = entry.getValue();
            // NOTE(review): two distinct tokens of one document that hash to the
            // same bucket both call set(); presumably the later count replaces the
            // earlier one - confirm whether accumulating was intended.
            vector.set(bucket, count);
            // NOTE(review): the bucket total grows by the in-document count, not
            // by one per document - confirm against the WordWeighting contract.
            this.termDocumentFrequencys.addAndGet(bucket, count);
        }
        // Release the token strings held by the per-thread map.
        counts.clear();

        int id;
        synchronized (this.vectors) {
            this.vectors.add(vector);
            id = this.documents;
            this.documents = id + 1;
        }
        return id;
    }

    /**
     * Maps a token to its hash bucket in {@code [0, dimensionSize)}.
     *
     * <p>The remainder is taken before the absolute value because
     * {@code Math.abs(Integer.MIN_VALUE)} is still negative, so taking the
     * absolute value first could produce a negative index. For every other
     * hash code both orders give the same bucket.
     */
    private int bucketFor(String token) {
        return Math.abs(token.hashCode() % this.dimensionSize);
    }

    /**
     * Finalizes the initial data set: marks the loader as closed for adding,
     * drops the per-thread scratch state, hands the per-bucket totals to the
     * word weighting and applies that weighting to every stored vector.
     * Calling this method again after it has completed has no effect.
     */
    /* JADX INFO: Access modifiers changed from: protected */
    public void finishAdding() {
        if (this.termDocumentFrequencys == null) {
            // The totals are released at the end of the first call; a repeated
            // call must neither fail nor apply the weighting a second time.
            return;
        }
        this.noMoreAdding = true;
        this.workSpace = null;
        this.storageSpace = null;
        this.wordCounts = null;

        int[] frequencies = new int[this.dimensionSize];
        for (int bucket = 0; bucket < this.termDocumentFrequencys.length(); bucket++) {
            frequencies[bucket] = this.termDocumentFrequencys.get(bucket);
        }
        this.weighting.setWeight(this.vectors, IntList.unmodifiableView(frequencies, this.dimensionSize));
        for (SparseVector vector : this.vectors) {
            this.weighting.applyTo(vector);
        }
        this.termDocumentFrequencys = null;
    }

    /**
     * Returns the loaded documents as a data set, running
     * {@link #initialLoad()} and {@link #finishAdding()} first if the loader
     * has not been finalized yet.
     *
     * @return a data set with one data point per stored vector and no
     *         categorical features
     */
    public DataSet getDataSet() {
        if (!this.noMoreAdding) {
            initialLoad();
            finishAdding();
        }
        List<DataPoint> points = new ArrayList<>(this.vectors.size());
        for (SparseVector vector : this.vectors) {
            points.add(new DataPoint(vector, new int[0], new CategoricalData[0]));
        }
        return new SimpleDataSet(points);
    }

    /**
     * Vectorizes new text by delegating to {@link #getTextVectorCreator()}.
     *
     * @throws RuntimeException if the initial documents have not been loaded yet
     */
    @Override // jsat.text.TextVectorCreator
    public Vec newText(String str) {
        return getTextVectorCreator().newText(str);
    }

    /**
     * Vectorizes new text with caller-supplied scratch objects by delegating
     * to {@link #getTextVectorCreator()}.
     *
     * @throws RuntimeException if the initial documents have not been loaded yet
     */
    @Override // jsat.text.TextVectorCreator
    public Vec newText(String str, StringBuilder sb, List<String> list) {
        return getTextVectorCreator().newText(str, sb, list);
    }

    /**
     * Returns the creator used to vectorize text that is not part of the
     * initial data set.
     *
     * @return the text vector creator built in the constructor
     * @throws RuntimeException if the loader has not been finalized yet
     */
    public TextVectorCreator getTextVectorCreator() {
        if (this.noMoreAdding) {
            return this.tvc;
        }
        throw new RuntimeException("Initial documents have not yet loaded");
    }
}
