/*
 * Decompiled with CFR 0.152.
 */
package jsat.text.tokenizer;

import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import jsat.text.tokenizer.Tokenizer;

/**
 * A {@link Tokenizer} decorator that delegates tokenization to another
 * tokenizer and then removes every token that appears in a configured set of
 * stop words.
 *
 * <p>Stop words are matched with {@link String#equals(Object)} (via set
 * membership), so matching is exact and case-sensitive; any case folding has
 * to be done by the base tokenizer.
 */
public class StopWordTokenizer
implements Tokenizer {
    // NOTE(review): presumably Tokenizer extends Serializable, which is what
    // makes this identifier meaningful - confirm against the interface.
    private static final long serialVersionUID = 445704970760705567L;

    /** Tokenizer that produces the raw tokens before stop-word filtering. */
    private final Tokenizer base;

    /** Private copy of the stop words; tokens equal to any entry are dropped. */
    private final Set<String> stopWords;

    /**
     * An unmodifiable set holding every single lowercase letter {@code a}
     * through {@code z} together with a short list of common English words.
     */
    public static final Set<String> ENGLISH_STOP_SMALL_BASE = Collections.unmodifiableSet(new HashSet<String>(Arrays.asList(
            "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z",
            "the", "of", "to", "and", "in", "is", "it", "you", "that", "was",
            "for", "are", "on", "as", "have", "with", "they", "be", "at",
            "this", "from", "or", "had", "by", "but", "some", "what", "there",
            "we", "can", "out", "other", "were", "all", "your", "when", "use",
            "word", "said", "an", "each", "which", "do", "their", "if", "will",
            "way", "about", "many", "them", "would", "thing", "than", "down",
            "too")));

    /**
     * Creates a tokenizer that filters the output of {@code base}.
     *
     * <p>The given stop words are copied into an internal set, so later
     * changes to the caller's collection do not affect this tokenizer.
     *
     * @param base the tokenizer that produces the unfiltered tokens
     * @param stopWords the tokens to remove from the base tokenizer's output
     * @throws NullPointerException if {@code base} or {@code stopWords} is
     *         {@code null}
     */
    public StopWordTokenizer(Tokenizer base, Collection<String> stopWords) {
        // Fail fast: with a null base every tokenize call would throw anyway,
        // just later and without a hint about the cause.
        if (base == null) {
            throw new NullPointerException("base tokenizer must not be null");
        }
        this.base = base;
        this.stopWords = new HashSet<String>(stopWords);
    }

    /**
     * Creates a tokenizer that filters the output of {@code base}, taking the
     * stop words as individual arguments or as an array.
     *
     * @param base the tokenizer that produces the unfiltered tokens
     * @param stopWords the tokens to remove from the base tokenizer's output
     * @throws NullPointerException if {@code base} or the {@code stopWords}
     *         array is {@code null}
     */
    public StopWordTokenizer(Tokenizer base, String ... stopWords) {
        this(base, Arrays.asList(stopWords));
    }

    /**
     * Tokenizes {@code input} with the base tokenizer and removes all stop
     * words from the result.
     *
     * <p>The list returned by the base tokenizer is filtered in place and
     * returned as-is; no copy is made, so that list must support element
     * removal.
     *
     * @param input the text to tokenize
     * @return the base tokenizer's list with every stop word removed
     */
    @Override
    public List<String> tokenize(String input) {
        List<String> tokens = this.base.tokenize(input);
        tokens.removeAll(this.stopWords);
        return tokens;
    }

    /**
     * Lets the base tokenizer tokenize {@code input} into
     * {@code storageSpace} and then removes all stop words from that list.
     *
     * <p>The removal is applied to the whole of {@code storageSpace}, so any
     * stop words that were already in the list before this call are removed
     * as well.
     *
     * @param input the text to tokenize
     * @param workSpace scratch buffer passed through to the base tokenizer
     * @param storageSpace the list handed to the base tokenizer and then
     *        filtered in place
     */
    @Override
    public void tokenize(String input, StringBuilder workSpace, List<String> storageSpace) {
        this.base.tokenize(input, workSpace, storageSpace);
        storageSpace.removeAll(this.stopWords);
    }
}

