/*
 * Decompiled with CFR 0.152.
 */
package smile.nlp.tokenizer;

import java.util.ArrayList;
import java.util.regex.Pattern;
import smile.nlp.tokenizer.EnglishAbbreviations;
import smile.nlp.tokenizer.Tokenizer;

/**
 * A regular-expression based word tokenizer.
 *
 * <p>The input text is padded with spaces around punctuation and then split
 * on whitespace. Optionally, English contractions are first rewritten so that
 * their parts become separate tokens (for example {@code won't} becomes
 * {@code will not} and {@code I'm} becomes {@code I 'm}).
 *
 * <p>Instances hold no mutable state after construction.
 */
public class SimpleTokenizer implements Tokenizer {
    /** Matches "won't" (any case); rewritten as group 1 + "ill not". */
    private static final Pattern WONT_CONTRACTION = Pattern.compile("(?i)\\b(w)(on't)\\b");

    /** Matches "shan't" (any case); rewritten as group 1 + "ll not". */
    private static final Pattern SHANT_CONTRACTION = Pattern.compile("(?i)\\b(sha)(n't)\\b");

    /** Matches "ain't" (any case); rewritten as group 1 + "m not". */
    private static final Pattern AINT_CONTRACTION = Pattern.compile("(?i)\\b(a)(in't)\\b");

    /**
     * Negative contractions rewritten as group 1 + " not": first "can't" and
     * "cannot", then any remaining word ending in "n't".
     */
    private static final Pattern[] NOT_CONTRACTIONS = new Pattern[]{
        Pattern.compile("(?i)\\b(can)('t|not)\\b"),
        Pattern.compile("(?i)(.)(n't)\\b")
    };

    /** Two-part contractions; a space is inserted between group 1 and group 2. */
    private static final Pattern[] CONTRACTIONS2 = new Pattern[]{
        Pattern.compile("(?i)(.)('ll|'re|'ve|'s|'m|'d)\\b"),
        Pattern.compile("(?i)\\b(D)('ye)\\b"),
        Pattern.compile("(?i)\\b(Gim)(me)\\b"),
        Pattern.compile("(?i)\\b(Gon)(na)\\b"),
        Pattern.compile("(?i)\\b(Got)(ta)\\b"),
        Pattern.compile("(?i)\\b(Lem)(me)\\b"),
        Pattern.compile("(?i)\\b(Mor)('n)\\b"),
        Pattern.compile("(?i)\\b(T)(is)\\b"),
        Pattern.compile("(?i)\\b(T)(was)\\b"),
        Pattern.compile("(?i)\\b(Wan)(na)\\b")
    };

    /**
     * Three-part contractions; spaces are inserted between the three groups.
     *
     * <p>The groups of each pattern concatenate to the word being split:
     * "Wha" + "dd" + "ya" = "Whaddya" and "Wha" + "t" + "cha" = "Whatcha".
     * (The first pattern previously read {@code (Whad)(dd)(ya)}, which only
     * matched the misspelling "Whadddya" and never the actual word.)
     */
    private static final Pattern[] CONTRACTIONS3 = new Pattern[]{
        Pattern.compile("(?i)\\b(Wha)(dd)(ya)\\b"),
        Pattern.compile("(?i)\\b(Wha)(t)(cha)\\b")
    };

    /**
     * Punctuation rules, applied in index order by {@link #split(String)}:
     * <ol start="0">
     *   <li>any character other than a word character or one of
     *       {@code . ' - / , &} is surrounded by spaces;</li>
     *   <li>a comma followed by whitespace gets a space in front of it;</li>
     *   <li>an apostrophe followed by whitespace gets a space in front of it;</li>
     *   <li>a period followed by optional spaces and a newline or the end of
     *       the text is replaced by {@code " . "};</li>
     *   <li>a run of three or more periods is surrounded by spaces.</li>
     * </ol>
     */
    private static final Pattern[] DELIMITERS = new Pattern[]{
        Pattern.compile("((?U)[^\\w\\.\\'\\-\\/,&])"),
        Pattern.compile("(?U)(,\\s)"),
        Pattern.compile("(?U)('\\s)"),
        Pattern.compile("(?U)\\. *(\\n|$)"),
        Pattern.compile("(?U)(\\.{3,})")
    };

    /** One or more whitespace characters (Unicode-aware); the final token separator. */
    private static final Pattern WHITESPACE = Pattern.compile("(?U)\\s+");

    /** Whether contractions are rewritten into separate words before tokenizing. */
    private final boolean splitContraction;

    /**
     * Creates a tokenizer that leaves contractions untouched.
     */
    public SimpleTokenizer() {
        this(false);
    }

    /**
     * Creates a tokenizer.
     *
     * @param splitContraction if {@code true}, contractions are rewritten
     *                         into their parts before the text is tokenized.
     */
    public SimpleTokenizer(boolean splitContraction) {
        this.splitContraction = splitContraction;
    }

    /**
     * Splits the text into word and punctuation tokens.
     *
     * @param text the text to tokenize.
     * @return the non-empty tokens in their order of appearance.
     */
    @Override
    public String[] split(String text) {
        String padded = splitContraction ? expandContractions(text) : text;

        // Pad punctuation with spaces so the whitespace split isolates it.
        padded = DELIMITERS[0].matcher(padded).replaceAll(" $1 ");
        padded = DELIMITERS[1].matcher(padded).replaceAll(" $1");
        padded = DELIMITERS[2].matcher(padded).replaceAll(" $1");
        padded = DELIMITERS[3].matcher(padded).replaceAll(" . ");
        padded = DELIMITERS[4].matcher(padded).replaceAll(" $1 ");

        String[] words = WHITESPACE.split(padded);

        // If the text ends with a period right after a known abbreviation,
        // give the abbreviation its period back. The standalone "." token
        // is kept as well.
        int last = words.length - 1;
        if (last > 0 && ".".equals(words[last]) && EnglishAbbreviations.contains(words[last - 1])) {
            words[last - 1] = words[last - 1] + ".";
        }

        // The split may yield empty strings (e.g. a leading one when the
        // padded text starts with whitespace); drop them.
        ArrayList<String> tokens = new ArrayList<String>(words.length);
        for (String word : words) {
            if (!word.isEmpty()) {
                tokens.add(word);
            }
        }
        return tokens.toArray(new String[0]);
    }

    /**
     * Rewrites the contractions in the text so that their parts are
     * separated by spaces.
     */
    private static String expandContractions(String text) {
        // The irregular negatives must run before the generic "n't" rule.
        String expanded = WONT_CONTRACTION.matcher(text).replaceAll("$1ill not");
        expanded = SHANT_CONTRACTION.matcher(expanded).replaceAll("$1ll not");
        expanded = AINT_CONTRACTION.matcher(expanded).replaceAll("$1m not");
        expanded = replaceEach(NOT_CONTRACTIONS, expanded, "$1 not");
        expanded = replaceEach(CONTRACTIONS2, expanded, "$1 $2");
        return replaceEach(CONTRACTIONS3, expanded, "$1 $2 $3");
    }

    /**
     * Applies every pattern in turn to the text, replacing all matches with
     * the given replacement template.
     */
    private static String replaceEach(Pattern[] patterns, String text, String replacement) {
        String current = text;
        for (Pattern pattern : patterns) {
            current = pattern.matcher(current).replaceAll(replacement);
        }
        return current;
    }
}

