/*
 * Decompiled with CFR 0.152.
 */
package com.datumbox.framework.core.common.text.parsers;

import com.datumbox.framework.core.common.text.StringCleaner;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based helpers for extracting text and metadata from HTML markup.
 *
 * <p>All methods are static and keep no mutable state; the compiled patterns and
 * the entity lookup table are built once when the class is loaded. The methods
 * do not accept {@code null} input and throw {@link NullPointerException} for it.
 * This is a lightweight scraper, not a conforming HTML parser: malformed markup
 * is handled on a best-effort basis.
 */
public class HTMLParser {
    /** Flags shared by every tag pattern: ignore case and let {@code .} span line breaks. */
    private static final int PATTERN_FLAGS = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;

    /**
     * Matches an {@code <img>} tag and captures the value of its {@code alt} or
     * {@code title} attribute. The attribute name is a real alternation; a
     * character class such as {@code [alt|title]} would match any attribute whose
     * name merely ends in one of those letters (height=, style=, type=, ...).
     */
    private static final Pattern IMG_ALT_TITLE_PATTERN = Pattern.compile("<[\\s]*img[^>]*\\s(?:alt|title)[\\s]*=[\\s]*[\\\"']?([^>\\\"']+)[\\\"']?[^>]*>", PATTERN_FLAGS);
    /** Matches elements whose content is never visible text (scripts, styles, head, ...). */
    private static final Pattern NON_TEXT_TAGS_PATTERN = Pattern.compile("<[\\s]*(head|style|script|object|embed|applet|noframes|noscript|noembed|option)[^>]*?>.*?</\\1>", PATTERN_FLAGS);
    /** Captures a tag name (group 1) and an optional self-closing slash (group 2). */
    private static final Pattern REMOVE_ATTRIBUTES_PATTERN = Pattern.compile("<([a-z!][a-z0-9]*)[^>]*?(/?)>", PATTERN_FLAGS);
    private static final Pattern TITLE_PATTERN = Pattern.compile("<title[^>]*>(.*?)</title>", PATTERN_FLAGS);
    /** Captures the href value (group 1) and the anchor body (group 2) of a link. */
    private static final Pattern HYPERLINK_PATTERN = Pattern.compile("<[\\s]*a[^>]*href[\\s]*=[\\s]*[\\\"']([^\\\"']*)[\\\"'][^>]*>(.*?)</a>", PATTERN_FLAGS);
    /** Captures the name (group 1) and content (group 2) of a meta tag, in that attribute order. */
    private static final Pattern METATAG_PATTERN = Pattern.compile("<[\\s]*meta[^>]*name[\\s]*=[\\s]*[\\\"']([^\\\"']*)[\\\"'][^>]*content[\\s]*=[\\s]*[\\\"']([^\\\"']*)[\\\"'][^>]*>", PATTERN_FLAGS);
    /** Captures the heading tag name (group 1) and its body (group 2). */
    private static final Pattern HX_PATTERN = Pattern.compile("<[\\s]*(H[1-6])[^>]*?>(.*?)</\\1>", PATTERN_FLAGS);
    /** Matches an HTML comment, including comments that span several lines. */
    private static final Pattern COMMENT_PATTERN = Pattern.compile("<!--.*?-->", Pattern.DOTALL);
    /** Matches any tag; DOTALL so that tags with attributes on several lines are matched too. */
    private static final Pattern ANY_TAG_PATTERN = Pattern.compile("\\<.*?>", Pattern.DOTALL);

    /** Pairs of {replacement character, entity name} for the named entities that are recognised. */
    private static final String[][] ESCAPES = new String[][]{
        {"\"", "quot"}, {"&", "amp"}, {"<", "lt"}, {">", "gt"},
        {"\u00a0", "nbsp"}, {"\u00a1", "iexcl"}, {"\u00a2", "cent"}, {"\u00a3", "pound"},
        {"\u00a4", "curren"}, {"\u00a5", "yen"}, {"\u00a6", "brvbar"}, {"\u00a7", "sect"},
        {"\u00a8", "uml"}, {"\u00a9", "copy"}, {"\u00aa", "ordf"}, {"\u00ab", "laquo"},
        {"\u00ac", "not"}, {"\u00ad", "shy"}, {"\u00ae", "reg"}, {"\u00af", "macr"},
        {"\u00b0", "deg"}, {"\u00b1", "plusmn"}, {"\u00b2", "sup2"}, {"\u00b3", "sup3"},
        {"\u00b4", "acute"}, {"\u00b5", "micro"}, {"\u00b6", "para"}, {"\u00b7", "middot"},
        {"\u00b8", "cedil"}, {"\u00b9", "sup1"}, {"\u00ba", "ordm"}, {"\u00bb", "raquo"},
        {"\u00bc", "frac14"}, {"\u00bd", "frac12"}, {"\u00be", "frac34"}, {"\u00bf", "iquest"},
        {"\u00c0", "Agrave"}, {"\u00c1", "Aacute"}, {"\u00c2", "Acirc"}, {"\u00c3", "Atilde"},
        {"\u00c4", "Auml"}, {"\u00c5", "Aring"}, {"\u00c6", "AElig"}, {"\u00c7", "Ccedil"},
        {"\u00c8", "Egrave"}, {"\u00c9", "Eacute"}, {"\u00ca", "Ecirc"}, {"\u00cb", "Euml"},
        {"\u00cc", "Igrave"}, {"\u00cd", "Iacute"}, {"\u00ce", "Icirc"}, {"\u00cf", "Iuml"},
        {"\u00d0", "ETH"}, {"\u00d1", "Ntilde"}, {"\u00d2", "Ograve"}, {"\u00d3", "Oacute"},
        {"\u00d4", "Ocirc"}, {"\u00d5", "Otilde"}, {"\u00d6", "Ouml"}, {"\u00d7", "times"},
        {"\u00d8", "Oslash"}, {"\u00d9", "Ugrave"}, {"\u00da", "Uacute"}, {"\u00db", "Ucirc"},
        {"\u00dc", "Uuml"}, {"\u00dd", "Yacute"}, {"\u00de", "THORN"}, {"\u00df", "szlig"},
        {"\u00e0", "agrave"}, {"\u00e1", "aacute"}, {"\u00e2", "acirc"}, {"\u00e3", "atilde"},
        {"\u00e4", "auml"}, {"\u00e5", "aring"}, {"\u00e6", "aelig"}, {"\u00e7", "ccedil"},
        {"\u00e8", "egrave"}, {"\u00e9", "eacute"}, {"\u00ea", "ecirc"}, {"\u00eb", "euml"},
        {"\u00ec", "igrave"}, {"\u00ed", "iacute"}, {"\u00ee", "icirc"}, {"\u00ef", "iuml"},
        {"\u00f0", "eth"}, {"\u00f1", "ntilde"}, {"\u00f2", "ograve"}, {"\u00f3", "oacute"},
        {"\u00f4", "ocirc"}, {"\u00f5", "otilde"}, {"\u00f6", "ouml"}, {"\u00f7", "divide"},
        {"\u00f8", "oslash"}, {"\u00f9", "ugrave"}, {"\u00fa", "uacute"}, {"\u00fb", "ucirc"},
        {"\u00fc", "uuml"}, {"\u00fd", "yacute"}, {"\u00fe", "thorn"}, {"\u00ff", "yuml"}
    };
    /** Shortest entity body (the text between '&' and ';') that is considered. */
    private static final int MIN_ESCAPE = 2;
    /** Longest entity body (the text between '&' and ';') that is considered. */
    private static final int MAX_ESCAPE = 6;
    /** Entity name to replacement text, derived from {@link #ESCAPES}. */
    private static final Map<String, String> LOOKUP_MAP = buildLookupMap();

    /** Builds the entity-name lookup table from {@link #ESCAPES}. */
    private static Map<String, String> buildLookupMap() {
        Map<String, String> lookup = new HashMap<String, String>(ESCAPES.length * 2);
        for (String[] pair : ESCAPES) {
            lookup.put(pair[1], pair[0]);
        }
        return lookup;
    }

    /**
     * Replaces named entities (those listed in {@link #ESCAPES}) and numeric
     * character references (decimal {@code &#65;} or hexadecimal {@code &#x41;})
     * with the characters they denote. Anything that is not a recognised,
     * semicolon-terminated entity of {@link #MIN_ESCAPE}..{@link #MAX_ESCAPE}
     * characters is copied through unchanged.
     *
     * @param input the text to unescape; must not be {@code null}
     * @return the unescaped text, or {@code input} itself when nothing was replaced
     */
    private static String unescapeHtml(String input) {
        final int len = input.length();
        StringBuilder writer = null; // allocated lazily, only once a replacement is made
        int i = 1;                   // index of the character right after a candidate '&'
        int st = 0;                  // start of the region not yet copied to the writer

        while (true) {
            // Advance until i sits just past an ampersand.
            while (i < len && input.charAt(i - 1) != '&') {
                ++i;
            }
            if (i >= len) {
                break;
            }

            // Look for the terminating ';' within the allowed entity length.
            int j = i;
            while (j < len && j < i + MAX_ESCAPE + 1 && input.charAt(j) != ';') {
                ++j;
            }
            if (j == len || j < i + MIN_ESCAPE || j == i + MAX_ESCAPE + 1) {
                // No terminator, or the body is too short or too long: not an entity.
                ++i;
                continue;
            }

            // j >= i + MIN_ESCAPE guarantees at least one character after a leading '#'.
            final String replacement = input.charAt(i) == '#'
                    ? decodeNumericEntity(input, i + 1, j)
                    : LOOKUP_MAP.get(input.substring(i, j));
            if (replacement == null) {
                ++i;
                continue;
            }

            if (writer == null) {
                writer = new StringBuilder(len);
            }
            writer.append(input, st, i - 1); // text before the '&'
            writer.append(replacement);
            st = j + 1;                      // resume right after the ';'
            i = st;
        }

        if (writer == null) {
            return input;
        }
        writer.append(input, st, len);
        return writer.toString();
    }

    /**
     * Decodes the digits of a numeric character reference.
     *
     * @param input the text being unescaped
     * @param start index of the first character after {@code '#'}; must be less than {@code end}
     * @param end index of the terminating {@code ';'}
     * @return the referenced character(s), or {@code null} when the digits are
     *         malformed or do not denote a valid Unicode code point
     */
    private static String decodeNumericEntity(String input, int start, int end) {
        int radix = 10;
        final char marker = input.charAt(start);
        if (marker == 'x' || marker == 'X') {
            ++start;
            radix = 16;
        }
        final int codePoint;
        try {
            codePoint = Integer.parseInt(input.substring(start, end), radix);
        } catch (NumberFormatException ignored) {
            // Malformed digits: the caller leaves the text as it is.
            return null;
        }
        if (!Character.isValidCodePoint(codePoint)) {
            return null;
        }
        // Emit the character itself (a surrogate pair above the BMP), never the number's digits.
        return new String(Character.toChars(codePoint));
    }

    /**
     * Replaces every {@code <img>} tag that carries an {@code alt} or {@code title}
     * attribute with that attribute's value, padded with one space on each side.
     * When a tag has both attributes, the one appearing last in the tag is used.
     *
     * @param html the markup to process
     * @return the markup with matching image tags replaced by their text
     */
    public static String replaceImgWithAlt(String html) {
        return IMG_ALT_TITLE_PATTERN.matcher(html).replaceAll(" $1 ");
    }

    /**
     * Deletes all HTML comments, including multi-line ones.
     *
     * @param html the markup to process
     * @return the markup without comments
     */
    public static String removeComments(String html) {
        return COMMENT_PATTERN.matcher(html).replaceAll("");
    }

    /**
     * Replaces every tag with a single space. "Unsafe" because the content of
     * script, style and similar elements is kept as if it were text.
     *
     * @param html the markup to process
     * @return the markup with each tag replaced by a space
     */
    public static String unsafeRemoveAllTags(String html) {
        return ANY_TAG_PATTERN.matcher(html).replaceAll(" ");
    }

    /**
     * Removes comments and non-text elements together with their content, then
     * replaces every remaining tag with a space.
     *
     * @param html the markup to process
     * @return the text left after all tags are stripped
     */
    public static String safeRemoveAllTags(String html) {
        return unsafeRemoveAllTags(removeNonTextTags(html));
    }

    /**
     * Removes comments and then every element listed in
     * {@link #NON_TEXT_TAGS_PATTERN}, replacing each such element with a space.
     */
    private static String removeNonTextTags(String html) {
        return NON_TEXT_TAGS_PATTERN.matcher(removeComments(html)).replaceAll(" ");
    }

    /**
     * Removes comments and non-text elements, strips the attributes from every
     * remaining tag (keeping the tag name and any self-closing slash) and finally
     * unescapes HTML entities.
     *
     * @param html the markup to process
     * @return the simplified markup
     */
    public static String removeNonTextTagsAndAttributes(String html) {
        String stripped = REMOVE_ATTRIBUTES_PATTERN.matcher(removeNonTextTags(html)).replaceAll("<$1$2>");
        return unescapeHtml(stripped);
    }

    /**
     * Extracts the readable text of a page: image tags are replaced by their
     * alt/title text, all markup is removed and entities are unescaped.
     * Whitespace is not normalised.
     *
     * @param html the markup to process
     * @return the extracted text
     */
    public static String extractText(String html) {
        String text = replaceImgWithAlt(html);
        text = safeRemoveAllTags(text);
        return unescapeHtml(text);
    }

    /** Strips tags, unescapes entities and passes the result to {@code StringCleaner.removeExtraSpaces}. */
    private static String clear(String html) {
        return StringCleaner.removeExtraSpaces(unescapeHtml(unsafeRemoveAllTags(html)));
    }

    /**
     * Extracts the text of the first {@code <title>} element.
     *
     * @param html the markup to search
     * @return the cleaned title text, or {@code null} when there is no title element
     */
    public static String extractTitle(String html) {
        Matcher m = TITLE_PATTERN.matcher(html);
        if (!m.find()) {
            return null;
        }
        // The whole match is passed on; clear() strips the surrounding title tags.
        return clear(m.group(0));
    }

    /**
     * Collects every hyperlink that has a quoted {@code href} attribute.
     *
     * @param html the markup to search
     * @return a map with one list per {@link HyperlinkPart}; the three lists are
     *         parallel, so index {@code n} of each describes the same link. The
     *         values are the raw matched text (not cleaned or unescaped).
     */
    public static Map<HyperlinkPart, List<String>> extractHyperlinks(String html) {
        List<String> tags = new ArrayList<String>();
        List<String> urls = new ArrayList<String>();
        List<String> anchorTexts = new ArrayList<String>();

        Map<HyperlinkPart, List<String>> hyperlinksMap = new HashMap<HyperlinkPart, List<String>>();
        hyperlinksMap.put(HyperlinkPart.HTMLTAG, tags);
        hyperlinksMap.put(HyperlinkPart.URL, urls);
        hyperlinksMap.put(HyperlinkPart.ANCHORTEXT, anchorTexts);

        Matcher m = HYPERLINK_PATTERN.matcher(html);
        while (m.find()) {
            tags.add(m.group(0));
            urls.add(m.group(1));
            anchorTexts.add(m.group(2));
        }
        return hyperlinksMap;
    }

    /**
     * Collects {@code <meta name="..." content="...">} tags. Only tags with
     * quoted values and with {@code name} preceding {@code content} are matched;
     * when a name occurs more than once the last occurrence wins.
     *
     * @param html the markup to search
     * @return a map from cleaned meta name to cleaned content
     */
    public static Map<String, String> extractMetatags(String html) {
        Map<String, String> metatagsMap = new HashMap<String, String>();
        Matcher m = METATAG_PATTERN.matcher(html);
        while (m.find()) {
            metatagsMap.put(clear(m.group(1)), clear(m.group(2)));
        }
        return metatagsMap;
    }

    /**
     * Collects the cleaned text of all {@code h1}..{@code h6} headings.
     *
     * @param html the markup to search
     * @return a map with the keys {@code "H1"}..{@code "H6"} (always present),
     *         each mapped to the headings of that level in document order
     */
    public static Map<String, List<String>> extractHTMLheaders(String html) {
        Map<String, List<String>> hxtagsMap = new HashMap<String, List<String>>();
        for (int level = 1; level <= 6; ++level) {
            hxtagsMap.put("H" + level, new ArrayList<String>());
        }
        Matcher m = HX_PATTERN.matcher(html);
        while (m.find()) {
            // The pattern is case-insensitive, so normalise the tag name to the map's key form.
            String tagType = m.group(1).toUpperCase(Locale.ENGLISH);
            hxtagsMap.get(tagType).add(clear(m.group(2)));
        }
        return hxtagsMap;
    }

    /** The pieces of a hyperlink reported by {@link #extractHyperlinks(String)}. */
    public static enum HyperlinkPart {
        /** The complete matched anchor element, from the opening to the closing tag. */
        HTMLTAG,
        /** The value of the href attribute. */
        URL,
        /** The raw content between the opening and closing anchor tags. */
        ANCHORTEXT;

    }
}

