/*
 * Decompiled with CFR 0.152.
 */
package de.unihd.dbs.uima.annotator.treetagger;

import de.unihd.dbs.uima.annotator.heideltime.utilities.Logger;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.EnumSet;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Rule-based tokenizer that splits raw text into one token per list entry.
 *
 * <p>Punctuation, clitics (language dependent, see {@link Flag}) and sentence-final periods are
 * separated from the words they are attached to; anything that looks like an SGML/XML tag
 * ({@code <...>}) is passed through as a single token. Tokens listed in the optional
 * abbreviations file keep their trailing period.
 *
 * <p>NOTE(review): the rules appear to mirror the tokenizer script distributed with TreeTagger -
 * confirm against that script before changing any of the regular expressions.
 *
 * <p>Instances hold no per-call state; all regular expressions are compiled once.
 */
public class TreeTaggerTokenizer {
    /** Character-class body: punctuation that is split off the FRONT of a token. */
    private static final String P_CHAR = "\\[\u00bf\u00a1\\{\\(\\`\"\u201a\u201e\u2020\u2021\u2039\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u203a'";
    /** Character-class body: punctuation that is split off the END of a token. */
    private static final String F_CHAR = "\\]\\}\\'\\`\"\\),;:\\!\\?\\%\u201a\u201e\u2026\u2020\u2021\u2030\u2039\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u203a";

    /*
     * Internal marker characters. Unicode noncharacters are used so that the markers cannot
     * collide with real letters of the input; the previously used U+00FF / U+00FE are ordinary
     * letters (y with diaeresis, thorn) and were silently turned into token boundaries.
     *
     * SEPARATOR first protects the first blank inside a tag and, once that blank has been
     * restored, marks the boundaries between chunks. BLANK temporarily stands for every
     * whitespace character while the protected blanks are restored.
     */
    private static final String SEPARATOR = "\uffff";
    private static final String BLANK = "\ufffe";

    // --- input-line level ------------------------------------------------------------------
    private static final Pattern LINE_BREAKS_AND_TABS = Pattern.compile("[\r\n\t]");
    private static final Pattern FIRST_BLANK_IN_TAG = Pattern.compile("(<[^<> ]*) ([^<>]*>)");
    private static final Pattern ANY_BLANK = Pattern.compile("[\\u2000-\\u200A \\u202F\\u205F\\u3000\\u00A0\\u1680\\u180E]");
    private static final Pattern TAG = Pattern.compile("(<[^<>]*>)");
    private static final Pattern LEADING_SEPARATOR = Pattern.compile("^" + SEPARATOR);
    private static final Pattern TRAILING_SEPARATOR = Pattern.compile(SEPARATOR + "$");
    private static final Pattern REPEATED_SEPARATOR = Pattern.compile(SEPARATOR + SEPARATOR + SEPARATOR + "*");
    private static final Pattern TAG_CHUNK = Pattern.compile("^<.*>$");

    // --- chunk level -----------------------------------------------------------------------
    private static final Pattern THREE_PERIODS = Pattern.compile("\\.\\.\\.");
    private static final Pattern GLUED_AFTER_STRONG_PUNCT = Pattern.compile("([;\\!\\?])([^ ])");
    private static final Pattern GLUED_AFTER_WEAK_PUNCT = Pattern.compile("([.,:])([^ 0-9.])");

    // --- token level -----------------------------------------------------------------------
    private static final Pattern LEADING_PUNCT = Pattern.compile("^([" + P_CHAR + "])(.)");
    private static final Pattern TRAILING_PUNCT = Pattern.compile("(.)([" + F_CHAR + "])$");
    private static final Pattern PUNCT_BEFORE_PERIOD = Pattern.compile("([" + F_CHAR + "])\\.$");
    private static final Pattern DOTTED_LETTERS = Pattern.compile("^([A-Za-z-]\\.)+$");
    private static final Pattern ENDS_WITH_PERIOD = Pattern.compile("^(..*)\\.$");
    private static final Pattern NUMBER_WITH_PERIOD = Pattern.compile("^[0-9]+\\.$");
    private static final Pattern LEADING_DASHES = Pattern.compile("^(--)(.)");
    private static final Pattern TRAILING_DASHES = Pattern.compile("(.)(--)$");

    // --- result clean-up -------------------------------------------------------------------
    private static final Pattern LEADING_JUNK = Pattern.compile("^[\\p{javaWhitespace}\\p{gc=Cc}]+");
    private static final Pattern TRAILING_JUNK = Pattern.compile("[\\p{javaWhitespace}\\p{gc=Cc}]+$");

    /** Language switches; only GALICIAN is consulted again after construction. */
    EnumSet<Flag> flags = null;
    private File abbreviationsFile = null;
    /** Tokens that are emitted unchanged (a set, because it is probed for every token). */
    private final Set<String> abbreviations = new HashSet<String>();
    /** Matches a clitic at the start of a token plus the following character; null if unused. */
    private final Pattern proclitic;
    /** Matches a clitic at the end of a token plus the preceding character; null if unused. */
    private final Pattern enclitic;

    /**
     * Creates a tokenizer.
     *
     * @param abbreviationsFile path of a text file with one abbreviation per line (lines starting
     *                          with {@code #} and blank lines are skipped), or {@code null} for
     *                          no abbreviation list
     * @param flags             language switches; {@code null} is treated as an empty set
     * @throws RuntimeException if {@code abbreviationsFile} is given but does not exist or is
     *                          not readable
     */
    public TreeTaggerTokenizer(String abbreviationsFile, EnumSet<Flag> flags) throws RuntimeException {
        this.flags = flags != null ? flags : EnumSet.noneOf(Flag.class);
        if (abbreviationsFile != null) {
            this.abbreviationsFile = new File(abbreviationsFile);
            if (!this.abbreviationsFile.exists() || !this.abbreviationsFile.canRead()) {
                String message = "Couldn't read abbreviations file " + abbreviationsFile + " (exist:" + this.abbreviationsFile.exists() + ",read:" + this.abbreviationsFile.canRead() + ")";
                Logger.printError(this.getClass(), message);
                // Carry the diagnosis in the exception too, so it survives without the log.
                throw new RuntimeException(message);
            }
            loadAbbreviations(this.abbreviationsFile);
        }

        // Later flags deliberately override earlier ones, exactly in this order.
        String prefixClitics = "";
        String suffixClitics = "";
        if (this.flags.contains(Flag.ENGLISH)) {
            suffixClitics = "'(s|re|ve|d|m|em|ll)|n't";
        }
        if (this.flags.contains(Flag.ITALIAN)) {
            prefixClitics = "[dD][ae]ll'|[nN]ell'|[Aa]ll'|[lLDd]'|[Ss]ull'|[Qq]uest'|[Uu]n'|[Ss]enz'|[Tt]utt'";
        }
        if (this.flags.contains(Flag.FRENCH)) {
            prefixClitics = "[dcjlmnstDCJLNMST]'|[Qq]u'|[Jj]usqu'|[Ll]orsqu'";
            // NOTE(review): "-mmes?" and "-l" look like "-m(e circumflex)mes?" and "-l(a grave)"
            // that lost their accented letters somewhere - confirm before changing.
            suffixClitics = "-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m'|-moi|-nous|-on|-toi|-tu|-t'|-vous|-en|-y|-ci|-l";
        }
        if (this.flags.contains(Flag.GALICIAN)) {
            suffixClitics = "-la|-las|-lo|-los|-nos";
        }
        this.proclitic = prefixClitics.isEmpty() ? null : Pattern.compile("^(" + prefixClitics + ")(.)");
        this.enclitic = suffixClitics.isEmpty() ? null : Pattern.compile("(.)(" + suffixClitics + ")$");
    }

    /**
     * Reads the abbreviation list. Reading is best effort: an I/O failure is reported on stderr
     * and whatever was read up to that point is kept.
     */
    private void loadAbbreviations(File file) {
        // NOTE(review): FileReader decodes with the platform default charset - confirm that this
        // matches the encoding of the abbreviation files.
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            String line;
            while ((line = reader.readLine()) != null) {
                line = line.replaceAll("^[ \t\r\n]+", "");
                line = line.replaceAll("[ \t\r\n]+$", "");
                // The trimmed form of a blank line is "", which the comment/whitespace pattern
                // alone does not reject; skip it explicitly.
                if (line.isEmpty() || line.matches("^(#.*|\\s$)")) {
                    continue;
                }
                this.abbreviations.add(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Splits {@code in} into tokens.
     *
     * @param in text to tokenize; must not be {@code null}
     * @return the tokens in document order, each stripped of leading/trailing whitespace and
     *         control characters; tag-like chunks ({@code <...>}) are returned as one token.
     *         The list is never empty: if nothing was emitted it holds one empty string.
     */
    public List<String> tokenize(String in) {
        StringBuilder outBuf = new StringBuilder();
        for (String rawLine : in.split("\n")) {
            String text = LINE_BREAKS_AND_TABS.matcher(rawLine).replaceAll(" ");
            // Protect the first blank inside each tag, turn every other blank into a marker,
            // then restore the protected blank and promote the markers to chunk separators.
            text = FIRST_BLANK_IN_TAG.matcher(text).replaceAll("$1" + SEPARATOR + "$2");
            text = ANY_BLANK.matcher(text).replaceAll(BLANK);
            text = text.replace(SEPARATOR, " ");
            text = text.replace(BLANK, SEPARATOR);
            // Tags always form chunks of their own.
            text = TAG.matcher(text).replaceAll(SEPARATOR + "$1" + SEPARATOR);
            text = LEADING_SEPARATOR.matcher(text).replaceAll("");
            text = TRAILING_SEPARATOR.matcher(text).replaceAll("");
            text = REPEATED_SEPARATOR.matcher(text).replaceAll(SEPARATOR);
            for (String chunk : text.split(SEPARATOR)) {
                if (TAG_CHUNK.matcher(chunk).matches()) {
                    outBuf.append(chunk).append('\n');
                } else {
                    tokenizeChunk(chunk, outBuf);
                }
            }
        }

        List<String> outList = new LinkedList<String>();
        for (String entry : outBuf.toString().split("\n")) {
            entry = LEADING_JUNK.matcher(entry).replaceAll("");
            entry = TRAILING_JUNK.matcher(entry).replaceAll("");
            outList.add(entry);
        }
        return outList;
    }

    /** Inserts blanks around punctuation in a non-tag chunk and tokenizes each resulting word. */
    private void tokenizeChunk(String chunk, StringBuilder out) {
        String line = " " + chunk + " ";
        line = THREE_PERIODS.matcher(line).replaceAll(" ... ");
        line = GLUED_AFTER_STRONG_PUNCT.matcher(line).replaceAll("$1 $2");
        line = GLUED_AFTER_WEAK_PUNCT.matcher(line).replaceAll("$1 $2");
        for (String word : line.split(" ")) {
            if (!word.isEmpty()) {
                tokenizeWord(word, out);
            }
        }
    }

    /**
     * Appends the newline-terminated tokens derived from one blank-delimited word to {@code out}:
     * split-off prefixes first, then the core token, then the split-off suffixes.
     */
    private void tokenizeWord(String token, StringBuilder out) {
        // Suffix tokens are collected back to front and written after the core token.
        String suffix = "";
        Matcher m;

        // Peel punctuation off both ends until a full pass changes nothing.
        boolean changed;
        do {
            changed = false;
            m = LEADING_PUNCT.matcher(token);
            if (m.find()) {
                out.append(m.group(1)).append('\n');
                token = m.replaceAll("$2");
                changed = true;
            }
            m = TRAILING_PUNCT.matcher(token);
            if (m.find()) {
                suffix = m.group(2) + "\n" + suffix;
                token = m.replaceAll("$1");
                changed = true;
            }
            m = PUNCT_BEFORE_PERIOD.matcher(token);
            if (m.find()) {
                String punct = m.group(1);
                token = m.replaceAll("");
                suffix = ".\n" + suffix;
                if (token.isEmpty()) {
                    // Nothing but the punctuation was left: it becomes the token itself.
                    token = punct;
                } else {
                    suffix = punct + "\n" + suffix;
                }
                changed = true;
            }
        } while (changed);

        // Known abbreviations and dotted letter sequences such as "U.S.A." keep their periods.
        if (this.abbreviations.contains(token) || DOTTED_LETTERS.matcher(token).matches()) {
            out.append(token).append('\n').append(suffix);
            return;
        }

        // Detach a sentence-final period. The ellipsis check must look at the token: the
        // enclosing chunk is always blank-padded and therefore can never equal "...".
        Matcher period = ENDS_WITH_PERIOD.matcher(token);
        if (period.matches()
                && !token.equals("...")
                && !(this.flags.contains(Flag.GALICIAN) && NUMBER_WITH_PERIOD.matcher(token).matches())) {
            token = period.group(1);
            suffix = ".\n" + suffix;
            if (this.abbreviations.contains(token)) {
                out.append(token).append('\n').append(suffix);
                return;
            }
        }

        while ((m = LEADING_DASHES.matcher(token)).find()) {
            out.append(m.group(1)).append('\n');
            token = m.replaceAll("$2");
        }
        if (this.proclitic != null) {
            while ((m = this.proclitic.matcher(token)).find()) {
                out.append(m.group(1)).append('\n');
                token = m.replaceAll("$2");
            }
        }
        while ((m = TRAILING_DASHES.matcher(token)).find()) {
            suffix = m.group(2) + "\n" + suffix;
            token = m.replaceAll("$1");
        }
        if (this.enclitic != null) {
            while ((m = this.enclitic.matcher(token)).find()) {
                suffix = m.group(2) + "\n" + suffix;
                token = m.replaceAll("$1");
            }
        }
        out.append(token).append('\n').append(suffix);
    }

    /** Switches that select the language-specific tokenization rules. */
    public enum Flag {
        /** Splits English enclitics such as {@code 's} and {@code n't}. */
        ENGLISH,
        /** Splits French proclitics and enclitics; overrides ITALIAN and ENGLISH clitics. */
        FRENCH,
        /** Splits Italian proclitics such as {@code dell'}. */
        ITALIAN,
        /** Splits Galician enclitics and keeps the period of ordinals such as {@code 3.}. */
        GALICIAN,
        /** Accepted by {@link #getSet(String)}; not consulted by the tokenizer itself. */
        Z;

        /**
         * Builds a flag set from a command-line style option string.
         *
         * @param flagName string that may contain any of {@code -e}, {@code -f}, {@code -i},
         *                 {@code -g}, {@code -z}; {@code null} yields an empty set
         * @return a new, mutable set holding the flag for every option found in {@code flagName}
         */
        public static EnumSet<Flag> getSet(String flagName) {
            EnumSet<Flag> set = EnumSet.noneOf(Flag.class);
            if (flagName == null) {
                return set;
            }
            if (flagName.contains("-e")) {
                set.add(ENGLISH);
            }
            if (flagName.contains("-f")) {
                set.add(FRENCH);
            }
            if (flagName.contains("-i")) {
                set.add(ITALIAN);
            }
            if (flagName.contains("-g")) {
                set.add(GALICIAN);
            }
            if (flagName.contains("-z")) {
                set.add(Z);
            }
            return set;
        }
    }
}

