/*
 * Decompiled with CFR 0.152.
 */
package de.unihd.dbs.uima.annotator.alllanguagestokenizer;

import de.unihd.dbs.uima.types.heideltime.Sentence;
import de.unihd.dbs.uima.types.heideltime.Token;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.uima.analysis_component.JCasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.jcas.JCas;

public class AllLanguagesTokenizer
extends JCasAnnotator_ImplBase {
    private String PChar = "\\[\u00bf\u00a1\\{\\(\\`\"\u201a\u201e\u2020\u2021\u2039\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u203a'";
    private String FChar = "\\]\\}\\'\\`\"\\),;:\\!\\?\\%\u201a\u201e\u2026\u2020\u2021\u2030\u2039\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u203a";
    private String FClitic = this.FClitic + "'(s|re|ve|d|m|em|ll)|n't";
    private String PClitic = this.PClitic + "|[dcjlmnstDCJLNMST]'|[Qq]u'|[Jj]usqu'|[Ll]orsqu'";

    public AllLanguagesTokenizer() {
        this.FClitic = this.FClitic + "|-t-elles?|-t-ils?|-t-on|-ce|-elles?|-ils?|-je|-la|-les?|-leur|-lui|-mmes?|-m'|-moi|-nous|-on|-toi|-tu|-t'|-vous|-en|-y|-ci|-l";
        this.FClitic = this.FClitic + "|-la|-las|-lo|-los|-nos";
    }

    public void process(JCas jcas) throws AnalysisEngineProcessException {
        this.tokenize(jcas);
        this.sentenceTokenize(jcas);
    }

    public List<Token> tokenize(JCas jcas) {
        StringBuilder outBuf = new StringBuilder();
        for (String text : jcas.getDocumentText().split("\n")) {
            String[] texts;
            text = text.replaceAll("[\r\n\t]", " ");
            text = text.replaceAll("(<[^<> ]*) ([^<>]*>)", "$1\u00ff$2");
            text = text.replaceAll("[\\u2000-\\u200A \\u202F\\u205F\\u3000\\u00A0\\u1680\\u180E]", "\u00fe");
            text = text.replaceAll("\u00ff", " ");
            text = text.replaceAll("\u00fe", "\u00ff");
            text = text.replaceAll("(<[^<>]*>)", "\u00ff$1\u00ff");
            text = text.replaceAll("^\u00ff", "");
            text = text.replaceAll("\u00ff$", "");
            text = text.replaceAll("\u00ff\u00ff\u00ff*", "\u00ff");
            for (String line : texts = text.split("\u00ff")) {
                String[] lines;
                if (line.matches("^<.*>$")) {
                    outBuf.append(line + "\n");
                    continue;
                }
                line = " " + line + " ";
                line = line.replaceAll("\\.\\.\\.", " ... ");
                line = line.replaceAll("([;\\!\\?])([^ ])", "$1 $2");
                line = line.replaceAll("([.,:])([^ 0-9.])", "$1 $2");
                for (String token : lines = line.split(" ")) {
                    Matcher m;
                    if (token.equals("")) continue;
                    String suffix = "";
                    Boolean finished = false;
                    do {
                        finished = true;
                        m = Pattern.compile("^([" + this.PChar + "])(.)").matcher(token);
                        if (m.find()) {
                            token = token.replaceAll("^([" + this.PChar + "])(.)", "$2");
                            outBuf.append(m.group(1) + "\n");
                            finished = false;
                        }
                        if ((m = Pattern.compile("(.)([" + this.FChar + "])$").matcher(token)).find()) {
                            token = token.replaceAll("(.)([" + this.FChar + "])$", "$1");
                            suffix = m.group(2) + "\n" + suffix;
                            finished = false;
                        }
                        if (!(m = Pattern.compile("([" + this.FChar + "])\\.$").matcher(token)).find()) continue;
                        token = token.replaceAll("([" + this.FChar + "])\\.$", "");
                        suffix = ".\n" + suffix;
                        if (token.equals("")) {
                            token = m.group(1);
                        } else {
                            suffix = m.group(1) + "\n" + suffix;
                        }
                        finished = false;
                    } while (!finished.booleanValue());
                    if (token.matches("^([A-Za-z-]\\.)+$")) {
                        outBuf.append(token + "\n" + suffix);
                        continue;
                    }
                    m = Pattern.compile("^(..*)\\.$").matcher(token);
                    if (m.matches() && !line.equals("...")) {
                        token = m.group(1);
                        suffix = ".\n" + suffix;
                    }
                    while ((m = Pattern.compile("^(--)(.)").matcher(token)).find()) {
                        token = token.replaceAll("^(--)(.)", "$2");
                        outBuf.append(m.group(1) + "\n");
                    }
                    if (!this.PClitic.equals("")) {
                        while ((m = Pattern.compile("^(" + this.PClitic + ")(.)").matcher(token)).find()) {
                            token = token.replaceAll("^(" + this.PClitic + ")(.)", "$2");
                            outBuf.append(m.group(1) + "\n");
                        }
                    }
                    while ((m = Pattern.compile("^(--)(.)").matcher(token)).find()) {
                        token = token.replaceAll("^(--)(.)", "$1");
                        suffix = m.group(2) + "\n" + suffix;
                    }
                    if (!this.FClitic.equals("")) {
                        while ((m = Pattern.compile("(.)(" + this.FClitic + ")$").matcher(token)).find()) {
                            token = token.replaceAll("(.)(" + this.FClitic + ")$", "$1");
                            suffix = m.group(2) + "\n" + suffix;
                        }
                    }
                    outBuf.append(token + "\n" + suffix);
                }
            }
        }
        LinkedList<Token> outList = new LinkedList<Token>();
        String origText = jcas.getDocumentText();
        Integer origTextOffset = 0;
        for (String s : outBuf.toString().split("\n")) {
            Integer begin = origText.indexOf(s, (int)origTextOffset);
            Integer end = begin + s.length();
            Token t = new Token(jcas);
            t.setBegin(begin);
            t.setPos("");
            t.setEnd(end);
            t.addToIndexes();
            origTextOffset = t.getEnd();
            outList.add(t);
        }
        return outList;
    }

    public List<Sentence> sentenceTokenize(JCas jcas) {
        LinkedList<Sentence> outList = new LinkedList<Sentence>();
        FSIterator tokIt = jcas.getAnnotationIndex(Token.type).iterator();
        Sentence s = new Sentence(jcas);
        Boolean sentenceStarted = false;
        Token tOld = null;
        Token t = null;
        while (tokIt.hasNext()) {
            if (t != null) {
                tOld = t;
            }
            t = (Token)((Object)tokIt.next());
            if (!sentenceStarted.booleanValue()) {
                sentenceStarted = true;
                s.setBegin(t.getBegin());
            }
            if (tokIt.hasNext() && (!t.getCoveredText().matches("[.:!\\?]+") || tOld.getCoveredText().matches("[\\d]+") || jcas.getDocumentText().substring(t.getEnd()).length() > 2 && jcas.getDocumentText().substring(t.getEnd(), t.getEnd() + 3).matches(" [A-Z][.-]"))) continue;
            sentenceStarted = false;
            s.setEnd(t.getEnd());
            if (tokIt.hasNext()) {
                Token tNext = (Token)((Object)tokIt.next());
                if (tNext.getCoveredText().matches("[\u00bb\u2019'\"\u201b\u201d\u201f\u203a\u301e\u300f\u300d\ufe44\uff02\uff07\uff63\ufe42]+")) {
                    s.setEnd(tNext.getEnd());
                } else {
                    tokIt.moveToPrevious();
                }
            }
            s.addToIndexes();
            outList.add(s);
            s = new Sentence(jcas);
        }
        return outList;
    }
}

