/*
 * Decompiled with CFR 0.152.
 */
package eu.fbk.dh.tint.tokenizer;

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import eu.fbk.dh.tint.tokenizer.token.Token;
import eu.fbk.dh.tint.tokenizer.token.TokenGroup;
import eu.fbk.utils.core.PropertiesUtils;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathFactory;
import org.ahocorasick.trie.Emit;
import org.ahocorasick.trie.Trie;
import org.apache.commons.lang.mutable.MutableBoolean;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/**
 * Rule-based tokenizer and sentence splitter for Italian text.
 *
 * <p>The rules are read from an XML settings document (a user supplied file or the
 * {@code /token-settings.xml} classpath resource) containing:
 * <ul>
 *   <li>{@code /settings/normalization/char}: code point (hex) to normalized form;</li>
 *   <li>{@code /settings/sentenceSplitting/char}: code points that close a sentence;</li>
 *   <li>{@code /settings/tokenSplitting/char}: code points that separate tokens and are dropped;</li>
 *   <li>{@code /settings/expressions/expression}: regular expressions whose matches are kept
 *       together as one token;</li>
 *   <li>{@code /settings/abbreviations/abbreviation}: multi-token abbreviations kept together.</li>
 * </ul>
 */
public class ItalianTokenizer {
    static Logger logger = LoggerFactory.getLogger(ItalianTokenizer.class);
    static Pattern spaceTokenizer = Pattern.compile("\\s+");

    /** Classpath location of the settings used when no (readable) file is supplied. */
    private static final String DEFAULT_SETTINGS_RESOURCE = "/token-settings.xml";

    /** Matches abbreviations inside the space-joined token string built by {@link #getString}. */
    private final Trie trie;
    /** Code points treated as token separators (never emitted as tokens). */
    private final Set<Integer> splittingChars = new HashSet<>();
    /** Code points that, as a single-character token, end the current sentence. */
    private final Set<Integer> sentenceChars = new HashSet<>();
    /** Normalized form of single-character tokens, keyed by code point. */
    private final Map<Integer, String> normalizedChars = new HashMap<>();
    /** Normalized form of multi-character tokens; nothing in this class populates it. */
    private final Map<String, String> normalizedStrings = new HashMap<>();
    /** Merge patterns mapped to the capturing group whose span must become one token. */
    private final Map<Pattern, Integer> expressions = new HashMap<>();
    CoreLabelTokenFactory factory = new CoreLabelTokenFactory();

    /** Creates a tokenizer configured from the bundled {@code /token-settings.xml} resource. */
    public ItalianTokenizer() {
        this(null);
    }

    /**
     * Creates a tokenizer configured from the given settings file.
     *
     * <p>Loading is best effort: if the file is {@code null} or missing the bundled resource is
     * used, and if the settings cannot be read at all the problem is logged and the tokenizer
     * keeps whatever rules were loaded before the failure.
     *
     * @param settingFile XML settings file, or {@code null} to use the bundled settings
     */
    public ItalianTokenizer(@Nullable File settingFile) {
        Trie.TrieBuilder builder = Trie.builder().removeOverlaps();
        logger.trace("Loading model");
        // try-with-resources tolerates a null resource, so the stream is always closed when present.
        try (InputStream stream = openSettings(settingFile)) {
            if (stream == null) {
                logger.error("Tokenizer settings resource {} not found", DEFAULT_SETTINGS_RESOURCE);
            } else {
                loadSettings(stream, builder);
            }
        } catch (Exception e) {
            logger.error("Unable to load tokenizer settings", e);
        }
        this.trie = builder.build();
    }

    /** Opens the user supplied settings file, falling back to the bundled resource (may be null). */
    private InputStream openSettings(@Nullable File settingFile) {
        if (settingFile != null) {
            try {
                return new FileInputStream(settingFile);
            } catch (FileNotFoundException e) {
                logger.warn("Setting file {} not found, falling back to bundled settings", settingFile);
            }
        }
        return getClass().getResourceAsStream(DEFAULT_SETTINGS_RESOURCE);
    }

    /**
     * Parses the settings document and fills every rule table plus the abbreviation trie builder.
     * Abbreviations are loaded last because they are tokenized with the rules loaded before them.
     */
    private void loadSettings(InputStream stream, Trie.TrieBuilder builder) throws Exception {
        DocumentBuilder documentBuilder = DocumentBuilderFactory.newInstance().newDocumentBuilder();
        XPath xpath = XPathFactory.newInstance().newXPath();
        Document doc = documentBuilder.parse(stream);
        doc.getDocumentElement().normalize();

        loadNormalization(select(xpath, doc, "/settings/normalization/char"));
        logger.info("Loaded {} normalization rules", this.normalizedChars.size());

        loadCharIds(select(xpath, doc, "/settings/sentenceSplitting/char"), this.sentenceChars);
        logger.info("Loaded {} sentence splitting rules", this.sentenceChars.size());

        loadCharIds(select(xpath, doc, "/settings/tokenSplitting/char"), this.splittingChars);
        logger.info("Loaded {} token splitting rules", this.splittingChars.size());

        int expressionCount = loadExpressions(select(xpath, doc, "/settings/expressions/expression"));
        logger.info("Loaded {} regular expressions", expressionCount);

        NodeList abbreviations = select(xpath, doc, "/settings/abbreviations/abbreviation");
        int abbreviationCount = 0;
        for (int i = 0; i < abbreviations.getLength(); i++) {
            String abbreviation = getString(tokenArray(abbreviations.item(i).getTextContent()));
            // Surrounding spaces force the trie to match whole tokens only.
            builder.addKeyword(" " + abbreviation + " ");
            abbreviationCount++;
        }
        logger.info("Loaded {} abbreviations", abbreviationCount);
    }

    /** Evaluates an XPath expression against the settings document and returns the node set. */
    private static NodeList select(XPath xpath, Document doc, String path) throws Exception {
        XPathExpression expression = xpath.compile(path);
        return (NodeList) expression.evaluate(doc, XPathConstants.NODESET);
    }

    /** Reads {@code hexcode -> text} normalization rules; a backtick is normalized to an apostrophe. */
    private void loadNormalization(NodeList nodes) {
        for (int i = 0; i < nodes.getLength(); i++) {
            Element element = (Element) nodes.item(i);
            String content = element.getTextContent();
            if (content.equals("`")) {
                content = "'";
            }
            int codePoint = Integer.parseInt(element.getAttribute("hexcode"), 16);
            if (content.length() > 0) {
                this.normalizedChars.put(codePoint, content);
            }
        }
    }

    /** Adds the decimal {@code id} attribute of every node to the target set. */
    private static void loadCharIds(NodeList nodes, Set<Integer> target) {
        for (int i = 0; i < nodes.getLength(); i++) {
            Element element = (Element) nodes.item(i);
            target.add(Integer.parseInt(element.getAttribute("id")));
        }
    }

    /**
     * Registers the merge expressions. Expressions flagged {@code merge} (the default) are joined
     * into a single alternation whose group 1 is the span to merge; the others are compiled on
     * their own with their configured group.
     *
     * @return the number of expressions read
     */
    private int loadExpressions(NodeList nodes) {
        StringBuilder alternation = new StringBuilder("(");
        int mergedCount = 0;
        int total = 0;
        for (int i = 0; i < nodes.getLength(); i++) {
            Element element = (Element) nodes.item(i);
            String regExp = element.getAttribute("find");
            boolean merge = PropertiesUtils.getBoolean(element.getAttribute("merge"), true);
            Integer group = PropertiesUtils.getInteger(element.getAttribute("get"), 1);
            if (merge) {
                if (mergedCount > 0) {
                    alternation.append("|");
                }
                alternation.append(regExp);
                mergedCount++;
            } else {
                this.expressions.put(Pattern.compile(regExp), group);
            }
            total++;
        }
        // An empty alternation "()" would match the empty string at every offset and could
        // overwrite real merge ranges, so it is only registered when it has alternatives.
        if (mergedCount > 0) {
            alternation.append(")");
            this.expressions.put(Pattern.compile(alternation.toString()), 1);
        }
        return total;
    }

    /**
     * Splits text into raw tokens: maximal runs of letters/digits become one token and every other
     * non-separator character becomes a token of its own.
     *
     * @param text the text to split; {@code null} or empty yields an empty group
     * @return the tokens with their character offsets into {@code text}
     */
    public TokenGroup tokenArray(String text) {
        TokenGroup tokenGroup = new TokenGroup();
        if (text == null || text.isEmpty()) {
            return tokenGroup;
        }
        int length = text.length();
        int wordStart = 0;
        boolean previousIsWordChar = false;
        MutableBoolean isNewLine = new MutableBoolean(false);
        // Mutable cursor that addToken keeps in sync with the last emitted token.
        Token lastToken = new Token(0, 0, "");
        for (int i = 0; i < length; i++) {
            char current = text.charAt(i);
            boolean isWordChar = Character.isLetterOrDigit(current);
            if (isWordChar) {
                if (!previousIsWordChar) {
                    wordStart = i;
                }
            } else {
                if (previousIsWordChar) {
                    // A word just ended: flush it before handling the current symbol.
                    addToken(tokenGroup, wordStart, i, text.substring(wordStart, i), isNewLine, lastToken);
                }
                if (!isSeparatorChar(current)) {
                    addToken(tokenGroup, i, i + 1, String.valueOf(current), isNewLine, lastToken);
                }
            }
            if (current == '\n') {
                // Remembered until the next token, which is then recorded as starting a new line.
                isNewLine.setValue(true);
            }
            previousIsWordChar = isWordChar;
        }
        if (previousIsWordChar) {
            addToken(tokenGroup, wordStart, length, text.substring(wordStart, length), isNewLine, lastToken);
        }
        return tokenGroup;
    }

    /**
     * Creates a token, fills its spacing and normalization data, and appends it to the group.
     *
     * @param isNewLine  set when a line break was seen since the previous token; reset here
     * @param lastToken  cursor describing the previously added token; updated to the new token
     */
    private void addToken(TokenGroup group, int start, int end, String charString, MutableBoolean isNewLine, Token lastToken) {
        Token token = new Token(start, end, charString);
        if (isNewLine.booleanValue()) {
            group.addNewLine(start);
            isNewLine.setValue(false);
        }
        int previousEnd = lastToken.getEnd();
        token.setPreceedBySpace(start - previousEnd > 0);
        int spaces = 0;
        if (previousEnd != 0) {
            spaces = lastToken.getSpaceOffset();
            if (start == previousEnd) {
                spaces++;
            } else {
                spaces -= Math.max(0, start - previousEnd - 1);
            }
        }
        token.setSpaceOffset(spaces);

        String normalized;
        if (charString.length() == 1) {
            // The table is keyed by Integer: widen explicitly, a boxed Character would never match.
            normalized = this.normalizedChars.get((int) charString.charAt(0));
        } else {
            normalized = this.normalizedStrings.get(charString);
        }
        if (normalized != null) {
            token.setNormForm(normalized);
        }
        lastToken.updateByToken(token);
        group.addToken(token);
    }

    /**
     * Joins the surface forms of the tokens with single spaces.
     *
     * @param tokenGroup the tokens to join
     * @return the joined forms, or an empty string for an empty group
     */
    public static String getString(TokenGroup tokenGroup) {
        StringBuilder buffer = new StringBuilder();
        String separator = "";
        for (Token token : tokenGroup.getSupport()) {
            buffer.append(separator).append(token.getForm());
            separator = " ";
        }
        return buffer.toString();
    }

    /**
     * Tells whether a character separates tokens without being a token itself. Uses the
     * configured splitting characters when any were loaded, otherwise a built-in whitespace set.
     *
     * @param ch the character to test
     * @return {@code true} if the character is a separator
     */
    public boolean isSeparatorChar(Character ch) {
        char c = ch.charValue();
        if (!this.splittingChars.isEmpty()) {
            return this.splittingChars.contains((int) c);
        }
        switch (c) {
            case ' ':
            case '\r':
            case '\n':
            case '\t':
            case '\f':
                return true;
            default:
                return false;
        }
    }

    /**
     * Tokenizes and sentence-splits text with the default options: line breaks end sentences,
     * full rule-based tokenization, sentence-final punctuation ends sentences.
     *
     * @param text the text to process
     * @return the sentences, each a list of tokens
     */
    public List<List<CoreLabel>> parse(String text) {
        return this.parse(text, true, false, false);
    }

    /**
     * Tokenizes and sentence-splits text.
     *
     * @param text                   the text to process
     * @param newlineIsSentenceBreak whether a line break before a token starts a new sentence
     * @param tokenizeOnlyOnSpace    whether tokens are simply the whitespace-delimited words
     * @param ssplitOnlyOnNewLine    whether sentence-splitting characters are ignored
     * @return the sentences, each a list of tokens indexed from 1 across the whole text
     */
    public List<List<CoreLabel>> parse(String text, boolean newlineIsSentenceBreak, boolean tokenizeOnlyOnSpace, boolean ssplitOnlyOnNewLine) {
        if (tokenizeOnlyOnSpace) {
            return parseOnWhitespace(text, newlineIsSentenceBreak, ssplitOnlyOnNewLine);
        }
        return parseWithRules(text, newlineIsSentenceBreak, ssplitOnlyOnNewLine);
    }

    /** Whitespace-only tokenization: every maximal run of non-whitespace characters is a token. */
    private List<List<CoreLabel>> parseOnWhitespace(String text, boolean newlineIsSentenceBreak, boolean ssplitOnlyOnNewLine) {
        List<List<CoreLabel>> sentences = new ArrayList<>();
        List<CoreLabel> current = new ArrayList<>();
        int index = 0;
        int wordStart = 0;
        boolean inWord = false;
        int newLineCount = 0;
        int length = text.length();
        // The loop runs one step past the end so that end-of-text closes a pending word exactly
        // like whitespace does; otherwise the last word of unterminated text would be lost.
        for (int i = 0; i <= length; i++) {
            boolean atEnd = i == length;
            if (!atEnd && !Character.isWhitespace(text.charAt(i))) {
                if (!inWord) {
                    wordStart = i;
                    inWord = true;
                }
                continue;
            }
            if (inWord) {
                String word = text.substring(wordStart, i);
                CoreLabel clToken = this.factory.makeToken(word, word, wordStart, i - wordStart);
                clToken.setIndex(++index);
                if (newlineIsSentenceBreak && newLineCount > 0 && !current.isEmpty()) {
                    sentences.add(current);
                    current = new ArrayList<>();
                }
                current.add(clToken);
                if (!ssplitOnlyOnNewLine && isSentenceEnd(word)) {
                    sentences.add(current);
                    current = new ArrayList<>();
                }
                newLineCount = 0;
                inWord = false;
            }
            if (!atEnd && text.charAt(i) == '\n') {
                newLineCount++;
            }
        }
        if (!current.isEmpty()) {
            sentences.add(current);
        }
        return sentences;
    }

    /**
     * Rule-based tokenization: raw tokens are merged according to the configured expressions,
     * the abbreviation trie and the apostrophe (elision) rules.
     */
    private List<List<CoreLabel>> parseWithRules(String text, boolean newlineIsSentenceBreak, boolean ssplitOnlyOnNewLine) {
        List<List<CoreLabel>> sentences = new ArrayList<>();
        List<CoreLabel> current = new ArrayList<>();
        int index = 0;

        // Start offset -> end offset of every span that must become a single token.
        Map<Integer, Integer> mergeList = new HashMap<>();
        for (Map.Entry<Pattern, Integer> entry : this.expressions.entrySet()) {
            int group = entry.getValue();
            Matcher matcher = entry.getKey().matcher(text);
            while (matcher.find()) {
                mergeList.put(matcher.start(group), matcher.end(group));
            }
        }

        TokenGroup tokenGroup = this.tokenArray(text);
        List<Token> tokens = tokenGroup.getSupport();
        if (tokens.isEmpty()) {
            return sentences;
        }

        int offset = tokens.get(0).getStart();
        // Padded like the trie keywords so abbreviations at either end of the text still match.
        String joined = " " + getString(tokenGroup) + " ";
        Collection<Emit> emits = this.trie.parseText(joined);
        for (Emit emit : emits) {
            // Emit offsets include the keyword's own surrounding spaces and the padding above:
            // (+1 -1) and (-1 -1) strip them before shifting by the first token offset.
            Token startToken = tokenGroup.getStartOffIndexes().get(emit.getStart() + 1 - 1 + offset);
            Token endToken = tokenGroup.getEndOffIndexes().get(emit.getEnd() - 1 - 1 + offset);
            if (startToken != null && endToken != null) {
                mergeList.put(startToken.getStart(), endToken.getEnd());
            } else {
                logger.warn("Something is null! -- " + emit.toString());
            }
        }

        Integer end = null;   // end offset of the merge span currently open, if any
        Integer start = null; // start offset of the output token being accumulated, if any
        for (int i = 0; i < tokens.size(); i++) {
            Token token = tokens.get(i);
            if (end == null) {
                end = mergeList.get(token.getStart());
            }
            boolean merging = end != null;
            if (merging && token.getEnd() >= end) {
                // This token closes the span: it is emitted below together with the pending ones.
                end = null;
                merging = false;
            }

            if ("'".equals(token.getNormForm())) {
                Token prevToken = i > 0 ? tokens.get(i - 1) : null;
                Token nextToken = i < tokens.size() - 1 ? tokens.get(i + 1) : null;
                boolean betweenLetters = prevToken != null && nextToken != null
                        && !token.isPreceedBySpace() && !nextToken.isPreceedBySpace()
                        && endsWithLetter(prevToken) && startsWithLetter(nextToken);
                if (betweenLetters
                        || (prevToken != null && endsWithLetter(prevToken) && !token.isPreceedBySpace()
                                && !(nextToken != null && "'".equals(nextToken.getNormForm())))) {
                    // Elision ("l'", "dell'"): glue the apostrophe to the previous word. That word
                    // is re-opened only if it was already emitted; when a merge is pending
                    // (start != null) it is still part of the token being accumulated.
                    if (start == null && !current.isEmpty()) {
                        CoreLabel previous = current.remove(current.size() - 1);
                        start = previous.beginPosition();
                        --index;
                    }
                } else if (nextToken != null && startsWithLetter(nextToken) && !nextToken.isPreceedBySpace()
                        && !(prevToken != null && "'".equals(prevToken.getNormForm()))) {
                    // Leading apostrophe ("'90"-like forms with a letter): glue it to the next word.
                    merging = true;
                }
            }

            if (start == null) {
                start = token.getStart();
            }
            if (merging) {
                continue;
            }

            int finish = token.getEnd();
            String word = text.substring(start, finish);
            String normWord = word;
            char lastChar = word.charAt(word.length() - 1);
            if (lastChar == '\u2019' || lastChar == '`') {
                // Typographic apostrophes are normalized in the token text, not in the original.
                normWord = word.substring(0, word.length() - 1) + "'";
            }
            CoreLabel clToken = this.factory.makeToken(normWord, word, start, finish - start);
            clToken.setIndex(++index);
            if (newlineIsSentenceBreak && tokenGroup.getNewLines().contains(start) && !current.isEmpty()) {
                sentences.add(current);
                current = new ArrayList<>();
            }
            current.add(clToken);
            if (!ssplitOnlyOnNewLine && isSentenceEnd(word)) {
                sentences.add(current);
                current = new ArrayList<>();
            }
            start = null;
        }
        if (!current.isEmpty()) {
            sentences.add(current);
        }
        return sentences;
    }

    /** True when the word is a single configured sentence-splitting character. */
    private boolean isSentenceEnd(String word) {
        // The set holds Integer code points: widen explicitly, a boxed Character would never match.
        return word.length() == 1 && this.sentenceChars.contains((int) word.charAt(0));
    }

    /** True when the last character of the token form is a letter. */
    private static boolean endsWithLetter(Token token) {
        String form = token.getForm();
        return Character.isLetter(form.charAt(form.length() - 1));
    }

    /** True when the first character of the token form is a letter. */
    private static boolean startsWithLetter(Token token) {
        return Character.isLetter(token.getForm().charAt(0));
    }

    /**
     * Small demo: tokenizes a sample sentence and prints tokens, offsets and timing.
     *
     * @param argv ignored
     */
    public static void main(String[] argv) throws IOException {
        ItalianTokenizer tokenizer = new ItalianTokenizer();
        String text = "Clinton in testa nei sondaggi dopo l\u2019\u00abassoluzione\u00bb dell\u2019Fbi sull\u2019uso di un server di posta privato quando era Segretario di stato.";
        long time = System.currentTimeMillis();
        List<List<CoreLabel>> sentences = tokenizer.parse(text);
        time = System.currentTimeMillis() - time;
        int shown = Math.min(10, sentences.size());
        for (List<CoreLabel> sentence : sentences.subList(0, shown)) {
            for (CoreLabel token : sentence) {
                System.out.println(token.word() + " -- " + token.originalText() + " -- " + token.beginPosition());
            }
            System.out.println();
        }
        int sentenceSize = sentences.size();
        List<CoreLabel> lastSentence = sentences.get(sentenceSize - 1);
        int lastTokenIndex = lastSentence.get(lastSentence.size() - 1).index();
        System.out.println("Length: " + text.length());
        System.out.println("Time: " + time);
        System.out.println("Sentences: " + sentenceSize);
        System.out.println("Tokens: " + lastTokenIndex);
    }
}

