/*
 * Decompiled with CFR 0.152.
 */
package edu.stanford.nlp.wordseg;

import edu.stanford.nlp.fsm.DFSA;
import edu.stanford.nlp.fsm.DFSAState;
import edu.stanford.nlp.fsm.DFSATransition;
import edu.stanford.nlp.ling.CoreAnnotation;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.IteratorFromReaderFactory;
import edu.stanford.nlp.objectbank.LineIterator;
import edu.stanford.nlp.process.ChineseDocumentToSentenceProcessor;
import edu.stanford.nlp.sequences.DocumentReaderAndWriter;
import edu.stanford.nlp.sequences.LatticeWriter;
import edu.stanford.nlp.sequences.SeqClassifierFlags;
import edu.stanford.nlp.trees.international.pennchinese.ChineseUtils;
import edu.stanford.nlp.util.Characters;
import edu.stanford.nlp.util.Generics;
import edu.stanford.nlp.util.MutableInteger;
import edu.stanford.nlp.util.StringUtils;
import edu.stanford.nlp.wordseg.ChineseDictionary;
import edu.stanford.nlp.wordseg.ChineseStringUtils;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.io.Serializable;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.function.Function;
import java.util.regex.Pattern;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class Sighan2005DocumentReaderAndWriter
implements DocumentReaderAndWriter<CoreLabel>,
LatticeWriter<CoreLabel, String, Integer> {
    private static final long serialVersionUID = 3260295150250263237L;
    // Shared SLF4J logger; used below to trace lattice pruning decisions.
    private static Logger logger = LoggerFactory.getLogger(Sighan2005DocumentReaderAndWriter.class);
    // Debug switches; neither is referenced by the code visible in this file.
    private static final boolean DEBUG = false;
    private static final boolean DEBUG_MORE = false;

    // Single-character classes consulted by shapeOf(), listed in the order shapeOf() tests them.
    // Date unit characters (year / month / day) -> shape "D".
    private static final Pattern dateChars = Pattern.compile("[\u5e74\u6708\u65e5]");
    // Same as dateChars plus one extra character; only consulted when flags.augmentedDateChars is set.
    private static final Pattern dateCharsPlus = Pattern.compile("[\u5e74\u6708\u65e5\u53f7]");
    // ASCII digits, full-width digits and CJK numeral characters -> shape "N".
    private static final Pattern numberChars = Pattern.compile("[0-9\uff10-\uff19\u4e00\u4e8c\u4e09\u56db\u4e94\u516d\u4e03\u516b\u4e5d\u5341\u96f6\u3007\u767e\u5343\u4e07\u4ebf\u5169\u25cb\u25ef\u3021-\u3029\u3038-\u303a]");
    // ASCII and full-width Latin letters -> shape "L".
    private static final Pattern letterChars = Pattern.compile("[A-Za-z\uff21-\uff3a\uff41-\uff5a]");
    // Period-like characters -> shape "P".
    private static final Pattern periodChars = Pattern.compile("[\ufe52\u2027\uff0e.\u70b9]");
    // Punctuation classified as shape "S" (ASCII, CJK and full-width forms).
    private final Pattern separatingPuncChars = Pattern.compile("[]!\"(),;:<=>?\\[\\\\`{|}~^\u3001-\u3003\u3008-\u3011\u3014-\u301f\u3030\uff3d\uff01\uff02\uff08\uff09\uff0c\uff1b\uff1a\uff1c\uff1d\uff1e\uff1f\uff3b\uff3c\uff40\uff5b\uff5c\uff5d\uff5e\uff3e]");
    // Punctuation classified as shape "A" (ASCII and full-width forms).
    private final Pattern ambiguousPuncChars = Pattern.compile("[-#$%&'*+/@_\uff0d\uff03\uff04\uff05\uff06\uff07\uff0a\uff0b\uff0f\uff20\uff3f]");
    // Middle-dot-like characters -> shape "M"; only consulted when flags.useMidDotShape is set.
    private final Pattern midDotPattern = Pattern.compile("[\u00b7\u0387\u2022\u2024\u2027\u2219\u22c5\u30fb]");

    // Everything below is populated by init(SeqClassifierFlags); it is null until init has been called.
    // Supplies normalization(String) to the line parser and is handed to each dictionary.
    private ChineseDocumentToSentenceProcessor cdtos;
    // Primary dictionary (from flags.dictionary or flags.serializedDictionary); may stay null.
    private ChineseDictionary cdict;
    // Secondary dictionary (from flags.dictionary2); may stay null.
    private ChineseDictionary cdict2;
    // Configuration passed to init().
    private SeqClassifierFlags flags;
    // Line-oriented factory that applies CTBDocumentParser to each input line.
    private IteratorFromReaderFactory<List<CoreLabel>> factory;

    /**
     * Returns an iterator over the documents read from {@code r}, produced by the
     * factory that {@link #init(SeqClassifierFlags)} installed.
     *
     * @param r the reader to consume
     * @return whatever the installed factory returns for {@code r}
     */
    @Override
    public Iterator<List<CoreLabel>> getIterator(Reader r) {
        final IteratorFromReaderFactory<List<CoreLabel>> lineFactory = this.factory;
        return lineFactory.getIterator(r);
    }

    /**
     * Stores the flags and builds the helpers the line parser relies on: the line
     * factory, the sentence processor, and any configured dictionaries.
     *
     * <p>{@code flags.dictionary} and {@code flags.dictionary2} are split on commas.
     * When both {@code flags.dictionary} and {@code flags.serializedDictionary} are
     * set, the dictionary built from the serialized one is the one that is kept,
     * because it is assigned last.
     *
     * @param flags the configuration to use
     */
    @Override
    public void init(SeqClassifierFlags flags) {
        this.flags = flags;
        this.factory = LineIterator.getFactory(new CTBDocumentParser());
        this.cdtos = new ChineseDocumentToSentenceProcessor(flags.normalizationTable);

        // Primary dictionary from a comma-separated list.
        if (flags.dictionary != null) {
            this.cdict = new ChineseDictionary(flags.dictionary.split(","), this.cdtos, flags.expandMidDot);
        }
        // A serialized dictionary, when given, replaces the one built just above.
        if (flags.serializedDictionary != null) {
            this.cdict = new ChineseDictionary(flags.serializedDictionary, this.cdtos, flags.expandMidDot);
        }
        // Optional secondary dictionary, also from a comma-separated list.
        if (flags.dictionary2 != null) {
            this.cdict2 = new ChineseDictionary(flags.dictionary2.split(","), this.cdtos, flags.expandMidDot);
        }
    }

    /**
     * Maps {@code input} to a one-letter shape code by testing it against the
     * character-class patterns in a fixed order; the first full match wins.
     *
     * <p>Codes: {@code "D"} date, {@code "N"} number, {@code "L"} letter,
     * {@code "P"} period, {@code "S"} separating punctuation, {@code "A"} ambiguous
     * punctuation, {@code "M"} middle dot (only when {@code flags.useMidDotShape}),
     * otherwise {@code "C"}. Each pattern is a single character class tested with
     * {@code matches()}, so only a one-character input can get a code other than
     * {@code "C"}.
     *
     * @param input the text to classify
     * @return the shape code
     */
    private String shapeOf(String input) {
        if (this.flags.augmentedDateChars && dateCharsPlus.matcher(input).matches()) {
            return "D";
        }
        if (dateChars.matcher(input).matches()) {
            return "D";
        }
        if (numberChars.matcher(input).matches()) {
            return "N";
        }
        if (letterChars.matcher(input).matches()) {
            return "L";
        }
        if (periodChars.matcher(input).matches()) {
            return "P";
        }
        if (this.separatingPuncChars.matcher(input).matches()) {
            return "S";
        }
        if (this.ambiguousPuncChars.matcher(input).matches()) {
            return "A";
        }
        if (this.flags.useMidDotShape && this.midDotPattern.matcher(input).matches()) {
            return "M";
        }
        return "C";
    }

    /**
     * Annotates every label with three dictionary-match features: the longest
     * dictionary entry that begins at, passes through, or ends at that character.
     *
     * <p>For each start offset, substrings of length 6 down to 1 are looked up in
     * {@code dict}. A match of the maximum length (6) is treated as open-ended: its
     * last character is marked as "middle" rather than "end", and a feature value
     * of 6 is written as {@code "6+"}. All other values are written as plain
     * decimal strings, with {@code "0"} meaning no match.
     *
     * @param dict dictionary to query; must not be null
     * @param lbeginFieldName annotation key receiving the "begins here" length
     * @param lmiddleFieldName annotation key receiving the "in the middle" length
     * @param lendFieldName annotation key receiving the "ends here" length
     * @param nonspaceLine the characters of the line, one per label
     * @param lwi the labels to annotate; must be as long as {@code nonspaceLine}
     * @throws IllegalArgumentException if {@code lwi} and {@code nonspaceLine} differ in length
     */
    private static void addDictionaryFeatures(ChineseDictionary dict, Class<? extends CoreAnnotation<String>> lbeginFieldName, Class<? extends CoreAnnotation<String>> lmiddleFieldName, Class<? extends CoreAnnotation<String>> lendFieldName, String nonspaceLine, List<CoreLabel> lwi) {
        // Longest substring looked up; matches of this length are reported as "6+".
        final int maxLen = 6;
        final int size = lwi.size();
        if (size != nonspaceLine.length()) {
            throw new IllegalArgumentException("Label count (" + size + ") does not match character count ("
                    + nonspaceLine.length() + ") of line: " + nonspaceLine);
        }

        // int arrays start out zero-filled, so no explicit reset is required.
        final int[] lbegin = new int[size];
        final int[] lmiddle = new int[size];
        final int[] lend = new int[size];

        for (int start = 0; start < size; ++start) {
            // Never look past the end of the line.
            final int longest = Math.min(maxLen, size - start);
            for (int leng = longest; leng >= 1; --leng) {
                if (!dict.contains(nonspaceLine.substring(start, start + leng))) {
                    continue;
                }
                lbegin[start] = Math.max(lbegin[start], leng);

                final int endIdx = start + leng - 1;
                // A maximum-length match also marks its final character as "middle".
                final int midLimit = (leng == maxLen) ? endIdx + 1 : endIdx;
                for (int mid = start + 1; mid < midLimit; ++mid) {
                    lmiddle[mid] = Math.max(lmiddle[mid], leng);
                }
                // ...and, correspondingly, never records an end position.
                if (leng < maxLen) {
                    lend[endIdx] = Math.max(lend[endIdx], leng);
                }
            }
        }

        for (int i = 0; i < size; ++i) {
            final CoreLabel label = lwi.get(i);
            label.set(lbeginFieldName, lengthFeature(lbegin[i], maxLen));
            label.set(lmiddleFieldName, lengthFeature(lmiddle[i], maxLen));
            label.set(lendFieldName, lengthFeature(lend[i], maxLen));
        }
    }

    /** Renders a match length as a feature string, appending "+" when it equals {@code cap}. */
    private static String lengthFeature(int length, int cap) {
        return length == cap ? length + "+" : Integer.toString(length);
    }

    /**
     * Writes the segmented form of {@code doc}, as produced by
     * {@code ChineseStringUtils.combineSegmentedSentence}, followed by a line
     * terminator.
     *
     * @param doc the labelled characters of one sentence
     * @param pw the writer to print to
     */
    @Override
    public void printAnswers(List<CoreLabel> doc, PrintWriter pw) {
        pw.print(ChineseStringUtils.combineSegmentedSentence(doc, this.flags));
        pw.println();
    }

    /**
     * Trims {@code s} and returns the canonical (interned) instance of the result.
     *
     * @param s the string to canonicalize; must not be null
     * @return the interned, trimmed string
     */
    private static String intern(String s) {
        final String trimmed = s.trim();
        return trimmed.intern();
    }

    /**
     * Converts a per-character tag lattice into a lattice over candidate words and
     * prints it with {@code DFSA.printAttFsmFormat}.
     *
     * @param tagLattice lattice whose transitions carry the per-character tags
     * @param doc the labelled characters the lattice was built over
     * @param out2 the writer to print to
     * @throws RuntimeException wrapping any {@link IOException} raised while printing
     */
    @Override
    public void printLattice(DFSA<String, Integer> tagLattice, List<CoreLabel> doc, PrintWriter out2) {
        final CoreLabel[] tokens = doc.toArray(new CoreLabel[doc.size()]);

        // Node ids for the new lattice are handed out from this counter, starting at 0.
        final MutableInteger idCounter = new MutableInteger(0);
        final DFSA<String, Integer> wordLattice = new DFSA<String, Integer>(null);
        final DFSAState<String, Integer> wordStart = new DFSAState<String, Integer>(idCounter.intValue(), wordLattice);
        wordLattice.setInitialState(wordStart);

        // Maps tag-lattice states to the answer-lattice states already created for them.
        final Map<DFSAState<String, Integer>, DFSAState<String, Integer>> linked = Generics.newHashMap();
        this.tagLatticeToAnswerLattice(tagLattice.initialState(), wordStart, new StringBuilder(), idCounter, 0, 0.0, linked, wordLattice, tokens);

        try {
            wordLattice.printAttFsmFormat(out2);
        }
        catch (IOException ioe) {
            throw new RuntimeException(ioe);
        }
    }

    /**
     * Recursively walks the tag lattice from {@code tSource} and adds matching
     * word-level transitions to the answer lattice.
     *
     * <p>Each tag-lattice transition is labelled "1" or "0"; the log messages and the
     * line parser use "1" for "boundary before this character". Characters are
     * accumulated in {@code answer}; on a "1" transition with a non-empty
     * accumulator, the accumulated text is emitted as one answer-lattice transition
     * carrying the accumulated cost.
     *
     * @param tSource current tag-lattice state (may get an extra transition added, see below)
     * @param aSource answer-lattice state from which the current word started
     * @param answer characters collected for the current word so far
     * @param nodeId counter used to number newly created answer-lattice states
     * @param pos index into {@code docArray} of the character being consumed
     * @param cost score accumulated for the current word so far
     * @param stateLinks tag-lattice state to answer-lattice state mapping, shared across the recursion
     * @param answerLattice the lattice being built
     * @param docArray the labelled characters of the sentence
     */
    private void tagLatticeToAnswerLattice(DFSAState<String, Integer> tSource, DFSAState<String, Integer> aSource, StringBuilder answer, MutableInteger nodeId, int pos, double cost, Map<DFSAState<String, Integer>, DFSAState<String, Integer>> stateLinks, DFSA<String, Integer> answerLattice, CoreLabel[] docArray) {
        // A terminal accepting state has nothing to iterate over below, so give it one
        // "1" transition to a placeholder state; this lets the loop flush the last word.
        if (tSource.isAccepting() && tSource.continuingInputs().isEmpty()) {
            tSource.addTransition(new DFSATransition<String, Integer>("", tSource, new DFSAState(-1, null), "1", "", 0.0));
        }
        // curLabel is null once pos has run past the last character.
        CoreLabel curLabel = pos < docArray.length ? docArray[pos] : null;
        String curChr = null;
        String origSpace = null;
        if (curLabel != null) {
            curChr = (String)curLabel.get(CoreAnnotations.OriginalCharAnnotation.class);
            assert (curChr.length() == 1);
            origSpace = (String)curLabel.get(CoreAnnotations.SpaceBeforeAnnotation.class);
        }
        Set<String> inputs = tSource.continuingInputs();
        // At the first position only the cheapest transition is followed; remember which one.
        String answerConstraint = null;
        if (pos == 0) {
            double minCost = Double.POSITIVE_INFINITY;
            for (String predictSpace : inputs) {
                DFSATransition<String, Integer> transition = tSource.transition(predictSpace);
                double transitionCost = transition.score();
                if (!(transitionCost < minCost) || predictSpace == null) continue;
                logger.info(String.format("mincost (%s): %e -> %e%n", predictSpace, minCost, transitionCost));
                minCost = transitionCost;
                answerConstraint = predictSpace;
            }
        }
        for (String predictSpace : inputs) {
            DFSATransition<String, Integer> transition = tSource.transition(predictSpace);
            DFSAState<String, Integer> tDest = transition.target();
            DFSAState<String, Integer> newASource = aSource;
            // Work on a copy so sibling branches do not see each other's characters.
            StringBuilder newAnswer = new StringBuilder(answer.toString());
            int answerLen = newAnswer.length();
            String prevChr = answerLen > 0 ? newAnswer.substring(answerLen - 1) : null;
            double newCost = cost;
            // Position 0: follow only the transition selected above.
            if (answerConstraint != null && !answerConstraint.equals(predictSpace)) {
                logger.info(String.format("Skipping transition %s at pos 0.%n", predictSpace));
                continue;
            }
            // With keepAllWhitespaces, never predict "no boundary" where the input had a space.
            if (this.flags.keepAllWhitespaces && "0".equals(predictSpace) && "1".equals(origSpace)) {
                logger.info(String.format("Skipping non-boundary at pos %d, since space in the input.%n", pos));
                continue;
            }
            // Do not introduce a new boundary between two ASCII letters or between two numerals.
            if ("1".equals(predictSpace) && "0".equals(origSpace) && prevChr != null && curChr != null) {
                char p = prevChr.charAt(0);
                char c = curChr.charAt(0);
                if (ChineseStringUtils.isLetterASCII(p) && ChineseStringUtils.isLetterASCII(c)) {
                    logger.info(String.format("Not hypothesizing a boundary at pos %d, since between two ASCII letters (%s and %s).%n", pos, prevChr, curChr));
                    continue;
                }
                if (ChineseUtils.isNumber(p) && ChineseUtils.isNumber(c)) {
                    logger.info(String.format("Not hypothesizing a boundary at pos %d, since between two numeral characters (%s and %s).%n", pos, prevChr, curChr));
                    continue;
                }
            }
            // Boundary with a pending word: emit the word as one answer-lattice transition.
            if ("1".equals(predictSpace) && newAnswer.toString().length() > 0) {
                // This tag state already has an answer state: link to it and stop exploring this path.
                if (stateLinks.containsKey(tSource)) {
                    DFSAState<String, Integer> aDest = stateLinks.get(tSource);
                    newASource.addTransition(new DFSATransition<String, Integer>("", newASource, aDest, newAnswer.toString(), "", newCost));
                    continue;
                }
                // Otherwise create a fresh answer state for this tag state and record the link.
                nodeId.incValue(1);
                DFSAState<String, Integer> aDest = new DFSAState<String, Integer>(nodeId.intValue(), answerLattice, 0.0);
                stateLinks.put(tSource, aDest);
                newASource.addTransition(new DFSATransition<String, Integer>("", newASource, aDest, newAnswer.toString(), "", newCost));
                if (tSource.isAccepting()) {
                    aDest.setAccepting(true);
                    continue;
                }
                // Start a new word from the state just created.
                newASource = aDest;
                newAnswer = new StringBuilder();
                newCost = 0.0;
            }
            assert (curChr != null);
            newAnswer.append(curChr);
            // Prune: stop once the word's cost reaches flags.searchGraphPrune, unless the
            // current character is an ASCII letter, in which case the path is always continued.
            if (!((newCost += transition.score()) < this.flags.searchGraphPrune) && !ChineseStringUtils.isLetterASCII(curChr.charAt(0))) continue;
            this.tagLatticeToAnswerLattice(tDest, newASource, newAnswer, nodeId, pos + 1, newCost, stateLinks, answerLattice, docArray);
        }
    }

    /**
     * Turns one line of text into a list of per-character {@link CoreLabel}s.
     *
     * <p>Whitespace and ISO control characters produce no label; instead, a character
     * that follows one (or sits at index 0 of the normalized line) is marked "1" in
     * its answer, space-before and gold-answer annotations, and every other
     * character is marked "0".
     */
    class CTBDocumentParser
    implements Function<String, List<CoreLabel>>,
    Serializable {
        private static final long serialVersionUID = 3260297180259462337L;
        private String defaultMap = "char=0,answer=1";
        // Not read anywhere in the code visible in this file.
        public String[] map = StringUtils.mapStringToArray(this.defaultMap);

        CTBDocumentParser() {
        }

        /**
         * Parses a single line.
         *
         * @param line the raw line, or null
         * @return one label per retained character, or null when {@code line} is null
         */
        @Override
        public List<CoreLabel> apply(String line) {
            if (line == null) {
                return null;
            }
            final Sighan2005DocumentReaderAndWriter outer = Sighan2005DocumentReaderAndWriter.this;
            final String original = line.trim();
            final String normalized = outer.cdtos.normalization(original);
            final SeqClassifierFlags opts = outer.flags;
            final boolean wantUnicodeType = opts.useUnicodeType || opts.useUnicodeType4gram || opts.useUnicodeType5gram;

            final List<CoreLabel> labels = new ArrayList<CoreLabel>();
            final StringBuilder compact = new StringBuilder();
            // Cursor into the un-normalized text, advanced in step with the retained characters.
            // NOTE(review): this assumes normalization keeps the retained characters aligned
            // one-to-one with the original line - confirm against the normalization code.
            int origCursor = 0;
            int position = 0;

            for (int i = 0, n = normalized.length(); i < n; ++i) {
                final char ch = normalized.charAt(i);
                if (this.isSkipped(ch)) {
                    continue;
                }
                final CoreLabel label = new CoreLabel();
                final String normChar = Character.toString(ch);
                label.set(CoreAnnotations.CharAnnotation.class, Sighan2005DocumentReaderAndWriter.intern(normChar));
                compact.append(normChar);

                // Skip the same kind of gap in the original text; no-break spaces are skipped here too.
                while (this.isSkipped(original.charAt(origCursor)) || original.charAt(origCursor) == '\u00a0') {
                    ++origCursor;
                }
                final String origChar = Character.toString(original.charAt(origCursor));
                ++origCursor;
                label.set(CoreAnnotations.OriginalCharAnnotation.class, Sighan2005DocumentReaderAndWriter.intern(origChar));

                if (opts.useShapeStrings) {
                    label.set(CoreAnnotations.ShapeAnnotation.class, outer.shapeOf(origChar));
                }
                if (wantUnicodeType) {
                    label.set(CoreAnnotations.UTypeAnnotation.class, Character.getType(ch));
                }
                if (opts.useUnicodeBlock) {
                    label.set(CoreAnnotations.UBlockAnnotation.class, Characters.unicodeBlockStringOf(ch));
                }

                // "1" when this character is at index 0 or directly follows a skipped character.
                final boolean afterGap = i == 0 || this.isSkipped(normalized.charAt(i - 1));
                final String boundary = afterGap ? "1" : "0";
                label.set(CoreAnnotations.AnswerAnnotation.class, boundary);
                label.set(CoreAnnotations.SpaceBeforeAnnotation.class, boundary);
                label.set(CoreAnnotations.GoldAnswerAnnotation.class, boundary);

                label.set(CoreAnnotations.PositionAnnotation.class, Sighan2005DocumentReaderAndWriter.intern(String.valueOf(position)));
                ++position;
                labels.add(label);
            }

            final String nonspaceLine = compact.toString();
            if (opts.dictionary != null || opts.serializedDictionary != null) {
                Sighan2005DocumentReaderAndWriter.addDictionaryFeatures(outer.cdict, CoreAnnotations.LBeginAnnotation.class, CoreAnnotations.LMiddleAnnotation.class, CoreAnnotations.LEndAnnotation.class, nonspaceLine, labels);
            }
            if (opts.dictionary2 != null) {
                Sighan2005DocumentReaderAndWriter.addDictionaryFeatures(outer.cdict2, CoreAnnotations.D2_LBeginAnnotation.class, CoreAnnotations.D2_LMiddleAnnotation.class, CoreAnnotations.D2_LEndAnnotation.class, nonspaceLine, labels);
            }
            return labels;
        }

        /** True for characters that produce no label: whitespace and ISO control characters. */
        private boolean isSkipped(char c) {
            return Character.isWhitespace(c) || Character.isISOControl(c);
        }
    }
}

