package eu.fbk.dh.kd;

import com.google.common.base.Joiner;
import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.ling.TaggedWord;
import edu.stanford.nlp.process.Morphology;
import edu.stanford.nlp.tagger.maxent.MaxentTagger;
import eu.fbk.dh.kd.lib.KD_configuration;
import eu.fbk.dh.kd.lib.KD_core;
import eu.fbk.dh.kd.lib.KD_core.Language;

import eu.fbk.dh.kd.lib.KD_core.Threads;

import eu.fbk.dh.kd.lib.KD_configuration.ColumExtraction;
import eu.fbk.dh.kd.lib.KD_configuration.Group;
import eu.fbk.dh.kd.lib.KD_keyconcept;
import eu.fbk.dh.kd.lib.KD_loader;
import org.apache.commons.cli.*;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;


import java.io.*;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;


/**
 * eu.fbk.dh.kd.Main runnable class.
 * <p>
 * Command-line front end: it fills a {@link KD_configuration} from the command-line options,
 * optionally pre-processes every input file with the bundled Stanford POS tagger, runs
 * {@link KD_core#extractExpressions} on each file and either prints the ranked key-concepts on
 * standard out or writes them to a {@code .tsv} file next to the input file.
 * Please refer to the help ({@code -h}) for more information about the parameters.
 *
 * @author Giovanni Moretti - DH Group FBK.
 */
public class Main {

    /** Program name shown in the usage banner. */
    private static final String PROGRAM_NAME = "KD_Keyphrase_Digger";

    /** Line width handed to the help formatter. */
    private static final int HELP_WIDTH = 500;

    /**
     * Program entry point.
     * <p>
     * Exits with status 0 after {@code --help}, {@code --version} or {@code --new_language};
     * exits with status 1 when the command line cannot be parsed, when an option carries an
     * invalid value, or when the Stanford tagger is requested for a language other than English.
     *
     * @param args command-line options followed by the files and/or directories to process;
     *             directories are searched recursively for {@code .txt} and {@code .txp} files
     */
    public static void main(String[] args) {

        KD_configuration configuration = new KD_configuration();
        configuration.numberOfConcepts = -1;
        configuration.max_keyword_length = 4;
        configuration.local_frequency_threshold = 2;
        configuration.prefer_specific_concept = KD_configuration.Prefer_Specific_Concept.MEDIUM;
        configuration.skip_proper_noun = false;
        configuration.skip_keyword_with_proper_noun = false;
        configuration.rerank_by_position = false;
        configuration.group_by = KD_configuration.Group.NONE;
        configuration.column_configuration = KD_configuration.ColumExtraction.TOKEN_POS_LEMMA;
        configuration.only_multiword = false;
        configuration.tagset = KD_configuration.Tagset.TEXTPRO;

        Language lang = Language.ENGLISH;

        int cores = Runtime.getRuntime().availableProcessors();
        Threads t = defaultThreads(cores);

        boolean useStanford = false;
        boolean save_stanford = false;
        boolean STDOUT = false;

        ///////////////////////////////// command line parser //////////////////////////////////////

        CommandLineParser parser = new PosixParser();
        Options options = buildOptions();

        CommandLine line = null;
        try {
            line = parser.parse(options, args);

            //---------------------------------------boolean values
            if (line.hasOption("help")) {
                printUsage(options);
                System.exit(0);
            }

            if (line.hasOption("version")) {
                System.out.println("\n" + KD_core.getVersion() + "\n");
                System.exit(0);
            }

            // Flags only ever switch a setting on; an absent flag leaves the current value alone.
            if (line.hasOption("no_rerank")) {
                configuration.no_rerank = true;
            }
            if (line.hasOption("use_pattern_weight")) {
                configuration.use_pattern_weight = true;
            }
            if (line.hasOption("boost_acronyms")) {
                configuration.boost_acronyms = true;
            }
            if (line.hasOption("no_abstract_keyconcept")) {
                configuration.no_abstract = true;
            }
            if (line.hasOption("no_idf")) {
                configuration.no_idf = true;
            }
            if (line.hasOption("no_synonyms")) {
                configuration.no_syn = true;
            }
            if (line.hasOption("rerank_by_position")) {
                configuration.rerank_by_position = true;
            }
            if (line.hasOption("capitalize_pos")) {
                configuration.capitalize_pos = true;
            }
            if (line.hasOption("verbose")) {
                configuration.verbose = true;
            }
            if (line.hasOption("use_lucene")) {
                configuration.use_lucene = true;
            }
            if (line.hasOption("skip_frequency_absorption")) {
                configuration.skipFrequencyAbsorption = true;
            }
            if (line.hasOption("only_multiword")) {
                configuration.only_multiword = true;
            }
            if (line.hasOption("skip_proper_noun")) {
                configuration.skip_proper_noun = true;
            }
            if (line.hasOption("skip_keyword_with_proper_noun")) {
                configuration.skip_keyword_with_proper_noun = true;
            }
            if (line.hasOption("use_stanford")) {
                useStanford = true;
            }
            if (line.hasOption("save_stanford")) {
                save_stanford = true;
            }
            if (line.hasOption("standard_out")) {
                STDOUT = true;
            }
            //-------------------------------------end boolean values

            //-------------------------------------properties values
            if (line.hasOption("number_of_threads")) {
                try {
                    t = Threads.valueOf(upper(line.getOptionValue("number_of_threads")));
                } catch (Exception e) {
                    exitWithOptionError("number_of_threads", options);
                }
            }

            if (line.hasOption("lang_folder")) {
                configuration.languagePackPath = line.getOptionValue("lang_folder");
            }

            // Handled after number_of_threads and lang_folder because it uses both settings.
            if (line.hasOption("new_language")) {
                try {
                    String languageName = upper(line.getOptionValue("new_language"));
                    KD_core kxc = new KD_core(t);
                    kxc.createNewEmptyLanguage(languageName, configuration);
                    System.out.println("\nThe new \"" + languageName + "\" language has been added to the languages.\nTo use it please specify \"CUSTOM_" + languageName + "\" in the language parameter.");
                    System.exit(0);
                } catch (Exception e) {
                    exitWithOptionError("new_language", options);
                }
            }

            if (line.hasOption("prefer_specific_concept")) {
                try {
                    configuration.prefer_specific_concept = KD_configuration.Prefer_Specific_Concept.valueOf(upper(line.getOptionValue("prefer_specific_concept")));
                } catch (Exception e) {
                    exitWithOptionError("prefer_specific_concept", options);
                }
            }

            if (line.hasOption("column_configuration")) {
                try {
                    String columnValue = line.getOptionValue("column_configuration");
                    if (upper(columnValue).startsWith("CUSTOM_")) {
                        // Expected form: CUSTOM_<token>,<lemma>,<pos>
                        configuration.column_configuration = ColumExtraction.CUSTOM;
                        String[] positions = columnValue.split("_")[1].split(",");
                        configuration.token_position = parseColumnIndex(positions[0]);
                        configuration.lemma_position = parseColumnIndex(positions[1]);
                        configuration.pos_position = parseColumnIndex(positions[2]);
                    } else {
                        configuration.column_configuration = ColumExtraction.valueOf(upper(columnValue));
                    }
                } catch (Exception e) {
                    exitWithOptionError("column_configuration", options);
                }
            }

            if (line.hasOption("number_of_concept")) {
                try {
                    configuration.numberOfConcepts = Integer.parseInt(line.getOptionValue("number_of_concept"));
                } catch (Exception e) {
                    exitWithOptionError("number_of_concept", options);
                }
            }

            if (line.hasOption("max_keyword_length")) {
                try {
                    configuration.max_keyword_length = Integer.parseInt(line.getOptionValue("max_keyword_length"));
                } catch (Exception e) {
                    exitWithOptionError("max_keyword_length", options);
                }
            }

            if (line.hasOption("local_frequency_threshold")) {
                try {
                    configuration.local_frequency_threshold = Integer.parseInt(line.getOptionValue("local_frequency_threshold"));
                } catch (Exception e) {
                    exitWithOptionError("local_frequency_threshold", options);
                }
            }

            if (line.hasOption("group")) {
                try {
                    configuration.group_by = Group.valueOf(upper(line.getOptionValue("group")));
                } catch (Exception e) {
                    exitWithOptionError("group", options);
                }
            }

            if (line.hasOption("language")) {
                try {
                    // "<LANGUAGE>" or "<LANGUAGE>_<custom name>": the first part selects the enum
                    // constant, the optional remainder is handed to set_Custom_Language.
                    String[] languageParts = upper(line.getOptionValue("language")).split("_", 2);
                    lang = Language.valueOf(languageParts[0]);
                    if (languageParts.length > 1) {
                        lang.set_Custom_Language(languageParts[1]);
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                    exitWithOptionError("language", options);
                }
            }

            if (line.hasOption("tagset")) {
                try {
                    configuration.tagset = KD_configuration.Tagset.valueOf(upper(line.getOptionValue("tagset")));
                } catch (Exception e) {
                    exitWithOptionError("tagset", options);
                }
            }

        } catch (Exception exp) {
            // Typically a parse failure (unknown option, missing argument). Continuing would leave
            // "line" null and fail later with a NullPointerException, so stop here instead.
            System.out.println("Unexpected exception:" + exp.getMessage());
            printUsage(options);
            System.exit(1);
            return;
        }

        if (lang != Language.ENGLISH && useStanford) {
            System.err.println("Please specify english as language if you want to use the integrated Stanford POS Tagger");
            System.exit(1);
        }

        //////////////////////////////////// end command line parser////////////////////////////////
        KD_loader.run_the_updater(lang, configuration.languagePackPath);

        System.out.println("Processor detected: " + cores + " used " + t.toString().toLowerCase());
        MaxentTagger tagger = null;
        Morphology morpha = new Morphology();
        if (useStanford) {
            System.out.println("Load Stanford Model");

            // The Stanford pre-processing below emits token<TAB>lemma<TAB>pos lines.
            System.out.println("Override column configuration");
            configuration.column_configuration = ColumExtraction.CUSTOM;
            configuration.token_position = 0;
            configuration.pos_position = 2;
            configuration.lemma_position = 1;

            System.out.println("Force tagset to STANFORD");
            configuration.tagset = KD_configuration.Tagset.STANFORD;
            tagger = new MaxentTagger(configuration.languagePackPath + "/" + lang.name() + "/tagger/stanford_model/english-bidirectional-distsim.tagger");
        }

        @SuppressWarnings("unchecked")
        List<String> filePaths = line.getArgList();

        for (String filePath : filePaths) {
            for (File f : collectInputFiles(filePath)) {
                System.out.println("Starting the extraction for file:" + FilenameUtils.getBaseName(f.getAbsolutePath()));
                long startTime = System.currentTimeMillis();

                KD_core kxc = new KD_core(t);

                // Stays empty unless the Stanford tagger is used.
                StringBuffer processed_filecontent = new StringBuffer();
                if (useStanford) {
                    if (configuration.verbose) {
                        System.out.print("Start Stanford preprocessing....");
                    }
                    long startTimeStanford = System.currentTimeMillis();
                    processed_filecontent = tagWithStanford(f, tagger, morpha);
                    long estimatedTimeStanford = System.currentTimeMillis() - startTimeStanford;
                    if (configuration.verbose) {
                        System.out.println("End Stanford preprocessing in : " + estimatedTimeStanford);
                    }

                    if (save_stanford) {
                        writeUtf8(siblingPath(f, "_stanford.tsv"), processed_filecontent.toString());
                    }
                }

                LinkedList<KD_keyconcept> concept_list = kxc.extractExpressions(lang, configuration, f.getAbsolutePath(), processed_filecontent);

                if (STDOUT) {
                    String report = formatForStdout(concept_list, configuration);
                    if (report.length() > 0) {
                        // Drop the trailing newline; println adds its own.
                        System.out.println(report.substring(0, report.length() - 1));
                    } else {
                        System.out.println("Mmmmm no keywords extracted... That's strange....");
                    }
                } else {
                    writeUtf8(siblingPath(f, ".tsv"), formatForTsv(concept_list));
                }

                long estimatedTime = System.currentTimeMillis() - startTime;
                System.out.println("Finished in: " + estimatedTime + " milliseconds\n");
            }
        }
    }

    /**
     * Picks the default thread setting for the detected number of processors.
     *
     * @param cores number of available processors
     * @return the constant matching 1, 2, 4, 6, 8, 10 or 12 cores; {@code Threads.TWO} for any
     *         other count
     */
    private static Threads defaultThreads(int cores) {
        switch (cores) {
            case 1:
                return Threads.ONE;
            case 2:
                return Threads.TWO;
            case 4:
                return Threads.FOUR;
            case 6:
                return Threads.SIX;
            case 8:
                return Threads.EIGHT;
            case 10:
                return Threads.TEN;
            case 12:
                return Threads.TWELVE;
            default:
                return Threads.TWO;
        }
    }

    /**
     * Declares every command-line option understood by the program.
     *
     * @return the populated option set
     */
    @SuppressWarnings("static-access")
    private static Options buildOptions() {
        Options options = new Options();

        options.addOption(OptionBuilder.withLongOpt("number_of_concept").withDescription("number of output keywords").withArgName("Integer").withType(Integer.class).hasArg().create("n"));

        options.addOption(OptionBuilder.withLongOpt("max_keyword_length").withDescription("maximum length of multi-word expressions").withArgName("Integer").withType(Integer.class).hasArg().create("m"));

        options.addOption(OptionBuilder.withLongOpt("local_frequency_threshold").withDescription("min number of occurrences in a text").withArgName("Integer").withType(Integer.class).hasArg().create("l"));

        options.addOption(OptionBuilder.withLongOpt("number_of_threads").withDescription("number of threads used by the program").withType(Integer.class).withArgName("ONE | TWO | FOUR | SIX | EIGHT | TEN | TWELVE").hasArg().create("t"));

        options.addOption(OptionBuilder.withLongOpt("prefer_specific_concept").withDescription("give a boost to more specific key-concept (multi-word)").withArgName("NO | WEAK | MEDIUM | STRONG | MAX").hasArg().create("p"));

        options.addOption(OptionBuilder.withLongOpt("column_configuration").withDescription("specify the input file column configuration\neg: CUSTOM_0,9,6 token is 0, lemma is 9, pos is 6").withArgName("TOKEN_LEMMA_POS | TOKEN_POS_LEMMA | CUSTOM_#token,#lemma,#pos").hasArg().create("c"));

        options.addOption(OptionBuilder.withLongOpt("language").withDescription("specify the language of the input file").withArgName("ENGLISH | ITALIAN | CUSTOM | CUSTOM_").hasArg().create("lang"));

        options.addOption(OptionBuilder.withLongOpt("tagset").withDescription("specify the tagset of the pos tagger (default is TEXTPRO)").withArgName("TEXTPRO | STANFORD | TREETAGGER | CUSTOM").hasArg().create("ts"));

        options.addOption(OptionBuilder.withLongOpt("group").withDescription("set the group configuration").withArgName("NONE | BY_LIST | BY_STEM | ALL_LEMMA").hasArg().create("g"));

        options.addOption(OptionBuilder.withLongOpt("lang_folder").withDescription("set the language folder path").withArgName("Path to the folder").hasArg().create("lp"));

        options.addOption(OptionBuilder.withLongOpt("new_language").withDescription("create new empty language folder").withArgName("Language name").hasArg().create("nl"));

        options.addOption("STDOUT", "standard_out", false, "print results on standard out");
        options.addOption("h", "help", false, "print this message");
        options.addOption("om", "only_multiword", false, "display only multi-words");
        options.addOption("luc", "use_lucene", false, "use lucene ranking score");
        options.addOption("fas", "skip_frequency_absorption", false, "skip frequency absorption");
        options.addOption("wp", "use_pattern_weight", false, "use the weight of pattern");
        options.addOption("ba", "boost_acronyms", false, "boost acronyms (for scientific articles)");
        options.addOption("v", "verbose", false, "verbose output");
        options.addOption("us", "use_stanford", false, "use included stanford pos tagger (only english)");
        options.addOption("ve", "version", false, "print version and exit");
        options.addOption("s", "skip_proper_noun", false, "skip proper nouns");
        options.addOption("sk", "skip_keyword_with_proper_noun", false, "skip keyword with proper nouns");
        options.addOption("r", "rerank_by_position", false, "give a boost to key-concepts on the top of the document");
        options.addOption("ns", "no_synonyms", false, "disable the synonym resolution");
        options.addOption("ss", "save_stanford", false, "save stanford preprocessed file");
        options.addOption("nr", "no_rerank", false, "disable the re-rank function");
        options.addOption("cp", "capitalize_pos", false, "capitalize token with specified pos");
        options.addOption("nabs", "no_abstract_keyconcept", false, "disable the boost on abstract key-concepts");
        options.addOption("nidf", "no_idf", false, "disable the boost by the idf value");

        return options;
    }

    /**
     * Prints the usage text for the given options on standard out.
     *
     * @param options the option set to describe
     */
    private static void printUsage(Options options) {
        HelpFormatter formatter = new HelpFormatter();
        formatter.setWidth(HELP_WIDTH);
        formatter.printHelp(PROGRAM_NAME, options);
    }

    /**
     * Reports an invalid option value, prints the usage text and terminates with status 1.
     *
     * @param optionName long name of the offending option
     * @param options    the option set to describe in the usage text
     */
    private static void exitWithOptionError(String optionName, Options options) {
        System.out.println("\nerror: Wrong value for the option " + optionName + "\n");
        printUsage(options);
        System.exit(1);
    }

    /**
     * Upper-cases an option value independently of the default locale, so that the result can be
     * matched against enum constant names (a Turkish default locale would otherwise map "i" to a
     * dotted capital I).
     *
     * @param value the raw option value; a {@code null} value raises a NullPointerException
     * @return the upper-cased value
     */
    private static String upper(String value) {
        return value.toUpperCase(java.util.Locale.ROOT);
    }

    /**
     * Parses one column index of a {@code CUSTOM_} column configuration, ignoring surrounding
     * blanks and curly braces.
     *
     * @param raw the textual column index
     * @return the parsed index
     * @throws NumberFormatException if the remaining text is not an integer
     */
    private static int parseColumnIndex(String raw) {
        return Integer.parseInt(raw.trim().replace("{", "").replace("}", ""));
    }

    /**
     * Expands one command-line path into the files to process.
     *
     * @param filePath a path given on the command line
     * @return the {@code .txt}/{@code .txp} files found recursively when the path is a directory,
     *         the file itself when it is a regular file, otherwise an empty list (a warning is
     *         printed on standard error)
     */
    private static List<File> collectInputFiles(String filePath) {
        File input = new File(filePath);
        if (input.isDirectory()) {
            String[] extensions = new String[]{"txt", "txp"};
            // Copy instead of casting: listFiles is only declared to return a Collection.
            return new ArrayList<File>(FileUtils.listFiles(input, extensions, true));
        }
        List<File> files = new ArrayList<File>();
        if (input.isFile()) {
            files.add(input);
        } else {
            System.err.println("warning: skipping " + filePath + " (not a file or directory)");
        }
        return files;
    }

    /**
     * Tokenizes, POS-tags and stems a UTF-8 text file with the Stanford tools.
     * <p>
     * Each token becomes one {@code token<TAB>stem<TAB>tag} line and every sentence is followed
     * by an empty line. Failures are reported on standard error and whatever was produced up to
     * that point is returned, so the caller can still attempt an extraction.
     *
     * @param f      the file to read
     * @param tagger the loaded Stanford tagger
     * @param morpha the Stanford morphology helper used for stemming
     * @return the tab-separated content (possibly empty or partial)
     */
    private static StringBuffer tagWithStanford(File f, MaxentTagger tagger, Morphology morpha) {
        StringBuffer tagged = new StringBuffer();
        try (BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"))) {
            List<List<HasWord>> sentences = MaxentTagger.tokenizeText(reader);
            for (List<HasWord> sentence : sentences) {
                List<TaggedWord> tSentence = tagger.tagSentence(sentence);
                for (TaggedWord tw : tSentence) {
                    tagged.append(tw.word()).append("\t").append(morpha.stem(tw)).append("\t").append(tw.tag()).append("\n");
                }
                tagged.append("\n");
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return tagged;
    }

    /**
     * Builds the path of an output file located next to the given file, named after its base
     * name plus a suffix.
     *
     * @param f      the input file
     * @param suffix the text appended to the base name, extension included (e.g. {@code ".tsv"})
     * @return the output path
     */
    private static String siblingPath(File f, String suffix) {
        String absolutePath = f.getAbsolutePath();
        return FilenameUtils.getFullPath(absolutePath) + FilenameUtils.getBaseName(absolutePath) + suffix;
    }

    /**
     * Writes text to a file as UTF-8, replacing any existing file. I/O failures are reported on
     * standard error and do not abort the run.
     *
     * @param path    destination file path
     * @param content text to write
     */
    private static void writeUtf8(String path, String content) {
        try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path), "UTF-8"))) {
            out.write(content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Renders the ranked key-concepts in the verbose standard-out format, one line per concept.
     * The displayed form depends on the grouping mode: lemmas for ALL_LEMMA, stems for BY_STEM,
     * tokens for NONE and BY_LIST. Concepts are skipped (but still counted in the rank) for any
     * other grouping mode.
     *
     * @param concepts      the extracted key-concepts, in rank order
     * @param configuration the active configuration; only {@code group_by} is read
     * @return the report, each line terminated by a newline; empty when nothing was rendered
     */
    private static String formatForStdout(List<KD_keyconcept> concepts, KD_configuration configuration) {
        StringBuilder output = new StringBuilder();
        int r = 1;
        for (KD_keyconcept k : concepts) {
            String label;
            switch (configuration.group_by) {
                case ALL_LEMMA:
                    label = Joiner.on(" ").join(k.getLemmaArray());
                    break;
                case BY_STEM:
                    label = Joiner.on(" ").join(k.getStemArray());
                    break;
                case NONE:
                case BY_LIST:
                    label = Joiner.on(" ").join(k.getTokenArray());
                    break;
                default:
                    label = null;
                    break;
            }
            if (label != null) {
                output.append(r + "." + "\t" + label + " (" + k.getSysnonyms() + ")\tfrequency: " + k.frequency + " ,score: " + k.score + " idf: " + k.getIdf() + " boost " + k.getScoreBoost() + " pattern_boost " + k.getPatternBoost() + " chain_l " + k.getTokenChainLength() + " stem: " + k.getStemArray().toString() + "\n");
            }
            r++;
        }
        return output.toString();
    }

    /**
     * Renders the ranked key-concepts as tab-separated text with a header row.
     *
     * @param concepts the extracted key-concepts, in rank order
     * @return the TSV content (rank, keyword, synonyms, score, frequency)
     */
    private static String formatForTsv(List<KD_keyconcept> concepts) {
        StringBuilder output = new StringBuilder();
        output.append("rank\tkeyword\tsynonyms\tscore\tfrequency\n");
        int r = 1;
        for (KD_keyconcept k : concepts) {
            output.append(r + "\t" + k.getString() + "\t" + k.getSysnonyms() + "\t" + k.score + "\t" + k.frequency + "\n");
            r++;
        }
        return output.toString();
    }

}