Beispiel #1
        /// <summary>
        /// Classifies <paramref name="Text"/> with an NTextCat legacy <c>LanguageIdentifier</c> whose
        /// models are loaded from the server-mapped "/Wikipedia-Experimental-UTF8Only/" directory,
        /// then picks the first classification result.
        /// </summary>
        /// <param name="Text">The text whose language should be detected.</param>
        /// <remarks>
        /// NOTE(review): the braces and the remainder of the body (what happens when a result is
        /// found, and what is returned otherwise) are not visible in this excerpt.
        /// </remarks>
        private string DetectLanguage(string Text)
            // A new identifier is constructed from the model directory on every call.
            var languageIdentifier         = new LanguageIdentifier(Server.MapPath("/Wikipedia-Experimental-UTF8Only/"));
            // NOTE(review): 100 is presumably the "too many languages" threshold — confirm against
            // the LanguageIdentifierSettings constructor.
            var languageIdentifierSettings = new IvanAkcheurov.NTextCat.Lib.Legacy.LanguageIdentifier.LanguageIdentifierSettings(100);

            // Materialize the results once so they can be inspected without re-running classification.
            var languages           = languageIdentifier.ClassifyText(Text, languageIdentifierSettings).ToList();
            // FirstOrDefault yields null for an empty result list.
            // Presumably results are ordered best match first — TODO confirm.
            var mostCertainLanguage = languages.FirstOrDefault();

            // Only proceed when at least one language was returned (branch body not visible here).
            if (mostCertainLanguage != null)
        /// <summary>
        /// Command-line entry point. Reads default thresholds from the application configuration,
        /// declares the command-line options, and then either trains a language model from a file or
        /// standard input (writing the model to standard output) or classifies text given as an
        /// argument, per input line, or as the whole of standard input.
        /// </summary>
        /// <param name="args">Command-line arguments.</param>
        /// <remarks>
        /// NOTE(review): this excerpt has lost its braces and several statements (for example the
        /// opening <c>.Add("...",</c> line of most options, the <c>try</c> that pairs with the
        /// <c>catch</c>, some <c>else</c> keywords, and whatever consumes the classification results).
        /// The comments below describe only what is visible.
        /// </remarks>
        static void Main(string[] args)
            //MemoryStream s = new MemoryStream();
            // Defaults are read from appSettings and parsed with XmlConvert.
            // NOTE(review): the "Occurance" spelling is part of the configuration key; it must match the config file.
            double defaultWorstAcceptableThreshold  = XmlConvert.ToDouble(ConfigurationManager.AppSettings["WorstAcceptableThreshold"]);
            int    defaultTooManyLanguagesThreshold = XmlConvert.ToInt32(ConfigurationManager.AppSettings["TooManyLanguagesThreshold"]);
            string defaultLanguageModelsDirectory   = ConfigurationManager.AppSettings["LanguageModelsDirectory"];
            int    defaultOccuranceNumberThreshold  = XmlConvert.ToInt32(ConfigurationManager.AppSettings["OccuranceNumberThreshold"]);
            int    defaultMaximumSizeOfDistribution = XmlConvert.ToInt32(ConfigurationManager.AppSettings["MaximumSizeOfDistribution"]);

            // Option state, seeded with the configured defaults and overwritten by the option callbacks below.
            bool   opt_help                      = false;
            bool   opt_train                     = false;
            string opt_trainOnFile               = null;
            string opt_classifyFromArgument      = null;
            bool   opt_classifyFromInputPerLine  = false;
            double opt_WorstAcceptableThreshold  = defaultWorstAcceptableThreshold;
            int    opt_TooManyLanguagesThreshold = defaultTooManyLanguagesThreshold;
            string opt_LanguageModelsDirectory   = defaultLanguageModelsDirectory;
            int    opt_OccuranceNumberThreshold  = defaultOccuranceNumberThreshold;
            // Presumably long.MaxValue means "no line limit" — TODO confirm in ByteToUInt64NGramExtractor.
            long   opt_OnlyReadFirstNLines       = long.MaxValue;
            int    opt_MaximumSizeOfDistribution = defaultMaximumSizeOfDistribution;
            bool   opt_verbose                   = false;
            bool   opt_noPrompt                  = false;

            // Option declarations. Each entry pairs help text with a callback that stores the parsed value.
            OptionSet option_set = new OptionSet()

                                   .Add("?|help|h", "Prints out the options.", option => opt_help = option != null)

                                   // NOTE(review): assuming NDesk/Mono.Options syntax, the trailing ':' makes the
                                   // value optional, which fits the fall-back to standard input further down — confirm.
                                   .Add("n|train:", "Trains from the file specified or input stream.",
                                        option =>
                opt_train       = true;
                opt_trainOnFile = option;
                                        // Per-line classification flag (its .Add(...) opener is not visible here).
                                        @"Determine language of each line of input.",
                                        option => opt_classifyFromInputPerLine = option != null)
                                        // Too-many-languages threshold; the help text embeds the configured defaults.
                                        @"the program returns the best-scoring language together" + Environment.NewLine +
                                        @"with all languages which are " + defaultWorstAcceptableThreshold + @" times worse (cf option -u). " + Environment.NewLine +
                                        @"If the number of languages to be printed is larger than the value " + Environment.NewLine +
                                        @"of this option (default: " + defaultTooManyLanguagesThreshold + @") then no language is returned, but" + Environment.NewLine +
                                        @"instead a message that the input is of an unknown language is" + Environment.NewLine +
                                        @"printed. Default: " + defaultTooManyLanguagesThreshold + @".",
                                        (int option) => opt_TooManyLanguagesThreshold = option)
                                        // Directory that holds the language model files.
                                        @"indicates in which directory the language models are" + Environment.NewLine +
                                        @"located (files ending in .lm). Currently only a single" + Environment.NewLine +
                                        @"directory is supported. Default: """ + defaultLanguageModelsDirectory + @""".",
                                        option => opt_LanguageModelsDirectory = option)
                                        // N-gram occurrence cut-off applied before sorting (per the help text).
                                        @"Before sorting is performed the Ngrams which occur this number" + Environment.NewLine +
                                        @"of times or less are removed. This can be used to speed up" + Environment.NewLine +
                                        @"the program for longer inputs. For short inputs you should use" + Environment.NewLine +
                                        @"-f 0." + Environment.NewLine +
                                        @"Default: " + defaultOccuranceNumberThreshold + @".",
                                        (int option) => opt_OccuranceNumberThreshold = option)
                                        // Line limit. The callback receives an int although the variable is a long,
                                        // so values supplied on the command line are limited to the int range.
                                        @"only read first N lines",
                                        (int option) => opt_OnlyReadFirstNLines = option)
                                        // Text to classify, passed directly as the option value.
                                        @"indicates that input is given as an argument on the command line," + Environment.NewLine +
                                        @"e.g. text_cat -l ""this is english text""" + Environment.NewLine +
                                        @"Cannot be used in combination with -n.",
                                        option => opt_classifyFromArgument = option)
                                        // Maximum number of n-grams kept or compared.
                                        @"indicates the topmost number of ngrams that should be used." + Environment.NewLine +
                                        @"If used in combination with -n this determines the size of the" + Environment.NewLine +
                                        @"output. If used with categorization this determines" + Environment.NewLine +
                                        @"the number of ngrams that are compared with each of the language" + Environment.NewLine +
                                        @"models (but each of those models is used completely)." + Environment.NewLine +
                                        @"Default: " + defaultMaximumSizeOfDistribution + @".",
                                        (int option) => opt_MaximumSizeOfDistribution = option)
                                        // Worst acceptable score ratio for alternatives to be reported.
                                        @"determines how much worse result must be in order not to be" + Environment.NewLine +
                                        "mentioned as an alternative. Typical value: 1.05 or 1.1. " + Environment.NewLine +
                                        "Default: " + defaultWorstAcceptableThreshold + @".",
                                        (double option) => opt_WorstAcceptableThreshold = option)
                                        // Verbose flag. NOTE(review): opt_verbose is not read anywhere in this excerpt.
                                        @"verbose. Continuation messages are written to standard error.",
                                        option => opt_verbose = option != null)
                                        // Flag that suppresses the interactive input prompts used below.
                                        @"prevents text input prompt from being shown.",
                                        option => opt_noPrompt = option != null);

            // NOTE(review): the try block this catch belongs to (presumably the call that parses args)
            // is not visible in this excerpt.
            catch (OptionException ex)
                // NOTE(review): "occured" is misspelled in this user-facing message ("occurred").
                Console.WriteLine("Error occured: " + ex.ToString());

            // Help requested (the branch body is not visible here).
            if (opt_help)

            // Training mode: build a language model from the input and write it to standard output.
            if (opt_train)
                LanguageModel <ulong> langaugeModel;
                Stream input;
                // Without a file name, train from standard input (showing a prompt unless suppressed);
                // otherwise open the named file. The separating 'else' is missing from this excerpt.
                if (string.IsNullOrEmpty(opt_trainOnFile))
                    if (!opt_noPrompt)
                        DisplayInputPrompt("Train from text input");
                    input = Console.OpenStandardInput();
                    input = File.OpenRead(opt_trainOnFile);
                // Dispose the input stream once the tokens have been consumed into the model.
                using (input)
                    // NOTE(review): 5 is presumably the maximum n-gram length; the same literal is passed to
                    // LanguageIdentifierSettings in the classification path — confirm they must match.
                    IEnumerable <UInt64> tokens = new ByteToUInt64NGramExtractor(5, opt_OnlyReadFirstNLines).GetFeatures(input);
                    langaugeModel = new LanguageModel <UInt64>(
                        LanguageModelCreator.CreateLangaugeModel(tokens, opt_OccuranceNumberThreshold, opt_MaximumSizeOfDistribution),
                        new LanguageInfo(null, null, null, null) /*API should ask about language*/);
                // The trained model is serialized to standard output.
                using (Stream standardOutput = Console.OpenStandardOutput())
                    new ByteLanguageModelPersister().Save(langaugeModel, standardOutput);
                // Classification mode (the 'else' introducing it is not visible in this excerpt).
                var languageIdentifier = new LanguageIdentifier(opt_LanguageModelsDirectory, opt_MaximumSizeOfDistribution);
                var settings           = new LanguageIdentifier.LanguageIdentifierSettings(
                    opt_TooManyLanguagesThreshold, opt_OccuranceNumberThreshold, opt_OnlyReadFirstNLines,
                    opt_WorstAcceptableThreshold, 5);
                // Three input sources: the option value, each line of standard input, or all of standard input.
                // NOTE(review): the statements that consume 'languages' are not visible in any of the branches.
                if (opt_classifyFromArgument != null)
                    var languages = languageIdentifier.ClassifyText(opt_classifyFromArgument, settings);
                else if (opt_classifyFromInputPerLine)
                    if (!opt_noPrompt)
                        DisplayInputPrompt("Classify each line from text input");
                    using (Stream input = Console.OpenStandardInput())
                        // suboptimal read performance, but per-line mode is not intended to be used in heavy scenarios
                        // Lines are delimited by the bytes 0xD (CR) and 0xA (LF).
                        // NOTE(review): the meaning of the 'true' argument to Split is not visible here.
                        foreach (IEnumerable <byte> line in Split <byte>(EnumerateAllBytes(input), true, 0xD, 0xA))
                            // Each line is copied into its own in-memory stream for classification.
                            using (var linestream = new MemoryStream(line.ToArray()))
                                var languages = languageIdentifier.ClassifyBytes(linestream, null, settings);
                    // Fallback: classify the whole of standard input (the 'else' keyword is missing from this excerpt).
                    if (!opt_noPrompt)
                        DisplayInputPrompt("Classify text input");
                    using (Stream input = Console.OpenStandardInput())
                        var languages = languageIdentifier.ClassifyBytes(input, null, settings);