Ejemplo n.º 1
0
        /// <summary>
        /// Returns possible languages of text contained in <paramref name="input"/> or empty sequence if too uncertain.
        /// </summary>
        /// <param name="input">stream with the bytes of the text to classify.
        /// The stream is owned by the caller and is not disposed by this method.</param>
        /// <param name="encoding">encoding of text contained in <paramref name="input"/> or null if encoding is unknown beforehand.
        /// <para> When encoding is not null, for performance and quality reasons
        /// please make sure that <see cref="LanguageIdentifier"/> is created with
        /// languageModelsDirectory parameter of constructor pointing to models
        /// built from UTF8 encoded files (models from folder "Wikipedia-Experimental-UTF8Only")</para></param>
        /// <param name="settings">null for default settings</param>
        /// <returns>
        /// Candidate languages paired with the score reported by the classifier. Only candidates whose score
        /// does not exceed the score of the first candidate multiplied by
        /// <c>settings.WorstAcceptableThreshold</c> are returned. An empty sequence is returned when the
        /// classifier yields no candidates, when no candidate passes that filter, or when more than
        /// <c>settings.TooManyLanguagesThreshold</c> candidates pass it.
        /// </returns>
        public IEnumerable<Tuple<LanguageInfo, double>> ClassifyBytes(Stream input, Encoding encoding = null, LanguageIdentifierSettings settings = null)
        {
            if (encoding != null && encoding != Encoding.UTF8)
            {
                // we can afford to not dispose TextReaderStream wrapper as it doesn't contain unmanaged resources
                // we do not own base stream passed so we cannot close it
                input = new TextReaderStream(new StreamReader(input, encoding), Encoding.UTF8); // decodes stream into UTF8 from any other encoding
                // todo: restrict to searching among UTF8 language models only
            }
            if (settings == null)
            {
                settings = new LanguageIdentifierSettings();
            }

            IEnumerable<UInt64> tokens =
                new ByteToUInt64NGramExtractor(settings.MaxNgramLength, settings.OnlyReadFirstNLines)
                .GetFeatures(input);
            var langaugeModel = LanguageModelCreator.CreateLangaugeModel(
                tokens, settings.OccuranceNumberThreshold, _maximumSizeOfDistribution);

            List<Tuple<LanguageInfo, double>> result = _classifier.Classify(langaugeModel).ToList();
            if (result.Count == 0)
            {
                // Nothing to compare against (e.g. no language models were loaded):
                // report "unknown" instead of failing on the first-element lookup below.
                return Enumerable.Empty<Tuple<LanguageInfo, double>>();
            }

            // The first candidate is treated as the best match; all others are measured against it.
            double leastDistance = result[0].Item2;
            List<Tuple<LanguageInfo, double>> acceptableResults =
                result.Where(t => t.Item2 <= leastDistance * settings.WorstAcceptableThreshold).ToList();

            if (acceptableResults.Count == 0 || acceptableResults.Count > settings.TooManyLanguagesThreshold)
            {
                return Enumerable.Empty<Tuple<LanguageInfo, double>>();
            }
            return acceptableResults;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Builds reference language models for English, Dutch and Russian from short sample texts
        /// and checks that a different text in each language is classified as that language.
        /// </summary>
        public void Test()
        {
            // Every sample text is converted to bytes using code page 1251.
            Encoding codePage1251 = Encoding.GetEncoding(1251);

            // Reference ("etalon") text and query text for English.
            string englishReferenceText =
                @"Many of his novels, with their recurrent concern for social reform, first appeared in magazines in serialised form, 
a popular format at the time. Unlike other authors who completed entire novels before serialisation, Dickens often created the episodes as they were being serialized. 
The practice lent his stories a particular rhythm, punctuated by cliffhangers to keep the public looking forward to the next installment.[2] 
The continuing popularity of his novels and short stories is such that they have never gone out of print.[3]";
            string englishQueryText =
                @"Shakespeare was born and raised in Stratford-upon-Avon. At the age of 18, he married Anne Hathaway, 
with whom he had three children: Susanna, and twins Hamnet and Judith. Between 1585 and 1592, he began a successful career in London as an actor, writer, 
and part owner of a playing company called the Lord Chamberlain's Men, later known as the King's Men. He appears to have retired to Stratford around 1613, 
where he died three years later. Few records of Shakespeare's private life survive, and there has been considerable speculation about such matters as his physical 
appearance, sexuality, religious beliefs, and whether the works attributed to him were written by others.
Shakespeare produced most of his known work between 1589 and 1613.[5][nb 4] His early plays were mainly comedies and histories, genres he 
raised to the peak of sophistication and artistry by the end of the 16th century. He then wrote mainly tragedies until about 1608, including Hamlet, 
King Lear, and Macbeth, considered some of the finest works in the English language. In his last phase, he wrote tragicomedies, also known as romances, 
and collaborated with other playwrights.";

            // Reference text and query text for Dutch.
            string dutchReferenceText =
                @"Dickens werd geboren als zoon van John Dickens en Elizabeth Barrow. Toen hij tien was, verhuisde de familie naar Londen. 
Door financiële moeilijkheden van zijn vader (hij werd wegens schulden in de gevangenis gezet) moest de jonge Charles enkele malen zijn school 
verlaten om te gaan werken. Zo belandde hij op twaalfjarige leeftijd in een schoensmeerfabriek waar hij tien uur per dag moest werken. 
De omstandigheden waaronder arbeiders moesten leven werden een belangrijk onderwerp in zijn latere werk.";
            string dutchQueryText =
                @"William Shakespeare (ook gespeld Shakspere, Shaksper, en Shake-speare, 
omdat de spelling in de Elizabethaanse periode niet absoluut was) werd geboren in Stratford-upon-Avon in Warwickshire, in april 1564. 
William was de zoon van John Shakespeare, een succesvolle handelaar en wethouder, en van Mary Arden, een dochter uit een adellijke familie. 
De Shakespeares woonden toen in Henley Street. Bekend is dat William op 26 april werd gedoopt. Omdat het destijds gebruikelijk was om een kind drie 
dagen na de geboorte te dopen, is Shakespeare waarschijnlijk op zondag 23 april geboren. Het huis in Stratford is bekend als 'de geboorteplaats van Shakespeare,' 
maar deze status is onzeker. Shakespeares vader was een welvarende handschoenenmaker en verkreeg vele titels tijdens zijn leven, met inbegrip van chamberlain,[1] 
wethouder, deurwaarder (equivalent van burgemeester), en eerste schepen. Later werd hij vervolgd voor deelname aan de zwarte markt in wol en verloor zijn positie 
als wethouder. Sommige gegevens wijzen op mogelijke roomse sympathieën aan beide kanten van het gezin - een gevaar onder de strenge anti-katholieke regels van 
koningin Elizabeth.";

            // Reference text and query text for Russian.
            string russianReferenceText =
                @"Его отец был довольно состоятельным чиновником, человеком весьма легкомысленным, но весёлым и добродушным, 
со вкусом пользовавшимся тем уютом, тем комфортом, которым так дорожила всякая зажиточная семья старой Англии. Своих детей и, 
в частности, своего любимца Чарли, мистер Диккенс окружил заботой и лаской.
Маленький Чарльз унаследовал от отца богатое воображение, лёгкость слова, по-видимому, присоединив к этому некоторую жизненную серьёзность, 
унаследованную от матери, на плечи которой падали все житейские заботы по сохранению благосостояния семьи.
Богатые способности мальчика восхищали родителей, и артистически настроенный отец буквально изводил своего сынишку, заставляя его разыгрывать 
разные сцены, рассказывать свои впечатления, импровизировать, читать стихи и т. д. Диккенс превратился в маленького актёра, преисполненного самовлюблённости и тщеславия.";
            string russianQueryText =
                @"Считается, что Шекспир учился в стратфордской «грамматической школе» (англ. «grammar school»), 
где получил серьёзное образование: стратфордский учитель латинского языка и словесности писал стихи на латыни. Некоторые 
учёные утверждают, что Шекспир посещал школу короля Эдуарда VI в Стратфорде-на-Эйвоне, где изучал творчество таких поэтов, 
как Овидий и Плавт[6], однако школьные журналы не сохранились[7], и теперь ничего нельзя сказать наверняка.";

            // One extractor instance is shared by all models, both reference and query.
            var extractor = new ByteToUInt64NGramExtractor(5);
            Func<string, IDistribution<UInt64>> buildModel = text =>
            {
                byte[] encoded = codePage1251.GetBytes(text);
                return LanguageModelCreator.CreateLangaugeModel(extractor.GetFeatures(encoded), 0, 400);
            };

            var classifier = new RankedClassifier<ulong>(400);

            // Register the reference models in the order: English, Dutch, Russian.
            var references = new[]
            {
                Tuple.Create("en", englishReferenceText),
                Tuple.Create("nl", dutchReferenceText),
                Tuple.Create("ru", russianReferenceText),
            };
            foreach (var reference in references)
            {
                classifier.AddEtalonLanguageModel(
                    new LanguageModel<ulong>(buildModel(reference.Item2), new LanguageInfo(reference.Item1, null, null, null)));
            }

            // The top-ranked candidate for each query must carry the expected language code.
            var expectations = new[]
            {
                Tuple.Create("nl", dutchQueryText),
                Tuple.Create("ru", russianQueryText),
                Tuple.Create("en", englishQueryText),
            };
            foreach (var expectation in expectations)
            {
                var topCandidate = classifier.Classify(buildModel(expectation.Item2)).First();
                Assert.AreEqual(expectation.Item1, topCandidate.Item1.Iso639_2T);
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Command-line entry point. Parses the options and then either trains a language model
        /// (option -n) and writes it to standard output, or classifies the language of text taken
        /// from the command line (-l), from each line of standard input (-s), or from the whole
        /// of standard input (default).
        /// </summary>
        /// <param name="args">command-line arguments; the supported options are registered below.</param>
        static void Main(string[] args)
        {
            // n-gram length used both when training and when classifying
            const int maxNgramLength = 5;

            // Defaults come from the application configuration (appSettings section).
            double defaultWorstAcceptableThreshold  = XmlConvert.ToDouble(ConfigurationManager.AppSettings["WorstAcceptableThreshold"]);
            int    defaultTooManyLanguagesThreshold = XmlConvert.ToInt32(ConfigurationManager.AppSettings["TooManyLanguagesThreshold"]);
            string defaultLanguageModelsDirectory   = ConfigurationManager.AppSettings["LanguageModelsDirectory"];
            int    defaultOccuranceNumberThreshold  = XmlConvert.ToInt32(ConfigurationManager.AppSettings["OccuranceNumberThreshold"]);
            int    defaultMaximumSizeOfDistribution = XmlConvert.ToInt32(ConfigurationManager.AppSettings["MaximumSizeOfDistribution"]);

            // Values filled in by the option callbacks during parsing.
            bool   opt_help                      = false;
            bool   opt_train                     = false;
            string opt_trainOnFile               = null;
            string opt_classifyFromArgument      = null;
            bool   opt_classifyFromInputPerLine  = false;
            double opt_WorstAcceptableThreshold  = defaultWorstAcceptableThreshold;
            int    opt_TooManyLanguagesThreshold = defaultTooManyLanguagesThreshold;
            string opt_LanguageModelsDirectory   = defaultLanguageModelsDirectory;
            int    opt_OccuranceNumberThreshold  = defaultOccuranceNumberThreshold;
            long   opt_OnlyReadFirstNLines       = long.MaxValue;
            int    opt_MaximumSizeOfDistribution = defaultMaximumSizeOfDistribution;
            bool   opt_verbose                   = false;
            bool   opt_noPrompt                  = false;

            OptionSet option_set = new OptionSet()
                .Add("?|help|h", "Prints out the options.", option => opt_help = option != null)
                .Add("n|train:", "Trains from the file specified or input stream.",
                     option =>
                     {
                         // the file name is optional; without it training reads standard input
                         opt_train       = true;
                         opt_trainOnFile = option;
                     })
                .Add("s",
                     @"Determine language of each line of input.",
                     option => opt_classifyFromInputPerLine = option != null)
                .Add("a=",
                     @"the program returns the best-scoring language together" + Environment.NewLine +
                     @"with all languages which are " + defaultWorstAcceptableThreshold + @" times worse (cf option -u). " + Environment.NewLine +
                     @"If the number of languages to be printed is larger than the value " + Environment.NewLine +
                     @"of this option (default: " + defaultTooManyLanguagesThreshold + @") then no language is returned, but" + Environment.NewLine +
                     @"instead a message that the input is of an unknown language is" + Environment.NewLine +
                     @"printed. Default: " + defaultTooManyLanguagesThreshold + @".",
                     (int option) => opt_TooManyLanguagesThreshold = option)
                .Add("d=",
                     @"indicates in which directory the language models are" + Environment.NewLine +
                     @"located (files ending in .lm). Currently only a single" + Environment.NewLine +
                     @"directory is supported. Default: """ + defaultLanguageModelsDirectory + @""".",
                     option => opt_LanguageModelsDirectory = option)
                .Add("f=",
                     @"Before sorting is performed the Ngrams which occur this number" + Environment.NewLine +
                     @"of times or less are removed. This can be used to speed up" + Environment.NewLine +
                     @"the program for longer inputs. For short inputs you should use" + Environment.NewLine +
                     @"-f 0." + Environment.NewLine +
                     @"Default: " + defaultOccuranceNumberThreshold + @".",
                     (int option) => opt_OccuranceNumberThreshold = option)
                .Add("i=",
                     @"only read first N lines",
                     (int option) => opt_OnlyReadFirstNLines = option)
                .Add("l=",
                     @"indicates that input is given as an argument on the command line," + Environment.NewLine +
                     @"e.g. text_cat -l ""this is english text""" + Environment.NewLine +
                     @"Cannot be used in combination with -n.",
                     option => opt_classifyFromArgument = option)
                .Add("t=",
                     @"indicates the topmost number of ngrams that should be used." + Environment.NewLine +
                     @"If used in combination with -n this determines the size of the" + Environment.NewLine +
                     @"output. If used with categorization this determines" + Environment.NewLine +
                     @"the number of ngrams that are compared with each of the language" + Environment.NewLine +
                     @"models (but each of those models is used completely)." + Environment.NewLine +
                     @"Default: " + defaultMaximumSizeOfDistribution + @".",
                     (int option) => opt_MaximumSizeOfDistribution = option)
                .Add("u=",
                     @"determines how much worse result must be in order not to be" + Environment.NewLine +
                     "mentioned as an alternative. Typical value: 1.05 or 1.1. " + Environment.NewLine +
                     "Default: " + defaultWorstAcceptableThreshold + @".",
                     (double option) => opt_WorstAcceptableThreshold = option)
                .Add("v",
                     @"verbose. Continuation messages are written to standard error.",
                     option => opt_verbose = option != null)
                .Add(NoPromptSwitch,
                     @"prevents text input prompt from being shown.",
                     option => opt_noPrompt = option != null);

            try
            {
                option_set.Parse(args);
            }
            catch (OptionException ex)
            {
                Console.WriteLine("Error occured: " + ex.ToString());
                ShowHelp(option_set);
                // Options are only partially applied at this point; do not go on to train or
                // classify (which could also block waiting for standard input).
                return;
            }

            if (opt_help)
            {
                ShowHelp(option_set);
                return;
            }

            if (opt_train)
            {
                // Training mode: build a model from a file or standard input and write it to standard output.
                LanguageModel<ulong> langaugeModel;
                Stream input;
                if (string.IsNullOrEmpty(opt_trainOnFile))
                {
                    if (!opt_noPrompt)
                    {
                        DisplayInputPrompt("Train from text input");
                    }
                    input = Console.OpenStandardInput();
                }
                else
                {
                    input = File.OpenRead(opt_trainOnFile);
                }
                using (input)
                {
                    IEnumerable<UInt64> tokens = new ByteToUInt64NGramExtractor(maxNgramLength, opt_OnlyReadFirstNLines).GetFeatures(input);
                    langaugeModel = new LanguageModel<UInt64>(
                        LanguageModelCreator.CreateLangaugeModel(tokens, opt_OccuranceNumberThreshold, opt_MaximumSizeOfDistribution),
                        new LanguageInfo(null, null, null, null) /*API should ask about language*/);
                }
                using (Stream standardOutput = Console.OpenStandardOutput())
                {
                    new ByteLanguageModelPersister().Save(langaugeModel, standardOutput);
                }
            }
            else
            {
                // Classification mode.
                var languageIdentifier = new LanguageIdentifier(opt_LanguageModelsDirectory, opt_MaximumSizeOfDistribution);
                var settings           = new LanguageIdentifier.LanguageIdentifierSettings(
                    opt_TooManyLanguagesThreshold, opt_OccuranceNumberThreshold, opt_OnlyReadFirstNLines,
                    opt_WorstAcceptableThreshold, maxNgramLength);
                if (opt_classifyFromArgument != null)
                {
                    // text supplied directly on the command line (-l)
                    var languages = languageIdentifier.ClassifyText(opt_classifyFromArgument, settings);
                    OutputIdentifiedLanguages(languages);
                }
                else if (opt_classifyFromInputPerLine)
                {
                    // each line of standard input is classified separately (-s)
                    if (!opt_noPrompt)
                    {
                        DisplayInputPrompt("Classify each line from text input");
                    }
                    using (Stream input = Console.OpenStandardInput())
                    {
                        // suboptimal read performance, but per-line mode is not intended to be used in heavy scenarios
                        // input bytes are split on CR (0xD) and LF (0xA)
                        foreach (IEnumerable<byte> line in Split<byte>(EnumerateAllBytes(input), true, 0xD, 0xA))
                        {
                            using (var linestream = new MemoryStream(line.ToArray()))
                            {
                                var languages = languageIdentifier.ClassifyBytes(linestream, null, settings);
                                OutputIdentifiedLanguages(languages);
                            }
                        }
                    }
                }
                else
                {
                    // the whole of standard input is classified as a single text
                    if (!opt_noPrompt)
                    {
                        DisplayInputPrompt("Classify text input");
                    }
                    using (Stream input = Console.OpenStandardInput())
                    {
                        var languages = languageIdentifier.ClassifyBytes(input, null, settings);
                        OutputIdentifiedLanguages(languages);
                    }
                }
            }
        }