Пример #1
0
        /// <summary>
        /// Returns possible languages of text contained in <paramref name="input"/>, or an empty sequence if too uncertain.
        /// </summary>
        /// <param name="input">stream holding the bytes of the text to classify; it is not closed or disposed by this method</param>
        /// <param name="encoding">encoding of text contained in <paramref name="input"/> or null if encoding is unknown beforehand.
        /// <para> When encoding is not null, for performance and quality reasons
        /// please make sure that <see cref="LanguageIdentifier"/> is created with
        /// languageModelsDirectory parameter of constructor pointing to models
        /// built from UTF8 encoded files (models from folder "Wikipedia-Experimental-UTF8Only")</para></param>
        /// <param name="settings">null for default settings</param>
        /// <returns>
        /// Candidates whose score (<c>Item2</c>) does not exceed the first candidate's score multiplied by
        /// <c>settings.WorstAcceptableThreshold</c>. An empty sequence is returned when the classifier yields no
        /// candidates, when no candidate passes that filter, or when more than
        /// <c>settings.TooManyLanguagesThreshold</c> candidates pass it.
        /// </returns>
        public IEnumerable <Tuple <LanguageInfo, double> > ClassifyBytes(Stream input, Encoding encoding = null, LanguageIdentifierSettings settings = null)
        {
            if (encoding != null && encoding != Encoding.UTF8)
            {
                // we can afford to not dispose TextReaderStream wrapper as it doesn't contain unmanaged resources
                // we do not own base stream passed so we cannot close it
                input = new TextReaderStream(new StreamReader(input, encoding), Encoding.UTF8); // decodes stream into UTF8 from any other encoding
                // todo: restrict to searching among UTF8 language models only
            }
            if (settings == null)
            {
                settings = new LanguageIdentifierSettings();
            }

            IEnumerable <UInt64> tokens =
                new ByteToUInt64NGramExtractor(settings.MaxNgramLength, settings.OnlyReadFirstNLines)
                .GetFeatures(input);
            var languageModel = LanguageModelCreator.CreateLangaugeModel(
                tokens, settings.OccuranceNumberThreshold, _maximumSizeOfDistribution);

            List <Tuple <LanguageInfo, double> > result = _classifier.Classify(languageModel).ToList();
            if (result.Count == 0)
            {
                // No candidates at all (presumably the classifier has no models loaded):
                // report "too uncertain" instead of letting First() throw InvalidOperationException.
                return(Enumerable.Empty <Tuple <LanguageInfo, double> >());
            }

            // NOTE(review): the first element is treated as the best match, i.e. the classifier
            // is assumed to return candidates ordered by ascending distance - confirm.
            double leastDistance = result[0].Item2;
            List <Tuple <LanguageInfo, double> > acceptableResults =
                result.Where(t => t.Item2 <= leastDistance * settings.WorstAcceptableThreshold).ToList();

            if (acceptableResults.Count == 0 || acceptableResults.Count > settings.TooManyLanguagesThreshold)
            {
                return(Enumerable.Empty <Tuple <LanguageInfo, double> >());
            }
            return(acceptableResults);
        }
Пример #2
0
        /// <summary>
        /// Convenience entry point: builds a <see cref="LanguageIdentifier"/> from the given arguments
        /// and returns the result of its instance <c>ClassifyText</c> for <paramref name="text"/>
        /// (possible languages of the text, or an empty sequence if too uncertain).
        /// </summary>
        /// <param name="text">text whose language should be identified</param>
        /// <param name="languageModelsDirectory">passed as the first argument to the <see cref="LanguageIdentifier"/> constructor</param>
        /// <param name="maximumSizeOfDistribution">passed as the second argument to the <see cref="LanguageIdentifier"/> constructor</param>
        /// <param name="settings">null for default settings</param>
        /// <returns>whatever the instance <c>ClassifyText</c> call returns</returns>
        public static IEnumerable <Tuple <LanguageInfo, double> > ClassifyText(
            string text,
            string languageModelsDirectory      = LanguageModelsDirectoryDefault,
            int maximumSizeOfDistribution       = MaximumSizeOfDistributionDefault,
            LanguageIdentifierSettings settings = null
            )
        {
            // A fresh identifier is created on every call; the classification itself is delegated to it.
            return new LanguageIdentifier(languageModelsDirectory, maximumSizeOfDistribution)
                   .ClassifyText(text, settings);
        }
Пример #3
0
        /// <summary>
        /// Convenience entry point: builds a <see cref="LanguageIdentifier"/> from the given arguments
        /// and returns the result of its instance <c>ClassifyBytes</c> for <paramref name="input"/>
        /// (possible languages of the contained text, or an empty sequence if too uncertain).
        /// </summary>
        /// <param name="input">stream containing the text to classify</param>
        /// <param name="encoding">encoding of text contained in stream or null if encoding is unknown beforehand.
        /// <para> When encoding is not null, for performance and quality reasons
        /// make sure that <paramref name="languageModelsDirectory"/> points to models
        /// built from UTF8 encoded files (Wikipedia-Experimental-UTF8Only)</para></param>
        /// <param name="languageModelsDirectory">passed as the first argument to the <see cref="LanguageIdentifier"/> constructor</param>
        /// <param name="maximumSizeOfDistribution">passed as the second argument to the <see cref="LanguageIdentifier"/> constructor</param>
        /// <param name="settings">null for default settings</param>
        /// <returns>whatever the instance <c>ClassifyBytes</c> call returns</returns>
        public static IEnumerable <Tuple <LanguageInfo, double> > ClassifyBytes(
            Stream input,
            Encoding encoding = null,
            string languageModelsDirectory      = LanguageModelsDirectoryDefault,
            int maximumSizeOfDistribution       = MaximumSizeOfDistributionDefault,
            LanguageIdentifierSettings settings = null
            )
        {
            // A fresh identifier is created on every call; the classification itself is delegated to it.
            LanguageIdentifier identifier = new LanguageIdentifier(languageModelsDirectory, maximumSizeOfDistribution);
            IEnumerable <Tuple <LanguageInfo, double> > candidates = identifier.ClassifyBytes(input, encoding, settings);

            return candidates;
        }
Пример #4
0
 /// <summary>
 /// Returns possible languages of the given text, or an empty sequence if too uncertain.
 /// The text is wrapped in a <c>TextReaderStream</c> constructed with <see cref="Encoding.UTF8"/>
 /// and handed to <c>ClassifyBytes</c> with <see cref="Encoding.UTF8"/> as the declared encoding.
 /// </summary>
 /// <param name="text">text whose language should be identified</param>
 /// <param name="settings">null for default settings</param>
 /// <returns>whatever the <c>ClassifyBytes</c> call returns</returns>
 public IEnumerable <Tuple <LanguageInfo, double> > ClassifyText(string text, LanguageIdentifierSettings settings = null)
 {
     Encoding utf8 = Encoding.UTF8;
     var textAsStream = new TextReaderStream(new StringReader(text), utf8);

     return ClassifyBytes(textAsStream, utf8, settings);
 }
Пример #5
0
 /// <summary>
 /// Returns possible languages of text encoded in <paramref name="input"/>, or an empty sequence if too uncertain.
 /// The bytes are exposed through a <see cref="MemoryStream"/> that lives only for the duration of the call,
 /// and classification is delegated to the stream-based <c>ClassifyBytes</c> overload.
 /// </summary>
 /// <param name="input">raw bytes of the text to classify</param>
 /// <param name="encoding">encoding of text contained in <paramref name="input"/> or null if encoding is unknown beforehand.
 /// <para> When encoding is not null, for performance and quality reasons
 /// make sure that <see cref="LanguageIdentifier"/> is created with
 /// languageModelsDirectory parameter of constructor pointing to models
 /// built from UTF8 encoded files (models from folder "Wikipedia-Experimental-UTF8Only")</para></param>
 /// <param name="settings">null for default settings</param>
 /// <returns>whatever the stream-based <c>ClassifyBytes</c> call returns</returns>
 public IEnumerable <Tuple <LanguageInfo, double> > ClassifyBytes(byte[] input, Encoding encoding = null, LanguageIdentifierSettings settings = null)
 {
     using (MemoryStream inMemory = new MemoryStream(input))
     {
         IEnumerable <Tuple <LanguageInfo, double> > candidates = ClassifyBytes(inMemory, encoding, settings);
         return candidates;
     }
 }