/// <summary>
/// Returns the possible languages of the text contained in <paramref name="input"/>,
/// or an empty sequence if the result is too uncertain.
/// </summary>
/// <param name="input">Stream containing the text to classify. This method does not close it.</param>
/// <param name="encoding">Encoding of the text contained in <paramref name="input"/>, or null if the encoding is unknown beforehand.
/// <para>When encoding is not null, for performance and quality reasons
/// please make sure that <see cref="LanguageIdentifier"/> is created with the
/// languageModelsDirectory parameter of the constructor pointing to models
/// built from UTF8 encoded files (models from folder "Wikipedia-Experimental-UTF8Only").</para></param>
/// <param name="settings">null for default settings</param>
/// <returns>
/// Candidate languages paired with their distance (second tuple item; the first classifier result
/// is taken as the least distance). Only candidates whose distance is within
/// least distance * <c>settings.WorstAcceptableThreshold</c> are kept. The sequence is empty when
/// the classifier yields nothing, when no candidate is acceptable, or when more than
/// <c>settings.TooManyLanguagesThreshold</c> candidates are acceptable.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="input"/> is null.</exception>
public IEnumerable<Tuple<LanguageInfo, double>> ClassifyBytes(Stream input, Encoding encoding = null, LanguageIdentifierSettings settings = null)
{
    if (input == null)
    {
        throw new ArgumentNullException("input");
    }

    if (encoding != null && encoding != Encoding.UTF8)
    {
        // we can afford to not dispose TextReaderStream wrapper as it doesn't contain unmanaged resources
        // we do not own base stream passed so we cannot close it
        input = new TextReaderStream(new StreamReader(input, encoding), Encoding.UTF8); // decodes stream into UTF8 from any other encoding
        // todo: restrict to searching among UTF8 language models only
    }

    if (settings == null)
    {
        settings = new LanguageIdentifierSettings();
    }

    IEnumerable<UInt64> tokens =
        new ByteToUInt64NGramExtractor(settings.MaxNgramLength, settings.OnlyReadFirstNLines)
            .GetFeatures(input);
    var languageModel = LanguageModelCreator.CreateLangaugeModel(
        tokens, settings.OccuranceNumberThreshold, _maximumSizeOfDistribution);

    List<Tuple<LanguageInfo, double>> result = _classifier.Classify(languageModel).ToList();
    if (result.Count == 0)
    {
        // Nothing to compare against (e.g. no language models available):
        // report "too uncertain" instead of letting First() throw.
        return Enumerable.Empty<Tuple<LanguageInfo, double>>();
    }

    // The first classifier result is treated as the best (smallest distance) match.
    double leastDistance = result[0].Item2;
    List<Tuple<LanguageInfo, double>> acceptableResults =
        result.Where(t => t.Item2 <= leastDistance * settings.WorstAcceptableThreshold).ToList();

    if (acceptableResults.Count == 0 || acceptableResults.Count > settings.TooManyLanguagesThreshold)
    {
        return Enumerable.Empty<Tuple<LanguageInfo, double>>();
    }

    return acceptableResults;
}
/// <summary>
/// Convenience entry point: builds a <see cref="LanguageIdentifier"/> for this single call and
/// returns the possible languages of <paramref name="text"/>, or an empty sequence if too uncertain.
/// </summary>
/// <param name="text">text whose language should be identified</param>
/// <param name="languageModelsDirectory">forwarded to the <see cref="LanguageIdentifier"/> constructor</param>
/// <param name="maximumSizeOfDistribution">forwarded to the <see cref="LanguageIdentifier"/> constructor</param>
/// <param name="settings">null for default settings</param>
/// <returns>the result of the instance <c>ClassifyText</c> call on the newly created identifier</returns>
public static IEnumerable<Tuple<LanguageInfo, double>> ClassifyText(
    string text,
    string languageModelsDirectory = LanguageModelsDirectoryDefault,
    int maximumSizeOfDistribution = MaximumSizeOfDistributionDefault,
    LanguageIdentifierSettings settings = null)
{
    // A fresh identifier is constructed on every call; the work is delegated to its instance method.
    return new LanguageIdentifier(languageModelsDirectory, maximumSizeOfDistribution)
        .ClassifyText(text, settings);
}
/// <summary>
/// Convenience entry point: builds a <see cref="LanguageIdentifier"/> for this single call and
/// returns the possible languages of the text contained in <paramref name="input"/>,
/// or an empty sequence if too uncertain.
/// </summary>
/// <param name="input">stream containing the text to classify</param>
/// <param name="encoding">Encoding of the text contained in the stream, or null if the encoding is unknown beforehand.
/// <para>When encoding is not null, for performance and quality reasons
/// make sure that <paramref name="languageModelsDirectory"/> points to models
/// built from UTF8 encoded files (Wikipedia-Experimental-UTF8Only).</para></param>
/// <param name="languageModelsDirectory">forwarded to the <see cref="LanguageIdentifier"/> constructor</param>
/// <param name="maximumSizeOfDistribution">forwarded to the <see cref="LanguageIdentifier"/> constructor</param>
/// <param name="settings">null for default settings</param>
/// <returns>the result of the instance <c>ClassifyBytes</c> call on the newly created identifier</returns>
public static IEnumerable<Tuple<LanguageInfo, double>> ClassifyBytes(
    Stream input,
    Encoding encoding = null,
    string languageModelsDirectory = LanguageModelsDirectoryDefault,
    int maximumSizeOfDistribution = MaximumSizeOfDistributionDefault,
    LanguageIdentifierSettings settings = null)
{
    // A fresh identifier is constructed on every call; the work is delegated to its instance method.
    LanguageIdentifier identifier = new LanguageIdentifier(languageModelsDirectory, maximumSizeOfDistribution);
    IEnumerable<Tuple<LanguageInfo, double>> candidates = identifier.ClassifyBytes(input, encoding, settings);
    return candidates;
}
/// <summary>
/// Returns the possible languages of <paramref name="text"/>, or an empty sequence if too uncertain.
/// The text is exposed as a UTF8 byte stream and handed to the stream-based <c>ClassifyBytes</c>.
/// </summary>
/// <param name="text">text whose language should be identified</param>
/// <param name="settings">null for default settings</param>
/// <returns>the result of classifying the UTF8 bytes of <paramref name="text"/></returns>
public IEnumerable<Tuple<LanguageInfo, double>> ClassifyText(string text, LanguageIdentifierSettings settings = null)
{
    // Wrap the string in a reader, then expose that reader as a UTF8 byte stream.
    var textReader = new StringReader(text);
    var utf8Stream = new TextReaderStream(textReader, Encoding.UTF8);

    // The stream is declared as UTF8 to match the encoding the wrapper was created with.
    return ClassifyBytes(utf8Stream, Encoding.UTF8, settings);
}
/// <summary>
/// Returns the possible languages of the text encoded in <paramref name="input"/>,
/// or an empty sequence if too uncertain.
/// </summary>
/// <param name="input">bytes of the text to classify</param>
/// <param name="encoding">Encoding of the text contained in <paramref name="input"/>, or null if the encoding is unknown beforehand.
/// <para>When encoding is not null, for performance and quality reasons
/// make sure that <see cref="LanguageIdentifier"/> is created with the
/// languageModelsDirectory parameter of the constructor pointing to models
/// built from UTF8 encoded files (models from folder "Wikipedia-Experimental-UTF8Only").</para></param>
/// <param name="settings">null for default settings</param>
/// <returns>the result of the stream-based <c>ClassifyBytes</c> run over an in-memory stream of <paramref name="input"/></returns>
public IEnumerable<Tuple<LanguageInfo, double>> ClassifyBytes(byte[] input, Encoding encoding = null, LanguageIdentifierSettings settings = null)
{
    IEnumerable<Tuple<LanguageInfo, double>> candidates;

    // The memory stream is created here, so it is disposed here once classification has returned.
    using (MemoryStream buffer = new MemoryStream(input))
    {
        candidates = ClassifyBytes(buffer, encoding, settings);
    }

    return candidates;
}