Exemplo n.º 1
0
        /// <summary>
        /// Reads the bot credentials, loads the language profile and the dictionaries found in
        /// the "Dictionaries" folder, registers the command module and message handler, then
        /// logs in, starts the client and waits indefinitely.
        /// </summary>
        /// <exception cref="NullReferenceException">
        /// The credentials file deserialized to nothing or holds no bot token.
        /// </exception>
        private async Task MainAsync()
        {
            var credentials = JsonConvert.DeserializeObject <Credentials>(File.ReadAllText("Keys/Credentials.json"));

            // The deserializer can hand back null (e.g. a document containing only "null"), so
            // the object is guarded together with the token. The exception type is kept as-is
            // so existing callers observe the same failure.
            if (credentials?.BotToken == null)
            {
                throw new NullReferenceException("Invalid Credentials file");
            }

            var factory = new RankedLanguageIdentifierFactory();

            _identifier = factory.Load("Keys/Core14.profile.xml");

            foreach (var file in Directory.GetFiles("Dictionaries"))
            {
                // The name is read as "<key>.<kind>...". A name without a dot has no second
                // segment; it is skipped instead of raising IndexOutOfRangeException.
                string[] nameParts = Path.GetFileName(file).Split('.');
                if (nameParts.Length > 1 && nameParts[1] == "dic")
                {
                    _dictionaries.Add(nameParts[0], WordList.CreateFromFiles(file));
                }
            }

            await _commands.AddModuleAsync <Communication>(null);

            Client.MessageReceived += HandleCommandAsync;

            StartTime = DateTime.Now;
            await Client.LoginAsync(TokenType.Bot, credentials.BotToken);

            await Client.StartAsync();

            // Never completes: keeps the method (and the process) alive while the client runs.
            await Task.Delay(-1);
        }
Exemplo n.º 2
0
        /// <summary>
        /// Identifies the language of <paramref name="str"/> and maps it to a
        /// <see cref="ContentLanguageSelection"/> value.
        /// </summary>
        /// <param name="str">Text to classify.</param>
        /// <param name="def">Value returned when nothing is identified or the code is neither "jpn" nor "eng".</param>
        /// <returns>Japanese for "jpn", English for "eng" (case-insensitive), otherwise <paramref name="def"/>.</returns>
        public ContentLanguageSelection Detect(string str, ContentLanguageSelection def = ContentLanguageSelection.Unspecified)
        {
            var bestMatch = new RankedLanguageIdentifierFactory()
                            .Load(LanguageFilePath)
                            .Identify(str)
                            .FirstOrDefault();

            if (bestMatch == null)
            {
                return def;
            }

            const StringComparison ignoreCase = StringComparison.InvariantCultureIgnoreCase;
            var code = bestMatch.Item1.Iso639_2T;

            return string.Equals(code, "jpn", ignoreCase) ? ContentLanguageSelection.Japanese
                 : string.Equals(code, "eng", ignoreCase) ? ContentLanguageSelection.English
                 : def;
        }
 /// <summary>
 /// Identifies the language of the news content and stores the ISO 639-3 code of the
 /// first match under the "Language" data key.
 /// </summary>
 /// <param name="news">News item whose content is classified and whose data is updated.</param>
 public void Apply(INews news)
 {
     var ranked = new RankedLanguageIdentifierFactory()
                  .Load(RecognitionFile)
                  .Identify(news.Content);

     // First() throws when the identifier yields no candidates.
     news.Data.Set("Language", ranked.First().Item1.Iso639_3);
 }
        //https://www.codeproject.com/Articles/43198/Detect-a-written-text-s-language
        //https://ivanakcheurov.github.io/ntextcat/
        //https://github.com/ivanakcheurov/ntextcat
        /// <summary>
        /// Identifies the language of <paramref name="InputText"/> using the NTextCat profile
        /// at <paramref name="ConfigFile"/>.
        /// </summary>
        /// <param name="InputText">Text to classify.</param>
        /// <param name="ConfigFile">Absolute or relative path of the language profile (e.g. Core14.profile.xml); backslashes are replaced by forward slashes.</param>
        /// <returns>The ISO 639-3 code of the first match (e.g. "eng"), or a fixed message when nothing is identified.</returns>
        public static string RunDetectLanguage(string InputText, String ConfigFile)
        {
            // A language profile must be deployed with the application; see the "content"
            // folder of the NTextCat nupkg or
            // https://github.com/ivanakcheurov/ntextcat/tree/master/src/LanguageModels.
            var loader = new RankedLanguageIdentifierFactory();

            // Mind the path-length limits of the platform (260 chars on Windows, 4096 on Linux).
            string profilePath = ConfigFile.Replace("\\", "/");
            var best = loader.Load(profilePath).Identify(InputText).FirstOrDefault();

            if (best == null)
            {
                return "The language couldn’t be identified with an acceptable degree of certainty";
            }

            return best.Item1.Iso639_3;
        }
Exemplo n.º 5
0
        /// <summary>
        /// Best-effort language detection: returns the system name for the language of
        /// <paramref name="text"/>, or an empty string when detection is unavailable.
        /// </summary>
        /// <param name="text">Text to classify.</param>
        /// <returns>
        /// The converted language name of the first match; <see cref="string.Empty"/> when
        /// nothing is identified, when an exception occurs, or after an earlier failure.
        /// </returns>
        public string TryDetectLanguague(string text)
        {
            // A previous failure disables detection for good.
            if (_LanguageIdentificationFailed)
            {
                return string.Empty;
            }

            string systemName = string.Empty;

            try
            {
                // Lazily create the factory and load the models on first use.
                bool needsLoad = _NTextCatFactory == null || _NTextCatIdentifier == null;
                if (needsLoad)
                {
                    _NTextCatFactory    = new RankedLanguageIdentifierFactory();
                    _NTextCatIdentifier = _NTextCatFactory.Load(_NTextCatLanguageModelsPath);
                }

                var best = _NTextCatIdentifier.Identify(text).FirstOrDefault();
                if (best != null)
                {
                    systemName = ConvertISOLangugueNameToSystemName(best.Item1.Iso639_3);
                }
            }
            catch (Exception ex)
            {
                // Any error is logged and remembered so later calls return immediately.
                _LanguageIdentificationFailed = true;
                _Logger?.WriteLog(ex.ToString());
            }

            return systemName;
        }
Exemplo n.º 6
0
        /// <summary>
        /// For every permutation returned by Permute(ALPHABET, 8), substitutes the digit
        /// placeholders in SEQUENCE with the permutation's characters and runs language
        /// detection on the joined, lower-cased result.
        /// </summary>
        static void Main(string[] args)
        {
            Console.WriteLine("Program started at {0}", DateTime.Now.ToString());
            var identifier = new RankedLanguageIdentifierFactory().Load("Core14.profile.xml");

            int processed = 0;

            foreach (var permutation in Permute(ALPHABET, 8))
            {
                processed++;

                // Work on a copy so SEQUENCE itself is never modified.
                string[] decoded = (string[])SEQUENCE.Clone();
                int      position = 0;

                foreach (var letter in permutation)
                {
                    // The n-th character of the permutation replaces the digit n.
                    char placeholder = position.ToString()[0];
                    for (int k = 0; k < decoded.Length; k++)
                    {
                        decoded[k] = decoded[k].Replace(placeholder, letter);
                    }
                    position++;
                }

                DetectLanguage(identifier, string.Concat(decoded).ToLower(), processed);
            }

            Console.WriteLine("Finished processing at {0}", DateTime.Now.ToString());
            Console.ReadKey();
        }
Exemplo n.º 7
0
        /// <summary>
        /// Interactive console loop: prompts for text, prints the ISO 639-3 code of the
        /// identified language, then waits for Enter before prompting again. The loop ends
        /// when standard input is closed.
        /// </summary>
        static void Main(string[] args)
        {
            // Initialise the language identification library.
            var factory     = new RankedLanguageIdentifierFactory();
            var identitfier = factory.Load("NTextCat.0.2.1.30\\Core14.profile.xml");

            while (true)
            {
                Console.WriteLine("enter your text :");
                var read = Console.ReadLine();

                // ReadLine returns null once standard input is closed; leave instead of
                // looping forever on end-of-input.
                if (read == null)
                {
                    break;
                }

                // Identify the language of the entered text.
                var mostCertainLanguage = identitfier.Identify(read).FirstOrDefault();

                // The verdict is printed first and the pause comes afterwards, so the user no
                // longer has to enter an extra line before seeing the result.
                if (mostCertainLanguage != null)
                {
                    Console.WriteLine("The language of the text is '{0}' \n", mostCertainLanguage.Item1.Iso639_3);
                }
                else
                {
                    Console.WriteLine("the text coudnt be identified");
                }

                Console.ReadLine();
            }
        }
 /// <summary>
 /// Creates the Naive Bayes and ranked identifier factories and loads both identifiers
 /// from the selected model file inside the model folder.
 /// </summary>
 public void Initialize()
 {
     NBFactory = new NaiveBayesLanguageIdentifierFactory();
     RLFactory = new RankedLanguageIdentifierFactory();

     // Both identifiers are built from the same model file.
     string modelPath = Path.Combine(ModelFolder, SelectedModel);

     NBIdentifier = NBFactory.Load(modelPath);
     RLIdentifier = RLFactory.Load(modelPath);
 }
Exemplo n.º 9
0
        /// <summary>
        /// Tags <paramref name="news"/> with the ISO 639-3 code of the language identified
        /// for its content, stored under the "Language" data key.
        /// </summary>
        /// <param name="news">News item to classify and update.</param>
        public void Apply(INews news)
        {
            var loader     = new RankedLanguageIdentifierFactory();
            var candidates = loader.Load(RecognitionFile).Identify(news.Content);

            // First() throws when there are no candidates at all.
            news.Data.Set("Language", candidates.First().Item1.Iso639_3);
        }
Exemplo n.º 10
0
        /// <summary>
        /// Loads the shared identifier from "Core14.profile.xml" located in the application
        /// base directory.
        /// </summary>
        static LanguageDetector()
        {
            var loader = new RankedLanguageIdentifierFactory();

            _identifier = loader.Load(
                Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Core14.profile.xml"));
        }
        /// <summary>
        /// Creates the label dictionary and loads the ranked language identifier from
        /// "Core14.profile.xml" (relative path).
        /// </summary>
        public NoticeParserFactory()
        {
            _tedLabelDictionary       = new TedLabelDictionary();
            _rankedLanguageIdentifier = new RankedLanguageIdentifierFactory().Load("Core14.profile.xml");
        }
Exemplo n.º 12
0
        /// <summary>
        /// Loads <c>Identifier</c> from the Core14 profile under
        /// <c>.\Application\Configuration</c> (resolved against the current directory).
        /// </summary>
        public static void Initialize()
        {
            var file = new FileInfo(@".\Application\Configuration\Core14.profile.xml");

            // The profile is only read. File.Open(path, FileMode.Open) would request
            // read/write access with no sharing, which fails on read-only files and while
            // another reader holds the file; OpenRead asks for shared read access only.
            using (var readStream = File.OpenRead(file.FullName))
            {
                var factory = new RankedLanguageIdentifierFactory();
                Identifier = factory.Load(readStream);
            }
        }
Exemplo n.º 13
0
        /// <summary>
        /// Loads the language identifier from the Core14 profile text held in the project
        /// resources, fed to the factory as a UTF-8 in-memory stream.
        /// </summary>
        public LanguageDetector()
        {
            byte[] profileBytes = Encoding.UTF8.GetBytes(Resources.Core14_profile);
            var    loader       = new RankedLanguageIdentifierFactory();

            using (var profileStream = new MemoryStream(profileBytes))
            {
                _languageIdentifier = loader.Load(profileStream);
            }
        }
        /// <summary>
        /// Checks that an identifier loaded from the profile file ranks Russian ("rus") first
        /// for both a short and a long Russian sample.
        /// </summary>
        public void TestRankedLanguageIdentifierFactory()
        {
            var identifier = new RankedLanguageIdentifierFactory().Load(_identifierFile);

            // Short sample.
            var shortMatches = identifier.Identify("был зачитан вслух");
            Assert.That(shortMatches.First().Item1.Iso639_2T, Is.EqualTo("rus"));

            // Long sample.
            var longMatches = identifier.Identify("Главная задача сэмпла - предоставить желающим качать возможность оценить реальное качество материала без скачивания всей раздачи целиком. Поэтому вырезать сэмпл надо из середины фильма и без каких либо искажений. Достаточно фрагмента на 1-2 минуты. Заливать сэмпл следует только на файлообменники");
            Assert.That(longMatches.First().Item1.Iso639_2T, Is.EqualTo("rus"));
        }
        /// <summary>
        /// Trains a ranked identifier on the standard text of every test case, then checks
        /// that each case's query is identified as that case's language.
        /// </summary>
        public void TestTrainIdentifyCycle_Ranked()
        {
            TestCase[] cases   = PrepareTestCases();
            var        trainer = new RankedLanguageIdentifierFactory();

            // One (language, reader) pair per test case; only the ISO 639-2T code is supplied.
            var trainingSet = cases.Select(
                c => Tuple.Create(new LanguageInfo(c.ISO639_2T, null, null, null), (TextReader) new StringReader(c.Standard)));
            var identifier = trainer.Train(trainingSet);

            foreach (var current in cases)
            {
                var topCode = identifier.Identify(current.Query).First().Item1.Iso639_2T;
                Assert.That(topCode, Is.EqualTo(current.ISO639_2T));
            }
        }
		/// <summary>
		/// Stores the degree of parallelism and loads the language identifier from the
		/// Core14 profile embedded in the executing assembly.
		/// </summary>
		/// <param name="maxDegreeOfParallelism">Value assigned to <c>MaxDegreeOfParallelism</c>.</param>
		/// <exception cref="FileNotFoundException">The embedded profile resource is missing.</exception>
		public GoogleLanguageDetection(int maxDegreeOfParallelism)
		{
			MaxDegreeOfParallelism = maxDegreeOfParallelism;

			Assembly assembly = Assembly.GetExecutingAssembly();
			string resourceName = "NCrawler.LanguageDetection.Google.Core14.profile.xml";
			using (Stream stream = assembly.GetManifestResourceStream(resourceName))
			{
				// GetManifestResourceStream returns null when the resource does not exist;
				// fail with the resource name instead of handing null to the loader.
				if (stream == null)
				{
					throw new FileNotFoundException(
						"Embedded resource '" + resourceName + "' was not found in assembly '" + assembly.FullName + "'.");
				}

				RankedLanguageIdentifierFactory factory = new RankedLanguageIdentifierFactory();
				_identifier = factory.Load(stream);
			}
		}
        /// <summary>
        /// Stores the degree of parallelism and loads the language identifier from the
        /// Core14 profile embedded in the executing assembly.
        /// </summary>
        /// <param name="maxDegreeOfParallelism">Value assigned to <c>MaxDegreeOfParallelism</c>.</param>
        /// <exception cref="FileNotFoundException">The embedded profile resource is missing.</exception>
        public GoogleLanguageDetection(int maxDegreeOfParallelism)
        {
            MaxDegreeOfParallelism = maxDegreeOfParallelism;

            Assembly assembly     = Assembly.GetExecutingAssembly();
            string   resourceName = "NCrawler.LanguageDetection.Google.Core14.profile.xml";

            using (Stream stream = assembly.GetManifestResourceStream(resourceName))
            {
                // GetManifestResourceStream returns null when the resource does not exist;
                // fail with the resource name instead of handing null to the loader.
                if (stream == null)
                {
                    throw new FileNotFoundException(
                        "Embedded resource '" + resourceName + "' was not found in assembly '" + assembly.FullName + "'.");
                }

                RankedLanguageIdentifierFactory factory = new RankedLanguageIdentifierFactory();
                _identifier = factory.Load(stream);
            }
        }
Exemplo n.º 18
0
        /// <summary>
        /// Runs the confusion evaluation for the Naive Bayes identifier and then for the
        /// ranked identifier, each loaded with only the most common languages.
        /// </summary>
        public void Evaluate()
        {
            // Naive Bayes identifier, restricted by ISO 639-2T code.
            var naiveIdentifier = new NaiveBayesLanguageIdentifierFactory()
                .Load(_identifierFile, lm => _mostCommonLanguages.Contains(lm.Language.Iso639_2T));
            GetConfusions(naiveIdentifier.Identify, "Naive", _mostCommonLanguages);

            // Ranked identifier, same restriction.
            var rankedIdentifier = new RankedLanguageIdentifierFactory()
                .Load(_identifierFile, lm => _mostCommonLanguages.Contains(lm.Language.Iso639_2T));
            GetConfusions(rankedIdentifier.Identify, "Ranked", _mostCommonLanguages);
        }
Exemplo n.º 19
0
        /// <summary>
        /// Smoke test: loads the Naive Bayes and ranked identifiers restricted to the most
        /// common languages and fully enumerates the result for a short English phrase.
        /// Nothing is asserted; the check is that no exception is thrown.
        /// </summary>
        public void SanityCheck()
        {
            // Languages are matched by ISO 639-2T code, falling back to ISO 639-3 when it is null.
            var naiveIdentifier = new NaiveBayesLanguageIdentifierFactory()
                .Load(_identifierFile, lm => _mostCommonLanguages.Contains(lm.Language.Iso639_2T ?? lm.Language.Iso639_3));
            var naiveResult = naiveIdentifier.Identify("you got me").ToArray();

            var rankedIdentifier = new RankedLanguageIdentifierFactory()
                .Load(_identifierFile, lm => _mostCommonLanguages.Contains(lm.Language.Iso639_2T ?? lm.Language.Iso639_3));
            var rankedResult = rankedIdentifier.Identify("you got me").ToArray();
        }
Exemplo n.º 20
0
    /// <summary>
    /// Builds a ranked identifier factory from <paramref name="settings"/> and loads the
    /// identifier from the embedded profile the settings point to.
    /// </summary>
    /// <param name="settings">Factory parameters and the embedded profile path.</param>
    /// <returns>The identifier loaded from the embedded profile stream.</returns>
    private static RankedLanguageIdentifier LoadLanguageIdentifier(IdentifierSettings settings)
    {
        var loader = new RankedLanguageIdentifierFactory(
            settings.MaxNGramLength,
            settings.MaximumSizeOfDistribution,
            settings.OccuranceNumberThreshold,
            settings.OnlyReadFirstNLines,
            false);

        // The stream is disposed as soon as the identifier has been loaded.
        using (var profileStream = OpenInternalFile(settings.EmbeddedProfilePath))
        {
            return loader.Load(profileStream);
        }
    }
Exemplo n.º 21
0
        /// <summary>
        /// Evaluates confusions first with the Naive Bayes identifier, then with the ranked
        /// identifier; both are loaded with only the most common languages.
        /// </summary>
        public void Evaluate()
        {
            var naiveFactory = new NaiveBayesLanguageIdentifierFactory();
            var naive        = naiveFactory.Load(
                _identifierFile,
                model => _mostCommonLanguages.Contains(model.Language.Iso639_2T));
            GetConfusions(naive.Identify, "Naive", _mostCommonLanguages);

            var rankedFactory = new RankedLanguageIdentifierFactory();
            var ranked        = rankedFactory.Load(
                _identifierFile,
                model => _mostCommonLanguages.Contains(model.Language.Iso639_2T));
            GetConfusions(ranked.Identify, "Ranked", _mostCommonLanguages);
        }
Exemplo n.º 22
0
        /// <summary>
        /// Smoke test for both identifier kinds: each is loaded with the most common
        /// languages and asked to identify a short English phrase. The results are
        /// materialized but not asserted.
        /// </summary>
        public void SanityCheck()
        {
            var naiveFactory = new NaiveBayesLanguageIdentifierFactory();
            var naive        = naiveFactory.Load(
                _identifierFile,
                model => _mostCommonLanguages.Contains(model.Language.Iso639_2T ?? model.Language.Iso639_3));
            var naiveMatches = naive.Identify("you got me").ToArray();

            var rankedFactory = new RankedLanguageIdentifierFactory();
            var ranked        = rankedFactory.Load(
                _identifierFile,
                model => _mostCommonLanguages.Contains(model.Language.Iso639_2T ?? model.Language.Iso639_3));
            var rankedMatches = ranked.Identify("you got me").ToArray();
        }
Exemplo n.º 23
0
        /// <summary>
        /// Returns the ISO 639-3 code for <paramref name="text"/>: "eng" or "jpn" when the
        /// character checks match, otherwise the first NTextCat match.
        /// </summary>
        /// <param name="text">Text to classify.</param>
        /// <returns>
        /// The language code, or <see cref="string.Empty"/> when the profile file is missing
        /// or nothing is identified.
        /// </returns>
        string getLanguage(string text)
        {
            // Cheap character-based checks come first.
            if (isEnglishCharacters(text))
            {
                return "eng";
            }
            if (isJapaneseCharacters(text))
            {
                return "jpn";
            }

            // The identifier is loaded once, from the profile next to the executable.
            if (ncIdentifier_ == null)
            {
                var profilePath = Path.Combine(Path.GetDirectoryName(Application.ExecutablePath),
                                               @"Core14.profile.xml");
                if (!File.Exists(profilePath))
                {
                    MessageBox.Show("Profile file of NTextCat not found.");
                    return string.Empty;
                }

                ncIdentifier_ = new RankedLanguageIdentifierFactory().Load(profilePath);
            }

            var best = ncIdentifier_.Identify(text).FirstOrDefault();

            return best == null ? string.Empty : best.Item1.Iso639_3;
        }
Exemplo n.º 24
0
        // Detects the language of every file in ResultsFolder and writes one stemmed word per
        // line to a file with the same name under StemmedFolder.
        private static void Main()
        {
            var stopwatch = new Stopwatch();

            stopwatch.Start();
            // Make sure the output folder exists before anything is written to it.
            if (!Directory.Exists(StemmedFolder))
            {
                Directory.CreateDirectory(StemmedFolder);
            }

            var factory    = new RankedLanguageIdentifierFactory();
            var identifier = factory.Load(@"..\..\..\..\Core14.profile.xml");

            // Files are processed in parallel; the one identifier instance is used by every iteration.
            Parallel.ForEach(Directory.GetFiles(ResultsFolder), (file, _, _) =>
            {
                Console.WriteLine($"Processing {file}");
                var text              = File.ReadAllText(file);
                // The Splitters characters, joined into one string, are used as the regex pattern.
                // NOTE(review): confirm the joined characters form the intended pattern.
                var text1             = Regex.Replace(text, new string(Splitters), " ");
                var sb                = new StringBuilder(text1);
                // ReplaceAll is a helper defined elsewhere; it receives each splitter as a string.
                sb                    = sb.ReplaceAll(Splitters.Select(x => x.ToString()));
                var wordsByWhiteSpace = sb.ToString();

                // Detection runs on the text from character 100 onward.
                // NOTE(review): Substring(100) throws ArgumentOutOfRangeException when the text
                // is shorter than 100 characters.
                var languages           = identifier.Identify(wordsByWhiteSpace.Substring(100));
                var mostCertainLanguage = languages.FirstOrDefault();
                // Null when no language was identified.
                var langCode            = mostCertainLanguage?.Item1.Iso639_3;
                Console.WriteLine($"Lang code: {langCode}");
                // Only English and Russian have a stemmer; any other code, including null,
                // ends in NotSupportedException.
                IStemmer stemmer = langCode switch
                {
                    "eng" => new EnglishStemmer(),
                    "rus" => new RussianStemmer(),
                    _ => throw new NotSupportedException()
                };
                // Same path with ResultsFolder replaced by StemmedFolder; an existing output
                // file is deleted first so the run starts from an empty file.
                var stemmedFile = file.Replace(ResultsFolder, StemmedFolder);
                if (File.Exists(stemmedFile))
                {
                    File.Delete(stemmedFile);
                }

                using var stemmedFileWriter = File.AppendText(stemmedFile);
                // One stemmed word per line; blank tokens produced by the split are skipped.
                foreach (var word in wordsByWhiteSpace.Split(" ").Where(x => !string.IsNullOrWhiteSpace(x)))
                {
                    var stemmed = stemmer.Stem(word);
                    stemmedFileWriter.WriteLine(stemmed);
                    // Console.WriteLine($"{word} -> {stemmed}");
                }
            });
Exemplo n.º 25
0
        /// <summary>
        /// Identifies the language of <paramref name="inputText"/> with the Core14 profile.
        /// </summary>
        /// <param name="inputText">Text to classify.</param>
        /// <returns>The ISO 639-3 code of the first match, or <see cref="String.Empty"/> when there is none.</returns>
        public string DetectLanguage(string inputText)
        {
            var identifier = new RankedLanguageIdentifierFactory().Load(LanguageIdentifier.GetCore14());
            var best       = identifier.Identify(inputText).FirstOrDefault();

            if (best == null)
            {
                return String.Empty;
            }

            return best.Item1.Iso639_3;
        }
        /// <summary>
        /// Identifies the language of every <c>m_threadsCount</c>-th text starting at index
        /// <paramref name="a"/> and credits the word count to that language.
        /// </summary>
        /// <param name="a">Index of the first text handled by this call.</param>
        public void DetectNcat(int a)
        {
            var identifier = new RankedLanguageIdentifierFactory().Load("Core14.profile.xml");

            int index = a;
            while (index < m_text.Count)
            {
                string sample   = m_text[index];
                string langCode = identifier.Identify(sample).First().Item1.Iso639_2T;

                AddPointToLang(langCode, GetCountOfWords(sample, langCode));

                // Step by the stride so calls started at different offsets cover different texts.
                index += m_threadsCount;
            }
        }
Exemplo n.º 27
0
        /// <summary>
        /// Identifies the language of <paramref name="sentence"/>.
        /// </summary>
        /// <param name="sentence">Text to classify.</param>
        /// <returns>The ISO 639-3 code of the first match, or a fixed "System error" message when there is none.</returns>
        public static string IdentityLanguage(string sentence)
        {
            // NOTE(review): the profile is loaded from an absolute path inside one user's
            // profile directory, so this only works on a machine that has that path.
            var identifier = new RankedLanguageIdentifierFactory()
                .Load(@"C:\Users\Andrew Romanuk\source\Projects\SpilnaSpravaTask2\SpilnaSpravaTask2\LanguageModels\Core14.profile.xml");

            var best = identifier.Identify(sentence).FirstOrDefault();

            return best != null
                ? best.Item1.Iso639_3
                : "System error: The language couldn’t be identified with an acceptable degree of certainty";
        }
Exemplo n.º 28
0
        /// <summary>
        /// Identifies the language of <paramref name="input"/> and returns a two-letter
        /// language code.
        /// </summary>
        /// <param name="input">Text to classify.</param>
        /// <returns>
        /// The two-letter code of the first match, or "en" when nothing usable is identified.
        /// </returns>
        private string GetLanguage(string input)
        {
            var factory             = new RankedLanguageIdentifierFactory();
            var identifier          = factory.Load(configPath + "Core14.profile.xml");
            var mostCertainLanguage = identifier.Identify(input).FirstOrDefault();

            if (mostCertainLanguage == null)
            {
                return "en";
            }

            string iso3 = mostCertainLanguage.Item1.Iso639_3;

            // A missing or too-short code is treated like "not identified" instead of letting
            // Substring throw.
            if (string.IsNullOrEmpty(iso3) || iso3.Length < 2)
            {
                return "en";
            }

            // Cutting an ISO 639-3 code to two letters is not a valid conversion in general:
            // these codes have a two-letter form that differs from their first two letters.
            // NOTE(review): assumes callers expect ISO 639-1 codes, as the "en" default suggests.
            switch (iso3)
            {
                case "jpn": return "ja";
                case "por": return "pt";
                case "spa": return "es";
                case "swe": return "sv";
            }

            // Every other code keeps the previous behaviour (first two letters), which is
            // correct for dan, deu, eng, fra, ita, kor, nld, nor, rus and zho.
            return iso3.Substring(0, 2);
        }
		/// <summary>
		/// Maps the identified language of <paramref name="str"/> to a
		/// <see cref="ContentLanguageSelection"/> value.
		/// </summary>
		/// <param name="str">Text to classify.</param>
		/// <param name="def">Fallback for unidentified text and for languages other than Japanese or English.</param>
		/// <returns>Japanese for "jpn", English for "eng" (case-insensitive), otherwise <paramref name="def"/>.</returns>
		public ContentLanguageSelection Detect(string str, ContentLanguageSelection def = ContentLanguageSelection.Unspecified) {

			var loader = new RankedLanguageIdentifierFactory();
			var topMatch = loader.Load(LanguageFilePath).Identify(str).FirstOrDefault();

			if (topMatch != null) {
				var code = topMatch.Item1.Iso639_2T;

				if (string.Equals(code, "jpn", StringComparison.InvariantCultureIgnoreCase)) {
					return ContentLanguageSelection.Japanese;
				}

				if (string.Equals(code, "eng", StringComparison.InvariantCultureIgnoreCase)) {
					return ContentLanguageSelection.English;
				}
			}

			// Nothing identified, or a language with no dedicated selection value.
			return def;

		}
Exemplo n.º 30
0
        /// <summary>
        /// Identifies the language of a Spanish sample, builds a culture from the resulting
        /// code, applies it to the current thread and asserts on the thread's culture name.
        /// </summary>
        public void ChangeCurrentCulture()
        {
            var spanishMessage = "No stock disponible en la maquina por favor";

            var file = new FileInfo(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Configuration\Core14.profile.xml"));

            // The profile is only read. File.Open(path, FileMode.Open) would request
            // read/write access with no sharing, which fails on read-only files and while
            // another reader holds the file; OpenRead asks for shared read access only.
            using (var readStream = File.OpenRead(file.FullName))
            {
                var sut = new RankedLanguageIdentifierFactory();
                _identifier = sut.Load(readStream);
            }

            var spanishLanguageIdentifier = _identifier.Identify(spanishMessage);
            var mostCertainSpaLanguage    = spanishLanguageIdentifier.FirstOrDefault();
            var theSpaLanguage            = mostCertainSpaLanguage.Item1.Iso639_3;

            // NOTE(review): the culture is built from a three-letter ISO 639-3 code; confirm
            // the target runtime accepts it and that "es-MX" is the culture it resolves to.
            CultureInfo ci = new CultureInfo(theSpaLanguage);

            System.Threading.Thread.CurrentThread.CurrentUICulture = ci;
            System.Threading.Thread.CurrentThread.CurrentCulture   = CultureInfo.CreateSpecificCulture(ci.Name);
            System.Threading.Thread.CurrentThread.CurrentCulture.Name.Should().Be("es-MX", ci.Name);
        }
Exemplo n.º 31
0
        /// <summary>
        /// Stores the configuration, resets the date-limit state, loads the language
        /// identifier, builds the Instagram API client from the configured account and logs in.
        /// </summary>
        /// <param name="configuration">Source of the minimum media date and the Instagram account settings.</param>
        public async Task Initialize(Configuration configuration)
        {
            this.configuration = configuration;
            minDate            = configuration.MediaMinDate;
            dateLimitReached   = false;

            languageIdentifier = new RankedLanguageIdentifierFactory()
                                 .Load(Utils.GetPath(@"packages\NTextCat.0.2.1.30\Core14.profile.xml"));

            var builder = InstaApiBuilder.CreateBuilder();
            var session = new UserSessionData()
            {
                UserName  = configuration.InstagramUserName,
                Password  = configuration.InstagramPassword,
                CsrfToken = configuration.InstagramCsrfToken
            };

            instaApi = builder
                       .SetUser(session)
                       .SetRequestDelay(RequestDelay.FromSeconds(0, 0))
                       .Build();

            await instaApi.LoginAsync();
        }
Exemplo n.º 32
0
        /// <summary>
        /// Loads the ranked identifier from the profile path held in the
        /// "LanguageIdentificationProfileFilePath" setting, retrying through
        /// <c>HostingEnvironment.MapPath</c> when the configured path does not exist as given.
        /// </summary>
        /// <returns>The identifier loaded from the resolved profile path.</returns>
        /// <exception cref="InvalidOperationException">No profile path is configured or the resolved file does not exist.</exception>
        private static RankedLanguageIdentifier CreateIdentifier()
        {
            var loader = new RankedLanguageIdentifierFactory();
            var configuredPath =
                RankedLanguageIdentifierFactory.GetSetting("LanguageIdentificationProfileFilePath", (string)null);

            // Start from the configured path; switch to the mapped one only when the
            // configured path is set, is not found, and mapping yields a value.
            string resolvedPath = configuredPath;

            if (configuredPath != null && !System.IO.File.Exists(configuredPath))
            {
                Log.DebugFormat("Cannot find a profile in the following path: '{0}'. Trying HostingEnvironment.MapPath", configuredPath);
                resolvedPath = HostingEnvironment.MapPath(configuredPath) ?? configuredPath;
            }

            bool profileMissing = resolvedPath == null || !System.IO.File.Exists(resolvedPath);
            if (profileMissing)
            {
                Log.DebugFormat("Cannot find a profile in the following path: '{0}'.", resolvedPath);
                throw new InvalidOperationException(string.Format("Cannot find a profile in the following path: '{0}'.", resolvedPath));
            }

            return loader.Load(resolvedPath);
        }
Exemplo n.º 33
0
        /// <summary>
        /// Loads the Core14 profile from the Configuration folder under the application base
        /// directory and asserts on the ISO 639-3 codes identified for a Spanish and an
        /// English sample.
        /// </summary>
        public void RetriveLanguageDataFromString()
        {
            var spanishMessage = "como esta por que";
            var englishMessage = "why are we doing this";

            var file = new FileInfo(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Configuration\Core14.profile.xml"));

            // The profile is only read. File.Open(path, FileMode.Open) would request
            // read/write access with no sharing, which fails on read-only files and while
            // another reader holds the file; OpenRead asks for shared read access only.
            using (var readStream = File.OpenRead(file.FullName))
            {
                var sut = new RankedLanguageIdentifierFactory();
                _identifier = sut.Load(readStream);
            }

            var spanishLanguageIdentifier = _identifier.Identify(spanishMessage);
            var englishLanguageIdentifier = _identifier.Identify(englishMessage);
            var mostCertainSpaLanguage    = spanishLanguageIdentifier.FirstOrDefault();
            var mostCertainEngLanguage    = englishLanguageIdentifier.FirstOrDefault();

            var theSpaLanguage = mostCertainSpaLanguage.Item1.Iso639_3;
            var theEngLanguage = mostCertainEngLanguage.Item1.Iso639_3;

            // NOTE(review): the values compared are ISO 639-3 codes, while the expectations
            // are culture names ("es-MX", "en-US"); confirm which form is intended.
            theSpaLanguage.Should().Be("es-MX");
            theEngLanguage.Should().Be("en-US");
        }
Exemplo n.º 34
0
 /// <summary>
 /// Verifies that the identifier loaded from the profile file puts Russian ("rus") first
 /// for a short and for a long Russian sample.
 /// </summary>
 public void TestRankedLanguageIdentifierFactory()
 {
     var identifier = new RankedLanguageIdentifierFactory().Load(_identifierFile);

     var shortSampleTop = identifier.Identify("был зачитан вслух").First();
     Assert.That(shortSampleTop.Item1.Iso639_2T, Is.EqualTo("rus"));

     var longSampleTop = identifier.Identify("Главная задача сэмпла - предоставить желающим качать возможность оценить реальное качество материала без скачивания всей раздачи целиком. Поэтому вырезать сэмпл надо из середины фильма и без каких либо искажений. Достаточно фрагмента на 1-2 минуты. Заливать сэмпл следует только на файлообменники").First();
     Assert.That(longSampleTop.Item1.Iso639_2T, Is.EqualTo("rus"));
 }
Exemplo n.º 35
0
        /// <summary>
        /// Round trip: trains a ranked identifier from the test cases' standard texts and
        /// expects every query to be identified as its own case's language.
        /// </summary>
        public void TestTrainIdentifyCycle_Ranked()
        {
            TestCase[] cases = PrepareTestCases();
            var trainer = new RankedLanguageIdentifierFactory();

            // Each case contributes its ISO 639-2T code and a reader over its standard text.
            var corpus = cases.Select(
                c => Tuple.Create(new LanguageInfo(c.ISO639_2T, null, null, null), (TextReader)new StringReader(c.Standard)));
            var identifier = trainer.Train(corpus);

            foreach (var current in cases)
            {
                var winner = identifier.Identify(current.Query).First();
                Assert.That(winner.Item1.Iso639_2T, Is.EqualTo(current.ISO639_2T));
            }
        }
Exemplo n.º 36
0
 /// <summary>
 /// Creates a ranked identifier factory configured from <paramref name="settings"/> and
 /// loads the identifier from the embedded profile file.
 /// </summary>
 /// <param name="settings">Factory parameters and the embedded profile path.</param>
 /// <returns>The identifier read from the embedded profile.</returns>
 private static RankedLanguageIdentifier LoadLanguageIdentifier(IdentifierSettings settings)
 {
     var loader = new RankedLanguageIdentifierFactory(
         settings.MaxNGramLength,
         settings.MaximumSizeOfDistribution,
         settings.OccuranceNumberThreshold,
         settings.OnlyReadFirstNLines,
         false);

     // The profile stream is closed once loading has finished.
     using (var profileStream = OpenInternalFile(settings.EmbeddedProfilePath))
     {
         return loader.Load(profileStream);
     }
 }
Exemplo n.º 37
0
        private static void Main(string[] args)
        {
            var fileLines = File.ReadAllLines(TfIdfFilePath);
            Dictionary <int, Dictionary <string, decimal> > docVectors     = new();
            ConcurrentDictionary <int, string[]>            documentsTerms = new();

            Parallel.ForEach(Directory.GetFiles(StemmedFolder), filePath =>
            {
                // Console.WriteLine($"Processing terms of file {filePath}...");
                var fileLines1 = File.ReadAllLines(filePath);
                documentsTerms.GetOrAdd(int.Parse(Path.GetFileNameWithoutExtension(filePath)), fileLines1);
            });
            // Dictionary<int, Dictionary<string, decimal>> tfs = new();
            var totalDocumentCount = 100;

            ConcurrentDictionary <int, List <Numbers> > tfIdfCollection = new();
            ConcurrentDictionary <string, decimal>      uniqueTerms     = new(); // word - tfIdf

            var factory    = new RankedLanguageIdentifierFactory();
            var identifier = factory.Load(Config);

            // parse tf-idf
            foreach (var fileLine in fileLines)
            {
                var values = fileLine.Split(';', ' ').Where(x => !string.IsNullOrWhiteSpace(x)).ToArray();

                var documentIndex = int.Parse(values[1]);

                if (!tfIdfCollection.ContainsKey(documentIndex))
                {
                    tfIdfCollection.GetOrAdd(documentIndex, new List <Numbers>());
                }

                // if (!docVectors.ContainsKey(documentIndex))
                // {
                //     docVectors.Add(documentIndex, new Dictionary<string, decimal>());
                // }
                //
                // if (!tfs.ContainsKey(documentIndex))
                // {
                //     tfs.Add(documentIndex, new Dictionary<string, decimal>());
                // }

                var word  = values[0];
                var tf    = decimal.Parse(values[2]);
                var idf   = decimal.Parse(values[3]);
                var tfIdf = decimal.Parse(values[4]);
                tfIdfCollection[documentIndex].Add(new Numbers()
                {
                    Idf = idf, Tf = tf, Word = word, DocIndex = documentIndex, TfIdf = tfIdf
                });

                uniqueTerms.GetOrAdd(word, tfIdf);
                // docVectors[documentIndex].Add(word, decimal.Parse(tfIdf));
                // tfs[documentIndex].Add(word, decimal.Parse(tf));
            }

            var queryVector = new Numbers[uniqueTerms.Count];

            // Dictionary<int, List<decimal>> docResVectors = new();

            // normalize query terms
            var query = args.Select(x => PrepareWord(identifier, x)).ToArray();
            // List<decimal> queryVector = new();

            // step 3 - посчитать tf-idf для запроса
            // ReSharper disable once LoopCanBeConvertedToQuery

            // Console.WriteLine(string.Join(",", uniqueTerms.Take(10)));
            // Console.WriteLine(uniqueTerms["transact"]);
            var i1 = 0;

            foreach (var(term, _) in uniqueTerms)
            {
                var numberOfOccurrences = query.Count(x => x == term);
                var tf = numberOfOccurrences / (double)query.Length;

                if (tf == 0)
                {
                    queryVector[i1] = new Numbers {
                        Word = term
                    };
                    i1++;
                    continue;
                }

                var numberOfDocumentsContainingTheTerm = documentsTerms.Count(x => x.Value.Contains(term));
                var idf = Math.Log10(totalDocumentCount / (double)numberOfDocumentsContainingTheTerm);

                var tfIdf = tf * idf;
                queryVector[i1] = new Numbers()
                {
                    Word = term, TfIdf = Round(tfIdf), Idf = Round(idf), Tf = Round(tf)
                };
                i1++;

                // queryVector.Add(tfIdf);

                // foreach (var (key, _) in docVectors)
                // {
                //     if (!docResVectors.ContainsKey(key))
                //     {
                //         docResVectors.Add(key, new List<decimal>());
                //     }
                //
                //     docResVectors[key]
                //         .Add(docVectors.ContainsKey(key)
                //             ? docVectors[key]
                //                 .ContainsKey(queryWord)
                //                 ? docVectors[key][queryWord]
                //                 : 0
                //             : 0);
                // }
            }

            Dictionary <int, double> cos = new();

            // for (var j = 0; j < query.Length; j++)
            for (var i = 0; i < totalDocumentCount; i++)
            {
                var v1 = queryVector;
                var v2 = tfIdfCollection[i];

                if (v1.Length != v2.Count)
                {
                    throw new Exception("Vector count not match");
                }
                if (v1.Length != uniqueTerms.Count)
                {
                    throw new Exception("Vector count not match");
                }

                var sum = WordJoin(v1, v2);
                var p1  = Math.Sqrt(WordJoin(v1, v1));
                var p2  = Math.Sqrt(WordJoin(v2, v2));

                var cosVal = sum / (p1 * p2);
                cos.Add(i, cosVal);

                // var preparedQueryWord = query[j];
                // var tf = tfs[i].ContainsKey(preparedQueryWord) ? tfs[i][preparedQueryWord] : 0;
                // var tfMax = tfs[i].Max(x => x.Value);
                // var idf = Math.Log10(totalNumberOfDocuments / (double) (1 + docResVectors.Select(x => x.Value[j] > 0 ? 1 : 0).Sum()));
                // var value = idf * (0.5 + 0.5 * ((double) tf / (double) tfMax));

                // if (!cos.ContainsKey(j)) cos.Add(j, new Dictionary<int, double>());
                // cos[j].Add(i, value);
            }

            cos.OrderByDescending(x => x.Value).Take(10)
            .ForEach(x => Console.WriteLine(x.Key + ";" + Round(x.Value)));

            // Console.WriteLine(String.Join(",", queryVector));
            Console.WriteLine(cos.Select(y => y.Value).Any(x => x < 0 || x > 1));
        }
 // Type initializer: builds the shared identifier from the "Core14.profile.xml"
 // profile and stores it in the static _identifier field.
 static LanguageDetector()
 {
     _identifier = new RankedLanguageIdentifierFactory().Load("Core14.profile.xml");
 }
Exemplo n.º 39
0
        /// <summary>
        /// Command-line entry point: either trains a language identification profile
        /// from a set of files, or identifies the language of supplied text.
        /// </summary>
        /// <remarks>
        /// Default option values are read from the application settings. When
        /// <c>-train</c> is given, the matching files are passed to <c>TrainAndSave</c>
        /// together with the standard output stream. Otherwise a profile is loaded and
        /// the text to classify is taken from the <c>-l</c> argument, from standard
        /// input line by line (<c>-s</c>), or from the whole of standard input.
        /// If the options cannot be parsed, the error and the help text are printed
        /// and the method returns without doing any work.
        /// </remarks>
        /// <param name="args">Raw command-line arguments.</param>
        /// <exception cref="InvalidOperationException">
        /// No files match the directory or wildcard given to <c>-train</c>.
        /// </exception>
        static void Main(string[] args)
        {
            // Defaults come from the application configuration file.
            double defaultWorstAcceptableThreshold = XmlConvert.ToDouble(ConfigurationManager.AppSettings["WorstAcceptableThreshold"]);
            int defaultTooManyLanguagesThreshold = XmlConvert.ToInt32(ConfigurationManager.AppSettings["TooManyLanguagesThreshold"]);
            string defaultLanguageIdentificationProfileFilePath = ConfigurationManager.AppSettings["LanguageIdentificationProfileFilePath"];
            int defaultOccuranceNumberThreshold = XmlConvert.ToInt32(ConfigurationManager.AppSettings["OccuranceNumberThreshold"]);
            int defaultMaximumSizeOfDistribution = XmlConvert.ToInt32(ConfigurationManager.AppSettings["MaximumSizeOfDistribution"]);
            bool defaultDisallowMultithreading = XmlConvert.ToBoolean(ConfigurationManager.AppSettings["DisallowMultithreading"]);

            // Effective option values; each may be overridden from the command line.
            bool opt_help = false;
            string opt_train = null;
            Encoding opt_InputEncoding = null;
            string opt_classifyFromArgument = null;
            bool opt_classifyFromInputPerLine = false;
            double opt_WorstAcceptableThreshold = defaultWorstAcceptableThreshold;
            int opt_TooManyLanguagesThreshold = defaultTooManyLanguagesThreshold;
            string opt_LanguageIdentificationProfileFilePath = defaultLanguageIdentificationProfileFilePath;
            int opt_OccuranceNumberThreshold = defaultOccuranceNumberThreshold;
            int opt_OnlyReadFirstNLines = int.MaxValue;
            int opt_MaximumSizeOfDistribution = defaultMaximumSizeOfDistribution;
            // Accepted on the command line, but not read anywhere in this method.
            bool opt_verbose = false;
            bool opt_disallowMultithreading = defaultDisallowMultithreading;
            bool opt_noPrompt = false;

            int codepage;
            string currentExeName = Path.GetFileName(Environment.GetCommandLineArgs()[0]);

            OptionSet option_set = new OptionSet()
                .Add("?|help|h", "Prints out the options.", option => opt_help = option != null)
                .Add("train=",
                     "Trains from the files specified by VALUE. " + Environment.NewLine +
                     "VALUE can be a wildcard or a directory that contains training files (non-recursive)." + Environment.NewLine +
                     "Examples:" + Environment.NewLine +
                     "  " + currentExeName + " -train=SomeDir1" + Environment.NewLine +
                     "  " + currentExeName + " -train=c:\\temp\\dataset\\*.txt",
                     option => opt_train = option)
                .Add("e=",
                     "indicates which encoding to use to decode the input stream or files into proper text. Ignored when -l option is specified." + Environment.NewLine +
                     "If no encoding is specified, tries to detect the encoding via BOM or uses the standard system's encoding" + Environment.NewLine +
                     "The encoding is specified either via its codepage or its name, e.g. \"UTF-8\"",
                     // A numeric value is treated as a codepage, anything else as an encoding name.
                     option => opt_InputEncoding = int.TryParse(option, out codepage) ? Encoding.GetEncoding(codepage) : Encoding.GetEncoding(option))
                .Add("s",
                     @"Determine language of each line of input.",
                     option => opt_classifyFromInputPerLine = option != null)
                .Add("a=",
                     @"the program returns the best-scoring language together" + Environment.NewLine +
                     @"with all languages which are " + defaultWorstAcceptableThreshold + @" times worse (cf option -u). " + Environment.NewLine +
                     @"If the number of languages to be printed is larger than the value " + Environment.NewLine +
                     @"of this option (default: " + defaultTooManyLanguagesThreshold + @") then no language is returned, but" + Environment.NewLine +
                     @"instead a message that the input is of an unknown language is" + Environment.NewLine +
                     @"printed. Default: " + defaultTooManyLanguagesThreshold + @".",
                     (int option) => opt_TooManyLanguagesThreshold = option)
                .Add("p=",
                     @"indicates a file from which to load a language identification profile. Default: """ + defaultLanguageIdentificationProfileFilePath + @""".",
                     option => opt_LanguageIdentificationProfileFilePath = option)
                .Add("f=",
                     @"Before sorting is performed the Ngrams which occur this number" + Environment.NewLine +
                     @"of times or less are removed. This can be used to speed up" + Environment.NewLine +
                     @"the program for longer inputs. For short inputs you should use" + Environment.NewLine +
                     @"-f 0." + Environment.NewLine +
                     @"Default: " + defaultOccuranceNumberThreshold + @".",
                     (int option) => opt_OccuranceNumberThreshold = option)
                .Add("i=",
                     @"only read first N lines",
                     (int option) => opt_OnlyReadFirstNLines = option)
                .Add("l=",
                     @"indicates that input is given as an argument on the command line," + Environment.NewLine +
                     @"e.g. text_cat -l ""this is english text""" + Environment.NewLine +
                     @"Cannot be used in combination with -n.",
                     option => opt_classifyFromArgument = option)
                .Add("t=",
                     @"indicates the topmost number of ngrams that should be used." + Environment.NewLine +
                     @"If used in combination with -n this determines the size of the" + Environment.NewLine +
                     @"output. If used with categorization this determines" + Environment.NewLine +
                     @"the number of ngrams that are compared with each of the language" + Environment.NewLine +
                     @"models (but each of those models is used completely)." + Environment.NewLine +
                     @"Default: " + defaultMaximumSizeOfDistribution + @".",
                     (int option) => opt_MaximumSizeOfDistribution = option)
                .Add("u=",
                     @"determines how much worse result must be in order not to be" + Environment.NewLine +
                     "mentioned as an alternative. Typical value: 1.05 or 1.1. " + Environment.NewLine +
                     "Default: " + defaultWorstAcceptableThreshold + @".",
                     (double option) => opt_WorstAcceptableThreshold = option)
                .Add("v",
                     @"verbose. Continuation messages are written to standard error.",
                     option => opt_verbose = option != null)
                .Add("m",
                     @"disallow multithreading. If set to true, training and identification will use a single thread.",
                     option => opt_disallowMultithreading = option != null)
                .Add(NoPromptSwitch,
                     @"prevents text input prompt from being shown.",
                     option => opt_noPrompt = option != null);

            try
            {
                option_set.Parse(args);
            }
            catch (OptionException ex)
            {
                Console.WriteLine("Error occured: " + ex.ToString());
                ShowHelp(option_set);
                // The options are unusable: stop here instead of falling through and
                // training/classifying with partially applied settings.
                return;
            }

            if (opt_help)
            {
                ShowHelp(option_set);
                return;
            }

            if (opt_train != null)
            {
                string[] files;
                if (Directory.Exists(opt_train))
                {
                    files = Directory.GetFiles(opt_train);
                    if (files.Length == 0)
                    {
                        throw new InvalidOperationException("Cannot find files int the following directory: " + opt_train);
                    }
                }
                else // treat as a wildcard
                {
                    // Wildcard characters are replaced before calling GetDirectoryName to
                    // avoid System.ArgumentException: Illegal characters in path.
                    var path = Path.GetDirectoryName(opt_train.Replace('*', '_').Replace('?', '_')) ?? String.Empty;
                    var wildcard = opt_train.Substring(path.Length).TrimStart(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar);

                    files = string.IsNullOrWhiteSpace(wildcard)
                        ? null
                        : Directory.GetFiles(string.IsNullOrWhiteSpace(path) ? "." : path, wildcard, SearchOption.TopDirectoryOnly);

                    if (files == null || files.Length == 0)
                    {
                        throw new InvalidOperationException("Cannot find files with the following wildcard path: " + opt_train);
                    }
                }

                var factory = new RankedLanguageIdentifierFactory(5, opt_MaximumSizeOfDistribution, opt_OccuranceNumberThreshold, opt_OnlyReadFirstNLines, !opt_disallowMultithreading);

                // Each file becomes one language whose codes are the file name without extension.
                // NOTE(review): the readers created here are not disposed in this method;
                // confirm whether TrainAndSave takes ownership of them.
                var input =
                    files.Select(f =>
                                 Tuple.Create(
                                     new LanguageInfo(Path.GetFileNameWithoutExtension(f), Path.GetFileNameWithoutExtension(f), null, null),
                                     GetTextReader(f, opt_InputEncoding)));

                using (var standardOutput = Console.OpenStandardOutput())
                {
                    factory.TrainAndSave(input, standardOutput);
                }
            }
            else
            {
                var factory = new RankedLanguageIdentifierFactory(5, opt_MaximumSizeOfDistribution, opt_OccuranceNumberThreshold, opt_OnlyReadFirstNLines, !opt_disallowMultithreading);
                var languageIdentifier = factory.Load(opt_LanguageIdentificationProfileFilePath);

                if (opt_classifyFromArgument != null)
                {
                    var languages = languageIdentifier.Identify(opt_classifyFromArgument);
                    OutputIdentifiedLanguages(languages, opt_WorstAcceptableThreshold, opt_TooManyLanguagesThreshold);
                }
                else if (opt_classifyFromInputPerLine)
                {
                    if (!opt_noPrompt)
                    {
                        DisplayInputPrompt("Classify each line from text input");
                    }

                    using (Stream input = Console.OpenStandardInput())
                    using (var reader = GetTextReader(input, opt_InputEncoding))
                    {
                        string line;
                        while ((line = reader.ReadLine()) != null)
                        {
                            var languages = languageIdentifier.Identify(line);
                            OutputIdentifiedLanguages(languages, opt_WorstAcceptableThreshold, opt_TooManyLanguagesThreshold);
                        }
                    }
                }
                else // classify all from input
                {
                    if (!opt_noPrompt)
                    {
                        DisplayInputPrompt("Classify text input");
                    }

                    using (var input = Console.OpenStandardInput())
                    using (var reader = GetTextReader(input, opt_InputEncoding))
                    {
                        var text = reader.ReadToEnd();
                        var languages = languageIdentifier.Identify(text);
                        OutputIdentifiedLanguages(languages, opt_WorstAcceptableThreshold, opt_TooManyLanguagesThreshold);
                    }
                }
            }
        }
Exemplo n.º 40
0
        /// <summary>
        /// Command-line entry point: either trains a language identification profile
        /// from a set of files, or identifies the language of supplied text.
        /// </summary>
        /// <remarks>
        /// Default option values are read from the application settings. When
        /// <c>-train</c> is given, the matching files are passed to <c>TrainAndSave</c>
        /// together with the standard output stream. Otherwise a profile is loaded and
        /// the text to classify is taken from the <c>-l</c> argument, from standard
        /// input line by line (<c>-s</c>), or from the whole of standard input.
        /// If the options cannot be parsed, the error and the help text are printed
        /// and the method returns without doing any work.
        /// </remarks>
        /// <param name="args">Raw command-line arguments.</param>
        /// <exception cref="InvalidOperationException">
        /// No files match the directory or wildcard given to <c>-train</c>.
        /// </exception>
        static void Main(string[] args)
        {
            // Defaults come from the application configuration file.
            double defaultWorstAcceptableThreshold = XmlConvert.ToDouble(ConfigurationManager.AppSettings["WorstAcceptableThreshold"]);
            int defaultTooManyLanguagesThreshold = XmlConvert.ToInt32(ConfigurationManager.AppSettings["TooManyLanguagesThreshold"]);
            string defaultLanguageIdentificationProfileFilePath = ConfigurationManager.AppSettings["LanguageIdentificationProfileFilePath"];
            int defaultOccuranceNumberThreshold = XmlConvert.ToInt32(ConfigurationManager.AppSettings["OccuranceNumberThreshold"]);
            int defaultMaximumSizeOfDistribution = XmlConvert.ToInt32(ConfigurationManager.AppSettings["MaximumSizeOfDistribution"]);
            bool defaultDisallowMultithreading = XmlConvert.ToBoolean(ConfigurationManager.AppSettings["DisallowMultithreading"]);

            // Effective option values; each may be overridden from the command line.
            bool opt_help = false;
            string opt_train = null;
            Encoding opt_InputEncoding = null;
            string opt_classifyFromArgument = null;
            bool opt_classifyFromInputPerLine = false;
            double opt_WorstAcceptableThreshold = defaultWorstAcceptableThreshold;
            int opt_TooManyLanguagesThreshold = defaultTooManyLanguagesThreshold;
            string opt_LanguageIdentificationProfileFilePath = defaultLanguageIdentificationProfileFilePath;
            int opt_OccuranceNumberThreshold = defaultOccuranceNumberThreshold;
            int opt_OnlyReadFirstNLines = int.MaxValue;
            int opt_MaximumSizeOfDistribution = defaultMaximumSizeOfDistribution;
            // Accepted on the command line, but not read anywhere in this method.
            bool opt_verbose = false;
            bool opt_disallowMultithreading = defaultDisallowMultithreading;
            bool opt_noPrompt = false;

            int codepage;
            string currentExeName = Path.GetFileName(Environment.GetCommandLineArgs()[0]);

            OptionSet option_set = new OptionSet()
                .Add("?|help|h", "Prints out the options.", option => opt_help = option != null)
                .Add("train=",
                     "Trains from the files specified by VALUE. " + Environment.NewLine +
                     "VALUE can be a wildcard or a directory that contains training files (non-recursive)." + Environment.NewLine +
                     "Examples:" + Environment.NewLine +
                     "  " + currentExeName + " -train=SomeDir1" + Environment.NewLine +
                     "  " + currentExeName + " -train=c:\\temp\\dataset\\*.txt",
                     option => opt_train = option)
                .Add("e=",
                     "indicates which encoding to use to decode the input stream or files into proper text. Ignored when -l option is specified." + Environment.NewLine +
                     "If no encoding is specified, tries to detect the encoding via BOM or uses the standard system's encoding" + Environment.NewLine +
                     "The encoding is specified either via its codepage or its name, e.g. \"UTF-8\"",
                     // A numeric value is treated as a codepage, anything else as an encoding name.
                     option => opt_InputEncoding = int.TryParse(option, out codepage) ? Encoding.GetEncoding(codepage) : Encoding.GetEncoding(option))
                .Add("s",
                     @"Determine language of each line of input.",
                     option => opt_classifyFromInputPerLine = option != null)
                .Add("a=",
                     @"the program returns the best-scoring language together" + Environment.NewLine +
                     @"with all languages which are " + defaultWorstAcceptableThreshold + @" times worse (cf option -u). " + Environment.NewLine +
                     @"If the number of languages to be printed is larger than the value " + Environment.NewLine +
                     @"of this option (default: " + defaultTooManyLanguagesThreshold + @") then no language is returned, but" + Environment.NewLine +
                     @"instead a message that the input is of an unknown language is" + Environment.NewLine +
                     @"printed. Default: " + defaultTooManyLanguagesThreshold + @".",
                     (int option) => opt_TooManyLanguagesThreshold = option)
                .Add("p=",
                     @"indicates a file from which to load a language identification profile. Default: """ + defaultLanguageIdentificationProfileFilePath + @""".",
                     option => opt_LanguageIdentificationProfileFilePath = option)
                .Add("f=",
                     @"Before sorting is performed the Ngrams which occur this number" + Environment.NewLine +
                     @"of times or less are removed. This can be used to speed up" + Environment.NewLine +
                     @"the program for longer inputs. For short inputs you should use" + Environment.NewLine +
                     @"-f 0." + Environment.NewLine +
                     @"Default: " + defaultOccuranceNumberThreshold + @".",
                     (int option) => opt_OccuranceNumberThreshold = option)
                .Add("i=",
                     @"only read first N lines",
                     (int option) => opt_OnlyReadFirstNLines = option)
                .Add("l=",
                     @"indicates that input is given as an argument on the command line," + Environment.NewLine +
                     @"e.g. text_cat -l ""this is english text""" + Environment.NewLine +
                     @"Cannot be used in combination with -n.",
                     option => opt_classifyFromArgument = option)
                .Add("t=",
                     @"indicates the topmost number of ngrams that should be used." + Environment.NewLine +
                     @"If used in combination with -n this determines the size of the" + Environment.NewLine +
                     @"output. If used with categorization this determines" + Environment.NewLine +
                     @"the number of ngrams that are compared with each of the language" + Environment.NewLine +
                     @"models (but each of those models is used completely)." + Environment.NewLine +
                     @"Default: " + defaultMaximumSizeOfDistribution + @".",
                     (int option) => opt_MaximumSizeOfDistribution = option)
                .Add("u=",
                     @"determines how much worse result must be in order not to be" + Environment.NewLine +
                     "mentioned as an alternative. Typical value: 1.05 or 1.1. " + Environment.NewLine +
                     "Default: " + defaultWorstAcceptableThreshold + @".",
                     (double option) => opt_WorstAcceptableThreshold = option)
                .Add("v",
                     @"verbose. Continuation messages are written to standard error.",
                     option => opt_verbose = option != null)
                .Add("m",
                     @"disallow multithreading. If set to true, training and identification will use a single thread.",
                     option => opt_disallowMultithreading = option != null)
                .Add(NoPromptSwitch,
                     @"prevents text input prompt from being shown.",
                     option => opt_noPrompt = option != null);

            try
            {
                option_set.Parse(args);
            }
            catch (OptionException ex)
            {
                Console.WriteLine("Error occured: " + ex.ToString());
                ShowHelp(option_set);
                // The options are unusable: stop here instead of falling through and
                // training/classifying with partially applied settings.
                return;
            }

            if (opt_help)
            {
                ShowHelp(option_set);
                return;
            }

            if (opt_train != null)
            {
                string[] files;
                if (Directory.Exists(opt_train))
                {
                    files = Directory.GetFiles(opt_train);
                    if (files.Length == 0)
                    {
                        throw new InvalidOperationException("Cannot find files int the following directory: " + opt_train);
                    }
                }
                else // treat as a wildcard
                {
                    // Wildcard characters are replaced before calling GetDirectoryName to
                    // avoid System.ArgumentException: Illegal characters in path.
                    var path = Path.GetDirectoryName(opt_train.Replace('*', '_').Replace('?', '_')) ?? String.Empty;
                    var wildcard = opt_train.Substring(path.Length).TrimStart(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar);

                    files = string.IsNullOrWhiteSpace(wildcard)
                        ? null
                        : Directory.GetFiles(string.IsNullOrWhiteSpace(path) ? "." : path, wildcard, SearchOption.TopDirectoryOnly);

                    if (files == null || files.Length == 0)
                    {
                        throw new InvalidOperationException("Cannot find files with the following wildcard path: " + opt_train);
                    }
                }

                var factory = new RankedLanguageIdentifierFactory(5, opt_MaximumSizeOfDistribution, opt_OccuranceNumberThreshold, opt_OnlyReadFirstNLines, !opt_disallowMultithreading);

                // Each file becomes one language whose codes are the file name without extension.
                // NOTE(review): the readers created here are not disposed in this method;
                // confirm whether TrainAndSave takes ownership of them.
                var input =
                    files.Select(f =>
                                 Tuple.Create(
                                     new LanguageInfo(Path.GetFileNameWithoutExtension(f), Path.GetFileNameWithoutExtension(f), null, null),
                                     GetTextReader(f, opt_InputEncoding)));

                using (var standardOutput = Console.OpenStandardOutput())
                {
                    factory.TrainAndSave(input, standardOutput);
                }
            }
            else
            {
                var factory = new RankedLanguageIdentifierFactory(5, opt_MaximumSizeOfDistribution, opt_OccuranceNumberThreshold, opt_OnlyReadFirstNLines, !opt_disallowMultithreading);
                var languageIdentifier = factory.Load(opt_LanguageIdentificationProfileFilePath);

                if (opt_classifyFromArgument != null)
                {
                    var languages = languageIdentifier.Identify(opt_classifyFromArgument);
                    OutputIdentifiedLanguages(languages, opt_WorstAcceptableThreshold, opt_TooManyLanguagesThreshold);
                }
                else if (opt_classifyFromInputPerLine)
                {
                    if (!opt_noPrompt)
                    {
                        DisplayInputPrompt("Classify each line from text input");
                    }

                    using (Stream input = Console.OpenStandardInput())
                    using (var reader = GetTextReader(input, opt_InputEncoding))
                    {
                        string line;
                        while ((line = reader.ReadLine()) != null)
                        {
                            var languages = languageIdentifier.Identify(line);
                            OutputIdentifiedLanguages(languages, opt_WorstAcceptableThreshold, opt_TooManyLanguagesThreshold);
                        }
                    }
                }
                else // classify all from input
                {
                    if (!opt_noPrompt)
                    {
                        DisplayInputPrompt("Classify text input");
                    }

                    using (var input = Console.OpenStandardInput())
                    using (var reader = GetTextReader(input, opt_InputEncoding))
                    {
                        var text = reader.ReadToEnd();
                        var languages = languageIdentifier.Identify(text);
                        OutputIdentifiedLanguages(languages, opt_WorstAcceptableThreshold, opt_TooManyLanguagesThreshold);
                    }
                }
            }
        }
Exemplo n.º 41
0
 /// <summary>
 /// Builds a <c>RankedLanguageIdentifier</c> from the profile file named by the
 /// "LanguageIdentificationProfileFilePath" setting.
 /// </summary>
 /// <remarks>
 /// If the configured path does not exist as given, it is passed through
 /// <c>HostingEnvironment.MapPath</c> and the mapped result (when non-null) is used instead.
 /// </remarks>
 /// <returns>The identifier loaded from the resolved profile path.</returns>
 /// <exception cref="InvalidOperationException">
 /// The setting is missing, or no file exists at the resolved path.
 /// </exception>
 private static RankedLanguageIdentifier CreateIdentifier()
 {
     const string notFoundTemplate = "Cannot find a profile in the following path: '{0}'.";

     var profileFactory = new RankedLanguageIdentifierFactory();
     var configuredPath =
         RankedLanguageIdentifierFactory.GetSetting("LanguageIdentificationProfileFilePath", (string)null);

     // Start from the configured value; replace it only if mapping yields something.
     var resolvedPath = configuredPath;
     if (configuredPath != null && !System.IO.File.Exists(configuredPath))
     {
         Log.DebugFormat("Cannot find a profile in the following path: '{0}'. Trying HostingEnvironment.MapPath", configuredPath);
         resolvedPath = HostingEnvironment.MapPath(configuredPath) ?? configuredPath;
     }

     if (resolvedPath == null || !System.IO.File.Exists(resolvedPath))
     {
         Log.DebugFormat(notFoundTemplate, resolvedPath);
         throw new InvalidOperationException(string.Format(notFoundTemplate, resolvedPath));
     }

     return profileFactory.Load(resolvedPath);
 }