/// <summary>
/// Starts the bot: reads the credentials file, loads the language profile and the
/// word lists found in the "Dictionaries" folder, registers the command module,
/// then logs in, starts the client and waits indefinitely.
/// </summary>
/// <exception cref="NullReferenceException">
/// The credentials file deserializes to nothing or carries no bot token.
/// </exception>
private async Task MainAsync()
{
    var credentials = JsonConvert.DeserializeObject<Credentials>(File.ReadAllText("Keys/Credentials.json"));

    // A credentials document that deserializes to null is reported exactly like a
    // missing token instead of failing later with an unexplained null dereference.
    if (credentials == null || credentials.BotToken == null)
    {
        throw new NullReferenceException("Invalid Credentials file");
    }

    var factory = new RankedLanguageIdentifierFactory();
    _identifier = factory.Load("Keys/Core14.profile.xml");

    foreach (var file in Directory.GetFiles("Dictionaries"))
    {
        // File names are expected to look like "<key>.dic"; a name without any dot
        // has no second segment and is skipped rather than raising IndexOutOfRangeException.
        string[] nameParts = new FileInfo(file).Name.Split('.');
        if (nameParts.Length > 1 && nameParts[1] == "dic")
        {
            _dictionaries.Add(nameParts[0], WordList.CreateFromFiles(file));
        }
    }

    await _commands.AddModuleAsync<Communication>(null);
    Client.MessageReceived += HandleCommandAsync;
    StartTime = DateTime.Now;

    await Client.LoginAsync(TokenType.Bot, credentials.BotToken);
    await Client.StartAsync();

    // -1 means an infinite delay: keeps the method (and the process) alive.
    await Task.Delay(-1);
}
/// <summary>
/// Identifies the language of <paramref name="str"/> and maps it to a
/// <see cref="ContentLanguageSelection"/> value.
/// </summary>
/// <param name="str">Text to classify.</param>
/// <param name="def">Value returned when nothing is identified or the code is neither "jpn" nor "eng".</param>
/// <returns>Japanese for "jpn", English for "eng" (case-insensitive), otherwise <paramref name="def"/>.</returns>
public ContentLanguageSelection Detect(string str, ContentLanguageSelection def = ContentLanguageSelection.Unspecified)
{
    // The profile is loaded from LanguageFilePath on every call.
    var identifier = new RankedLanguageIdentifierFactory().Load(LanguageFilePath);
    var best = identifier.Identify(str).FirstOrDefault();
    if (best == null)
    {
        return def;
    }

    const StringComparison ignoreCase = StringComparison.InvariantCultureIgnoreCase;
    string code = best.Item1.Iso639_2T;

    return string.Equals(code, "jpn", ignoreCase) ? ContentLanguageSelection.Japanese
         : string.Equals(code, "eng", ignoreCase) ? ContentLanguageSelection.English
         : def;
}
/// <summary>
/// Identifies the language of the news content and stores the ISO 639-3 code of the
/// first candidate under the "Language" key of the item's data.
/// </summary>
/// <param name="news">Item whose Content is classified and whose Data receives the result.</param>
public void Apply(INews news)
{
    var candidates = new RankedLanguageIdentifierFactory()
        .Load(RecognitionFile)
        .Identify(news.Content);

    // First() throws if the identifier yields no candidate at all.
    news.Data.Set("Language", candidates.First().Item1.Iso639_3);
}
// References:
//   https://www.codeproject.com/Articles/43198/Detect-a-written-text-s-language
//   https://ivanakcheurov.github.io/ntextcat/
//   https://github.com/ivanakcheurov/ntextcat

/// <summary>
/// Identifies the language of <paramref name="InputText"/> using the profile at
/// <paramref name="ConfigFile"/>.
/// </summary>
/// <param name="InputText">Text to classify.</param>
/// <param name="ConfigFile">Absolute or relative path of the language profile (e.g. Core14.profile.xml); backslashes are converted to forward slashes.</param>
/// <returns>
/// The ISO 639-3 code of the first candidate, or a human-readable failure message
/// when no candidate is returned.
/// </returns>
public static string RunDetectLanguage(string InputText, String ConfigFile)
{
    // The language profile must be deployed with the application
    // (see the "content" folder of the NTextCat package).
    var factory = new RankedLanguageIdentifierFactory();

    // Path length limits differ per platform (260 chars on Windows, 4096 on Linux).
    var identifier = factory.Load(ConfigFile.Replace("\\", "/"));

    var best = identifier.Identify(InputText).FirstOrDefault();

    return best != null
        ? best.Item1.Iso639_3
        : "The language couldn’t be identified with an acceptable degree of certainty";
}
/// <summary>
/// Attempts to identify the language of <paramref name="text"/>.
/// </summary>
/// <param name="text">Text to classify.</param>
/// <returns>
/// The converted name of the first candidate's ISO 639-3 code, or an empty string when
/// nothing is identified, when an error occurs, or after an earlier call has failed.
/// </returns>
public string TryDetectLanguague(string text)
{
    // Once identification has failed, later calls return immediately.
    if (_LanguageIdentificationFailed)
    {
        return string.Empty;
    }

    string detected = string.Empty;
    try
    {
        // Factory and identifier are created on first use and then reused.
        bool needsLoading = _NTextCatFactory == null || _NTextCatIdentifier == null;
        if (needsLoading)
        {
            _NTextCatFactory = new RankedLanguageIdentifierFactory();
            _NTextCatIdentifier = _NTextCatFactory.Load(_NTextCatLanguageModelsPath);
        }

        var best = _NTextCatIdentifier.Identify(text).FirstOrDefault();
        if (best != null)
        {
            detected = ConvertISOLangugueNameToSystemName(best.Item1.Iso639_3);
        }
    }
    catch (Exception ex)
    {
        // Any failure is logged (if a logger is set) and disables further attempts.
        _LanguageIdentificationFailed = true;
        _Logger?.WriteLog(ex.ToString());
    }

    return detected;
}
/// <summary>
/// For every permutation returned by Permute(ALPHABET, 8), substitutes the digit
/// placeholders of SEQUENCE with the permutation's characters and passes the
/// lower-cased, concatenated result to DetectLanguage together with a running counter.
/// </summary>
static void Main(string[] args)
{
    Console.WriteLine("Program started at {0}", DateTime.Now.ToString());

    var identifier = new RankedLanguageIdentifierFactory().Load("Core14.profile.xml");

    int processed = 0;
    foreach (var permutation in Permute(ALPHABET, 8))
    {
        processed++;

        // Work on a private copy so SEQUENCE itself is never modified.
        string[] candidate = (string[])SEQUENCE.Clone();

        int position = 0;
        foreach (var letter in permutation)
        {
            // The placeholder for position k is the first character of k's decimal text.
            char placeholder = position.ToString()[0];
            for (int k = 0; k < candidate.Length; k++)
            {
                candidate[k] = candidate[k].Replace(placeholder, letter);
            }

            position++;
        }

        DetectLanguage(identifier, String.Join(null, candidate).ToLower(), processed);
    }

    Console.WriteLine("Finished processing at {0}", DateTime.Now.ToString());
    Console.ReadKey();
}
/// <summary>
/// Interactive loop: prompts for a line of text, prints the ISO 639-3 code of the
/// most likely language (or a failure message), then waits for Enter before
/// prompting again. The loop ends when standard input is exhausted.
/// </summary>
static void Main(string[] args)
{
    // Initialize the language identification library.
    var factory = new RankedLanguageIdentifierFactory();
    var identifier = factory.Load("NTextCat.0.2.1.30\\Core14.profile.xml");

    while (true)
    {
        Console.WriteLine("enter your text :");
        var read = Console.ReadLine();
        if (read == null)
        {
            // ReadLine returns null at end of input; stop instead of passing null to
            // the identifier and looping forever.
            break;
        }

        // Identify the language from the entered text.
        var mostCertainLanguage = identifier.Identify(read).FirstOrDefault();
        if (mostCertainLanguage != null)
        {
            Console.WriteLine("The language of the text is '{0}' \n", mostCertainLanguage.Item1.Iso639_3);
        }
        else
        {
            Console.WriteLine("the text coudnt be identified");
        }

        // Pause until Enter is pressed. The result is printed first so the user does
        // not have to press Enter a second time just to see it.
        Console.ReadLine();
    }
}
/// <summary>
/// Creates the naive-Bayes and ranked factories and loads one identifier from each,
/// both from the model file SelectedModel inside ModelFolder.
/// </summary>
public void Initialize()
{
    NBFactory = new NaiveBayesLanguageIdentifierFactory();
    RLFactory = new RankedLanguageIdentifierFactory();

    // Both identifiers are loaded from the same model file.
    string modelPath = Path.Combine(ModelFolder, SelectedModel);
    NBIdentifier = NBFactory.Load(modelPath);
    RLIdentifier = RLFactory.Load(modelPath);
}
/// <summary>
/// Loads the shared identifier from "Core14.profile.xml" located in the
/// application's base directory.
/// </summary>
static LanguageDetector()
{
    var factory = new RankedLanguageIdentifierFactory();
    string profilePath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "Core14.profile.xml");
    _identifier = factory.Load(profilePath);
}
/// <summary>
/// Creates the label dictionary and loads the ranked language identifier from
/// "Core14.profile.xml" (resolved relative to the current directory).
/// </summary>
public NoticeParserFactory()
{
    _tedLabelDictionary = new TedLabelDictionary();
    _rankedLanguageIdentifier = new RankedLanguageIdentifierFactory().Load("Core14.profile.xml");
}
/// <summary>
/// Loads the language profile from ".\Application\Configuration\Core14.profile.xml"
/// and stores the resulting identifier in <c>Identifier</c>.
/// </summary>
public static void Initialize()
{
    var file = new FileInfo(@".\Application\Configuration\Core14.profile.xml");

    // The profile is only read, so open it read-only and shareable. File.Open with just
    // FileMode.Open asks for read/write access without sharing, which fails on read-only
    // files and blocks other readers.
    using (var readStream = File.OpenRead(file.FullName))
    {
        var factory = new RankedLanguageIdentifierFactory();
        Identifier = factory.Load(readStream);
    }
}
/// <summary>
/// Loads the language identifier from the profile XML held in
/// <c>Resources.Core14_profile</c>, encoded as UTF-8 and read through an in-memory stream.
/// </summary>
public LanguageDetector()
{
    byte[] profileBytes = Encoding.UTF8.GetBytes(Resources.Core14_profile);
    var factory = new RankedLanguageIdentifierFactory();

    using (var profileStream = new MemoryStream(profileBytes))
    {
        _languageIdentifier = factory.Load(profileStream);
    }
}
/// <summary>
/// Loads the ranked identifier from the test profile and checks that a short and a
/// long Russian sample are both identified as "rus".
/// </summary>
public void TestRankedLanguageIdentifierFactory()
{
    var identifier = new RankedLanguageIdentifierFactory().Load(_identifierFile);

    string[] russianSamples =
    {
        "был зачитан вслух",
        "Главная задача сэмпла - предоставить желающим качать возможность оценить реальное качество материала без скачивания всей раздачи целиком. Поэтому вырезать сэмпл надо из середины фильма и без каких либо искажений. Достаточно фрагмента на 1-2 минуты. Заливать сэмпл следует только на файлообменники"
    };

    foreach (string sample in russianSamples)
    {
        var best = identifier.Identify(sample).First();
        Assert.That(best.Item1.Iso639_2T, Is.EqualTo("rus"));
    }
}
/// <summary>
/// Trains a ranked identifier on the standard text of every test case, then checks
/// that each test case's query is identified as that test case's language.
/// </summary>
public void TestTrainIdentifyCycle_Ranked()
{
    TestCase[] testCases = PrepareTestCases();

    // One (language, training text) pair per test case.
    var trainingSet = testCases.Select(sample =>
        Tuple.Create(
            new LanguageInfo(sample.ISO639_2T, null, null, null),
            (TextReader)new StringReader(sample.Standard)));

    var identifier = new RankedLanguageIdentifierFactory().Train(trainingSet);

    foreach (var expected in testCases)
    {
        var best = identifier.Identify(expected.Query).First();
        Assert.That(best.Item1.Iso639_2T, Is.EqualTo(expected.ISO639_2T));
    }
}
/// <summary>
/// Stores the degree of parallelism and loads the language identifier from the
/// profile embedded in the executing assembly.
/// </summary>
/// <param name="maxDegreeOfParallelism">Value assigned to <c>MaxDegreeOfParallelism</c>.</param>
/// <exception cref="System.Resources.MissingManifestResourceException">
/// The embedded profile resource is not present in the executing assembly.
/// </exception>
public GoogleLanguageDetection(int maxDegreeOfParallelism)
{
    MaxDegreeOfParallelism = maxDegreeOfParallelism;

    const string resourceName = "NCrawler.LanguageDetection.Google.Core14.profile.xml";
    Assembly assembly = Assembly.GetExecutingAssembly();

    using (Stream stream = assembly.GetManifestResourceStream(resourceName))
    {
        // GetManifestResourceStream returns null when the resource does not exist;
        // fail with a message that names the resource instead of handing null to Load.
        if (stream == null)
        {
            throw new System.Resources.MissingManifestResourceException(
                "Embedded language profile not found: " + resourceName);
        }

        RankedLanguageIdentifierFactory factory = new RankedLanguageIdentifierFactory();
        _identifier = factory.Load(stream);
    }
}
/// <summary>
/// Loads a naive-Bayes and a ranked identifier, each restricted to the models whose
/// ISO 639-2/T code is in <c>_mostCommonLanguages</c>, and passes each identifier's
/// Identify method to GetConfusions under the labels "Naive" and "Ranked".
/// </summary>
public void Evaluate()
{
    var naiveIdentifier = new NaiveBayesLanguageIdentifierFactory()
        .Load(_identifierFile, lm => _mostCommonLanguages.Contains(lm.Language.Iso639_2T));
    GetConfusions(naiveIdentifier.Identify, "Naive", _mostCommonLanguages);

    var rankedIdentifier = new RankedLanguageIdentifierFactory()
        .Load(_identifierFile, lm => _mostCommonLanguages.Contains(lm.Language.Iso639_2T));
    GetConfusions(rankedIdentifier.Identify, "Ranked", _mostCommonLanguages);
}
/// <summary>
/// Smoke check: loads a naive-Bayes and a ranked identifier restricted to the most
/// common languages (matched on the ISO 639-2/T code, or on the ISO 639-3 code when
/// the former is null) and runs each on a short phrase. Results are enumerated and
/// discarded; nothing is asserted.
/// </summary>
public void SanityCheck()
{
    var naiveIdentifier = new NaiveBayesLanguageIdentifierFactory()
        .Load(_identifierFile, lm => _mostCommonLanguages.Contains(lm.Language.Iso639_2T ?? lm.Language.Iso639_3));
    // ToArray forces the identification to actually run.
    naiveIdentifier.Identify("you got me").ToArray();

    var rankedIdentifier = new RankedLanguageIdentifierFactory()
        .Load(_identifierFile, lm => _mostCommonLanguages.Contains(lm.Language.Iso639_2T ?? lm.Language.Iso639_3));
    rankedIdentifier.Identify("you got me").ToArray();
}
/// <summary>
/// Builds a ranked identifier factory from <paramref name="settings"/> (with the
/// factory's last constructor argument fixed to false) and loads the identifier
/// from the internal file named by <c>settings.EmbeddedProfilePath</c>.
/// </summary>
/// <param name="settings">N-gram/distribution settings and the embedded profile path.</param>
/// <returns>The identifier loaded from the profile stream.</returns>
private static RankedLanguageIdentifier LoadLanguageIdentifier(IdentifierSettings settings)
{
    var factory = new RankedLanguageIdentifierFactory(
        settings.MaxNGramLength,
        settings.MaximumSizeOfDistribution,
        settings.OccuranceNumberThreshold,
        settings.OnlyReadFirstNLines,
        false);

    using (var profileStream = OpenInternalFile(settings.EmbeddedProfilePath))
    {
        return factory.Load(profileStream);
    }
}
/// <summary>
/// Returns a language code for <paramref name="text"/>: "eng" or "jpn" when the
/// character checks match, otherwise the ISO 639-3 code of the first candidate
/// reported by the identifier.
/// </summary>
/// <param name="text">Text to classify.</param>
/// <returns>
/// A language code, or an empty string when the profile file is missing or no
/// candidate is returned.
/// </returns>
string getLanguage(string text)
{
    // Character-based shortcuts take precedence over the identifier.
    if (isEnglishCharacters(text))
    {
        return "eng";
    }

    if (isJapaneseCharacters(text))
    {
        return "jpn";
    }

    // The identifier is created on first use from the profile next to the executable.
    if (ncIdentifier_ == null)
    {
        string exeDirectory = Path.GetDirectoryName(Application.ExecutablePath);
        string profilePath = Path.Combine(exeDirectory, @"Core14.profile.xml");
        if (!File.Exists(profilePath))
        {
            MessageBox.Show("Profile file of NTextCat not found.");
            return string.Empty;
        }

        ncIdentifier_ = new RankedLanguageIdentifierFactory().Load(profilePath);
    }

    var best = ncIdentifier_.Identify(text).FirstOrDefault();
    return best == null ? string.Empty : best.Item1.Iso639_3;
}
private static void Main() { var stopwatch = new Stopwatch(); stopwatch.Start(); if (!Directory.Exists(StemmedFolder)) { Directory.CreateDirectory(StemmedFolder); } var factory = new RankedLanguageIdentifierFactory(); var identifier = factory.Load(@"..\..\..\..\Core14.profile.xml"); Parallel.ForEach(Directory.GetFiles(ResultsFolder), (file, _, _) => { Console.WriteLine($"Processing {file}"); var text = File.ReadAllText(file); var text1 = Regex.Replace(text, new string(Splitters), " "); var sb = new StringBuilder(text1); sb = sb.ReplaceAll(Splitters.Select(x => x.ToString())); var wordsByWhiteSpace = sb.ToString(); var languages = identifier.Identify(wordsByWhiteSpace.Substring(100)); var mostCertainLanguage = languages.FirstOrDefault(); var langCode = mostCertainLanguage?.Item1.Iso639_3; Console.WriteLine($"Lang code: {langCode}"); IStemmer stemmer = langCode switch { "eng" => new EnglishStemmer(), "rus" => new RussianStemmer(), _ => throw new NotSupportedException() }; var stemmedFile = file.Replace(ResultsFolder, StemmedFolder); if (File.Exists(stemmedFile)) { File.Delete(stemmedFile); } using var stemmedFileWriter = File.AppendText(stemmedFile); foreach (var word in wordsByWhiteSpace.Split(" ").Where(x => !string.IsNullOrWhiteSpace(x))) { var stemmed = stemmer.Stem(word); stemmedFileWriter.WriteLine(stemmed); // Console.WriteLine($"{word} -> {stemmed}"); } });
/// <summary>
/// Identifies the language of <paramref name="inputText"/> using the profile
/// supplied by <c>LanguageIdentifier.GetCore14()</c>.
/// </summary>
/// <param name="inputText">Text to classify.</param>
/// <returns>The ISO 639-3 code of the first candidate, or an empty string when there is none.</returns>
public string DetectLanguage(string inputText)
{
    var identifier = new RankedLanguageIdentifierFactory().Load(LanguageIdentifier.GetCore14());
    var best = identifier.Identify(inputText).FirstOrDefault();

    return best == null ? String.Empty : best.Item1.Iso639_3;
}
/// <summary>
/// Processes the entries of <c>m_text</c> at indices a, a + m_threadsCount,
/// a + 2 * m_threadsCount, ...: identifies each entry's language and passes the
/// ISO 639-2/T code with the value of GetCountOfWords to AddPointToLang.
/// </summary>
/// <param name="a">Index of the first entry to process.</param>
public void DetectNcat(int a)
{
    var identifier = new RankedLanguageIdentifierFactory().Load("Core14.profile.xml");

    int index = a;
    while (index < m_text.Count)
    {
        string text = m_text[index];

        // First() throws if the identifier yields no candidate.
        string languageCode = identifier.Identify(text).First().Item1.Iso639_2T;
        int wordCount = GetCountOfWords(text, languageCode);
        AddPointToLang(languageCode, wordCount);

        index += m_threadsCount;
    }
}
/// <summary>
/// Identifies the language of <paramref name="sentence"/> using the default,
/// developer-machine profile location. Prefer the overload that takes the profile
/// path when running anywhere else.
/// </summary>
/// <param name="sentence">Text to classify.</param>
/// <returns>
/// The ISO 639-3 code of the first candidate, or an error message when no candidate
/// is returned.
/// </returns>
public static string IdentityLanguage(string sentence)
{
    return IdentityLanguage(
        sentence,
        @"C:\Users\Andrew Romanuk\source\Projects\SpilnaSpravaTask2\SpilnaSpravaTask2\LanguageModels\Core14.profile.xml");
}

/// <summary>
/// Identifies the language of <paramref name="sentence"/> using the language profile
/// at <paramref name="profilePath"/>.
/// </summary>
/// <param name="sentence">Text to classify.</param>
/// <param name="profilePath">Path of the language profile file (e.g. Core14.profile.xml).</param>
/// <returns>
/// The ISO 639-3 code of the first candidate, or an error message when no candidate
/// is returned.
/// </returns>
public static string IdentityLanguage(string sentence, string profilePath)
{
    var factory = new RankedLanguageIdentifierFactory();
    var identifier = factory.Load(profilePath);

    var mostCertainLanguage = identifier.Identify(sentence).FirstOrDefault();
    if (mostCertainLanguage != null)
    {
        return mostCertainLanguage.Item1.Iso639_3;
    }

    return "System error: The language couldn’t be identified with an acceptable degree of certainty";
}
/// <summary>
/// Identifies the language of <paramref name="input"/> and returns a two-letter code.
/// </summary>
/// <param name="input">Text to classify.</param>
/// <returns>
/// A two-letter language code derived from the identified ISO 639-3 code, or "en"
/// when no candidate is returned.
/// </returns>
private string GetLanguage(string input)
{
    var factory = new RankedLanguageIdentifierFactory();
    var identifier = factory.Load(configPath + "Core14.profile.xml");

    var mostCertainLanguage = identifier.Identify(input).FirstOrDefault();
    if (mostCertainLanguage == null)
    {
        return "en";
    }

    string iso3 = mostCertainLanguage.Item1.Iso639_3;
    switch (iso3)
    {
        // For these languages the ISO 639-1 code is NOT the first two letters of the
        // ISO 639-3 code, so plain truncation would yield "sp", "sw", "po" and "jp".
        case "spa": return "es";
        case "swe": return "sv";
        case "por": return "pt";
        case "jpn": return "ja";

        default:
            // Other codes keep the original prefix rule (e.g. "eng" -> "en", "deu" -> "de").
            // Codes of two characters or fewer are returned unchanged instead of
            // making Substring throw.
            return iso3.Length > 2 ? iso3.Substring(0, 2) : iso3;
    }
}
/// <summary>
/// Classifies <paramref name="str"/> and translates the identified language into a
/// <see cref="ContentLanguageSelection"/>.
/// </summary>
/// <param name="str">Text to classify.</param>
/// <param name="def">Fallback returned when no candidate is found or the language is neither Japanese nor English.</param>
/// <returns>Japanese for "jpn", English for "eng" (compared ignoring case), otherwise <paramref name="def"/>.</returns>
public ContentLanguageSelection Detect(string str, ContentLanguageSelection def = ContentLanguageSelection.Unspecified)
{
    var factory = new RankedLanguageIdentifierFactory();
    var topMatch = factory.Load(LanguageFilePath).Identify(str).FirstOrDefault();

    // Start from the fallback and override it only on a recognised code.
    var selection = def;
    if (topMatch != null)
    {
        var isoCode = topMatch.Item1.Iso639_2T;
        if (string.Equals(isoCode, "jpn", StringComparison.InvariantCultureIgnoreCase))
        {
            selection = ContentLanguageSelection.Japanese;
        }
        else if (string.Equals(isoCode, "eng", StringComparison.InvariantCultureIgnoreCase))
        {
            selection = ContentLanguageSelection.English;
        }
    }

    return selection;
}
/// <summary>
/// Identifies the language of a Spanish sentence, builds a culture from the resulting
/// ISO 639-3 code, applies it to the current thread and asserts on the resulting
/// culture name.
/// </summary>
public void ChangeCurrentCulture()
{
    var spanishMessage = "No stock disponible en la maquina por favor";
    var file = new FileInfo(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Configuration\Core14.profile.xml"));

    // The profile is only read: open it read-only and shareable. File.Open with just
    // FileMode.Open requests read/write access without sharing, which fails on
    // read-only files and when another test has the profile open.
    using (var readStream = File.OpenRead(file.FullName))
    {
        var sut = new RankedLanguageIdentifierFactory();
        _identifier = sut.Load(readStream);
    }

    var spanishLanguageIdentifier = _identifier.Identify(spanishMessage);
    var mostCertainSpaLanguage = spanishLanguageIdentifier.FirstOrDefault();
    var theSpaLanguage = mostCertainSpaLanguage.Item1.Iso639_3;

    // NOTE(review): the culture is created from a three-letter ISO 639-3 code and the
    // expected name is "es-MX" - confirm this is what the runtime actually resolves to.
    CultureInfo ci = new CultureInfo(theSpaLanguage);
    System.Threading.Thread.CurrentThread.CurrentUICulture = ci;
    System.Threading.Thread.CurrentThread.CurrentCulture = CultureInfo.CreateSpecificCulture(ci.Name);

    System.Threading.Thread.CurrentThread.CurrentCulture.Name.Should().Be("es-MX", ci.Name);
}
/// <summary>
/// Stores the configuration, resets the date-limit state, loads the language
/// identifier and builds the Instagram API client from the configured user session,
/// then logs in.
/// </summary>
/// <param name="configuration">Source of the minimum media date and the Instagram session values.</param>
public async Task Initialize(Configuration configuration)
{
    this.configuration = configuration;
    minDate = configuration.MediaMinDate;
    dateLimitReached = false;

    var factory = new RankedLanguageIdentifierFactory();
    string profilePath = Utils.GetPath(@"packages\NTextCat.0.2.1.30\Core14.profile.xml");
    languageIdentifier = factory.Load(profilePath);

    var builder = InstaApiBuilder.CreateBuilder();
    var session = new UserSessionData
    {
        UserName = configuration.InstagramUserName,
        Password = configuration.InstagramPassword,
        CsrfToken = configuration.InstagramCsrfToken
    };

    instaApi = builder
        .SetUser(session)
        .SetRequestDelay(RequestDelay.FromSeconds(0, 0))
        .Build();

    await instaApi.LoginAsync();
}
/// <summary>
/// Loads the ranked identifier from the profile named by the
/// "LanguageIdentificationProfileFilePath" setting. If that path does not exist as
/// given, it is resolved through <c>HostingEnvironment.MapPath</c> before giving up.
/// </summary>
/// <returns>The identifier loaded from the resolved profile path.</returns>
/// <exception cref="InvalidOperationException">No profile file exists at the resolved path, or no path is configured.</exception>
private static RankedLanguageIdentifier CreateIdentifier()
{
    var factory = new RankedLanguageIdentifierFactory();
    var configuredPath = RankedLanguageIdentifierFactory.GetSetting("LanguageIdentificationProfileFilePath", (string)null);

    var resolvedPath = configuredPath;
    if (configuredPath != null && !System.IO.File.Exists(configuredPath))
    {
        Log.DebugFormat("Cannot find a profile in the following path: '{0}'. Trying HostingEnvironment.MapPath", configuredPath);

        // Keep the configured path when mapping yields null.
        resolvedPath = HostingEnvironment.MapPath(configuredPath) ?? configuredPath;
    }

    if (resolvedPath == null || !System.IO.File.Exists(resolvedPath))
    {
        Log.DebugFormat("Cannot find a profile in the following path: '{0}'.", resolvedPath);
        throw new InvalidOperationException(string.Format("Cannot find a profile in the following path: '{0}'.", resolvedPath));
    }

    return factory.Load(resolvedPath);
}
/// <summary>
/// Loads the language profile and checks that a Spanish and an English sentence are
/// identified with the ISO 639-3 codes "spa" and "eng".
/// </summary>
public void RetriveLanguageDataFromString()
{
    var spanishMessage = "como esta por que";
    var englishMessage = "why are we doing this";
    var file = new FileInfo(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, @"Configuration\Core14.profile.xml"));

    // Read-only, shareable open: File.Open with just FileMode.Open requests read/write
    // access without sharing, which fails on read-only files and concurrent readers.
    using (var readStream = File.OpenRead(file.FullName))
    {
        var sut = new RankedLanguageIdentifierFactory();
        _identifier = sut.Load(readStream);
    }

    var mostCertainSpaLanguage = _identifier.Identify(spanishMessage).FirstOrDefault();
    var mostCertainEngLanguage = _identifier.Identify(englishMessage).FirstOrDefault();

    var theSpaLanguage = mostCertainSpaLanguage.Item1.Iso639_3;
    var theEngLanguage = mostCertainEngLanguage.Item1.Iso639_3;

    // Iso639_3 holds three-letter language codes, so it can never equal a culture
    // name such as "es-MX" or "en-US"; compare against the ISO 639-3 codes instead.
    theSpaLanguage.Should().Be("spa");
    theEngLanguage.Should().Be("eng");
}
/// <summary>
/// Round-trip check for the ranked identifier: train on each test case's standard
/// text, then verify every query is identified as its own language.
/// </summary>
public void TestTrainIdentifyCycle_Ranked()
{
    TestCase[] testCases = PrepareTestCases();
    var factory = new RankedLanguageIdentifierFactory();

    // Training input: the language code paired with a reader over the sample text.
    var corpus =
        from sample in testCases
        select Tuple.Create(
            new LanguageInfo(sample.ISO639_2T, null, null, null),
            (TextReader)new StringReader(sample.Standard));

    var identifier = factory.Train(corpus);

    for (int i = 0; i < testCases.Length; i++)
    {
        var identified = identifier.Identify(testCases[i].Query).First().Item1.Iso639_2T;
        Assert.That(identified, Is.EqualTo(testCases[i].ISO639_2T));
    }
}
/// <summary>
/// Creates a ranked identifier from the embedded profile referenced by
/// <paramref name="settings"/>, using the n-gram and distribution limits it carries.
/// </summary>
/// <param name="settings">Factory parameters and the embedded profile path.</param>
/// <returns>The identifier read from the embedded profile.</returns>
private static RankedLanguageIdentifier LoadLanguageIdentifier(IdentifierSettings settings)
{
    var maxNGramLength = settings.MaxNGramLength;
    var maximumSizeOfDistribution = settings.MaximumSizeOfDistribution;
    var occuranceNumberThreshold = settings.OccuranceNumberThreshold;
    var onlyReadFirstNLines = settings.OnlyReadFirstNLines;

    // The final constructor argument is always false here.
    var factory = new RankedLanguageIdentifierFactory(
        maxNGramLength,
        maximumSizeOfDistribution,
        occuranceNumberThreshold,
        onlyReadFirstNLines,
        false);

    using (var embeddedProfile = OpenInternalFile(settings.EmbeddedProfilePath))
    {
        var loaded = factory.Load(embeddedProfile);
        return loaded;
    }
}
/// <summary>
/// Ranks documents against the query given on the command line by cosine similarity
/// of tf-idf vectors.
/// </summary>
/// <remarks>
/// Steps: read the per-document term lists from StemmedFolder; parse the precomputed
/// tf-idf rows from TfIdfFilePath; normalise the query words with PrepareWord; build
/// the query's tf-idf vector over all known terms; compute the cosine against the
/// vectors of documents 0..99; print the ten best "index;score" pairs, followed by a
/// flag telling whether any score fell outside [0, 1].
/// </remarks>
/// <param name="args">Query words.</param>
private static void Main(string[] args)
{
    var tfIdfRows = File.ReadAllLines(TfIdfFilePath);

    // Document index (taken from the file name) -> lines of the stemmed document.
    ConcurrentDictionary<int, string[]> documentsTerms = new();
    Parallel.ForEach(Directory.GetFiles(StemmedFolder), stemmedPath =>
    {
        var stemmedLines = File.ReadAllLines(stemmedPath);
        var documentId = int.Parse(Path.GetFileNameWithoutExtension(stemmedPath));
        documentsTerms.GetOrAdd(documentId, stemmedLines);
    });

    // The number of documents is fixed; indices 0..99 must all be present below.
    var totalDocumentCount = 100;

    ConcurrentDictionary<int, List<Numbers>> tfIdfCollection = new();
    ConcurrentDictionary<string, decimal> uniqueTerms = new(); // word -> first tf-idf seen

    var factory = new RankedLanguageIdentifierFactory();
    var identifier = factory.Load(Config);

    // Parse the tf-idf rows; after splitting on ';' and ' ' the fields are read as
    // word, document index, tf, idf, tf-idf.
    foreach (var row in tfIdfRows)
    {
        var fields = row.Split(';', ' ').Where(part => !string.IsNullOrWhiteSpace(part)).ToArray();
        var documentIndex = int.Parse(fields[1]);
        var documentEntries = tfIdfCollection.GetOrAdd(documentIndex, _ => new List<Numbers>());

        var word = fields[0];
        var rowTf = decimal.Parse(fields[2]);
        var rowIdf = decimal.Parse(fields[3]);
        var rowTfIdf = decimal.Parse(fields[4]);

        documentEntries.Add(new Numbers
        {
            Idf = rowIdf,
            Tf = rowTf,
            Word = word,
            DocIndex = documentIndex,
            TfIdf = rowTfIdf
        });
        uniqueTerms.GetOrAdd(word, rowTfIdf);
    }

    var queryVector = new Numbers[uniqueTerms.Count];

    // Normalise the query terms.
    var query = args.Select(rawWord => PrepareWord(identifier, rawWord)).ToArray();

    // Step 3 - compute tf-idf for the query, one slot per known term.
    var slot = 0;
    foreach (var termEntry in uniqueTerms)
    {
        var term = termEntry.Key;
        var occurrences = query.Count(queryWord => queryWord == term);
        var queryTf = occurrences / (double)query.Length;

        if (queryTf == 0)
        {
            // Term absent from the query: leave all weights at their defaults.
            queryVector[slot] = new Numbers { Word = term };
        }
        else
        {
            var documentsWithTerm = documentsTerms.Count(document => document.Value.Contains(term));
            var queryIdf = Math.Log10(totalDocumentCount / (double)documentsWithTerm);
            var queryTfIdf = queryTf * queryIdf;

            queryVector[slot] = new Numbers
            {
                Word = term,
                TfIdf = Round(queryTfIdf),
                Idf = Round(queryIdf),
                Tf = Round(queryTf)
            };
        }

        slot++;
    }

    // Cosine similarity between the query vector and every document vector.
    Dictionary<int, double> cos = new();
    for (var documentIndex = 0; documentIndex < totalDocumentCount; documentIndex++)
    {
        var documentVector = tfIdfCollection[documentIndex];

        if (queryVector.Length != documentVector.Count)
        {
            throw new Exception("Vector count not match");
        }

        if (queryVector.Length != uniqueTerms.Count)
        {
            throw new Exception("Vector count not match");
        }

        var dotProduct = WordJoin(queryVector, documentVector);
        var queryNorm = Math.Sqrt(WordJoin(queryVector, queryVector));
        var documentNorm = Math.Sqrt(WordJoin(documentVector, documentVector));

        cos.Add(documentIndex, dotProduct / (queryNorm * documentNorm));
    }

    cos.OrderByDescending(pair => pair.Value)
       .Take(10)
       .ForEach(pair => Console.WriteLine(pair.Key + ";" + Round(pair.Value)));

    // Prints True when at least one similarity lies outside [0, 1].
    Console.WriteLine(cos.Select(pair => pair.Value).Any(value => value < 0 || value > 1));
}
/// <summary>
/// Loads the shared identifier once from "Core14.profile.xml" (resolved relative to
/// the current directory).
/// </summary>
static LanguageDetector()
{
    _identifier = new RankedLanguageIdentifierFactory().Load("Core14.profile.xml");
}
/// <summary>
/// Command-line entry point. Reads defaults from the application settings, parses the
/// command-line options, and then either trains a language identification profile
/// (written to standard output) or loads a profile and classifies text taken from an
/// argument, from standard input line by line, or from standard input as a whole.
/// </summary>
/// <param name="args">Command-line arguments; see the option descriptions below.</param>
static void Main(string[] args)
{
    //Debugger.Launch();
    //MemoryStream s = new MemoryStream();
    //Console.OpenStandardInput().CopyTo(s);

    // Default values are read from the application configuration.
    double defaultWorstAcceptableThreshold = XmlConvert.ToDouble(ConfigurationManager.AppSettings["WorstAcceptableThreshold"]);
    int defaultTooManyLanguagesThreshold = XmlConvert.ToInt32(ConfigurationManager.AppSettings["TooManyLanguagesThreshold"]);
    string defaultLanguageIdentificationProfileFilePath = ConfigurationManager.AppSettings["LanguageIdentificationProfileFilePath"];
    int defaultOccuranceNumberThreshold = XmlConvert.ToInt32(ConfigurationManager.AppSettings["OccuranceNumberThreshold"]);
    int defaultMaximumSizeOfDistribution = XmlConvert.ToInt32(ConfigurationManager.AppSettings["MaximumSizeOfDistribution"]);
    bool defaultDisallowMultithreading = XmlConvert.ToBoolean(ConfigurationManager.AppSettings["DisallowMultithreading"]);

    // Option values, initialised to their defaults and overwritten by the parser callbacks.
    bool opt_help = false;
    string opt_train = null;
    Encoding opt_InputEncoding = null;
    string opt_classifyFromArgument = null;
    bool opt_classifyFromInputPerLine = false;
    double opt_WorstAcceptableThreshold = defaultWorstAcceptableThreshold;
    int opt_TooManyLanguagesThreshold = defaultTooManyLanguagesThreshold;
    string opt_LanguageIdentificationProfileFilePath = defaultLanguageIdentificationProfileFilePath;
    int opt_OccuranceNumberThreshold = defaultOccuranceNumberThreshold;
    int opt_OnlyReadFirstNLines = int.MaxValue;
    int opt_MaximumSizeOfDistribution = defaultMaximumSizeOfDistribution;
    // NOTE(review): opt_verbose is set by the "v" option but not read anywhere in this method.
    bool opt_verbose = false;
    bool opt_disallowMultithreading = defaultDisallowMultithreading;
    bool opt_noPrompt = false;
    int codepage;

    // Executable name, used in the usage examples of the "train" option.
    string currentExeName = Path.GetFileName(Environment.GetCommandLineArgs()[0]);

    OptionSet option_set = new OptionSet()
        .Add("?|help|h", "Prints out the options.", option => opt_help = option != null)
        .Add("train=",
             "Trains from the files specified by VALUE. " + Environment.NewLine +
             "VALUE can be a wildcard or a directory that contains training files (non-recursive)."
             + Environment.NewLine +
             "Examples:" + Environment.NewLine +
             " " + currentExeName + " -train=SomeDir1" + Environment.NewLine +
             " " + currentExeName + " -train=c:\\temp\\dataset\\*.txt",
             option => { opt_train = option; })
        .Add("e=",
             "indicates which encoding to use to decode the input stream or files into proper text. Ignored when -l option is specified." + Environment.NewLine +
             "If no encoding is specified, tries to detect the encoding via BOM or uses the standard system's encoding" + Environment.NewLine +
             "The encoding is specified either via its codepage or its name, e.g. \"UTF-8\"",
             // A numeric value is treated as a codepage, anything else as an encoding name.
             option => opt_InputEncoding = int.TryParse(option, out codepage) ? Encoding.GetEncoding(codepage) : Encoding.GetEncoding(option))
        .Add("s", @"Determine language of each line of input.", option => opt_classifyFromInputPerLine = option != null)
        .Add("a=",
             @"the program returns the best-scoring language together" + Environment.NewLine +
             @"with all languages which are " + defaultWorstAcceptableThreshold + @" times worse (cf option -u). " + Environment.NewLine +
             @"If the number of languages to be printed is larger than the value " + Environment.NewLine +
             @"of this option (default: " + defaultTooManyLanguagesThreshold + @") then no language is returned, but" + Environment.NewLine +
             @"instead a message that the input is of an unknown language is" + Environment.NewLine +
             @"printed. Default: " + defaultTooManyLanguagesThreshold + @".",
             (int option) => opt_TooManyLanguagesThreshold = option)
        //.Add("d=",
        //     @"indicates in which directory the language models are" + Environment.NewLine +
        //     @"located (files ending in .lm). Currently only a single" + Environment.NewLine +
        //     @"directory is supported. Default: """ + defaultLanguageModelsDirectory + @""".",
        //     option => opt_LanguageModelsDirectory = option)
        // The verbatim string below contains a literal line break; it is part of the help text.
        .Add("p=",
             @"indicates a file from which to load a language identification profile. 
Default: """ + defaultLanguageIdentificationProfileFilePath + @""".",
             option => opt_LanguageIdentificationProfileFilePath = option)
        .Add("f=",
             @"Before sorting is performed the Ngrams which occur this number" + Environment.NewLine +
             @"of times or less are removed. This can be used to speed up" + Environment.NewLine +
             @"the program for longer inputs. For short inputs you should use" + Environment.NewLine +
             @"-f 0." + Environment.NewLine +
             @"Default: " + defaultOccuranceNumberThreshold + @".",
             (int option) => opt_OccuranceNumberThreshold = option)
        .Add("i=", @"only read first N lines", (int option) => opt_OnlyReadFirstNLines = option)
        .Add("l=",
             @"indicates that input is given as an argument on the command line," + Environment.NewLine +
             @"e.g. text_cat -l ""this is english text""" + Environment.NewLine +
             @"Cannot be used in combination with -n.",
             option => opt_classifyFromArgument = option)
        .Add("t=",
             @"indicates the topmost number of ngrams that should be used." + Environment.NewLine +
             @"If used in combination with -n this determines the size of the" + Environment.NewLine +
             @"output. If used with categorization this determines" + Environment.NewLine +
             @"the number of ngrams that are compared with each of the language" + Environment.NewLine +
             @"models (but each of those models is used completely)." + Environment.NewLine +
             @"Default: " + defaultMaximumSizeOfDistribution + @".",
             (int option) => opt_MaximumSizeOfDistribution = option)
        .Add("u=",
             @"determines how much worse result must be in order not to be" + Environment.NewLine +
             "mentioned as an alternative. Typical value: 1.05 or 1.1. " + Environment.NewLine +
             "Default: " + defaultWorstAcceptableThreshold + @".",
             (double option) => opt_WorstAcceptableThreshold = option)
        .Add("v", @"verbose. Continuation messages are written to standard error.", option => opt_verbose = option != null)
        // The verbatim string below contains a literal line break; it is part of the help text.
        .Add("m",
             @"disallow multithreading. 
If set to true, training and identification will use a single thread.",
             option => opt_disallowMultithreading = option != null)
        .Add(NoPromptSwitch, @"prevents text input prompt from being shown.", option => opt_noPrompt = option != null);

    try
    {
        option_set.Parse(args);
    }
    catch (OptionException ex)
    {
        // NOTE(review): after a parse error the help is shown but execution continues
        // with whatever option values were set so far - confirm this is intended.
        Console.WriteLine("Error occured: " + ex.ToString());
        ShowHelp(option_set);
    }

    if (opt_help)
    {
        ShowHelp(option_set);
        return;
    }

    if (opt_train != null)
    {
        // Training mode: collect the training files, train, and write the profile to standard output.
        string[] files = null;
        if (Directory.Exists(opt_train))
        {
            files = Directory.GetFiles(opt_train);
            if (files.Length == 0)
            {
                throw new InvalidOperationException("Cannot find files int the following directory: " + opt_train);
            }
        }
        else // treat as a wildcard
        {
            // avoiding System.ArgumentException: Illegal characters in path.
            var path = Path.GetDirectoryName(opt_train.Replace('*', '_').Replace('?', '_')) ?? String.Empty;
            // Whatever follows the directory part (minus leading separators) is the search pattern.
            var wildcard = opt_train.Substring(path.Length).TrimStart(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar);
            bool failed = true;
            if (!string.IsNullOrWhiteSpace(wildcard))
            {
                // An empty directory part means the current directory.
                files = Directory.GetFiles(string.IsNullOrWhiteSpace(path) ? "."
                    : path, wildcard, SearchOption.TopDirectoryOnly);
                if (files.Length > 0)
                {
                    failed = false;
                }
            }
            if (failed)
            {
                throw new InvalidOperationException("Cannot find files with the following wildcard path: " + opt_train);
            }
        }

        var factory = new RankedLanguageIdentifierFactory(5, opt_MaximumSizeOfDistribution, opt_OccuranceNumberThreshold, opt_OnlyReadFirstNLines, !opt_disallowMultithreading);

        // Each file becomes one training entry; the file name without extension is passed
        // as the first two LanguageInfo arguments.
        var input =
            files.Select(f =>
                Tuple.Create(
                    new LanguageInfo(Path.GetFileNameWithoutExtension(f), Path.GetFileNameWithoutExtension(f), null, null),
                    GetTextReader(f, opt_InputEncoding)));

        using (var standardOutput = Console.OpenStandardOutput())
        {
            // The returned identifier is not used; the call writes to standardOutput.
            var identifier = factory.TrainAndSave(input, standardOutput);
        }
    }
    else
    {
        // Classification mode: load the profile, then pick the input source.
        var factory = new RankedLanguageIdentifierFactory(5, opt_MaximumSizeOfDistribution, opt_OccuranceNumberThreshold, opt_OnlyReadFirstNLines, !opt_disallowMultithreading);
        var languageIdentifier = factory.Load(opt_LanguageIdentificationProfileFilePath);

        if (opt_classifyFromArgument != null)
        {
            // Text given directly with -l.
            var languages = languageIdentifier.Identify(opt_classifyFromArgument);
            OutputIdentifiedLanguages(languages, opt_WorstAcceptableThreshold, opt_TooManyLanguagesThreshold);
        }
        else if (opt_classifyFromInputPerLine)
        {
            if (!opt_noPrompt)
            {
                DisplayInputPrompt("Classify each line from text input");
            }

            using (Stream input = Console.OpenStandardInput())
            using (var reader = GetTextReader(input, opt_InputEncoding))
            {
                // One classification per input line until the input ends.
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    var languages = languageIdentifier.Identify(line);
                    OutputIdentifiedLanguages(languages, opt_WorstAcceptableThreshold, opt_TooManyLanguagesThreshold);
                }
            }
        }
        else // classify all from input
        {
            if (!opt_noPrompt)
            {
                DisplayInputPrompt("Classify text input");
            }

            using (var input = Console.OpenStandardInput())
            using (var reader = GetTextReader(input, opt_InputEncoding))
            {
                // The whole input is classified as a single text.
                var text = reader.ReadToEnd();
                var languages = languageIdentifier.Identify(text);
                OutputIdentifiedLanguages(languages, opt_WorstAcceptableThreshold,
                    opt_TooManyLanguagesThreshold);
            }
        }
    }
}
/// <summary>
/// Command-line entry point. Reads default settings from the application configuration,
/// parses the command-line options and then runs in one of two modes:
/// training (when <c>-train</c> is given) or classification (otherwise).
/// </summary>
/// <remarks>
/// Training mode resolves the <c>-train</c> value to a list of files (a directory or a wildcard),
/// pairs each file with a <c>LanguageInfo</c> built from its file name without extension, and passes
/// them to <c>TrainAndSave</c> together with the standard output stream.
/// Classification mode loads the profile named by <c>-p</c> (or the configured default) and identifies
/// either the <c>-l</c> argument, each line of standard input (<c>-s</c>), or all of standard input.
/// </remarks>
/// <param name="args">The raw command-line arguments.</param>
/// <exception cref="InvalidOperationException">
/// Thrown in training mode when the directory or wildcard given to <c>-train</c> matches no files.
/// </exception>
static void Main(string[] args)
{
    // Defaults come from the appSettings section of the application configuration.
    double defaultWorstAcceptableThreshold = XmlConvert.ToDouble(ConfigurationManager.AppSettings["WorstAcceptableThreshold"]);
    int defaultTooManyLanguagesThreshold = XmlConvert.ToInt32(ConfigurationManager.AppSettings["TooManyLanguagesThreshold"]);
    string defaultLanguageIdentificationProfileFilePath = ConfigurationManager.AppSettings["LanguageIdentificationProfileFilePath"];
    int defaultOccuranceNumberThreshold = XmlConvert.ToInt32(ConfigurationManager.AppSettings["OccuranceNumberThreshold"]);
    int defaultMaximumSizeOfDistribution = XmlConvert.ToInt32(ConfigurationManager.AppSettings["MaximumSizeOfDistribution"]);
    bool defaultDisallowMultithreading = XmlConvert.ToBoolean(ConfigurationManager.AppSettings["DisallowMultithreading"]);

    // Option values; the option callbacks below overwrite these.
    bool opt_help = false;
    string opt_train = null;
    Encoding opt_InputEncoding = null;
    string opt_classifyFromArgument = null;
    bool opt_classifyFromInputPerLine = false;
    double opt_WorstAcceptableThreshold = defaultWorstAcceptableThreshold;
    int opt_TooManyLanguagesThreshold = defaultTooManyLanguagesThreshold;
    string opt_LanguageIdentificationProfileFilePath = defaultLanguageIdentificationProfileFilePath;
    int opt_OccuranceNumberThreshold = defaultOccuranceNumberThreshold;
    int opt_OnlyReadFirstNLines = int.MaxValue;
    int opt_MaximumSizeOfDistribution = defaultMaximumSizeOfDistribution;
    // NOTE(review): opt_verbose is set by the "v" option but never read inside this method.
    bool opt_verbose = false;
    bool opt_disallowMultithreading = defaultDisallowMultithreading;
    bool opt_noPrompt = false;
    int codepage;

    string currentExeName = Path.GetFileName(Environment.GetCommandLineArgs()[0]);

    OptionSet option_set = new OptionSet()
        .Add("?|help|h", "Prints out the options.", option => opt_help = option != null)
        .Add("train=",
            "Trains from the files specified by VALUE. " + Environment.NewLine +
            "VALUE can be a wildcard or a directory that contains training files (non-recursive)." + Environment.NewLine +
            "Examples:" + Environment.NewLine +
            " " + currentExeName + " -train=SomeDir1" + Environment.NewLine +
            " " + currentExeName + " -train=c:\\temp\\dataset\\*.txt",
            option => { opt_train = option; })
        .Add("e=",
            "indicates which encoding to use to decode the input stream or files into proper text. Ignored when -l option is specified." + Environment.NewLine +
            "If no encoding is specified, tries to detect the encoding via BOM or uses the standard system's encoding" + Environment.NewLine +
            "The encoding is specified either via its codepage or its name, e.g. \"UTF-8\"",
            // A numeric value is treated as a codepage, anything else as an encoding name.
            option => opt_InputEncoding = int.TryParse(option, out codepage) ? Encoding.GetEncoding(codepage) : Encoding.GetEncoding(option))
        .Add("s", @"Determine language of each line of input.", option => opt_classifyFromInputPerLine = option != null)
        .Add("a=",
            @"the program returns the best-scoring language together" + Environment.NewLine +
            @"with all languages which are " + defaultWorstAcceptableThreshold + @" times worse (cf option -u). " + Environment.NewLine +
            @"If the number of languages to be printed is larger than the value " + Environment.NewLine +
            @"of this option (default: " + defaultTooManyLanguagesThreshold + @") then no language is returned, but" + Environment.NewLine +
            @"instead a message that the input is of an unknown language is" + Environment.NewLine +
            @"printed. Default: " + defaultTooManyLanguagesThreshold + @".",
            (int option) => opt_TooManyLanguagesThreshold = option)
        // The verbatim literal below spans two source lines, so the help text contains a line break there.
        .Add("p=",
            @"indicates a file from which to load a language identification profile. 
Default: """ + defaultLanguageIdentificationProfileFilePath + @""".",
            option => opt_LanguageIdentificationProfileFilePath = option)
        .Add("f=",
            @"Before sorting is performed the Ngrams which occur this number" + Environment.NewLine +
            @"of times or less are removed. This can be used to speed up" + Environment.NewLine +
            @"the program for longer inputs. For short inputs you should use" + Environment.NewLine +
            @"-f 0." + Environment.NewLine +
            @"Default: " + defaultOccuranceNumberThreshold + @".",
            (int option) => opt_OccuranceNumberThreshold = option)
        .Add("i=", @"only read first N lines", (int option) => opt_OnlyReadFirstNLines = option)
        .Add("l=",
            @"indicates that input is given as an argument on the command line," + Environment.NewLine +
            @"e.g. text_cat -l ""this is english text""" + Environment.NewLine +
            @"Cannot be used in combination with -n.",
            option => opt_classifyFromArgument = option)
        .Add("t=",
            @"indicates the topmost number of ngrams that should be used." + Environment.NewLine +
            @"If used in combination with -n this determines the size of the" + Environment.NewLine +
            @"output. If used with categorization this determines" + Environment.NewLine +
            @"the number of ngrams that are compared with each of the language" + Environment.NewLine +
            @"models (but each of those models is used completely)." + Environment.NewLine +
            @"Default: " + defaultMaximumSizeOfDistribution + @".",
            (int option) => opt_MaximumSizeOfDistribution = option)
        .Add("u=",
            @"determines how much worse result must be in order not to be" + Environment.NewLine +
            "mentioned as an alternative. Typical value: 1.05 or 1.1. " + Environment.NewLine +
            "Default: " + defaultWorstAcceptableThreshold + @".",
            (double option) => opt_WorstAcceptableThreshold = option)
        .Add("v", @"verbose. Continuation messages are written to standard error.", option => opt_verbose = option != null)
        // The verbatim literal below spans two source lines, so the help text contains a line break there.
        .Add("m",
            @"disallow multithreading. 
If set to true, training and identification will use a single thread.",
            option => opt_disallowMultithreading = option != null)
        .Add(NoPromptSwitch, @"prevents text input prompt from being shown.", option => opt_noPrompt = option != null);

    try
    {
        option_set.Parse(args);
    }
    catch (OptionException ex)
    {
        Console.WriteLine("Error occured: " + ex.ToString());
        ShowHelp(option_set);
        // Stop here: the options are only partially applied, so falling through would
        // train or classify (possibly blocking on standard input) with unintended settings.
        return;
    }

    if (opt_help)
    {
        ShowHelp(option_set);
        return;
    }

    if (opt_train != null)
    {
        string[] files = null;
        if (Directory.Exists(opt_train))
        {
            files = Directory.GetFiles(opt_train);
            if (files.Length == 0)
            {
                throw new InvalidOperationException("Cannot find files int the following directory: " + opt_train);
            }
        }
        else // treat as a wildcard
        {
            // Wildcard characters are replaced before asking for the directory part,
            // avoiding System.ArgumentException: Illegal characters in path.
            var path = Path.GetDirectoryName(opt_train.Replace('*', '_').Replace('?', '_')) ?? String.Empty;
            var wildcard = opt_train.Substring(path.Length).TrimStart(Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar);

            if (!string.IsNullOrWhiteSpace(wildcard))
            {
                files = Directory.GetFiles(string.IsNullOrWhiteSpace(path) ? "." : path, wildcard, SearchOption.TopDirectoryOnly);
            }

            // Either there was no wildcard part at all, or it matched nothing.
            if (files == null || files.Length == 0)
            {
                throw new InvalidOperationException("Cannot find files with the following wildcard path: " + opt_train);
            }
        }

        // NOTE(review): the literal 5 is presumably the maximum n-gram length - confirm against RankedLanguageIdentifierFactory.
        var factory = new RankedLanguageIdentifierFactory(5, opt_MaximumSizeOfDistribution, opt_OccuranceNumberThreshold, opt_OnlyReadFirstNLines, !opt_disallowMultithreading);

        // Each training file is labelled with its file name without extension.
        var input = files.Select(f =>
            Tuple.Create(
                new LanguageInfo(Path.GetFileNameWithoutExtension(f), Path.GetFileNameWithoutExtension(f), null, null),
                GetTextReader(f, opt_InputEncoding)));

        using (var standardOutput = Console.OpenStandardOutput())
        {
            factory.TrainAndSave(input, standardOutput);
        }
    }
    else
    {
        var factory = new RankedLanguageIdentifierFactory(5, opt_MaximumSizeOfDistribution, opt_OccuranceNumberThreshold, opt_OnlyReadFirstNLines, !opt_disallowMultithreading);
        var languageIdentifier = factory.Load(opt_LanguageIdentificationProfileFilePath);

        if (opt_classifyFromArgument != null)
        {
            var languages = languageIdentifier.Identify(opt_classifyFromArgument);
            OutputIdentifiedLanguages(languages, opt_WorstAcceptableThreshold, opt_TooManyLanguagesThreshold);
        }
        else if (opt_classifyFromInputPerLine)
        {
            if (!opt_noPrompt)
            {
                DisplayInputPrompt("Classify each line from text input");
            }

            using (Stream input = Console.OpenStandardInput())
            using (var reader = GetTextReader(input, opt_InputEncoding))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    var languages = languageIdentifier.Identify(line);
                    OutputIdentifiedLanguages(languages, opt_WorstAcceptableThreshold, opt_TooManyLanguagesThreshold);
                }
            }
        }
        else // classify all from input
        {
            if (!opt_noPrompt)
            {
                DisplayInputPrompt("Classify text input");
            }

            using (var input = Console.OpenStandardInput())
            using (var reader = GetTextReader(input, opt_InputEncoding))
            {
                var text = reader.ReadToEnd();
                var languages = languageIdentifier.Identify(text);
                OutputIdentifiedLanguages(languages, opt_WorstAcceptableThreshold, opt_TooManyLanguagesThreshold);
            }
        }
    }
}
/// <summary>
/// Creates a <see cref="RankedLanguageIdentifier"/> from the profile file named by the
/// "LanguageIdentificationProfileFilePath" setting.
/// </summary>
/// <remarks>
/// When the configured path does not exist as a file, it is passed to
/// <c>HostingEnvironment.MapPath</c> and the mapped result, if non-null, is used instead.
/// </remarks>
/// <returns>The identifier loaded from the resolved profile file.</returns>
/// <exception cref="InvalidOperationException">
/// Thrown when no path is configured or the resolved path does not exist as a file.
/// </exception>
private static RankedLanguageIdentifier CreateIdentifier()
{
    var identifierFactory = new RankedLanguageIdentifierFactory();
    var configuredPath = RankedLanguageIdentifierFactory.GetSetting("LanguageIdentificationProfileFilePath", (string)null);

    // Start from the configured value and replace it only if mapping yields something.
    var resolvedPath = configuredPath;
    if (configuredPath != null && !System.IO.File.Exists(configuredPath))
    {
        Log.DebugFormat("Cannot find a profile in the following path: '{0}'. Trying HostingEnvironment.MapPath", configuredPath);
        resolvedPath = HostingEnvironment.MapPath(configuredPath) ?? configuredPath;
    }

    if (resolvedPath != null && System.IO.File.Exists(resolvedPath))
    {
        return identifierFactory.Load(resolvedPath);
    }

    Log.DebugFormat("Cannot find a profile in the following path: '{0}'.", resolvedPath);
    throw new InvalidOperationException(string.Format("Cannot find a profile in the following path: '{0}'.", resolvedPath));
}