/// <summary>
/// Rebuilds <c>languageProfiles</c> from the "*.txt" files found under
/// <paramref name="dir"/> and all of its subdirectories.
/// The first two characters of each file name are treated as a language code;
/// files whose code resolves to <c>Language.Unspecified</c> are skipped silently.
/// All files of one language feed a single profile. After every file has been
/// read, each profile is trimmed with <paramref name="cutOff"/> and ranked.
/// </summary>
/// <param name="dir">Corpus root folder; must not be null and must exist.</param>
/// <param name="cutOff">Value handed to <c>LanguageProfile.Trim</c>; must be at least 1.</param>
/// <exception cref="ArgumentNullException"><paramref name="dir"/> is null (reported through <c>Utils.ThrowException</c>).</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="cutOff"/> is less than 1 (reported through <c>Utils.ThrowException</c>).</exception>
/// <exception cref="InvalidOperationException">
/// The profile collection has not been created; the original inline note also
/// says <c>DoRanking</c> throws this exception.
/// </exception>
public void ReadCorpus(string dir, int cutOff)
{
    Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
    Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/ true) ? new ArgumentValueException("dir") : null);
    Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);
    // The collection is dereferenced unconditionally further down, so a null
    // collection could only ever end in a NullReferenceException. Fail early
    // with a descriptive error instead of half-way through the corpus.
    if (languageProfiles == null)
    {
        throw new InvalidOperationException("The language profile collection is not initialized.");
    }
    languageProfiles.Clear();
    string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
    Array.Sort(files);
    // Files are sorted by full path while the language comes from the file name,
    // so files of one language may be interleaved with other languages when the
    // corpus spans several subdirectories. Looking the profile up by language
    // keeps exactly one profile per language in that case.
    System.Collections.Generic.Dictionary<Language, LanguageProfile> profilesByLanguage
        = new System.Collections.Generic.Dictionary<Language, LanguageProfile>();
    foreach (string f in files)
    {
        string fileName = Path.GetFileName(f);
        // Invariant lower-casing: the code is an identifier, not display text,
        // and must not depend on the current culture (e.g. Turkish 'I').
        string fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
        Language lang = TextMiningUtils.GetLanguage(fileLangCode);
        // Skips file names starting with unknown language code
        if (lang == Language.Unspecified)
        {
            continue;
        }
        mLogger.Debug("ReadCorpus", lang + ":\t" + fileName);
        LanguageProfile langProfile;
        if (profilesByLanguage.TryGetValue(lang, out langProfile))
        {
            // Adds corpus file to the profile already created for this language
            langProfile.AddTokensFromFile(f);
        }
        else
        {
            // Adds new language
            langProfile = new LanguageProfile(n, lang);
            langProfile.AddTokensFromFile(f);
            languageProfiles.Add(langProfile);
            profilesByLanguage.Add(lang, langProfile);
        }
    }
    // Does ranking of language profiles
    // No n-grams should be added to languages after this!
    foreach (LanguageProfile l in languageProfiles)
    {
        l.Trim(cutOff);
        l.DoRanking(); // throws InvalidOperationException
    }
}
/// <summary>
/// Rebuilds <c>mLanguageProfiles</c> from the "*.txt" files found under
/// <paramref name="dir"/> and all of its subdirectories.
/// The first two characters of each file name are treated as a language code;
/// files whose code resolves to <c>Language.Unspecified</c> are logged as a
/// warning and skipped. All files of one language feed a single profile. After
/// every file has been read, each profile is trimmed with
/// <paramref name="cutOff"/> and ranked.
/// </summary>
/// <param name="dir">Corpus root folder; must not be null and must exist.</param>
/// <param name="cutOff">Value handed to <c>LanguageProfile.Trim</c>; must be at least 1.</param>
/// <param name="codePage">Passed unchanged to the <c>LanguageProfile</c> constructor.</param>
/// <param name="loadAs">
/// Passed unchanged to <c>LanguageProfile.AddTokensFromFile</c>
/// (presumably the encoding used to read each corpus file; confirm against that method).
/// </param>
/// <exception cref="ArgumentNullException"><paramref name="dir"/> is null (reported through <c>Utils.ThrowException</c>).</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="cutOff"/> is less than 1 (reported through <c>Utils.ThrowException</c>).</exception>
/// <exception cref="InvalidOperationException">Per the original inline note, thrown by <c>DoRanking</c>.</exception>
public void BuildProfilesFromCorpus(string dir, int cutOff, Encoding codePage, Encoding loadAs)
{
    Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
    Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/ true) ? new ArgumentValueException("dir") : null);
    Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);
    mLanguageProfiles.Clear();
    string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
    Array.Sort(files);
    // Files are sorted by full path while the language comes from the file name,
    // so files of one language may be interleaved with other languages when the
    // corpus spans several subdirectories. Looking the profile up by language
    // keeps exactly one profile per language in that case.
    System.Collections.Generic.Dictionary<Language, LanguageProfile> profilesByLanguage
        = new System.Collections.Generic.Dictionary<Language, LanguageProfile>();
    foreach (string f in files)
    {
        string fileName = Path.GetFileName(f);
        // Invariant lower-casing: the code is an identifier, not display text,
        // and must not depend on the current culture (e.g. Turkish 'I').
        string fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
        Language lang = TextMiningUtils.GetLanguage(fileLangCode);
        // skip file names starting with unknown language code
        if (lang == Language.Unspecified)
        {
            mLogger.Warn("BuildProfilesFromCorpus", "Unknown: " + fileName);
            continue;
        }
        mLogger.Debug("BuildProfilesFromCorpus", lang + ": " + fileName);
        LanguageProfile langProfile;
        if (profilesByLanguage.TryGetValue(lang, out langProfile))
        {
            // add corpus file to the profile already created for this language
            langProfile.AddTokensFromFile(f, loadAs);
        }
        else
        {
            // add new language
            langProfile = new LanguageProfile(mN, lang, codePage);
            langProfile.AddTokensFromFile(f, loadAs);
            mLanguageProfiles.Add(langProfile);
            profilesByLanguage.Add(lang, langProfile);
        }
    }
    // ranks n-grams
    // *** n-grams should not be added to languages after this
    foreach (LanguageProfile l in mLanguageProfiles)
    {
        l.Trim(cutOff);
        l.DoRanking(); // throws InvalidOperationException
    }
}
/// <summary>
/// Rebuilds <c>mLanguageProfiles</c> from the "*.txt" files found under
/// <paramref name="dir"/> and all of its subdirectories.
/// The first two characters of each file name are treated as a language code;
/// files whose code resolves to <c>Language.Unspecified</c> are logged as a
/// warning and skipped. All files of one language feed a single profile. After
/// every file has been read, each profile is trimmed with
/// <paramref name="cutOff"/> and ranked.
/// </summary>
/// <param name="dir">Corpus root folder; must not be null and must exist.</param>
/// <param name="cutOff">Value handed to <c>LanguageProfile.Trim</c>; must be at least 1.</param>
/// <exception cref="ArgumentNullException"><paramref name="dir"/> is null (reported through <c>Utils.ThrowException</c>).</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="cutOff"/> is less than 1 (reported through <c>Utils.ThrowException</c>).</exception>
/// <exception cref="InvalidOperationException">Per the original inline note, thrown by <c>DoRanking</c>.</exception>
public void BuildProfilesFromCorpus(string dir, int cutOff)
{
    Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
    Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/true) ? new ArgumentValueException("dir") : null);
    Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);
    mLanguageProfiles.Clear();
    string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
    Array.Sort(files);
    // Files are sorted by full path while the language comes from the file name,
    // so files of one language may be interleaved with other languages when the
    // corpus spans several subdirectories. Looking the profile up by language
    // keeps exactly one profile per language in that case.
    System.Collections.Generic.Dictionary<Language, LanguageProfile> profilesByLanguage
        = new System.Collections.Generic.Dictionary<Language, LanguageProfile>();
    foreach (string f in files)
    {
        string fileName = Path.GetFileName(f);
        // Invariant lower-casing: the code is an identifier, not display text,
        // and must not depend on the current culture (e.g. Turkish 'I').
        string fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
        Language lang = TextMiningUtils.GetLanguage(fileLangCode);
        // skip file names starting with unknown language code
        if (lang == Language.Unspecified)
        {
            mLogger.Warn("BuildProfilesFromCorpus", "Unknown: " + fileName);
            continue;
        }
        mLogger.Debug("BuildProfilesFromCorpus", lang + ": " + fileName);
        LanguageProfile langProfile;
        if (profilesByLanguage.TryGetValue(lang, out langProfile))
        {
            // add corpus file to the profile already created for this language
            langProfile.AddTokensFromFile(f);
        }
        else
        {
            // add new language
            langProfile = new LanguageProfile(mN, lang);
            langProfile.AddTokensFromFile(f);
            mLanguageProfiles.Add(langProfile);
            profilesByLanguage.Add(lang, langProfile);
        }
    }
    // ranks n-grams
    // *** n-grams should not be added to languages after this
    foreach (LanguageProfile l in mLanguageProfiles)
    {
        l.Trim(cutOff);
        l.DoRanking(); // throws InvalidOperationException
    }
}