/// <summary>
/// Rebuilds the language profiles from the "*.txt" files found under <paramref name="dir"/>
/// (sub-directories included). The first two characters of each file name are used as the
/// language code; files whose code resolves to <c>Language.Unspecified</c> are skipped.
/// All files of one language are accumulated into a single profile, after which every
/// profile is trimmed with <paramref name="cutOff"/> and ranked.
/// </summary>
/// <param name="dir">Existing corpus folder.</param>
/// <param name="cutOff">Value handed to <c>LanguageProfile.Trim</c>; must be at least 1.</param>
/// <exception cref="ArgumentNullException"><paramref name="dir"/> is null.</exception>
/// <exception cref="ArgumentValueException"><paramref name="dir"/> fails folder verification.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="cutOff"/> is less than 1.</exception>
/// <exception cref="InvalidOperationException">The profile list has not been created.</exception>
public void ReadCorpus(string dir, int cutOff)
{
    Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
    Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/ true) ? new ArgumentValueException("dir") : null);
    Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);
    // The list is dereferenced unconditionally further down, so a missing list is reported
    // up front instead of surfacing as a NullReferenceException after files were processed.
    Utils.ThrowException(languageProfiles == null ? new InvalidOperationException() : null);
    languageProfiles.Clear();
    string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
    Array.Sort(files);
    // Files are sorted by full path, so files of one language that live in different
    // sub-directories are not necessarily adjacent. Profiles are therefore looked up by
    // language rather than by comparing with the previously seen file.
    System.Collections.Generic.Dictionary<Language, LanguageProfile> profilesByLang
        = new System.Collections.Generic.Dictionary<Language, LanguageProfile>();
    foreach (string f in files)
    {
        string fileName = Path.GetFileName(f);
        // Invariant lower-casing keeps the code independent of the current culture
        // (e.g. "IT" must not become a dotless-i string under Turkish settings).
        string fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
        Language lang = TextMiningUtils.GetLanguage(fileLangCode);
        // Skips file names starting with unknown language code
        if (lang == Language.Unspecified)
        {
            continue;
        }
        mLogger.Debug("ReadCorpus", lang + ":\t" + fileName);
        LanguageProfile langProfile;
        bool isNewLanguage = !profilesByLang.TryGetValue(lang, out langProfile);
        if (isNewLanguage)
        {
            langProfile = new LanguageProfile(n, lang);
        }
        langProfile.AddTokensFromFile(f);
        if (isNewLanguage)
        {
            // Registered only after its first file loaded successfully.
            profilesByLang.Add(lang, langProfile);
            languageProfiles.Add(langProfile);
        }
    }
    // Does ranking of language profiles
    // No n-grams should be added to languages after this!
    foreach (LanguageProfile l in languageProfiles)
    {
        l.Trim(cutOff);
        l.DoRanking(); // throws InvalidOperationException
    }
}
/// <summary>
/// Rebuilds the language profiles from the "*.txt" files found under <paramref name="dir"/>
/// (sub-directories included). The first two characters of each file name are used as the
/// language code; files whose code resolves to <c>Language.Unspecified</c> are logged as
/// unknown and skipped. All files of one language are accumulated into a single profile,
/// after which every profile is trimmed with <paramref name="cutOff"/> and ranked.
/// </summary>
/// <param name="dir">Existing corpus folder.</param>
/// <param name="cutOff">Value handed to <c>LanguageProfile.Trim</c>; must be at least 1.</param>
/// <param name="codePage">Encoding passed to the <c>LanguageProfile</c> constructor.</param>
/// <param name="loadAs">Encoding passed to <c>AddTokensFromFile</c> for every corpus file.</param>
/// <exception cref="ArgumentNullException"><paramref name="dir"/> is null.</exception>
/// <exception cref="ArgumentValueException"><paramref name="dir"/> fails folder verification.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="cutOff"/> is less than 1.</exception>
public void BuildProfilesFromCorpus(string dir, int cutOff, Encoding codePage, Encoding loadAs)
{
    Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
    Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/ true) ? new ArgumentValueException("dir") : null);
    Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);
    mLanguageProfiles.Clear();
    string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
    Array.Sort(files);
    // Files are sorted by full path, so files of one language that live in different
    // sub-directories are not necessarily adjacent. Profiles are therefore looked up by
    // language rather than by comparing with the previously seen file.
    System.Collections.Generic.Dictionary<Language, LanguageProfile> profilesByLang
        = new System.Collections.Generic.Dictionary<Language, LanguageProfile>();
    foreach (string f in files)
    {
        string fileName = Path.GetFileName(f);
        // Invariant lower-casing keeps the code independent of the current culture
        // (e.g. "IT" must not become a dotless-i string under Turkish settings).
        string fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
        Language lang = TextMiningUtils.GetLanguage(fileLangCode);
        // skip file names starting with unknown language code
        if (lang == Language.Unspecified)
        {
            mLogger.Warn("BuildProfilesFromCorpus", "Unknown: " + fileName);
            continue;
        }
        mLogger.Debug("BuildProfilesFromCorpus", lang + ": " + fileName);
        LanguageProfile langProfile;
        bool isNewLanguage = !profilesByLang.TryGetValue(lang, out langProfile);
        if (isNewLanguage)
        {
            langProfile = new LanguageProfile(mN, lang, codePage);
        }
        langProfile.AddTokensFromFile(f, loadAs);
        if (isNewLanguage)
        {
            // Registered only after its first file loaded successfully.
            profilesByLang.Add(lang, langProfile);
            mLanguageProfiles.Add(langProfile);
        }
    }
    // ranks n-grams
    // *** n-grams should not be added to languages after this
    foreach (LanguageProfile l in mLanguageProfiles)
    {
        l.Trim(cutOff);
        l.DoRanking(); // throws InvalidOperationException
    }
}
/// <summary>
/// Creates a detector and adds to it one language profile for every manifest resource of the
/// executing assembly whose name ends with ".ldp".
/// </summary>
/// <returns>The populated <c>LanguageDetector</c>.</returns>
public static LanguageDetector GetLanguageDetectorPrebuilt()
{
    LanguageDetector ld = new LanguageDetector();
    Assembly assembly = Assembly.GetExecutingAssembly();
    foreach (string resName in assembly.GetManifestResourceNames())
    {
        // Resource names are identifiers, not linguistic text, so compare ordinally.
        if (!resName.EndsWith(".ldp", StringComparison.Ordinal))
        {
            continue;
        }
        // load language detector profile
        BinarySerializer ser = new BinarySerializer(assembly.GetManifestResourceStream(resName));
        LanguageProfile langProfile;
        try
        {
            langProfile = new LanguageProfile(ser);
        }
        finally
        {
            // Close the serializer even when deserialization fails.
            ser.Close();
        }
        ld.AddLanguageProfile(langProfile);
        mLogger.Debug("GetLanguageDetectorPrebuilt", "Loaded resource {0}.", resName);
    }
    return ld;
}
/// <summary>
/// Compares <paramref name="p"/> against every registered language profile using
/// <c>CalcOutOfPlace</c> and returns the profile with the smallest distance.
/// </summary>
/// <param name="p">Ranked n-gram profile to match.</param>
/// <param name="cutOff">Value forwarded to <c>CalcOutOfPlace</c>; must be at least 1.</param>
/// <returns>
/// The profile with the strictly smallest distance (the earliest one wins ties), or null when
/// no distance is below <c>Double.MaxValue</c>.
/// </returns>
/// <exception cref="InvalidOperationException">No language profiles are registered.</exception>
/// <exception cref="ArgumentNullException"><paramref name="p"/> is null.</exception>
/// <exception cref="ArgumentValueException"><paramref name="p"/> is not ranked.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="cutOff"/> is less than 1.</exception>
public LanguageProfile DetectLanguage(NGramProfile p, int cutOff)
{
    Utils.ThrowException(mLanguageProfiles.Count == 0 ? new InvalidOperationException() : null);
    Utils.ThrowException(p == null ? new ArgumentNullException("p") : null);
    Utils.ThrowException((!p.IsRanked) ? new ArgumentValueException("p") : null);
    Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);

    LanguageProfile closest = null;
    double closestDist = Double.MaxValue;
    foreach (LanguageProfile candidate in mLanguageProfiles)
    {
        double candidateDist = p.CalcOutOfPlace(candidate, cutOff);
        // Only a strictly smaller distance replaces the current best match.
        if (!(candidateDist < closestDist))
        {
            continue;
        }
        closestDist = candidateDist;
        closest = candidate;
    }
    return closest;
}
/// <summary>
/// Appends a language profile to this detector's profile list.
/// </summary>
/// <param name="l">Profile to add; it must be ranked and its <c>N</c> must equal this detector's <c>mN</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="l"/> is null.</exception>
/// <exception cref="ArgumentValueException"><paramref name="l"/> is not ranked or has a different <c>N</c>.</exception>
public void AddLanguageProfile(LanguageProfile l)
{
    Utils.ThrowException(l == null ? new ArgumentNullException("l") : null);
    // Accept only ranked profiles built with the same n-gram size as this detector.
    Utils.ThrowException((l.IsRanked && l.N == mN) ? null : new ArgumentValueException("l"));
    mLanguageProfiles.Add(l);
}
/// <summary>
/// Creates a detector and adds to it one language profile for every manifest resource of the
/// executing assembly whose name ends with ".ldp".
/// </summary>
/// <returns>The populated <c>LanguageDetector</c>.</returns>
public static LanguageDetector GetLanguageDetectorPrebuilt()
{
    LanguageDetector ld = new LanguageDetector();
    Assembly assembly = Assembly.GetExecutingAssembly();
    foreach (string resName in assembly.GetManifestResourceNames())
    {
        // Resource names are identifiers, not linguistic text, so compare ordinally.
        if (!resName.EndsWith(".ldp", StringComparison.Ordinal))
        {
            continue;
        }
        // load language detector profile
        BinarySerializer ser = new BinarySerializer(assembly.GetManifestResourceStream(resName));
        LanguageProfile langProfile;
        try
        {
            langProfile = new LanguageProfile(ser);
        }
        finally
        {
            // Close the serializer even when deserialization fails.
            ser.Close();
        }
        ld.AddLanguageProfile(langProfile);
        mLogger.Debug("GetLanguageDetectorPrebuilt", "Loaded resource {0}.", resName);
    }
    return ld;
}
/// <summary>
/// Rebuilds the language profiles from the "*.txt" files found under <paramref name="dir"/>
/// (sub-directories included). The first two characters of each file name are used as the
/// language code; files whose code resolves to <c>Language.Unspecified</c> are logged as
/// unknown and skipped. All files of one language are accumulated into a single profile,
/// after which every profile is trimmed with <paramref name="cutOff"/> and ranked.
/// </summary>
/// <param name="dir">Existing corpus folder.</param>
/// <param name="cutOff">Value handed to <c>LanguageProfile.Trim</c>; must be at least 1.</param>
/// <exception cref="ArgumentNullException"><paramref name="dir"/> is null.</exception>
/// <exception cref="ArgumentValueException"><paramref name="dir"/> fails folder verification.</exception>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="cutOff"/> is less than 1.</exception>
public void BuildProfilesFromCorpus(string dir, int cutOff)
{
    Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
    Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/true) ? new ArgumentValueException("dir") : null);
    Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);
    mLanguageProfiles.Clear();
    string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
    Array.Sort(files);
    // Files are sorted by full path, so files of one language that live in different
    // sub-directories are not necessarily adjacent. Profiles are therefore looked up by
    // language rather than by comparing with the previously seen file.
    System.Collections.Generic.Dictionary<Language, LanguageProfile> profilesByLang
        = new System.Collections.Generic.Dictionary<Language, LanguageProfile>();
    foreach (string f in files)
    {
        string fileName = Path.GetFileName(f);
        // Invariant lower-casing keeps the code independent of the current culture
        // (e.g. "IT" must not become a dotless-i string under Turkish settings).
        string fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
        Language lang = TextMiningUtils.GetLanguage(fileLangCode);
        // skip file names starting with unknown language code
        if (lang == Language.Unspecified)
        {
            mLogger.Warn("BuildProfilesFromCorpus", "Unknown: " + fileName);
            continue;
        }
        mLogger.Debug("BuildProfilesFromCorpus", lang + ": " + fileName);
        LanguageProfile langProfile;
        bool isNewLanguage = !profilesByLang.TryGetValue(lang, out langProfile);
        if (isNewLanguage)
        {
            langProfile = new LanguageProfile(mN, lang);
        }
        langProfile.AddTokensFromFile(f);
        if (isNewLanguage)
        {
            // Registered only after its first file loaded successfully.
            profilesByLang.Add(lang, langProfile);
            mLanguageProfiles.Add(langProfile);
        }
    }
    // ranks n-grams
    // *** n-grams should not be added to languages after this
    foreach (LanguageProfile l in mLanguageProfiles)
    {
        l.Trim(cutOff);
        l.DoRanking(); // throws InvalidOperationException
    }
}
/// <summary>
/// Appends a language profile to this detector's profile list.
/// </summary>
/// <param name="l">Profile to add; it must be ranked and its <c>N</c> must equal this detector's <c>mN</c>.</param>
/// <exception cref="ArgumentNullException"><paramref name="l"/> is null.</exception>
/// <exception cref="ArgumentValueException"><paramref name="l"/> is not ranked or has a different <c>N</c>.</exception>
public void AddLanguageProfile(LanguageProfile l)
{
    Utils.ThrowException(l == null ? new ArgumentNullException("l") : null);
    // Accept only ranked profiles built with the same n-gram size as this detector.
    Utils.ThrowException((l.IsRanked && l.N == mN) ? null : new ArgumentValueException("l"));
    mLanguageProfiles.Add(l);
}