Exemplo n.º 1
0
        /// <summary>
        /// Rebuilds the language profiles from the "*.txt" corpus files found under
        /// <paramref name="dir"/>, subdirectories included.
        /// </summary>
        /// <remarks>
        /// The language of a file is taken from the first two characters of its file name,
        /// resolved through <c>TextMiningUtils.GetLanguage</c>. Files whose code resolves to
        /// <c>Language.Unspecified</c> are skipped. All files of one language feed a single
        /// profile. After all files are read, every profile is trimmed with
        /// <paramref name="cutOff"/> and then ranked.
        /// </remarks>
        /// <param name="dir">Corpus root folder. Must not be null and must pass
        /// <c>Utils.VerifyFolderName</c> with mustExist = true.</param>
        /// <param name="cutOff">Value handed to <c>LanguageProfile.Trim</c> for every profile.
        /// Must be at least 1.</param>
        public void ReadCorpus(string dir, int cutOff)
        {
            Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
            Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/ true) ? new ArgumentValueException("dir") : null);
            Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);

            // NOTE(review): this guard only protects Clear(). The Add call and the final
            // foreach below still dereference languageProfiles, so a null field fails anyway.
            // Confirm whether the field can really be null here.
            if (languageProfiles != null)
            {
                languageProfiles.Clear();
            }

            string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
            Array.Sort(files);

            // Profiles are looked up by language instead of by "same language as the previous
            // file". The files are sorted by full path, so files of one language that live in
            // different subdirectories are not necessarily adjacent; comparing against the
            // previous file only would create several profiles for the same language.
            System.Collections.Generic.Dictionary<Language, LanguageProfile> profilesByLanguage =
                new System.Collections.Generic.Dictionary<Language, LanguageProfile>();

            foreach (string f in files)
            {
                string fileName = Path.GetFileName(f);
                // Invariant lower-casing: the culture-sensitive ToLower() maps 'I' to a dotless
                // 'ı' under Turkic cultures, which would corrupt codes such as "IT".
                string fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
                Language lang = TextMiningUtils.GetLanguage(fileLangCode);
                // Skips file names starting with unknown language code
                if (lang == Language.Unspecified)
                {
                    continue;
                }

                mLogger.Debug("ReadCorpus", lang + ":\t" + fileName);

                LanguageProfile langProfile;
                bool isNewLanguage = !profilesByLanguage.TryGetValue(lang, out langProfile);
                if (isNewLanguage)
                {
                    langProfile = new LanguageProfile(n, lang);
                }

                langProfile.AddTokensFromFile(f);

                // A new profile is registered only after its first file has been read, so a
                // failing read does not leave a half-initialized profile in the list.
                if (isNewLanguage)
                {
                    profilesByLanguage.Add(lang, langProfile);
                    languageProfiles.Add(langProfile);
                }
            }

            // Does ranking of language profiles
            // No n-grams should be added to languages after this!
            foreach (LanguageProfile l in languageProfiles)
            {
                l.Trim(cutOff);
                l.DoRanking(); // throws InvalidOperationException
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Rebuilds the language profiles from the "*.txt" corpus files found under
        /// <paramref name="dir"/>, subdirectories included.
        /// </summary>
        /// <remarks>
        /// The language of a file is taken from the first two characters of its file name,
        /// resolved through <c>TextMiningUtils.GetLanguage</c>. Files whose code resolves to
        /// <c>Language.Unspecified</c> are logged as a warning and skipped. All files of one
        /// language feed a single profile. After all files are read, every profile is trimmed
        /// with <paramref name="cutOff"/> and then ranked.
        /// </remarks>
        /// <param name="dir">Corpus root folder. Must not be null and must pass
        /// <c>Utils.VerifyFolderName</c> with mustExist = true.</param>
        /// <param name="cutOff">Value handed to <c>LanguageProfile.Trim</c> for every profile.
        /// Must be at least 1.</param>
        /// <param name="codePage">Encoding passed to the <c>LanguageProfile</c> constructor.
        /// Not validated here.</param>
        /// <param name="loadAs">Encoding passed to <c>LanguageProfile.AddTokensFromFile</c> for
        /// every corpus file. Not validated here.</param>
        public void BuildProfilesFromCorpus(string dir, int cutOff, Encoding codePage, Encoding loadAs)
        {
            Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
            Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/ true) ? new ArgumentValueException("dir") : null);
            Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);
            mLanguageProfiles.Clear();
            string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
            Array.Sort(files);

            // Profiles are looked up by language instead of by "same language as the previous
            // file". The files are sorted by full path, so files of one language that live in
            // different subdirectories are not necessarily adjacent; comparing against the
            // previous file only would create several profiles for the same language.
            System.Collections.Generic.Dictionary<Language, LanguageProfile> profilesByLanguage =
                new System.Collections.Generic.Dictionary<Language, LanguageProfile>();

            foreach (string f in files)
            {
                string fileName = Path.GetFileName(f);
                // Invariant lower-casing: the culture-sensitive ToLower() maps 'I' to a dotless
                // 'ı' under Turkic cultures, which would corrupt codes such as "IT".
                string fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
                Language lang = TextMiningUtils.GetLanguage(fileLangCode);
                // skip file names starting with unknown language code
                if (lang == Language.Unspecified)
                {
                    mLogger.Warn("BuildProfilesFromCorpus", "Unknown: " + fileName);
                    continue;
                }

                mLogger.Debug("BuildProfilesFromCorpus", lang + ": " + fileName);

                LanguageProfile langProfile;
                bool isNewLanguage = !profilesByLanguage.TryGetValue(lang, out langProfile);
                if (isNewLanguage)
                {
                    langProfile = new LanguageProfile(mN, lang, codePage);
                }

                langProfile.AddTokensFromFile(f, loadAs);

                // A new profile is registered only after its first file has been read, so a
                // failing read does not leave a half-initialized profile in the list.
                if (isNewLanguage)
                {
                    profilesByLanguage.Add(lang, langProfile);
                    mLanguageProfiles.Add(langProfile);
                }
            }
            // ranks n-grams
            // *** n-grams should not be added to languages after this
            foreach (LanguageProfile l in mLanguageProfiles)
            {
                l.Trim(cutOff);
                l.DoRanking(); // throws InvalidOperationException
            }
        }
Exemplo n.º 3
0
 /// <summary>
 /// Rebuilds the language profiles from the "*.txt" corpus files found under
 /// <paramref name="dir"/>, subdirectories included.
 /// </summary>
 /// <remarks>
 /// The language of a file is taken from the first two characters of its file name,
 /// resolved through <c>TextMiningUtils.GetLanguage</c>. Files whose code resolves to
 /// <c>Language.Unspecified</c> are logged as a warning and skipped. All files of one
 /// language feed a single profile. After all files are read, every profile is trimmed
 /// with <paramref name="cutOff"/> and then ranked.
 /// </remarks>
 /// <param name="dir">Corpus root folder. Must not be null and must pass
 /// <c>Utils.VerifyFolderName</c> with mustExist = true.</param>
 /// <param name="cutOff">Value handed to <c>LanguageProfile.Trim</c> for every profile.
 /// Must be at least 1.</param>
 public void BuildProfilesFromCorpus(string dir, int cutOff)
 {
     Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
     Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/true) ? new ArgumentValueException("dir") : null);
     Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);
     mLanguageProfiles.Clear();
     string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
     Array.Sort(files);

     // Profiles are looked up by language instead of by "same language as the previous
     // file". The files are sorted by full path, so files of one language that live in
     // different subdirectories are not necessarily adjacent; comparing against the
     // previous file only would create several profiles for the same language.
     System.Collections.Generic.Dictionary<Language, LanguageProfile> profilesByLanguage =
         new System.Collections.Generic.Dictionary<Language, LanguageProfile>();

     foreach (string f in files)
     {
         string fileName = Path.GetFileName(f);
         // Invariant lower-casing: the culture-sensitive ToLower() maps 'I' to a dotless
         // 'ı' under Turkic cultures, which would corrupt codes such as "IT".
         string fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
         Language lang = TextMiningUtils.GetLanguage(fileLangCode);
         // skip file names starting with unknown language code
         if (lang == Language.Unspecified)
         {
             mLogger.Warn("BuildProfilesFromCorpus", "Unknown: " + fileName);
             continue;
         }

         mLogger.Debug("BuildProfilesFromCorpus", lang + ": " + fileName);

         LanguageProfile langProfile;
         bool isNewLanguage = !profilesByLanguage.TryGetValue(lang, out langProfile);
         if (isNewLanguage)
         {
             langProfile = new LanguageProfile(mN, lang);
         }

         langProfile.AddTokensFromFile(f);

         // A new profile is registered only after its first file has been read, so a
         // failing read does not leave a half-initialized profile in the list.
         if (isNewLanguage)
         {
             profilesByLanguage.Add(lang, langProfile);
             mLanguageProfiles.Add(langProfile);
         }
     }
     // ranks n-grams
     // *** n-grams should not be added to languages after this
     foreach (LanguageProfile l in mLanguageProfiles)
     {
         l.Trim(cutOff);
         l.DoRanking(); // throws InvalidOperationException
     }
 }