Ejemplo n.º 1
0
        /// <summary>
        /// Rebuilds the language profiles from the <c>*.txt</c> files found under
        /// <paramref name="dir"/> (searched recursively).
        /// </summary>
        /// <remarks>
        /// The first two characters of each file name are used as the language code.
        /// Files whose code resolves to <c>Language.Unspecified</c> are skipped.
        /// All files of one language are accumulated into a single profile, regardless
        /// of which subdirectory they are in.
        /// </remarks>
        /// <param name="dir">Root folder of the corpus; must exist.</param>
        /// <param name="cutOff">Value passed to <c>LanguageProfile.Trim</c>; must be at least 1.</param>
        public void ReadCorpus(string dir, int cutOff)
        {
            Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
            Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/ true) ? new ArgumentValueException("dir") : null);
            Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);
            // The profile list is dereferenced unconditionally below, so fail with a
            // meaningful exception instead of a NullReferenceException further down.
            Utils.ThrowException(languageProfiles == null ? new InvalidOperationException() : null);

            languageProfiles.Clear();

            string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
            Array.Sort(files);

            // Files are sorted by full path, so files of one language that live in
            // different subdirectories are not necessarily adjacent. Track the profile
            // per language so that each language gets exactly one profile.
            System.Collections.Generic.Dictionary<Language, LanguageProfile> profileByLang
                = new System.Collections.Generic.Dictionary<Language, LanguageProfile>();

            foreach (string f in files)
            {
                string fileName = Path.GetFileName(f);
                // Invariant lower-casing: the language code is an identifier, not UI text
                // (culture-sensitive ToLower() breaks e.g. "IT" under Turkish culture).
                string   fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
                Language lang         = TextMiningUtils.GetLanguage(fileLangCode);
                // Skips file names starting with unknown language code
                if (lang == Language.Unspecified)
                {
                    continue;
                }

                mLogger.Debug("ReadCorpus", lang + ":\t" + fileName);

                LanguageProfile langProfile;
                bool isNewLanguage = !profileByLang.TryGetValue(lang, out langProfile);
                if (isNewLanguage)
                {
                    langProfile = new LanguageProfile(n, lang);
                }
                langProfile.AddTokensFromFile(f);
                if (isNewLanguage)
                {
                    // Registered only after the first file was read successfully,
                    // matching the original statement order.
                    languageProfiles.Add(langProfile);
                    profileByLang.Add(lang, langProfile);
                }
            }

            // Does ranking of language profiles
            // No n-grams should be added to languages after this!
            foreach (LanguageProfile l in languageProfiles)
            {
                l.Trim(cutOff);
                l.DoRanking(); // throws InvalidOperationException
            }
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Rebuilds the language profiles from the <c>*.txt</c> files found under
        /// <paramref name="dir"/> (searched recursively).
        /// </summary>
        /// <remarks>
        /// The first two characters of each file name are used as the language code.
        /// Files with an unknown code are logged as a warning and skipped.
        /// All files of one language are accumulated into a single profile, regardless
        /// of which subdirectory they are in.
        /// </remarks>
        /// <param name="dir">Root folder of the corpus; must exist.</param>
        /// <param name="cutOff">Value passed to <c>LanguageProfile.Trim</c>; must be at least 1.</param>
        /// <param name="codePage">Encoding handed to each new <c>LanguageProfile</c>.</param>
        /// <param name="loadAs">Encoding handed to <c>AddTokensFromFile</c> for every corpus file.</param>
        public void BuildProfilesFromCorpus(string dir, int cutOff, Encoding codePage, Encoding loadAs)
        {
            Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
            Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/ true) ? new ArgumentValueException("dir") : null);
            Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);
            mLanguageProfiles.Clear();
            string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
            Array.Sort(files);

            // Files are sorted by full path, so files of one language that live in
            // different subdirectories are not necessarily adjacent. Track the profile
            // per language so that each language gets exactly one profile.
            System.Collections.Generic.Dictionary<Language, LanguageProfile> profileByLang
                = new System.Collections.Generic.Dictionary<Language, LanguageProfile>();

            foreach (string f in files)
            {
                string fileName = Path.GetFileName(f);
                // Invariant lower-casing: the language code is an identifier, not UI text
                // (culture-sensitive ToLower() breaks e.g. "IT" under Turkish culture).
                string   fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
                Language lang         = TextMiningUtils.GetLanguage(fileLangCode);
                // skip file names starting with unknown language code
                if (lang == Language.Unspecified)
                {
                    mLogger.Warn("BuildProfilesFromCorpus", "Unknown: " + fileName);
                    continue;
                }

                mLogger.Debug("BuildProfilesFromCorpus", lang + ": " + fileName);

                LanguageProfile langProfile;
                bool isNewLanguage = !profileByLang.TryGetValue(lang, out langProfile);
                if (isNewLanguage)
                {
                    langProfile = new LanguageProfile(mN, lang, codePage);
                }
                langProfile.AddTokensFromFile(f, loadAs);
                if (isNewLanguage)
                {
                    // Registered only after the first file was read successfully,
                    // matching the original statement order.
                    mLanguageProfiles.Add(langProfile);
                    profileByLang.Add(lang, langProfile);
                }
            }
            // ranks n-grams
            // *** n-grams should not be added to languages after this
            foreach (LanguageProfile l in mLanguageProfiles)
            {
                l.Trim(cutOff);
                l.DoRanking(); // throws InvalidOperationException
            }
        }
Ejemplo n.º 3
0
        /// <summary>
        /// Creates a language detector and populates it with every language profile
        /// embedded in the executing assembly as a manifest resource whose name ends
        /// with <c>.ldp</c>.
        /// </summary>
        /// <returns>The detector with all embedded profiles added.</returns>
        public static LanguageDetector GetLanguageDetectorPrebuilt()
        {
            LanguageDetector ld       = new LanguageDetector();
            Assembly         assembly = Assembly.GetExecutingAssembly();

            foreach (string resName in assembly.GetManifestResourceNames())
            {
                // Resource names are identifiers, so compare ordinally rather than
                // with the current culture.
                if (!resName.EndsWith(".ldp", System.StringComparison.Ordinal))
                {
                    continue;
                }

                // load language detector profile
                BinarySerializer ser = new BinarySerializer(assembly.GetManifestResourceStream(resName));
                LanguageProfile  langProfile;
                try
                {
                    langProfile = new LanguageProfile(ser);
                }
                finally
                {
                    // Close the serializer even when deserialization throws.
                    ser.Close();
                }
                ld.AddLanguageProfile(langProfile);
                mLogger.Debug("GetLanguageDetectorPrebuilt", "Loaded resource {0}.", resName);
            }
            return(ld);
        }
Ejemplo n.º 4
0
        /// <summary>
        /// Returns the registered language profile with the smallest
        /// <c>CalcOutOfPlace</c> distance to <paramref name="p"/>.
        /// </summary>
        /// <param name="p">Ranked n-gram profile to classify.</param>
        /// <param name="cutOff">Value forwarded to <c>CalcOutOfPlace</c>; must be at least 1.</param>
        /// <returns>
        /// The closest profile (the first one wins on ties), or <c>null</c> when no
        /// distance is below <c>Double.MaxValue</c>.
        /// </returns>
        public LanguageProfile DetectLanguage(NGramProfile p, int cutOff)
        {
            Utils.ThrowException(mLanguageProfiles.Count == 0 ? new InvalidOperationException() : null);
            Utils.ThrowException(p == null ? new ArgumentNullException("p") : null);
            Utils.ThrowException((!p.IsRanked) ? new ArgumentValueException("p") : null);
            Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);

            double          bestDistance = Double.MaxValue;
            LanguageProfile bestProfile  = null;

            foreach (LanguageProfile candidate in mLanguageProfiles)
            {
                double distance = p.CalcOutOfPlace(candidate, cutOff);
                // Only a strictly smaller distance replaces the current best.
                if (!(distance < bestDistance))
                {
                    continue;
                }
                bestDistance = distance;
                bestProfile  = candidate;
            }

            return bestProfile;
        }
Ejemplo n.º 5
0
 /// <summary>
 /// Registers a language profile with this detector.
 /// </summary>
 /// <param name="l">
 /// Profile to add; must be non-null, ranked, and have the same N as this detector.
 /// </param>
 public void AddLanguageProfile(LanguageProfile l)
 {
     Utils.ThrowException(l == null ? new ArgumentNullException("l") : null);

     // Accepted only when the profile is ranked and its N matches mN.
     bool isCompatible = l.IsRanked && l.N == mN;
     Utils.ThrowException(isCompatible ? null : new ArgumentValueException("l"));

     mLanguageProfiles.Add(l);
 }
Ejemplo n.º 6
0
 /// <summary>
 /// Creates a language detector and populates it with every language profile
 /// embedded in the executing assembly as a manifest resource whose name ends
 /// with <c>.ldp</c>.
 /// </summary>
 /// <returns>The detector with all embedded profiles added.</returns>
 public static LanguageDetector GetLanguageDetectorPrebuilt()
 {
     LanguageDetector ld = new LanguageDetector();
     Assembly assembly = Assembly.GetExecutingAssembly();
     foreach (string resName in assembly.GetManifestResourceNames())
     {
         // Resource names are identifiers, so compare ordinally rather than
         // with the current culture.
         if (!resName.EndsWith(".ldp", System.StringComparison.Ordinal))
         {
             continue;
         }

         // load language detector profile
         BinarySerializer ser = new BinarySerializer(assembly.GetManifestResourceStream(resName));
         LanguageProfile langProfile;
         try
         {
             langProfile = new LanguageProfile(ser);
         }
         finally
         {
             // Close the serializer even when deserialization throws.
             ser.Close();
         }
         ld.AddLanguageProfile(langProfile);
         mLogger.Debug("GetLanguageDetectorPrebuilt", "Loaded resource {0}.", resName);
     }
     return ld;
 }
Ejemplo n.º 7
0
 /// <summary>
 /// Rebuilds the language profiles from the <c>*.txt</c> files found under
 /// <paramref name="dir"/> (searched recursively).
 /// </summary>
 /// <remarks>
 /// The first two characters of each file name are used as the language code.
 /// Files with an unknown code are logged as a warning and skipped.
 /// All files of one language are accumulated into a single profile, regardless
 /// of which subdirectory they are in.
 /// </remarks>
 /// <param name="dir">Root folder of the corpus; must exist.</param>
 /// <param name="cutOff">Value passed to <c>LanguageProfile.Trim</c>; must be at least 1.</param>
 public void BuildProfilesFromCorpus(string dir, int cutOff)
 {
     Utils.ThrowException(dir == null ? new ArgumentNullException("dir") : null);
     Utils.ThrowException(!Utils.VerifyFolderName(dir, /*mustExist=*/true) ? new ArgumentValueException("dir") : null);
     Utils.ThrowException(cutOff < 1 ? new ArgumentOutOfRangeException("cutOff") : null);
     mLanguageProfiles.Clear();
     string[] files = Directory.GetFiles(dir, "*.txt", SearchOption.AllDirectories);
     Array.Sort(files);

     // Files are sorted by full path, so files of one language that live in
     // different subdirectories are not necessarily adjacent. Track the profile
     // per language so that each language gets exactly one profile.
     System.Collections.Generic.Dictionary<Language, LanguageProfile> profileByLang
         = new System.Collections.Generic.Dictionary<Language, LanguageProfile>();

     foreach (string f in files)
     {
         string fileName = Path.GetFileName(f);
         // Invariant lower-casing: the language code is an identifier, not UI text
         // (culture-sensitive ToLower() breaks e.g. "IT" under Turkish culture).
         string fileLangCode = fileName.Substring(0, 2).ToLowerInvariant();
         Language lang = TextMiningUtils.GetLanguage(fileLangCode);
         // skip file names starting with unknown language code
         if (lang == Language.Unspecified)
         {
             mLogger.Warn("BuildProfilesFromCorpus", "Unknown: " + fileName);
             continue;
         }

         mLogger.Debug("BuildProfilesFromCorpus", lang + ": " + fileName);

         LanguageProfile langProfile;
         bool isNewLanguage = !profileByLang.TryGetValue(lang, out langProfile);
         if (isNewLanguage)
         {
             langProfile = new LanguageProfile(mN, lang);
         }
         langProfile.AddTokensFromFile(f);
         if (isNewLanguage)
         {
             // Registered only after the first file was read successfully,
             // matching the original statement order.
             mLanguageProfiles.Add(langProfile);
             profileByLang.Add(lang, langProfile);
         }
     }
     // ranks n-grams
     // *** n-grams should not be added to languages after this
     foreach (LanguageProfile l in mLanguageProfiles)
     {
         l.Trim(cutOff);
         l.DoRanking(); // throws InvalidOperationException
     }
 }
Ejemplo n.º 8
0
 /// <summary>
 /// Adds a language profile to the detector's profile collection.
 /// </summary>
 /// <param name="l">
 /// Profile to add; rejected when null, not ranked, or built with an N other than mN.
 /// </param>
 public void AddLanguageProfile(LanguageProfile l)
 {
     Utils.ThrowException(l == null ? new ArgumentNullException("l") : null);

     // De Morgan form of the original check: valid means ranked AND same N.
     bool rankedWithSameN = l.IsRanked && l.N == mN;
     Utils.ThrowException(!rankedWithSameN ? new ArgumentValueException("l") : null);

     mLanguageProfiles.Add(l);
 }