Exemple #1
0
        /// <summary>
        /// Builds a language profile from a plain text file and stores it as JSON.
        /// <para />
        /// usage: --genprofile-text -l [language code] [text file path]
        /// </summary>
        /// <remarks>
        /// The JSON is written to a file whose path is the language code itself.
        /// Argument problems are reported on standard output and end the command early.
        /// </remarks>
        private void generateProfileFromText()
        {
            // Exactly one positional argument is accepted: the training text path.
            if (arglist.Count != 1)
            {
                System.Console.WriteLine("Need to specify text file path");
                return;
            }

            string textPath = arglist[0];
            bool textExists = System.IO.File.Exists(textPath);
            if (!textExists)
            {
                System.Console.WriteLine("Need to specify existing text file path");
                return;
            }

            string languageCode = get("lang");
            if (languageCode == null)
            {
                System.Console.WriteLine("Need to specify langage code(-l)");
                return;
            }

            LangProfile generated = GenProfile.loadFromText(languageCode, textPath);
            generated.omitLessFreq();

            // The output file is created first; serialization happens while it is open.
            using (System.IO.StreamWriter writer = System.IO.File.CreateText(languageCode))
            {
                string json = JsonConvert.SerializeObject(generated);
                writer.Write(json);
            }
        }
        /// <summary>
        /// Load profiles from specified directory.
        /// This method must be called once before language detection.
        /// </summary>
        /// <remarks>
        /// Files whose name starts with "." are skipped and are not counted as languages.
        /// Each remaining file is read in full, deserialized as JSON into a
        /// <see cref="LangProfile"/> and registered through <c>addProfile</c>.
        /// I/O and JSON errors raised while reading a profile are not translated here.
        /// </remarks>
        /// <param name="profileDirectory">profile directory path</param>
        /// <exception cref="LangDetectException">
        /// The directory does not exist (error code = ErrorCode.NeedLoadProfileError).
        /// </exception>
        public static void loadProfile(string profileDirectory)
        {
            // Directory.GetFiles never returns null; for a missing directory it throws.
            // Checking existence up front is what actually produces the intended error.
            if (!System.IO.Directory.Exists(profileDirectory))
            {
                throw new LangDetectException(ErrorCode.NeedLoadProfileError, "Not found profile: " + profileDirectory);
            }

            // Filter first so that langsize reflects the profiles really loaded,
            // not the raw directory entry count (which includes skipped dot-files).
            var profileFiles = new System.Collections.Generic.List<string>();
            foreach (string candidate in System.IO.Directory.GetFiles(profileDirectory))
            {
                FileInfo fi = new FileInfo(candidate);
                if (fi.Name.StartsWith(".", System.StringComparison.Ordinal))
                {
                    continue;
                }
                profileFiles.Add(candidate);
            }

            int langsize = profileFiles.Count;
            int index    = 0;

            foreach (string file in profileFiles)
            {
                using (StreamReader sr = new StreamReader(file))
                {
                    LangProfile profile = JsonConvert.DeserializeObject <LangProfile>(sr.ReadToEnd());

                    addProfile(profile, index, langsize);
                    ++index;
                }
            }
        }
Exemple #3
0
        /// <summary>
        /// Load profiles from specified directory.
        /// This method must be called once before language detection.
        /// </summary>
        /// <remarks>
        /// Entries whose file name starts with "." and entries that are not existing files
        /// are skipped and are not counted as languages.
        /// </remarks>
        /// <param name="profileDirectory">profile directory path</param>
        /// <exception cref="LangDetectException">
        /// The directory does not exist (error code = ErrorCode.NeedLoadProfileError),
        /// a profile can't be opened (error code = ErrorCode.FileLoadError),
        /// or a profile's format is wrong (error code = ErrorCode.FormatError).
        /// </exception>
        public static void LoadProfile(string profileDirectory)
        {
            // Directory.GetFiles never returns null; for a missing directory it throws.
            // Checking existence up front is what actually produces the intended error.
            if (!Directory.Exists(profileDirectory))
            {
                throw new LangDetectException(ErrorCode.NeedLoadProfileError, "Not found profile: " + profileDirectory);
            }

            // Filter first so that langsize reflects the profiles really loaded,
            // not the raw directory entry count (which includes skipped entries).
            var profileFiles = new System.Collections.Generic.List<string>();
            foreach (string candidate in Directory.GetFiles(profileDirectory))
            {
                if (Path.GetFileName(candidate).StartsWith(".", StringComparison.Ordinal) || !File.Exists(candidate))
                {
                    continue;
                }
                profileFiles.Add(candidate);
            }

            int langsize = profileFiles.Count, index = 0;

            foreach (string file in profileFiles)
            {
                try
                {
                    LangProfile profile = JsonSerializer.Deserialize <LangProfile>(File.ReadAllText(file));
                    if (profile == null)
                    {
                        // A JSON document consisting of "null" deserializes to a null reference.
                        throw new LangDetectException(ErrorCode.FormatError, "profile format error in '" + file + "'");
                    }
                    AddProfile(profile, index, langsize);
                    ++index;
                }
                catch (System.Text.Json.JsonException)
                {
                    // Malformed JSON surfaces as JsonException, not NotSupportedException.
                    throw new LangDetectException(ErrorCode.FormatError, "profile format error in '" + file + "'");
                }
                catch (NotSupportedException)
                {
                    throw new LangDetectException(ErrorCode.FormatError, "profile format error in '" + file + "'");
                }
                catch (IOException)
                {
                    throw new LangDetectException(ErrorCode.FileLoadError, "can't open '" + file + "'");
                }
            }
        }
        public void testAddIllegally1()
        {
            // Per the original note, the parameterless constructor is only meant for the
            // JSON deserializer (JSONIC); using it directly is "illegal" but tolerated.
            LangProfile unnamed = new LangProfile();

            // The add is expected to be dropped silently.
            unnamed.add("a");

            bool recorded = unnamed.freq.ContainsKey("a");
            Assert.IsFalse(recorded);
        }
Exemple #5
0
        /// <summary>
        /// Adding an n-gram to a profile created without a language name must be ignored.
        /// </summary>
        public void testAddIllegally1()
        {
            LangProfile profile = new LangProfile();  // Illegal ( available for only JSONIC ) but ignore

            profile.Add("a");                         // ignore

            // IsNull states the intent directly and avoids the swapped
            // (actual, expected) argument order of the former AreEqual call.
            Assert.IsNull(profile.Freq["a"]);         // ignored
        }
Exemple #6
0
        /// <summary>
        /// Reads a text file line by line and feeds every line into a new language profile.
        /// The file is decoded with <see cref="StreamReader"/>'s default encoding handling.
        /// After reading, "lang:lineCount" is printed to standard output.
        /// </summary>
        /// <param name="lang">target language name</param>
        /// <param name="file">target file path</param>
        /// <returns>Language profile instance</returns>
        /// <exception cref="LangDetectException">
        /// An <see cref="IOException"/> occurred while opening or reading the file
        /// (error code = ErrorCode.CantOpenTrainData).
        /// </exception>
        public static LangProfile LoadFromText(string lang, string file)
        {
            LangProfile result = new LangProfile(lang);

            try
            {
                using (StreamReader reader = new StreamReader(File.OpenRead(file)))
                {
                    int lineCount = 0;

                    // ReadLine yields null once the end of the stream is reached.
                    for (string current = reader.ReadLine(); current != null; current = reader.ReadLine())
                    {
                        result.Update(current);
                        lineCount++;
                    }

                    Console.WriteLine(lang + ":" + lineCount);
                }
            }
            catch (IOException)
            {
                throw new LangDetectException(ErrorCode.CantOpenTrainData, "Can't open training database file '" + file + "'");
            }

            return result;
        }
Exemple #7
0
        /// <summary>
        /// Exercises a tag extractor bound to the "abstract" tag with a threshold argument of 10:
        /// only the first of the three texts below is expected to be counted.
        /// </summary>
        public void testNormalScenario()
        {
            TagExtractor extractor = new TagExtractor("abstract", 10);

            // Assert.AreEqual takes (expected, actual); the constants go first so that
            // failure messages report the right value as "expected".
            Assert.AreEqual(0, extractor.count());

            LangProfile profile = new LangProfile("en");

            // normal
            extractor.setTag("abstract");
            extractor.add("This is a sample text.");
            profile.update(extractor.closeTag());
            Assert.AreEqual(1, extractor.count());
            Assert.AreEqual(17, profile.n_words[0]);  // Thisisasampletext
            Assert.AreEqual(22, profile.n_words[1]);  // _T, Th, hi, ...
            Assert.AreEqual(17, profile.n_words[2]);  // _Th, Thi, his, ...

            // too short
            extractor.setTag("abstract");
            extractor.add("sample");
            profile.update(extractor.closeTag());
            Assert.AreEqual(1, extractor.count());

            // other tags
            extractor.setTag("div");
            extractor.add("This is a sample text which is enough long.");
            profile.update(extractor.closeTag());
            Assert.AreEqual(1, extractor.count());
        }
Exemple #8
0
        /// <summary>
        /// Resets the detector factory and registers three small profiles (en, fr, ja),
        /// each built from the space-separated words of its training string.
        /// </summary>
        public void setUp()
        {
            DetectorFactory.clear();

            // Parallel tables: language code and its training text, in registration order.
            string[] languages = { "en", "fr", "ja" };
            string[] corpora   = { TRAINING_EN, TRAINING_FR, TRAINING_JA };

            for (int slot = 0; slot < languages.Length; ++slot)
            {
                LangProfile trained = new LangProfile(languages[slot]);

                string[] tokens = corpora[slot].Split(' ');
                for (int t = 0; t < tokens.Length; ++t)
                {
                    trained.add(tokens[t]);
                }

                DetectorFactory.addProfile(trained, slot, languages.Length);
            }
        }
Exemple #9
0
        /// <summary>
        /// Generate Language Profile from Text File
        /// <para />
        /// usage: --genprofile-text -l [language code] [text file path]
        /// </summary>
        /// <remarks>
        /// The profile is serialized as JSON into a file whose path is the language code.
        /// Argument problems and generation failures are reported on standard error;
        /// no exception of the handled kinds escapes this method.
        /// </remarks>
        private void generateProfileFromText()
        {
            if (arglist.Count != 1)
            {
                Console.Error.WriteLine("Need to specify text file path");
                return;
            }
            string file = arglist[0];

            if (!File.Exists(file))
            {
                Console.Error.WriteLine("Need to specify existing text file path");
                return;
            }

            string lang = Get("lang");

            if (lang == null)
            {
                Console.Error.WriteLine("Need to specify langage code(-l)");
                return;
            }

            try
            {
                LangProfile profile = GenProfile.LoadFromText(lang, file);
                profile.OmitLessFreq();

                string profile_path = lang;
                File.WriteAllText(profile_path, JsonSerializer.Serialize(profile));
            }
            // Failures used to go to Debug.WriteLine only, which is compiled out of
            // release builds; report them on stderr like the argument errors.
            catch (NotSupportedException e)
            {
                Console.Error.WriteLine(e);
            }
            catch (IOException e)
            {
                Console.Error.WriteLine(e);
            }
            catch (LangDetectException e)
            {
                Console.Error.WriteLine(e);
            }
        }
        /// <summary>
        /// Adding the same n-gram twice must raise its frequency from 1 to 2.
        /// </summary>
        public void testAdd()
        {
            LangProfile profile = new LangProfile("en");

            // Assert.AreEqual takes (expected, actual); constants go first.
            profile.add("a");
            Assert.AreEqual(1, (int)profile.freq["a"]);
            profile.add("a");
            Assert.AreEqual(2, (int)profile.freq["a"]);

            // Only checks that the call completes; nothing is asserted afterwards.
            profile.omitLessFreq();
        }
        /// <summary>
        /// N-grams whose length is outside 1..3 must be ignored by <c>add</c>.
        /// </summary>
        public void testAddIllegally2()
        {
            LangProfile profile = new LangProfile("en");

            profile.add("a");
            profile.add("");                                  // Illegal (string's length of parameter must be between 1 and 3) but ignore
            profile.add("abcd");                              // as well

            // (expected, actual) order; the stray null "message" argument is dropped.
            Assert.AreEqual(1, (int)profile.freq["a"]);
            Assert.IsFalse(profile.freq.ContainsKey(""));     // ignored
            Assert.IsFalse(profile.freq.ContainsKey("abcd")); // ignored
        }
Exemple #12
0
        /// <summary>
        /// N-grams whose length is outside 1..3 must be ignored by <c>Add</c>.
        /// </summary>
        public void testAddIllegally2()
        {
            LangProfile profile = new LangProfile("en");

            profile.Add("a");
            profile.Add("");                            // Illegal (string's length of parameter must be between 1 and 3) but ignore
            profile.Add("abcd");                        // as well

            // (expected, actual) order for AreEqual; IsNull for the "no entry" checks.
            Assert.AreEqual(1, (int)profile.Freq["a"]);
            Assert.IsNull(profile.Freq[""]);            // ignored
            Assert.IsNull(profile.Freq["abcd"]);        // ignored
        }
Exemple #13
0
        /// <summary>
        /// Parses a Wikipedia abstract XML dump (gzip-compressed when the file name ends
        /// with ".gz") and builds a language profile from the text the tag extractor
        /// returns for the "abstract" tag.
        /// </summary>
        /// <param name="lang">target language name</param>
        /// <param name="file">target database file path</param>
        /// <returns>Language profile instance</returns>
        public static LangProfile loadFromWikipediaAbstract(string lang, string file)
        {
            LangProfile profile = new LangProfile(lang);
            FileInfo    source  = new FileInfo(file);
            Stream      input   = null;

            try
            {
                input = source.OpenRead();

                bool compressed = source.Name.EndsWith(".gz");
                if (compressed)
                {
                    input = new GZipStream(input, CompressionMode.Decompress);
                }

                using (StreamReader text = new StreamReader(input, System.Text.Encoding.UTF8))
                {
                    TagExtractor extractor = new TagExtractor("abstract", 100);
                    using (XmlReader xml = XmlReader.Create(text))
                    {
                        while (xml.Read())
                        {
                            XmlNodeType node = xml.NodeType;

                            if (node == XmlNodeType.Element)
                            {
                                extractor.setTag(xml.Name);
                            }
                            else if (node == XmlNodeType.Text)
                            {
                                extractor.add(xml.Value);
                            }
                            else if (node == XmlNodeType.EndElement)
                            {
                                // closeTag hands back null when there is nothing to learn from.
                                string closed = extractor.closeTag();
                                if (closed != null)
                                {
                                    profile.update(closed);
                                }
                            }
                        }
                    }
                }
            }
            finally
            {
                // Covers the case where the stream was opened but the readers were never built.
                if (input != null)
                {
                    input.Dispose();
                }
            }

            return profile;
        }
Exemple #14
0
        /// <summary>
        /// Parses an XML file (gzip-compressed when its extension is ".gz", compared
        /// case-insensitively via upper-casing) and lets the tag extractor for the
        /// "abstract" tag feed the language profile. Prints "lang: count" when done.
        /// </summary>
        /// <param name="lang">language name given to the new profile</param>
        /// <param name="file">path of the XML (or .gz) file to read</param>
        /// <returns>the profile populated while reading the file</returns>
        public static LangProfile load(string lang, string file)
        {
            LangProfile  result    = new LangProfile(lang);
            TagExtractor extractor = new TagExtractor("abstract", 100);
            Stream       input     = null;

            try
            {
                input = File.OpenRead(file);

                string suffix     = Path.GetExtension(file) ?? "";
                bool   compressed = suffix.ToUpper() == ".GZ";
                if (compressed)
                {
                    input = new GZipStream(input, CompressionMode.Decompress);
                }

                using (XmlReader xml = XmlReader.Create(input))
                {
                    while (xml.Read())
                    {
                        XmlNodeType node = xml.NodeType;

                        if (node == XmlNodeType.Element)
                        {
                            extractor.SetTag(xml.Name);
                        }
                        else if (node == XmlNodeType.Text)
                        {
                            extractor.Add(xml.Value);
                        }
                        else if (node == XmlNodeType.EndElement)
                        {
                            extractor.CloseTag(result);
                        }
                    }
                }
            }
            finally
            {
                // The stream is closed here explicitly, whether or not parsing succeeded.
                if (input != null)
                {
                    input.Close();
                }
            }

            Console.WriteLine(lang + ": " + extractor.Count);

            return result;
        }
Exemple #15
0
        /// <summary>
        /// Generate Language Profile from Wikipedia Abstract Database File
        /// <para />
        /// usage: --genprofile -d [abstracts directory] [language names]
        /// </summary>
        /// <remarks>
        /// For every language in the argument list the matching abstract file is looked up
        /// in the directory option, turned into a profile and written as JSON to
        /// "[directory]/profiles/[language]". A failure for one language is reported on
        /// standard error and does not stop the remaining languages.
        /// </remarks>
        public void GenerateProfile()
        {
            string directory = Get("directory");

            foreach (string lang in arglist)
            {
                string file = SearchFile(directory, lang + "wiki-.*-abstract\\.xml.*");
                if (file == null)
                {
                    Console.Error.WriteLine("Not Found abstract xml : lang = " + lang);
                    continue;
                }

                try
                {
                    LangProfile profile = GenProfile.LoadFromWikipediaAbstract(lang, file);
                    profile.OmitLessFreq();

                    string profile_path = directory + "/profiles/" + lang;
                    File.WriteAllText(profile_path, JsonSerializer.Serialize(profile));
                }
                // Failures used to go to Debug.WriteLine only, which is compiled out of
                // release builds; report them on stderr like the "not found" case.
                catch (NotSupportedException e)
                {
                    Console.Error.WriteLine(e);
                }
                catch (IOException e)
                {
                    Console.Error.WriteLine(e);
                }
                catch (LangDetectException e)
                {
                    Console.Error.WriteLine(e);
                }
            }
        }
        /// <summary>
        /// Load profiles from a list of JSON documents.
        /// This method must be called once before language detection.
        /// </summary>
        /// <param name="json_profiles">
        /// JSON-serialized profiles, one string per language; each string is
        /// deserialized into a <see cref="LangProfile"/> and registered in list order.
        /// </param>
        /// <exception cref="LangDetectException">
        /// Fewer than two profiles were supplied (error code = ErrorCode.NeedLoadProfileError).
        /// </exception>
        public static void loadProfile(IList <string> json_profiles)
        {
            int langsize = json_profiles.Count;
            if (langsize < 2)
            {
                throw new LangDetectException(ErrorCode.NeedLoadProfileError, "Need more than 2 profiles");
            }

            // The position in the list doubles as the language index.
            int index = 0;
            foreach (string json in json_profiles)
            {
                LangProfile parsed = JsonConvert.DeserializeObject <LangProfile>(json);
                addProfile(parsed, index++, langsize);
            }
        }
Exemple #17
0
        /// <summary>
        /// Reads a UTF-8 text file line by line and feeds every line into a new language profile.
        /// After reading, "lang:lineCount" is printed to standard output.
        /// </summary>
        /// <param name="lang">target language name</param>
        /// <param name="file">target file path</param>
        /// <returns>Language profile instance</returns>
        public static LangProfile loadFromText(string lang, string file)
        {
            LangProfile result = new LangProfile(lang);

            using (StreamReader reader = new StreamReader(file, System.Text.Encoding.UTF8))
            {
                int    lineCount = 0;
                string current;

                // ReadLine yields null once the end of the stream is reached.
                while ((current = reader.ReadLine()) != null)
                {
                    result.update(current);
                    lineCount++;
                }

                System.Console.WriteLine(lang + ":" + lineCount);
            }

            return result;
        }
Exemple #18
0
        /// <summary>
        /// After <c>OmitLessFreq</c>, "a" and the single-occurrence U+3050 must be gone
        /// while U+3042 keeps its frequency of 5.
        /// </summary>
        public void testOmitLessFreq()
        {
            LangProfile profile = new LangProfile("en");

            string[] grams = "a b c \u3042 \u3044 \u3046 \u3048 \u304a \u304b \u304c \u304d \u304e \u304f".Split(' ');
            for (int i = 0; i < 5; ++i)
            {
                foreach (string g in grams)
                {
                    profile.Add(g);
                }
            }
            profile.Add("\u3050");

            // (expected, actual) order for AreEqual; IsNull for the "no entry" checks.
            Assert.AreEqual(5, (int)profile.Freq["a"]);
            Assert.AreEqual(5, (int)profile.Freq["\u3042"]);
            Assert.AreEqual(1, (int)profile.Freq["\u3050"]);
            profile.OmitLessFreq();
            Assert.IsNull(profile.Freq["a"]);      // omitted
            Assert.AreEqual(5, (int)profile.Freq["\u3042"]);
            Assert.IsNull(profile.Freq["\u3050"]); // omitted
        }
        /// <summary>
        /// After <c>omitLessFreq</c>, "a" and the single-occurrence U+3050 must be gone
        /// while U+3042 keeps its frequency of 5.
        /// </summary>
        public void testOmitLessFreq()
        {
            LangProfile profile = new LangProfile("en");

            string[] grams = "a b c \u3042 \u3044 \u3046 \u3048 \u304a \u304b \u304c \u304d \u304e \u304f".Split(' ');
            for (int i = 0; i < 5; ++i)
            {
                foreach (string g in grams)
                {
                    profile.add(g);
                }
            }
            profile.add("\u3050");

            // Assert.AreEqual takes (expected, actual); constants go first.
            Assert.AreEqual(5, (int)profile.freq["a"]);
            Assert.AreEqual(5, (int)profile.freq["\u3042"]);
            Assert.AreEqual(1, (int)profile.freq["\u3050"]);
            profile.omitLessFreq();
            Assert.IsFalse(profile.freq.ContainsKey("a"));      // omitted
            Assert.AreEqual(5, (int)profile.freq["\u3042"]);
            Assert.IsFalse(profile.freq.ContainsKey("\u3050")); // omitted
        }
Exemple #20
0
        /// <summary>
        /// Registers a language profile with the shared factory instance: the language
        /// name is appended to the language list and, for every n-gram of the profile,
        /// its relative frequency is stored at <paramref name="index"/> in that n-gram's
        /// probability array.
        /// </summary>
        /// <param name="profile">profile to register</param>
        /// <param name="index">slot of this language inside each probability array</param>
        /// <param name="langsize">length used when a new probability array is allocated</param>
        /// <exception cref="LangDetectException">
        /// A profile with the same name is already registered (error code = ErrorCode.DuplicateLangError).
        /// </exception>
        static public /*internal*/ void AddProfile(LangProfile profile, int index, int langsize)
        {
            string language = profile.Name;

            bool alreadyLoaded = instance_.langlist.Contains(language);
            if (alreadyLoaded)
            {
                throw new LangDetectException(ErrorCode.DuplicateLangError, "duplicate the same language profile");
            }
            instance_.langlist.Add(language);

            foreach (string ngram in profile.Freq.Keys)
            {
                // An array is allocated for every unseen n-gram, whatever its length.
                if (!instance_.wordLangProbMap.ContainsKey(ngram))
                {
                    instance_.wordLangProbMap[ngram] = new double[langsize];
                }

                // Only n-grams of 1 to 3 characters receive a probability.
                int size = ngram.Length;
                if (size < 1 || size > 3)
                {
                    continue;
                }

                double share = ((double)profile.Freq[ngram]) / profile.N_Words[size - 1];
                instance_.wordLangProbMap[ngram][index] = share;
            }
        }
Exemple #21
0
        /// <summary>
        /// Registers a language profile with the shared factory instance: the language
        /// name is appended to the language list and, for every n-gram of the profile,
        /// its relative frequency is stored at <paramref name="index"/> in that n-gram's
        /// probability vector.
        /// </summary>
        /// <param name="profile">profile to register</param>
        /// <param name="index">slot of this language inside each probability vector</param>
        /// <exception cref="NLangDetectException">
        /// A profile with the same name is already registered (ErrorCode.DuplicateLangError).
        /// </exception>
        internal static void AddProfile(LangProfile profile, int index)
        {
            var lang = profile.name;

            if (_instance.Langlist.Contains(lang))
            {
                throw new NLangDetectException("duplicate the same language profile", ErrorCode.DuplicateLangError);
            }

            _instance.Langlist.Add(lang);

            foreach (string word in profile.freq.Keys)
            {
                // n_words is indexed by n-gram length minus one. An empty or over-long key
                // (e.g. from a hand-edited profile) would index outside it and abort the load.
                // NOTE(review): assumes n_words holds counts for 1-, 2- and 3-grams — confirm against LangProfile.
                int length = word.Length;
                if (length < 1 || length > 3)
                {
                    continue;
                }

                if (!_instance.WordLangProbMap.ContainsKey(word))
                {
                    _instance.WordLangProbMap.Add(word, new ProbVector());
                }

                double prob = (double)profile.freq[word] / profile.n_words[length - 1];

                _instance.WordLangProbMap[word][index] = prob;
            }
        }
Exemple #22
0
        /// <summary>
        /// Builds one language profile per requested language from Wikipedia abstract dumps.
        /// <para />
        /// usage: --genprofile -d [abstracts directory] [language names]
        /// </summary>
        /// <remarks>
        /// Languages without a matching abstract file are reported on standard output and skipped.
        /// Each profile is written as JSON to "[directory]/profiles/[language]".
        /// </remarks>
        public void generateProfile()
        {
            string abstractsDir = get("directory");

            foreach (string lang in arglist)
            {
                string pattern = lang + "wiki-.*-abstract\\.xml.*";
                string found   = searchFile(abstractsDir, pattern);

                if (found != null)
                {
                    LangProfile generated = GenProfile.loadFromWikipediaAbstract(lang, found);
                    generated.omitLessFreq();

                    string target = get("directory") + "/profiles/" + lang;

                    // The output file is created first; serialization happens while it is open.
                    using (var writer = System.IO.File.CreateText(target))
                    {
                        string json = JsonConvert.SerializeObject(generated);
                        writer.Write(json);
                    }
                }
                else
                {
                    System.Console.WriteLine("Not Found abstract xml : lang = " + lang);
                }
            }
        }
Exemple #23
0
        /// <summary>
        /// Load profiles from a list of JSON documents.
        /// This method must be called once before language detection.
        /// </summary>
        /// <param name="json_profiles">
        /// JSON-serialized profiles, one string per language; each string is
        /// deserialized into a <see cref="LangProfile"/> and registered in list order.
        /// </param>
        /// <exception cref="LangDetectException">
        /// Fewer than two profiles were supplied (error code = ErrorCode.NeedLoadProfileError),
        /// or a profile's format is wrong (error code = ErrorCode.FormatError).
        /// </exception>
        public static void LoadProfile(IList <string> json_profiles)
        {
            int index    = 0;
            int langsize = json_profiles.Count;

            if (langsize < 2)
            {
                throw new LangDetectException(ErrorCode.NeedLoadProfileError, "Need more than 2 profiles");
            }

            foreach (string json in json_profiles)
            {
                try
                {
                    LangProfile profile = JsonSerializer.Deserialize <LangProfile>(json);
                    if (profile == null)
                    {
                        // A JSON document consisting of "null" deserializes to a null reference.
                        throw new LangDetectException(ErrorCode.FormatError, "profile format error");
                    }
                    AddProfile(profile, index, langsize);
                    ++index;
                }
                catch (System.Text.Json.JsonException)
                {
                    // Malformed JSON surfaces as JsonException, not NotSupportedException.
                    throw new LangDetectException(ErrorCode.FormatError, "profile format error");
                }
                catch (NotSupportedException)
                {
                    throw new LangDetectException(ErrorCode.FormatError, "profile format error");
                }
            }
        }
Exemple #24
0
        /// <summary>
        /// Parses a Wikipedia abstract XML dump (gzip-compressed when the path ends with
        /// ".gz") and builds a language profile from the text the tag extractor returns
        /// for the "abstract" tag. Prints "lang:count" once parsing has finished.
        /// </summary>
        /// <param name="lang">target language name</param>
        /// <param name="file">target database file path</param>
        /// <returns>Language profile instance</returns>
        /// <exception cref="LangDetectException">
        /// The file is not valid XML (error code = ErrorCode.TrainDataFormatError), or an
        /// <see cref="IOException"/> occurred (error code = ErrorCode.CantOpenTrainData).
        /// </exception>
        public static LangProfile LoadFromWikipediaAbstract(string lang, string file)
        {
            LangProfile result = new LangProfile(lang);

            StreamReader textReader = null;

            try
            {
                Stream raw = File.OpenRead(file);
                bool compressed = file.EndsWith(".gz");
                if (compressed)
                {
                    raw = new GZipStream(raw, CompressionMode.Decompress);
                }
                textReader = new StreamReader(raw);

                TagExtractor extractor = new TagExtractor("abstract", 100);

                XmlReader xml = XmlReader.Create(textReader);
                try
                {
                    while (xml.Read())
                    {
                        XmlNodeType node = xml.NodeType;

                        if (node == XmlNodeType.Element)
                        {
                            extractor.SetTag(xml.Name);
                        }
                        else if (node == XmlNodeType.Text)
                        {
                            extractor.Add(xml.Value);
                        }
                        else if (node == XmlNodeType.EndElement)
                        {
                            // CloseTag hands back null when there is nothing to learn from.
                            string closed = extractor.CloseTag();
                            if (closed != null)
                            {
                                result.Update(closed);
                            }
                        }
                    }
                }
                catch (XmlException)
                {
                    throw new LangDetectException(ErrorCode.TrainDataFormatError, "Training database file '" + file + "' is an invalid XML.");
                }
                finally
                {
                    // Closing is best effort: an XmlException raised here is deliberately dropped.
                    try
                    {
                        xml.Close();
                    }
                    catch (XmlException) { }
                }
                Console.WriteLine(lang + ":" + extractor.Count());
            }
            catch (IOException)
            {
                throw new LangDetectException(ErrorCode.CantOpenTrainData, "Can't open training database file '" + file + "'");
            }
            finally
            {
                // Closing is best effort: an IOException raised here is deliberately dropped.
                try
                {
                    if (textReader != null)
                    {
                        textReader.Close();
                    }
                }
                catch (IOException) { }
            }

            return result;
        }
        public void testOmitLessFreqIllegally()
        {
            // A profile created without a language name: the call only has to
            // complete without throwing, so nothing is asserted afterwards.
            LangProfile unnamed = new LangProfile();

            unnamed.omitLessFreq();
        }
        /// <summary>
        /// A profile created with the parameterless constructor has no name.
        /// </summary>
        public void testLangProfile()
        {
            LangProfile profile = new LangProfile();

            // IsNull states the intent directly and avoids the swapped
            // (actual, expected) argument order of the former AreEqual call.
            Assert.IsNull(profile.name);
        }
        /// <summary>
        /// A profile created with a language name exposes that name.
        /// </summary>
        public void testLangProfileStringInt()
        {
            LangProfile profile = new LangProfile("en");

            // Assert.AreEqual takes (expected, actual); the constant goes first.
            Assert.AreEqual("en", profile.name);
        }