/// <summary>
/// Builds a language profile from a plain text file and stores it as JSON.
/// <para />
/// usage: --genprofile-text -l [language code] [text file path]
/// </summary>
/// <remarks>
/// Requires exactly one positional argument (the text file path) and the "lang" option.
/// Argument problems are reported on standard output and nothing is generated.
/// The serialized profile is written to a file whose path is the language code itself.
/// </remarks>
private void generateProfileFromText()
{
    if (arglist.Count != 1)
    {
        System.Console.WriteLine("Need to specify text file path");
        return;
    }

    string textPath = arglist[0];
    bool textExists = System.IO.File.Exists(textPath);
    if (!textExists)
    {
        System.Console.WriteLine("Need to specify existing text file path");
        return;
    }

    string languageCode = get("lang");
    if (languageCode == null)
    {
        System.Console.WriteLine("Need to specify langage code(-l)");
        return;
    }

    LangProfile generated = GenProfile.loadFromText(languageCode, textPath);
    generated.omitLessFreq();

    // The output file is created before serialization starts, exactly as before.
    System.IO.StreamWriter writer = System.IO.File.CreateText(languageCode);
    try
    {
        string json = JsonConvert.SerializeObject(generated);
        writer.Write(json);
    }
    finally
    {
        writer.Dispose();
    }
}
/// <summary>
/// Load profiles from specified directory.
/// This method must be called once before language detection.
/// </summary>
/// <param name="profileDirectory">profile directory path</param>
/// <exception cref="LangDetectException">
/// The directory does not exist (error code = ErrorCode#NeedLoadProfileError).
/// </exception>
/// <remarks>
/// Files whose name starts with "." are skipped. I/O and JSON errors raised while
/// reading an individual profile are not translated here and propagate unchanged.
/// </remarks>
public static void loadProfile(string profileDirectory)
{
    // Directory.GetFiles never returns null; for a missing directory it throws.
    // Check up front so the "Not found profile" error is actually reported.
    if (!System.IO.Directory.Exists(profileDirectory))
    {
        throw new LangDetectException(ErrorCode.NeedLoadProfileError, "Not found profile: " + profileDirectory);
    }

    string[] listFiles = System.IO.Directory.GetFiles(profileDirectory);

    // langsize counts every file in the directory, including the dot-files skipped below.
    int langsize = listFiles.Length;
    int index = 0;
    foreach (string file in listFiles)
    {
        // Only the file name is needed, so avoid building a FileInfo per entry.
        string fileName = System.IO.Path.GetFileName(file);
        if (fileName.StartsWith(".", System.StringComparison.Ordinal))
        {
            continue;
        }

        string json = System.IO.File.ReadAllText(file);
        LangProfile profile = JsonConvert.DeserializeObject<LangProfile>(json);
        addProfile(profile, index, langsize);
        ++index;
    }
}
/// <summary>
/// Load profiles from specified directory.
/// This method must be called once before language detection.
/// </summary>
/// <param name="profileDirectory">profile directory path</param>
/// <exception cref="LangDetectException">
/// The directory does not exist (error code = ErrorCode.NeedLoadProfileError),
/// a profile can't be read (error code = ErrorCode.FileLoadError),
/// or a profile's format is wrong (error code = ErrorCode.FormatError).
/// </exception>
public static void LoadProfile(string profileDirectory)
{
    // Directory.GetFiles never returns null; for a missing directory it throws.
    // Check up front so the "Not found profile" error is actually reported.
    if (!Directory.Exists(profileDirectory))
    {
        throw new LangDetectException(ErrorCode.NeedLoadProfileError, "Not found profile: " + profileDirectory);
    }

    string[] listFiles = Directory.GetFiles(profileDirectory);

    // langsize counts every file in the directory, including entries skipped below.
    int langsize = listFiles.Length;
    int index = 0;
    foreach (string file in listFiles)
    {
        if (Path.GetFileName(file).StartsWith(".", StringComparison.Ordinal) || !File.Exists(file))
        {
            continue;
        }

        LangProfile profile;
        try
        {
            profile = JsonSerializer.Deserialize<LangProfile>(File.ReadAllText(file));
        }
        catch (JsonException)
        {
            // System.Text.Json reports malformed JSON with JsonException.
            throw new LangDetectException(ErrorCode.FormatError, "profile format error in '" + file + "'");
        }
        catch (NotSupportedException)
        {
            throw new LangDetectException(ErrorCode.FormatError, "profile format error in '" + file + "'");
        }
        catch (IOException)
        {
            throw new LangDetectException(ErrorCode.FileLoadError, "can't open '" + file + "'");
        }

        // A document consisting of the JSON literal null deserializes to null.
        if (profile == null)
        {
            throw new LangDetectException(ErrorCode.FormatError, "profile format error in '" + file + "'");
        }

        AddProfile(profile, index, langsize);
        ++index;
    }
}
public void testAddIllegally1()
{
    // A profile built with the parameterless constructor (intended only for
    // JSON deserialization) is expected to ignore additions.
    LangProfile unnamed = new LangProfile();

    unnamed.add("a");

    bool recorded = unnamed.freq.ContainsKey("a");
    Assert.IsFalse(recorded); // the n-gram was ignored
}
public void testAddIllegally1()
{
    LangProfile profile = new LangProfile(); // Illegal ( available for only JSONIC ) but ignore
    profile.Add("a"); // ignore

    // Assert absence with ContainsKey: reading a missing key through the indexer
    // throws on a generic dictionary instead of yielding null.
    Assert.IsFalse(profile.Freq.ContainsKey("a")); // ignored
}
/// <summary>
/// Reads a text file line by line and feeds every line into a new language profile.
/// </summary>
/// <param name="lang">target language name</param>
/// <param name="file">target file path</param>
/// <returns>Language profile instance built from the file's lines</returns>
/// <exception cref="LangDetectException">
/// An <see cref="IOException"/> occurred while opening or reading the file
/// (error code = ErrorCode.CantOpenTrainData).
/// </exception>
public static LangProfile LoadFromText(string lang, string file)
{
    LangProfile result = new LangProfile(lang);
    try
    {
        using (StreamReader reader = new StreamReader(File.OpenRead(file)))
        {
            int lineCount = 0;
            for (string current = reader.ReadLine(); current != null; current = reader.ReadLine())
            {
                result.Update(current);
                lineCount++;
            }

            // Report how many lines were consumed for this language.
            Console.WriteLine(lang + ":" + lineCount);
        }
    }
    catch (IOException)
    {
        throw new LangDetectException(ErrorCode.CantOpenTrainData, "Can't open training database file '" + file + "'");
    }

    return result;
}
public void testNormalScenario()
{
    // Assert.AreEqual takes (expected, actual); keeping that order makes
    // failure messages report the right value as "expected".
    TagExtractor extractor = new TagExtractor("abstract", 10);
    Assert.AreEqual(0, extractor.count());

    LangProfile profile = new LangProfile("en");

    // normal
    extractor.setTag("abstract");
    extractor.add("This is a sample text.");
    profile.update(extractor.closeTag());
    Assert.AreEqual(1, extractor.count());
    Assert.AreEqual(17, profile.n_words[0]); // Thisisasampletext
    Assert.AreEqual(22, profile.n_words[1]); // _T, Th, hi, ...
    Assert.AreEqual(17, profile.n_words[2]); // _Th, Thi, his, ...

    // too short
    extractor.setTag("abstract");
    extractor.add("sample");
    profile.update(extractor.closeTag());
    Assert.AreEqual(1, extractor.count());

    // other tags
    extractor.setTag("div");
    extractor.add("This is a sample text which is enough long.");
    profile.update(extractor.closeTag());
    Assert.AreEqual(1, extractor.count());
}
public void setUp()
{
    // Start from an empty factory, then register one profile per training corpus.
    DetectorFactory.clear();

    // Each row: language code, space-separated training words.
    string[][] corpora =
    {
        new string[] { "en", TRAINING_EN },
        new string[] { "fr", TRAINING_FR },
        new string[] { "ja", TRAINING_JA },
    };

    for (int slot = 0; slot < corpora.Length; slot++)
    {
        LangProfile trained = new LangProfile(corpora[slot][0]);
        string[] words = corpora[slot][1].Split(' ');
        foreach (string word in words)
        {
            trained.add(word);
        }

        // The slot index doubles as the profile's position among the three languages.
        DetectorFactory.addProfile(trained, slot, corpora.Length);
    }
}
/// <summary>
/// Generate Language Profile from Text File
/// <para />
/// usage: --genprofile-text -l [language code] [text file path]
/// </summary>
/// <remarks>
/// Requires exactly one positional argument (the text file path) and the "lang" option.
/// The serialized profile is written to a file whose path is the language code.
/// Argument problems and generation failures are reported on standard error;
/// the method never throws for the failures it handles.
/// </remarks>
private void generateProfileFromText()
{
    if (arglist.Count != 1)
    {
        Console.Error.WriteLine("Need to specify text file path");
        return;
    }

    string file = arglist[0];
    if (!File.Exists(file))
    {
        Console.Error.WriteLine("Need to specify existing text file path");
        return;
    }

    string lang = Get("lang");
    if (lang == null)
    {
        Console.Error.WriteLine("Need to specify langage code(-l)");
        return;
    }

    try
    {
        LangProfile profile = GenProfile.LoadFromText(lang, file);
        profile.OmitLessFreq();

        // File.WriteAllText opens, writes and closes the file itself, so no
        // stream has to be tracked or closed here.
        string profile_path = lang;
        File.WriteAllText(profile_path, JsonSerializer.Serialize(profile));
    }
    catch (NotSupportedException e)
    {
        // Report on stderr: Debug output is compiled out of release builds,
        // which would make the failure invisible.
        Console.Error.WriteLine(e);
    }
    catch (IOException e)
    {
        Console.Error.WriteLine(e);
    }
    catch (LangDetectException e)
    {
        Console.Error.WriteLine(e);
    }
}
public void testAdd()
{
    LangProfile profile = new LangProfile("en");

    // Assert.AreEqual takes (expected, actual).
    profile.add("a");
    Assert.AreEqual(1, (int)profile.freq["a"]);

    profile.add("a");
    Assert.AreEqual(2, (int)profile.freq["a"]);

    // Only checks that pruning a populated profile does not throw.
    profile.omitLessFreq();
}
public void testAddIllegally2()
{
    LangProfile profile = new LangProfile("en");
    profile.add("a");
    profile.add(""); // Illegal (string's length of parameter must be between 1 and 3) but ignore
    profile.add("abcd"); // as well

    // Assert.AreEqual takes (expected, actual).
    Assert.AreEqual(1, (int)profile.freq["a"]);

    // No failure message is needed; the stray null message argument is dropped.
    Assert.IsFalse(profile.freq.ContainsKey("")); // ignored
    Assert.IsFalse(profile.freq.ContainsKey("abcd")); // ignored
}
public void testAddIllegally2()
{
    LangProfile profile = new LangProfile("en");
    profile.Add("a");
    profile.Add(""); // Illegal (string's length of parameter must be between 1 and 3) but ignore
    profile.Add("abcd"); // as well

    // Assert.AreEqual takes (expected, actual).
    Assert.AreEqual(1, (int)profile.Freq["a"]);

    // Assert absence with ContainsKey: reading a missing key through the indexer
    // throws on a generic dictionary instead of yielding null.
    Assert.IsFalse(profile.Freq.ContainsKey("")); // ignored
    Assert.IsFalse(profile.Freq.ContainsKey("abcd")); // ignored
}
/// <summary>
/// Load Wikipedia abstract database file and generate its language profile
/// </summary>
/// <param name="lang">target language name</param>
/// <param name="file">target database file path; a name ending in ".gz" is decompressed on the fly</param>
/// <returns>Language profile instance</returns>
/// <remarks>
/// The file is read as UTF-8 XML. Text collected by the tag extractor for "abstract"
/// elements is fed into the profile. I/O and XML errors raised by the underlying
/// readers are not translated here and propagate to the caller.
/// </remarks>
public static LangProfile loadFromWikipediaAbstract(string lang, string file)
{
    LangProfile profile = new LangProfile(lang);
    FileInfo fi = new FileInfo(file);

    // using blocks guarantee the file handle is released on every path, replacing
    // the manual finally that called both Close() and Dispose().
    using (Stream raw = fi.OpenRead())
    {
        // Ordinal comparison: a file extension is an identifier, not linguistic text.
        bool compressed = fi.Name.EndsWith(".gz", System.StringComparison.Ordinal);
        Stream input = compressed ? new GZipStream(raw, CompressionMode.Decompress) : raw;

        using (StreamReader br = new StreamReader(input, System.Text.Encoding.UTF8))
        using (XmlReader reader = XmlReader.Create(br))
        {
            TagExtractor tagextractor = new TagExtractor("abstract", 100);
            while (reader.Read())
            {
                switch (reader.NodeType)
                {
                    case XmlNodeType.Element:
                        tagextractor.setTag(reader.Name);
                        break;

                    case XmlNodeType.Text:
                        tagextractor.add(reader.Value);
                        break;

                    case XmlNodeType.EndElement:
                        // closeTag yields null when there is nothing to add to the profile.
                        string text = tagextractor.closeTag();
                        if (text != null)
                        {
                            profile.update(text);
                        }
                        break;
                }
            }
        }
    }

    return profile;
}
/// <summary>
/// Parses an XML file (optionally gzip-compressed) and builds a language profile
/// from the text the tag extractor collects for "abstract" elements.
/// </summary>
/// <param name="lang">target language name</param>
/// <param name="file">path of the XML file; a ".gz" extension (any letter case) enables decompression</param>
/// <returns>Language profile instance</returns>
/// <remarks>
/// I/O and XML errors are not translated and propagate to the caller. After parsing,
/// the language and the extractor's count are written to standard output.
/// </remarks>
public static LangProfile load(string lang, string file)
{
    LangProfile profile = new LangProfile(lang);
    TagExtractor tagextractor = new TagExtractor("abstract", 100);

    using (Stream raw = File.OpenRead(file))
    {
        // Compare the extension case-insensitively without a culture-sensitive
        // ToUpper(); string.Equals also tolerates a null extension.
        bool compressed = string.Equals(Path.GetExtension(file), ".gz", StringComparison.OrdinalIgnoreCase);
        Stream inputStream = compressed ? new GZipStream(raw, CompressionMode.Decompress) : raw;

        try
        {
            using (XmlReader xmlReader = XmlReader.Create(inputStream))
            {
                while (xmlReader.Read())
                {
                    switch (xmlReader.NodeType)
                    {
                        case XmlNodeType.Element:
                            tagextractor.SetTag(xmlReader.Name);
                            break;

                        case XmlNodeType.Text:
                            tagextractor.Add(xmlReader.Value);
                            break;

                        case XmlNodeType.EndElement:
                            tagextractor.CloseTag(profile);
                            break;
                    }
                }
            }
        }
        finally
        {
            // Disposes the decompression wrapper when present; disposing the raw
            // stream twice (here and by the outer using) is harmless.
            inputStream.Dispose();
        }
    }

    Console.WriteLine(lang + ": " + tagextractor.Count);
    return profile;
}
/// <summary>
/// Generate Language Profile from Wikipedia Abstract Database File
/// <para />
/// usage: --genprofile -d [abstracts directory] [language names]
/// </summary>
/// <remarks>
/// For every language named in the argument list, the matching abstract XML file is
/// looked up in the "directory" option and the resulting profile is written as JSON
/// to [directory]/profiles/[language]. A language with no matching file, or whose
/// generation fails, is reported on standard error and the loop continues with the next one.
/// </remarks>
public void GenerateProfile()
{
    string directory = Get("directory");
    foreach (string lang in arglist)
    {
        string file = SearchFile(directory, lang + "wiki-.*-abstract\\.xml.*");
        if (file == null)
        {
            Console.Error.WriteLine("Not Found abstract xml : lang = " + lang);
            continue;
        }

        try
        {
            LangProfile profile = GenProfile.LoadFromWikipediaAbstract(lang, file);
            profile.OmitLessFreq();

            // File.WriteAllText opens, writes and closes the file itself, so no
            // stream has to be tracked or closed here.
            string profile_path = directory + "/profiles/" + lang;
            File.WriteAllText(profile_path, JsonSerializer.Serialize(profile));
        }
        catch (NotSupportedException e)
        {
            // Report on stderr: Debug output is compiled out of release builds,
            // which would make the failure invisible.
            Console.Error.WriteLine(e);
        }
        catch (IOException e)
        {
            Console.Error.WriteLine(e);
        }
        catch (LangDetectException e)
        {
            Console.Error.WriteLine(e);
        }
    }
}
/// <summary>
/// Registers language profiles supplied as JSON documents.
/// This method must be called once before language detection.
/// </summary>
/// <param name="json_profiles">JSON text of each profile, one entry per language</param>
/// <exception cref="LangDetectException">
/// Fewer than two profiles were supplied (error code = ErrorCode#NeedLoadProfileError).
/// </exception>
public static void loadProfile(IList<string> json_profiles)
{
    int total = json_profiles.Count;
    if (total < 2)
    {
        throw new LangDetectException(ErrorCode.NeedLoadProfileError, "Need more than 2 profiles");
    }

    // Profiles are registered in list order; the position becomes the language index.
    int position = 0;
    foreach (string document in json_profiles)
    {
        LangProfile parsed = JsonConvert.DeserializeObject<LangProfile>(document);
        addProfile(parsed, position, total);
        position++;
    }
}
/// <summary>
/// Reads a UTF-8 text file line by line and feeds every line into a new language profile.
/// </summary>
/// <param name="lang">target language name</param>
/// <param name="file">target file path</param>
/// <returns>Language profile instance built from the file's lines</returns>
/// <remarks>
/// Errors raised while opening or reading the file propagate to the caller.
/// The language and the number of lines read are written to standard output.
/// </remarks>
public static LangProfile loadFromText(string lang, string file)
{
    LangProfile result = new LangProfile(lang);

    using (StreamReader reader = new StreamReader(file, System.Text.Encoding.UTF8))
    {
        int lineCount = 0;
        string current;

        // ReadLine returns null once the end of the stream is reached.
        while ((current = reader.ReadLine()) != null)
        {
            result.update(current);
            lineCount++;
        }

        System.Console.WriteLine(lang + ":" + lineCount);
    }

    return result;
}
public void testOmitLessFreq()
{
    LangProfile profile = new LangProfile("en");
    string[] grams = "a b c \u3042 \u3044 \u3046 \u3048 \u304a \u304b \u304c \u304d \u304e \u304f".Split(" ");

    // Every listed gram is added five times; \u3050 is added only once.
    for (int i = 0; i < 5; ++i)
    {
        foreach (string g in grams)
        {
            profile.Add(g);
        }
    }
    profile.Add("\u3050");

    // Assert.AreEqual takes (expected, actual).
    Assert.AreEqual(5, (int)profile.Freq["a"]);
    Assert.AreEqual(5, (int)profile.Freq["\u3042"]);
    Assert.AreEqual(1, (int)profile.Freq["\u3050"]);

    profile.OmitLessFreq();

    // Assert absence with ContainsKey: reading a missing key through the indexer
    // throws on a generic dictionary instead of yielding null.
    Assert.IsFalse(profile.Freq.ContainsKey("a")); // omitted
    Assert.AreEqual(5, (int)profile.Freq["\u3042"]);
    Assert.IsFalse(profile.Freq.ContainsKey("\u3050")); // omitted
}
public void testOmitLessFreq()
{
    LangProfile profile = new LangProfile("en");
    string[] grams = "a b c \u3042 \u3044 \u3046 \u3048 \u304a \u304b \u304c \u304d \u304e \u304f".Split(' ');

    // Every listed gram is added five times; \u3050 is added only once.
    for (int i = 0; i < 5; ++i)
    {
        foreach (string g in grams)
        {
            profile.add(g);
        }
    }
    profile.add("\u3050");

    // Assert.AreEqual takes (expected, actual).
    Assert.AreEqual(5, (int)profile.freq["a"]);
    Assert.AreEqual(5, (int)profile.freq["\u3042"]);
    Assert.AreEqual(1, (int)profile.freq["\u3050"]);

    profile.omitLessFreq();

    Assert.IsFalse(profile.freq.ContainsKey("a")); // omitted
    Assert.AreEqual(5, (int)profile.freq["\u3042"]);
    Assert.IsFalse(profile.freq.ContainsKey("\u3050")); // omitted
}
/// <summary>
/// Registers one language profile with the shared factory instance and records,
/// for each of its n-grams, the n-gram's relative frequency for that language.
/// </summary>
/// <param name="profile">profile whose name and n-gram frequencies are registered</param>
/// <param name="index">slot of this language inside each per-n-gram probability array</param>
/// <param name="langsize">length of a newly created per-n-gram probability array</param>
/// <exception cref="LangDetectException">
/// A profile with the same name is already registered (error code = ErrorCode.DuplicateLangError).
/// </exception>
public static /*internal*/ void AddProfile(LangProfile profile, int index, int langsize)
{
    string language = profile.Name;
    if (instance_.langlist.Contains(language))
    {
        throw new LangDetectException(ErrorCode.DuplicateLangError, "duplicate the same language profile");
    }
    instance_.langlist.Add(language);

    foreach (string ngram in profile.Freq.Keys)
    {
        // An entry is created for every key, even one that gets no probability below.
        bool alreadyKnown = instance_.wordLangProbMap.ContainsKey(ngram);
        if (!alreadyKnown)
        {
            instance_.wordLangProbMap[ngram] = new double[langsize];
        }

        // Only n-grams of one to three characters receive a probability.
        int size = ngram.Length;
        if (size < 1 || size > 3)
        {
            continue;
        }

        // Relative frequency: occurrences of this n-gram over the total for its length.
        instance_.wordLangProbMap[ngram][index] = ((double)profile.Freq[ngram]) / profile.N_Words[size - 1];
    }
}
/// <summary>
/// Registers one language profile with the shared instance and records, for each of
/// its n-grams, the n-gram's relative frequency for that language.
/// </summary>
/// <param name="profile">profile whose name and n-gram frequencies are registered</param>
/// <param name="index">slot of this language inside each n-gram's probability vector</param>
/// <exception cref="NLangDetectException">
/// A profile with the same name is already registered (ErrorCode.DuplicateLangError).
/// </exception>
internal static void AddProfile(LangProfile profile, int index)
{
    var lang = profile.name;

    if (_instance.Langlist.Contains(lang))
    {
        throw new NLangDetectException("duplicate the same language profile", ErrorCode.DuplicateLangError);
    }

    _instance.Langlist.Add(lang);

    foreach (string word in profile.freq.Keys)
    {
        // n_words holds one total per n-gram length. A key that is empty or longer
        // than the supported lengths has no total to divide by, so skip it instead
        // of indexing outside the array.
        int length = word.Length;
        if (length < 1 || length > profile.n_words.Length)
        {
            continue;
        }

        if (!_instance.WordLangProbMap.ContainsKey(word))
        {
            _instance.WordLangProbMap.Add(word, new ProbVector());
        }

        // Relative frequency: occurrences of this n-gram over the total for its length.
        double prob = (double)profile.freq[word] / profile.n_words[length - 1];

        _instance.WordLangProbMap[word][index] = prob;
    }
}
/// <summary>
/// Builds language profiles from Wikipedia abstract database files and stores them as JSON.
/// <para />
/// usage: --genprofile -d [abstracts directory] [language names]
/// </summary>
/// <remarks>
/// For every language named in the argument list, the matching abstract XML file is
/// looked up in the "directory" option and the resulting profile is written to
/// [directory]/profiles/[language]. A language with no matching file is reported on
/// standard output and skipped.
/// </remarks>
public void generateProfile()
{
    string abstractsDir = get("directory");

    foreach (string languageName in arglist)
    {
        string abstractFile = searchFile(abstractsDir, languageName + "wiki-.*-abstract\\.xml.*");
        if (abstractFile == null)
        {
            System.Console.WriteLine("Not Found abstract xml : lang = " + languageName);
            continue;
        }

        LangProfile generated = GenProfile.loadFromWikipediaAbstract(languageName, abstractFile);
        generated.omitLessFreq();

        string outputPath = get("directory") + "/profiles/" + languageName;

        // The output file is created before serialization starts, exactly as before.
        System.IO.StreamWriter writer = System.IO.File.CreateText(outputPath);
        try
        {
            string json = JsonConvert.SerializeObject(generated);
            writer.Write(json);
        }
        finally
        {
            writer.Dispose();
        }
    }
}
/// <summary>
/// Load profiles from a list of JSON documents.
/// This method must be called once before language detection.
/// </summary>
/// <param name="json_profiles">JSON text of each profile, one entry per language</param>
/// <exception cref="LangDetectException">
/// Fewer than two profiles were supplied (error code = ErrorCode.NeedLoadProfileError)
/// or a profile's format is wrong (error code = ErrorCode.FormatError).
/// </exception>
public static void LoadProfile(IList<string> json_profiles)
{
    int index = 0;
    int langsize = json_profiles.Count;
    if (langsize < 2)
    {
        throw new LangDetectException(ErrorCode.NeedLoadProfileError, "Need more than 2 profiles");
    }

    foreach (string json in json_profiles)
    {
        LangProfile profile;
        try
        {
            profile = JsonSerializer.Deserialize<LangProfile>(json);
        }
        catch (JsonException)
        {
            // System.Text.Json reports malformed JSON with JsonException.
            throw new LangDetectException(ErrorCode.FormatError, "profile format error");
        }
        catch (NotSupportedException)
        {
            throw new LangDetectException(ErrorCode.FormatError, "profile format error");
        }

        // A document consisting of the JSON literal null deserializes to null.
        if (profile == null)
        {
            throw new LangDetectException(ErrorCode.FormatError, "profile format error");
        }

        AddProfile(profile, index, langsize);
        ++index;
    }
}
/// <summary>
/// Load Wikipedia abstract database file and generate its language profile
/// </summary>
/// <param name="lang">target language name</param>
/// <param name="file">target database file path; a path ending in ".gz" is decompressed on the fly</param>
/// <returns>Language profile instance</returns>
/// <exception cref="LangDetectException">
/// The file can't be opened or read (error code = ErrorCode.CantOpenTrainData)
/// or it is not valid XML (error code = ErrorCode.TrainDataFormatError).
/// </exception>
public static LangProfile LoadFromWikipediaAbstract(string lang, string file)
{
    LangProfile profile = new LangProfile(lang);
    try
    {
        // using blocks release the file handle on every path, including when
        // constructing the decompression wrapper or a reader throws.
        using (Stream raw = File.OpenRead(file))
        {
            // Ordinal comparison: a file extension is an identifier, not linguistic text.
            bool compressed = file.EndsWith(".gz", StringComparison.Ordinal);
            Stream input = compressed ? new GZipStream(raw, CompressionMode.Decompress) : raw;

            using (StreamReader br = new StreamReader(input))
            using (XmlReader reader = XmlReader.Create(br))
            {
                TagExtractor tagextractor = new TagExtractor("abstract", 100);
                try
                {
                    while (reader.Read())
                    {
                        switch (reader.NodeType)
                        {
                            case XmlNodeType.Element:
                                tagextractor.SetTag(reader.Name);
                                break;

                            case XmlNodeType.Text:
                                tagextractor.Add(reader.Value);
                                break;

                            case XmlNodeType.EndElement:
                                // CloseTag yields null when there is nothing to add to the profile.
                                string text = tagextractor.CloseTag();
                                if (text != null)
                                {
                                    profile.Update(text);
                                }
                                break;
                        }
                    }
                }
                catch (XmlException)
                {
                    throw new LangDetectException(ErrorCode.TrainDataFormatError, "Training database file '" + file + "' is an invalid XML.");
                }

                Console.WriteLine(lang + ":" + tagextractor.Count());
            }
        }
    }
    catch (IOException)
    {
        throw new LangDetectException(ErrorCode.CantOpenTrainData, "Can't open training database file '" + file + "'");
    }

    return profile;
}
public void testOmitLessFreqIllegally()
{
    // Pruning a profile created without a language name: the test passes
    // as long as the call completes without throwing.
    LangProfile unnamed = new LangProfile();

    unnamed.omitLessFreq();
}
public void testLangProfile()
{
    // A profile built with the parameterless constructor has no language name.
    LangProfile profile = new LangProfile();

    // IsNull states the expectation directly instead of AreEqual with swapped arguments.
    Assert.IsNull(profile.name);
}
public void testLangProfileStringInt()
{
    // The name passed to the constructor is exposed through the name member.
    LangProfile profile = new LangProfile("en");

    // Assert.AreEqual takes (expected, actual).
    Assert.AreEqual("en", profile.name);
}