/// <summary>
/// Walks a <c>TagExtractor</c> created with ("abstract", 10) through three
/// setTag / add / closeTag rounds and checks after each round how
/// <c>count()</c> changed, as well as the n-gram totals the first round
/// leaves in the <c>LangProfile</c>.
/// </summary>
public void testNormalScenario()
{
    TagExtractor extractor = new TagExtractor("abstract", 10);
    Assert.AreEqual(0, extractor.count());

    LangProfile profile = new LangProfile("en");

    // normal: matching tag with enough text, so the extraction is counted
    extractor.setTag("abstract");
    extractor.add("This is a sample text.");
    profile.update(extractor.closeTag());
    Assert.AreEqual(1, extractor.count());
    Assert.AreEqual(17, profile.n_words[0]);  // Thisisasampletext
    Assert.AreEqual(22, profile.n_words[1]);  // _T, Th, hi, ...
    Assert.AreEqual(17, profile.n_words[2]);  // _Th, Thi, his, ...

    // too short: the count must stay at 1
    extractor.setTag("abstract");
    extractor.add("sample");
    profile.update(extractor.closeTag());
    Assert.AreEqual(1, extractor.count());

    // other tags: text under a tag other than "abstract" must not be counted
    extractor.setTag("div");
    extractor.add("This is a sample text which is enough long.");
    profile.update(extractor.closeTag());
    Assert.AreEqual(1, extractor.count());
}
/// <summary>
/// Load Wikipedia abstract database file and generate its language profile
/// </summary>
/// <remarks>
/// The file is streamed as UTF-8 XML; when its name ends with ".gz" it is
/// decompressed on the fly. Element names and text nodes are fed to a
/// <c>TagExtractor</c> created with ("abstract", 100), and every non-null
/// string returned by its <c>closeTag()</c> is passed to
/// <c>LangProfile.update</c>.
/// </remarks>
/// <param name="lang">target language name</param>
/// <param name="file">target database file path</param>
/// <returns>Language profile instance</returns>
/// <exception cref="LangDetectException">
/// NOTE(review): nothing in this method throws or wraps into this type; I/O
/// and XML parsing exceptions propagate unchanged. Confirm whether callees
/// raise it or whether callers expect wrapping.
/// </exception>
public static LangProfile loadFromWikipediaAbstract(string lang, string file)
{
    LangProfile profile = new LangProfile(lang);
    FileInfo fi = new FileInfo(file);

    Stream input = fi.OpenRead();
    try
    {
        // Ordinal comparison: a file extension is not linguistic text, so the
        // check must not vary with the current culture.
        if (fi.Name.EndsWith(".gz", System.StringComparison.Ordinal))
        {
            input = new GZipStream(input, CompressionMode.Decompress);
        }

        using (StreamReader br = new StreamReader(input, System.Text.Encoding.UTF8))
        using (XmlReader reader = XmlReader.Create(br))
        {
            TagExtractor tagextractor = new TagExtractor("abstract", 100);

            while (reader.Read())
            {
                bool elementClosed = false;

                switch (reader.NodeType)
                {
                    case XmlNodeType.Element:
                        tagextractor.setTag(reader.Name);
                        // XmlReader reports no EndElement node for a
                        // self-closing element (<x/>), so close it here to
                        // keep setTag/closeTag calls paired.
                        elementClosed = reader.IsEmptyElement;
                        break;

                    case XmlNodeType.Text:
                        tagextractor.add(reader.Value);
                        break;

                    case XmlNodeType.EndElement:
                        elementClosed = true;
                        break;
                }

                if (elementClosed)
                {
                    string text = tagextractor.closeTag();
                    if (text != null)
                    {
                        profile.update(text);
                    }
                }
            }
        }
    }
    finally
    {
        // Covers failures before the readers take ownership of the stream;
        // disposing an already-disposed stream is harmless.
        input.Dispose();
    }

    return profile;
}
/// <summary>
/// Calls <c>closeTag()</c> on an extractor constructed with (null, 0) without
/// any preceding <c>setTag</c> or <c>add</c>. There are no assertions: the
/// test passes as long as the call completes without throwing.
/// </summary>
public void testCloseTag()
{
    // The return value is deliberately discarded.
    new TagExtractor(null, 0).closeTag();
}