/// <summary>
/// Drives a <c>TagExtractor</c> built with ("abstract", 10) through three
/// SetTag / Add / CloseTag cycles and checks the extractor's count and the
/// profile's <c>N_Words</c> totals.
/// </summary>
/// <remarks>
/// Assertions follow the (expected, actual) argument order so that a failure
/// message reports the two values the right way round.
/// </remarks>
public void testNormalScenario()
{
    TagExtractor extractor = new TagExtractor("abstract", 10);
    Assert.AreEqual(0, extractor.Count());

    LangProfile profile = new LangProfile("en");

    // normal
    extractor.SetTag("abstract");
    extractor.Add("This is a sample text.");
    profile.Update(extractor.CloseTag());
    Assert.AreEqual(1, extractor.Count());
    Assert.AreEqual(17, profile.N_Words[0]);  // Thisisasampletext
    Assert.AreEqual(22, profile.N_Words[1]);  // _T, Th, hi, ...
    Assert.AreEqual(17, profile.N_Words[2]);  // _Th, Thi, his, ...

    // too short: the count is expected to stay at 1
    extractor.SetTag("abstract");
    extractor.Add("sample");
    profile.Update(extractor.CloseTag());
    Assert.AreEqual(1, extractor.Count());

    // other tags: text under a tag other than "abstract" is expected not to be counted
    extractor.SetTag("div");
    extractor.Add("This is a sample text which is enough long.");
    profile.Update(extractor.CloseTag());
    Assert.AreEqual(1, extractor.Count());
}
/// <summary>
/// Reads an XML file (optionally gzip-compressed) and builds a language profile
/// by feeding its element names and text nodes to a <c>TagExtractor</c> created
/// with ("abstract", 100).
/// </summary>
/// <param name="lang">Language name passed to the <c>LangProfile</c> constructor.</param>
/// <param name="file">Path of the XML file; a ".gz" extension (any letter case) selects gzip decompression.</param>
/// <returns>The <c>LangProfile</c> instance that was handed to the extractor on every end element.</returns>
public static LangProfile load(string lang, string file)
{
    LangProfile profile = new LangProfile(lang);
    TagExtractor tagextractor = new TagExtractor("abstract", 100);

    using (FileStream fileStream = File.OpenRead(file))
    {
        // Ordinal, case-insensitive comparison: a file extension is not linguistic text,
        // so the check must not depend on the current culture's casing rules.
        bool isGzip = string.Equals(Path.GetExtension(file), ".gz", StringComparison.OrdinalIgnoreCase);

        // Disposing the GZipStream also disposes the file stream; the second dispose
        // performed by the outer using is harmless.
        using (Stream inputStream = isGzip
            ? (Stream)new GZipStream(fileStream, CompressionMode.Decompress)
            : fileStream)
        using (XmlReader xmlReader = XmlReader.Create(inputStream))
        {
            while (xmlReader.Read())
            {
                switch (xmlReader.NodeType)
                {
                    case XmlNodeType.Element:
                        tagextractor.SetTag(xmlReader.Name);
                        break;

                    case XmlNodeType.Text:
                        tagextractor.Add(xmlReader.Value);
                        break;

                    case XmlNodeType.EndElement:
                        // NOTE(review): presumably folds the buffered text into the profile - confirm against TagExtractor.
                        tagextractor.CloseTag(profile);
                        break;
                }
            }
        }
    }

    Console.WriteLine(lang + ": " + tagextractor.Count);
    return profile;
}
/// <summary>
/// Load Wikipedia abstract database file and generate its language profile.
/// </summary>
/// <remarks>
/// A file whose name ends with ".gz" is decompressed with gzip while reading.
/// Text returned by the extractor at each end element is added to the profile
/// when it is not null. The language name and the extractor's count are
/// written to the console after the whole document has been read.
/// </remarks>
/// <param name="lang">target language name</param>
/// <param name="file">target database file path</param>
/// <returns>Language profile instance</returns>
/// <exception cref="LangDetectException">
/// Thrown with <c>ErrorCode.TrainDataFormatError</c> when the XML reader raises an
/// <c>XmlException</c>, and with <c>ErrorCode.CantOpenTrainData</c> when an
/// <c>IOException</c> occurs while opening or reading the file.
/// </exception>
public static LangProfile LoadFromWikipediaAbstract(string lang, string file)
{
    LangProfile profile = new LangProfile(lang);
    Stream strm = null;
    StreamReader br = null;
    try
    {
        strm = File.OpenRead(file);
        // Ordinal comparison: the suffix check must not vary with the current culture.
        if (file.EndsWith(".gz", StringComparison.Ordinal))
        {
            strm = new GZipStream(strm, CompressionMode.Decompress);
        }
        br = new StreamReader(strm);

        TagExtractor tagextractor = new TagExtractor("abstract", 100);
        XmlReader reader = XmlReader.Create(br);
        try
        {
            while (reader.Read())
            {
                switch (reader.NodeType)
                {
                    case XmlNodeType.Element:
                        tagextractor.SetTag(reader.Name);
                        break;

                    case XmlNodeType.Text:
                        tagextractor.Add(reader.Value);
                        break;

                    case XmlNodeType.EndElement:
                        string text = tagextractor.CloseTag();
                        if (text != null)
                        {
                            profile.Update(text);
                        }
                        break;
                }
            }
        }
        catch (XmlException)
        {
            throw new LangDetectException(ErrorCode.TrainDataFormatError, "Training database file '" + file + "' is an invalid XML.");
        }
        finally
        {
            try
            {
                reader.Close();
            }
            catch (XmlException)
            {
                // Best-effort close: a failure here must not mask the outcome of parsing.
            }
        }

        Console.WriteLine(lang + ":" + tagextractor.Count());
    }
    catch (IOException)
    {
        throw new LangDetectException(ErrorCode.CantOpenTrainData, "Can't open training database file '" + file + "'");
    }
    finally
    {
        try
        {
            if (br != null)
            {
                // Closing the reader also closes the stream it wraps.
                br.Close();
            }
            else if (strm != null)
            {
                // The reader was never created; close the stream directly so the file handle is not leaked.
                strm.Close();
            }
        }
        catch (IOException)
        {
            // Best-effort close: ignore failures while releasing the file.
        }
    }

    return profile;
}
/// <summary>
/// Calls <c>CloseTag</c> on an extractor constructed with a null tag and 0,
/// without any preceding <c>SetTag</c> or <c>Add</c> call. The test contains no
/// assertions, so it passes as long as the call does not throw.
/// </summary>
public void testCloseTag()
{
    TagExtractor untouchedExtractor = new TagExtractor(null, 0);

    // Whatever CloseTag yields is deliberately ignored.
    untouchedExtractor.CloseTag();
}