Пример #1
0
        /// <summary>
        /// Feeds the extractor a matching tag with sufficiently long text, a matching tag with
        /// short text, and a non-matching tag, checking the extraction counter after each step
        /// and the n-gram totals recorded in the profile after the first one.
        /// </summary>
        public void testNormalScenario()
        {
            TagExtractor extractor = new TagExtractor("abstract", 10);

            // Expected value comes first so that assertion failure messages read correctly.
            Assert.AreEqual(0, extractor.count());

            LangProfile profile = new LangProfile("en");

            // normal
            extractor.setTag("abstract");
            extractor.add("This is a sample text.");
            profile.update(extractor.closeTag());
            Assert.AreEqual(1, extractor.count());
            Assert.AreEqual(17, profile.n_words[0]);  // Thisisasampletext
            Assert.AreEqual(22, profile.n_words[1]);  // _T, Th, hi, ...
            Assert.AreEqual(17, profile.n_words[2]);  // _Th, Thi, his, ...

            // too short: the counter must not advance
            extractor.setTag("abstract");
            extractor.add("sample");
            profile.update(extractor.closeTag());
            Assert.AreEqual(1, extractor.count());

            // other tags: text under a non-target tag must not be counted
            extractor.setTag("div");
            extractor.add("This is a sample text which is enough long.");
            profile.update(extractor.closeTag());
            Assert.AreEqual(1, extractor.count());
        }
Пример #2
0
        /// <summary>
        /// Checks that <c>setTag</c> stores the given value in <c>tag_</c> as-is,
        /// for both an empty string and <c>null</c>.
        /// </summary>
        public void testSetTag()
        {
            TagExtractor extractor = new TagExtractor(null, 0);

            // Expected value comes first so that assertion failure messages read correctly.
            extractor.setTag("");
            Assert.AreEqual("", extractor.tag_);
            extractor.setTag(null);
            Assert.IsNull(extractor.tag_);
        }
Пример #3
0
        /// <summary>
        /// Checks that <c>clear</c> empties the text buffer and resets the current tag to <c>null</c>.
        /// </summary>
        public void testClear()
        {
            TagExtractor extractor = new TagExtractor("abstract", 10);

            extractor.setTag("abstract");
            extractor.add("This is a sample text.");

            // Expected value comes first so that assertion failure messages read correctly.
            Assert.AreEqual("This is a sample text.", extractor.buf_.ToString());
            Assert.AreEqual("abstract", extractor.tag_);

            extractor.clear();
            Assert.AreEqual("", extractor.buf_.ToString());
            Assert.IsNull(extractor.tag_);
        }
Пример #4
0
        /// <summary>
        /// Load Wikipedia abstract database file and generate its language profile
        /// </summary>
        /// <remarks>
        /// The file is read as UTF-8 XML. When the file name ends with <c>.gz</c> the content is
        /// decompressed with GZip first. Every text extracted from an <c>abstract</c> element by
        /// the <c>TagExtractor</c> is fed into the profile. This method does not catch anything
        /// itself, so I/O and XML parsing errors propagate to the caller as raised by the
        /// underlying APIs.
        /// </remarks>
        /// <param name="lang">target language name</param>
        /// <param name="file">target database file path</param>
        /// <returns>Language profile instance</returns>
        /// <exception cref="LangDetectException" />
        public static LangProfile loadFromWikipediaAbstract(string lang, string file)
        {
            LangProfile profile = new LangProfile(lang);
            FileInfo    fi      = new FileInfo(file);
            Stream      _is     = null;

            try
            {
                _is = fi.OpenRead();

                // File-name suffixes are not linguistic text: compare ordinally so the result
                // does not depend on the current culture.
                if (fi.Name.EndsWith(".gz", System.StringComparison.Ordinal))
                {
                    _is = new GZipStream(_is, CompressionMode.Decompress);
                }

                using (StreamReader br = new StreamReader(_is, System.Text.Encoding.UTF8))
                {
                    TagExtractor tagextractor = new TagExtractor("abstract", 100);
                    using (XmlReader reader = XmlReader.Create(br))
                    {
                        while (reader.Read())
                        {
                            bool elementClosed = false;

                            switch (reader.NodeType)
                            {
                            case XmlNodeType.Element:
                                tagextractor.setTag(reader.Name);
                                // XmlReader reports no EndElement node for a self-closing
                                // element (<tag/>), so it has to be closed right here;
                                // otherwise the tag would stay open and following text
                                // could be attributed to it.
                                elementClosed = reader.IsEmptyElement;
                                break;

                            case XmlNodeType.Text:
                                tagextractor.add(reader.Value);
                                break;

                            case XmlNodeType.EndElement:
                                elementClosed = true;
                                break;
                            }

                            if (elementClosed)
                            {
                                // closeTag() yields null when there is nothing to extract.
                                string text = tagextractor.closeTag();
                                if (text != null)
                                {
                                    profile.update(text);
                                }
                            }
                        }
                    }
                }
            }
            finally
            {
                // Covers the paths where the StreamReader was never created (e.g. the GZip
                // wrapper failed); disposing an already disposed stream is harmless.
                if (null != _is)
                {
                    _is.Dispose();
                }
            }
            return profile;
        }