Exemplo n.º 1
0
        /// <summary>
        /// Exercises the SetTag / Add / CloseTag cycle of a <c>TagExtractor</c> built with
        /// target tag "abstract" and threshold 10, feeding each closed tag into a
        /// <c>LangProfile</c> and checking the extractor count and the profile's n-gram totals.
        /// </summary>
        public void testNormalScenario()
        {
            TagExtractor extractor = new TagExtractor("abstract", 10);

            // Assert.AreEqual takes (expected, actual); keeping that order makes
            // failure messages report the right value as "expected".
            Assert.AreEqual(0, extractor.Count());

            LangProfile profile = new LangProfile("en");

            // normal: target tag with enough text is counted and reaches the profile
            extractor.SetTag("abstract");
            extractor.Add("This is a sample text.");
            profile.Update(extractor.CloseTag());
            Assert.AreEqual(1, extractor.Count());
            Assert.AreEqual(17, profile.N_Words[0]);  // Thisisasampletext
            Assert.AreEqual(22, profile.N_Words[1]);  // _T, Th, hi, ...
            Assert.AreEqual(17, profile.N_Words[2]);  // _Th, Thi, his, ...

            // too short: the count must stay at 1
            extractor.SetTag("abstract");
            extractor.Add("sample");
            profile.Update(extractor.CloseTag());
            Assert.AreEqual(1, extractor.Count());

            // other tags: text under a non-target tag must not be counted
            extractor.SetTag("div");
            extractor.Add("This is a sample text which is enough long.");
            profile.Update(extractor.CloseTag());
            Assert.AreEqual(1, extractor.Count());
        }
Exemplo n.º 2
0
        /// <summary>
        /// Loads a Wikipedia abstract database file (XML, optionally gzip-compressed)
        /// and generates a language profile from it.
        /// </summary>
        /// <remarks>
        /// A file whose path ends with ".gz" is decompressed on the fly. Element text is
        /// routed through a <c>TagExtractor</c> created with target tag "abstract" and
        /// threshold 100; every non-null string returned by <c>CloseTag</c> is passed to
        /// <c>LangProfile.Update</c>. After parsing, "<paramref name="lang"/>:count" is
        /// written to the console, where count is the extractor's <c>Count()</c>.
        /// </remarks>
        /// <param name="lang">target language name</param>
        /// <param name="file">target database file path</param>
        /// <returns>Language profile instance</returns>
        /// <exception cref="LangDetectException">
        /// Thrown with <c>ErrorCode.TrainDataFormatError</c> when the file is not valid XML,
        /// or with <c>ErrorCode.CantOpenTrainData</c> when an I/O error occurs.
        /// </exception>
        public static LangProfile LoadFromWikipediaAbstract(string lang, string file)
        {
            LangProfile profile = new LangProfile(lang);

            // Track the outermost stream opened so far, so the finally block can release
            // the file handle even if wrapping it (gzip / reader) fails part-way through.
            Stream strm = null;
            StreamReader br = null;

            try
            {
                strm = File.OpenRead(file);
                // Ordinal comparison: the extension check is not linguistic.
                if (file.EndsWith(".gz", StringComparison.Ordinal))
                {
                    strm = new GZipStream(strm, CompressionMode.Decompress);
                }
                br = new StreamReader(strm);

                TagExtractor tagextractor = new TagExtractor("abstract", 100);

                XmlReader reader = XmlReader.Create(br);
                try
                {
                    while (reader.Read())
                    {
                        bool closesElement = false;

                        switch (reader.NodeType)
                        {
                        case XmlNodeType.Element:
                            tagextractor.SetTag(reader.Name);
                            // XmlReader emits no EndElement node for an empty element
                            // (<tag/>), so close it here to keep SetTag/CloseTag paired.
                            closesElement = reader.IsEmptyElement;
                            break;

                        case XmlNodeType.Text:
                            tagextractor.Add(reader.Value);
                            break;

                        case XmlNodeType.EndElement:
                            closesElement = true;
                            break;
                        }

                        if (closesElement)
                        {
                            string text = tagextractor.CloseTag();
                            if (text != null)
                            {
                                profile.Update(text);
                            }
                        }
                    }
                }
                catch (XmlException)
                {
                    throw new LangDetectException(ErrorCode.TrainDataFormatError, "Training database file '" + file + "' is an invalid XML.");
                }
                finally
                {
                    // Best-effort close: a failure here must not mask the parse result.
                    try
                    {
                        reader.Close();
                    }
                    catch (XmlException) { }
                }
                Console.WriteLine(lang + ":" + tagextractor.Count());
            }
            catch (IOException)
            {
                throw new LangDetectException(ErrorCode.CantOpenTrainData, "Can't open training database file '" + file + "'");
            }
            finally
            {
                // Best-effort close. Closing the StreamReader also closes the wrapped
                // stream; if the reader was never created, close the raw stream instead.
                try
                {
                    if (br != null)
                    {
                        br.Close();
                    }
                    else if (strm != null)
                    {
                        strm.Close();
                    }
                }
                catch (IOException) { }
            }
            return profile;
        }