Пример #1
0
        /// <summary>
        /// Scans the input XML for the first "page" element whose markup contains at least one
        /// infobox, geobox or citation marker, hands the split fragments to
        /// WordUtils.TrimBoxes and asserts that the collected data is not empty.
        /// </summary>
        public void ContainsSections()
        {
            var data          = new List<KnowlegeData>();
            var inputFilePath = GetDataPath();

            using (var reader = XmlReader.Create(inputFilePath))
            {
                // ReadOuterXml() leaves the reader positioned on the node that follows the
                // element's end tag. Calling Read() unconditionally afterwards would step over
                // that node and silently skip a "page" element that directly follows another,
                // so Read() is only used to advance past nodes that were not consumed.
                while (!reader.EOF)
                {
                    if (reader.NodeType != XmlNodeType.Element || reader.Name != "page")
                    {
                        reader.Read();
                        continue;
                    }

                    var outerXml = reader.ReadOuterXml();

                    // Skip(1) drops the text that precedes the first marker occurrence.
                    var infoboxes = outerXml.Split(WordUtils.Infobox, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);
                    var geoboxes  = outerXml.Split(WordUtils.Geobox, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);
                    var citations = outerXml.Split(WordUtils.Citacia, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);

                    if (infoboxes.Any() || geoboxes.Any() || citations.Any())
                    {
                        WordUtils.TrimBoxes<Infobox>(infoboxes, ref data);
                        WordUtils.TrimBoxes<Geobox>(geoboxes, ref data);
                        WordUtils.TrimBoxes<Citation>(citations, ref data);

                        // One page with any marker is enough for this check.
                        break;
                    }
                }
            }

            Assert.IsTrue(data.Any());
        }
Пример #2
0
        /// <summary>
        /// Reads the XML file and builds the page structure: for every "page" element the
        /// infobox, geobox and citation fragments are extracted and, when they produce any
        /// data, a new Page is added to Pages.
        /// </summary>
        /// <param name="inputFilePath">Path (or URI) of the XML file passed to XmlReader.Create.</param>
        public void SetPagesFromInputFile(string inputFilePath)
        {
            using (var reader = XmlReader.Create(inputFilePath))
            {
                // ReadOuterXml() leaves the reader positioned on the node that follows the
                // element's end tag. Calling Read() unconditionally afterwards would step over
                // that node and silently skip a "page" element that directly follows another,
                // so Read() is only used to advance past nodes that were not consumed.
                while (!reader.EOF)
                {
                    if (reader.NodeType != XmlNodeType.Element || reader.Name != "page")
                    {
                        reader.Read();
                        continue;
                    }

                    var outerXml = reader.ReadOuterXml();

                    // NOTE(review): Parallel.Invoke receives a single action and does not return
                    // until it has completed, so pages are still processed one at a time. It is
                    // kept so that exception wrapping seen by callers stays the same.
                    Parallel.Invoke(() =>
                    {
                        // Fragments following each marker; Skip(1) drops the text before the first marker.
                        var infoboxes = outerXml.Split(WordUtils.Infobox, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);
                        var geoboxes  = outerXml.Split(WordUtils.Geobox, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);
                        var citations = outerXml.Split(WordUtils.Citacia, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);

                        // Nothing to parse on this page.
                        if (!infoboxes.Any() && !geoboxes.Any() && !citations.Any())
                        {
                            return;
                        }

                        var data = new List<KnowlegeData>();

                        WordUtils.TrimBoxes<Infobox>(infoboxes, ref data);
                        WordUtils.TrimBoxes<Geobox>(geoboxes, ref data);
                        WordUtils.TrimBoxes<Citation>(citations, ref data);

                        // Only pages that actually yielded data are recorded.
                        if (data.Any())
                        {
                            Pages.Add(new Page(data));
                        }
                    });
                }
            }
        }
Пример #3
0
        /// <summary>
        /// Finds the "Isaac Newton" page in the input XML and builds a Page from its infoboxes.
        /// </summary>
        /// <returns>
        /// The Page built from the first "page" element whose markup contains
        /// "&lt;title&gt;Isaac Newton&lt;/title&gt;", with SetRegexAttributes applied to each infobox.
        /// </returns>
        /// <exception cref="AssertFailedException">No matching page exists in the input file.</exception>
        private Page GetPage()
        {
            var inputFilePath = GetDataPath();

            Page page = null;

            using (var reader = XmlReader.Create(inputFilePath))
            {
                // ReadOuterXml() leaves the reader positioned on the node that follows the
                // element's end tag. Calling Read() unconditionally afterwards would step over
                // that node and silently skip a "page" element that directly follows another,
                // so Read() is only used to advance past nodes that were not consumed.
                while (!reader.EOF)
                {
                    if (reader.NodeType != XmlNodeType.Element || reader.Name != "page")
                    {
                        reader.Read();
                        continue;
                    }

                    var outerXml = reader.ReadOuterXml();

                    if (!outerXml.Contains("<title>Isaac Newton</title>"))
                    {
                        continue;
                    }

                    // Skip(1) drops the text that precedes the first infobox marker.
                    var infobox = outerXml.Split(WordUtils.Infobox, System.StringSplitOptions.RemoveEmptyEntries).Skip(1);
                    var data    = new List<KnowlegeData>();

                    WordUtils.TrimBoxes<Infobox>(infobox, ref data);

                    page = new Page(data);
                    page.Infoboxes.ForEach(x => x.SetRegexAttributes());
                    break;
                }
            }

            if (page == null)
            {
                // The original only constructed the exception without throwing it, so a
                // missing page went unnoticed and null was returned to the caller.
                throw new AssertFailedException("Page not found");
            }

            return page;
        }