Пример #1
0
            /// <summary>
            /// Walks the sibling chain that starts at <paramref name="elem"/> and adds a
            /// Section for every H4/H5 element whose following elements (as returned by
            /// FiFiPage.GetElementsUntil) contain some non-whitespace text.
            /// </summary>
            /// <param name="elem">First element to inspect. The walk ends at the first element for which FiFiPageEntry.IsEnd returns true.</param>
            /// <param name="sections">Sections collected so far; null is treated as an empty array.</param>
            /// <returns>The section array after every qualifying heading has been passed through AddSection.</returns>
            internal static Section[] GetSections(IElement elem, Section[] sections = null)
            {
                if (sections == null)
                {
                    sections = new Section[0];
                }

                var sectionHeadTags = new string[2] {
                    "H4", "H5"
                };

                // Plain loop over the siblings: same visiting order as the former
                // one-call-per-sibling recursion, without growing the call stack.
                for (IElement current = elem; !FiFiPageEntry.IsEnd(current); current = current.NextElementSibling)
                {
                    if (!sectionHeadTags.Contains(current.TagName))
                    {
                        continue;
                    }

                    // some sections have no text content
                    var sectionElems = FiFiPage.GetElementsUntil(sectionHeadTags, current.NextElementSibling);

                    // True as soon as one element carries non-blank text. Equivalent to
                    // concatenating every trimmed text and testing the total length, but
                    // enumerates once and builds no intermediate strings.
                    bool hasContent = sectionElems.Any(e => e.TextContent.Trim().Length > 0);

                    if (hasContent)
                    {
                        sections = AddSection(sections, new Section(current));
                    }
                }

                return(sections);
            }
Пример #2
0
 /// <summary>
 /// Produces a new array that holds every element of <paramref name="pages"/>
 /// followed by <paramref name="page"/>. The input array itself is left unchanged.
 /// </summary>
 /// <param name="pages">Existing entries to copy.</param>
 /// <param name="page">Entry to place in the last slot of the new array.</param>
 /// <returns>A freshly allocated array one element longer than <paramref name="pages"/>.</returns>
 internal static FiFiPageEntry[] AddEntry(FiFiPageEntry[] pages, FiFiPageEntry page)
 {
     int oldLength = pages.Length;
     var grown     = new FiFiPageEntry[oldLength + 1];

     pages.CopyTo(grown, 0);
     grown[oldLength] = page;

     return(grown);
 }
Пример #3
0
            /// <summary>
            /// Builds one page entry from <paramref name="elements"/>. The first element is
            /// passed to GetWordCategory; the remaining elements supply the word element,
            /// definitions, sections and declension tables.
            /// </summary>
            /// <param name="elements">
            /// Elements of the entry. At least two are required: the second element is
            /// fetched with First(), which throws when it is missing.
            /// </param>
            public FiFiPageEntry(IElement[] elements)
            {
                Invalidate(elements);

                IElement wordElem = GetWordElement(elements.Skip(1));

                // The first child of the word element carries the term itself.
                term = wordElem.FirstElementChild.TextContent.Trim();

                // entry info order:
                // category
                category = GetWordCategory(elements.First());

                // Lower-case with the invariant culture: this is a key lookup, and the
                // culture-sensitive ToLower() misfolds letters under some cultures
                // (e.g. "I" under tr-TR), which would flag a known category as unknown.
                if (!PartsOfSpeech.Contains(category.ToLowerInvariant()))
                {
                    Console.WriteLine();
                    Logger.Warn(String.Format("{0} has unknown word category {1} ", term, category));
                }

                // word itself + other info
                if (FiFiPageEntry.HasVerbConjugationLink(wordElem))
                {
                    verbConjugation = VerbConjugationPage.GetPage(term);
                }

                // info = text of the word element without its first child (the term).
                // The child is removed from a clone so wordElem itself stays intact.
                IElement infoClone = (IElement)wordElem.Clone();

                infoClone.RemoveChild(infoClone.FirstElementChild);
                info = infoClone.TextContent.Trim();

                // Only anchors that actually carry an href are handed to the dictionary.
                var anchors = wordElem.QuerySelectorAll("a").Where(a => a.HasAttribute("href"));

                linkDict = anchors.Any() ? new LinkDictionary(anchors) : new LinkDictionary();

                // 'Get' here could be 'Discover'.  DiscoverDefinitions, DiscoverSections, etc.
                // definitions
                definitions = Definition.GetDefinitions(elements.Skip(1).FirstOrDefault(), new Definition[0]);

                // Sections and inflection tables both start from the second element;
                // First() (not FirstOrDefault) keeps the original throw when it is absent.
                IElement firstBodyElem = elements.Skip(1).First();

                // sections
                sections = Section.GetSections(firstBodyElem);

                // inflection tables
                declensionTable = DeclensionTable.GetTables(firstBodyElem).ToArray();

                //inflections = declensions.SelectMany(iTable => iTable.declensions).ToArray();
            }
Пример #4
0
        //internal readonly WordCategories[] Categories; // lexical category

        // Accepts the string representation of a Wikisanakirja (fi.wiktionary.org) page for a Finnish word.
        // If the argument is bad, the result is an instance where IsFinnish() returns false.
        internal FiFiPage(string page) : base(page)
        {
            // Find the h2 heading that contains 'Suomi'. Without it there is nothing to
            // parse, so elements and entries are simply left unassigned.
            headline = QuerySelector("h2:has(span):contains('Suomi')");
            if (headline == null)
            {
                return;
            }

            elements = GetFinnishElements(headline, new IElement[0]);

            // TODO: do something about orphaned elements, i.e. the ones that sit between
            // the headline and the first H3 (elements.Skip(1) up to the first "H3" tag);
            // q.v. "kuka". Ideally, tack them in order just after the first H3 in elements.
            // They are currently neither collected nor reported.

            entries = FiFiPageEntry.GetEntries(elements.ToArray(), new FiFiPageEntry[0]);
        }
Пример #5
0
            /// <summary>
            /// Collects a Definition for every LI element reached from <paramref name="elem"/>.
            /// An OL element is entered through its first child; any other element is
            /// skipped in favour of its next sibling.
            /// </summary>
            /// <param name="elem">Element to start from. The walk ends at the first element for which FiFiPageEntry.IsEnd returns true.</param>
            /// <param name="defs">Definitions collected so far; new ones are numbered from defs.Count() + 1.</param>
            /// <returns>The definition array after every visited LI has been passed through Definition.AddDefinition.</returns>
            internal static Definition[] GetDefinitions(IElement elem, Definition[] defs)
            {
                Definition[] collected = defs;
                IElement     cursor    = elem;

                while (!FiFiPageEntry.IsEnd(cursor))
                {
                    switch (cursor.TagName)
                    {
                        case "OL":
                            // Step into the list. The walk continues among its children,
                            // so siblings of the OL itself are not visited afterwards.
                            cursor = cursor.FirstElementChild;
                            break;

                        case "LI":
                            // Definitions are numbered 1-based in the order they are found.
                            var definition = new Definition(cursor, collected.Count() + 1);
                            collected = Definition.AddDefinition(collected, definition);
                            cursor    = cursor.NextElementSibling;
                            break;

                        default:
                            cursor = cursor.NextElementSibling;
                            break;
                    }
                }

                return(collected);
            }
Пример #6
0
            /// <summary>
            /// Scans <paramref name="elements"/> in order and creates a FiFiPageEntry for
            /// every H3 element, handing each entry the elements from that H3 to the end
            /// of the array.
            /// </summary>
            /// <param name="elements">Elements to scan. Scanning stops at the first element (or at the end of the array) for which FiFiPage.IsEndOfPage returns true.</param>
            /// <param name="entries">Entries collected so far.</param>
            /// <returns>The entry array after every H3 found has been appended through FiFiPageEntry.AddEntry.</returns>
            internal static FiFiPageEntry[] GetEntries(IElement[] elements, FiFiPageEntry[] entries)
            {
                // cycle through Finnish elements on HtmlPage
                // An index loop replaces the former per-element recursion, which re-copied
                // the remaining array (Skip(1).ToArray()) on every step.
                for (int index = 0; ; index++)
                {
                    // Yields null once index runs past the end, mirroring what
                    // FirstOrDefault() produced for an exhausted array.
                    IElement elem = elements.ElementAtOrDefault(index);

                    if (FiFiPage.IsEndOfPage(elem))
                    {
                        return(entries);
                    }

                    // page entries are headed by h3 tags; anything else is skipped
                    if (elem.TagName != "H3")
                    {
                        continue;
                    }

                    // The entry receives the elements starting at its own H3. Only here is
                    // a suffix copy needed; at index 0 the caller's array is passed as-is,
                    // exactly as before.
                    IElement[] remaining = index == 0 ? elements : elements.Skip(index).ToArray();

                    entries = FiFiPageEntry.AddEntry(entries, new FiFiPageEntry(remaining));
                }
            }