/// <summary>
/// Walks <paramref name="elem"/> and its following siblings and records a
/// <see cref="Section"/> for every H4/H5 heading whose following elements
/// contain some non-whitespace text.
/// </summary>
/// <param name="elem">Element at which the scan starts.</param>
/// <param name="sections">
/// Sections gathered so far; <c>null</c> (the default) starts from an empty array.
/// </param>
/// <returns>
/// The array produced by applying <c>AddSection</c> once per qualifying heading,
/// or the starting array when no heading qualifies.
/// </returns>
internal static Section[] GetSections(IElement elem, Section[] sections = null)
{
    var sectionHeadTags = new string[2] { "H4", "H5" };
    Section[] collected = sections ?? new Section[0];

    // Iterate over the siblings instead of recursing once per element; the scan
    // stops at the first element FiFiPageEntry.IsEnd reports as the end.
    for (IElement current = elem; !FiFiPageEntry.IsEnd(current); current = current.NextElementSibling)
    {
        if (!sectionHeadTags.Contains(current.TagName))
        {
            continue;
        }

        // some sections have no text content
        var sectionElems = FiFiPage.GetElementsUntil(sectionHeadTags, current.NextElementSibling);

        // Any(...) stops at the first element with text, rather than counting the
        // sequence and then concatenating every element's text just to measure it.
        bool hasContent = sectionElems.Any(e => e.TextContent.Trim().Length > 0);
        if (hasContent)
        {
            collected = AddSection(collected, new Section(current));
        }
    }

    return collected;
}
/// <summary>
/// Returns a new array holding every element of <paramref name="pages"/> followed by
/// <paramref name="page"/>. The input array is left untouched.
/// </summary>
/// <param name="pages">Existing entries.</param>
/// <param name="page">Entry to append.</param>
/// <returns>A freshly allocated array that is one element longer than <paramref name="pages"/>.</returns>
internal static FiFiPageEntry[] AddEntry(FiFiPageEntry[] pages, FiFiPageEntry page)
{
    var extended = new FiFiPageEntry[pages.Length + 1];

    // Existing entries keep their positions; the new one goes in the last slot.
    pages.CopyTo(extended, 0);
    extended[pages.Length] = page;

    return extended;
}
/// <summary>
/// Builds a page entry from the elements that belong to it.
/// </summary>
/// <param name="elements">
/// Elements of the entry. The first one is handed to <c>GetWordCategory</c>; the
/// elements after it are searched for the word element, and the second element is
/// the starting point for definitions, sections and declension tables.
/// </param>
public FiFiPageEntry(IElement[] elements)
{
    // NOTE(review): Invalidate is defined elsewhere; presumably it validates or resets state - confirm.
    Invalidate(elements);

    IElement wordElem = GetWordElement(elements.Skip(1));
    term = wordElem.FirstElementChild.TextContent.Trim();

    // entry info order:
    // category
    category = GetWordCategory(elements.First());

    // Lower-case with the invariant culture: the result is a lookup key, and a
    // culture-sensitive ToLower() (e.g. Turkish dotless i) could turn a known
    // category into a spurious "unknown" warning.
    if (!PartsOfSpeech.Contains(category.ToLowerInvariant()))
    {
        Console.WriteLine();
        Logger.Warn(String.Format("{0} has unknown word category {1} ", term, category));
    }

    // word itself + other info
    bool hasVerbConjugation = FiFiPageEntry.HasVerbConjugationLink(wordElem);
    if (hasVerbConjugation)
    {
        verbConjugation = VerbConjugationPage.GetPage(term);
    }

    // The info text is the word element's text without its first child (the term itself).
    // Work on a clone so the original element is not modified.
    IElement infoClone = (IElement)wordElem.Clone();
    infoClone.RemoveChild(infoClone.FirstElementChild);
    info = infoClone.TextContent.Trim();

    var anchors = wordElem.QuerySelectorAll("a").Where(a => a.HasAttribute("href"));
    // Any() stops at the first match instead of counting every anchor.
    if (anchors.Any())
    {
        linkDict = new LinkDictionary(anchors);
    }
    else
    {
        linkDict = new LinkDictionary();
    }

    // 'Get' here could be 'Discover'. DiscoverDefinitions, DiscoverSections, etc.
    // definitions
    definitions = Definition.GetDefinitions(elements.Skip(1).FirstOrDefault(), new Definition[0]);

    // sections
    sections = Section.GetSections(elements.Skip(1).First());

    // inflection tables
    declensionTable = DeclensionTable.GetTables(elements.Skip(1).First()).ToArray();
    //inflections = declensions.SelectMany(iTable => iTable.declensions).ToArray();
}
//internal readonly WordCategories[] Categories; // lexical category

/// <summary>
/// Accepts the string representation of a Wikisanakirja (fi.wiktionary.org) page
/// for a Finnish word.
/// </summary>
/// <remarks>
/// When the page has no 'Suomi' headline the constructor returns early, before
/// <c>elements</c> and <c>entries</c> are assigned. Per the original note, a bad
/// argument yields an instance where IsFinnish() returns false.
/// </remarks>
/// <param name="page">Page content, forwarded to the base constructor.</param>
internal FiFiPage(string page) : base(page)
{
    headline = QuerySelector("h2:has(span):contains('Suomi')");
    if (headline == null)
    {
        return;
    }

    elements = GetFinnishElements(headline, new IElement[0]);

    // Elements after the first one that come before the first H3 belong to no entry.
    // TakeWhile never returns null, so only emptiness needs checking.
    var orphanElements = elements.Skip(1).TakeWhile(elem => elem.TagName != "H3");
    if (orphanElements.Any())
    {
        // TODO: do something about this edge case. (q.v. "kuka")
        // Ideally, tack them in order just after the first H3 in elements.
        // for now, just note it
        // Logger.Warn("{0} has {1} orphaned elements", word, orphanElements.Count());
    }

    entries = FiFiPageEntry.GetEntries(elements.ToArray(), new FiFiPageEntry[0]);
}
/// <summary>
/// Collects definitions starting at <paramref name="elem"/>: an OL element is entered
/// through its first child, every LI element becomes a <see cref="Definition"/>
/// numbered by its position in the result, and any other element is skipped.
/// The walk stops at the first element for which <c>FiFiPageEntry.IsEnd</c> is true.
/// </summary>
/// <param name="elem">Element at which the walk starts.</param>
/// <param name="defs">Definitions gathered so far.</param>
/// <returns>The definitions after the walk has finished.</returns>
internal static Definition[] GetDefinitions(IElement elem, Definition[] defs)
{
    IElement cursor = elem;
    Definition[] gathered = defs;

    while (!FiFiPageEntry.IsEnd(cursor))
    {
        switch (cursor.TagName)
        {
            case "OL":
                // Descend into the list; the walk continues among its children only.
                cursor = cursor.FirstElementChild;
                break;

            case "LI":
                // Definition numbers are 1-based.
                gathered = Definition.AddDefinition(gathered, new Definition(cursor, gathered.Count() + 1));
                cursor = cursor.NextElementSibling;
                break;

            default:
                cursor = cursor.NextElementSibling;
                break;
        }
    }

    return gathered;
}
/// <summary>
/// Cycles through the Finnish elements of a page and creates one
/// <see cref="FiFiPageEntry"/> per H3 element, since page entries are headed by h3 tags.
/// </summary>
/// <param name="elements">Elements to scan, in document order.</param>
/// <param name="entries">Entries gathered so far.</param>
/// <returns>
/// <paramref name="entries"/> extended with one entry per H3 found before
/// <c>FiFiPage.IsEndOfPage</c> reports the end.
/// </returns>
internal static FiFiPageEntry[] GetEntries(IElement[] elements, FiFiPageEntry[] entries)
{
    // Walk by index rather than recursing on elements.Skip(1).ToArray(): that
    // copied the whole remaining array and added a stack frame for every element.
    for (int index = 0; ; index++)
    {
        // Past the last element the original saw an empty array, i.e. a null element.
        IElement elem = index < elements.Length ? elements[index] : null;
        if (FiFiPage.IsEndOfPage(elem))
        {
            return entries;
        }

        // if element is not an h3 then move on to the next one
        if (elem.TagName != "H3")
        {
            continue;
        }

        // otherwise, create a new PageEntry from this H3 and everything after it,
        // add it to entries, and continue.
        FiFiPageEntry entry = new FiFiPageEntry(elements.Skip(index).ToArray());
        entries = FiFiPageEntry.AddEntry(entries, entry);
    }
}