/// <summary>
/// Harvests text from the document element, mines it, and tags the mined
/// items as specimen codes in the document XML.
/// </summary>
/// <param name="document">Document whose XML is harvested and tagged.</param>
/// <returns>The value returned by the underlying tagger's <c>Tag</c> call.</returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
public async Task<object> Tag(IDocument document)
{
    if (document == null)
    {
        throw new ArgumentNullException(nameof(document));
    }

    var textContent = await this.contentHarvester.Harvest(document.XmlDocument.DocumentElement);

    // Materialize the mined items once; the projection below stays lazy.
    var data = (await this.miner.Mine(textContent)).ToArray();

    var specimenCodes = data.Select(s =>
    {
        // A content type that starts with "http" is treated as a link (Href),
        // anything else as a title. The prefix test is ordinal: it must not
        // depend on the current culture, and it only inspects the start of
        // the string instead of searching all of it.
        bool isLink = s.ContentType.StartsWith("http", StringComparison.Ordinal);

        return new SpecimenCodeSerializableModel
        {
            Title = isLink ? null : s.ContentType,
            Href = isLink ? s.ContentType : null,
            Value = s.Content
        };
    });

    var settings = new ContentTaggerSettings
    {
        CaseSensitive = true,
        MinimalTextSelect = true
    };

    return await this.tagger.Tag(
        document.XmlDocument.DocumentElement,
        document.NamespaceManager,
        specimenCodes,
        XPathStrings.RootNodesOfContext,
        settings);
}
/// <summary>
/// Harvests text from the document element, mines it, and tags each mined
/// item in the document XML using a case-insensitive, minimal text selection.
/// </summary>
/// <param name="document">Document whose XML is harvested and tagged.</param>
/// <returns>Always <c>true</c> once tagging has completed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
public async Task<object> Tag(IDocument document)
{
    if (document == null)
    {
        throw new ArgumentNullException(nameof(document));
    }

    var harvestedText = await this.contentHarvester.Harvest(document.XmlDocument.DocumentElement);
    var minedTerms = await this.miner.Mine(harvestedText);

    // First projection: mined item -> response model.
    var responseModels =
        from term in minedTerms
        select new EnvoTermResponseModel
        {
            EntityId = term.EntityId,
            EnvoId = term.EnvoId,
            Content = term.Content
        };

    // Second projection: response model -> serializable model handed to the tagger.
    var serializableModels =
        from response in responseModels
        select new EnvoTermSerializableModel
        {
            Value = response.Content,
            EnvoId = response.EnvoId,
            Id = response.EntityId,
            VerbatimTerm = response.Content
        };

    var taggerSettings = new ContentTaggerSettings
    {
        CaseSensitive = false,
        MinimalTextSelect = true
    };

    await this.contentTagger.Tag(
        document.XmlDocument.DocumentElement,
        document.NamespaceManager,
        serializableModels,
        XPath,
        taggerSettings);

    return true;
}
/// <summary>
/// Harvests text from the document element, mines it, and tags each mined
/// item in the document XML. Types and identifiers of a mined item are each
/// joined into a single "|"-separated string.
/// </summary>
/// <param name="document">Document whose XML is harvested and tagged.</param>
/// <returns>Always <c>true</c> once tagging has completed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
public async Task<object> Tag(IDocument document)
{
    if (document == null)
    {
        throw new ArgumentNullException(nameof(document));
    }

    var harvestedText = await this.contentHarvester.Harvest(document.XmlDocument.DocumentElement);
    var minedItems = await this.miner.Mine(harvestedText);

    var serializableModels =
        from item in minedItems
        select new EnvoExtractHcmrSerializableModel
        {
            Value = item.Content,
            Type = string.Join("|", item.Types),
            Identifier = string.Join("|", item.Identifiers)
        };

    var taggerSettings = new ContentTaggerSettings
    {
        CaseSensitive = false,
        MinimalTextSelect = true
    };

    // NOTE(review): harvesting reads from DocumentElement, but tagging is given
    // the XmlDocument itself — confirm this difference is intended.
    await this.contentTagger.Tag(
        document.XmlDocument,
        document.NamespaceManager,
        serializableModels,
        XPath,
        taggerSettings);

    return true;
}
/// <summary>
/// Tags every specimen code in the document with a clone of
/// <paramref name="tagModel"/> carrying the code's prefix and type, then logs a
/// warning for each tag-model element that directly contains another one.
/// </summary>
/// <param name="document">Document in which the codes are tagged.</param>
/// <param name="xpathTemplate">XPath passed through to the content tagger.</param>
/// <param name="specimenCodes">Codes to tag.</param>
/// <param name="tagModel">Template element; cloned per code, never modified itself.</param>
private async Task ReplaceSpecimenCodesInXml(
    IDocument document,
    string xpathTemplate,
    IEnumerable<ISpecimenCode> specimenCodes,
    XmlElement tagModel)
{
    foreach (var code in specimenCodes)
    {
        // Work on a deep copy so the attributes set for one code do not leak into the next.
        var element = (XmlElement)tagModel.CloneNode(true);
        element.SetAttribute("prefix", code.Prefix);
        element.SetAttribute("type", code.Type);

        var taggerSettings = new ContentTaggerSettings
        {
            CaseSensitive = true,
            MinimalTextSelect = false
        };

        await this.contentTagger.TagContentInDocument(
            code.Code,
            element,
            xpathTemplate,
            document,
            taggerSettings);
    }

    // Nested elements of the tag model's name probably come from mistaken codes;
    // they are only reported, not repaired.
    string nestedXPath = string.Format(".//{0}[{0}]", tagModel.Name);
    foreach (XmlNode nested in document.SelectNodes(nestedXPath))
    {
        this.logger?.Log("WARNING: Nested specimen codes: " + nested.InnerXml);
    }
}
/// <summary>
/// Harvests text from the document element, mines it, and tags each mined
/// item in the document XML as an external link model.
/// </summary>
/// <param name="document">Document whose XML is harvested and tagged.</param>
/// <returns>Always <c>true</c> once tagging has completed.</returns>
/// <exception cref="ArgumentNullException"><paramref name="document"/> is null.</exception>
public async Task<object> Tag(IDocument document)
{
    if (document == null)
    {
        throw new ArgumentNullException(nameof(document));
    }

    var harvestedText = await this.contentHarvester.Harvest(document.XmlDocument.DocumentElement);
    var minedLinks = await this.miner.Mine(harvestedText);

    var linkModels =
        from link in minedLinks
        select new ExternalLinkXmlModel
        {
            Href = link.Href,
            ExternalLinkType = link.Type.GetName(),
            Value = link.Content
        };

    var taggerSettings = new ContentTaggerSettings
    {
        CaseSensitive = false,
        MinimalTextSelect = true
    };

    await this.contentTagger.Tag(
        document.XmlDocument.DocumentElement,
        document.NamespaceManager,
        linkModels,
        XPath,
        taggerSettings);

    return true;
}
/// <summary>
/// Builds a set of candidate name strings from the lower-taxon <c>tn</c>
/// elements and the system taxon-name fields of the document, adds their
/// individual words (longer than two characters), and tags every candidate in
/// the document, longest first. A failure while tagging one candidate is
/// logged and does not stop the remaining ones.
/// </summary>
/// <param name="document">Document to read names from and to tag.</param>
private async Task DeepTag(IDocument document)
{
    // Elements (with their content) and abbreviations stripped before the remaining markup is dropped:
    // <sensu>, <authority>/<basionym-authority>, <infraspecific-rank>, "cf." and "var.".
    const string UselessPartsPattern = @"<(sensu)[^>/]*>.*?</\1>|<((?:basionym-)?authority)[^>/]*>.*?</\2>|<(infraspecific-rank)[^>/]*>.*?</\3>|\bcf\b\.|\bvar\b\.";

    var lowerTaxaInnerXml = new HashSet<string>(
        document.SelectNodes(".//tn[@type='lower']").Select(node => node.InnerXml));

    ////// string clearUselessTaxonNamePartsSubpattern = string.Join(
    //////     "|",
    //////     SpeciesPartsPrefixesResolver
    //////         .SpeciesPartsRanks
    //////         .Keys
    //////         .Select(k => $"\\b{Regex.Escape(k)}\\b\\."));

    // TODO: This algorithm must be refined: generate smaller pattern strings from the original.
    var candidates = lowerTaxaInnerXml
        .Select(xml => xml
            .RegexReplace(UselessPartsPattern, string.Empty)
            .RegexReplace(@"<[^>]*>", string.Empty)     // drop any remaining tags
            .RegexReplace(@"[\s\d\?]+\-?", " ")         // whitespace/digit/'?' runs (plus optional '-') -> one space
            .RegexReplace(@"[^\w\.]+", " ")             // anything except word characters and dots -> one space
            .Trim())
        .Distinct()
        .ToList();

    var systemNames = document
        .SelectNodes(".//treatment//species_name/fields | .//checklist_taxon/fields")
        .Select(this.GetSystemTaxonNameString)
        .Where(name => !string.IsNullOrWhiteSpace(name))
        .Distinct()
        .ToList();
    candidates.AddRange(systemNames);

    // Add the individual words of every distinct candidate collected so far.
    var words = new HashSet<string>(candidates)
        .SelectMany(name => name.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries))
        .Where(word => !string.IsNullOrWhiteSpace(word) && word.Length > 2)
        .ToList();
    candidates.AddRange(words);

    // Longest candidates are tagged first.
    var longestFirst = new HashSet<string>(candidates).OrderByDescending(text => text.Length);

    var tagModel = document.CreateTaxonNameXmlElement(TaxonType.Lower);

    foreach (var candidate in longestFirst)
    {
        try
        {
            var taggerSettings = new ContentTaggerSettings
            {
                CaseSensitive = true,
                MinimalTextSelect = true
            };

            await this.contentTagger.TagContentInDocument(
                candidate,
                tagModel,
                LowerTaxaXPath,
                document,
                taggerSettings);
        }
        catch (Exception e)
        {
            // Report the failing candidate and carry on with the next one.
            this.logger.Log(e, "‘{0}’", candidate);
        }
    }
}
/// <summary>
/// Projects the given collections to serializable models (URL and collection
/// name) and tags them in the document XML, case-sensitively.
/// </summary>
/// <param name="document">Document whose XML is tagged.</param>
/// <param name="data">Collections to tag.</param>
private async Task TagCollections(
    IDocument document,
    IEnumerable<IBiorepositoriesCollection> data)
{
    var collectionModels =
        from collection in data
        select new BiorepositoriesCollectionSerializableModel
        {
            Url = collection.Url,
            Value = collection.CollectionName
        };

    var taggerSettings = new ContentTaggerSettings
    {
        CaseSensitive = true,
        MinimalTextSelect = true
    };

    await this.collectionsTagger.Tag(
        document.XmlDocument.DocumentElement,
        document.NamespaceManager,
        collectionModels,
        XPath,
        taggerSettings);
}
/// <summary>
/// Projects the given institutions to serializable models (institution name,
/// URL and institutional code) and tags them in the document XML,
/// case-sensitively.
/// </summary>
/// <param name="document">Document whose XML is tagged.</param>
/// <param name="data">Institutions whose codes are tagged.</param>
private async Task TagInstitutionalCodes(
    IDocument document,
    IEnumerable<IBiorepositoriesInstitution> data)
{
    var codeModels =
        from institution in data
        select new BiorepositoriesInstitutionalCodeSerializableModel
        {
            Description = institution.NameOfInstitution,
            Url = institution.Url,
            Value = institution.InstitutionalCode
        };

    var taggerSettings = new ContentTaggerSettings
    {
        CaseSensitive = true,
        MinimalTextSelect = true
    };

    await this.institutionalCodesTagger.Tag(
        document.XmlDocument.DocumentElement,
        document.NamespaceManager,
        codeModels,
        XPath,
        taggerSettings);
}
/// <summary>
/// XML-encodes the given strings, orders them longest first, and tags them in
/// the document with <paramref name="tagModel"/>, case-insensitively.
/// </summary>
/// <param name="document">Document in which the strings are tagged.</param>
/// <param name="data">Strings to tag.</param>
/// <param name="tagModel">Element used as the tag template.</param>
/// <param name="contentNodesXPath">XPath passed through to the content tagger.</param>
/// <returns>Always <c>true</c> once tagging has completed.</returns>
/// <exception cref="ArgumentNullException">
/// <paramref name="document"/>, <paramref name="data"/> or <paramref name="tagModel"/> is null,
/// or <paramref name="contentNodesXPath"/> is null, empty or whitespace.
/// </exception>
public async Task<object> Tag(
    IDocument document,
    IEnumerable<string> data,
    XmlElement tagModel,
    string contentNodesXPath)
{
    if (document == null)
    {
        throw new ArgumentNullException(nameof(document));
    }

    if (data == null)
    {
        throw new ArgumentNullException(nameof(data));
    }

    if (tagModel == null)
    {
        throw new ArgumentNullException(nameof(tagModel));
    }

    if (string.IsNullOrWhiteSpace(contentNodesXPath))
    {
        throw new ArgumentNullException(nameof(contentNodesXPath));
    }

    // Snapshot the input first, then encode each entry.
    var encodedItems =
        from raw in data.ToList()
        select this.XmlEncode(document, raw);

    // Longest items come first in the list handed to the tagger.
    var itemsToTag =
        (from item in encodedItems
         orderby item.Length descending
         select item).ToList();

    var taggerSettings = new ContentTaggerSettings
    {
        CaseSensitive = false,
        MinimalTextSelect = true
    };

    await this.contentTagger.TagContentInDocument(
        itemsToTag,
        tagModel,
        contentNodesXPath,
        document,
        taggerSettings);

    return true;
}