/// <summary>
        /// Harvests the text content of the document, mines it, and tags the mined items
        /// in the document's XML as specimen codes.
        /// </summary>
        /// <param name="document">Document to be tagged.</param>
        /// <returns>The value returned by the underlying tagger.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="document"/> is null.</exception>
        public async Task<object> Tag(IDocument document)
        {
            if (document == null)
            {
                throw new ArgumentNullException(nameof(document));
            }

            var textContent = await this.contentHarvester.Harvest(document.XmlDocument.DocumentElement);

            var data = (await this.miner.Mine(textContent))
                .ToArray();

            // A content type that begins with "http" is stored as the link (Href);
            // any other content type is stored as the title.
            // The prefix test is ordinal: a culture-sensitive IndexOf(...) == 0 scans the
            // whole string and may treat ignorable characters as a match at position 0.
            var specimenCodes = data.Select(s =>
            {
                bool isLink = s.ContentType.StartsWith("http", StringComparison.Ordinal);

                return new SpecimenCodeSerializableModel
                {
                    Title = isLink ? null : s.ContentType,
                    Href = isLink ? s.ContentType : null,
                    Value = s.Content
                };
            });

            var settings = new ContentTaggerSettings
            {
                CaseSensitive = true,
                MinimalTextSelect = true
            };

            return await this.tagger.Tag(
                document.XmlDocument.DocumentElement,
                document.NamespaceManager,
                specimenCodes,
                XPathStrings.RootNodesOfContext,
                settings);
        }
        /// <summary>
        /// Harvests the text content of the document, mines it, and tags the mined
        /// terms in the document's XML.
        /// </summary>
        /// <param name="document">Document to be tagged.</param>
        /// <returns>Always <c>true</c> once tagging has completed.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="document"/> is null.</exception>
        public async Task<object> Tag(IDocument document)
        {
            if (document == null)
            {
                throw new ArgumentNullException(nameof(document));
            }

            var harvestedText = await this.contentHarvester.Harvest(document.XmlDocument.DocumentElement);
            var minedTerms = await this.miner.Mine(harvestedText);

            // First projection: mined item -> response model.
            var responseModels =
                from term in minedTerms
                select new EnvoTermResponseModel
                {
                    EntityId = term.EntityId,
                    EnvoId = term.EnvoId,
                    Content = term.Content
                };

            // Second projection: response model -> serializable model handed to the tagger.
            // The content is used both as the value and as the verbatim term.
            var serializableTerms =
                from model in responseModels
                select new EnvoTermSerializableModel
                {
                    Value = model.Content,
                    EnvoId = model.EnvoId,
                    Id = model.EntityId,
                    VerbatimTerm = model.Content
                };

            var taggerSettings = new ContentTaggerSettings
            {
                CaseSensitive = false,
                MinimalTextSelect = true
            };

            await this.contentTagger.Tag(
                document.XmlDocument.DocumentElement,
                document.NamespaceManager,
                serializableTerms,
                XPath,
                taggerSettings);

            return true;
        }
        /// <summary>
        /// Harvests the text content of the document, mines it, and tags the mined
        /// items in the document's XML.
        /// </summary>
        /// <param name="document">Document to be tagged.</param>
        /// <returns>Always <c>true</c> once tagging has completed.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="document"/> is null.</exception>
        public async Task<object> Tag(IDocument document)
        {
            if (document == null)
            {
                throw new ArgumentNullException(nameof(document));
            }

            var harvestedText = await this.contentHarvester.Harvest(document.XmlDocument.DocumentElement);
            var minedItems = await this.miner.Mine(harvestedText);

            // Multiple types / identifiers of one item are flattened into a single "|"-separated string.
            var serializableItems =
                from item in minedItems
                select new EnvoExtractHcmrSerializableModel
                {
                    Value = item.Content,
                    Type = string.Join("|", item.Types),
                    Identifier = string.Join("|", item.Identifiers)
                };

            var taggerSettings = new ContentTaggerSettings
            {
                CaseSensitive = false,
                MinimalTextSelect = true
            };

            // NOTE(review): the whole XmlDocument is passed here, not its DocumentElement
            // (which is what was harvested above) - confirm this is intended.
            await this.contentTagger.Tag(
                document.XmlDocument,
                document.NamespaceManager,
                serializableItems,
                XPath,
                taggerSettings);

            return true;
        }
Пример #4
0
        /// <summary>
        /// Tags every specimen code in the document using a clone of the tag model, then
        /// logs a warning for each tag-model element that directly contains another one.
        /// </summary>
        /// <param name="document">Document in which the codes are tagged.</param>
        /// <param name="xpathTemplate">XPath forwarded to the content tagger.</param>
        /// <param name="specimenCodes">Specimen codes to tag.</param>
        /// <param name="tagModel">Template element; cloned for each code.</param>
        /// <returns>Task that completes when all codes are processed.</returns>
        private async Task ReplaceSpecimenCodesInXml(IDocument document, string xpathTemplate, IEnumerable<ISpecimenCode> specimenCodes, XmlElement tagModel)
        {
            foreach (var code in specimenCodes)
            {
                // Each code gets its own deep copy of the template, carrying the code's prefix and type.
                var element = (XmlElement)tagModel.CloneNode(true);
                element.SetAttribute("prefix", code.Prefix);
                element.SetAttribute("type", code.Type);

                var taggerSettings = new ContentTaggerSettings
                {
                    CaseSensitive = true,
                    MinimalTextSelect = false
                };

                await this.contentTagger.TagContentInDocument(
                    code.Code,
                    element,
                    xpathTemplate,
                    document,
                    taggerSettings);
            }

            // Nested tag-model elements (an element with a child of the same name)
            // are probably caused by mistaken codes; report them.
            string nestedXPath = string.Format(".//{0}[{0}]", tagModel.Name);
            foreach (XmlNode nested in document.SelectNodes(nestedXPath))
            {
                this.logger?.Log("WARNING: Nested specimen codes: " + nested.InnerXml);
            }
        }
        /// <summary>
        /// Harvests the text content of the document, mines it, and tags the mined
        /// items in the document's XML as external links.
        /// </summary>
        /// <param name="document">Document to be tagged.</param>
        /// <returns>Always <c>true</c> once tagging has completed.</returns>
        /// <exception cref="ArgumentNullException">Thrown when <paramref name="document"/> is null.</exception>
        public async Task<object> Tag(IDocument document)
        {
            if (document == null)
            {
                throw new ArgumentNullException(nameof(document));
            }

            var harvestedText = await this.contentHarvester.Harvest(document.XmlDocument.DocumentElement);
            var minedLinks = await this.miner.Mine(harvestedText);

            var linkModels =
                from link in minedLinks
                select new ExternalLinkXmlModel
                {
                    Href = link.Href,
                    ExternalLinkType = link.Type.GetName(),
                    Value = link.Content
                };

            var taggerSettings = new ContentTaggerSettings
            {
                CaseSensitive = false,
                MinimalTextSelect = true
            };

            await this.contentTagger.Tag(
                document.XmlDocument.DocumentElement,
                document.NamespaceManager,
                linkModels,
                XPath,
                taggerSettings);

            return true;
        }
        /// <summary>
        /// Collects the text of lower-taxon names already present in the document (plus the
        /// individual words of those names) and tags every occurrence of them as a lower taxon name.
        /// </summary>
        /// <param name="document">Document to be tagged.</param>
        /// <returns>Task that completes when every candidate string has been processed.</returns>
        private async Task DeepTag(IDocument document)
        {
            // Inner XML of every <tn type="lower"> element, de-duplicated.
            var knownLowerTaxaNamesXml = new HashSet <string>(document.SelectNodes(".//tn[@type='lower']").Select(x => x.InnerXml));

            ////// string clearUselessTaxonNamePartsSubpattern = string.Join(
            //////    "|",
            //////    SpeciesPartsPrefixesResolver
            //////        .SpeciesPartsRanks
            //////        .Keys
            //////        .Select(k => $"\\b{Regex.Escape(k)}\\b\\."));

            // Reduce each name's XML to plain text in three passes:
            //   1. drop <sensu>, <authority> / <basionym-authority> and <infraspecific-rank>
            //      elements together with their content, and the tokens "cf." and "var.";
            //   2. strip all remaining tags, then replace runs of whitespace, digits and
            //      question marks (with an optional trailing hyphen) by a single space;
            //   3. replace runs of anything other than word characters and dots by a space, and trim.
            // TODO: This algorithm must be refined: generate smaller pattern strings from the original.
            var taxa = knownLowerTaxaNamesXml
                       .Select(t => t.RegexReplace(@"<(sensu)[^>/]*>.*?</\1>|<((?:basionym-)?authority)[^>/]*>.*?</\2>|<(infraspecific-rank)[^>/]*>.*?</\3>|\bcf\b\.|\bvar\b\.", string.Empty))
                       .Select(t => t.RegexReplace(@"<[^>]*>", string.Empty).RegexReplace(@"[\s\d\?]+\-?", " "))
                       .Select(t => t.RegexReplace(@"[^\w\.]+", " ").Trim())
                       .Distinct()
                       .ToList();

            // Add the non-blank name strings built by GetSystemTaxonNameString from the
            // species_name / checklist_taxon "fields" nodes.
            taxa.AddRange(document.SelectNodes(".//treatment//species_name/fields | .//checklist_taxon/fields")
                          .Select(this.GetSystemTaxonNameString)
                          .Where(t => !string.IsNullOrWhiteSpace(t))
                          .Distinct()
                          .ToArray());

            // Also add the individual space-separated parts of every name, keeping only parts
            // longer than two characters. The loop iterates over a HashSet copy so that
            // 'taxa' can be appended to while looping.
            foreach (string taxon in new HashSet <string>(taxa))
            {
                taxa.AddRange(
                    taxon.Split(new char[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)
                    .Where(s => !string.IsNullOrWhiteSpace(s) && s.Length > 2));
            }

            // De-duplicate and process the longest strings first
            // (presumably so full names are tagged before their parts - confirm).
            var orderedTaxaParts = new HashSet <string>(taxa).OrderByDescending(t => t.Length);

            var tagModel = document.CreateTaxonNameXmlElement(TaxonType.Lower);

            foreach (var item in orderedTaxaParts)
            {
                try
                {
                    var settings = new ContentTaggerSettings
                    {
                        CaseSensitive     = true,
                        MinimalTextSelect = true
                    };

                    await this.contentTagger.TagContentInDocument(item, tagModel, LowerTaxaXPath, document, settings);
                }
                catch (Exception e)
                {
                    // A failure on one item is logged together with the item and does not stop the remaining items.
                    this.logger.Log(e, "‘{0}’", item);
                }
            }
        }
        /// <summary>
        /// Converts the supplied collections to serializable models and tags them in the document's XML.
        /// </summary>
        /// <param name="document">Document to be tagged.</param>
        /// <param name="data">Collections whose names are tagged.</param>
        /// <returns>Task that completes when tagging has finished.</returns>
        private async Task TagCollections(
            IDocument document,
            IEnumerable<IBiorepositoriesCollection> data)
        {
            // The collection name is the value handed to the tagger; the URL travels with it.
            var collectionModels =
                from collection in data
                select new BiorepositoriesCollectionSerializableModel
                {
                    Url = collection.Url,
                    Value = collection.CollectionName
                };

            var taggerSettings = new ContentTaggerSettings
            {
                CaseSensitive = true,
                MinimalTextSelect = true
            };

            await this.collectionsTagger.Tag(
                document.XmlDocument.DocumentElement,
                document.NamespaceManager,
                collectionModels,
                XPath,
                taggerSettings);
        }
Пример #8
0
        /// <summary>
        /// Converts the supplied institutions to serializable models and tags their
        /// institutional codes in the document's XML.
        /// </summary>
        /// <param name="document">Document to be tagged.</param>
        /// <param name="data">Institutions whose codes are tagged.</param>
        /// <returns>Task that completes when tagging has finished.</returns>
        private async Task TagInstitutionalCodes(
            IDocument document,
            IEnumerable<IBiorepositoriesInstitution> data)
        {
            // The institutional code is the value handed to the tagger; the institution
            // name is carried along as the description, together with the URL.
            var codeModels =
                from institution in data
                select new BiorepositoriesInstitutionalCodeSerializableModel
                {
                    Description = institution.NameOfInstitution,
                    Url = institution.Url,
                    Value = institution.InstitutionalCode
                };

            var taggerSettings = new ContentTaggerSettings
            {
                CaseSensitive = true,
                MinimalTextSelect = true
            };

            await this.institutionalCodesTagger.Tag(
                document.XmlDocument.DocumentElement,
                document.NamespaceManager,
                codeModels,
                XPath,
                taggerSettings);
        }
Пример #9
0
        /// <summary>
        /// XML-encodes the supplied strings and tags their occurrences in the document
        /// with a copy of the given tag model, longest strings first.
        /// </summary>
        /// <param name="document">Document to be tagged.</param>
        /// <param name="data">Strings to look for in the document.</param>
        /// <param name="tagModel">Element used as the template for the tags.</param>
        /// <param name="contentNodesXPath">XPath forwarded to the content tagger.</param>
        /// <returns>Always <c>true</c> once tagging has completed.</returns>
        /// <exception cref="ArgumentNullException">
        /// Thrown when <paramref name="document"/>, <paramref name="data"/> or <paramref name="tagModel"/>
        /// is null, or when <paramref name="contentNodesXPath"/> is null, empty or whitespace.
        /// </exception>
        public async Task<object> Tag(IDocument document, IEnumerable<string> data, XmlElement tagModel, string contentNodesXPath)
        {
            if (document == null)
            {
                throw new ArgumentNullException(nameof(document));
            }

            if (data == null)
            {
                throw new ArgumentNullException(nameof(data));
            }

            if (tagModel == null)
            {
                throw new ArgumentNullException(nameof(tagModel));
            }

            // ArgumentNullException is kept for blank values too, so existing callers' catch clauses still work.
            if (string.IsNullOrWhiteSpace(contentNodesXPath))
            {
                throw new ArgumentNullException(nameof(contentNodesXPath));
            }

            // Encode, then order longest-first. The sequence is materialized once, at the end;
            // no intermediate copy of 'data' is needed.
            var itemsToTag = data
                .Select(d => this.XmlEncode(document, d))
                .OrderByDescending(i => i.Length)
                .ToList();

            var settings = new ContentTaggerSettings
            {
                CaseSensitive = false,
                MinimalTextSelect = true
            };

            await this.contentTagger.TagContentInDocument(itemsToTag, tagModel, contentNodesXPath, document, settings);

            return true;
        }