Beispiel #1
0
        /// <summary>
        /// Reads the RSS feed at <paramref name="rssFeedUrl"/> and, for every <c>item</c> element whose DOI
        /// is not yet referenced in the project, downloads the study page and collects its <c>head</c> node.
        /// Each time <paramref name="batchSize"/> new studies have been collected the batch is saved, the
        /// matching study references are persisted, and one result describing the saved file is yielded.
        /// A final, smaller batch is flushed once the feed has been read completely.
        /// </summary>
        /// <param name="rssFeedUrl">Address of the feed; passed to <c>GetStream</c>.</param>
        /// <param name="livingSearchId">Identifier stored on each new reference and passed to the batch save.</param>
        /// <param name="fileId">Identifier passed to the batch save.</param>
        /// <param name="projectId">Project used for the duplicate check and stored on each new reference.</param>
        /// <param name="description">Description passed to the batch save.</param>
        /// <param name="batchSize">
        /// Number of new studies per saved file. A value that is never reached (for example zero or a
        /// negative number) results in a single file written after the whole feed has been read.
        /// </param>
        /// <returns>One <c>BiorxivHtmlFileInfoDto</c> per saved batch, in the order the batches were written.</returns>
        public async IAsyncEnumerable <BiorxivHtmlFileInfoDto> FindNewBiorxivStudiesAndSave(string rssFeedUrl,
                                                                                            Guid livingSearchId, Guid fileId, Guid projectId, string description, int batchSize)
        {
            var fileNumber             = 1;
            var studyNumber            = 0;
            var nodes                  = new Collection <HtmlNode>();
            var biorxivStudyReferences = new Collection <BiorxivStudyReference>();
            var web                    = new HtmlWeb();
            var stream                 = await GetStream(rssFeedUrl);

            var xmlReaderSettings = new XmlReaderSettings()
            {
                Async      = true,
                // The reader owns the feed stream: disposing the reader also closes the stream, so no
                // separate (otherwise unused) wrapper is needed just to release it.
                CloseInput = true
            };

            using var reader = XmlReader.Create(stream, xmlReaderSettings);

            await reader.MoveToContentAsync();

            // NOTE(review): takes the value of the 7th attribute of the root element as the namespace of
            // the "identifier" element. This depends on the attribute order of the feed — confirm, or
            // look the namespace up by prefix instead.
            XNamespace xmlns = reader[6];

            while (!reader.EOF)
            {
                if (reader.NodeType == XmlNodeType.Element && reader.Name == "item")
                {
                    // ReadFrom consumes the whole element and leaves the reader on the node after it,
                    // which is why the "continue" statements below do not need an extra Read.
                    if (!(XNode.ReadFrom(reader) is XElement el))
                    {
                        continue;
                    }
                    // NOTE(review): keeps only the text between the first and second ':' of the identifier.
                    var doi          = el.Descendants(xmlns + "identifier").First().Value.Split(":")[1];
                    var studyPageUrl = el.FirstAttribute.Value;
                    if (await _lsUnitOfWork.BiorxivStudyReferenceRepository.ContainsReferenceWith(projectId, doi))
                    {
                        continue;
                    }
                    var studyId = Guid.NewGuid();
                    el.Add(new XElement("StudyId", studyId.ToString()));
                    biorxivStudyReferences.Add(new BiorxivStudyReference(studyId, projectId, livingSearchId, doi,
                                                                         studyPageUrl));
                    HtmlNode headNode;
                    try
                    {
                        var studyPage = await web.LoadFromWebAsync(studyPageUrl);

                        // A page without a head element would otherwise put null into the batch.
                        headNode = studyPage.DocumentNode.SelectSingleNode("//head")
                                   ?? BuildBiorxivErrorHead(doi, "The study page does not contain a head element.");
                    }
                    catch (Exception e)
                    {
                        // Best effort: one unreachable page must not abort the whole import, so the
                        // failure is recorded in a placeholder head node instead.
                        Console.WriteLine(
                            $"An error occured while fetching study with doi: {doi} from BioRxiv. Error message: {e.Message}");
                        headNode = BuildBiorxivErrorHead(doi, e.Message);
                    }
                    nodes.Add(headNode);

                    studyNumber++;
                    if (studyNumber != batchSize)
                    {
                        continue;
                    }
                    var fileUri = await SaveBatchOfBiorxivHeadNodes(nodes, projectId, livingSearchId,
                                                                    fileId, description, fileNumber);

                    await _lsUnitOfWork.SaveManyAsync(biorxivStudyReferences);

                    nodes.Clear();
                    biorxivStudyReferences.Clear();
                    yield return(new BiorxivHtmlFileInfoDto(fileUri, fileNumber, studyNumber));

                    studyNumber = 0;
                    fileNumber++;
                }
                else
                {
                    await reader.ReadAsync();
                }
            }

            // Flush the studies that did not fill a complete batch.
            if (studyNumber != 0)
            {
                var fileUri = await SaveBatchOfBiorxivHeadNodes(nodes, projectId, livingSearchId,
                                                                fileId, description, fileNumber);

                await _lsUnitOfWork.SaveManyAsync(biorxivStudyReferences);

                yield return(new BiorxivHtmlFileInfoDto(fileUri, fileNumber, studyNumber));
            }
        }

        /// <summary>
        /// Builds a placeholder <c>head</c> node that carries an error description for a study whose page
        /// could not be used. The DOI and the message are HTML-encoded so that markup characters in
        /// either of them cannot change the structure of the placeholder.
        /// </summary>
        private static HtmlNode BuildBiorxivErrorHead(string doi, string errorMessage)
        {
            var encodedDoi     = System.Net.WebUtility.HtmlEncode(doi);
            var encodedMessage = System.Net.WebUtility.HtmlEncode(errorMessage);

            // A new HtmlDocument has no "html" child to append to, so the complete skeleton is parsed
            // in one go and the head node is selected from the result.
            var placeholder = new HtmlDocument();
            placeholder.LoadHtml(
                $"<html><head>An error occured while getting study with DOI: {encodedDoi} from BioRxiv. Error message: {encodedMessage}</head></html>");
            return placeholder.DocumentNode.SelectSingleNode("//head");
        }
Beispiel #2
0
        /// <summary>
        /// Submits <paramref name="searchTerm"/> to PubMed through the web client, pages through the result
        /// <paramref name="batchSize"/> records at a time and collects every <c>PubmedArticle</c> element
        /// whose PMID is not yet referenced in the project. Each time <paramref name="batchSize"/> new
        /// articles have been collected the batch is saved, the matching study references are persisted,
        /// and one result describing the saved file is yielded. A final, smaller batch is flushed at the end.
        /// </summary>
        /// <param name="livingSearchId">Identifier stored on each new reference and passed to the batch save.</param>
        /// <param name="fileId">Identifier passed to the batch save.</param>
        /// <param name="description">Description passed to the batch save.</param>
        /// <param name="projectId">Project used for the duplicate check and stored on each new reference.</param>
        /// <param name="searchTerm">Search expression submitted to the web client.</param>
        /// <param name="batchSize">Page size for fetching and number of new articles per saved file; must be positive.</param>
        /// <returns>One <c>PubmedXmlFileInfoDto</c> per saved batch, in the order the batches were written.</returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// <paramref name="batchSize"/> is zero or negative. As with any iterator, the exception surfaces
        /// when enumeration starts.
        /// </exception>
        public async IAsyncEnumerable <PubmedXmlFileInfoDto> FindNewPubmedStudiesAndSave(Guid livingSearchId,
                                                                                         Guid fileId,
                                                                                         string description, Guid projectId,
                                                                                         string searchTerm, int batchSize)
        {
            // Zero would divide by zero below and a negative value would never advance the paging loop
            // past the result count, so both are rejected before any request is sent.
            if (batchSize <= 0)
            {
                throw new ArgumentOutOfRangeException(nameof(batchSize), batchSize,
                                                      "The batch size must be greater than zero.");
            }

            var queryWebResult = await _pubmedWebClient.SubmitSearch(searchTerm);

            var fileNumber           = 1;
            var numberOfNewReference = 0;
            // NOTE(review): integer division rounds down, and files are cut by the number of NEW
            // references rather than by fetched records, so this is an estimate — confirm what the
            // consumers of the DTO expect.
            var totalFileNumber      = queryWebResult.Count / batchSize;
            var nodes = new Collection <XElement>();
            var pubmedStudyReferences = new Collection <PubmedStudyReference>();

            // The settings do not change between pages, so one instance is shared by all readers.
            var xmlReaderSettings = new XmlReaderSettings()
            {
                Async         = true,
                // NOTE(review): PubMed record sets presumably start with a DOCTYPE declaration, which the
                // default setting (Prohibit) rejects. Ignore skips the declaration without processing it.
                DtdProcessing = DtdProcessing.Ignore
            };

            for (var retStart = 0; retStart < queryWebResult.Count; retStart += batchSize)
            {
                var xmlString = await _pubmedWebClient.GetRecordsXmlString(queryWebResult.WebEnv,
                                                                           queryWebResult.QueryKey, batchSize, retStart);

                // XmlReader.Create(string, ...) interprets the string as a URI. XML text therefore has to
                // be wrapped in a TextReader; anything that does not look like markup keeps the URI path.
                using var reader = LooksLikeXmlMarkup(xmlString)
                    ? XmlReader.Create(new System.IO.StringReader(xmlString), xmlReaderSettings)
                    : XmlReader.Create(xmlString, xmlReaderSettings);
                await reader.MoveToContentAsync();

                while (!reader.EOF)
                {
                    if (reader.NodeType == XmlNodeType.Element && reader.Name == "PubmedArticle")
                    {
                        // ReadFrom consumes the whole element and leaves the reader on the node after
                        // it, which is why the "continue" statements below do not need an extra Read.
                        if (!(XNode.ReadFrom(reader) is XElement el))
                        {
                            continue;
                        }
                        var pubmedId = el.Descendants("PMID").First().Value;
                        if (await _lsUnitOfWork.PubmedStudyReferenceRepository.ContainsReferenceWith(projectId, pubmedId))
                        {
                            continue;
                        }
                        var studyId = Guid.NewGuid();
                        el.Add(new XElement("StudyId", studyId.ToString()));

                        // Not every article carries an ELocationID; a missing one is passed on as null
                        // instead of aborting the whole import.
                        var locationId = el.Descendants("ELocationID").FirstOrDefault()?.Value;
                        pubmedStudyReferences.Add(new PubmedStudyReference(studyId, projectId,
                                                                           livingSearchId, locationId, pubmedId));
                        nodes.Add(el);
                        numberOfNewReference++;
                        if (numberOfNewReference != batchSize)
                        {
                            continue;
                        }
                        var fileUri = await SaveBatchOfPubmedNodes(nodes, projectId, livingSearchId, fileId,
                                                                   description, fileNumber);

                        nodes.Clear();
                        await _lsUnitOfWork.SaveManyAsync(pubmedStudyReferences);

                        pubmedStudyReferences.Clear();
                        yield return(new PubmedXmlFileInfoDto(fileUri, queryWebResult.WebEnv, queryWebResult.Count,
                                                              fileNumber, numberOfNewReference, totalFileNumber, queryWebResult.QueryKey));

                        numberOfNewReference = 0;
                        fileNumber++;
                    }
                    else
                    {
                        await reader.ReadAsync();
                    }
                }
            }

            // Flush the articles that did not fill a complete batch.
            if (numberOfNewReference != 0)
            {
                var fileUri = await SaveBatchOfPubmedNodes(nodes, projectId, livingSearchId, fileId,
                                                           description, fileNumber);

                await _lsUnitOfWork.SaveManyAsync(pubmedStudyReferences);

                yield return(new PubmedXmlFileInfoDto(fileUri, queryWebResult.WebEnv, queryWebResult.Count,
                                                      fileNumber, numberOfNewReference, totalFileNumber, queryWebResult.QueryKey));
            }
        }

        /// <summary>
        /// Returns <c>true</c> when the first character of <paramref name="text"/> that is neither white
        /// space nor a byte-order mark is <c>&lt;</c>, i.e. when the text is XML content rather than an
        /// address. Returns <c>false</c> for <c>null</c> and for text without such a character.
        /// </summary>
        private static bool LooksLikeXmlMarkup(string text)
        {
            if (text == null)
            {
                return false;
            }

            foreach (var character in text)
            {
                if (char.IsWhiteSpace(character) || character == '\uFEFF')
                {
                    continue;
                }
                return character == '<';
            }

            return false;
        }