Пример #1
0
        /// <summary>
        /// Builds a text corpus from the <c>*.txt</c> files found in the "source" or "target" sub-directory of
        /// each project's folder under <c>_textFileDir</c>.
        /// </summary>
        /// <param name="projects">The ids of the projects whose text files are added to the corpus.</param>
        /// <param name="type">Selects the "source" or the "target" sub-directory of every project.</param>
        /// <returns>
        /// An already completed task holding a <see cref="DictionaryTextCorpus"/> with one text per file. Each text
        /// id has the form <c>{projectId}_{file name without extension}</c>.
        /// </returns>
        /// <exception cref="System.ComponentModel.InvalidEnumArgumentException">
        /// <paramref name="type"/> is neither <c>Source</c> nor <c>Target</c> (raised when the first project is
        /// processed).
        /// </exception>
        public Task<ITextCorpus> CreateAsync(IEnumerable<string> projects, TextCorpusType type)
        {
            var wordTokenizer = new LatinWordTokenizer();
            var texts = new List<IText>();

            foreach (string projectId in projects)
            {
                string dir;
                switch (type)
                {
                    case TextCorpusType.Source:
                        dir = "source";
                        break;

                    case TextCorpusType.Target:
                        dir = "target";
                        break;

                    default:
                        // An unknown value used to leave dir null, which made Path.Combine fail with an
                        // ArgumentNullException that said nothing about the real cause.
                        throw new System.ComponentModel.InvalidEnumArgumentException(nameof(type), (int)type,
                            typeof(TextCorpusType));
                }

                string projectTextDir = Path.Combine(_textFileDir, projectId, dir);
                foreach (string file in Directory.EnumerateFiles(projectTextDir, "*.txt"))
                {
                    // Prefix the file name with the project id so that ids stay distinct across projects.
                    string textId = $"{projectId}_{Path.GetFileNameWithoutExtension(file)}";
                    texts.Add(new TextFileText(wordTokenizer, textId, file));
                }
            }

            return Task.FromResult<ITextCorpus>(new DictionaryTextCorpus(texts));
        }
        /// <summary>
        /// Creates one <see cref="SFScriptureText"/> for every chapter of every text that belongs to the given
        /// projects, reading each chapter's document from the text data collection.
        /// </summary>
        /// <param name="projects">The ids of the projects whose texts are loaded.</param>
        /// <param name="type">Whether the source or the target text data of each chapter is loaded.</param>
        /// <returns>The created texts, in project / text / chapter iteration order.</returns>
        /// <exception cref="InvalidEnumArgumentException">
        /// <paramref name="type"/> is neither <c>Source</c> nor <c>Target</c> (raised when the first project is
        /// processed).
        /// </exception>
        private async Task<IReadOnlyList<IText>> CreateTextsAsync(IEnumerable<string> projects,
            TextCorpusType type)
        {
            StringTokenizer wordTokenizer = new LatinWordTokenizer();
            IMongoDatabase database = _mongoClient.GetDatabase(_dataAccessOptions.Value.MongoDatabaseName);
            IMongoCollection<BsonDocument> textDataColl = database.GetCollection<BsonDocument>(
                SFDataAccessConstants.TextDataCollectionName);
            var texts = new List<IText>();

            foreach (string projectId in projects)
            {
                // Map the corpus type requested by the caller onto the text type used in text data ids.
                TextType textType;
                switch (type)
                {
                    case TextCorpusType.Source:
                        textType = TextType.Source;
                        break;

                    case TextCorpusType.Target:
                        textType = TextType.Target;
                        break;

                    default:
                        // The validated argument is a TextCorpusType, so that is the enum the exception must
                        // name; reporting TextType produced a misleading message.
                        throw new InvalidEnumArgumentException(nameof(type), (int)type, typeof(TextCorpusType));
                }

                List<TextEntity> textList = await _texts.Query().Where(t => t.ProjectRef == projectId).ToListAsync();

                foreach (TextEntity text in textList)
                {
                    foreach (Chapter chapter in text.Chapters)
                    {
                        FilterDefinition<BsonDocument> filter = Builders<BsonDocument>.Filter
                            .Eq("_id", TextEntity.GetTextDataId(text.Id, chapter.Number, textType));

                        // FirstAsync is used on purpose here: a chapter without a text data document is
                        // reported as an error instead of being skipped.
                        BsonDocument doc = await textDataColl.Find(filter).FirstAsync();

                        texts.Add(new SFScriptureText(wordTokenizer, projectId, text.Id, chapter.Number, doc));
                    }
                }
            }

            return texts;
        }
Пример #3
0
        /// <summary>
        /// Collects an <see cref="SFScriptureText"/> for each chapter document of the requested corpus side
        /// (source or target) of the given projects.
        /// </summary>
        /// <param name="projects">The ids of the projects to read.</param>
        /// <param name="type">
        /// <c>Source</c> reads the chapters of the project referenced by the translate config source;
        /// <c>Target</c> reads the chapters of the project itself.
        /// </param>
        /// <returns>
        /// The texts that were found. Chapters whose document is missing or has no "ops" array are left out.
        /// </returns>
        /// <exception cref="DataNotFoundException">A project has no source project reference.</exception>
        /// <exception cref="InvalidEnumArgumentException"><paramref name="type"/> is not a known value.</exception>
        private async Task<IReadOnlyList<IText>> CreateTextsAsync(IEnumerable<string> projects,
            TextCorpusType type)
        {
            StringTokenizer tokenizer = new LatinWordTokenizer();
            IMongoCollection<BsonDocument> chapterDocs = _mongoClient
                .GetDatabase(_dataAccessOptions.Value.MongoDatabaseName)
                .GetCollection<BsonDocument>(_realtimeService.GetCollectionName<TextData>());
            var result = new List<IText>();

            foreach (string projectId in projects)
            {
                var project = await _realtimeService.GetSnapshotAsync<SFProject>(projectId);

                // The source reference is required for both corpus types.
                if (string.IsNullOrWhiteSpace(project.TranslateConfig.Source?.ProjectRef))
                {
                    throw new DataNotFoundException("The source project reference is missing");
                }

                // Pick the project whose chapter documents are read.
                string corpusProjectId;
                switch (type)
                {
                    case TextCorpusType.Source:
                        corpusProjectId = project.TranslateConfig.Source.ProjectRef;
                        break;

                    case TextCorpusType.Target:
                        corpusProjectId = projectId;
                        break;

                    default:
                        throw new InvalidEnumArgumentException(nameof(type), (int)type, typeof(TextCorpusType));
                }

                IEnumerable<TextInfo> booksWithSource = project.Texts.Where(t => t.HasSource);
                foreach (TextInfo book in booksWithSource)
                {
                    foreach (Chapter chapter in book.Chapters)
                    {
                        string docId = TextData.GetTextDocId(corpusProjectId, book.BookNum, chapter.Number);
                        BsonDocument chapterDoc = await chapterDocs
                            .Find(Builders<BsonDocument>.Filter.Eq("_id", docId))
                            .FirstOrDefaultAsync();

                        if (chapterDoc == null)
                        {
                            continue;
                        }

                        // Only documents carrying an "ops" array can be turned into a text.
                        if (!chapterDoc.TryGetValue("ops", out BsonValue ops) || !(ops is BsonArray))
                        {
                            continue;
                        }

                        result.Add(new SFScriptureText(tokenizer, projectId, book.BookNum, chapter.Number,
                            chapterDoc));
                    }
                }
            }

            return result;
        }
        /// <summary>
        /// Creates the texts of the requested corpus side (source or target) for the given projects from the
        /// "scriptureforge" and "realtime" Mongo databases.
        /// </summary>
        /// <remarks>
        /// For each project, every document of the project's "translate" collection whose "isDeleted" field is
        /// false is resolved to the document with id <c>{translate doc id}:{source|target}</c> in the realtime
        /// collection named <c>sf_{projectCode}</c>. Projects that are unknown to <c>_projectRepo</c> or that have
        /// no document in the "projects" collection are skipped.
        /// </remarks>
        /// <param name="projects">The ids of the projects to read; each must be a valid ObjectId string.</param>
        /// <param name="type">Whether the source or the target documents are loaded.</param>
        /// <returns>
        /// An <see cref="XForgeScriptureText"/> per document for scripture projects, an
        /// <see cref="XForgeRichText"/> per document otherwise.
        /// </returns>
        /// <exception cref="System.ComponentModel.InvalidEnumArgumentException">
        /// <paramref name="type"/> is neither <c>Source</c> nor <c>Target</c> (raised when the first existing
        /// project is processed).
        /// </exception>
        private async Task<IReadOnlyList<IText>> CreateTextsAsync(IEnumerable<string> projects,
            TextCorpusType type)
        {
            StringTokenizer wordTokenizer = new LatinWordTokenizer();
            IMongoDatabase sfDatabase = _mongoClient.GetDatabase("scriptureforge");
            IMongoDatabase realtimeDatabase = _mongoClient.GetDatabase("realtime");
            IMongoCollection<BsonDocument> projectsColl = sfDatabase.GetCollection<BsonDocument>("projects");
            var texts = new List<IText>();

            foreach (string projectId in projects)
            {
                Project project = await _projectRepo.GetAsync(projectId);

                if (project == null)
                {
                    continue;
                }

                string segmentType;
                string suffix;
                switch (type)
                {
                    case TextCorpusType.Source:
                        suffix = "source";
                        segmentType = project.SourceSegmentType;
                        break;

                    case TextCorpusType.Target:
                        suffix = "target";
                        segmentType = project.TargetSegmentType;
                        break;

                    default:
                        // An unknown value used to leave suffix null, so documents were looked up under the
                        // malformed id "{id}:" and the lookup failed with an unrelated error.
                        throw new System.ComponentModel.InvalidEnumArgumentException(nameof(type), (int)type,
                            typeof(TextCorpusType));
                }

                // A project without a configured segment type gets no segment tokenizer.
                StringTokenizer segmentTokenizer = null;
                if (segmentType != null)
                {
                    segmentTokenizer = WebApiUtils.CreateSegmentTokenizer(segmentType);
                }

                FilterDefinition<BsonDocument> projectFilter = Builders<BsonDocument>.Filter.Eq("_id",
                    ObjectId.Parse(projectId));

                BsonDocument projectDoc = await projectsColl.Find(projectFilter).FirstOrDefaultAsync();

                if (projectDoc == null)
                {
                    continue;
                }

                var code = "sf_" + (string)projectDoc["projectCode"];
                var isScripture = (bool)projectDoc["config"]["isTranslationDataScripture"];

                // The same name identifies both the project's realtime collection and its own database.
                IMongoCollection<BsonDocument> projectColl = realtimeDatabase.GetCollection<BsonDocument>(code);
                IMongoDatabase projectDatabase = _mongoClient.GetDatabase(code);
                IMongoCollection<BsonDocument> translateColl = projectDatabase.GetCollection<BsonDocument>("translate");
                FilterDefinition<BsonDocument> notDeletedFilter = Builders<BsonDocument>.Filter.Eq("isDeleted", false);

                using (IAsyncCursor<BsonDocument> cursor = await translateColl.Find(notDeletedFilter).ToCursorAsync())
                {
                    while (await cursor.MoveNextAsync())
                    {
                        foreach (BsonDocument docInfo in cursor.Current)
                        {
                            var id = (ObjectId)docInfo["_id"];
                            FilterDefinition<BsonDocument> docFilter = Builders<BsonDocument>.Filter.Eq("_id",
                                $"{id}:{suffix}");

                            // FirstAsync is kept: a translate entry without its realtime document is reported
                            // as an error instead of being skipped.
                            BsonDocument doc = await projectColl.Find(docFilter).FirstAsync();

                            if (isScripture)
                            {
                                texts.Add(new XForgeScriptureText(wordTokenizer, project.Id, doc));
                            }
                            else
                            {
                                texts.Add(new XForgeRichText(segmentTokenizer, wordTokenizer, project.Id, doc));
                            }
                        }
                    }
                }
            }

            return texts;
        }
 /// <summary>
 /// Creates a text corpus that contains the texts of the given projects for the requested corpus type.
 /// </summary>
 /// <param name="projects">The ids of the projects whose texts make up the corpus.</param>
 /// <param name="type">The corpus type passed on to <c>CreateTextsAsync</c>.</param>
 /// <returns>A <see cref="DictionaryTextCorpus"/> wrapping the texts that were created.</returns>
 public async Task<ITextCorpus> CreateAsync(IEnumerable<string> projects, TextCorpusType type)
 {
     var texts = await CreateTextsAsync(projects, type);
     return new DictionaryTextCorpus(texts);
 }
Пример #6
0
        /// <summary>
        /// Collects the texts of the requested corpus side (source or target) for the given projects: one
        /// <see cref="SFScriptureText"/> per usable chapter document plus, when a <c>TermRenderings.xml</c> file
        /// exists for the project, one <see cref="SFBiblicalTermsText"/>.
        /// </summary>
        /// <param name="projects">The ids of the projects to read.</param>
        /// <param name="type">
        /// <c>Source</c> uses the project reference and Paratext id of the translate config source;
        /// <c>Target</c> uses the project's own id and Paratext id.
        /// </param>
        /// <returns>
        /// The texts that were found. Chapters whose document is missing or has no "ops" array are left out.
        /// </returns>
        /// <exception cref="DataNotFoundException">A project has no source project reference.</exception>
        /// <exception cref="InvalidEnumArgumentException"><paramref name="type"/> is not a known value.</exception>
        private async Task<IReadOnlyList<IText>> CreateTextsAsync(IEnumerable<string> projects,
            TextCorpusType type)
        {
            StringTokenizer tokenizer = new LatinWordTokenizer();
            IMongoCollection<BsonDocument> chapterDocs = _mongoClient
                .GetDatabase(_dataAccessOptions.Value.MongoDatabaseName)
                .GetCollection<BsonDocument>(_realtimeService.GetCollectionName<TextData>());
            var result = new List<IText>();

            foreach (string projectId in projects)
            {
                var project = await _realtimeService.GetSnapshotAsync<SFProject>(projectId);

                // The source reference is required for both corpus types.
                if (string.IsNullOrWhiteSpace(project.TranslateConfig.Source?.ProjectRef))
                {
                    throw new DataNotFoundException("The source project reference is missing");
                }

                // Pick the project whose chapters are read and the Paratext id used for the sync folder.
                string corpusProjectId;
                string paratextId;
                if (type == TextCorpusType.Source)
                {
                    corpusProjectId = project.TranslateConfig.Source.ProjectRef;
                    paratextId = project.TranslateConfig.Source.ParatextId;
                }
                else if (type == TextCorpusType.Target)
                {
                    corpusProjectId = projectId;
                    paratextId = project.ParatextId;
                }
                else
                {
                    throw new InvalidEnumArgumentException(nameof(type), (int)type, typeof(TextCorpusType));
                }

                IEnumerable<TextInfo> booksWithSource = project.Texts.Where(t => t.HasSource);
                foreach (TextInfo book in booksWithSource)
                {
                    foreach (Chapter chapter in book.Chapters)
                    {
                        string docId = TextData.GetTextDocId(corpusProjectId, book.BookNum, chapter.Number);
                        BsonDocument chapterDoc = await chapterDocs
                            .Find(Builders<BsonDocument>.Filter.Eq("_id", docId))
                            .FirstOrDefaultAsync();

                        if (chapterDoc == null)
                        {
                            continue;
                        }

                        // Only documents carrying an "ops" array can be turned into a text.
                        if (!chapterDoc.TryGetValue("ops", out BsonValue ops) || !(ops is BsonArray))
                        {
                            continue;
                        }

                        result.Add(new SFScriptureText(tokenizer, projectId, book.BookNum, chapter.Number,
                            chapterDoc));
                    }
                }

                // The term renderings are read from the "target" folder of the sync directory for both corpus
                // types; only the Paratext id differs.
                string termRenderingsPath = Path.Combine(_siteOptions.Value.SiteDir, "sync", paratextId,
                    "target", "TermRenderings.xml");
                if (!_fileSystemService.FileExists(termRenderingsPath))
                {
                    continue;
                }

                using (var stream = _fileSystemService.OpenFile(termRenderingsPath, FileMode.Open))
                {
                    XDocument termRenderings = await XDocument.LoadAsync(stream, LoadOptions.None,
                        CancellationToken.None);

                    result.Add(new SFBiblicalTermsText(tokenizer, projectId, termRenderings));
                }
            }

            return result;
        }